diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54769 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3649, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002740852405097985, + "grad_norm": 50.75, + "kl": 0.0, + "learning_rate": 1.4285714285714287e-07, + "logits/chosen": 13927143.111111112, + "logits/rejected": 11332061.866666667, + "logps/chosen": -520.2071940104166, + "logps/rejected": -525.8274739583334, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.000548170481019597, + "grad_norm": 50.25, + "kl": 0.0, + "learning_rate": 2.8571428571428575e-07, + "logits/chosen": 6138932.0, + "logits/rejected": 53417216.0, + "logps/chosen": -419.6460774739583, + "logps/rejected": -486.9514567057292, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0008222557215293956, + "grad_norm": 58.5, + "kl": 0.21227264404296875, + "learning_rate": 4.285714285714286e-07, + "logits/chosen": 17829473.14285714, + "logits/rejected": 8480004.8, + "logps/chosen": -581.1654575892857, + "logps/rejected": -494.3796875, + "loss": 0.4925, + "rewards/chosen": 0.03145424808774676, + "rewards/margins": 0.05075905855212893, + "rewards/rejected": -0.01930481046438217, + "step": 3 + }, + { + "epoch": 0.001096340962039194, + "grad_norm": 43.5, + "kl": 0.08537165820598602, + "learning_rate": 5.714285714285715e-07, + "logits/chosen": 8679396.363636363, + "logits/rejected": 19455780.923076924, + "logps/chosen": -401.5114080255682, + "logps/rejected": -392.17041015625, + "loss": 0.5027, + "rewards/chosen": -0.048328193751248444, + "rewards/margins": -0.02755581686546752, + "rewards/rejected": -0.020772376885780923, + "step": 4 + }, + { + "epoch": 0.0013704262025489927, + "grad_norm": 69.0, + "kl": 0.11183484643697739, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": 26516902.4, + "logits/rejected": 49527992.88888889, + "logps/chosen": -645.0729817708333, + "logps/rejected": -414.0642361111111, + "loss": 0.4977, + "rewards/chosen": 0.004266563057899475, + "rewards/margins": 0.028189377321137323, + "rewards/rejected": -0.02392281426323785, + "step": 5 + }, + { + "epoch": 0.0016445114430587912, + "grad_norm": 50.25, + "kl": 0.07754262536764145, + "learning_rate": 8.571428571428572e-07, + "logits/chosen": 11069390.857142856, + "logits/rejected": 21200387.2, + "logps/chosen": -470.66636439732144, + "logps/rejected": -320.935498046875, + "loss": 0.5071, + "rewards/chosen": -0.030853109700339183, + "rewards/margins": -0.055996160847800125, + "rewards/rejected": 0.02514305114746094, + "step": 6 + }, + { + "epoch": 0.0019185966835685898, + "grad_norm": 52.0, + "kl": 0.34815216064453125, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 23814938.666666668, + "logits/rejected": 16669717.333333334, + "logps/chosen": -595.7197265625, + "logps/rejected": -338.6750081380208, + "loss": 0.4944, + "rewards/chosen": 0.019522984822591145, + "rewards/margins": 0.06013285617033641, + "rewards/rejected": -0.04060987134774526, + "step": 7 + }, + { + "epoch": 0.002192681924078388, + "grad_norm": 50.25, + "kl": 0.14911271631717682, + "learning_rate": 1.142857142857143e-06, + "logits/chosen": 3765517.4545454546, + "logits/rejected": 2731864.6153846155, + "logps/chosen": -544.12646484375, + "logps/rejected": -429.9457256610577, + "loss": 0.4961, + "rewards/chosen": 0.00898340479894118, + "rewards/margins": 0.021482700547138293, + "rewards/rejected": -0.012499295748197116, + "step": 8 + }, + { + "epoch": 0.0024667671645881867, + "grad_norm": 59.75, + "kl": 0.2560485303401947, + "learning_rate": 1.2857142857142856e-06, + "logits/chosen": -6629439.05882353, + "logits/rejected": 37816203.428571425, + "logps/chosen": -439.47443704044116, + "logps/rejected": -329.57212611607144, + "loss": 0.4984, + "rewards/chosen": 0.03969919681549072, + "rewards/margins": 0.011308148503303528, + "rewards/rejected": 0.028391048312187195, + "step": 9 + }, + { + "epoch": 0.0027408524050979853, + "grad_norm": 47.75, + "kl": 0.06622314453125, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": 4689703.384615385, + "logits/rejected": 39738984.72727273, + "logps/chosen": -441.67427884615387, + "logps/rejected": -464.00297407670456, + "loss": 0.4949, + "rewards/chosen": 0.030432481032151442, + "rewards/margins": 0.03887017618317704, + "rewards/rejected": -0.008437695151025599, + "step": 10 + }, + { + "epoch": 0.003014937645607784, + "grad_norm": 46.75, + "kl": 0.3595088720321655, + "learning_rate": 1.5714285714285714e-06, + "logits/chosen": 23492142.769230768, + "logits/rejected": 15509666.909090908, + "logps/chosen": -491.8811598557692, + "logps/rejected": -397.7107599431818, + "loss": 0.5029, + "rewards/chosen": 0.014393966931563158, + "rewards/margins": -0.0066788027753363136, + "rewards/rejected": 0.02107276970689947, + "step": 11 + }, + { + "epoch": 0.0032890228861175825, + "grad_norm": 53.25, + "kl": 0.5253448486328125, + "learning_rate": 1.7142857142857145e-06, + "logits/chosen": -22031419.2, + "logits/rejected": 21621410.285714287, + "logps/chosen": -622.251708984375, + "logps/rejected": -458.8676060267857, + "loss": 0.4851, + "rewards/chosen": 0.1518975853919983, + "rewards/margins": 0.14506044728415354, + "rewards/rejected": 0.006837138107844761, + "step": 12 + }, + { + "epoch": 0.003563108126627381, + "grad_norm": 43.25, + "kl": 0.3934866786003113, + "learning_rate": 1.8571428571428573e-06, + "logits/chosen": -13052924.444444444, + "logits/rejected": 17158837.333333332, + "logps/chosen": -511.2985026041667, + "logps/rejected": -329.3484375, + "loss": 0.4885, + "rewards/chosen": 0.0893764959441291, + "rewards/margins": 0.09199073356058862, + "rewards/rejected": -0.0026142376164595285, + "step": 13 + }, + { + "epoch": 0.0038371933671371796, + "grad_norm": 43.5, + "kl": 0.6282181739807129, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 9690637.333333334, + "logits/rejected": 43223938.666666664, + "logps/chosen": -372.0698649088542, + "logps/rejected": -470.623291015625, + "loss": 0.4993, + "rewards/chosen": 0.09184247255325317, + "rewards/margins": 0.05240772167841593, + "rewards/rejected": 0.03943475087483724, + "step": 14 + }, + { + "epoch": 0.004111278607646978, + "grad_norm": 46.0, + "kl": 0.44204461574554443, + "learning_rate": 2.1428571428571427e-06, + "logits/chosen": 27328170.666666668, + "logits/rejected": 20297512.0, + "logps/chosen": -480.5823567708333, + "logps/rejected": -317.0185953776042, + "loss": 0.4835, + "rewards/chosen": 0.16300062338511148, + "rewards/margins": 0.16654206129411855, + "rewards/rejected": -0.0035414379090070724, + "step": 15 + }, + { + "epoch": 0.004385363848156776, + "grad_norm": 49.0, + "kl": 0.5807406306266785, + "learning_rate": 2.285714285714286e-06, + "logits/chosen": 25201772.307692308, + "logits/rejected": 38787066.18181818, + "logps/chosen": -424.2336989182692, + "logps/rejected": -554.6207386363636, + "loss": 0.4759, + "rewards/chosen": 0.17877949201143706, + "rewards/margins": 0.2151955620392219, + "rewards/rejected": -0.036416070027784866, + "step": 16 + }, + { + "epoch": 0.004659449088666575, + "grad_norm": 56.5, + "kl": 1.4637140035629272, + "learning_rate": 2.428571428571429e-06, + "logits/chosen": 27894973.866666667, + "logits/rejected": 17511440.0, + "logps/chosen": -555.9555338541667, + "logps/rejected": -407.6745334201389, + "loss": 0.4712, + "rewards/chosen": 0.24269622166951496, + "rewards/margins": 0.24497919090920023, + "rewards/rejected": -0.0022829692396852705, + "step": 17 + }, + { + "epoch": 0.0049335343291763735, + "grad_norm": 47.75, + "kl": 1.1878068447113037, + "learning_rate": 2.571428571428571e-06, + "logits/chosen": 17917548.307692308, + "logits/rejected": 39262219.63636363, + "logps/chosen": -504.28665865384613, + "logps/rejected": -306.9205433238636, + "loss": 0.4678, + "rewards/chosen": 0.29997653227586013, + "rewards/margins": 0.2904134330215988, + "rewards/rejected": 0.009563099254261364, + "step": 18 + }, + { + "epoch": 0.0052076195696861725, + "grad_norm": 48.0, + "kl": 1.7262630462646484, + "learning_rate": 2.7142857142857144e-06, + "logits/chosen": 27008704.0, + "logits/rejected": 41710848.0, + "logps/chosen": -450.87025669642856, + "logps/rejected": -471.009033203125, + "loss": 0.4588, + "rewards/chosen": 0.33586086545671734, + "rewards/margins": 0.3269642270037106, + "rewards/rejected": 0.008896638453006745, + "step": 19 + }, + { + "epoch": 0.005481704810195971, + "grad_norm": 51.5, + "kl": 2.1238677501678467, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": 9376416.0, + "logits/rejected": 1561116.0, + "logps/chosen": -444.3380916819853, + "logps/rejected": -678.6237444196429, + "loss": 0.4709, + "rewards/chosen": 0.33550604651955995, + "rewards/margins": 0.3662089099403189, + "rewards/rejected": -0.030702863420758928, + "step": 20 + }, + { + "epoch": 0.00575579005070577, + "grad_norm": 42.0, + "kl": 1.308091163635254, + "learning_rate": 3e-06, + "logits/chosen": 59523955.2, + "logits/rejected": 14392685.714285715, + "logps/chosen": -408.484619140625, + "logps/rejected": -304.14937918526783, + "loss": 0.4715, + "rewards/chosen": 0.27727417945861815, + "rewards/margins": 0.28442657345107625, + "rewards/rejected": -0.007152393992458071, + "step": 21 + }, + { + "epoch": 0.006029875291215568, + "grad_norm": 46.75, + "kl": 1.8810844421386719, + "learning_rate": 3.142857142857143e-06, + "logits/chosen": 28565369.14285714, + "logits/rejected": 44350809.6, + "logps/chosen": -481.31815011160717, + "logps/rejected": -563.53828125, + "loss": 0.4499, + "rewards/chosen": 0.4027050222669329, + "rewards/margins": 0.45145227568490165, + "rewards/rejected": -0.04874725341796875, + "step": 22 + }, + { + "epoch": 0.006303960531725367, + "grad_norm": 54.0, + "kl": 3.506432056427002, + "learning_rate": 3.285714285714286e-06, + "logits/chosen": 15614805.333333334, + "logits/rejected": 26853038.222222224, + "logps/chosen": -555.7845052083334, + "logps/rejected": -534.6737196180555, + "loss": 0.4343, + "rewards/chosen": 0.5615922292073567, + "rewards/margins": 0.5846288051870134, + "rewards/rejected": -0.023036575979656644, + "step": 23 + }, + { + "epoch": 0.006578045772235165, + "grad_norm": 47.5, + "kl": 2.1787030696868896, + "learning_rate": 3.428571428571429e-06, + "logits/chosen": 13475936.888888888, + "logits/rejected": 8984152.533333333, + "logps/chosen": -514.1055230034722, + "logps/rejected": -519.4493815104166, + "loss": 0.4602, + "rewards/chosen": 0.596612188551161, + "rewards/margins": 0.5181697977913751, + "rewards/rejected": 0.07844239075978597, + "step": 24 + }, + { + "epoch": 0.006852131012744964, + "grad_norm": 47.25, + "kl": 2.2701964378356934, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": 25019580.0, + "logits/rejected": 49364536.0, + "logps/chosen": -464.3586730957031, + "logps/rejected": -473.77532958984375, + "loss": 0.4445, + "rewards/chosen": 0.5936341285705566, + "rewards/margins": 0.5567014440894127, + "rewards/rejected": 0.03693268448114395, + "step": 25 + }, + { + "epoch": 0.007126216253254762, + "grad_norm": 64.5, + "kl": 5.17418909072876, + "learning_rate": 3.7142857142857146e-06, + "logits/chosen": 8441686.588235294, + "logits/rejected": 62357321.14285714, + "logps/chosen": -448.70237821691177, + "logps/rejected": -516.1609235491071, + "loss": 0.4399, + "rewards/chosen": 0.7655358034021714, + "rewards/margins": 0.5591918460461272, + "rewards/rejected": 0.20634395735604422, + "step": 26 + }, + { + "epoch": 0.007400301493764561, + "grad_norm": 40.5, + "kl": 3.3029277324676514, + "learning_rate": 3.857142857142858e-06, + "logits/chosen": 24676644.923076924, + "logits/rejected": 12172162.909090908, + "logps/chosen": -456.43197866586536, + "logps/rejected": -429.5431019176136, + "loss": 0.4261, + "rewards/chosen": 0.6237812042236328, + "rewards/margins": 0.6781343980268999, + "rewards/rejected": -0.054353193803267044, + "step": 27 + }, + { + "epoch": 0.007674386734274359, + "grad_norm": 39.75, + "kl": 4.1008524894714355, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 29518685.714285713, + "logits/rejected": 41061369.6, + "logps/chosen": -404.863037109375, + "logps/rejected": -320.807373046875, + "loss": 0.395, + "rewards/chosen": 0.9848604202270508, + "rewards/margins": 1.0156987398862838, + "rewards/rejected": -0.030838319659233095, + "step": 28 + }, + { + "epoch": 0.007948471974784158, + "grad_norm": 41.0, + "kl": 2.776031970977783, + "learning_rate": 4.1428571428571435e-06, + "logits/chosen": 27716812.8, + "logits/rejected": 30704429.714285713, + "logps/chosen": -508.3095703125, + "logps/rejected": -437.04983956473217, + "loss": 0.3691, + "rewards/chosen": 1.162557888031006, + "rewards/margins": 1.3755822999136789, + "rewards/rejected": -0.213024411882673, + "step": 29 + }, + { + "epoch": 0.008222557215293956, + "grad_norm": 44.75, + "kl": 5.293633460998535, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": 8289705.846153846, + "logits/rejected": 41540800.0, + "logps/chosen": -486.68160306490387, + "logps/rejected": -373.1516779119318, + "loss": 0.4456, + "rewards/chosen": 1.0477979366595929, + "rewards/margins": 1.051271803729184, + "rewards/rejected": -0.0034738670695911755, + "step": 30 + }, + { + "epoch": 0.008496642455803755, + "grad_norm": 42.0, + "kl": 5.815784931182861, + "learning_rate": 4.428571428571429e-06, + "logits/chosen": -3688871.714285714, + "logits/rejected": 18809180.8, + "logps/chosen": -506.93868582589283, + "logps/rejected": -483.13701171875, + "loss": 0.3683, + "rewards/chosen": 1.415254865373884, + "rewards/margins": 1.3565889579909187, + "rewards/rejected": 0.058665907382965087, + "step": 31 + }, + { + "epoch": 0.008770727696313553, + "grad_norm": 40.5, + "kl": 8.473623275756836, + "learning_rate": 4.571428571428572e-06, + "logits/chosen": 35405789.71428572, + "logits/rejected": 30077491.2, + "logps/chosen": -609.8044084821429, + "logps/rejected": -374.19736328125, + "loss": 0.3696, + "rewards/chosen": 1.5361454827444894, + "rewards/margins": 1.4733915669577462, + "rewards/rejected": 0.06275391578674316, + "step": 32 + }, + { + "epoch": 0.009044812936823353, + "grad_norm": 73.5, + "kl": 5.787624835968018, + "learning_rate": 4.714285714285715e-06, + "logits/chosen": 17223326.666666668, + "logits/rejected": 131733930.66666667, + "logps/chosen": -449.9830729166667, + "logps/rejected": -795.5939127604166, + "loss": 0.4131, + "rewards/chosen": 1.0646476745605469, + "rewards/margins": 0.8413174947102865, + "rewards/rejected": 0.2233301798502604, + "step": 33 + }, + { + "epoch": 0.00931889817733315, + "grad_norm": 35.75, + "kl": 4.198353290557861, + "learning_rate": 4.857142857142858e-06, + "logits/chosen": -3141058.6666666665, + "logits/rejected": 10970303.333333334, + "logps/chosen": -414.4723714192708, + "logps/rejected": -447.91064453125, + "loss": 0.3887, + "rewards/chosen": 1.1848435401916504, + "rewards/margins": 1.2730753223101299, + "rewards/rejected": -0.08823178211847942, + "step": 34 + }, + { + "epoch": 0.009592983417842949, + "grad_norm": 34.75, + "kl": 6.246673583984375, + "learning_rate": 5e-06, + "logits/chosen": 19459925.333333332, + "logits/rejected": 29363196.444444444, + "logps/chosen": -479.74423828125, + "logps/rejected": -503.4288736979167, + "loss": 0.2916, + "rewards/chosen": 1.6024121602376302, + "rewards/margins": 2.0010582235124375, + "rewards/rejected": -0.3986460632748074, + "step": 35 + }, + { + "epoch": 0.009867068658352747, + "grad_norm": 32.25, + "kl": 3.9539794921875, + "learning_rate": 5e-06, + "logits/chosen": 12401387.636363637, + "logits/rejected": 16549063.384615384, + "logps/chosen": -373.5501154119318, + "logps/rejected": -312.5066669170673, + "loss": 0.3782, + "rewards/chosen": 1.0413850437511096, + "rewards/margins": 1.3865859491841777, + "rewards/rejected": -0.3452009054330679, + "step": 36 + }, + { + "epoch": 0.010141153898862547, + "grad_norm": 33.0, + "kl": 6.961370944976807, + "learning_rate": 5e-06, + "logits/chosen": 15602493.538461538, + "logits/rejected": 44651985.45454545, + "logps/chosen": -520.2418870192307, + "logps/rejected": -448.80184659090907, + "loss": 0.3531, + "rewards/chosen": 1.7802779857928936, + "rewards/margins": 1.7309738472625091, + "rewards/rejected": 0.04930413853038441, + "step": 37 + }, + { + "epoch": 0.010415239139372345, + "grad_norm": 36.25, + "kl": 7.040739059448242, + "learning_rate": 5e-06, + "logits/chosen": 3953404.5714285714, + "logits/rejected": 16432121.6, + "logps/chosen": -512.9888044084821, + "logps/rejected": -370.351123046875, + "loss": 0.3276, + "rewards/chosen": 1.463531221662249, + "rewards/margins": 1.762472094808306, + "rewards/rejected": -0.2989408731460571, + "step": 38 + }, + { + "epoch": 0.010689324379882143, + "grad_norm": 35.25, + "kl": 13.392007827758789, + "learning_rate": 5e-06, + "logits/chosen": 14701056.0, + "logits/rejected": 41624876.0, + "logps/chosen": -601.6832885742188, + "logps/rejected": -369.640625, + "loss": 0.2529, + "rewards/chosen": 2.621581554412842, + "rewards/margins": 2.705070421099663, + "rewards/rejected": -0.08348886668682098, + "step": 39 + }, + { + "epoch": 0.010963409620391941, + "grad_norm": 33.25, + "kl": 3.6535260677337646, + "learning_rate": 5e-06, + "logits/chosen": 6697520.0, + "logits/rejected": 70534698.66666667, + "logps/chosen": -342.048828125, + "logps/rejected": -490.277099609375, + "loss": 0.3973, + "rewards/chosen": 1.1207311948140461, + "rewards/margins": 1.2539427081743875, + "rewards/rejected": -0.1332115133603414, + "step": 40 + }, + { + "epoch": 0.011237494860901741, + "grad_norm": 30.875, + "kl": 5.0016679763793945, + "learning_rate": 5e-06, + "logits/chosen": 9828660.0, + "logits/rejected": 43756438.85714286, + "logps/chosen": -450.37763671875, + "logps/rejected": -337.4054478236607, + "loss": 0.3206, + "rewards/chosen": 1.7253347396850587, + "rewards/margins": 1.8594807999474663, + "rewards/rejected": -0.13414606026240758, + "step": 41 + }, + { + "epoch": 0.01151158010141154, + "grad_norm": 29.5, + "kl": 8.423419952392578, + "learning_rate": 5e-06, + "logits/chosen": 10457610.666666666, + "logits/rejected": 83750584.8888889, + "logps/chosen": -456.89440104166664, + "logps/rejected": -345.96196831597223, + "loss": 0.2814, + "rewards/chosen": 1.840472412109375, + "rewards/margins": 2.3330771976047093, + "rewards/rejected": -0.4926047854953342, + "step": 42 + }, + { + "epoch": 0.011785665341921337, + "grad_norm": 27.875, + "kl": 9.02241039276123, + "learning_rate": 5e-06, + "logits/chosen": 5947637.142857143, + "logits/rejected": 26396668.8, + "logps/chosen": -516.7006487165179, + "logps/rejected": -685.04111328125, + "loss": 0.2351, + "rewards/chosen": 2.5028574807303294, + "rewards/margins": 2.9223902566092357, + "rewards/rejected": -0.41953277587890625, + "step": 43 + }, + { + "epoch": 0.012059750582431136, + "grad_norm": 29.75, + "kl": 5.592858791351318, + "learning_rate": 5e-06, + "logits/chosen": 33032398.222222224, + "logits/rejected": 42351202.13333333, + "logps/chosen": -465.45941840277777, + "logps/rejected": -427.7013671875, + "loss": 0.3202, + "rewards/chosen": 2.4192110697428384, + "rewards/margins": 2.686837514241536, + "rewards/rejected": -0.26762644449869794, + "step": 44 + }, + { + "epoch": 0.012333835822940935, + "grad_norm": 29.0, + "kl": 3.624695301055908, + "learning_rate": 5e-06, + "logits/chosen": 39387026.28571428, + "logits/rejected": 35902930.823529415, + "logps/chosen": -423.22586495535717, + "logps/rejected": -409.5027286305147, + "loss": 0.3003, + "rewards/chosen": 1.6984740665980749, + "rewards/margins": 2.3937878568633266, + "rewards/rejected": -0.6953137902652516, + "step": 45 + }, + { + "epoch": 0.012607921063450734, + "grad_norm": 38.5, + "kl": 8.498068809509277, + "learning_rate": 5e-06, + "logits/chosen": 23162082.133333333, + "logits/rejected": 69726542.22222222, + "logps/chosen": -444.71106770833336, + "logps/rejected": -387.68402777777777, + "loss": 0.3749, + "rewards/chosen": 1.8610551198323568, + "rewards/margins": 1.9935715039571127, + "rewards/rejected": -0.13251638412475586, + "step": 46 + }, + { + "epoch": 0.012882006303960532, + "grad_norm": 31.875, + "kl": 6.290726661682129, + "learning_rate": 5e-06, + "logits/chosen": 23095166.4, + "logits/rejected": 166788845.7142857, + "logps/chosen": -474.6017578125, + "logps/rejected": -419.70626395089283, + "loss": 0.315, + "rewards/chosen": 1.9880184173583983, + "rewards/margins": 2.792800671713693, + "rewards/rejected": -0.8047822543552944, + "step": 47 + }, + { + "epoch": 0.01315609154447033, + "grad_norm": 25.125, + "kl": 8.459863662719727, + "learning_rate": 5e-06, + "logits/chosen": 6010321.714285715, + "logits/rejected": 93140755.2, + "logps/chosen": -395.0535365513393, + "logps/rejected": -387.6829345703125, + "loss": 0.2301, + "rewards/chosen": 2.4377634865897044, + "rewards/margins": 3.126647077287947, + "rewards/rejected": -0.6888835906982422, + "step": 48 + }, + { + "epoch": 0.013430176784980128, + "grad_norm": 26.75, + "kl": 6.801774501800537, + "learning_rate": 5e-06, + "logits/chosen": 22894212.923076924, + "logits/rejected": 42641245.09090909, + "logps/chosen": -386.40433443509613, + "logps/rejected": -431.5662286931818, + "loss": 0.3159, + "rewards/chosen": 2.003287388728215, + "rewards/margins": 2.004647496280137, + "rewards/rejected": -0.0013601075519214976, + "step": 49 + }, + { + "epoch": 0.013704262025489928, + "grad_norm": 26.375, + "kl": 5.436151027679443, + "learning_rate": 5e-06, + "logits/chosen": 24727900.8, + "logits/rejected": 41986276.571428575, + "logps/chosen": -416.88037109375, + "logps/rejected": -388.4741908482143, + "loss": 0.208, + "rewards/chosen": 3.1476879119873047, + "rewards/margins": 3.8775106157575334, + "rewards/rejected": -0.7298227037702288, + "step": 50 + }, + { + "epoch": 0.013978347265999726, + "grad_norm": 24.25, + "kl": 3.0052928924560547, + "learning_rate": 5e-06, + "logits/chosen": 14159145.333333334, + "logits/rejected": 11019588.666666666, + "logps/chosen": -359.2147623697917, + "logps/rejected": -403.9764811197917, + "loss": 0.2847, + "rewards/chosen": 1.325678030649821, + "rewards/margins": 2.3204700152079267, + "rewards/rejected": -0.9947919845581055, + "step": 51 + }, + { + "epoch": 0.014252432506509524, + "grad_norm": 23.125, + "kl": 5.6787190437316895, + "learning_rate": 5e-06, + "logits/chosen": 42522938.18181818, + "logits/rejected": 16937075.692307692, + "logps/chosen": -460.03981711647725, + "logps/rejected": -322.4841496394231, + "loss": 0.2644, + "rewards/chosen": 2.404735044999556, + "rewards/margins": 3.349144928938859, + "rewards/rejected": -0.9444098839393029, + "step": 52 + }, + { + "epoch": 0.014526517747019322, + "grad_norm": 28.875, + "kl": 7.9893012046813965, + "learning_rate": 5e-06, + "logits/chosen": 32912562.0, + "logits/rejected": 49076544.0, + "logps/chosen": -431.33258056640625, + "logps/rejected": -424.5028381347656, + "loss": 0.2757, + "rewards/chosen": 1.9015986919403076, + "rewards/margins": 2.6104074716567993, + "rewards/rejected": -0.7088087797164917, + "step": 53 + }, + { + "epoch": 0.014800602987529122, + "grad_norm": 28.625, + "kl": 4.304208755493164, + "learning_rate": 5e-06, + "logits/chosen": 23664178.90909091, + "logits/rejected": 12210955.076923076, + "logps/chosen": -536.5038174715909, + "logps/rejected": -494.7594651442308, + "loss": 0.1902, + "rewards/chosen": 2.6065868030894888, + "rewards/margins": 3.5872313226019585, + "rewards/rejected": -0.98064451951247, + "step": 54 + }, + { + "epoch": 0.01507468822803892, + "grad_norm": 27.75, + "kl": 9.380560874938965, + "learning_rate": 5e-06, + "logits/chosen": 23252522.666666668, + "logits/rejected": 79440986.66666667, + "logps/chosen": -431.4990234375, + "logps/rejected": -531.6238199869791, + "loss": 0.2944, + "rewards/chosen": 2.4666922887166343, + "rewards/margins": 3.43416968981425, + "rewards/rejected": -0.9674774010976156, + "step": 55 + }, + { + "epoch": 0.015348773468548719, + "grad_norm": 23.875, + "kl": 4.746701717376709, + "learning_rate": 5e-06, + "logits/chosen": 18870550.153846152, + "logits/rejected": 55028247.27272727, + "logps/chosen": -440.3030348557692, + "logps/rejected": -479.9771839488636, + "loss": 0.2993, + "rewards/chosen": 2.2841456486628604, + "rewards/margins": 3.50526873715274, + "rewards/rejected": -1.2211230884898792, + "step": 56 + }, + { + "epoch": 0.015622858709058517, + "grad_norm": 26.875, + "kl": 1.756322979927063, + "learning_rate": 5e-06, + "logits/chosen": 11870756.0, + "logits/rejected": 48996692.0, + "logps/chosen": -377.71124267578125, + "logps/rejected": -340.19915771484375, + "loss": 0.2706, + "rewards/chosen": 1.803788423538208, + "rewards/margins": 2.7144264578819275, + "rewards/rejected": -0.9106380343437195, + "step": 57 + }, + { + "epoch": 0.015896943949568317, + "grad_norm": 27.125, + "kl": 6.497262477874756, + "learning_rate": 5e-06, + "logits/chosen": 32482503.384615384, + "logits/rejected": 17007783.272727273, + "logps/chosen": -554.7453425480769, + "logps/rejected": -383.93337180397725, + "loss": 0.2213, + "rewards/chosen": 2.4197717813345103, + "rewards/margins": 4.001330875850224, + "rewards/rejected": -1.5815590945157139, + "step": 58 + }, + { + "epoch": 0.016171029190078113, + "grad_norm": 20.875, + "kl": 3.7782459259033203, + "learning_rate": 5e-06, + "logits/chosen": 6875651.636363637, + "logits/rejected": 23413124.923076924, + "logps/chosen": -452.8283025568182, + "logps/rejected": -434.6975285456731, + "loss": 0.1746, + "rewards/chosen": 2.5919872630726206, + "rewards/margins": 4.208964954723012, + "rewards/rejected": -1.6169776916503906, + "step": 59 + }, + { + "epoch": 0.016445114430587913, + "grad_norm": 32.75, + "kl": 2.3381259441375732, + "learning_rate": 5e-06, + "logits/chosen": 25134605.714285713, + "logits/rejected": 19099017.6, + "logps/chosen": -366.24727957589283, + "logps/rejected": -576.251611328125, + "loss": 0.2659, + "rewards/chosen": 1.4540916170392717, + "rewards/margins": 3.568445941380092, + "rewards/rejected": -2.1143543243408205, + "step": 60 + }, + { + "epoch": 0.016719199671097713, + "grad_norm": 23.375, + "kl": 5.8981709480285645, + "learning_rate": 5e-06, + "logits/chosen": 35762795.428571425, + "logits/rejected": 30180304.0, + "logps/chosen": -550.7538713727679, + "logps/rejected": -345.968115234375, + "loss": 0.1862, + "rewards/chosen": 2.724156515938895, + "rewards/margins": 4.091200583321708, + "rewards/rejected": -1.3670440673828126, + "step": 61 + }, + { + "epoch": 0.01699328491160751, + "grad_norm": 23.25, + "kl": 1.3037316799163818, + "learning_rate": 5e-06, + "logits/chosen": 10830165.714285715, + "logits/rejected": 21696726.588235293, + "logps/chosen": -589.4746791294643, + "logps/rejected": -318.08088235294116, + "loss": 0.2419, + "rewards/chosen": 2.2175776617867604, + "rewards/margins": 3.621944435504304, + "rewards/rejected": -1.4043667737175436, + "step": 62 + }, + { + "epoch": 0.01726737015211731, + "grad_norm": 23.25, + "kl": 7.365187168121338, + "learning_rate": 5e-06, + "logits/chosen": 11966174.857142856, + "logits/rejected": 38944505.6, + "logps/chosen": -534.6565987723214, + "logps/rejected": -355.68125, + "loss": 0.2422, + "rewards/chosen": 3.1841708592006137, + "rewards/margins": 4.426242773873465, + "rewards/rejected": -1.2420719146728516, + "step": 63 + }, + { + "epoch": 0.017541455392627105, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1307868.7272727273, + "logits/rejected": 23363773.53846154, + "logps/chosen": -502.49027876420456, + "logps/rejected": -584.9458383413462, + "loss": 0.1503, + "rewards/chosen": 2.612940874966708, + "rewards/margins": 4.932725772991047, + "rewards/rejected": -2.3197848980243387, + "step": 64 + }, + { + "epoch": 0.017815540633136905, + "grad_norm": 26.25, + "kl": 5.363725662231445, + "learning_rate": 5e-06, + "logits/chosen": 18424075.076923076, + "logits/rejected": 36056811.63636363, + "logps/chosen": -518.3754507211538, + "logps/rejected": -289.0869806463068, + "loss": 0.224, + "rewards/chosen": 2.8196372985839844, + "rewards/margins": 3.857002691789107, + "rewards/rejected": -1.0373653932051226, + "step": 65 + }, + { + "epoch": 0.018089625873646705, + "grad_norm": 21.875, + "kl": 3.846811294555664, + "learning_rate": 5e-06, + "logits/chosen": -306209.53846153844, + "logits/rejected": 35297736.72727273, + "logps/chosen": -457.67074819711536, + "logps/rejected": -548.7908824573864, + "loss": 0.1629, + "rewards/chosen": 2.3135832273043118, + "rewards/margins": 5.336668734783893, + "rewards/rejected": -3.023085507479581, + "step": 66 + }, + { + "epoch": 0.0183637111141565, + "grad_norm": 20.875, + "kl": 3.4063804149627686, + "learning_rate": 5e-06, + "logits/chosen": 27097432.888888888, + "logits/rejected": 18730331.733333334, + "logps/chosen": -509.33452690972223, + "logps/rejected": -357.07366536458335, + "loss": 0.1752, + "rewards/chosen": 3.244341108534071, + "rewards/margins": 4.878824827406142, + "rewards/rejected": -1.6344837188720702, + "step": 67 + }, + { + "epoch": 0.0186377963546663, + "grad_norm": 19.0, + "kl": 4.6208343505859375, + "learning_rate": 5e-06, + "logits/chosen": 29549668.923076924, + "logits/rejected": 10042684.363636363, + "logps/chosen": -562.0656926081731, + "logps/rejected": -430.71604225852275, + "loss": 0.1512, + "rewards/chosen": 3.3743045513446512, + "rewards/margins": 5.721220403284459, + "rewards/rejected": -2.346915851939808, + "step": 68 + }, + { + "epoch": 0.0189118815951761, + "grad_norm": 18.25, + "kl": 1.4971065521240234, + "learning_rate": 5e-06, + "logits/chosen": 39023526.4, + "logits/rejected": 18489563.42857143, + "logps/chosen": -499.4744140625, + "logps/rejected": -460.47719029017856, + "loss": 0.1333, + "rewards/chosen": 2.648311996459961, + "rewards/margins": 5.496648897443499, + "rewards/rejected": -2.8483369009835378, + "step": 69 + }, + { + "epoch": 0.019185966835685898, + "grad_norm": 19.625, + "kl": 3.564007520675659, + "learning_rate": 5e-06, + "logits/chosen": 63498048.0, + "logits/rejected": 19260384.0, + "logps/chosen": -493.9431966145833, + "logps/rejected": -405.7475179036458, + "loss": 0.1998, + "rewards/chosen": 2.537106513977051, + "rewards/margins": 5.008331616719564, + "rewards/rejected": -2.471225102742513, + "step": 70 + }, + { + "epoch": 0.019460052076195698, + "grad_norm": 23.625, + "kl": 1.2855949401855469, + "learning_rate": 5e-06, + "logits/chosen": 2550151.777777778, + "logits/rejected": 34357211.733333334, + "logps/chosen": -482.25238715277777, + "logps/rejected": -471.84742838541666, + "loss": 0.1745, + "rewards/chosen": 2.126002417670356, + "rewards/margins": 4.086680518256293, + "rewards/rejected": -1.9606781005859375, + "step": 71 + }, + { + "epoch": 0.019734137316705494, + "grad_norm": 20.875, + "kl": 6.143612384796143, + "learning_rate": 5e-06, + "logits/chosen": 34628038.4, + "logits/rejected": 59811556.571428575, + "logps/chosen": -456.66435546875, + "logps/rejected": -355.07338169642856, + "loss": 0.1924, + "rewards/chosen": 3.100025939941406, + "rewards/margins": 4.7760104588099885, + "rewards/rejected": -1.6759845188685827, + "step": 72 + }, + { + "epoch": 0.020008222557215294, + "grad_norm": 17.5, + "kl": 2.6417174339294434, + "learning_rate": 5e-06, + "logits/chosen": 6316999.428571428, + "logits/rejected": 22775555.2, + "logps/chosen": -441.87486049107144, + "logps/rejected": -544.7041015625, + "loss": 0.141, + "rewards/chosen": 2.3974432264055525, + "rewards/margins": 5.558333614894321, + "rewards/rejected": -3.1608903884887694, + "step": 73 + }, + { + "epoch": 0.020282307797725094, + "grad_norm": 19.0, + "kl": 4.240756034851074, + "learning_rate": 5e-06, + "logits/chosen": -6296676.4, + "logits/rejected": 12588715.42857143, + "logps/chosen": -471.886865234375, + "logps/rejected": -306.11488560267856, + "loss": 0.245, + "rewards/chosen": 2.7338836669921873, + "rewards/margins": 4.121359089442661, + "rewards/rejected": -1.3874754224504744, + "step": 74 + }, + { + "epoch": 0.02055639303823489, + "grad_norm": 29.0, + "kl": 1.6686592102050781, + "learning_rate": 5e-06, + "logits/chosen": 29766542.545454547, + "logits/rejected": 48010092.307692304, + "logps/chosen": -468.88485440340907, + "logps/rejected": -411.26318359375, + "loss": 0.2153, + "rewards/chosen": 2.3088212446732954, + "rewards/margins": 4.004663107278464, + "rewards/rejected": -1.6958418626051683, + "step": 75 + }, + { + "epoch": 0.02083047827874469, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 23169942.4, + "logits/rejected": 36346192.0, + "logps/chosen": -409.9318359375, + "logps/rejected": -504.4545200892857, + "loss": 0.1852, + "rewards/chosen": 2.074100685119629, + "rewards/margins": 4.416526167733329, + "rewards/rejected": -2.3424254826136996, + "step": 76 + }, + { + "epoch": 0.021104563519254486, + "grad_norm": 18.75, + "kl": 0.6364943385124207, + "learning_rate": 5e-06, + "logits/chosen": 4080342.222222222, + "logits/rejected": 32010973.866666667, + "logps/chosen": -589.8191189236111, + "logps/rejected": -349.69635416666665, + "loss": 0.1676, + "rewards/chosen": 2.507464302910699, + "rewards/margins": 4.6828839196099175, + "rewards/rejected": -2.1754196166992186, + "step": 77 + }, + { + "epoch": 0.021378648759764286, + "grad_norm": 18.375, + "kl": 6.328530311584473, + "learning_rate": 5e-06, + "logits/chosen": 26068838.4, + "logits/rejected": 35921095.11111111, + "logps/chosen": -548.1175130208334, + "logps/rejected": -397.6599934895833, + "loss": 0.1485, + "rewards/chosen": 3.4052574157714846, + "rewards/margins": 5.856091690063477, + "rewards/rejected": -2.450834274291992, + "step": 78 + }, + { + "epoch": 0.021652734000274086, + "grad_norm": 25.0, + "kl": 2.5518507957458496, + "learning_rate": 5e-06, + "logits/chosen": 15718385.6, + "logits/rejected": 34627062.85714286, + "logps/chosen": -325.064306640625, + "logps/rejected": -577.972412109375, + "loss": 0.2106, + "rewards/chosen": 1.8185646057128906, + "rewards/margins": 3.918954631260463, + "rewards/rejected": -2.1003900255475725, + "step": 79 + }, + { + "epoch": 0.021926819240783883, + "grad_norm": 35.0, + "kl": 11.106085777282715, + "learning_rate": 5e-06, + "logits/chosen": 53865472.0, + "logits/rejected": 18882939.2, + "logps/chosen": -450.5677939967105, + "logps/rejected": -463.009033203125, + "loss": 0.307, + "rewards/chosen": 2.154160348992599, + "rewards/margins": 4.554188196282638, + "rewards/rejected": -2.400027847290039, + "step": 80 + }, + { + "epoch": 0.022200904481293682, + "grad_norm": 24.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14250960.0, + "logits/rejected": 26650525.09090909, + "logps/chosen": -570.2341871995193, + "logps/rejected": -534.8829900568181, + "loss": 0.1614, + "rewards/chosen": 2.611414982722356, + "rewards/margins": 6.015355703714011, + "rewards/rejected": -3.403940720991655, + "step": 81 + }, + { + "epoch": 0.022474989721803482, + "grad_norm": 13.375, + "kl": 0.5885226130485535, + "learning_rate": 5e-06, + "logits/chosen": 129458488.8888889, + "logits/rejected": 13414014.933333334, + "logps/chosen": -588.1131184895834, + "logps/rejected": -536.0904947916666, + "loss": 0.1192, + "rewards/chosen": 2.83927239312066, + "rewards/margins": 7.3524798923068575, + "rewards/rejected": -4.513207499186198, + "step": 82 + }, + { + "epoch": 0.02274907496231328, + "grad_norm": 24.75, + "kl": 1.3537757396697998, + "learning_rate": 5e-06, + "logits/chosen": 10850793.846153846, + "logits/rejected": 26208386.90909091, + "logps/chosen": -416.8234675480769, + "logps/rejected": -359.08121004971593, + "loss": 0.2038, + "rewards/chosen": 2.3124424861027646, + "rewards/margins": 4.027981031191099, + "rewards/rejected": -1.7155385450883345, + "step": 83 + }, + { + "epoch": 0.02302316020282308, + "grad_norm": 19.75, + "kl": 6.049086570739746, + "learning_rate": 5e-06, + "logits/chosen": -19335779.2, + "logits/rejected": 18661692.57142857, + "logps/chosen": -463.2177734375, + "logps/rejected": -422.97119140625, + "loss": 0.153, + "rewards/chosen": 3.801902008056641, + "rewards/margins": 6.535798263549805, + "rewards/rejected": -2.733896255493164, + "step": 84 + }, + { + "epoch": 0.023297245443332875, + "grad_norm": 20.875, + "kl": 3.8662221431732178, + "learning_rate": 5e-06, + "logits/chosen": 1038996.0, + "logits/rejected": 47304912.0, + "logps/chosen": -522.1352132161459, + "logps/rejected": -414.3451741536458, + "loss": 0.1852, + "rewards/chosen": 2.422243277231852, + "rewards/margins": 5.102021058400472, + "rewards/rejected": -2.6797777811686196, + "step": 85 + }, + { + "epoch": 0.023571330683842675, + "grad_norm": 17.5, + "kl": 4.065312385559082, + "learning_rate": 5e-06, + "logits/chosen": 18186856.727272727, + "logits/rejected": 36126552.615384616, + "logps/chosen": -410.13503196022725, + "logps/rejected": -509.38179837740387, + "loss": 0.1571, + "rewards/chosen": 2.93945971402255, + "rewards/margins": 6.152276392583246, + "rewards/rejected": -3.212816678560697, + "step": 86 + }, + { + "epoch": 0.023845415924352475, + "grad_norm": 21.625, + "kl": 9.968841552734375, + "learning_rate": 5e-06, + "logits/chosen": 50234496.0, + "logits/rejected": 33597867.63636363, + "logps/chosen": -588.9787409855769, + "logps/rejected": -430.63649680397725, + "loss": 0.1905, + "rewards/chosen": 3.467991755558894, + "rewards/margins": 5.458204629537942, + "rewards/rejected": -1.9902128739790483, + "step": 87 + }, + { + "epoch": 0.02411950116486227, + "grad_norm": 21.75, + "kl": 3.625864028930664, + "learning_rate": 5e-06, + "logits/chosen": 32464818.666666668, + "logits/rejected": 21559377.333333332, + "logps/chosen": -589.9312337239584, + "logps/rejected": -402.20849609375, + "loss": 0.1362, + "rewards/chosen": 2.6200315157572427, + "rewards/margins": 5.054628690083821, + "rewards/rejected": -2.4345971743265786, + "step": 88 + }, + { + "epoch": 0.02439358640537207, + "grad_norm": 17.875, + "kl": 6.760239601135254, + "learning_rate": 5e-06, + "logits/chosen": 8786205.090909092, + "logits/rejected": 9406712.615384616, + "logps/chosen": -498.74369673295456, + "logps/rejected": -358.5177659254808, + "loss": 0.1458, + "rewards/chosen": 3.0020314996892754, + "rewards/margins": 5.687044903948591, + "rewards/rejected": -2.685013404259315, + "step": 89 + }, + { + "epoch": 0.02466767164588187, + "grad_norm": 22.75, + "kl": 7.913074016571045, + "learning_rate": 5e-06, + "logits/chosen": 18659120.0, + "logits/rejected": 12018938.0, + "logps/chosen": -582.2000122070312, + "logps/rejected": -444.6913757324219, + "loss": 0.1693, + "rewards/chosen": 3.051455020904541, + "rewards/margins": 5.743775129318237, + "rewards/rejected": -2.6923201084136963, + "step": 90 + }, + { + "epoch": 0.024941756886391667, + "grad_norm": 21.5, + "kl": 3.418027877807617, + "learning_rate": 5e-06, + "logits/chosen": 17872997.818181816, + "logits/rejected": 16715724.307692308, + "logps/chosen": -365.46872780539775, + "logps/rejected": -434.9141376201923, + "loss": 0.1837, + "rewards/chosen": 2.5711728876287285, + "rewards/margins": 4.873141828950468, + "rewards/rejected": -2.3019689413217397, + "step": 91 + }, + { + "epoch": 0.025215842126901467, + "grad_norm": 23.0, + "kl": 2.7574374675750732, + "learning_rate": 5e-06, + "logits/chosen": 17689121.14285714, + "logits/rejected": 13743030.4, + "logps/chosen": -454.55569893973217, + "logps/rejected": -564.399365234375, + "loss": 0.1495, + "rewards/chosen": 2.0093439647129605, + "rewards/margins": 5.262865311758858, + "rewards/rejected": -3.2535213470458983, + "step": 92 + }, + { + "epoch": 0.025489927367411264, + "grad_norm": 16.875, + "kl": 4.220672607421875, + "learning_rate": 5e-06, + "logits/chosen": 9335516.57142857, + "logits/rejected": 14761451.2, + "logps/chosen": -433.75697544642856, + "logps/rejected": -447.04755859375, + "loss": 0.1231, + "rewards/chosen": 2.8768509456089566, + "rewards/margins": 5.450325448172434, + "rewards/rejected": -2.5734745025634767, + "step": 93 + }, + { + "epoch": 0.025764012607921064, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 702501.8181818182, + "logits/rejected": 25841501.53846154, + "logps/chosen": -543.1882102272727, + "logps/rejected": -379.1970027043269, + "loss": 0.1393, + "rewards/chosen": 2.313337499445135, + "rewards/margins": 5.381938080687623, + "rewards/rejected": -3.068600581242488, + "step": 94 + }, + { + "epoch": 0.026038097848430863, + "grad_norm": 16.25, + "kl": 5.628309726715088, + "learning_rate": 5e-06, + "logits/chosen": 24624981.333333332, + "logits/rejected": 22273497.333333332, + "logps/chosen": -557.425048828125, + "logps/rejected": -445.7806803385417, + "loss": 0.1375, + "rewards/chosen": 3.868370691935221, + "rewards/margins": 6.180222670237223, + "rewards/rejected": -2.311851978302002, + "step": 95 + }, + { + "epoch": 0.02631218308894066, + "grad_norm": 20.125, + "kl": 2.1607184410095215, + "learning_rate": 5e-06, + "logits/chosen": 12384234.666666666, + "logits/rejected": 29940743.111111112, + "logps/chosen": -479.4551106770833, + "logps/rejected": -506.44135199652777, + "loss": 0.1065, + "rewards/chosen": 3.095289103190104, + "rewards/margins": 7.713512929280599, + "rewards/rejected": -4.618223826090495, + "step": 96 + }, + { + "epoch": 0.02658626832945046, + "grad_norm": 20.75, + "kl": 1.8822358846664429, + "learning_rate": 5e-06, + "logits/chosen": 28964878.769230768, + "logits/rejected": 19372717.09090909, + "logps/chosen": -560.0998722956731, + "logps/rejected": -548.4446466619319, + "loss": 0.1226, + "rewards/chosen": 2.9954311664287863, + "rewards/margins": 7.5494524815699435, + "rewards/rejected": -4.554021315141157, + "step": 97 + }, + { + "epoch": 0.026860353569960256, + "grad_norm": 14.875, + "kl": 6.619040012359619, + "learning_rate": 5e-06, + "logits/chosen": 16958804.57142857, + "logits/rejected": 18586472.0, + "logps/chosen": -469.22607421875, + "logps/rejected": -429.299853515625, + "loss": 0.1171, + "rewards/chosen": 3.992417199271066, + "rewards/margins": 6.345064789908273, + "rewards/rejected": -2.352647590637207, + "step": 98 + }, + { + "epoch": 0.027134438810470056, + "grad_norm": 16.5, + "kl": 4.733872890472412, + "learning_rate": 5e-06, + "logits/chosen": 15031603.555555556, + "logits/rejected": 16353557.333333334, + "logps/chosen": -440.88495551215277, + "logps/rejected": -495.40244140625, + "loss": 0.1167, + "rewards/chosen": 3.6917800903320312, + "rewards/margins": 6.683789316813151, + "rewards/rejected": -2.99200922648112, + "step": 99 + }, + { + "epoch": 0.027408524050979856, + "grad_norm": 14.5, + "kl": 5.418939590454102, + "learning_rate": 5e-06, + "logits/chosen": 33002867.692307692, + "logits/rejected": 6804550.545454546, + "logps/chosen": -463.97152944711536, + "logps/rejected": -346.8890935724432, + "loss": 0.1094, + "rewards/chosen": 3.111212216890775, + "rewards/margins": 5.815786455061052, + "rewards/rejected": -2.704574238170277, + "step": 100 + }, + { + "epoch": 0.027682609291489652, + "grad_norm": 23.0, + "kl": 7.328195095062256, + "learning_rate": 5e-06, + "logits/chosen": 25669506.285714287, + "logits/rejected": 14558977.6, + "logps/chosen": -482.9292689732143, + "logps/rejected": -398.1857421875, + "loss": 0.1322, + "rewards/chosen": 2.8205160413469588, + "rewards/margins": 6.174603870936803, + "rewards/rejected": -3.3540878295898438, + "step": 101 + }, + { + "epoch": 0.027956694531999452, + "grad_norm": 12.125, + "kl": 1.2081178426742554, + "learning_rate": 5e-06, + "logits/chosen": 8745844.8, + "logits/rejected": 11674523.42857143, + "logps/chosen": -548.87998046875, + "logps/rejected": -578.4268275669643, + "loss": 0.0915, + "rewards/chosen": 3.4765869140625, + "rewards/margins": 7.697880227225168, + "rewards/rejected": -4.2212933131626675, + "step": 102 + }, + { + "epoch": 0.028230779772509252, + "grad_norm": 21.5, + "kl": 9.375618934631348, + "learning_rate": 5e-06, + "logits/chosen": 18404303.05882353, + "logits/rejected": 16273563.42857143, + "logps/chosen": -496.83582261029414, + "logps/rejected": -503.66043526785717, + "loss": 0.1859, + "rewards/chosen": 3.0863616045783546, + "rewards/margins": 6.005081849939683, + "rewards/rejected": -2.918720245361328, + "step": 103 + }, + { + "epoch": 0.02850486501301905, + "grad_norm": 12.625, + "kl": 0.12923431396484375, + "learning_rate": 5e-06, + "logits/chosen": 30547623.111111112, + "logits/rejected": 17241509.333333332, + "logps/chosen": -517.9462890625, + "logps/rejected": -531.9221354166667, + "loss": 0.0915, + "rewards/chosen": 3.109615961710612, + "rewards/margins": 6.860518264770508, + "rewards/rejected": -3.750902303059896, + "step": 104 + }, + { + "epoch": 0.02877895025352885, + "grad_norm": 19.75, + "kl": 3.369232416152954, + "learning_rate": 5e-06, + "logits/chosen": 2220591.2, + "logits/rejected": 17177297.777777776, + "logps/chosen": -372.38893229166666, + "logps/rejected": -648.5984157986111, + "loss": 0.1588, + "rewards/chosen": 2.7208231608072917, + "rewards/margins": 6.64911872016059, + "rewards/rejected": -3.9282955593532987, + "step": 105 + }, + { + "epoch": 0.029053035494038645, + "grad_norm": 18.25, + "kl": 4.21042013168335, + "learning_rate": 5e-06, + "logits/chosen": 7505625.454545454, + "logits/rejected": 30354678.153846152, + "logps/chosen": -345.70751953125, + "logps/rejected": -517.5760216346154, + "loss": 0.178, + "rewards/chosen": 3.0852737426757812, + "rewards/margins": 5.952566293569712, + "rewards/rejected": -2.8672925508939304, + "step": 106 + }, + { + "epoch": 0.029327120734548445, + "grad_norm": 17.5, + "kl": 2.262537717819214, + "learning_rate": 5e-06, + "logits/chosen": 5833050.181818182, + "logits/rejected": 6948367.384615385, + "logps/chosen": -341.8981267755682, + "logps/rejected": -381.22641225961536, + "loss": 0.1836, + "rewards/chosen": 2.06175405328924, + "rewards/margins": 4.966436799589571, + "rewards/rejected": -2.9046827463003306, + "step": 107 + }, + { + "epoch": 0.029601205975058244, + "grad_norm": 20.375, + "kl": 4.341683864593506, + "learning_rate": 5e-06, + "logits/chosen": 17089812.266666666, + "logits/rejected": 52581479.11111111, + "logps/chosen": -442.5478515625, + "logps/rejected": -360.9601779513889, + "loss": 0.1707, + "rewards/chosen": 2.5851648966471354, + "rewards/margins": 4.845933787027995, + "rewards/rejected": -2.2607688903808594, + "step": 108 + }, + { + "epoch": 0.02987529121556804, + "grad_norm": 18.625, + "kl": 1.1561189889907837, + "learning_rate": 5e-06, + "logits/chosen": 10022530.666666666, + "logits/rejected": 13987117.866666667, + "logps/chosen": -378.71031358506946, + "logps/rejected": -383.22239583333334, + "loss": 0.1515, + "rewards/chosen": 3.521940231323242, + "rewards/margins": 5.98422482808431, + "rewards/rejected": -2.4622845967610676, + "step": 109 + }, + { + "epoch": 0.03014937645607784, + "grad_norm": 27.125, + "kl": 4.705845355987549, + "learning_rate": 5e-06, + "logits/chosen": 57224857.6, + "logits/rejected": 25194999.111111112, + "logps/chosen": -399.91888020833335, + "logps/rejected": -493.31005859375, + "loss": 0.208, + "rewards/chosen": 2.0417269388834636, + "rewards/margins": 5.394904327392578, + "rewards/rejected": -3.3531773885091147, + "step": 110 + }, + { + "epoch": 0.030423461696587637, + "grad_norm": 13.5, + "kl": 1.9837684631347656, + "learning_rate": 5e-06, + "logits/chosen": 1804728.7272727273, + "logits/rejected": 29268716.307692308, + "logps/chosen": -444.46102627840907, + "logps/rejected": -500.3703425480769, + "loss": 0.0879, + "rewards/chosen": 4.136935147372159, + "rewards/margins": 7.404892981469215, + "rewards/rejected": -3.2679578340970554, + "step": 111 + }, + { + "epoch": 0.030697546937097437, + "grad_norm": 15.75, + "kl": 5.39693021774292, + "learning_rate": 5e-06, + "logits/chosen": 7782838.222222222, + "logits/rejected": 14582045.866666667, + "logps/chosen": -467.0935329861111, + "logps/rejected": -428.77233072916664, + "loss": 0.151, + "rewards/chosen": 3.5055544111463757, + "rewards/margins": 6.503260252210829, + "rewards/rejected": -2.997705841064453, + "step": 112 + }, + { + "epoch": 0.030971632177607237, + "grad_norm": 17.875, + "kl": 1.4776777029037476, + "learning_rate": 5e-06, + "logits/chosen": 3173813.777777778, + "logits/rejected": 21916565.333333332, + "logps/chosen": -455.9449869791667, + "logps/rejected": -511.13046875, + "loss": 0.0918, + "rewards/chosen": 3.0507030487060547, + "rewards/margins": 6.5917705535888675, + "rewards/rejected": -3.5410675048828124, + "step": 113 + }, + { + "epoch": 0.031245717418117033, + "grad_norm": 19.875, + "kl": 7.317265510559082, + "learning_rate": 5e-06, + "logits/chosen": 23529253.333333332, + "logits/rejected": 12044717.333333334, + "logps/chosen": -563.69921875, + "logps/rejected": -316.1934000651042, + "loss": 0.0988, + "rewards/chosen": 4.081796010335286, + "rewards/margins": 6.754910469055176, + "rewards/rejected": -2.673114458719889, + "step": 114 + }, + { + "epoch": 0.03151980265862683, + "grad_norm": 18.25, + "kl": 10.545022964477539, + "learning_rate": 5e-06, + "logits/chosen": 21121748.0, + "logits/rejected": 1008005.5, + "logps/chosen": -436.3394775390625, + "logps/rejected": -424.2666931152344, + "loss": 0.1042, + "rewards/chosen": 3.8913726806640625, + "rewards/margins": 7.59371280670166, + "rewards/rejected": -3.7023401260375977, + "step": 115 + }, + { + "epoch": 0.03179388789913663, + "grad_norm": 18.5, + "kl": 1.9868406057357788, + "learning_rate": 5e-06, + "logits/chosen": -1585722.6666666667, + "logits/rejected": 32733413.333333332, + "logps/chosen": -437.7894287109375, + "logps/rejected": -594.1539306640625, + "loss": 0.1356, + "rewards/chosen": 3.285210609436035, + "rewards/margins": 6.36833381652832, + "rewards/rejected": -3.083123207092285, + "step": 116 + }, + { + "epoch": 0.03206797313964643, + "grad_norm": 22.125, + "kl": 7.9134345054626465, + "learning_rate": 5e-06, + "logits/chosen": -2366885.714285714, + "logits/rejected": 21332201.6, + "logps/chosen": -428.76171875, + "logps/rejected": -434.93095703125, + "loss": 0.1699, + "rewards/chosen": 3.383405957903181, + "rewards/margins": 5.333840833391462, + "rewards/rejected": -1.9504348754882812, + "step": 117 + }, + { + "epoch": 0.032342058380156226, + "grad_norm": 14.125, + "kl": 0.39247769117355347, + "learning_rate": 5e-06, + "logits/chosen": 9108326.0, + "logits/rejected": 7130486.0, + "logps/chosen": -414.0350748697917, + "logps/rejected": -483.5046793619792, + "loss": 0.1336, + "rewards/chosen": 2.9867026011149087, + "rewards/margins": 6.826655387878418, + "rewards/rejected": -3.8399527867635093, + "step": 118 + }, + { + "epoch": 0.032616143620666026, + "grad_norm": 11.125, + "kl": 5.072084903717041, + "learning_rate": 5e-06, + "logits/chosen": 2911688.0, + "logits/rejected": 5918937.6, + "logps/chosen": -601.43359375, + "logps/rejected": -442.8593424479167, + "loss": 0.0527, + "rewards/chosen": 4.709318372938368, + "rewards/margins": 7.925303480360243, + "rewards/rejected": -3.215985107421875, + "step": 119 + }, + { + "epoch": 0.032890228861175826, + "grad_norm": 16.75, + "kl": 2.4859352111816406, + "learning_rate": 5e-06, + "logits/chosen": 7471055.0, + "logits/rejected": 91416704.0, + "logps/chosen": -404.43719482421875, + "logps/rejected": -426.8681945800781, + "loss": 0.153, + "rewards/chosen": 3.0623435974121094, + "rewards/margins": 5.152045249938965, + "rewards/rejected": -2.0897016525268555, + "step": 120 + }, + { + "epoch": 0.033164314101685625, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -588565.0, + "logits/rejected": 104797656.0, + "logps/chosen": -430.85693359375, + "logps/rejected": -511.8392333984375, + "loss": 0.0865, + "rewards/chosen": 3.0163121223449707, + "rewards/margins": 7.520627498626709, + "rewards/rejected": -4.504315376281738, + "step": 121 + }, + { + "epoch": 0.033438399342195425, + "grad_norm": 17.625, + "kl": 4.316936492919922, + "learning_rate": 5e-06, + "logits/chosen": 1090698.1818181819, + "logits/rejected": 17009662.769230768, + "logps/chosen": -461.2551935369318, + "logps/rejected": -535.8484074519231, + "loss": 0.1504, + "rewards/chosen": 3.0159277482466265, + "rewards/margins": 7.051921764453809, + "rewards/rejected": -4.035994016207182, + "step": 122 + }, + { + "epoch": 0.03371248458270522, + "grad_norm": 11.75, + "kl": 2.7255916595458984, + "learning_rate": 5e-06, + "logits/chosen": 20351586.666666668, + "logits/rejected": 31297052.444444444, + "logps/chosen": -388.5260416666667, + "logps/rejected": -467.99083116319446, + "loss": 0.0845, + "rewards/chosen": 3.066335995992025, + "rewards/margins": 7.254668023851183, + "rewards/rejected": -4.188332027859158, + "step": 123 + }, + { + "epoch": 0.03398656982321502, + "grad_norm": 13.0625, + "kl": 4.989921569824219, + "learning_rate": 5e-06, + "logits/chosen": 39528119.27272727, + "logits/rejected": 7553741.538461538, + "logps/chosen": -429.36328125, + "logps/rejected": -439.1651141826923, + "loss": 0.1116, + "rewards/chosen": 3.1653626181862573, + "rewards/margins": 7.152938149192117, + "rewards/rejected": -3.9875755310058594, + "step": 124 + }, + { + "epoch": 0.03426065506372482, + "grad_norm": 13.6875, + "kl": 6.987547874450684, + "learning_rate": 5e-06, + "logits/chosen": 26046222.769230768, + "logits/rejected": 15754682.181818182, + "logps/chosen": -382.44095552884613, + "logps/rejected": -461.89506392045456, + "loss": 0.1158, + "rewards/chosen": 3.8392243018517127, + "rewards/margins": 7.147886022821173, + "rewards/rejected": -3.3086617209694604, + "step": 125 + }, + { + "epoch": 0.03453474030423462, + "grad_norm": 22.75, + "kl": 10.74423885345459, + "learning_rate": 5e-06, + "logits/chosen": 6657704.470588235, + "logits/rejected": 55314368.0, + "logps/chosen": -454.2903837316176, + "logps/rejected": -528.1566336495536, + "loss": 0.17, + "rewards/chosen": 3.3222797618192783, + "rewards/margins": 7.636172126321231, + "rewards/rejected": -4.313892364501953, + "step": 126 + }, + { + "epoch": 0.03480882554474442, + "grad_norm": 13.875, + "kl": 3.957669734954834, + "learning_rate": 5e-06, + "logits/chosen": 44045595.428571425, + "logits/rejected": 4808115.2, + "logps/chosen": -515.1978934151786, + "logps/rejected": -412.98388671875, + "loss": 0.1088, + "rewards/chosen": 3.3976309640066966, + "rewards/margins": 6.785617337908064, + "rewards/rejected": -3.3879863739013674, + "step": 127 + }, + { + "epoch": 0.03508291078525421, + "grad_norm": 16.75, + "kl": 2.2041454315185547, + "learning_rate": 5e-06, + "logits/chosen": -5353247.333333333, + "logits/rejected": -7262472.0, + "logps/chosen": -509.7245279947917, + "logps/rejected": -368.511474609375, + "loss": 0.1225, + "rewards/chosen": 3.8793627421061196, + "rewards/margins": 5.869749228159586, + "rewards/rejected": -1.9903864860534668, + "step": 128 + }, + { + "epoch": 0.03535699602576401, + "grad_norm": 15.75, + "kl": 10.216436386108398, + "learning_rate": 5e-06, + "logits/chosen": 4459213.866666666, + "logits/rejected": 50977038.222222224, + "logps/chosen": -469.7241536458333, + "logps/rejected": -501.19097222222223, + "loss": 0.1251, + "rewards/chosen": 3.3956616719563804, + "rewards/margins": 6.54650158352322, + "rewards/rejected": -3.15083991156684, + "step": 129 + }, + { + "epoch": 0.03563108126627381, + "grad_norm": 11.5625, + "kl": 1.150543212890625, + "learning_rate": 5e-06, + "logits/chosen": 1555752.3076923077, + "logits/rejected": 101963810.9090909, + "logps/chosen": -356.1366436298077, + "logps/rejected": -525.0433682528409, + "loss": 0.0814, + "rewards/chosen": 2.7951507568359375, + "rewards/margins": 7.363092595880682, + "rewards/rejected": -4.567941839044744, + "step": 130 + }, + { + "epoch": 0.03590516650678361, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 46374665.84615385, + "logits/rejected": 6027564.363636363, + "logps/chosen": -459.18118990384613, + "logps/rejected": -469.4314630681818, + "loss": 0.0739, + "rewards/chosen": 2.945754124568059, + "rewards/margins": 7.130077522117775, + "rewards/rejected": -4.184323397549716, + "step": 131 + }, + { + "epoch": 0.03617925174729341, + "grad_norm": 17.125, + "kl": 7.387209892272949, + "learning_rate": 5e-06, + "logits/chosen": -3445882.8, + "logits/rejected": 30769424.0, + "logps/chosen": -465.807177734375, + "logps/rejected": -549.3658272879464, + "loss": 0.1, + "rewards/chosen": 3.2449859619140624, + "rewards/margins": 7.239930125645229, + "rewards/rejected": -3.9949441637311662, + "step": 132 + }, + { + "epoch": 0.03645333698780321, + "grad_norm": 18.75, + "kl": 5.410910606384277, + "learning_rate": 5e-06, + "logits/chosen": -4222018.909090909, + "logits/rejected": 8481665.23076923, + "logps/chosen": -450.92085404829544, + "logps/rejected": -425.76607572115387, + "loss": 0.171, + "rewards/chosen": 3.3744205128062856, + "rewards/margins": 7.00080583479021, + "rewards/rejected": -3.6263853219839244, + "step": 133 + }, + { + "epoch": 0.036727422228313, + "grad_norm": 25.25, + "kl": 1.5340436697006226, + "learning_rate": 5e-06, + "logits/chosen": 30840853.333333332, + "logits/rejected": 18268949.333333332, + "logps/chosen": -515.9419352213541, + "logps/rejected": -336.60813395182294, + "loss": 0.1652, + "rewards/chosen": 2.9756199518839517, + "rewards/margins": 5.059343655904134, + "rewards/rejected": -2.083723704020182, + "step": 134 + }, + { + "epoch": 0.0370015074688228, + "grad_norm": 22.875, + "kl": 10.762727737426758, + "learning_rate": 5e-06, + "logits/chosen": 20816425.14285714, + "logits/rejected": 45016812.8, + "logps/chosen": -493.72279575892856, + "logps/rejected": -390.991943359375, + "loss": 0.1198, + "rewards/chosen": 3.8694651467459544, + "rewards/margins": 6.836864934648787, + "rewards/rejected": -2.967399787902832, + "step": 135 + }, + { + "epoch": 0.0372755927093326, + "grad_norm": 19.25, + "kl": 2.5975253582000732, + "learning_rate": 5e-06, + "logits/chosen": 5768961.454545454, + "logits/rejected": 17050875.076923076, + "logps/chosen": -515.7679332386364, + "logps/rejected": -595.7152944711538, + "loss": 0.1205, + "rewards/chosen": 3.4394669966264204, + "rewards/margins": 6.635594081211757, + "rewards/rejected": -3.1961270845853367, + "step": 136 + }, + { + "epoch": 0.0375496779498424, + "grad_norm": 21.5, + "kl": 2.327805280685425, + "learning_rate": 5e-06, + "logits/chosen": -525593.4285714285, + "logits/rejected": 12550768.0, + "logps/chosen": -362.6689453125, + "logps/rejected": -468.272998046875, + "loss": 0.2047, + "rewards/chosen": 2.330976758684431, + "rewards/margins": 4.195863996233259, + "rewards/rejected": -1.8648872375488281, + "step": 137 + }, + { + "epoch": 0.0378237631903522, + "grad_norm": 9.4375, + "kl": 0.8102906942367554, + "learning_rate": 5e-06, + "logits/chosen": 17442417.333333332, + "logits/rejected": 6629061.333333333, + "logps/chosen": -463.6735026041667, + "logps/rejected": -414.1232096354167, + "loss": 0.093, + "rewards/chosen": 3.689485232035319, + "rewards/margins": 7.367673238118489, + "rewards/rejected": -3.6781880060831704, + "step": 138 + }, + { + "epoch": 0.038097848430861996, + "grad_norm": 18.5, + "kl": 5.231475830078125, + "learning_rate": 5e-06, + "logits/chosen": 6162806.545454546, + "logits/rejected": 13712226.461538462, + "logps/chosen": -382.6570933948864, + "logps/rejected": -365.7350886418269, + "loss": 0.1691, + "rewards/chosen": 2.1841780922629614, + "rewards/margins": 4.628090011489975, + "rewards/rejected": -2.443911919227013, + "step": 139 + }, + { + "epoch": 0.038371933671371795, + "grad_norm": 18.5, + "kl": 11.546306610107422, + "learning_rate": 5e-06, + "logits/chosen": -3024618.117647059, + "logits/rejected": 5520382.857142857, + "logps/chosen": -445.6412568933824, + "logps/rejected": -402.4121791294643, + "loss": 0.1465, + "rewards/chosen": 3.5650401395909928, + "rewards/margins": 7.207088021671071, + "rewards/rejected": -3.642047882080078, + "step": 140 + }, + { + "epoch": 0.038646018911881595, + "grad_norm": 13.125, + "kl": 8.965350151062012, + "learning_rate": 5e-06, + "logits/chosen": -4687534.857142857, + "logits/rejected": 26242864.0, + "logps/chosen": -520.7423270089286, + "logps/rejected": -476.65322265625, + "loss": 0.0709, + "rewards/chosen": 4.832400730678013, + "rewards/margins": 9.550467899867467, + "rewards/rejected": -4.718067169189453, + "step": 141 + }, + { + "epoch": 0.038920104152391395, + "grad_norm": 12.9375, + "kl": 6.09146785736084, + "learning_rate": 5e-06, + "logits/chosen": 8739299.0, + "logits/rejected": 32563932.0, + "logps/chosen": -442.37908935546875, + "logps/rejected": -514.930419921875, + "loss": 0.1386, + "rewards/chosen": 3.767910957336426, + "rewards/margins": 7.011478424072266, + "rewards/rejected": -3.24356746673584, + "step": 142 + }, + { + "epoch": 0.039194189392901195, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6336357.714285715, + "logits/rejected": -4828286.588235294, + "logps/chosen": -435.0844029017857, + "logps/rejected": -479.82907284007354, + "loss": 0.1259, + "rewards/chosen": 3.0613125392368863, + "rewards/margins": 6.45475682491014, + "rewards/rejected": -3.3934442856732536, + "step": 143 + }, + { + "epoch": 0.03946827463341099, + "grad_norm": 14.3125, + "kl": 5.0732598304748535, + "learning_rate": 5e-06, + "logits/chosen": 7137986.5, + "logits/rejected": -8001068.0, + "logps/chosen": -454.30816650390625, + "logps/rejected": -474.5111389160156, + "loss": 0.0835, + "rewards/chosen": 3.823915481567383, + "rewards/margins": 8.35860824584961, + "rewards/rejected": -4.534692764282227, + "step": 144 + }, + { + "epoch": 0.03974235987392079, + "grad_norm": 13.5, + "kl": 4.37014102935791, + "learning_rate": 5e-06, + "logits/chosen": -1062166.1666666667, + "logits/rejected": 9943148.0, + "logps/chosen": -474.2447102864583, + "logps/rejected": -377.0118001302083, + "loss": 0.0843, + "rewards/chosen": 4.5043792724609375, + "rewards/margins": 7.524034182230631, + "rewards/rejected": -3.019654909769694, + "step": 145 + }, + { + "epoch": 0.04001644511443059, + "grad_norm": 10.25, + "kl": 4.937111854553223, + "learning_rate": 5e-06, + "logits/chosen": -1964717.3333333333, + "logits/rejected": -3300825.3333333335, + "logps/chosen": -566.4967447916666, + "logps/rejected": -512.397705078125, + "loss": 0.0873, + "rewards/chosen": 4.372155825297038, + "rewards/margins": 7.626852671305339, + "rewards/rejected": -3.254696846008301, + "step": 146 + }, + { + "epoch": 0.04029053035494039, + "grad_norm": 13.75, + "kl": 0.3398030698299408, + "learning_rate": 5e-06, + "logits/chosen": 16566529.454545455, + "logits/rejected": 64355707.07692308, + "logps/chosen": -405.75217507102275, + "logps/rejected": -638.7633713942307, + "loss": 0.0605, + "rewards/chosen": 3.2560195922851562, + "rewards/margins": 9.292753366323618, + "rewards/rejected": -6.036733774038462, + "step": 147 + }, + { + "epoch": 0.04056461559545019, + "grad_norm": 20.375, + "kl": 6.975541591644287, + "learning_rate": 5e-06, + "logits/chosen": 5985534.857142857, + "logits/rejected": 12324473.6, + "logps/chosen": -489.385986328125, + "logps/rejected": -599.351171875, + "loss": 0.0958, + "rewards/chosen": 3.935640335083008, + "rewards/margins": 7.458247375488281, + "rewards/rejected": -3.5226070404052736, + "step": 148 + }, + { + "epoch": 0.04083870083595998, + "grad_norm": 22.625, + "kl": 5.720283508300781, + "learning_rate": 5e-06, + "logits/chosen": 93705773.1764706, + "logits/rejected": 46988013.71428572, + "logps/chosen": -524.2184627757352, + "logps/rejected": -651.1162806919643, + "loss": 0.18, + "rewards/chosen": 3.1041470695944393, + "rewards/margins": 5.357109590738761, + "rewards/rejected": -2.252962521144322, + "step": 149 + }, + { + "epoch": 0.04111278607646978, + "grad_norm": 12.125, + "kl": 0.1414540708065033, + "learning_rate": 5e-06, + "logits/chosen": 33680704.0, + "logits/rejected": 38841610.666666664, + "logps/chosen": -443.945556640625, + "logps/rejected": -551.6576741536459, + "loss": 0.086, + "rewards/chosen": 3.5813897450764975, + "rewards/margins": 8.469660123189291, + "rewards/rejected": -4.888270378112793, + "step": 150 + }, + { + "epoch": 0.04138687131697958, + "grad_norm": 14.0, + "kl": 6.840159893035889, + "learning_rate": 5e-06, + "logits/chosen": -5499635.692307692, + "logits/rejected": 3975175.6363636362, + "logps/chosen": -463.1681941105769, + "logps/rejected": -458.4490411931818, + "loss": 0.0966, + "rewards/chosen": 4.063002072847807, + "rewards/margins": 8.965942996365207, + "rewards/rejected": -4.9029409235174, + "step": 151 + }, + { + "epoch": 0.04166095655748938, + "grad_norm": 14.5625, + "kl": 1.4093616008758545, + "learning_rate": 5e-06, + "logits/chosen": -9136766.222222222, + "logits/rejected": -3159806.933333333, + "logps/chosen": -424.96693250868054, + "logps/rejected": -451.37083333333334, + "loss": 0.1482, + "rewards/chosen": 3.644777086046007, + "rewards/margins": 6.425176408555773, + "rewards/rejected": -2.7803993225097656, + "step": 152 + }, + { + "epoch": 0.04193504179799918, + "grad_norm": 15.8125, + "kl": 0.11477534472942352, + "learning_rate": 5e-06, + "logits/chosen": 2089156.8, + "logits/rejected": 1807227.4285714286, + "logps/chosen": -466.020458984375, + "logps/rejected": -391.89903041294644, + "loss": 0.0939, + "rewards/chosen": 3.2995338439941406, + "rewards/margins": 6.805568695068359, + "rewards/rejected": -3.5060348510742188, + "step": 153 + }, + { + "epoch": 0.04220912703850897, + "grad_norm": 12.0625, + "kl": 6.059615135192871, + "learning_rate": 5e-06, + "logits/chosen": -3982537.230769231, + "logits/rejected": 15260858.181818182, + "logps/chosen": -512.8073542668269, + "logps/rejected": -418.42587002840907, + "loss": 0.1003, + "rewards/chosen": 4.285635434664213, + "rewards/margins": 7.461113589626926, + "rewards/rejected": -3.175478154962713, + "step": 154 + }, + { + "epoch": 0.04248321227901877, + "grad_norm": 16.625, + "kl": 4.7000813484191895, + "learning_rate": 5e-06, + "logits/chosen": 18414144.0, + "logits/rejected": 34331652.571428575, + "logps/chosen": -419.1076171875, + "logps/rejected": -452.16598074776783, + "loss": 0.1456, + "rewards/chosen": 3.455859375, + "rewards/margins": 6.370354570661272, + "rewards/rejected": -2.914495195661272, + "step": 155 + }, + { + "epoch": 0.04275729751952857, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1865476.3636363635, + "logits/rejected": 4743569.846153846, + "logps/chosen": -558.0752840909091, + "logps/rejected": -460.3121995192308, + "loss": 0.0676, + "rewards/chosen": 3.764279452237216, + "rewards/margins": 8.034574815443346, + "rewards/rejected": -4.27029536320613, + "step": 156 + }, + { + "epoch": 0.04303138276003837, + "grad_norm": 15.0625, + "kl": 11.875640869140625, + "learning_rate": 5e-06, + "logits/chosen": 5159369.818181818, + "logits/rejected": 4972001.230769231, + "logps/chosen": -474.6345880681818, + "logps/rejected": -361.26600060096155, + "loss": 0.1156, + "rewards/chosen": 4.618873942982066, + "rewards/margins": 7.128033617993335, + "rewards/rejected": -2.509159675011268, + "step": 157 + }, + { + "epoch": 0.04330546800054817, + "grad_norm": 13.375, + "kl": 1.4234187602996826, + "learning_rate": 5e-06, + "logits/chosen": 1026157.2307692308, + "logits/rejected": 6590210.909090909, + "logps/chosen": -500.7785832331731, + "logps/rejected": -422.14626242897725, + "loss": 0.1131, + "rewards/chosen": 3.89547846867488, + "rewards/margins": 7.2030873331990275, + "rewards/rejected": -3.307608864524148, + "step": 158 + }, + { + "epoch": 0.04357955324105797, + "grad_norm": 15.6875, + "kl": 1.4406242370605469, + "learning_rate": 5e-06, + "logits/chosen": 5291171.2, + "logits/rejected": 28458715.42857143, + "logps/chosen": -381.8375, + "logps/rejected": -494.67745535714283, + "loss": 0.1326, + "rewards/chosen": 2.7426521301269533, + "rewards/margins": 6.884755325317383, + "rewards/rejected": -4.14210319519043, + "step": 159 + }, + { + "epoch": 0.043853638481567765, + "grad_norm": 15.0625, + "kl": 10.282400131225586, + "learning_rate": 5e-06, + "logits/chosen": 8967352.0, + "logits/rejected": 12385390.545454545, + "logps/chosen": -467.5222731370192, + "logps/rejected": -399.13583096590907, + "loss": 0.0803, + "rewards/chosen": 3.573980771578275, + "rewards/margins": 7.412597923012047, + "rewards/rejected": -3.8386171514337715, + "step": 160 + }, + { + "epoch": 0.044127723722077565, + "grad_norm": 17.125, + "kl": 1.6266670227050781, + "learning_rate": 5e-06, + "logits/chosen": 4219183.428571428, + "logits/rejected": 1000269.0, + "logps/chosen": -425.2318638392857, + "logps/rejected": -521.913330078125, + "loss": 0.1021, + "rewards/chosen": 3.2845420837402344, + "rewards/margins": 8.480320358276368, + "rewards/rejected": -5.195778274536133, + "step": 161 + }, + { + "epoch": 0.044401808962587365, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 26945221.333333332, + "logits/rejected": 50798208.0, + "logps/chosen": -535.8470052083334, + "logps/rejected": -526.8080078125, + "loss": 0.0893, + "rewards/chosen": 3.835415734185113, + "rewards/margins": 7.45284890068902, + "rewards/rejected": -3.6174331665039063, + "step": 162 + }, + { + "epoch": 0.044675894203097165, + "grad_norm": 11.875, + "kl": 1.4016908407211304, + "learning_rate": 5e-06, + "logits/chosen": 4323036.0, + "logits/rejected": -4481660.8, + "logps/chosen": -443.5078822544643, + "logps/rejected": -486.075390625, + "loss": 0.0423, + "rewards/chosen": 3.7646004813058034, + "rewards/margins": 10.4869752066476, + "rewards/rejected": -6.722374725341797, + "step": 163 + }, + { + "epoch": 0.044949979443606965, + "grad_norm": 17.25, + "kl": 7.307744979858398, + "learning_rate": 5e-06, + "logits/chosen": 4501340.0, + "logits/rejected": 38610982.4, + "logps/chosen": -439.13330078125, + "logps/rejected": -764.902197265625, + "loss": 0.1625, + "rewards/chosen": 3.6295950753348216, + "rewards/margins": 8.521568952287947, + "rewards/rejected": -4.891973876953125, + "step": 164 + }, + { + "epoch": 0.04522406468411676, + "grad_norm": 8.25, + "kl": 1.4399440288543701, + "learning_rate": 5e-06, + "logits/chosen": -8199441.333333333, + "logits/rejected": 73125056.0, + "logps/chosen": -427.3248697916667, + "logps/rejected": -584.5726725260416, + "loss": 0.038, + "rewards/chosen": 4.277071634928386, + "rewards/margins": 9.650992075602215, + "rewards/rejected": -5.373920440673828, + "step": 165 + }, + { + "epoch": 0.04549814992462656, + "grad_norm": 16.25, + "kl": 5.778878688812256, + "learning_rate": 5e-06, + "logits/chosen": -3432344.888888889, + "logits/rejected": -10030792.533333333, + "logps/chosen": -455.89073350694446, + "logps/rejected": -439.53020833333335, + "loss": 0.1203, + "rewards/chosen": 4.417838626437717, + "rewards/margins": 8.125884077284072, + "rewards/rejected": -3.7080454508463543, + "step": 166 + }, + { + "epoch": 0.04577223516513636, + "grad_norm": 14.75, + "kl": 0.9812116622924805, + "learning_rate": 5e-06, + "logits/chosen": 11196534.4, + "logits/rejected": -2763015.4285714286, + "logps/chosen": -414.8537109375, + "logps/rejected": -342.06602260044644, + "loss": 0.1234, + "rewards/chosen": 3.51639404296875, + "rewards/margins": 6.551245389665876, + "rewards/rejected": -3.0348513466971263, + "step": 167 + }, + { + "epoch": 0.04604632040564616, + "grad_norm": 18.5, + "kl": 12.343981742858887, + "learning_rate": 5e-06, + "logits/chosen": -11683740.0, + "logits/rejected": 2785088.6666666665, + "logps/chosen": -494.2982991536458, + "logps/rejected": -260.26031494140625, + "loss": 0.1155, + "rewards/chosen": 4.537640889485677, + "rewards/margins": 6.5785706837972, + "rewards/rejected": -2.0409297943115234, + "step": 168 + }, + { + "epoch": 0.04632040564615596, + "grad_norm": 10.875, + "kl": 7.613009452819824, + "learning_rate": 5e-06, + "logits/chosen": 3751092.3636363638, + "logits/rejected": 22831448.615384616, + "logps/chosen": -350.1577814275568, + "logps/rejected": -469.2224684495192, + "loss": 0.1755, + "rewards/chosen": 3.6616862903941763, + "rewards/margins": 6.950874381965691, + "rewards/rejected": -3.2891880915715146, + "step": 169 + }, + { + "epoch": 0.04659449088666575, + "grad_norm": 15.0, + "kl": 4.901795864105225, + "learning_rate": 5e-06, + "logits/chosen": 499790.1538461539, + "logits/rejected": 36225835.63636363, + "logps/chosen": -465.99988731971155, + "logps/rejected": -540.3160955255681, + "loss": 0.0988, + "rewards/chosen": 3.8305176955003004, + "rewards/margins": 8.146344111515926, + "rewards/rejected": -4.315826416015625, + "step": 170 + }, + { + "epoch": 0.04686857612717555, + "grad_norm": 15.75, + "kl": 5.247824668884277, + "learning_rate": 5e-06, + "logits/chosen": 9125872.0, + "logits/rejected": 15740993.777777778, + "logps/chosen": -552.4172526041667, + "logps/rejected": -497.6188151041667, + "loss": 0.0765, + "rewards/chosen": 4.492021179199218, + "rewards/margins": 7.996091885036892, + "rewards/rejected": -3.5040707058376737, + "step": 171 + }, + { + "epoch": 0.04714266136768535, + "grad_norm": 15.3125, + "kl": 4.287100791931152, + "learning_rate": 5e-06, + "logits/chosen": 18854958.769230768, + "logits/rejected": 28869172.363636363, + "logps/chosen": -434.86527193509613, + "logps/rejected": -588.8723810369319, + "loss": 0.1165, + "rewards/chosen": 3.2667098412146935, + "rewards/margins": 6.739490242271156, + "rewards/rejected": -3.472780401056463, + "step": 172 + }, + { + "epoch": 0.04741674660819515, + "grad_norm": 15.6875, + "kl": 7.779524803161621, + "learning_rate": 5e-06, + "logits/chosen": 42919104.0, + "logits/rejected": 71971170.9090909, + "logps/chosen": -531.3508112980769, + "logps/rejected": -592.1924272017045, + "loss": 0.108, + "rewards/chosen": 4.538337120643029, + "rewards/margins": 10.240909336330173, + "rewards/rejected": -5.702572215687145, + "step": 173 + }, + { + "epoch": 0.04769083184870495, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5054573.454545454, + "logits/rejected": -2682424.0, + "logps/chosen": -409.9319957386364, + "logps/rejected": -542.6337139423077, + "loss": 0.0802, + "rewards/chosen": 3.301650654185902, + "rewards/margins": 7.8283893345119235, + "rewards/rejected": -4.526738680326021, + "step": 174 + }, + { + "epoch": 0.04796491708921474, + "grad_norm": 15.9375, + "kl": 2.1455702781677246, + "learning_rate": 5e-06, + "logits/chosen": -5704532.307692308, + "logits/rejected": 21278651.636363637, + "logps/chosen": -495.7316331129808, + "logps/rejected": -483.4337269176136, + "loss": 0.094, + "rewards/chosen": 4.712146759033203, + "rewards/margins": 8.850966020063922, + "rewards/rejected": -4.138819261030718, + "step": 175 + }, + { + "epoch": 0.04823900232972454, + "grad_norm": 19.75, + "kl": 15.89577865600586, + "learning_rate": 5e-06, + "logits/chosen": 17876523.2, + "logits/rejected": 16360310.857142856, + "logps/chosen": -464.021923828125, + "logps/rejected": -342.09354073660717, + "loss": 0.1398, + "rewards/chosen": 4.993391036987305, + "rewards/margins": 7.877691268920898, + "rewards/rejected": -2.8843002319335938, + "step": 176 + }, + { + "epoch": 0.04851308757023434, + "grad_norm": 16.0, + "kl": 6.69710636138916, + "learning_rate": 5e-06, + "logits/chosen": 29377424.0, + "logits/rejected": 37160886.4, + "logps/chosen": -660.6689453125, + "logps/rejected": -613.596533203125, + "loss": 0.0894, + "rewards/chosen": 4.287473678588867, + "rewards/margins": 10.671078872680663, + "rewards/rejected": -6.383605194091797, + "step": 177 + }, + { + "epoch": 0.04878717281074414, + "grad_norm": 18.125, + "kl": 3.478078842163086, + "learning_rate": 5e-06, + "logits/chosen": -10876278.153846154, + "logits/rejected": 482590.9090909091, + "logps/chosen": -419.2790715144231, + "logps/rejected": -482.77951882102275, + "loss": 0.1105, + "rewards/chosen": 2.966394277719351, + "rewards/margins": 7.834832651631816, + "rewards/rejected": -4.868438373912465, + "step": 178 + }, + { + "epoch": 0.04906125805125394, + "grad_norm": 11.875, + "kl": 4.207137107849121, + "learning_rate": 5e-06, + "logits/chosen": 1222231.8181818181, + "logits/rejected": 981525.0769230769, + "logps/chosen": -505.85227272727275, + "logps/rejected": -470.0186298076923, + "loss": 0.0624, + "rewards/chosen": 4.240551688454368, + "rewards/margins": 8.59122330992372, + "rewards/rejected": -4.350671621469351, + "step": 179 + }, + { + "epoch": 0.04933534329176374, + "grad_norm": 11.75, + "kl": 3.9081592559814453, + "learning_rate": 5e-06, + "logits/chosen": -14504654.666666666, + "logits/rejected": 9512708.666666666, + "logps/chosen": -405.6181640625, + "logps/rejected": -404.8481852213542, + "loss": 0.0897, + "rewards/chosen": 3.81339963277181, + "rewards/margins": 7.379643758138021, + "rewards/rejected": -3.566244125366211, + "step": 180 + }, + { + "epoch": 0.049609428532273535, + "grad_norm": 18.125, + "kl": 5.893155097961426, + "learning_rate": 5e-06, + "logits/chosen": 1433725.3846153845, + "logits/rejected": -3703657.8181818184, + "logps/chosen": -394.33458533653845, + "logps/rejected": -405.2277166193182, + "loss": 0.2003, + "rewards/chosen": 2.606241666353666, + "rewards/margins": 5.70912925346748, + "rewards/rejected": -3.102887587113814, + "step": 181 + }, + { + "epoch": 0.049883513772783335, + "grad_norm": 14.0, + "kl": 3.477199077606201, + "learning_rate": 5e-06, + "logits/chosen": -2805859.6923076925, + "logits/rejected": 19693463.272727273, + "logps/chosen": -405.1745042067308, + "logps/rejected": -519.7160866477273, + "loss": 0.1088, + "rewards/chosen": 3.0947042611929088, + "rewards/margins": 8.17467453429749, + "rewards/rejected": -5.079970273104581, + "step": 182 + }, + { + "epoch": 0.050157599013293135, + "grad_norm": 13.0625, + "kl": 0.9740562438964844, + "learning_rate": 5e-06, + "logits/chosen": 2677949.714285714, + "logits/rejected": -2326293.6, + "logps/chosen": -396.19813755580356, + "logps/rejected": -510.873828125, + "loss": 0.0752, + "rewards/chosen": 3.954784393310547, + "rewards/margins": 8.478173065185548, + "rewards/rejected": -4.523388671875, + "step": 183 + }, + { + "epoch": 0.050431684253802934, + "grad_norm": 21.75, + "kl": 0.07017135620117188, + "learning_rate": 5e-06, + "logits/chosen": 7269542.222222222, + "logits/rejected": 27111517.866666667, + "logps/chosen": -403.87909613715277, + "logps/rejected": -565.3015625, + "loss": 0.0992, + "rewards/chosen": 3.6746283637152777, + "rewards/margins": 7.23223639594184, + "rewards/rejected": -3.5576080322265624, + "step": 184 + }, + { + "epoch": 0.050705769494312734, + "grad_norm": 16.75, + "kl": 0.7380339503288269, + "learning_rate": 5e-06, + "logits/chosen": 14453528.888888888, + "logits/rejected": 24097557.333333332, + "logps/chosen": -535.5159505208334, + "logps/rejected": -407.748046875, + "loss": 0.1229, + "rewards/chosen": 4.7562815348307295, + "rewards/margins": 8.691663869222005, + "rewards/rejected": -3.935382334391276, + "step": 185 + }, + { + "epoch": 0.05097985473482253, + "grad_norm": 7.1875, + "kl": 1.6303462982177734, + "learning_rate": 5e-06, + "logits/chosen": 3836750.222222222, + "logits/rejected": 67559334.4, + "logps/chosen": -436.47108289930554, + "logps/rejected": -569.364453125, + "loss": 0.0625, + "rewards/chosen": 3.1747926076253257, + "rewards/margins": 7.324109268188476, + "rewards/rejected": -4.149316660563151, + "step": 186 + }, + { + "epoch": 0.05125393997533233, + "grad_norm": 9.6875, + "kl": 3.386350154876709, + "learning_rate": 5e-06, + "logits/chosen": 8371529.846153846, + "logits/rejected": 1126565.8181818181, + "logps/chosen": -530.5818058894231, + "logps/rejected": -424.65065696022725, + "loss": 0.0514, + "rewards/chosen": 4.412977658785307, + "rewards/margins": 8.751892009815137, + "rewards/rejected": -4.338914351029829, + "step": 187 + }, + { + "epoch": 0.05152802521584213, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10512130.285714285, + "logits/rejected": 3045296.9411764704, + "logps/chosen": -384.2388392857143, + "logps/rejected": -404.8528837316176, + "loss": 0.0855, + "rewards/chosen": 2.908428192138672, + "rewards/margins": 6.666636074290556, + "rewards/rejected": -3.758207882151884, + "step": 188 + }, + { + "epoch": 0.05180211045635193, + "grad_norm": 10.3125, + "kl": 0.9558185338973999, + "learning_rate": 5e-06, + "logits/chosen": -11954053.0, + "logits/rejected": 3895838.0, + "logps/chosen": -421.6596984863281, + "logps/rejected": -441.7607727050781, + "loss": 0.0746, + "rewards/chosen": 4.198707580566406, + "rewards/margins": 8.476572513580322, + "rewards/rejected": -4.277864933013916, + "step": 189 + }, + { + "epoch": 0.05207619569686173, + "grad_norm": 18.25, + "kl": 4.466094970703125, + "learning_rate": 5e-06, + "logits/chosen": -13750637.538461538, + "logits/rejected": 37969762.90909091, + "logps/chosen": -481.6154597355769, + "logps/rejected": -480.09011008522725, + "loss": 0.1324, + "rewards/chosen": 4.394994882436899, + "rewards/margins": 8.01372213296957, + "rewards/rejected": -3.6187272505326704, + "step": 190 + }, + { + "epoch": 0.05235028093737152, + "grad_norm": 20.375, + "kl": 2.2208545207977295, + "learning_rate": 5e-06, + "logits/chosen": 46390380.8, + "logits/rejected": 2279539.8571428573, + "logps/chosen": -548.43134765625, + "logps/rejected": -307.97935267857144, + "loss": 0.1212, + "rewards/chosen": 5.315782165527343, + "rewards/margins": 7.84322509765625, + "rewards/rejected": -2.5274429321289062, + "step": 191 + }, + { + "epoch": 0.05262436617788132, + "grad_norm": 15.3125, + "kl": 6.22431755065918, + "learning_rate": 5e-06, + "logits/chosen": 21383638.153846152, + "logits/rejected": 14241125.818181818, + "logps/chosen": -429.71987680288464, + "logps/rejected": -358.8785289417614, + "loss": 0.0875, + "rewards/chosen": 3.9996155958909254, + "rewards/margins": 7.152943537785457, + "rewards/rejected": -3.1533279418945312, + "step": 192 + }, + { + "epoch": 0.05289845141839112, + "grad_norm": 10.1875, + "kl": 1.4627799987792969, + "learning_rate": 5e-06, + "logits/chosen": 16685993.333333334, + "logits/rejected": 3016435.3333333335, + "logps/chosen": -562.1373291015625, + "logps/rejected": -434.8553059895833, + "loss": 0.0585, + "rewards/chosen": 5.035298983256022, + "rewards/margins": 8.927838643391928, + "rewards/rejected": -3.892539660135905, + "step": 193 + }, + { + "epoch": 0.05317253665890092, + "grad_norm": 11.25, + "kl": 4.287712097167969, + "learning_rate": 5e-06, + "logits/chosen": -4872255.2727272725, + "logits/rejected": -5848512.0, + "logps/chosen": -428.62539950284093, + "logps/rejected": -464.92052283653845, + "loss": 0.0703, + "rewards/chosen": 4.30358193137429, + "rewards/margins": 8.26336915342958, + "rewards/rejected": -3.9597872220552883, + "step": 194 + }, + { + "epoch": 0.05344662189941072, + "grad_norm": 10.4375, + "kl": 1.0176925659179688, + "learning_rate": 5e-06, + "logits/chosen": 35775028.36363637, + "logits/rejected": 28224755.692307692, + "logps/chosen": -384.02885298295456, + "logps/rejected": -464.6925706129808, + "loss": 0.0938, + "rewards/chosen": 4.1449456648393115, + "rewards/margins": 9.020546626377772, + "rewards/rejected": -4.875600961538462, + "step": 195 + }, + { + "epoch": 0.05372070713992051, + "grad_norm": 13.6875, + "kl": 2.9492831230163574, + "learning_rate": 5e-06, + "logits/chosen": 9383102.666666666, + "logits/rejected": 9798896.666666666, + "logps/chosen": -469.1483968098958, + "logps/rejected": -519.20703125, + "loss": 0.112, + "rewards/chosen": 3.574925104777018, + "rewards/margins": 8.684710184733072, + "rewards/rejected": -5.109785079956055, + "step": 196 + }, + { + "epoch": 0.05399479238043031, + "grad_norm": 15.9375, + "kl": 3.680001974105835, + "learning_rate": 5e-06, + "logits/chosen": 13442461.090909092, + "logits/rejected": 3291914.4615384615, + "logps/chosen": -466.88321200284093, + "logps/rejected": -358.75229116586536, + "loss": 0.1069, + "rewards/chosen": 4.072281924161044, + "rewards/margins": 7.470335020051969, + "rewards/rejected": -3.3980530958909254, + "step": 197 + }, + { + "epoch": 0.05426887762094011, + "grad_norm": 8.125, + "kl": 6.769434928894043, + "learning_rate": 5e-06, + "logits/chosen": 19864017.230769232, + "logits/rejected": 3010255.272727273, + "logps/chosen": -428.7634089543269, + "logps/rejected": -491.7413884943182, + "loss": 0.0648, + "rewards/chosen": 4.95170651949369, + "rewards/margins": 9.77585660494291, + "rewards/rejected": -4.824150085449219, + "step": 198 + }, + { + "epoch": 0.05454296286144991, + "grad_norm": 16.75, + "kl": 3.6588282585144043, + "learning_rate": 5e-06, + "logits/chosen": 1113135.3846153845, + "logits/rejected": 14299457.454545455, + "logps/chosen": -370.3683894230769, + "logps/rejected": -181.45439009232953, + "loss": 0.1795, + "rewards/chosen": 3.3832743718073917, + "rewards/margins": 5.518249978552332, + "rewards/rejected": -2.1349756067449395, + "step": 199 + }, + { + "epoch": 0.05481704810195971, + "grad_norm": 14.25, + "kl": 7.383843421936035, + "learning_rate": 5e-06, + "logits/chosen": 25432183.272727273, + "logits/rejected": 15468454.153846154, + "logps/chosen": -483.94340376420456, + "logps/rejected": -529.8083308293269, + "loss": 0.1097, + "rewards/chosen": 5.084277413108132, + "rewards/margins": 9.61124497527009, + "rewards/rejected": -4.526967562161959, + "step": 200 + }, + { + "epoch": 0.055091133342469505, + "grad_norm": 14.3125, + "kl": 11.056272506713867, + "learning_rate": 5e-06, + "logits/chosen": 6108566.4, + "logits/rejected": 4541319.111111111, + "logps/chosen": -500.247265625, + "logps/rejected": -353.01361762152777, + "loss": 0.0676, + "rewards/chosen": 4.155651092529297, + "rewards/margins": 7.368665059407553, + "rewards/rejected": -3.2130139668782554, + "step": 201 + }, + { + "epoch": 0.055365218582979304, + "grad_norm": 19.75, + "kl": 15.486307144165039, + "learning_rate": 5e-06, + "logits/chosen": 10029453.47368421, + "logits/rejected": -2559727.6, + "logps/chosen": -468.8062808388158, + "logps/rejected": -364.5609130859375, + "loss": 0.1715, + "rewards/chosen": 4.261139719109786, + "rewards/margins": 8.295858804803146, + "rewards/rejected": -4.034719085693359, + "step": 202 + }, + { + "epoch": 0.055639303823489104, + "grad_norm": 14.9375, + "kl": 9.248078346252441, + "learning_rate": 5e-06, + "logits/chosen": 19844104.0, + "logits/rejected": 37087440.0, + "logps/chosen": -389.9686279296875, + "logps/rejected": -463.65704345703125, + "loss": 0.157, + "rewards/chosen": 3.495807647705078, + "rewards/margins": 7.266016006469727, + "rewards/rejected": -3.7702083587646484, + "step": 203 + }, + { + "epoch": 0.055913389063998904, + "grad_norm": 12.8125, + "kl": 6.673659324645996, + "learning_rate": 5e-06, + "logits/chosen": -4767434.5, + "logits/rejected": -1982735.5, + "logps/chosen": -513.214111328125, + "logps/rejected": -428.96954345703125, + "loss": 0.0524, + "rewards/chosen": 4.518033981323242, + "rewards/margins": 8.874419689178467, + "rewards/rejected": -4.356385707855225, + "step": 204 + }, + { + "epoch": 0.056187474304508704, + "grad_norm": 14.0625, + "kl": 1.0780258178710938, + "learning_rate": 5e-06, + "logits/chosen": -3009850.153846154, + "logits/rejected": 6350238.545454546, + "logps/chosen": -424.18810096153845, + "logps/rejected": -443.51908735795456, + "loss": 0.0787, + "rewards/chosen": 3.8292001577524037, + "rewards/margins": 7.236341756540579, + "rewards/rejected": -3.407141598788175, + "step": 205 + }, + { + "epoch": 0.056461559545018504, + "grad_norm": 16.25, + "kl": 10.485641479492188, + "learning_rate": 5e-06, + "logits/chosen": -12033363.692307692, + "logits/rejected": 24036234.181818184, + "logps/chosen": -513.3760141225962, + "logps/rejected": -419.58114346590907, + "loss": 0.1068, + "rewards/chosen": 4.299801166240986, + "rewards/margins": 7.897839579548869, + "rewards/rejected": -3.5980384133078833, + "step": 206 + }, + { + "epoch": 0.0567356447855283, + "grad_norm": 12.3125, + "kl": 1.9726613759994507, + "learning_rate": 5e-06, + "logits/chosen": -1918259.5555555555, + "logits/rejected": 18183485.866666667, + "logps/chosen": -354.55604383680554, + "logps/rejected": -559.9032552083333, + "loss": 0.1006, + "rewards/chosen": 3.3451639811197915, + "rewards/margins": 7.8223108927408855, + "rewards/rejected": -4.477146911621094, + "step": 207 + }, + { + "epoch": 0.0570097300260381, + "grad_norm": 11.0, + "kl": 7.964696884155273, + "learning_rate": 5e-06, + "logits/chosen": -440720.9090909091, + "logits/rejected": 116747864.61538461, + "logps/chosen": -494.40229936079544, + "logps/rejected": -559.3937800480769, + "loss": 0.0515, + "rewards/chosen": 4.483458085493608, + "rewards/margins": 9.575550159374316, + "rewards/rejected": -5.092092073880709, + "step": 208 + }, + { + "epoch": 0.0572838152665479, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5282568.0, + "logits/rejected": 22053842.82352941, + "logps/chosen": -431.14718191964283, + "logps/rejected": -525.8369140625, + "loss": 0.0645, + "rewards/chosen": 3.2288616725376675, + "rewards/margins": 8.168556245435186, + "rewards/rejected": -4.939694572897518, + "step": 209 + }, + { + "epoch": 0.0575579005070577, + "grad_norm": 20.75, + "kl": 13.416534423828125, + "learning_rate": 5e-06, + "logits/chosen": -12611493.0, + "logits/rejected": -2432364.5, + "logps/chosen": -458.89898681640625, + "logps/rejected": -626.782470703125, + "loss": 0.1587, + "rewards/chosen": 4.080418586730957, + "rewards/margins": 9.986147403717041, + "rewards/rejected": -5.905728816986084, + "step": 210 + }, + { + "epoch": 0.057831985747567496, + "grad_norm": 14.375, + "kl": 4.132503509521484, + "learning_rate": 5e-06, + "logits/chosen": -1156940.3636363635, + "logits/rejected": 9815869.538461538, + "logps/chosen": -454.30131392045456, + "logps/rejected": -447.4434344951923, + "loss": 0.1008, + "rewards/chosen": 3.9283533963290127, + "rewards/margins": 6.948578334354854, + "rewards/rejected": -3.0202249380258412, + "step": 211 + }, + { + "epoch": 0.05810607098807729, + "grad_norm": 16.0, + "kl": 7.021111488342285, + "learning_rate": 5e-06, + "logits/chosen": -14080098.666666666, + "logits/rejected": 11815545.333333334, + "logps/chosen": -476.7827962239583, + "logps/rejected": -521.1528727213541, + "loss": 0.1336, + "rewards/chosen": 3.47343381245931, + "rewards/margins": 8.34574826558431, + "rewards/rejected": -4.872314453125, + "step": 212 + }, + { + "epoch": 0.05838015622858709, + "grad_norm": 14.8125, + "kl": 5.720648765563965, + "learning_rate": 5e-06, + "logits/chosen": 3370417.8666666667, + "logits/rejected": 10946906.666666666, + "logps/chosen": -457.3771158854167, + "logps/rejected": -610.3717990451389, + "loss": 0.0825, + "rewards/chosen": 4.106975301106771, + "rewards/margins": 8.303077019585505, + "rewards/rejected": -4.196101718478733, + "step": 213 + }, + { + "epoch": 0.05865424146909689, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7416556.8, + "logits/rejected": 24578971.42857143, + "logps/chosen": -447.045751953125, + "logps/rejected": -438.4349888392857, + "loss": 0.0772, + "rewards/chosen": 4.363005828857422, + "rewards/margins": 9.002563803536551, + "rewards/rejected": -4.639557974679129, + "step": 214 + }, + { + "epoch": 0.05892832670960669, + "grad_norm": 10.5, + "kl": 0.26170605421066284, + "learning_rate": 5e-06, + "logits/chosen": 6511733.0, + "logits/rejected": 6521426.5, + "logps/chosen": -615.953857421875, + "logps/rejected": -408.9903564453125, + "loss": 0.0695, + "rewards/chosen": 4.427460670471191, + "rewards/margins": 7.9983837604522705, + "rewards/rejected": -3.570923089981079, + "step": 215 + }, + { + "epoch": 0.05920241195011649, + "grad_norm": 10.25, + "kl": 6.5508832931518555, + "learning_rate": 5e-06, + "logits/chosen": 5330468.363636363, + "logits/rejected": 100999020.3076923, + "logps/chosen": -393.74702592329544, + "logps/rejected": -562.4472280649038, + "loss": 0.0515, + "rewards/chosen": 5.456273165616122, + "rewards/margins": 11.124527724472792, + "rewards/rejected": -5.66825455885667, + "step": 216 + }, + { + "epoch": 0.05947649719062628, + "grad_norm": 18.5, + "kl": 5.353166103363037, + "learning_rate": 5e-06, + "logits/chosen": -1463375.5384615385, + "logits/rejected": 15532372.363636363, + "logps/chosen": -295.460693359375, + "logps/rejected": -303.0595703125, + "loss": 0.1375, + "rewards/chosen": 3.10614747267503, + "rewards/margins": 6.287823470322403, + "rewards/rejected": -3.181675997647372, + "step": 217 + }, + { + "epoch": 0.05975058243113608, + "grad_norm": 10.1875, + "kl": 1.7608833312988281, + "learning_rate": 5e-06, + "logits/chosen": -10305269.0, + "logits/rejected": -3087028.5, + "logps/chosen": -346.1761474609375, + "logps/rejected": -469.1958923339844, + "loss": 0.0592, + "rewards/chosen": 4.161206245422363, + "rewards/margins": 9.302098274230957, + "rewards/rejected": -5.140892028808594, + "step": 218 + }, + { + "epoch": 0.06002466767164588, + "grad_norm": 13.0625, + "kl": 4.5908308029174805, + "learning_rate": 5e-06, + "logits/chosen": -6138497.6, + "logits/rejected": 11777070.222222222, + "logps/chosen": -443.49033203125, + "logps/rejected": -327.10302734375, + "loss": 0.0964, + "rewards/chosen": 3.811534881591797, + "rewards/margins": 7.0538722144232855, + "rewards/rejected": -3.2423373328314886, + "step": 219 + }, + { + "epoch": 0.06029875291215568, + "grad_norm": 13.6875, + "kl": 2.3556036949157715, + "learning_rate": 5e-06, + "logits/chosen": -22363188.363636363, + "logits/rejected": 7053648.0, + "logps/chosen": -489.70649857954544, + "logps/rejected": -429.1982421875, + "loss": 0.0831, + "rewards/chosen": 4.779176538640803, + "rewards/margins": 8.27339697884513, + "rewards/rejected": -3.494220440204327, + "step": 220 + }, + { + "epoch": 0.06057283815266548, + "grad_norm": 18.5, + "kl": 14.855815887451172, + "learning_rate": 5e-06, + "logits/chosen": -15365006.0, + "logits/rejected": 17526724.0, + "logps/chosen": -456.07525634765625, + "logps/rejected": -255.51934814453125, + "loss": 0.1168, + "rewards/chosen": 4.42777681350708, + "rewards/margins": 7.579113006591797, + "rewards/rejected": -3.151336193084717, + "step": 221 + }, + { + "epoch": 0.060846923393175274, + "grad_norm": 11.125, + "kl": 2.5943312644958496, + "learning_rate": 5e-06, + "logits/chosen": -4181854.5454545454, + "logits/rejected": 8472212.923076924, + "logps/chosen": -530.3947975852273, + "logps/rejected": -394.90091646634613, + "loss": 0.0777, + "rewards/chosen": 4.756909457120028, + "rewards/margins": 8.87311628648451, + "rewards/rejected": -4.116206829364483, + "step": 222 + }, + { + "epoch": 0.061121008633685074, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14311652.444444444, + "logits/rejected": 4806328.533333333, + "logps/chosen": -533.1510416666666, + "logps/rejected": -478.25397135416665, + "loss": 0.0365, + "rewards/chosen": 4.741060045030382, + "rewards/margins": 10.047053697374132, + "rewards/rejected": -5.30599365234375, + "step": 223 + }, + { + "epoch": 0.061395093874194874, + "grad_norm": 14.625, + "kl": 9.388216018676758, + "learning_rate": 5e-06, + "logits/chosen": -1732472.0, + "logits/rejected": 8802879.384615384, + "logps/chosen": -527.9182794744319, + "logps/rejected": -603.7789588341346, + "loss": 0.0972, + "rewards/chosen": 5.2377797907049, + "rewards/margins": 11.379497554752376, + "rewards/rejected": -6.141717764047476, + "step": 224 + }, + { + "epoch": 0.061669179114704674, + "grad_norm": 15.5, + "kl": 5.757241249084473, + "learning_rate": 5e-06, + "logits/chosen": 14995893.333333334, + "logits/rejected": 8014917.333333333, + "logps/chosen": -517.5732421875, + "logps/rejected": -479.071044921875, + "loss": 0.0693, + "rewards/chosen": 4.14844290415446, + "rewards/margins": 7.683147748311361, + "rewards/rejected": -3.534704844156901, + "step": 225 + }, + { + "epoch": 0.061943264355214474, + "grad_norm": 9.75, + "kl": 2.330820083618164, + "learning_rate": 5e-06, + "logits/chosen": 4367716.363636363, + "logits/rejected": 2672217.5384615385, + "logps/chosen": -409.56906960227275, + "logps/rejected": -391.29161658653845, + "loss": 0.0567, + "rewards/chosen": 4.331513144753196, + "rewards/margins": 9.46991975157411, + "rewards/rejected": -5.138406606820913, + "step": 226 + }, + { + "epoch": 0.062217349595724274, + "grad_norm": 11.0625, + "kl": 7.8385820388793945, + "learning_rate": 5e-06, + "logits/chosen": 19304300.307692308, + "logits/rejected": 14306337.454545455, + "logps/chosen": -499.7370417668269, + "logps/rejected": -408.14448686079544, + "loss": 0.1313, + "rewards/chosen": 4.942823556753305, + "rewards/margins": 10.094192131415948, + "rewards/rejected": -5.151368574662642, + "step": 227 + }, + { + "epoch": 0.06249143483623407, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 24242362.181818184, + "logits/rejected": 16606225.23076923, + "logps/chosen": -604.2864879261364, + "logps/rejected": -554.0869140625, + "loss": 0.0465, + "rewards/chosen": 4.118512933904475, + "rewards/margins": 10.749394023335064, + "rewards/rejected": -6.630881089430589, + "step": 228 + }, + { + "epoch": 0.06276552007674387, + "grad_norm": 12.4375, + "kl": 3.264291763305664, + "learning_rate": 5e-06, + "logits/chosen": 17365170.46153846, + "logits/rejected": 18591547.636363637, + "logps/chosen": -382.7811748798077, + "logps/rejected": -439.1297496448864, + "loss": 0.1191, + "rewards/chosen": 3.6738451444185696, + "rewards/margins": 7.307094947441474, + "rewards/rejected": -3.633249803022905, + "step": 229 + }, + { + "epoch": 0.06303960531725367, + "grad_norm": 13.25, + "kl": 9.579853057861328, + "learning_rate": 5e-06, + "logits/chosen": -11651626.666666666, + "logits/rejected": 6581990.666666667, + "logps/chosen": -679.1416422526041, + "logps/rejected": -410.2731119791667, + "loss": 0.1684, + "rewards/chosen": 5.971960703531901, + "rewards/margins": 9.261691729227703, + "rewards/rejected": -3.289731025695801, + "step": 230 + }, + { + "epoch": 0.06331369055776347, + "grad_norm": 13.375, + "kl": 3.077338218688965, + "learning_rate": 5e-06, + "logits/chosen": 23111858.90909091, + "logits/rejected": 19078796.307692308, + "logps/chosen": -444.17231889204544, + "logps/rejected": -436.1751051682692, + "loss": 0.1062, + "rewards/chosen": 4.351072484796697, + "rewards/margins": 9.162907847157726, + "rewards/rejected": -4.811835362361028, + "step": 231 + }, + { + "epoch": 0.06358777579827327, + "grad_norm": 15.9375, + "kl": 4.432603359222412, + "learning_rate": 5e-06, + "logits/chosen": 10592062.0, + "logits/rejected": 7002415.0, + "logps/chosen": -493.58642578125, + "logps/rejected": -486.51654052734375, + "loss": 0.0989, + "rewards/chosen": 3.6595046520233154, + "rewards/margins": 7.671621561050415, + "rewards/rejected": -4.0121169090271, + "step": 232 + }, + { + "epoch": 0.06386186103878307, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 22913944.888888888, + "logits/rejected": 78571050.66666667, + "logps/chosen": -396.70545789930554, + "logps/rejected": -588.0014973958333, + "loss": 0.0338, + "rewards/chosen": 3.5782987806532116, + "rewards/margins": 11.110277133517794, + "rewards/rejected": -7.531978352864583, + "step": 233 + }, + { + "epoch": 0.06413594627929287, + "grad_norm": 54.25, + "kl": 5.083621501922607, + "learning_rate": 5e-06, + "logits/chosen": 20707717.818181816, + "logits/rejected": 14602550.153846154, + "logps/chosen": -413.29514382102275, + "logps/rejected": -663.9215745192307, + "loss": 0.0993, + "rewards/chosen": 4.1621104153719815, + "rewards/margins": 9.996567225956417, + "rewards/rejected": -5.834456810584435, + "step": 234 + }, + { + "epoch": 0.06441003151980265, + "grad_norm": 11.0625, + "kl": 10.2030029296875, + "learning_rate": 5e-06, + "logits/chosen": -8643770.133333333, + "logits/rejected": 55987768.88888889, + "logps/chosen": -546.3072265625, + "logps/rejected": -597.9254557291666, + "loss": 0.1066, + "rewards/chosen": 4.503391520182292, + "rewards/margins": 10.501863098144531, + "rewards/rejected": -5.998471577962239, + "step": 235 + }, + { + "epoch": 0.06468411676031245, + "grad_norm": 13.1875, + "kl": 5.806245803833008, + "learning_rate": 5e-06, + "logits/chosen": -5549272.571428572, + "logits/rejected": 12086524.8, + "logps/chosen": -520.2118791852679, + "logps/rejected": -519.3787109375, + "loss": 0.0944, + "rewards/chosen": 4.678683144705636, + "rewards/margins": 9.020408303397042, + "rewards/rejected": -4.341725158691406, + "step": 236 + }, + { + "epoch": 0.06495820200082225, + "grad_norm": 14.875, + "kl": 2.021557331085205, + "learning_rate": 5e-06, + "logits/chosen": 32835833.6, + "logits/rejected": 54510445.71428572, + "logps/chosen": -355.077783203125, + "logps/rejected": -642.2839704241071, + "loss": 0.1259, + "rewards/chosen": 3.2599563598632812, + "rewards/margins": 8.924779074532644, + "rewards/rejected": -5.664822714669364, + "step": 237 + }, + { + "epoch": 0.06523228724133205, + "grad_norm": 10.25, + "kl": 1.1376266479492188, + "learning_rate": 5e-06, + "logits/chosen": -5679687.111111111, + "logits/rejected": 16897065.6, + "logps/chosen": -342.44788953993054, + "logps/rejected": -425.70924479166666, + "loss": 0.1022, + "rewards/chosen": 3.713292015923394, + "rewards/margins": 8.432776472303603, + "rewards/rejected": -4.719484456380209, + "step": 238 + }, + { + "epoch": 0.06550637248184185, + "grad_norm": 10.875, + "kl": 2.6556992530822754, + "learning_rate": 5e-06, + "logits/chosen": 9555662.857142856, + "logits/rejected": 16332102.4, + "logps/chosen": -460.96292550223217, + "logps/rejected": -475.517431640625, + "loss": 0.0799, + "rewards/chosen": 4.528543199811663, + "rewards/margins": 8.05802775791713, + "rewards/rejected": -3.5294845581054686, + "step": 239 + }, + { + "epoch": 0.06578045772235165, + "grad_norm": 9.9375, + "kl": 1.4102262258529663, + "learning_rate": 5e-06, + "logits/chosen": 6603336.0, + "logits/rejected": 28273417.14285714, + "logps/chosen": -545.98671875, + "logps/rejected": -626.6135602678571, + "loss": 0.0451, + "rewards/chosen": 4.669897842407226, + "rewards/margins": 10.963448061261857, + "rewards/rejected": -6.293550218854632, + "step": 240 + }, + { + "epoch": 0.06605454296286145, + "grad_norm": 20.5, + "kl": 3.0436925888061523, + "learning_rate": 5e-06, + "logits/chosen": 38258397.71428572, + "logits/rejected": 19989163.2, + "logps/chosen": -461.94737025669644, + "logps/rejected": -450.12109375, + "loss": 0.0984, + "rewards/chosen": 3.777672358921596, + "rewards/margins": 9.070582362583705, + "rewards/rejected": -5.29291000366211, + "step": 241 + }, + { + "epoch": 0.06632862820337125, + "grad_norm": 12.0, + "kl": 1.8440793752670288, + "learning_rate": 5e-06, + "logits/chosen": 9315063.272727273, + "logits/rejected": 49010131.692307696, + "logps/chosen": -433.865234375, + "logps/rejected": -489.8444260817308, + "loss": 0.0685, + "rewards/chosen": 4.11791298606179, + "rewards/margins": 8.522804953835227, + "rewards/rejected": -4.4048919677734375, + "step": 242 + }, + { + "epoch": 0.06660271344388105, + "grad_norm": 11.4375, + "kl": 6.11643123626709, + "learning_rate": 5e-06, + "logits/chosen": 7375888.0, + "logits/rejected": 48403721.6, + "logps/chosen": -322.82090541294644, + "logps/rejected": -534.0943359375, + "loss": 0.1581, + "rewards/chosen": 3.8321969168526784, + "rewards/margins": 8.205166353498186, + "rewards/rejected": -4.372969436645508, + "step": 243 + }, + { + "epoch": 0.06687679868439085, + "grad_norm": 19.75, + "kl": 14.693050384521484, + "learning_rate": 5e-06, + "logits/chosen": -18356544.0, + "logits/rejected": 36735396.571428575, + "logps/chosen": -461.48816636029414, + "logps/rejected": -792.9794921875, + "loss": 0.1235, + "rewards/chosen": 4.183081682990579, + "rewards/margins": 13.760397967170267, + "rewards/rejected": -9.577316284179688, + "step": 244 + }, + { + "epoch": 0.06715088392490065, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8314821.333333333, + "logits/rejected": -2836730.6666666665, + "logps/chosen": -416.8761393229167, + "logps/rejected": -699.8424479166666, + "loss": 0.1116, + "rewards/chosen": 3.572037696838379, + "rewards/margins": 10.237701733907063, + "rewards/rejected": -6.665664037068685, + "step": 245 + }, + { + "epoch": 0.06742496916541044, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 264759.2, + "logits/rejected": 7193585.714285715, + "logps/chosen": -387.6857177734375, + "logps/rejected": -426.6007603236607, + "loss": 0.0969, + "rewards/chosen": 3.474840545654297, + "rewards/margins": 8.703953988211495, + "rewards/rejected": -5.229113442557199, + "step": 246 + }, + { + "epoch": 0.06769905440592024, + "grad_norm": 13.125, + "kl": 10.30768871307373, + "learning_rate": 5e-06, + "logits/chosen": -14994899.42857143, + "logits/rejected": 5681408.8, + "logps/chosen": -444.95982142857144, + "logps/rejected": -494.2611328125, + "loss": 0.0731, + "rewards/chosen": 4.896151951381138, + "rewards/margins": 9.476132801600865, + "rewards/rejected": -4.579980850219727, + "step": 247 + }, + { + "epoch": 0.06797313964643004, + "grad_norm": 9.5625, + "kl": 3.554717540740967, + "learning_rate": 5e-06, + "logits/chosen": 27912142.769230768, + "logits/rejected": 14734832.0, + "logps/chosen": -372.9149639423077, + "logps/rejected": -640.0835404829545, + "loss": 0.0439, + "rewards/chosen": 4.469609187199519, + "rewards/margins": 11.187395109163297, + "rewards/rejected": -6.717785921963778, + "step": 248 + }, + { + "epoch": 0.06824722488693984, + "grad_norm": 8.3125, + "kl": 0.12672869861125946, + "learning_rate": 5e-06, + "logits/chosen": -16766844.8, + "logits/rejected": 6625658.947368421, + "logps/chosen": -457.173291015625, + "logps/rejected": -439.6635485197368, + "loss": 0.0452, + "rewards/chosen": 5.323111724853516, + "rewards/margins": 9.577124384829872, + "rewards/rejected": -4.254012659976357, + "step": 249 + }, + { + "epoch": 0.06852131012744964, + "grad_norm": 15.3125, + "kl": 1.0597375631332397, + "learning_rate": 5e-06, + "logits/chosen": 9964952.615384616, + "logits/rejected": 40427202.90909091, + "logps/chosen": -394.0656550480769, + "logps/rejected": -606.6855912642045, + "loss": 0.0907, + "rewards/chosen": 2.9479874830979567, + "rewards/margins": 8.501980708195614, + "rewards/rejected": -5.553993225097656, + "step": 250 + }, + { + "epoch": 0.06879539536795944, + "grad_norm": 14.125, + "kl": 6.820850372314453, + "learning_rate": 5e-06, + "logits/chosen": 14734212.266666668, + "logits/rejected": -5203939.555555556, + "logps/chosen": -462.8798828125, + "logps/rejected": -474.47059461805554, + "loss": 0.1135, + "rewards/chosen": 4.326311238606771, + "rewards/margins": 7.977598317464193, + "rewards/rejected": -3.651287078857422, + "step": 251 + }, + { + "epoch": 0.06906948060846924, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11703633.777777778, + "logits/rejected": 3396974.933333333, + "logps/chosen": -461.3015950520833, + "logps/rejected": -385.6204427083333, + "loss": 0.0441, + "rewards/chosen": 4.588537851969401, + "rewards/margins": 9.106887563069662, + "rewards/rejected": -4.51834971110026, + "step": 252 + }, + { + "epoch": 0.06934356584897904, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7985262.4, + "logits/rejected": -798647.7142857143, + "logps/chosen": -449.96728515625, + "logps/rejected": -479.57571847098217, + "loss": 0.101, + "rewards/chosen": 3.827876663208008, + "rewards/margins": 7.801386315482004, + "rewards/rejected": -3.9735096522739957, + "step": 253 + }, + { + "epoch": 0.06961765108948884, + "grad_norm": 10.0625, + "kl": 0.9049758911132812, + "learning_rate": 5e-06, + "logits/chosen": -7319471.2727272725, + "logits/rejected": -3721517.230769231, + "logps/chosen": -533.5440784801136, + "logps/rejected": -414.28455528846155, + "loss": 0.0614, + "rewards/chosen": 4.016970547762784, + "rewards/margins": 8.756836657757525, + "rewards/rejected": -4.7398661099947414, + "step": 254 + }, + { + "epoch": 0.06989173632999864, + "grad_norm": 15.25, + "kl": 7.700930595397949, + "learning_rate": 5e-06, + "logits/chosen": -3431287.2, + "logits/rejected": 2536856.0, + "logps/chosen": -537.388818359375, + "logps/rejected": -405.92818777901783, + "loss": 0.1716, + "rewards/chosen": 5.448218536376953, + "rewards/margins": 9.428494916643416, + "rewards/rejected": -3.9802763802664622, + "step": 255 + }, + { + "epoch": 0.07016582157050842, + "grad_norm": 8.9375, + "kl": 0.2056560516357422, + "learning_rate": 5e-06, + "logits/chosen": 27405304.0, + "logits/rejected": -11109617.333333334, + "logps/chosen": -415.3972981770833, + "logps/rejected": -408.6536458333333, + "loss": 0.0673, + "rewards/chosen": 4.3152875900268555, + "rewards/margins": 9.112414677937824, + "rewards/rejected": -4.79712708791097, + "step": 256 + }, + { + "epoch": 0.07043990681101822, + "grad_norm": 14.125, + "kl": 4.355138778686523, + "learning_rate": 5e-06, + "logits/chosen": -15176532.57142857, + "logits/rejected": 66385024.0, + "logps/chosen": -445.87193080357144, + "logps/rejected": -568.08994140625, + "loss": 0.0579, + "rewards/chosen": 4.3558197021484375, + "rewards/margins": 9.572275161743164, + "rewards/rejected": -5.216455459594727, + "step": 257 + }, + { + "epoch": 0.07071399205152802, + "grad_norm": 13.0, + "kl": 2.433619260787964, + "learning_rate": 5e-06, + "logits/chosen": 14969946.181818182, + "logits/rejected": 48164169.84615385, + "logps/chosen": -427.78426846590907, + "logps/rejected": -479.892578125, + "loss": 0.0777, + "rewards/chosen": 3.599002491344105, + "rewards/margins": 9.995502125133168, + "rewards/rejected": -6.3964996337890625, + "step": 258 + }, + { + "epoch": 0.07098807729203782, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8117113.333333333, + "logits/rejected": 31431960.0, + "logps/chosen": -541.1790364583334, + "logps/rejected": -497.0306803385417, + "loss": 0.0697, + "rewards/chosen": 4.528116226196289, + "rewards/margins": 9.4809144337972, + "rewards/rejected": -4.952798207600911, + "step": 259 + }, + { + "epoch": 0.07126216253254762, + "grad_norm": 17.75, + "kl": 4.295126438140869, + "learning_rate": 5e-06, + "logits/chosen": 40228067.2, + "logits/rejected": 30496992.0, + "logps/chosen": -608.2, + "logps/rejected": -335.3004673549107, + "loss": 0.0771, + "rewards/chosen": 4.677749252319336, + "rewards/margins": 8.161793463570731, + "rewards/rejected": -3.484044211251395, + "step": 260 + }, + { + "epoch": 0.07153624777305742, + "grad_norm": 12.0, + "kl": 0.4326254725456238, + "learning_rate": 5e-06, + "logits/chosen": -1208177.5833333333, + "logits/rejected": 28106890.666666668, + "logps/chosen": -445.582275390625, + "logps/rejected": -575.3586832682291, + "loss": 0.0713, + "rewards/chosen": 4.889801661173503, + "rewards/margins": 11.608488082885742, + "rewards/rejected": -6.718686421712239, + "step": 261 + }, + { + "epoch": 0.07181033301356722, + "grad_norm": 14.5625, + "kl": 1.7306512594223022, + "learning_rate": 5e-06, + "logits/chosen": 23954191.05882353, + "logits/rejected": 20334240.0, + "logps/chosen": -432.1903722426471, + "logps/rejected": -447.7947474888393, + "loss": 0.0858, + "rewards/chosen": 3.393847072825712, + "rewards/margins": 8.54405895201098, + "rewards/rejected": -5.150211879185268, + "step": 262 + }, + { + "epoch": 0.07208441825407702, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4660982.222222222, + "logits/rejected": 16374439.466666667, + "logps/chosen": -541.5124782986111, + "logps/rejected": -461.14391276041664, + "loss": 0.079, + "rewards/chosen": 4.340162489149305, + "rewards/margins": 8.462654452853734, + "rewards/rejected": -4.1224919637044275, + "step": 263 + }, + { + "epoch": 0.07235850349458682, + "grad_norm": 11.5, + "kl": 7.6823811531066895, + "learning_rate": 5e-06, + "logits/chosen": -19549808.0, + "logits/rejected": 18184868.363636363, + "logps/chosen": -436.8807842548077, + "logps/rejected": -430.6495472301136, + "loss": 0.0531, + "rewards/chosen": 6.502193744365986, + "rewards/margins": 12.324811015095744, + "rewards/rejected": -5.822617270729759, + "step": 264 + }, + { + "epoch": 0.07263258873509662, + "grad_norm": 13.1875, + "kl": 1.491803526878357, + "learning_rate": 5e-06, + "logits/chosen": -14613978.181818182, + "logits/rejected": 30286294.153846152, + "logps/chosen": -492.35129616477275, + "logps/rejected": -460.20079627403845, + "loss": 0.0714, + "rewards/chosen": 4.017611763694069, + "rewards/margins": 8.38382798308259, + "rewards/rejected": -4.366216219388521, + "step": 265 + }, + { + "epoch": 0.07290667397560642, + "grad_norm": 15.0, + "kl": 1.1497396230697632, + "learning_rate": 5e-06, + "logits/chosen": 2628845.1428571427, + "logits/rejected": 21416083.2, + "logps/chosen": -417.68603515625, + "logps/rejected": -503.79892578125, + "loss": 0.0904, + "rewards/chosen": 3.371175765991211, + "rewards/margins": 8.741563034057616, + "rewards/rejected": -5.370387268066406, + "step": 266 + }, + { + "epoch": 0.0731807592161162, + "grad_norm": 19.625, + "kl": 6.655999660491943, + "learning_rate": 5e-06, + "logits/chosen": -13081777.142857144, + "logits/rejected": 33984934.4, + "logps/chosen": -528.3924734933036, + "logps/rejected": -472.630029296875, + "loss": 0.1129, + "rewards/chosen": 4.5996246337890625, + "rewards/margins": 7.437141990661621, + "rewards/rejected": -2.8375173568725587, + "step": 267 + }, + { + "epoch": 0.073454844456626, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4866083.0, + "logits/rejected": 10778604.0, + "logps/chosen": -527.2923583984375, + "logps/rejected": -518.109130859375, + "loss": 0.0734, + "rewards/chosen": 4.740788459777832, + "rewards/margins": 10.537458419799805, + "rewards/rejected": -5.796669960021973, + "step": 268 + }, + { + "epoch": 0.0737289296971358, + "grad_norm": 14.4375, + "kl": 5.852717876434326, + "learning_rate": 5e-06, + "logits/chosen": -4066713.3333333335, + "logits/rejected": 55962876.44444445, + "logps/chosen": -566.7399088541666, + "logps/rejected": -428.1861979166667, + "loss": 0.0647, + "rewards/chosen": 4.9399668375651045, + "rewards/margins": 8.698386722140842, + "rewards/rejected": -3.758419884575738, + "step": 269 + }, + { + "epoch": 0.0740030149376456, + "grad_norm": 19.125, + "kl": 9.196998596191406, + "learning_rate": 5e-06, + "logits/chosen": 23252451.76470588, + "logits/rejected": 48162537.14285714, + "logps/chosen": -469.28624770220586, + "logps/rejected": -401.90586635044644, + "loss": 0.1701, + "rewards/chosen": 3.296945908490349, + "rewards/margins": 7.379231108336889, + "rewards/rejected": -4.08228519984654, + "step": 270 + }, + { + "epoch": 0.0742771001781554, + "grad_norm": 15.6875, + "kl": 4.808794021606445, + "learning_rate": 5e-06, + "logits/chosen": 3625887.272727273, + "logits/rejected": -100800.30769230769, + "logps/chosen": -478.80264559659093, + "logps/rejected": -477.2715594951923, + "loss": 0.0999, + "rewards/chosen": 3.762373143976385, + "rewards/margins": 9.350861796132335, + "rewards/rejected": -5.58848865215595, + "step": 271 + }, + { + "epoch": 0.0745511854186652, + "grad_norm": 17.0, + "kl": 8.700464248657227, + "learning_rate": 5e-06, + "logits/chosen": -4990892.307692308, + "logits/rejected": -5112166.545454546, + "logps/chosen": -533.8067157451923, + "logps/rejected": -640.0094992897727, + "loss": 0.0835, + "rewards/chosen": 5.018348106971154, + "rewards/margins": 12.808261631252048, + "rewards/rejected": -7.789913524280895, + "step": 272 + }, + { + "epoch": 0.074825270659175, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2765644.2, + "logits/rejected": 21635337.14285714, + "logps/chosen": -553.750390625, + "logps/rejected": -528.7015904017857, + "loss": 0.0732, + "rewards/chosen": 4.473266983032227, + "rewards/margins": 10.06370964050293, + "rewards/rejected": -5.590442657470703, + "step": 273 + }, + { + "epoch": 0.0750993558996848, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10500308.0, + "logits/rejected": 17166392.0, + "logps/chosen": -517.083251953125, + "logps/rejected": -511.386962890625, + "loss": 0.089, + "rewards/chosen": 4.425843715667725, + "rewards/margins": 10.993432998657227, + "rewards/rejected": -6.567589282989502, + "step": 274 + }, + { + "epoch": 0.0753734411401946, + "grad_norm": 14.0625, + "kl": 2.405853271484375, + "learning_rate": 5e-06, + "logits/chosen": 14553065.846153846, + "logits/rejected": 22566573.09090909, + "logps/chosen": -432.07425631009613, + "logps/rejected": -423.16477272727275, + "loss": 0.0961, + "rewards/chosen": 3.9345098642202525, + "rewards/margins": 9.061872202199655, + "rewards/rejected": -5.127362337979403, + "step": 275 + }, + { + "epoch": 0.0756475263807044, + "grad_norm": 8.5, + "kl": 3.8984687328338623, + "learning_rate": 5e-06, + "logits/chosen": -26578803.2, + "logits/rejected": -11575993.142857144, + "logps/chosen": -551.15302734375, + "logps/rejected": -363.7992466517857, + "loss": 0.0639, + "rewards/chosen": 6.405482482910156, + "rewards/margins": 11.70845227922712, + "rewards/rejected": -5.302969796316964, + "step": 276 + }, + { + "epoch": 0.07592161162121419, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11808154.181818182, + "logits/rejected": -2788896.6153846155, + "logps/chosen": -455.95632102272725, + "logps/rejected": -412.1980543870192, + "loss": 0.0818, + "rewards/chosen": 3.849466497247869, + "rewards/margins": 8.932009503557964, + "rewards/rejected": -5.082543006310096, + "step": 277 + }, + { + "epoch": 0.07619569686172399, + "grad_norm": 10.875, + "kl": 3.6099560260772705, + "learning_rate": 5e-06, + "logits/chosen": -7605600.533333333, + "logits/rejected": 35155687.11111111, + "logps/chosen": -479.7279296875, + "logps/rejected": -448.9381510416667, + "loss": 0.1003, + "rewards/chosen": 4.321465555826823, + "rewards/margins": 8.85787387424045, + "rewards/rejected": -4.536408318413629, + "step": 278 + }, + { + "epoch": 0.07646978210223379, + "grad_norm": 8.375, + "kl": 0.5924161672592163, + "learning_rate": 5e-06, + "logits/chosen": 31867325.714285713, + "logits/rejected": 51111110.4, + "logps/chosen": -574.6766880580357, + "logps/rejected": -674.838916015625, + "loss": 0.0476, + "rewards/chosen": 3.7448556082589284, + "rewards/margins": 11.341829027448382, + "rewards/rejected": -7.596973419189453, + "step": 279 + }, + { + "epoch": 0.07674386734274359, + "grad_norm": 8.125, + "kl": 1.069183349609375, + "learning_rate": 5e-06, + "logits/chosen": -3330772.0, + "logits/rejected": -12238551.384615384, + "logps/chosen": -435.54860617897725, + "logps/rejected": -408.17728365384613, + "loss": 0.0625, + "rewards/chosen": 4.37043588811701, + "rewards/margins": 9.958705635337562, + "rewards/rejected": -5.5882697472205525, + "step": 280 + }, + { + "epoch": 0.07701795258325339, + "grad_norm": 18.625, + "kl": 1.81894052028656, + "learning_rate": 5e-06, + "logits/chosen": 2607350.0, + "logits/rejected": 55268697.6, + "logps/chosen": -408.18526785714283, + "logps/rejected": -407.909375, + "loss": 0.1233, + "rewards/chosen": 3.0901903424944197, + "rewards/margins": 7.35550787789481, + "rewards/rejected": -4.26531753540039, + "step": 281 + }, + { + "epoch": 0.07729203782376319, + "grad_norm": 15.75, + "kl": 3.394258499145508, + "learning_rate": 5e-06, + "logits/chosen": -13108462.545454545, + "logits/rejected": 5535024.0, + "logps/chosen": -357.32865767045456, + "logps/rejected": -484.9636042668269, + "loss": 0.1277, + "rewards/chosen": 4.178504250266335, + "rewards/margins": 9.44068780645624, + "rewards/rejected": -5.262183556189904, + "step": 282 + }, + { + "epoch": 0.07756612306427299, + "grad_norm": 10.125, + "kl": 1.1827621459960938, + "learning_rate": 5e-06, + "logits/chosen": 16676073.6, + "logits/rejected": -2959666.8571428573, + "logps/chosen": -420.728076171875, + "logps/rejected": -385.94747488839283, + "loss": 0.0721, + "rewards/chosen": 3.9352622985839845, + "rewards/margins": 8.179723358154297, + "rewards/rejected": -4.2444610595703125, + "step": 283 + }, + { + "epoch": 0.07784020830478279, + "grad_norm": 14.6875, + "kl": 4.102264404296875, + "learning_rate": 5e-06, + "logits/chosen": -13287650.461538462, + "logits/rejected": 6164178.909090909, + "logps/chosen": -486.5637770432692, + "logps/rejected": -580.0447887073864, + "loss": 0.0624, + "rewards/chosen": 3.797785832331731, + "rewards/margins": 11.2286732280171, + "rewards/rejected": -7.430887395685369, + "step": 284 + }, + { + "epoch": 0.07811429354529259, + "grad_norm": 17.25, + "kl": 4.552162170410156, + "learning_rate": 5e-06, + "logits/chosen": 398062.85714285716, + "logits/rejected": -8424584.0, + "logps/chosen": -435.98221261160717, + "logps/rejected": -315.180029296875, + "loss": 0.1867, + "rewards/chosen": 3.202957970755441, + "rewards/margins": 6.262224633353098, + "rewards/rejected": -3.0592666625976563, + "step": 285 + }, + { + "epoch": 0.07838837878580239, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7470672.8, + "logits/rejected": -3814649.714285714, + "logps/chosen": -490.853857421875, + "logps/rejected": -394.91371372767856, + "loss": 0.0538, + "rewards/chosen": 6.014366912841797, + "rewards/margins": 11.41098153250558, + "rewards/rejected": -5.396614619663784, + "step": 286 + }, + { + "epoch": 0.07866246402631219, + "grad_norm": 10.5, + "kl": 3.856412410736084, + "learning_rate": 5e-06, + "logits/chosen": 2724012.5714285714, + "logits/rejected": 41308249.6, + "logps/chosen": -393.5224609375, + "logps/rejected": -533.0912109375, + "loss": 0.0781, + "rewards/chosen": 4.1764213017054965, + "rewards/margins": 11.120481055123467, + "rewards/rejected": -6.944059753417969, + "step": 287 + }, + { + "epoch": 0.07893654926682198, + "grad_norm": 15.5, + "kl": 5.688349723815918, + "learning_rate": 5e-06, + "logits/chosen": -13112982.666666666, + "logits/rejected": 24606485.333333332, + "logps/chosen": -546.221923828125, + "logps/rejected": -347.6874186197917, + "loss": 0.1332, + "rewards/chosen": 4.418943405151367, + "rewards/margins": 8.380524635314941, + "rewards/rejected": -3.961581230163574, + "step": 288 + }, + { + "epoch": 0.07921063450733178, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3981869.4545454546, + "logits/rejected": -2361308.923076923, + "logps/chosen": -492.59965376420456, + "logps/rejected": -485.8505108173077, + "loss": 0.0659, + "rewards/chosen": 4.404278148304332, + "rewards/margins": 9.444561351429332, + "rewards/rejected": -5.040283203125, + "step": 289 + }, + { + "epoch": 0.07948471974784158, + "grad_norm": 15.875, + "kl": 6.24346923828125, + "learning_rate": 5e-06, + "logits/chosen": 3680886.8571428573, + "logits/rejected": -4384022.4, + "logps/chosen": -497.16660853794644, + "logps/rejected": -324.111474609375, + "loss": 0.1254, + "rewards/chosen": 3.8708904811314175, + "rewards/margins": 6.9683283397129605, + "rewards/rejected": -3.097437858581543, + "step": 290 + }, + { + "epoch": 0.07975880498835138, + "grad_norm": 23.875, + "kl": 10.540879249572754, + "learning_rate": 5e-06, + "logits/chosen": 21684532.70588235, + "logits/rejected": 927929.1428571428, + "logps/chosen": -461.34329044117646, + "logps/rejected": -458.38218470982144, + "loss": 0.1696, + "rewards/chosen": 3.10023655610926, + "rewards/margins": 7.663300041391068, + "rewards/rejected": -4.563063485281808, + "step": 291 + }, + { + "epoch": 0.08003289022886118, + "grad_norm": 12.75, + "kl": 0.9361509084701538, + "learning_rate": 5e-06, + "logits/chosen": 11743655.272727273, + "logits/rejected": 9239201.846153846, + "logps/chosen": -503.54736328125, + "logps/rejected": -563.1848707932693, + "loss": 0.0791, + "rewards/chosen": 4.443031311035156, + "rewards/margins": 10.551200279822716, + "rewards/rejected": -6.10816896878756, + "step": 292 + }, + { + "epoch": 0.08030697546937098, + "grad_norm": 7.1875, + "kl": 4.891231060028076, + "learning_rate": 5e-06, + "logits/chosen": -13001907.692307692, + "logits/rejected": 30297605.818181816, + "logps/chosen": -523.3993013822115, + "logps/rejected": -390.8577769886364, + "loss": 0.0288, + "rewards/chosen": 5.34951899601863, + "rewards/margins": 10.810799125191215, + "rewards/rejected": -5.461280129172585, + "step": 293 + }, + { + "epoch": 0.08058106070988078, + "grad_norm": 12.8125, + "kl": 2.1094970703125, + "learning_rate": 5e-06, + "logits/chosen": -19594950.4, + "logits/rejected": 6164467.428571428, + "logps/chosen": -564.647998046875, + "logps/rejected": -372.67236328125, + "loss": 0.1051, + "rewards/chosen": 5.093234252929688, + "rewards/margins": 8.542799595424107, + "rewards/rejected": -3.4495653424944197, + "step": 294 + }, + { + "epoch": 0.08085514595039058, + "grad_norm": 19.25, + "kl": 4.680455684661865, + "learning_rate": 5e-06, + "logits/chosen": 10780152.533333333, + "logits/rejected": 4866618.666666667, + "logps/chosen": -432.8248046875, + "logps/rejected": -526.0397677951389, + "loss": 0.1247, + "rewards/chosen": 3.477860514322917, + "rewards/margins": 9.723358747694228, + "rewards/rejected": -6.245498233371311, + "step": 295 + }, + { + "epoch": 0.08112923119090037, + "grad_norm": 10.8125, + "kl": 1.2363357543945312, + "learning_rate": 5e-06, + "logits/chosen": 6187233.454545454, + "logits/rejected": -10271917.538461538, + "logps/chosen": -515.5923295454545, + "logps/rejected": -489.28380408653845, + "loss": 0.0458, + "rewards/chosen": 4.562067898837003, + "rewards/margins": 10.348860360525705, + "rewards/rejected": -5.786792461688702, + "step": 296 + }, + { + "epoch": 0.08140331643141017, + "grad_norm": 14.0625, + "kl": 4.663355827331543, + "learning_rate": 5e-06, + "logits/chosen": 7682080.615384615, + "logits/rejected": 45846725.81818182, + "logps/chosen": -469.4047100360577, + "logps/rejected": -339.8621271306818, + "loss": 0.1291, + "rewards/chosen": 4.086775266207182, + "rewards/margins": 7.86245650178069, + "rewards/rejected": -3.7756812355735083, + "step": 297 + }, + { + "epoch": 0.08167740167191996, + "grad_norm": 11.6875, + "kl": 2.9907760620117188, + "learning_rate": 5e-06, + "logits/chosen": 3645667.6363636362, + "logits/rejected": 26743896.615384616, + "logps/chosen": -379.9996448863636, + "logps/rejected": -473.4130108173077, + "loss": 0.0483, + "rewards/chosen": 4.322912042791193, + "rewards/margins": 9.285969340717877, + "rewards/rejected": -4.9630572979266825, + "step": 298 + }, + { + "epoch": 0.08195148691242976, + "grad_norm": 14.125, + "kl": 4.142131805419922, + "learning_rate": 5e-06, + "logits/chosen": 10398168.0, + "logits/rejected": 6104552.615384615, + "logps/chosen": -444.04643110795456, + "logps/rejected": -393.59728064903845, + "loss": 0.1502, + "rewards/chosen": 3.6946293223987925, + "rewards/margins": 8.183096612250054, + "rewards/rejected": -4.488467289851262, + "step": 299 + }, + { + "epoch": 0.08222557215293956, + "grad_norm": 14.8125, + "kl": 8.605305671691895, + "learning_rate": 5e-06, + "logits/chosen": 3454330.933333333, + "logits/rejected": 12275706.666666666, + "logps/chosen": -371.35358072916665, + "logps/rejected": -420.60565863715277, + "loss": 0.1218, + "rewards/chosen": 4.571321105957031, + "rewards/margins": 8.392479621039495, + "rewards/rejected": -3.821158515082465, + "step": 300 + }, + { + "epoch": 0.08249965739344936, + "grad_norm": 7.28125, + "kl": 0.11552556604146957, + "learning_rate": 5e-06, + "logits/chosen": -9031827.636363637, + "logits/rejected": 23478803.692307692, + "logps/chosen": -556.2598987926136, + "logps/rejected": -580.3497596153846, + "loss": 0.0404, + "rewards/chosen": 4.506899053400213, + "rewards/margins": 10.817116690682365, + "rewards/rejected": -6.310217637282151, + "step": 301 + }, + { + "epoch": 0.08277374263395916, + "grad_norm": 10.5625, + "kl": 2.96460223197937, + "learning_rate": 5e-06, + "logits/chosen": 8337247.333333333, + "logits/rejected": 71219349.33333333, + "logps/chosen": -466.9274088541667, + "logps/rejected": -522.2462158203125, + "loss": 0.0647, + "rewards/chosen": 4.609638849894206, + "rewards/margins": 9.838012377421062, + "rewards/rejected": -5.2283735275268555, + "step": 302 + }, + { + "epoch": 0.08304782787446896, + "grad_norm": 9.75, + "kl": 1.0347316265106201, + "learning_rate": 5e-06, + "logits/chosen": -5780146.666666667, + "logits/rejected": 16446557.333333334, + "logps/chosen": -605.89404296875, + "logps/rejected": -491.3582356770833, + "loss": 0.0419, + "rewards/chosen": 4.969841321309407, + "rewards/margins": 10.360783576965332, + "rewards/rejected": -5.390942255655925, + "step": 303 + }, + { + "epoch": 0.08332191311497876, + "grad_norm": 10.875, + "kl": 4.32183837890625, + "learning_rate": 5e-06, + "logits/chosen": 5808237.333333333, + "logits/rejected": 32808330.666666668, + "logps/chosen": -439.6461181640625, + "logps/rejected": -499.350830078125, + "loss": 0.0772, + "rewards/chosen": 3.7525622049967446, + "rewards/margins": 10.664719899495443, + "rewards/rejected": -6.912157694498698, + "step": 304 + }, + { + "epoch": 0.08359599835548856, + "grad_norm": 18.75, + "kl": 10.855632781982422, + "learning_rate": 5e-06, + "logits/chosen": -12223272.615384616, + "logits/rejected": 36265780.36363637, + "logps/chosen": -455.9514723557692, + "logps/rejected": -550.2575461647727, + "loss": 0.1009, + "rewards/chosen": 4.792245718149038, + "rewards/margins": 11.478937002328726, + "rewards/rejected": -6.6866912841796875, + "step": 305 + }, + { + "epoch": 0.08387008359599836, + "grad_norm": 17.75, + "kl": 1.2188060283660889, + "learning_rate": 5e-06, + "logits/chosen": 13440210.0, + "logits/rejected": -12556063.0, + "logps/chosen": -331.47259521484375, + "logps/rejected": -444.20416259765625, + "loss": 0.0939, + "rewards/chosen": 3.523893356323242, + "rewards/margins": 8.299412250518799, + "rewards/rejected": -4.775518894195557, + "step": 306 + }, + { + "epoch": 0.08414416883650816, + "grad_norm": 20.625, + "kl": 4.410481929779053, + "learning_rate": 5e-06, + "logits/chosen": 48077344.0, + "logits/rejected": -985793.2307692308, + "logps/chosen": -461.78573330965907, + "logps/rejected": -475.99815955528845, + "loss": 0.0937, + "rewards/chosen": 3.696801619096236, + "rewards/margins": 8.42352903139341, + "rewards/rejected": -4.726727412297175, + "step": 307 + }, + { + "epoch": 0.08441825407701795, + "grad_norm": 8.0625, + "kl": 3.7214226722717285, + "learning_rate": 5e-06, + "logits/chosen": 8445786.666666666, + "logits/rejected": 6363951.111111111, + "logps/chosen": -419.8125, + "logps/rejected": -481.6486002604167, + "loss": 0.0567, + "rewards/chosen": 5.018906656901041, + "rewards/margins": 10.423315429687499, + "rewards/rejected": -5.404408772786458, + "step": 308 + }, + { + "epoch": 0.08469233931752775, + "grad_norm": 12.9375, + "kl": 1.5139936208724976, + "learning_rate": 5e-06, + "logits/chosen": -8219018.909090909, + "logits/rejected": -16288500.923076924, + "logps/chosen": -399.3187144886364, + "logps/rejected": -517.2471454326923, + "loss": 0.0903, + "rewards/chosen": 3.660085504705256, + "rewards/margins": 8.0196664983576, + "rewards/rejected": -4.359580993652344, + "step": 309 + }, + { + "epoch": 0.08496642455803755, + "grad_norm": 14.0625, + "kl": 9.586676597595215, + "learning_rate": 5e-06, + "logits/chosen": 21250973.714285713, + "logits/rejected": -9885582.4, + "logps/chosen": -479.18899972098217, + "logps/rejected": -363.3520751953125, + "loss": 0.0909, + "rewards/chosen": 4.958606719970703, + "rewards/margins": 9.076781463623046, + "rewards/rejected": -4.118174743652344, + "step": 310 + }, + { + "epoch": 0.08524050979854735, + "grad_norm": 8.6875, + "kl": 9.95068073272705, + "learning_rate": 5e-06, + "logits/chosen": 3080690.933333333, + "logits/rejected": -3955510.222222222, + "logps/chosen": -412.59401041666666, + "logps/rejected": -466.76085069444446, + "loss": 0.0401, + "rewards/chosen": 4.594488525390625, + "rewards/margins": 10.933694797092015, + "rewards/rejected": -6.339206271701389, + "step": 311 + }, + { + "epoch": 0.08551459503905715, + "grad_norm": 7.125, + "kl": 5.563377380371094, + "learning_rate": 5e-06, + "logits/chosen": 1431192.3333333333, + "logits/rejected": -27067.333333333332, + "logps/chosen": -430.8623860677083, + "logps/rejected": -374.1400553385417, + "loss": 0.0432, + "rewards/chosen": 4.691751480102539, + "rewards/margins": 8.68898073832194, + "rewards/rejected": -3.997229258219401, + "step": 312 + }, + { + "epoch": 0.08578868027956695, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 12155381.818181818, + "logits/rejected": 4607276.307692308, + "logps/chosen": -389.10653409090907, + "logps/rejected": -387.4092548076923, + "loss": 0.062, + "rewards/chosen": 4.03867582841353, + "rewards/margins": 9.115452773087508, + "rewards/rejected": -5.076776944673979, + "step": 313 + }, + { + "epoch": 0.08606276552007674, + "grad_norm": 8.75, + "kl": 3.5429065227508545, + "learning_rate": 5e-06, + "logits/chosen": -2829339.111111111, + "logits/rejected": 21292620.8, + "logps/chosen": -475.9368489583333, + "logps/rejected": -629.4335286458333, + "loss": 0.0393, + "rewards/chosen": 5.210927327473958, + "rewards/margins": 12.535593159993489, + "rewards/rejected": -7.324665832519531, + "step": 314 + }, + { + "epoch": 0.08633685076058654, + "grad_norm": 12.375, + "kl": 1.2242025136947632, + "learning_rate": 5e-06, + "logits/chosen": 37832936.0, + "logits/rejected": -28541692.0, + "logps/chosen": -438.2005615234375, + "logps/rejected": -364.8160095214844, + "loss": 0.0806, + "rewards/chosen": 3.253568410873413, + "rewards/margins": 7.023784875869751, + "rewards/rejected": -3.770216464996338, + "step": 315 + }, + { + "epoch": 0.08661093600109634, + "grad_norm": 12.0625, + "kl": 5.899319171905518, + "learning_rate": 5e-06, + "logits/chosen": -7354921.333333333, + "logits/rejected": -6741395.333333333, + "logps/chosen": -420.066650390625, + "logps/rejected": -360.73193359375, + "loss": 0.1111, + "rewards/chosen": 4.450411478678386, + "rewards/margins": 7.956522623697917, + "rewards/rejected": -3.5061111450195312, + "step": 316 + }, + { + "epoch": 0.08688502124160614, + "grad_norm": 14.25, + "kl": 7.771266460418701, + "learning_rate": 5e-06, + "logits/chosen": 7232413.333333333, + "logits/rejected": 88771674.66666667, + "logps/chosen": -513.9732259114584, + "logps/rejected": -639.1307779947916, + "loss": 0.1032, + "rewards/chosen": 4.653367360432942, + "rewards/margins": 13.400952657063801, + "rewards/rejected": -8.74758529663086, + "step": 317 + }, + { + "epoch": 0.08715910648211594, + "grad_norm": 9.125, + "kl": 1.4852209091186523, + "learning_rate": 5e-06, + "logits/chosen": 19518432.0, + "logits/rejected": 18974692.363636363, + "logps/chosen": -499.4041090745192, + "logps/rejected": -503.1810191761364, + "loss": 0.0451, + "rewards/chosen": 5.18667719914363, + "rewards/margins": 9.373006433873744, + "rewards/rejected": -4.186329234730113, + "step": 318 + }, + { + "epoch": 0.08743319172262573, + "grad_norm": 11.5625, + "kl": 9.634284973144531, + "learning_rate": 5e-06, + "logits/chosen": 6825672.615384615, + "logits/rejected": 41824683.63636363, + "logps/chosen": -494.70248647836536, + "logps/rejected": -422.60640092329544, + "loss": 0.0653, + "rewards/chosen": 5.764184805063101, + "rewards/margins": 9.98528777969467, + "rewards/rejected": -4.221102974631569, + "step": 319 + }, + { + "epoch": 0.08770727696313553, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 17818560.0, + "logits/rejected": -8392576.0, + "logps/chosen": -410.58417426215277, + "logps/rejected": -601.182421875, + "loss": 0.0456, + "rewards/chosen": 3.4444783528645835, + "rewards/margins": 8.29544423421224, + "rewards/rejected": -4.850965881347657, + "step": 320 + }, + { + "epoch": 0.08798136220364533, + "grad_norm": 8.1875, + "kl": 7.622042655944824, + "learning_rate": 5e-06, + "logits/chosen": -6843279.0, + "logits/rejected": -7313893.0, + "logps/chosen": -478.805908203125, + "logps/rejected": -499.60430908203125, + "loss": 0.0362, + "rewards/chosen": 5.783178329467773, + "rewards/margins": 11.783013343811035, + "rewards/rejected": -5.999835014343262, + "step": 321 + }, + { + "epoch": 0.08825544744415513, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14580689.6, + "logits/rejected": -800077.0, + "logps/chosen": -418.745361328125, + "logps/rejected": -363.83182198660717, + "loss": 0.0662, + "rewards/chosen": 5.140164947509765, + "rewards/margins": 9.302838897705078, + "rewards/rejected": -4.1626739501953125, + "step": 322 + }, + { + "epoch": 0.08852953268466493, + "grad_norm": 9.875, + "kl": 2.5579657554626465, + "learning_rate": 5e-06, + "logits/chosen": -15571645.090909092, + "logits/rejected": -3645008.6153846155, + "logps/chosen": -529.8484552556819, + "logps/rejected": -430.7838792067308, + "loss": 0.0697, + "rewards/chosen": 4.406594709916548, + "rewards/margins": 9.81196146078043, + "rewards/rejected": -5.405366750863882, + "step": 323 + }, + { + "epoch": 0.08880361792517473, + "grad_norm": 14.125, + "kl": 2.7109484672546387, + "learning_rate": 5e-06, + "logits/chosen": 20186052.8, + "logits/rejected": 1621664.857142857, + "logps/chosen": -400.7422607421875, + "logps/rejected": -555.0231584821429, + "loss": 0.091, + "rewards/chosen": 4.526076126098633, + "rewards/margins": 9.316374588012696, + "rewards/rejected": -4.7902984619140625, + "step": 324 + }, + { + "epoch": 0.08907770316568453, + "grad_norm": 14.75, + "kl": 7.780281066894531, + "learning_rate": 5e-06, + "logits/chosen": 2269228.3076923075, + "logits/rejected": 49807354.18181818, + "logps/chosen": -450.9328049879808, + "logps/rejected": -643.5640536221591, + "loss": 0.0682, + "rewards/chosen": 5.23037602351262, + "rewards/margins": 12.027452855677037, + "rewards/rejected": -6.797076832164418, + "step": 325 + }, + { + "epoch": 0.08935178840619433, + "grad_norm": 10.5, + "kl": 6.505181789398193, + "learning_rate": 5e-06, + "logits/chosen": 7229774.545454546, + "logits/rejected": -1148411.3846153845, + "logps/chosen": -512.1546963778409, + "logps/rejected": -439.0304612379808, + "loss": 0.0775, + "rewards/chosen": 5.463993766091087, + "rewards/margins": 9.893863197806832, + "rewards/rejected": -4.429869431715745, + "step": 326 + }, + { + "epoch": 0.08962587364670413, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5305491.2, + "logits/rejected": -7767329.142857143, + "logps/chosen": -349.3232421875, + "logps/rejected": -531.0228097098214, + "loss": 0.0787, + "rewards/chosen": 3.352826690673828, + "rewards/margins": 8.51670172555106, + "rewards/rejected": -5.163875034877232, + "step": 327 + }, + { + "epoch": 0.08989995888721393, + "grad_norm": 5.875, + "kl": 4.851354122161865, + "learning_rate": 5e-06, + "logits/chosen": -15420618.666666666, + "logits/rejected": -12900296.888888888, + "logps/chosen": -430.53583984375, + "logps/rejected": -498.8820529513889, + "loss": 0.0258, + "rewards/chosen": 5.986392211914063, + "rewards/margins": 12.168555874294704, + "rewards/rejected": -6.182163662380642, + "step": 328 + }, + { + "epoch": 0.09017404412772372, + "grad_norm": 7.28125, + "kl": 3.322453260421753, + "learning_rate": 5e-06, + "logits/chosen": -10699344.0, + "logits/rejected": -3330855.6363636362, + "logps/chosen": -520.7551081730769, + "logps/rejected": -404.9727894176136, + "loss": 0.0491, + "rewards/chosen": 5.226038419283354, + "rewards/margins": 10.896529377757254, + "rewards/rejected": -5.670490958473899, + "step": 329 + }, + { + "epoch": 0.09044812936823352, + "grad_norm": 8.0625, + "kl": 3.718661308288574, + "learning_rate": 5e-06, + "logits/chosen": -22441638.85714286, + "logits/rejected": 26954848.0, + "logps/chosen": -463.52022879464283, + "logps/rejected": -610.670849609375, + "loss": 0.0496, + "rewards/chosen": 5.779016767229352, + "rewards/margins": 11.163347898210798, + "rewards/rejected": -5.384331130981446, + "step": 330 + }, + { + "epoch": 0.09072221460874332, + "grad_norm": 13.75, + "kl": 4.474693298339844, + "learning_rate": 5e-06, + "logits/chosen": -11822338.181818182, + "logits/rejected": 44201353.84615385, + "logps/chosen": -488.7800958806818, + "logps/rejected": -452.4543269230769, + "loss": 0.0783, + "rewards/chosen": 4.970636541193182, + "rewards/margins": 9.45377352521136, + "rewards/rejected": -4.483136984018179, + "step": 331 + }, + { + "epoch": 0.09099629984925311, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18749540.8, + "logits/rejected": 4186261.714285714, + "logps/chosen": -411.421533203125, + "logps/rejected": -601.0364118303571, + "loss": 0.0488, + "rewards/chosen": 4.048043060302734, + "rewards/margins": 10.00490973336356, + "rewards/rejected": -5.9568666730608255, + "step": 332 + }, + { + "epoch": 0.09127038508976291, + "grad_norm": 8.25, + "kl": 0.7423636317253113, + "learning_rate": 5e-06, + "logits/chosen": 3511867.5, + "logits/rejected": -10965819.0, + "logps/chosen": -476.4599914550781, + "logps/rejected": -467.89630126953125, + "loss": 0.0396, + "rewards/chosen": 5.1556243896484375, + "rewards/margins": 10.822744846343994, + "rewards/rejected": -5.667120456695557, + "step": 333 + }, + { + "epoch": 0.09154447033027271, + "grad_norm": 8.75, + "kl": 2.613624095916748, + "learning_rate": 5e-06, + "logits/chosen": -7745174.153846154, + "logits/rejected": -2972929.8181818184, + "logps/chosen": -485.1369816706731, + "logps/rejected": -397.18186257102275, + "loss": 0.0422, + "rewards/chosen": 5.342171302208533, + "rewards/margins": 9.805132672503277, + "rewards/rejected": -4.462961370294744, + "step": 334 + }, + { + "epoch": 0.09181855557078251, + "grad_norm": 8.875, + "kl": 4.095698356628418, + "learning_rate": 5e-06, + "logits/chosen": -3232483.6923076925, + "logits/rejected": 37259424.0, + "logps/chosen": -470.3552433894231, + "logps/rejected": -269.6803089488636, + "loss": 0.0476, + "rewards/chosen": 5.65692138671875, + "rewards/margins": 9.9495981389826, + "rewards/rejected": -4.29267675226385, + "step": 335 + }, + { + "epoch": 0.09209264081129231, + "grad_norm": 15.5, + "kl": 12.096663475036621, + "learning_rate": 5e-06, + "logits/chosen": 4483824.0, + "logits/rejected": 95542129.77777778, + "logps/chosen": -397.64716796875, + "logps/rejected": -354.6735568576389, + "loss": 0.096, + "rewards/chosen": 4.3307342529296875, + "rewards/margins": 6.975193023681641, + "rewards/rejected": -2.644458770751953, + "step": 336 + }, + { + "epoch": 0.09236672605180211, + "grad_norm": 2.578125, + "kl": 0.5759099721908569, + "learning_rate": 5e-06, + "logits/chosen": 25946592.0, + "logits/rejected": -1670124.2857142857, + "logps/chosen": -451.4900390625, + "logps/rejected": -658.5890066964286, + "loss": 0.0069, + "rewards/chosen": 5.735967636108398, + "rewards/margins": 13.225248881748744, + "rewards/rejected": -7.489281245640346, + "step": 337 + }, + { + "epoch": 0.09264081129231191, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5532118.222222222, + "logits/rejected": 16164577.066666666, + "logps/chosen": -517.6430121527778, + "logps/rejected": -457.69951171875, + "loss": 0.073, + "rewards/chosen": 4.865934583875868, + "rewards/margins": 9.911490207248264, + "rewards/rejected": -5.045555623372396, + "step": 338 + }, + { + "epoch": 0.09291489653282171, + "grad_norm": 13.625, + "kl": 8.053850173950195, + "learning_rate": 5e-06, + "logits/chosen": -7704076.307692308, + "logits/rejected": 7311906.181818182, + "logps/chosen": -399.5871769831731, + "logps/rejected": -365.02530184659093, + "loss": 0.1125, + "rewards/chosen": 3.833532480093149, + "rewards/margins": 7.884671618054797, + "rewards/rejected": -4.0511391379616475, + "step": 339 + }, + { + "epoch": 0.0931889817733315, + "grad_norm": 9.4375, + "kl": 2.893493175506592, + "learning_rate": 5e-06, + "logits/chosen": -3969793.8181818184, + "logits/rejected": 72880226.46153846, + "logps/chosen": -455.58589311079544, + "logps/rejected": -526.0353816105769, + "loss": 0.0361, + "rewards/chosen": 5.276725422252309, + "rewards/margins": 12.483305951098462, + "rewards/rejected": -7.206580528846154, + "step": 340 + }, + { + "epoch": 0.0934630670138413, + "grad_norm": 11.125, + "kl": 1.9984945058822632, + "learning_rate": 5e-06, + "logits/chosen": 16299812.266666668, + "logits/rejected": 10699376.0, + "logps/chosen": -448.3462239583333, + "logps/rejected": -519.3725043402778, + "loss": 0.0918, + "rewards/chosen": 4.531538899739584, + "rewards/margins": 10.709219868977865, + "rewards/rejected": -6.177680969238281, + "step": 341 + }, + { + "epoch": 0.0937371522543511, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7061378.4, + "logits/rejected": 8157065.142857143, + "logps/chosen": -421.717626953125, + "logps/rejected": -378.17264229910717, + "loss": 0.0716, + "rewards/chosen": 4.5264839172363285, + "rewards/margins": 10.464900534493584, + "rewards/rejected": -5.938416617257254, + "step": 342 + }, + { + "epoch": 0.0940112374948609, + "grad_norm": 4.9375, + "kl": 5.768405914306641, + "learning_rate": 5e-06, + "logits/chosen": -3019908.8571428573, + "logits/rejected": 5286672.0, + "logps/chosen": -401.21058872767856, + "logps/rejected": -541.14384765625, + "loss": 0.0768, + "rewards/chosen": 4.963448115757534, + "rewards/margins": 12.115760585239956, + "rewards/rejected": -7.152312469482422, + "step": 343 + }, + { + "epoch": 0.0942853227353707, + "grad_norm": 7.71875, + "kl": 4.360863208770752, + "learning_rate": 5e-06, + "logits/chosen": -7032073.230769231, + "logits/rejected": 12070064.0, + "logps/chosen": -555.7020733173077, + "logps/rejected": -363.00026633522725, + "loss": 0.0805, + "rewards/chosen": 5.065423231858474, + "rewards/margins": 8.882967648806272, + "rewards/rejected": -3.8175444169477983, + "step": 344 + }, + { + "epoch": 0.0945594079758805, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9335779.636363637, + "logits/rejected": -2463865.5384615385, + "logps/chosen": -472.43288352272725, + "logps/rejected": -448.21544471153845, + "loss": 0.0653, + "rewards/chosen": 3.725847764448686, + "rewards/margins": 9.116328392829095, + "rewards/rejected": -5.390480628380408, + "step": 345 + }, + { + "epoch": 0.0948334932163903, + "grad_norm": 13.0, + "kl": 4.8338623046875, + "learning_rate": 5e-06, + "logits/chosen": 14069372.307692308, + "logits/rejected": -19770890.181818184, + "logps/chosen": -455.5188176081731, + "logps/rejected": -386.56156782670456, + "loss": 0.1189, + "rewards/chosen": 3.914758535531851, + "rewards/margins": 9.642044094059017, + "rewards/rejected": -5.727285558527166, + "step": 346 + }, + { + "epoch": 0.0951075784569001, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4011559.0, + "logits/rejected": -13196682.666666666, + "logps/chosen": -434.1505533854167, + "logps/rejected": -383.3353678385417, + "loss": 0.0303, + "rewards/chosen": 5.217780431111653, + "rewards/margins": 10.343499501546223, + "rewards/rejected": -5.12571907043457, + "step": 347 + }, + { + "epoch": 0.0953816636974099, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4806686.0, + "logits/rejected": 2501639.4285714286, + "logps/chosen": -505.066943359375, + "logps/rejected": -622.4914899553571, + "loss": 0.0684, + "rewards/chosen": 3.9256961822509764, + "rewards/margins": 10.904704448154995, + "rewards/rejected": -6.979008265904018, + "step": 348 + }, + { + "epoch": 0.0956557489379197, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2373786.8571428573, + "logits/rejected": 3552262.8, + "logps/chosen": -459.16489955357144, + "logps/rejected": -446.603515625, + "loss": 0.0602, + "rewards/chosen": 5.185779571533203, + "rewards/margins": 11.201059341430664, + "rewards/rejected": -6.015279769897461, + "step": 349 + }, + { + "epoch": 0.09592983417842949, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18023790.666666668, + "logits/rejected": -8810012.0, + "logps/chosen": -412.6937255859375, + "logps/rejected": -526.1974690755209, + "loss": 0.1161, + "rewards/chosen": 4.321699778238933, + "rewards/margins": 9.615612665812176, + "rewards/rejected": -5.293912887573242, + "step": 350 + }, + { + "epoch": 0.09620391941893928, + "grad_norm": 16.875, + "kl": 6.75664758682251, + "learning_rate": 5e-06, + "logits/chosen": -13552211.555555556, + "logits/rejected": -9349713.066666666, + "logps/chosen": -493.78955078125, + "logps/rejected": -560.4846354166667, + "loss": 0.0916, + "rewards/chosen": 5.470484839545356, + "rewards/margins": 10.824471961127387, + "rewards/rejected": -5.353987121582032, + "step": 351 + }, + { + "epoch": 0.09647800465944908, + "grad_norm": 17.125, + "kl": 6.74098014831543, + "learning_rate": 5e-06, + "logits/chosen": 10342390.857142856, + "logits/rejected": 188605.7, + "logps/chosen": -450.35951450892856, + "logps/rejected": -388.8828369140625, + "loss": 0.0994, + "rewards/chosen": 4.575883047921317, + "rewards/margins": 8.207985632760185, + "rewards/rejected": -3.6321025848388673, + "step": 352 + }, + { + "epoch": 0.09675208989995888, + "grad_norm": 6.4375, + "kl": 0.013274192810058594, + "learning_rate": 5e-06, + "logits/chosen": 11546056.615384616, + "logits/rejected": 12070858.181818182, + "logps/chosen": -401.8313176081731, + "logps/rejected": -558.6486150568181, + "loss": 0.0315, + "rewards/chosen": 4.768405914306641, + "rewards/margins": 12.035384785045277, + "rewards/rejected": -7.266978870738637, + "step": 353 + }, + { + "epoch": 0.09702617514046868, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9673048.0, + "logits/rejected": 17923089.230769232, + "logps/chosen": -573.50244140625, + "logps/rejected": -538.8931790865385, + "loss": 0.0218, + "rewards/chosen": 5.700440146706321, + "rewards/margins": 13.019058334243882, + "rewards/rejected": -7.31861818753756, + "step": 354 + }, + { + "epoch": 0.09730026038097848, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4220443.0, + "logits/rejected": -12693865.333333334, + "logps/chosen": -594.9004720052084, + "logps/rejected": -612.3279622395834, + "loss": 0.0241, + "rewards/chosen": 6.699169794718425, + "rewards/margins": 14.716889063517254, + "rewards/rejected": -8.017719268798828, + "step": 355 + }, + { + "epoch": 0.09757434562148828, + "grad_norm": 16.0, + "kl": 4.908015251159668, + "learning_rate": 5e-06, + "logits/chosen": -3031238.933333333, + "logits/rejected": 20635235.555555556, + "logps/chosen": -375.221484375, + "logps/rejected": -369.5971408420139, + "loss": 0.0809, + "rewards/chosen": 3.823686218261719, + "rewards/margins": 9.226395331488716, + "rewards/rejected": -5.402709113226996, + "step": 356 + }, + { + "epoch": 0.09784843086199808, + "grad_norm": 13.5625, + "kl": 11.671039581298828, + "learning_rate": 5e-06, + "logits/chosen": 8864902.0, + "logits/rejected": -8920805.0, + "logps/chosen": -485.20489501953125, + "logps/rejected": -727.4148559570312, + "loss": 0.1141, + "rewards/chosen": 4.861318588256836, + "rewards/margins": 10.816500663757324, + "rewards/rejected": -5.955182075500488, + "step": 357 + }, + { + "epoch": 0.09812251610250788, + "grad_norm": 20.0, + "kl": 15.707134246826172, + "learning_rate": 5e-06, + "logits/chosen": -2437111.0588235296, + "logits/rejected": 3111564.8571428573, + "logps/chosen": -506.63005514705884, + "logps/rejected": -311.2676478794643, + "loss": 0.1165, + "rewards/chosen": 4.43244395536535, + "rewards/margins": 8.544677798487559, + "rewards/rejected": -4.11223384312221, + "step": 358 + }, + { + "epoch": 0.09839660134301768, + "grad_norm": 9.125, + "kl": 3.163815975189209, + "learning_rate": 5e-06, + "logits/chosen": -736939.2, + "logits/rejected": -4855220.444444444, + "logps/chosen": -442.7157877604167, + "logps/rejected": -452.0954318576389, + "loss": 0.0473, + "rewards/chosen": 4.398774210611979, + "rewards/margins": 8.719865332709418, + "rewards/rejected": -4.321091122097439, + "step": 359 + }, + { + "epoch": 0.09867068658352748, + "grad_norm": 5.3125, + "kl": 4.461540222167969, + "learning_rate": 5e-06, + "logits/chosen": 5391864.7272727275, + "logits/rejected": 54569398.15384615, + "logps/chosen": -441.72367720170456, + "logps/rejected": -1029.5927734375, + "loss": 0.0507, + "rewards/chosen": 5.646115736527876, + "rewards/margins": 14.262811434018861, + "rewards/rejected": -8.616695697490986, + "step": 360 + }, + { + "epoch": 0.09894477182403727, + "grad_norm": 9.9375, + "kl": 2.9127261638641357, + "learning_rate": 5e-06, + "logits/chosen": -1583598.7692307692, + "logits/rejected": 11708122.181818182, + "logps/chosen": -408.72352013221155, + "logps/rejected": -484.66202059659093, + "loss": 0.0456, + "rewards/chosen": 4.164166083702674, + "rewards/margins": 9.989671427053171, + "rewards/rejected": -5.825505343350497, + "step": 361 + }, + { + "epoch": 0.09921885706454707, + "grad_norm": 18.25, + "kl": 8.243597984313965, + "learning_rate": 5e-06, + "logits/chosen": 3505608.470588235, + "logits/rejected": 4076946.8571428573, + "logps/chosen": -407.02355238970586, + "logps/rejected": -664.6607840401786, + "loss": 0.1496, + "rewards/chosen": 3.9354207655962776, + "rewards/margins": 10.993010705258666, + "rewards/rejected": -7.057589939662388, + "step": 362 + }, + { + "epoch": 0.09949294230505687, + "grad_norm": 10.0625, + "kl": 8.164571762084961, + "learning_rate": 5e-06, + "logits/chosen": -17709230.545454547, + "logits/rejected": 1551192.6153846155, + "logps/chosen": -503.4070933948864, + "logps/rejected": -544.8983999399038, + "loss": 0.0341, + "rewards/chosen": 5.116266424005682, + "rewards/margins": 11.08907819627882, + "rewards/rejected": -5.972811772273137, + "step": 363 + }, + { + "epoch": 0.09976702754556667, + "grad_norm": 14.0625, + "kl": 3.1012039184570312, + "learning_rate": 5e-06, + "logits/chosen": -14781931.076923076, + "logits/rejected": -14546539.636363637, + "logps/chosen": -476.02591646634613, + "logps/rejected": -451.92684659090907, + "loss": 0.0632, + "rewards/chosen": 5.36757073035607, + "rewards/margins": 10.126421868384302, + "rewards/rejected": -4.7588511380282315, + "step": 364 + }, + { + "epoch": 0.10004111278607647, + "grad_norm": 15.875, + "kl": 4.846736431121826, + "learning_rate": 5e-06, + "logits/chosen": 9237574.285714285, + "logits/rejected": 3976045.6, + "logps/chosen": -458.769287109375, + "logps/rejected": -459.04560546875, + "loss": 0.0838, + "rewards/chosen": 4.842789786202567, + "rewards/margins": 9.43100313459124, + "rewards/rejected": -4.588213348388672, + "step": 365 + }, + { + "epoch": 0.10031519802658627, + "grad_norm": 7.78125, + "kl": 3.1894302368164062, + "learning_rate": 5e-06, + "logits/chosen": -22309379.555555556, + "logits/rejected": -5540848.0, + "logps/chosen": -458.9069552951389, + "logps/rejected": -466.1834309895833, + "loss": 0.0363, + "rewards/chosen": 4.928116268581814, + "rewards/margins": 9.858646053738063, + "rewards/rejected": -4.93052978515625, + "step": 366 + }, + { + "epoch": 0.10058928326709607, + "grad_norm": 21.5, + "kl": 9.371637344360352, + "learning_rate": 5e-06, + "logits/chosen": 12825353.846153846, + "logits/rejected": 12731824.0, + "logps/chosen": -314.9335186298077, + "logps/rejected": -421.6270862926136, + "loss": 0.1408, + "rewards/chosen": 4.004116351787861, + "rewards/margins": 7.789084294459203, + "rewards/rejected": -3.7849679426713423, + "step": 367 + }, + { + "epoch": 0.10086336850760587, + "grad_norm": 19.75, + "kl": 5.2935709953308105, + "learning_rate": 5e-06, + "logits/chosen": 6325059.636363637, + "logits/rejected": 11292257.23076923, + "logps/chosen": -461.5269886363636, + "logps/rejected": -477.5490910456731, + "loss": 0.1315, + "rewards/chosen": 4.666685624556108, + "rewards/margins": 8.124520308487899, + "rewards/rejected": -3.457834683931791, + "step": 368 + }, + { + "epoch": 0.10113745374811567, + "grad_norm": 7.96875, + "kl": 3.227548599243164, + "learning_rate": 5e-06, + "logits/chosen": -4774811.2, + "logits/rejected": 321147.1111111111, + "logps/chosen": -373.49095052083334, + "logps/rejected": -469.25889756944446, + "loss": 0.053, + "rewards/chosen": 4.257208251953125, + "rewards/margins": 10.302008480495877, + "rewards/rejected": -6.044800228542751, + "step": 369 + }, + { + "epoch": 0.10141153898862547, + "grad_norm": 10.9375, + "kl": 3.425731658935547, + "learning_rate": 5e-06, + "logits/chosen": 4618500.0, + "logits/rejected": -10885377.23076923, + "logps/chosen": -399.13924893465907, + "logps/rejected": -634.8315805288462, + "loss": 0.035, + "rewards/chosen": 5.7440032958984375, + "rewards/margins": 11.910483140211838, + "rewards/rejected": -6.166479844313401, + "step": 370 + }, + { + "epoch": 0.10168562422913525, + "grad_norm": 7.53125, + "kl": 2.958395004272461, + "learning_rate": 5e-06, + "logits/chosen": 2868309.8181818184, + "logits/rejected": 7781420.923076923, + "logps/chosen": -556.6782670454545, + "logps/rejected": -494.76419771634613, + "loss": 0.0354, + "rewards/chosen": 7.192312067205256, + "rewards/margins": 12.871020657199246, + "rewards/rejected": -5.67870858999399, + "step": 371 + }, + { + "epoch": 0.10195970946964505, + "grad_norm": 9.625, + "kl": 4.565187454223633, + "learning_rate": 5e-06, + "logits/chosen": 2550867.2, + "logits/rejected": -9135171.42857143, + "logps/chosen": -458.90771484375, + "logps/rejected": -357.43980189732144, + "loss": 0.0967, + "rewards/chosen": 5.349785995483399, + "rewards/margins": 9.459080614362446, + "rewards/rejected": -4.109294618879046, + "step": 372 + }, + { + "epoch": 0.10223379471015485, + "grad_norm": 19.375, + "kl": 13.114043235778809, + "learning_rate": 5e-06, + "logits/chosen": 16627749.333333334, + "logits/rejected": 4657855.111111111, + "logps/chosen": -481.7962239583333, + "logps/rejected": -451.67377387152777, + "loss": 0.1969, + "rewards/chosen": 5.295735168457031, + "rewards/margins": 8.98633100721571, + "rewards/rejected": -3.6905958387586804, + "step": 373 + }, + { + "epoch": 0.10250787995066465, + "grad_norm": 9.625, + "kl": 13.949646949768066, + "learning_rate": 5e-06, + "logits/chosen": -11078638.76923077, + "logits/rejected": 20588440.727272727, + "logps/chosen": -468.18348106971155, + "logps/rejected": -531.9199662642045, + "loss": 0.0586, + "rewards/chosen": 6.2476348876953125, + "rewards/margins": 11.495819785378195, + "rewards/rejected": -5.248184897682884, + "step": 374 + }, + { + "epoch": 0.10278196519117445, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -216040.0, + "logits/rejected": -15845426.285714285, + "logps/chosen": -536.52978515625, + "logps/rejected": -549.4783761160714, + "loss": 0.0278, + "rewards/chosen": 5.194533920288086, + "rewards/margins": 11.767646953037808, + "rewards/rejected": -6.573113032749721, + "step": 375 + }, + { + "epoch": 0.10305605043168425, + "grad_norm": 8.4375, + "kl": 5.845787048339844, + "learning_rate": 5e-06, + "logits/chosen": 3430472.3333333335, + "logits/rejected": -37479450.666666664, + "logps/chosen": -467.2042236328125, + "logps/rejected": -578.019775390625, + "loss": 0.0392, + "rewards/chosen": 4.964664459228516, + "rewards/margins": 11.174613952636719, + "rewards/rejected": -6.209949493408203, + "step": 376 + }, + { + "epoch": 0.10333013567219405, + "grad_norm": 10.3125, + "kl": 2.199061155319214, + "learning_rate": 5e-06, + "logits/chosen": 4752562.133333334, + "logits/rejected": -7086947.555555556, + "logps/chosen": -411.28430989583336, + "logps/rejected": -539.9830729166666, + "loss": 0.0641, + "rewards/chosen": 5.188321431477864, + "rewards/margins": 10.399413638644749, + "rewards/rejected": -5.2110922071668835, + "step": 377 + }, + { + "epoch": 0.10360422091270385, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17745344.0, + "logits/rejected": 13748669.866666667, + "logps/chosen": -308.31456163194446, + "logps/rejected": -532.8786458333333, + "loss": 0.0593, + "rewards/chosen": 3.535329394870334, + "rewards/margins": 9.471394051445856, + "rewards/rejected": -5.936064656575521, + "step": 378 + }, + { + "epoch": 0.10387830615321365, + "grad_norm": 11.25, + "kl": 6.540373802185059, + "learning_rate": 5e-06, + "logits/chosen": -3510293.066666667, + "logits/rejected": -641320.0, + "logps/chosen": -440.75677083333335, + "logps/rejected": -364.64374457465277, + "loss": 0.0737, + "rewards/chosen": 4.4596099853515625, + "rewards/margins": 8.87327660454644, + "rewards/rejected": -4.413666619194879, + "step": 379 + }, + { + "epoch": 0.10415239139372345, + "grad_norm": 13.0625, + "kl": 6.113406181335449, + "learning_rate": 5e-06, + "logits/chosen": 20384891.636363637, + "logits/rejected": 10872384.0, + "logps/chosen": -454.14346590909093, + "logps/rejected": -497.1989933894231, + "loss": 0.0712, + "rewards/chosen": 5.2230307839133525, + "rewards/margins": 11.822215847202115, + "rewards/rejected": -6.599185063288762, + "step": 380 + }, + { + "epoch": 0.10442647663423325, + "grad_norm": 12.5, + "kl": 12.40969467163086, + "learning_rate": 5e-06, + "logits/chosen": 3223406.933333333, + "logits/rejected": 38076576.0, + "logps/chosen": -620.93515625, + "logps/rejected": -421.50355360243054, + "loss": 0.0962, + "rewards/chosen": 5.6459503173828125, + "rewards/margins": 10.100438435872395, + "rewards/rejected": -4.454488118489583, + "step": 381 + }, + { + "epoch": 0.10470056187474304, + "grad_norm": 14.0625, + "kl": 9.922893524169922, + "learning_rate": 5e-06, + "logits/chosen": 1282458.857142857, + "logits/rejected": -3058581.8, + "logps/chosen": -430.85693359375, + "logps/rejected": -533.255615234375, + "loss": 0.0808, + "rewards/chosen": 5.612513950892857, + "rewards/margins": 12.721558598109652, + "rewards/rejected": -7.1090446472167965, + "step": 382 + }, + { + "epoch": 0.10497464711525284, + "grad_norm": 9.3125, + "kl": 4.028563499450684, + "learning_rate": 5e-06, + "logits/chosen": -12041236.57142857, + "logits/rejected": 12947410.4, + "logps/chosen": -368.94984654017856, + "logps/rejected": -544.012451171875, + "loss": 0.0633, + "rewards/chosen": 4.221489225115095, + "rewards/margins": 10.779545865740094, + "rewards/rejected": -6.558056640625, + "step": 383 + }, + { + "epoch": 0.10524873235576264, + "grad_norm": 3.0, + "kl": 0.9948133230209351, + "learning_rate": 5e-06, + "logits/chosen": -7333005.090909091, + "logits/rejected": -5737601.230769231, + "logps/chosen": -497.0949041193182, + "logps/rejected": -588.9089167668269, + "loss": 0.0118, + "rewards/chosen": 5.599761616099965, + "rewards/margins": 12.479461910007718, + "rewards/rejected": -6.879700293907752, + "step": 384 + }, + { + "epoch": 0.10552281759627244, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6530182.0, + "logits/rejected": 3827299.4285714286, + "logps/chosen": -413.419287109375, + "logps/rejected": -413.5795200892857, + "loss": 0.0945, + "rewards/chosen": 5.414896774291992, + "rewards/margins": 9.934168352399553, + "rewards/rejected": -4.519271578107562, + "step": 385 + }, + { + "epoch": 0.10579690283678224, + "grad_norm": 16.625, + "kl": 7.2806854248046875, + "learning_rate": 5e-06, + "logits/chosen": 10021854.545454545, + "logits/rejected": 5942677.538461538, + "logps/chosen": -476.0545099431818, + "logps/rejected": -370.1890399639423, + "loss": 0.17, + "rewards/chosen": 4.42681260542436, + "rewards/margins": 8.896024770670003, + "rewards/rejected": -4.469212165245643, + "step": 386 + }, + { + "epoch": 0.10607098807729204, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1108325.142857143, + "logits/rejected": -17699843.76470588, + "logps/chosen": -317.35226004464283, + "logps/rejected": -507.4276769301471, + "loss": 0.0458, + "rewards/chosen": 3.9186297825404575, + "rewards/margins": 8.654878023291836, + "rewards/rejected": -4.736248240751379, + "step": 387 + }, + { + "epoch": 0.10634507331780184, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9803357.818181818, + "logits/rejected": -19441282.46153846, + "logps/chosen": -496.7515980113636, + "logps/rejected": -562.0519080528846, + "loss": 0.0696, + "rewards/chosen": 5.582048242742365, + "rewards/margins": 12.599089962619168, + "rewards/rejected": -7.0170417198768025, + "step": 388 + }, + { + "epoch": 0.10661915855831164, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3630399.5, + "logits/rejected": 18051842.0, + "logps/chosen": -442.54681396484375, + "logps/rejected": -481.6558532714844, + "loss": 0.0465, + "rewards/chosen": 4.568587303161621, + "rewards/margins": 10.411843299865723, + "rewards/rejected": -5.843255996704102, + "step": 389 + }, + { + "epoch": 0.10689324379882144, + "grad_norm": 9.25, + "kl": 1.1936544179916382, + "learning_rate": 5e-06, + "logits/chosen": 4993397.333333333, + "logits/rejected": 3522146.1333333333, + "logps/chosen": -459.7312282986111, + "logps/rejected": -415.7174479166667, + "loss": 0.067, + "rewards/chosen": 4.6838573879665795, + "rewards/margins": 9.4553712632921, + "rewards/rejected": -4.771513875325521, + "step": 390 + }, + { + "epoch": 0.10716732903933124, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10646230.4, + "logits/rejected": 7371471.428571428, + "logps/chosen": -334.169580078125, + "logps/rejected": -562.5219377790179, + "loss": 0.0868, + "rewards/chosen": 3.3019550323486326, + "rewards/margins": 10.208338001796177, + "rewards/rejected": -6.906382969447544, + "step": 391 + }, + { + "epoch": 0.10744141427984102, + "grad_norm": 7.75, + "kl": 3.874619960784912, + "learning_rate": 5e-06, + "logits/chosen": -14227780.363636363, + "logits/rejected": -524540.0, + "logps/chosen": -513.6878107244319, + "logps/rejected": -361.2426006610577, + "loss": 0.0451, + "rewards/chosen": 5.7338738874955615, + "rewards/margins": 10.015842224334504, + "rewards/rejected": -4.2819683368389425, + "step": 392 + }, + { + "epoch": 0.10771549952035082, + "grad_norm": 9.5, + "kl": 9.538880348205566, + "learning_rate": 5e-06, + "logits/chosen": -1072111.142857143, + "logits/rejected": 60120844.8, + "logps/chosen": -527.1409737723214, + "logps/rejected": -619.152880859375, + "loss": 0.1049, + "rewards/chosen": 5.260035378592355, + "rewards/margins": 12.980886513846261, + "rewards/rejected": -7.720851135253906, + "step": 393 + }, + { + "epoch": 0.10798958476086062, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 16458166.4, + "logits/rejected": 3664633.714285714, + "logps/chosen": -384.166552734375, + "logps/rejected": -562.4048549107143, + "loss": 0.0635, + "rewards/chosen": 3.8510818481445312, + "rewards/margins": 10.844894409179688, + "rewards/rejected": -6.993812561035156, + "step": 394 + }, + { + "epoch": 0.10826367000137042, + "grad_norm": 12.625, + "kl": 3.019012451171875, + "learning_rate": 5e-06, + "logits/chosen": -10347966.666666666, + "logits/rejected": 25305970.666666668, + "logps/chosen": -439.0441080729167, + "logps/rejected": -666.35107421875, + "loss": 0.0689, + "rewards/chosen": 4.786215464274089, + "rewards/margins": 11.87702751159668, + "rewards/rejected": -7.090812047322591, + "step": 395 + }, + { + "epoch": 0.10853775524188022, + "grad_norm": 10.75, + "kl": 0.815506637096405, + "learning_rate": 5e-06, + "logits/chosen": 9216539.636363637, + "logits/rejected": 6513662.153846154, + "logps/chosen": -547.1751154119319, + "logps/rejected": -521.7282527043269, + "loss": 0.0557, + "rewards/chosen": 3.9793451482599433, + "rewards/margins": 9.452274295833561, + "rewards/rejected": -5.472929147573618, + "step": 396 + }, + { + "epoch": 0.10881184048239002, + "grad_norm": 11.125, + "kl": 2.2020812034606934, + "learning_rate": 5e-06, + "logits/chosen": -27553773.333333332, + "logits/rejected": 15273142.666666666, + "logps/chosen": -424.37255859375, + "logps/rejected": -491.4037272135417, + "loss": 0.0638, + "rewards/chosen": 4.9065901438395185, + "rewards/margins": 10.74013392130534, + "rewards/rejected": -5.83354377746582, + "step": 397 + }, + { + "epoch": 0.10908592572289982, + "grad_norm": 13.9375, + "kl": 10.4578857421875, + "learning_rate": 5e-06, + "logits/chosen": 1187757.8181818181, + "logits/rejected": -5323793.846153846, + "logps/chosen": -409.53879616477275, + "logps/rejected": -444.86733774038464, + "loss": 0.116, + "rewards/chosen": 4.526841597123579, + "rewards/margins": 11.752539041159036, + "rewards/rejected": -7.225697444035457, + "step": 398 + }, + { + "epoch": 0.10936001096340962, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -666236.6666666666, + "logits/rejected": -1785195.6666666667, + "logps/chosen": -483.5561930338542, + "logps/rejected": -387.2552490234375, + "loss": 0.08, + "rewards/chosen": 4.0673214594523115, + "rewards/margins": 7.719374656677246, + "rewards/rejected": -3.652053197224935, + "step": 399 + }, + { + "epoch": 0.10963409620391942, + "grad_norm": 10.375, + "kl": 2.1788525581359863, + "learning_rate": 5e-06, + "logits/chosen": -2113.6666666666665, + "logits/rejected": -3318004.0, + "logps/chosen": -404.5034993489583, + "logps/rejected": -498.3372395833333, + "loss": 0.0786, + "rewards/chosen": 4.300940831502278, + "rewards/margins": 9.744958877563477, + "rewards/rejected": -5.444018046061198, + "step": 400 + }, + { + "epoch": 0.10990818144442922, + "grad_norm": 17.75, + "kl": 2.4937453269958496, + "learning_rate": 5e-06, + "logits/chosen": 10246278.0, + "logits/rejected": 1525123.6666666667, + "logps/chosen": -432.0513102213542, + "logps/rejected": -441.5913899739583, + "loss": 0.1199, + "rewards/chosen": 3.398935317993164, + "rewards/margins": 8.591543515523274, + "rewards/rejected": -5.19260819753011, + "step": 401 + }, + { + "epoch": 0.11018226668493901, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3207872.3636363638, + "logits/rejected": 25495515.076923076, + "logps/chosen": -539.0189098011364, + "logps/rejected": -475.51600060096155, + "loss": 0.0448, + "rewards/chosen": 5.247100830078125, + "rewards/margins": 12.623350290151743, + "rewards/rejected": -7.376249460073618, + "step": 402 + }, + { + "epoch": 0.11045635192544881, + "grad_norm": 8.5, + "kl": 3.560702085494995, + "learning_rate": 5e-06, + "logits/chosen": -17612080.0, + "logits/rejected": 8707564.307692308, + "logps/chosen": -476.72998046875, + "logps/rejected": -318.03673377403845, + "loss": 0.0616, + "rewards/chosen": 4.386832150545987, + "rewards/margins": 8.622750342309057, + "rewards/rejected": -4.235918191763071, + "step": 403 + }, + { + "epoch": 0.11073043716595861, + "grad_norm": 19.75, + "kl": 14.514243125915527, + "learning_rate": 5e-06, + "logits/chosen": -7514420.0, + "logits/rejected": 17417088.0, + "logps/chosen": -467.595703125, + "logps/rejected": -312.0213317871094, + "loss": 0.1554, + "rewards/chosen": 5.004546165466309, + "rewards/margins": 7.014521598815918, + "rewards/rejected": -2.0099754333496094, + "step": 404 + }, + { + "epoch": 0.11100452240646841, + "grad_norm": 14.3125, + "kl": 5.245902061462402, + "learning_rate": 5e-06, + "logits/chosen": -5744848.0, + "logits/rejected": 5922108.4, + "logps/chosen": -408.44594029017856, + "logps/rejected": -467.51474609375, + "loss": 0.1229, + "rewards/chosen": 4.717265537806919, + "rewards/margins": 10.405254200526645, + "rewards/rejected": -5.687988662719727, + "step": 405 + }, + { + "epoch": 0.11127860764697821, + "grad_norm": 5.25, + "kl": 1.5991802215576172, + "learning_rate": 5e-06, + "logits/chosen": 3106993.4, + "logits/rejected": 42130907.428571425, + "logps/chosen": -406.222802734375, + "logps/rejected": -589.76806640625, + "loss": 0.0203, + "rewards/chosen": 5.550785827636719, + "rewards/margins": 12.39765134538923, + "rewards/rejected": -6.846865517752511, + "step": 406 + }, + { + "epoch": 0.11155269288748801, + "grad_norm": 8.9375, + "kl": 6.116461753845215, + "learning_rate": 5e-06, + "logits/chosen": -430017.14285714284, + "logits/rejected": 3349842.2, + "logps/chosen": -333.46407645089283, + "logps/rejected": -369.2785888671875, + "loss": 0.0973, + "rewards/chosen": 4.202733993530273, + "rewards/margins": 8.159285354614259, + "rewards/rejected": -3.956551361083984, + "step": 407 + }, + { + "epoch": 0.11182677812799781, + "grad_norm": 10.5, + "kl": 8.523906707763672, + "learning_rate": 5e-06, + "logits/chosen": -15628277.0, + "logits/rejected": -6887793.0, + "logps/chosen": -471.76092529296875, + "logps/rejected": -359.3701477050781, + "loss": 0.1024, + "rewards/chosen": 5.244580268859863, + "rewards/margins": 10.839628219604492, + "rewards/rejected": -5.595047950744629, + "step": 408 + }, + { + "epoch": 0.11210086336850761, + "grad_norm": 13.4375, + "kl": 2.967818021774292, + "learning_rate": 5e-06, + "logits/chosen": -15955392.0, + "logits/rejected": -7449380.923076923, + "logps/chosen": -482.66428444602275, + "logps/rejected": -599.7571364182693, + "loss": 0.0654, + "rewards/chosen": 4.474745663729581, + "rewards/margins": 11.295598090111792, + "rewards/rejected": -6.820852426382212, + "step": 409 + }, + { + "epoch": 0.11237494860901741, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4290071.555555556, + "logits/rejected": -6525326.933333334, + "logps/chosen": -411.1657986111111, + "logps/rejected": -472.92975260416665, + "loss": 0.0119, + "rewards/chosen": 7.72343275282118, + "rewards/margins": 14.659793429904514, + "rewards/rejected": -6.936360677083333, + "step": 410 + }, + { + "epoch": 0.11264903384952721, + "grad_norm": 13.3125, + "kl": 4.662321090698242, + "learning_rate": 5e-06, + "logits/chosen": 4655643.384615385, + "logits/rejected": -6582004.363636363, + "logps/chosen": -489.75826322115387, + "logps/rejected": -490.99995561079544, + "loss": 0.0662, + "rewards/chosen": 4.497153062086839, + "rewards/margins": 10.713255662184494, + "rewards/rejected": -6.216102600097656, + "step": 411 + }, + { + "epoch": 0.11292311909003701, + "grad_norm": 11.5, + "kl": 4.333498001098633, + "learning_rate": 5e-06, + "logits/chosen": -13396376.615384616, + "logits/rejected": -19145629.09090909, + "logps/chosen": -375.6730769230769, + "logps/rejected": -495.4718572443182, + "loss": 0.065, + "rewards/chosen": 4.358804556039663, + "rewards/margins": 10.14560632772379, + "rewards/rejected": -5.786801771684126, + "step": 412 + }, + { + "epoch": 0.1131972043305468, + "grad_norm": 12.9375, + "kl": 1.5558459758758545, + "learning_rate": 5e-06, + "logits/chosen": -1488176.6153846155, + "logits/rejected": -13841104.0, + "logps/chosen": -495.03579477163464, + "logps/rejected": -599.1341441761364, + "loss": 0.0543, + "rewards/chosen": 5.21119866004357, + "rewards/margins": 11.875739731155075, + "rewards/rejected": -6.664541071111506, + "step": 413 + }, + { + "epoch": 0.1134712895710566, + "grad_norm": 10.0625, + "kl": 0.3039652705192566, + "learning_rate": 5e-06, + "logits/chosen": -7698110.0, + "logits/rejected": 1821611.5, + "logps/chosen": -397.0900573730469, + "logps/rejected": -531.03076171875, + "loss": 0.0456, + "rewards/chosen": 4.839269161224365, + "rewards/margins": 11.225603580474854, + "rewards/rejected": -6.386334419250488, + "step": 414 + }, + { + "epoch": 0.1137453748115664, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19270824.888888888, + "logits/rejected": 18100196.266666666, + "logps/chosen": -528.3715277777778, + "logps/rejected": -592.5204427083333, + "loss": 0.0282, + "rewards/chosen": 4.957862430148655, + "rewards/margins": 12.618770429823133, + "rewards/rejected": -7.660907999674479, + "step": 415 + }, + { + "epoch": 0.1140194600520762, + "grad_norm": 12.4375, + "kl": 1.437565565109253, + "learning_rate": 5e-06, + "logits/chosen": 3215716.4444444445, + "logits/rejected": 9945175.466666667, + "logps/chosen": -336.6449381510417, + "logps/rejected": -516.6180989583333, + "loss": 0.1174, + "rewards/chosen": 2.7909374237060547, + "rewards/margins": 7.929811223347982, + "rewards/rejected": -5.138873799641927, + "step": 416 + }, + { + "epoch": 0.114293545292586, + "grad_norm": 8.0, + "kl": 3.5195860862731934, + "learning_rate": 5e-06, + "logits/chosen": 18414855.272727273, + "logits/rejected": 32377708.307692308, + "logps/chosen": -623.5820756392045, + "logps/rejected": -731.8547175480769, + "loss": 0.0258, + "rewards/chosen": 6.177843960848722, + "rewards/margins": 17.116611614093912, + "rewards/rejected": -10.938767653245192, + "step": 417 + }, + { + "epoch": 0.1145676305330958, + "grad_norm": 14.9375, + "kl": 11.458067893981934, + "learning_rate": 5e-06, + "logits/chosen": -12164310.588235294, + "logits/rejected": -17830915.42857143, + "logps/chosen": -388.7544806985294, + "logps/rejected": -462.2735072544643, + "loss": 0.0856, + "rewards/chosen": 4.858300601734834, + "rewards/margins": 11.368716905097, + "rewards/rejected": -6.510416303362165, + "step": 418 + }, + { + "epoch": 0.1148417157736056, + "grad_norm": 6.25, + "kl": 3.2085494995117188, + "learning_rate": 5e-06, + "logits/chosen": 2921664.1818181816, + "logits/rejected": -1772660.923076923, + "logps/chosen": -391.63578657670456, + "logps/rejected": -496.2317082331731, + "loss": 0.024, + "rewards/chosen": 4.800546125932173, + "rewards/margins": 12.214680224865466, + "rewards/rejected": -7.414134098933293, + "step": 419 + }, + { + "epoch": 0.1151158010141154, + "grad_norm": 13.875, + "kl": 8.706876754760742, + "learning_rate": 5e-06, + "logits/chosen": 12038871.111111112, + "logits/rejected": 12916477.333333334, + "logps/chosen": -341.86943901909723, + "logps/rejected": -424.3650716145833, + "loss": 0.1673, + "rewards/chosen": 4.215212927924262, + "rewards/margins": 8.325309541490343, + "rewards/rejected": -4.110096613566081, + "step": 420 + }, + { + "epoch": 0.11538988625462519, + "grad_norm": 8.9375, + "kl": 0.7990188598632812, + "learning_rate": 5e-06, + "logits/chosen": -16539252.0, + "logits/rejected": -7891428.0, + "logps/chosen": -493.7884521484375, + "logps/rejected": -472.4351399739583, + "loss": 0.0267, + "rewards/chosen": 4.8810930252075195, + "rewards/margins": 11.223545392354328, + "rewards/rejected": -6.34245236714681, + "step": 421 + }, + { + "epoch": 0.11566397149513499, + "grad_norm": 9.375, + "kl": 2.6363677978515625, + "learning_rate": 5e-06, + "logits/chosen": -14195959.272727273, + "logits/rejected": -8672962.461538462, + "logps/chosen": -445.5892223011364, + "logps/rejected": -438.87680288461536, + "loss": 0.038, + "rewards/chosen": 5.242577639493075, + "rewards/margins": 10.599408529855154, + "rewards/rejected": -5.35683089036208, + "step": 422 + }, + { + "epoch": 0.11593805673564478, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13663899.076923076, + "logits/rejected": -3723773.090909091, + "logps/chosen": -356.7087590144231, + "logps/rejected": -391.03608842329544, + "loss": 0.0435, + "rewards/chosen": 4.336830139160156, + "rewards/margins": 10.388533852317117, + "rewards/rejected": -6.05170371315696, + "step": 423 + }, + { + "epoch": 0.11621214197615458, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4023783.3333333335, + "logits/rejected": 29415626.666666668, + "logps/chosen": -433.2442626953125, + "logps/rejected": -422.5835774739583, + "loss": 0.0531, + "rewards/chosen": 5.0741316477457685, + "rewards/margins": 10.777281443277996, + "rewards/rejected": -5.703149795532227, + "step": 424 + }, + { + "epoch": 0.11648622721666438, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3326568.6666666665, + "logits/rejected": 29719957.333333332, + "logps/chosen": -463.706787109375, + "logps/rejected": -384.6886800130208, + "loss": 0.0408, + "rewards/chosen": 4.230129559834798, + "rewards/margins": 9.778522809346516, + "rewards/rejected": -5.548393249511719, + "step": 425 + }, + { + "epoch": 0.11676031245717418, + "grad_norm": 12.5625, + "kl": 11.489250183105469, + "learning_rate": 5e-06, + "logits/chosen": 3004155.111111111, + "logits/rejected": 3887718.0, + "logps/chosen": -470.66069878472223, + "logps/rejected": -317.6863199869792, + "loss": 0.0941, + "rewards/chosen": 5.331384870741102, + "rewards/margins": 9.732485983106825, + "rewards/rejected": -4.401101112365723, + "step": 426 + }, + { + "epoch": 0.11703439769768398, + "grad_norm": 17.375, + "kl": 6.727948188781738, + "learning_rate": 5e-06, + "logits/chosen": -24456662.85714286, + "logits/rejected": 15297899.2, + "logps/chosen": -519.6393694196429, + "logps/rejected": -438.657177734375, + "loss": 0.0851, + "rewards/chosen": 4.337975365774972, + "rewards/margins": 12.346113640921455, + "rewards/rejected": -8.008138275146484, + "step": 427 + }, + { + "epoch": 0.11730848293819378, + "grad_norm": 6.9375, + "kl": 7.775592803955078, + "learning_rate": 5e-06, + "logits/chosen": -18425237.714285713, + "logits/rejected": 3766884.0, + "logps/chosen": -497.04931640625, + "logps/rejected": -384.038671875, + "loss": 0.0328, + "rewards/chosen": 6.203835623604911, + "rewards/margins": 11.072706168038504, + "rewards/rejected": -4.868870544433594, + "step": 428 + }, + { + "epoch": 0.11758256817870358, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7550387.0, + "logits/rejected": -5048006.0, + "logps/chosen": -437.6584167480469, + "logps/rejected": -530.2982177734375, + "loss": 0.0709, + "rewards/chosen": 4.545597076416016, + "rewards/margins": 10.871126174926758, + "rewards/rejected": -6.325529098510742, + "step": 429 + }, + { + "epoch": 0.11785665341921338, + "grad_norm": 11.6875, + "kl": 0.5750173330307007, + "learning_rate": 5e-06, + "logits/chosen": -233086.15384615384, + "logits/rejected": -2284518.0, + "logps/chosen": -434.5003004807692, + "logps/rejected": -447.03884055397725, + "loss": 0.0471, + "rewards/chosen": 4.457581153282752, + "rewards/margins": 10.256158895425862, + "rewards/rejected": -5.79857774214311, + "step": 430 + }, + { + "epoch": 0.11813073865972318, + "grad_norm": 8.75, + "kl": 1.9956697225570679, + "learning_rate": 5e-06, + "logits/chosen": 11909627.2, + "logits/rejected": -223967.42857142858, + "logps/chosen": -391.791259765625, + "logps/rejected": -432.71700613839283, + "loss": 0.0612, + "rewards/chosen": 5.969361877441406, + "rewards/margins": 11.088246917724609, + "rewards/rejected": -5.118885040283203, + "step": 431 + }, + { + "epoch": 0.11840482390023298, + "grad_norm": 12.4375, + "kl": 11.777565956115723, + "learning_rate": 5e-06, + "logits/chosen": 9536930.133333333, + "logits/rejected": -3777880.0, + "logps/chosen": -378.4199544270833, + "logps/rejected": -461.8505859375, + "loss": 0.1284, + "rewards/chosen": 4.021975199381511, + "rewards/margins": 8.819593641493057, + "rewards/rejected": -4.7976184421115455, + "step": 432 + }, + { + "epoch": 0.11867890914074278, + "grad_norm": 14.25, + "kl": 7.559208393096924, + "learning_rate": 5e-06, + "logits/chosen": -3685064.0, + "logits/rejected": 7974252.0, + "logps/chosen": -453.21072823660717, + "logps/rejected": -452.85166015625, + "loss": 0.1142, + "rewards/chosen": 4.575423104422433, + "rewards/margins": 9.679639489310128, + "rewards/rejected": -5.104216384887695, + "step": 433 + }, + { + "epoch": 0.11895299438125256, + "grad_norm": 10.625, + "kl": 3.732858419418335, + "learning_rate": 5e-06, + "logits/chosen": -10687925.6, + "logits/rejected": -12137588.57142857, + "logps/chosen": -595.43896484375, + "logps/rejected": -470.03578404017856, + "loss": 0.0667, + "rewards/chosen": 5.797731018066406, + "rewards/margins": 11.311407252720425, + "rewards/rejected": -5.513676234654018, + "step": 434 + }, + { + "epoch": 0.11922707962176236, + "grad_norm": 10.3125, + "kl": 6.808411598205566, + "learning_rate": 5e-06, + "logits/chosen": -14575180.57142857, + "logits/rejected": -14702224.0, + "logps/chosen": -468.2163783482143, + "logps/rejected": -401.5806396484375, + "loss": 0.0524, + "rewards/chosen": 4.191730771745954, + "rewards/margins": 9.41584881373814, + "rewards/rejected": -5.224118041992187, + "step": 435 + }, + { + "epoch": 0.11950116486227216, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5027998.933333334, + "logits/rejected": -1982009.7777777778, + "logps/chosen": -296.90032552083335, + "logps/rejected": -681.9200846354166, + "loss": 0.0715, + "rewards/chosen": 3.5995712280273438, + "rewards/margins": 11.70694308810764, + "rewards/rejected": -8.107371860080296, + "step": 436 + }, + { + "epoch": 0.11977525010278196, + "grad_norm": 6.21875, + "kl": 1.868575096130371, + "learning_rate": 5e-06, + "logits/chosen": -22727918.222222224, + "logits/rejected": -11467808.0, + "logps/chosen": -513.1781141493055, + "logps/rejected": -379.51793619791664, + "loss": 0.0404, + "rewards/chosen": 5.166791280110677, + "rewards/margins": 10.032808430989583, + "rewards/rejected": -4.866017150878906, + "step": 437 + }, + { + "epoch": 0.12004933534329176, + "grad_norm": 13.125, + "kl": 1.3064229488372803, + "learning_rate": 5e-06, + "logits/chosen": -1528856.6666666667, + "logits/rejected": 5685832.666666667, + "logps/chosen": -410.4261881510417, + "logps/rejected": -364.9016927083333, + "loss": 0.1029, + "rewards/chosen": 3.6042772928873696, + "rewards/margins": 8.596930185953775, + "rewards/rejected": -4.992652893066406, + "step": 438 + }, + { + "epoch": 0.12032342058380156, + "grad_norm": 13.8125, + "kl": 1.6372134685516357, + "learning_rate": 5e-06, + "logits/chosen": 22419186.0, + "logits/rejected": 23937180.0, + "logps/chosen": -477.5968322753906, + "logps/rejected": -492.0136413574219, + "loss": 0.0803, + "rewards/chosen": 4.7312541007995605, + "rewards/margins": 11.434528827667236, + "rewards/rejected": -6.703274726867676, + "step": 439 + }, + { + "epoch": 0.12059750582431136, + "grad_norm": 7.09375, + "kl": 5.7635297775268555, + "learning_rate": 5e-06, + "logits/chosen": -7858690.0, + "logits/rejected": -3217337.6666666665, + "logps/chosen": -420.45361328125, + "logps/rejected": -355.1445719401042, + "loss": 0.0712, + "rewards/chosen": 5.364744186401367, + "rewards/margins": 11.879725138346355, + "rewards/rejected": -6.514980951944987, + "step": 440 + }, + { + "epoch": 0.12087159106482116, + "grad_norm": 3.390625, + "kl": 5.21973180770874, + "learning_rate": 5e-06, + "logits/chosen": -11270036.923076924, + "logits/rejected": 21840215.272727273, + "logps/chosen": -441.7110126201923, + "logps/rejected": -573.6541637073864, + "loss": 0.0175, + "rewards/chosen": 5.8538947472205525, + "rewards/margins": 14.265495727112242, + "rewards/rejected": -8.41160097989169, + "step": 441 + }, + { + "epoch": 0.12114567630533096, + "grad_norm": 7.8125, + "kl": 1.7652950286865234, + "learning_rate": 5e-06, + "logits/chosen": -14295597.090909092, + "logits/rejected": -8470441.846153846, + "logps/chosen": -396.2638494318182, + "logps/rejected": -565.4736328125, + "loss": 0.0382, + "rewards/chosen": 5.062072060324929, + "rewards/margins": 11.54180838844993, + "rewards/rejected": -6.479736328125, + "step": 442 + }, + { + "epoch": 0.12141976154584076, + "grad_norm": 15.4375, + "kl": 3.535595655441284, + "learning_rate": 5e-06, + "logits/chosen": -5138544.7272727275, + "logits/rejected": 2369894.769230769, + "logps/chosen": -357.2017711292614, + "logps/rejected": -447.4820087139423, + "loss": 0.069, + "rewards/chosen": 4.752412275834517, + "rewards/margins": 9.821964183887403, + "rewards/rejected": -5.069551908052885, + "step": 443 + }, + { + "epoch": 0.12169384678635055, + "grad_norm": 7.65625, + "kl": 5.171342849731445, + "learning_rate": 5e-06, + "logits/chosen": 3158433.1428571427, + "logits/rejected": 91625.2, + "logps/chosen": -525.6520647321429, + "logps/rejected": -410.87998046875, + "loss": 0.0633, + "rewards/chosen": 4.711139678955078, + "rewards/margins": 11.090646362304687, + "rewards/rejected": -6.379506683349609, + "step": 444 + }, + { + "epoch": 0.12196793202686035, + "grad_norm": 13.875, + "kl": 7.881667613983154, + "learning_rate": 5e-06, + "logits/chosen": -16214974.857142856, + "logits/rejected": -15484476.8, + "logps/chosen": -424.52933175223217, + "logps/rejected": -486.9, + "loss": 0.0533, + "rewards/chosen": 4.297619683401925, + "rewards/margins": 10.739328057425363, + "rewards/rejected": -6.441708374023437, + "step": 445 + }, + { + "epoch": 0.12224201726737015, + "grad_norm": 64.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 12903280.0, + "logits/rejected": -11975392.0, + "logps/chosen": -497.42569247159093, + "logps/rejected": -374.8212139423077, + "loss": 0.0921, + "rewards/chosen": 4.8758392333984375, + "rewards/margins": 10.177911611703726, + "rewards/rejected": -5.302072378305288, + "step": 446 + }, + { + "epoch": 0.12251610250787995, + "grad_norm": 16.5, + "kl": 7.73777961730957, + "learning_rate": 5e-06, + "logits/chosen": 6214854.4, + "logits/rejected": 22107936.0, + "logps/chosen": -625.9276692708333, + "logps/rejected": -743.7556966145834, + "loss": 0.048, + "rewards/chosen": 6.159839884440104, + "rewards/margins": 16.415760294596353, + "rewards/rejected": -10.25592041015625, + "step": 447 + }, + { + "epoch": 0.12279018774838975, + "grad_norm": 11.625, + "kl": 7.255550384521484, + "learning_rate": 5e-06, + "logits/chosen": 18127763.692307692, + "logits/rejected": 2452176.909090909, + "logps/chosen": -442.31640625, + "logps/rejected": -522.4675071022727, + "loss": 0.0855, + "rewards/chosen": 4.932028550368089, + "rewards/margins": 10.998615131511555, + "rewards/rejected": -6.066586581143466, + "step": 448 + }, + { + "epoch": 0.12306427298889955, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13392304.888888888, + "logits/rejected": -4581746.666666667, + "logps/chosen": -379.14238823784723, + "logps/rejected": -399.48248697916665, + "loss": 0.0787, + "rewards/chosen": 4.210876888699001, + "rewards/margins": 9.513733842637803, + "rewards/rejected": -5.302856953938802, + "step": 449 + }, + { + "epoch": 0.12333835822940935, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10442922.666666666, + "logits/rejected": 5991036.8, + "logps/chosen": -539.3796115451389, + "logps/rejected": -695.9688802083333, + "loss": 0.0123, + "rewards/chosen": 5.890575408935547, + "rewards/margins": 14.102466583251953, + "rewards/rejected": -8.211891174316406, + "step": 450 + }, + { + "epoch": 0.12361244346991915, + "grad_norm": 9.125, + "kl": 3.3635027408599854, + "learning_rate": 5e-06, + "logits/chosen": -11976048.888888888, + "logits/rejected": 9342389.333333334, + "logps/chosen": -406.0400390625, + "logps/rejected": -367.65423177083335, + "loss": 0.091, + "rewards/chosen": 4.3674875895182295, + "rewards/margins": 8.354638163248698, + "rewards/rejected": -3.9871505737304687, + "step": 451 + }, + { + "epoch": 0.12388652871042895, + "grad_norm": 10.4375, + "kl": 0.6373850703239441, + "learning_rate": 5e-06, + "logits/chosen": -12601808.0, + "logits/rejected": 1428642.0, + "logps/chosen": -440.121142578125, + "logps/rejected": -348.94022042410717, + "loss": 0.0659, + "rewards/chosen": 5.3453319549560545, + "rewards/margins": 10.180178451538087, + "rewards/rejected": -4.834846496582031, + "step": 452 + }, + { + "epoch": 0.12416061395093875, + "grad_norm": 12.0625, + "kl": 4.79276180267334, + "learning_rate": 5e-06, + "logits/chosen": -2940508.0, + "logits/rejected": -7162262.4, + "logps/chosen": -396.97262137276783, + "logps/rejected": -626.80634765625, + "loss": 0.083, + "rewards/chosen": 5.480047498430524, + "rewards/margins": 11.15739162990025, + "rewards/rejected": -5.677344131469726, + "step": 453 + }, + { + "epoch": 0.12443469919144855, + "grad_norm": 8.9375, + "kl": 5.5915374755859375, + "learning_rate": 5e-06, + "logits/chosen": -6060873.846153846, + "logits/rejected": 1029700.0, + "logps/chosen": -400.9353215144231, + "logps/rejected": -529.9533913352273, + "loss": 0.0375, + "rewards/chosen": 5.423340430626502, + "rewards/margins": 12.378292323826077, + "rewards/rejected": -6.954951893199574, + "step": 454 + }, + { + "epoch": 0.12470878443195833, + "grad_norm": 10.1875, + "kl": 2.5470480918884277, + "learning_rate": 5e-06, + "logits/chosen": -21044281.6, + "logits/rejected": 5689600.0, + "logps/chosen": -381.8020833333333, + "logps/rejected": -298.42328559027777, + "loss": 0.0919, + "rewards/chosen": 3.9520118713378904, + "rewards/margins": 7.202756585015191, + "rewards/rejected": -3.2507447136773004, + "step": 455 + }, + { + "epoch": 0.12498286967246813, + "grad_norm": 10.25, + "kl": 4.531886100769043, + "learning_rate": 5e-06, + "logits/chosen": 10876402.909090908, + "logits/rejected": -687826.9230769231, + "logps/chosen": -488.38454367897725, + "logps/rejected": -360.48978365384613, + "loss": 0.0368, + "rewards/chosen": 4.953082691539418, + "rewards/margins": 10.720058681247952, + "rewards/rejected": -5.766975989708533, + "step": 456 + }, + { + "epoch": 0.12525695491297795, + "grad_norm": 10.625, + "kl": 0.6735445857048035, + "learning_rate": 5e-06, + "logits/chosen": -1898091.6363636365, + "logits/rejected": 8320043.076923077, + "logps/chosen": -342.6056019176136, + "logps/rejected": -436.19576322115387, + "loss": 0.0798, + "rewards/chosen": 4.387000344016335, + "rewards/margins": 9.348640335189712, + "rewards/rejected": -4.961639991173377, + "step": 457 + }, + { + "epoch": 0.12553104015348773, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10754361.333333334, + "logits/rejected": -14138842.666666666, + "logps/chosen": -401.8219401041667, + "logps/rejected": -519.2270100911459, + "loss": 0.0355, + "rewards/chosen": 4.588844935099284, + "rewards/margins": 13.193602244059246, + "rewards/rejected": -8.604757308959961, + "step": 458 + }, + { + "epoch": 0.12580512539399755, + "grad_norm": 7.59375, + "kl": 0.47039923071861267, + "learning_rate": 5e-06, + "logits/chosen": -12107523.0, + "logits/rejected": -14077453.0, + "logps/chosen": -409.70672607421875, + "logps/rejected": -369.8258056640625, + "loss": 0.0556, + "rewards/chosen": 4.790868759155273, + "rewards/margins": 9.652441024780273, + "rewards/rejected": -4.861572265625, + "step": 459 + }, + { + "epoch": 0.12607921063450733, + "grad_norm": 11.4375, + "kl": 4.065341472625732, + "learning_rate": 5e-06, + "logits/chosen": 8706708.57142857, + "logits/rejected": 19295160.0, + "logps/chosen": -458.15248325892856, + "logps/rejected": -517.084765625, + "loss": 0.0669, + "rewards/chosen": 5.410501752580915, + "rewards/margins": 14.067499433244977, + "rewards/rejected": -8.656997680664062, + "step": 460 + }, + { + "epoch": 0.12635329587501712, + "grad_norm": 11.3125, + "kl": 13.221921920776367, + "learning_rate": 5e-06, + "logits/chosen": 1971047.4666666666, + "logits/rejected": -7713275.555555556, + "logps/chosen": -405.24931640625, + "logps/rejected": -386.20024956597223, + "loss": 0.1258, + "rewards/chosen": 5.4412180582682295, + "rewards/margins": 10.04423607720269, + "rewards/rejected": -4.603018018934462, + "step": 461 + }, + { + "epoch": 0.12662738111552693, + "grad_norm": 10.1875, + "kl": 1.1256942749023438, + "learning_rate": 5e-06, + "logits/chosen": -34256608.0, + "logits/rejected": -14039293.0, + "logps/chosen": -417.0491027832031, + "logps/rejected": -493.0030517578125, + "loss": 0.0564, + "rewards/chosen": 4.840474605560303, + "rewards/margins": 11.41145372390747, + "rewards/rejected": -6.570979118347168, + "step": 462 + }, + { + "epoch": 0.12690146635603672, + "grad_norm": 12.5625, + "kl": 7.961170673370361, + "learning_rate": 5e-06, + "logits/chosen": -10097382.4, + "logits/rejected": -15102848.0, + "logps/chosen": -414.6386393229167, + "logps/rejected": -535.7789713541666, + "loss": 0.1701, + "rewards/chosen": 4.0880381266276045, + "rewards/margins": 10.137106153700088, + "rewards/rejected": -6.049068027072483, + "step": 463 + }, + { + "epoch": 0.12717555159654653, + "grad_norm": 6.6875, + "kl": 6.578490257263184, + "learning_rate": 5e-06, + "logits/chosen": -15815629.333333334, + "logits/rejected": 26144360.0, + "logps/chosen": -531.19140625, + "logps/rejected": -502.2237955729167, + "loss": 0.042, + "rewards/chosen": 5.8003692626953125, + "rewards/margins": 11.146399180094402, + "rewards/rejected": -5.346029917399089, + "step": 464 + }, + { + "epoch": 0.12744963683705632, + "grad_norm": 8.4375, + "kl": 0.7406253814697266, + "learning_rate": 5e-06, + "logits/chosen": 3217378.6666666665, + "logits/rejected": -286025.3333333333, + "logps/chosen": -431.3926595052083, + "logps/rejected": -376.3029378255208, + "loss": 0.0869, + "rewards/chosen": 4.371631622314453, + "rewards/margins": 9.80678876241048, + "rewards/rejected": -5.435157140096028, + "step": 465 + }, + { + "epoch": 0.12772372207756613, + "grad_norm": 8.5625, + "kl": 1.3172938823699951, + "learning_rate": 5e-06, + "logits/chosen": -1668962.1666666667, + "logits/rejected": 6739132.666666667, + "logps/chosen": -365.090576171875, + "logps/rejected": -469.5023600260417, + "loss": 0.0646, + "rewards/chosen": 3.446626663208008, + "rewards/margins": 8.896043141682942, + "rewards/rejected": -5.449416478474935, + "step": 466 + }, + { + "epoch": 0.12799780731807592, + "grad_norm": 5.75, + "kl": 0.1311105191707611, + "learning_rate": 5e-06, + "logits/chosen": -4915605.2, + "logits/rejected": 382791.14285714284, + "logps/chosen": -525.1234375, + "logps/rejected": -441.40708705357144, + "loss": 0.021, + "rewards/chosen": 6.357054138183594, + "rewards/margins": 11.639038739885603, + "rewards/rejected": -5.281984601702009, + "step": 467 + }, + { + "epoch": 0.12827189255858573, + "grad_norm": 9.9375, + "kl": 4.100715637207031, + "learning_rate": 5e-06, + "logits/chosen": -18164891.2, + "logits/rejected": 1369805.4285714286, + "logps/chosen": -442.25068359375, + "logps/rejected": -457.75254603794644, + "loss": 0.0397, + "rewards/chosen": 4.2740215301513675, + "rewards/margins": 11.37408539908273, + "rewards/rejected": -7.100063868931362, + "step": 468 + }, + { + "epoch": 0.12854597779909552, + "grad_norm": 14.5, + "kl": 6.389166831970215, + "learning_rate": 5e-06, + "logits/chosen": -5293637.333333333, + "logits/rejected": 733730.5555555555, + "logps/chosen": -369.07939453125, + "logps/rejected": -600.1232096354166, + "loss": 0.1099, + "rewards/chosen": 4.435970052083333, + "rewards/margins": 11.972415669759116, + "rewards/rejected": -7.536445617675781, + "step": 469 + }, + { + "epoch": 0.1288200630396053, + "grad_norm": 5.1875, + "kl": 1.9866111278533936, + "learning_rate": 5e-06, + "logits/chosen": -23277173.333333332, + "logits/rejected": 10052617.333333334, + "logps/chosen": -460.9782307942708, + "logps/rejected": -508.758544921875, + "loss": 0.0217, + "rewards/chosen": 5.395913441975911, + "rewards/margins": 10.774573644002277, + "rewards/rejected": -5.378660202026367, + "step": 470 + }, + { + "epoch": 0.12909414828011512, + "grad_norm": 7.4375, + "kl": 8.938782691955566, + "learning_rate": 5e-06, + "logits/chosen": -7565367.466666667, + "logits/rejected": 393982.22222222225, + "logps/chosen": -427.22350260416664, + "logps/rejected": -504.4758029513889, + "loss": 0.0985, + "rewards/chosen": 5.106573994954427, + "rewards/margins": 11.090380181206598, + "rewards/rejected": -5.9838061862521705, + "step": 471 + }, + { + "epoch": 0.1293682335206249, + "grad_norm": 9.25, + "kl": 12.492782592773438, + "learning_rate": 5e-06, + "logits/chosen": 223685.23076923078, + "logits/rejected": -8374301.090909091, + "logps/chosen": -391.36170372596155, + "logps/rejected": -463.23251065340907, + "loss": 0.0386, + "rewards/chosen": 5.499791071965144, + "rewards/margins": 10.569936418866778, + "rewards/rejected": -5.070145346901634, + "step": 472 + }, + { + "epoch": 0.12964231876113472, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20186501.333333332, + "logits/rejected": 41963460.266666666, + "logps/chosen": -465.732421875, + "logps/rejected": -423.13678385416665, + "loss": 0.0454, + "rewards/chosen": 5.138023376464844, + "rewards/margins": 11.074158223470052, + "rewards/rejected": -5.936134847005208, + "step": 473 + }, + { + "epoch": 0.1299164040016445, + "grad_norm": 4.875, + "kl": 6.0998005867004395, + "learning_rate": 5e-06, + "logits/chosen": -2289758.769230769, + "logits/rejected": -16319968.0, + "logps/chosen": -615.0534104567307, + "logps/rejected": -539.7853338068181, + "loss": 0.0274, + "rewards/chosen": 5.290985107421875, + "rewards/margins": 10.992495450106535, + "rewards/rejected": -5.701510342684659, + "step": 474 + }, + { + "epoch": 0.13019048924215432, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7618407.2727272725, + "logits/rejected": -10060240.0, + "logps/chosen": -384.5530894886364, + "logps/rejected": -638.1752554086538, + "loss": 0.0519, + "rewards/chosen": 5.451144131747159, + "rewards/margins": 11.677428452285021, + "rewards/rejected": -6.226284320537861, + "step": 475 + }, + { + "epoch": 0.1304645744826641, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4455447.076923077, + "logits/rejected": -5531964.363636363, + "logps/chosen": -400.7497370793269, + "logps/rejected": -576.9176136363636, + "loss": 0.0485, + "rewards/chosen": 4.37075923039363, + "rewards/margins": 10.726849429257268, + "rewards/rejected": -6.356090198863637, + "step": 476 + }, + { + "epoch": 0.13073865972317392, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18831891.555555556, + "logits/rejected": 58664712.53333333, + "logps/chosen": -359.9853244357639, + "logps/rejected": -645.40859375, + "loss": 0.0275, + "rewards/chosen": 4.00537829928928, + "rewards/margins": 17.86859342787001, + "rewards/rejected": -13.863215128580729, + "step": 477 + }, + { + "epoch": 0.1310127449636837, + "grad_norm": 11.375, + "kl": 0.9932206869125366, + "learning_rate": 5e-06, + "logits/chosen": 7383578.4, + "logits/rejected": 17136635.42857143, + "logps/chosen": -472.840869140625, + "logps/rejected": -581.6494140625, + "loss": 0.0558, + "rewards/chosen": 4.694891738891601, + "rewards/margins": 11.847480719430106, + "rewards/rejected": -7.152588980538504, + "step": 478 + }, + { + "epoch": 0.13128683020419352, + "grad_norm": 10.125, + "kl": 2.859952926635742, + "learning_rate": 5e-06, + "logits/chosen": -4974104.615384615, + "logits/rejected": 19360704.0, + "logps/chosen": -443.69106820913464, + "logps/rejected": -543.1739169034091, + "loss": 0.0503, + "rewards/chosen": 4.681705181415264, + "rewards/margins": 9.720261367050917, + "rewards/rejected": -5.038556185635653, + "step": 479 + }, + { + "epoch": 0.1315609154447033, + "grad_norm": 8.5625, + "kl": 6.104207515716553, + "learning_rate": 5e-06, + "logits/chosen": -3431297.846153846, + "logits/rejected": -2234193.090909091, + "logps/chosen": -582.3958834134615, + "logps/rejected": -612.8113458806819, + "loss": 0.0282, + "rewards/chosen": 6.446668771597055, + "rewards/margins": 12.748430745584981, + "rewards/rejected": -6.301761973987926, + "step": 480 + }, + { + "epoch": 0.1318350006852131, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1215490.8, + "logits/rejected": 2287215.4285714286, + "logps/chosen": -452.913525390625, + "logps/rejected": -526.5659877232143, + "loss": 0.0664, + "rewards/chosen": 5.2718345642089846, + "rewards/margins": 10.041837201799666, + "rewards/rejected": -4.7700026375906805, + "step": 481 + }, + { + "epoch": 0.1321090859257229, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15478252.444444444, + "logits/rejected": -21899891.2, + "logps/chosen": -373.80186631944446, + "logps/rejected": -599.8356770833333, + "loss": 0.0511, + "rewards/chosen": 3.8004031711154513, + "rewards/margins": 10.850962151421442, + "rewards/rejected": -7.05055898030599, + "step": 482 + }, + { + "epoch": 0.1323831711662327, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 24149251.2, + "logits/rejected": -16857041.14285714, + "logps/chosen": -475.391015625, + "logps/rejected": -528.0207170758929, + "loss": 0.018, + "rewards/chosen": 6.760969543457032, + "rewards/margins": 14.080160522460938, + "rewards/rejected": -7.319190979003906, + "step": 483 + }, + { + "epoch": 0.1326572564067425, + "grad_norm": 8.5, + "kl": 5.930367946624756, + "learning_rate": 5e-06, + "logits/chosen": 9483406.666666666, + "logits/rejected": -663090.8333333334, + "logps/chosen": -359.7976888020833, + "logps/rejected": -390.3515625, + "loss": 0.069, + "rewards/chosen": 4.736769040425618, + "rewards/margins": 11.31773026784261, + "rewards/rejected": -6.580961227416992, + "step": 484 + }, + { + "epoch": 0.1329313416472523, + "grad_norm": 15.125, + "kl": 4.285697937011719, + "learning_rate": 5e-06, + "logits/chosen": -13197861.714285715, + "logits/rejected": 29382864.0, + "logps/chosen": -476.50474330357144, + "logps/rejected": -551.522314453125, + "loss": 0.0954, + "rewards/chosen": 4.913634163992746, + "rewards/margins": 10.423383004324776, + "rewards/rejected": -5.509748840332032, + "step": 485 + }, + { + "epoch": 0.1332054268877621, + "grad_norm": 8.875, + "kl": 1.3951575756072998, + "learning_rate": 5e-06, + "logits/chosen": 46836416.0, + "logits/rejected": -14920528.0, + "logps/chosen": -431.3029119318182, + "logps/rejected": -398.5420673076923, + "loss": 0.0464, + "rewards/chosen": 4.294571616432884, + "rewards/margins": 11.135222908500191, + "rewards/rejected": -6.8406512920673075, + "step": 486 + }, + { + "epoch": 0.1334795121282719, + "grad_norm": 11.4375, + "kl": 5.975696563720703, + "learning_rate": 5e-06, + "logits/chosen": -9240651.692307692, + "logits/rejected": -7182513.454545454, + "logps/chosen": -406.12642728365387, + "logps/rejected": -567.9386541193181, + "loss": 0.0452, + "rewards/chosen": 5.058206411508413, + "rewards/margins": 10.76846350823249, + "rewards/rejected": -5.710257096724077, + "step": 487 + }, + { + "epoch": 0.1337535973687817, + "grad_norm": 8.6875, + "kl": 4.092708110809326, + "learning_rate": 5e-06, + "logits/chosen": -9927616.0, + "logits/rejected": 12996797.866666667, + "logps/chosen": -350.3146158854167, + "logps/rejected": -485.196484375, + "loss": 0.0701, + "rewards/chosen": 4.901311662462023, + "rewards/margins": 12.246675533718534, + "rewards/rejected": -7.34536387125651, + "step": 488 + }, + { + "epoch": 0.1340276826092915, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6894488.0, + "logits/rejected": 79246788.92307693, + "logps/chosen": -538.5767489346591, + "logps/rejected": -462.7756159855769, + "loss": 0.0924, + "rewards/chosen": 5.1970759305087, + "rewards/margins": 12.182582428405336, + "rewards/rejected": -6.985506497896635, + "step": 489 + }, + { + "epoch": 0.1343017678498013, + "grad_norm": 11.5625, + "kl": 6.217202186584473, + "learning_rate": 5e-06, + "logits/chosen": -7668829.333333333, + "logits/rejected": 17056071.111111112, + "logps/chosen": -499.2506510416667, + "logps/rejected": -357.55864800347223, + "loss": 0.0622, + "rewards/chosen": 4.66490478515625, + "rewards/margins": 9.117198181152343, + "rewards/rejected": -4.452293395996094, + "step": 490 + }, + { + "epoch": 0.1345758530903111, + "grad_norm": 6.0, + "kl": 0.38801002502441406, + "learning_rate": 5e-06, + "logits/chosen": -4106256.6666666665, + "logits/rejected": 176450922.66666666, + "logps/chosen": -466.158935546875, + "logps/rejected": -434.757568359375, + "loss": 0.0183, + "rewards/chosen": 5.683380126953125, + "rewards/margins": 12.672976811726887, + "rewards/rejected": -6.989596684773763, + "step": 491 + }, + { + "epoch": 0.13484993833082087, + "grad_norm": 8.9375, + "kl": 1.5582587718963623, + "learning_rate": 5e-06, + "logits/chosen": -11657159.272727273, + "logits/rejected": -12495309.538461538, + "logps/chosen": -453.2750355113636, + "logps/rejected": -508.4688251201923, + "loss": 0.0547, + "rewards/chosen": 3.751702742143111, + "rewards/margins": 9.556195399144313, + "rewards/rejected": -5.804492657001202, + "step": 492 + }, + { + "epoch": 0.1351240235713307, + "grad_norm": 7.03125, + "kl": 4.551628112792969, + "learning_rate": 5e-06, + "logits/chosen": -13225198.222222222, + "logits/rejected": 25445015.466666665, + "logps/chosen": -423.81651475694446, + "logps/rejected": -568.7185546875, + "loss": 0.0364, + "rewards/chosen": 4.700219472249349, + "rewards/margins": 11.236623382568359, + "rewards/rejected": -6.53640391031901, + "step": 493 + }, + { + "epoch": 0.13539810881184047, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14337210.181818182, + "logits/rejected": 9033452.307692308, + "logps/chosen": -435.11083984375, + "logps/rejected": -442.16856971153845, + "loss": 0.0527, + "rewards/chosen": 4.264907143332741, + "rewards/margins": 9.80527309604458, + "rewards/rejected": -5.540365952711839, + "step": 494 + }, + { + "epoch": 0.1356721940523503, + "grad_norm": 13.3125, + "kl": 3.156829833984375, + "learning_rate": 5e-06, + "logits/chosen": 3337881.5384615385, + "logits/rejected": -10116785.454545455, + "logps/chosen": -462.41654146634613, + "logps/rejected": -371.4866832386364, + "loss": 0.1006, + "rewards/chosen": 5.038512009840745, + "rewards/margins": 9.333317709969474, + "rewards/rejected": -4.2948057001287285, + "step": 495 + }, + { + "epoch": 0.13594627929286007, + "grad_norm": 11.6875, + "kl": 9.76203441619873, + "learning_rate": 5e-06, + "logits/chosen": -159690.96666666667, + "logits/rejected": -1545647.5555555555, + "logps/chosen": -476.39967447916666, + "logps/rejected": -463.54112413194446, + "loss": 0.1324, + "rewards/chosen": 4.569273376464844, + "rewards/margins": 8.653359900580512, + "rewards/rejected": -4.084086524115668, + "step": 496 + }, + { + "epoch": 0.1362203645333699, + "grad_norm": 15.625, + "kl": 7.030264854431152, + "learning_rate": 5e-06, + "logits/chosen": 10732167.272727273, + "logits/rejected": 6265891.076923077, + "logps/chosen": -383.61470170454544, + "logps/rejected": -541.9927133413462, + "loss": 0.1498, + "rewards/chosen": 4.108525362881747, + "rewards/margins": 11.088208285245027, + "rewards/rejected": -6.979682922363281, + "step": 497 + }, + { + "epoch": 0.13649444977387967, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4460029.333333333, + "logits/rejected": 12726685.333333334, + "logps/chosen": -419.0828450520833, + "logps/rejected": -560.8222249348959, + "loss": 0.0542, + "rewards/chosen": 4.744330724080403, + "rewards/margins": 10.038303057352701, + "rewards/rejected": -5.293972333272298, + "step": 498 + }, + { + "epoch": 0.1367685350143895, + "grad_norm": 10.375, + "kl": 19.49878692626953, + "learning_rate": 5e-06, + "logits/chosen": -32373589.333333332, + "logits/rejected": 9015348.666666666, + "logps/chosen": -586.0295817057291, + "logps/rejected": -454.6122639973958, + "loss": 0.1071, + "rewards/chosen": 6.96650505065918, + "rewards/margins": 12.71131706237793, + "rewards/rejected": -5.74481201171875, + "step": 499 + }, + { + "epoch": 0.13704262025489927, + "grad_norm": 8.875, + "kl": 4.691189765930176, + "learning_rate": 5e-06, + "logits/chosen": -18497729.777777776, + "logits/rejected": -1015334.6666666666, + "logps/chosen": -579.0014105902778, + "logps/rejected": -339.2461263020833, + "loss": 0.0369, + "rewards/chosen": 5.098388671875, + "rewards/margins": 10.15869852701823, + "rewards/rejected": -5.060309855143229, + "step": 500 + }, + { + "epoch": 0.13731670549540909, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14993276.0, + "logits/rejected": 9570859.0, + "logps/chosen": -497.8064270019531, + "logps/rejected": -534.9885864257812, + "loss": 0.0376, + "rewards/chosen": 4.507524013519287, + "rewards/margins": 10.037208557128906, + "rewards/rejected": -5.529684543609619, + "step": 501 + }, + { + "epoch": 0.13759079073591887, + "grad_norm": 13.1875, + "kl": 0.035073600709438324, + "learning_rate": 5e-06, + "logits/chosen": -12058590.545454545, + "logits/rejected": 20815643.076923076, + "logps/chosen": -369.27756569602275, + "logps/rejected": -635.1790865384615, + "loss": 0.063, + "rewards/chosen": 4.785824862393466, + "rewards/margins": 10.578338196227602, + "rewards/rejected": -5.792513333834135, + "step": 502 + }, + { + "epoch": 0.13786487597642866, + "grad_norm": 13.5625, + "kl": 12.73906421661377, + "learning_rate": 5e-06, + "logits/chosen": -5984139.692307692, + "logits/rejected": 25921070.545454547, + "logps/chosen": -481.4001652644231, + "logps/rejected": -411.71133700284093, + "loss": 0.1087, + "rewards/chosen": 5.5100578894981975, + "rewards/margins": 10.571089924632254, + "rewards/rejected": -5.061032035134056, + "step": 503 + }, + { + "epoch": 0.13813896121693847, + "grad_norm": 8.6875, + "kl": 1.8372840881347656, + "learning_rate": 5e-06, + "logits/chosen": -8043960.470588235, + "logits/rejected": -11024427.42857143, + "logps/chosen": -445.95582490808823, + "logps/rejected": -531.8978097098214, + "loss": 0.0329, + "rewards/chosen": 4.800678926355698, + "rewards/margins": 12.881813177541524, + "rewards/rejected": -8.081134251185826, + "step": 504 + }, + { + "epoch": 0.13841304645744826, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4572273.714285715, + "logits/rejected": -2528263.0588235296, + "logps/chosen": -375.6006556919643, + "logps/rejected": -524.9367532169117, + "loss": 0.0509, + "rewards/chosen": 3.92398316519601, + "rewards/margins": 9.666525207647757, + "rewards/rejected": -5.742542042451746, + "step": 505 + }, + { + "epoch": 0.13868713169795807, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 12713812.0, + "logits/rejected": 10194046.666666666, + "logps/chosen": -376.2766927083333, + "logps/rejected": -689.296630859375, + "loss": 0.0712, + "rewards/chosen": 4.247028350830078, + "rewards/margins": 13.635808944702148, + "rewards/rejected": -9.38878059387207, + "step": 506 + }, + { + "epoch": 0.13896121693846786, + "grad_norm": 10.25, + "kl": 9.869163513183594, + "learning_rate": 5e-06, + "logits/chosen": -3032360.5714285714, + "logits/rejected": -10302848.0, + "logps/chosen": -461.15586635044644, + "logps/rejected": -393.7937255859375, + "loss": 0.0877, + "rewards/chosen": 5.231511797223773, + "rewards/margins": 11.147066606794084, + "rewards/rejected": -5.915554809570312, + "step": 507 + }, + { + "epoch": 0.13923530217897767, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28517088.0, + "logits/rejected": 5048304.0, + "logps/chosen": -431.69912109375, + "logps/rejected": -373.35567801339283, + "loss": 0.0583, + "rewards/chosen": 5.117526245117188, + "rewards/margins": 10.944035993303572, + "rewards/rejected": -5.826509748186384, + "step": 508 + }, + { + "epoch": 0.13950938741948746, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15208199.384615384, + "logits/rejected": 11198805.818181818, + "logps/chosen": -328.70736929086536, + "logps/rejected": -632.3313210227273, + "loss": 0.0796, + "rewards/chosen": 2.957134246826172, + "rewards/margins": 9.151478507302024, + "rewards/rejected": -6.1943442604758525, + "step": 509 + }, + { + "epoch": 0.13978347265999727, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5627269.454545454, + "logits/rejected": -2761043.076923077, + "logps/chosen": -446.892578125, + "logps/rejected": -432.2028245192308, + "loss": 0.0432, + "rewards/chosen": 5.199734774502841, + "rewards/margins": 11.27003436321979, + "rewards/rejected": -6.0702995887169475, + "step": 510 + }, + { + "epoch": 0.14005755790050706, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3827399.0, + "logits/rejected": -2583587.6666666665, + "logps/chosen": -448.89599609375, + "logps/rejected": -323.4304606119792, + "loss": 0.0571, + "rewards/chosen": 5.343514124552409, + "rewards/margins": 9.734806060791016, + "rewards/rejected": -4.3912919362386065, + "step": 511 + }, + { + "epoch": 0.14033164314101684, + "grad_norm": 6.46875, + "kl": 0.9369189143180847, + "learning_rate": 5e-06, + "logits/chosen": -12849473.6, + "logits/rejected": -2078933.142857143, + "logps/chosen": -490.030908203125, + "logps/rejected": -572.3549107142857, + "loss": 0.0661, + "rewards/chosen": 4.191656112670898, + "rewards/margins": 12.488398143223353, + "rewards/rejected": -8.296742030552455, + "step": 512 + }, + { + "epoch": 0.14060572838152666, + "grad_norm": 6.0, + "kl": 1.24981689453125, + "learning_rate": 5e-06, + "logits/chosen": 9376638.4, + "logits/rejected": -2642023.714285714, + "logps/chosen": -557.68798828125, + "logps/rejected": -404.22059849330356, + "loss": 0.0307, + "rewards/chosen": 4.943109130859375, + "rewards/margins": 11.137554604666573, + "rewards/rejected": -6.194445473807199, + "step": 513 + }, + { + "epoch": 0.14087981362203644, + "grad_norm": 10.9375, + "kl": 4.446951866149902, + "learning_rate": 5e-06, + "logits/chosen": 1985182.2222222222, + "logits/rejected": -79848.53333333334, + "logps/chosen": -567.3676215277778, + "logps/rejected": -534.7527994791667, + "loss": 0.0335, + "rewards/chosen": 7.1575876871744795, + "rewards/margins": 14.304622395833334, + "rewards/rejected": -7.147034708658854, + "step": 514 + }, + { + "epoch": 0.14115389886254626, + "grad_norm": 12.5625, + "kl": 5.346248626708984, + "learning_rate": 5e-06, + "logits/chosen": -19559369.14285714, + "logits/rejected": 1530993.2, + "logps/chosen": -466.83555385044644, + "logps/rejected": -421.85, + "loss": 0.0616, + "rewards/chosen": 6.285591670445034, + "rewards/margins": 11.835429545811245, + "rewards/rejected": -5.549837875366211, + "step": 515 + }, + { + "epoch": 0.14142798410305604, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9020060.363636363, + "logits/rejected": 4662037.538461538, + "logps/chosen": -422.4716796875, + "logps/rejected": -520.0541240985577, + "loss": 0.0427, + "rewards/chosen": 6.069047407670454, + "rewards/margins": 12.706781454019612, + "rewards/rejected": -6.637734046349158, + "step": 516 + }, + { + "epoch": 0.14170206934356586, + "grad_norm": 11.5625, + "kl": 3.5565237998962402, + "learning_rate": 5e-06, + "logits/chosen": -8134530.666666667, + "logits/rejected": -11858000.0, + "logps/chosen": -322.5374755859375, + "logps/rejected": -310.7281901041667, + "loss": 0.116, + "rewards/chosen": 3.8397432963053384, + "rewards/margins": 7.572998046875, + "rewards/rejected": -3.7332547505696616, + "step": 517 + }, + { + "epoch": 0.14197615458407564, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11797736.0, + "logits/rejected": 8503396.923076924, + "logps/chosen": -578.5807883522727, + "logps/rejected": -777.0647536057693, + "loss": 0.0245, + "rewards/chosen": 5.814041831276634, + "rewards/margins": 16.607157300402235, + "rewards/rejected": -10.7931154691256, + "step": 518 + }, + { + "epoch": 0.14225023982458546, + "grad_norm": 6.40625, + "kl": 2.913555145263672, + "learning_rate": 5e-06, + "logits/chosen": -4638422.0, + "logits/rejected": -7682525.333333333, + "logps/chosen": -433.8923746744792, + "logps/rejected": -489.7144368489583, + "loss": 0.0233, + "rewards/chosen": 4.760882059733073, + "rewards/margins": 12.429649353027344, + "rewards/rejected": -7.6687672932942705, + "step": 519 + }, + { + "epoch": 0.14252432506509524, + "grad_norm": 10.1875, + "kl": 2.512099027633667, + "learning_rate": 5e-06, + "logits/chosen": 25726848.0, + "logits/rejected": -8342382.285714285, + "logps/chosen": -407.4517578125, + "logps/rejected": -561.9676688058036, + "loss": 0.0663, + "rewards/chosen": 4.558953094482422, + "rewards/margins": 10.855453164236886, + "rewards/rejected": -6.296500069754464, + "step": 520 + }, + { + "epoch": 0.14279841030560506, + "grad_norm": 5.71875, + "kl": 0.8742803335189819, + "learning_rate": 5e-06, + "logits/chosen": 701095.2, + "logits/rejected": 34651491.55555555, + "logps/chosen": -436.42200520833336, + "logps/rejected": -685.9503038194445, + "loss": 0.0192, + "rewards/chosen": 4.2970942179361975, + "rewards/margins": 12.908265516493056, + "rewards/rejected": -8.611171298556858, + "step": 521 + }, + { + "epoch": 0.14307249554611484, + "grad_norm": 10.3125, + "kl": 0.6433258056640625, + "learning_rate": 5e-06, + "logits/chosen": -20622350.222222224, + "logits/rejected": -208717.86666666667, + "logps/chosen": -326.90557183159723, + "logps/rejected": -419.7560221354167, + "loss": 0.0581, + "rewards/chosen": 3.402361339992947, + "rewards/margins": 11.209460745917426, + "rewards/rejected": -7.807099405924479, + "step": 522 + }, + { + "epoch": 0.14334658078662463, + "grad_norm": 11.9375, + "kl": 5.593787670135498, + "learning_rate": 5e-06, + "logits/chosen": 9403181.714285715, + "logits/rejected": 6990640.0, + "logps/chosen": -509.20113699776783, + "logps/rejected": -418.469873046875, + "loss": 0.0732, + "rewards/chosen": 4.748334067208426, + "rewards/margins": 12.227718680245536, + "rewards/rejected": -7.4793846130371096, + "step": 523 + }, + { + "epoch": 0.14362066602713444, + "grad_norm": 12.25, + "kl": 3.1030471324920654, + "learning_rate": 5e-06, + "logits/chosen": -26226726.0, + "logits/rejected": -2436877.25, + "logps/chosen": -325.7071533203125, + "logps/rejected": -530.9447021484375, + "loss": 0.0888, + "rewards/chosen": 4.121739387512207, + "rewards/margins": 10.195124626159668, + "rewards/rejected": -6.073385238647461, + "step": 524 + }, + { + "epoch": 0.14389475126764423, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2405146.4615384615, + "logits/rejected": -18556184.727272727, + "logps/chosen": -365.5768855168269, + "logps/rejected": -579.4757634943181, + "loss": 0.0503, + "rewards/chosen": 4.119638296274038, + "rewards/margins": 13.541882281536822, + "rewards/rejected": -9.422243985262783, + "step": 525 + }, + { + "epoch": 0.14416883650815404, + "grad_norm": 14.5, + "kl": 1.7000045776367188, + "learning_rate": 5e-06, + "logits/chosen": -16251211.636363637, + "logits/rejected": -6131549.538461538, + "logps/chosen": -337.4658203125, + "logps/rejected": -391.82388070913464, + "loss": 0.0924, + "rewards/chosen": 4.4920432350852275, + "rewards/margins": 9.473159309867379, + "rewards/rejected": -4.981116074782151, + "step": 526 + }, + { + "epoch": 0.14444292174866383, + "grad_norm": 5.46875, + "kl": 1.7460473775863647, + "learning_rate": 5e-06, + "logits/chosen": 3414867.3333333335, + "logits/rejected": -3564094.6666666665, + "logps/chosen": -376.7683919270833, + "logps/rejected": -394.3453369140625, + "loss": 0.0347, + "rewards/chosen": 5.104538281758626, + "rewards/margins": 11.024453163146973, + "rewards/rejected": -5.919914881388347, + "step": 527 + }, + { + "epoch": 0.14471700698917364, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15891333.818181818, + "logits/rejected": -6002129.230769231, + "logps/chosen": -448.9606267755682, + "logps/rejected": -384.9141376201923, + "loss": 0.0251, + "rewards/chosen": 5.497893940318715, + "rewards/margins": 10.961095609864989, + "rewards/rejected": -5.463201669546274, + "step": 528 + }, + { + "epoch": 0.14499109222968343, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3447380.923076923, + "logits/rejected": -13267098.181818182, + "logps/chosen": -487.810546875, + "logps/rejected": -654.3671431107955, + "loss": 0.0204, + "rewards/chosen": 5.145776601938101, + "rewards/margins": 13.031073510230005, + "rewards/rejected": -7.885296908291903, + "step": 529 + }, + { + "epoch": 0.14526517747019324, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16926658.90909091, + "logits/rejected": 6817387.076923077, + "logps/chosen": -476.09694602272725, + "logps/rejected": -512.0441706730769, + "loss": 0.032, + "rewards/chosen": 4.4676031632856885, + "rewards/margins": 10.782134929737012, + "rewards/rejected": -6.3145317664513225, + "step": 530 + }, + { + "epoch": 0.14553926271070303, + "grad_norm": 7.5625, + "kl": 9.086786270141602, + "learning_rate": 5e-06, + "logits/chosen": -15603841.454545455, + "logits/rejected": 4450311.384615385, + "logps/chosen": -454.63565340909093, + "logps/rejected": -512.4550030048077, + "loss": 0.0612, + "rewards/chosen": 5.494621276855469, + "rewards/margins": 11.459212669959435, + "rewards/rejected": -5.964591393103967, + "step": 531 + }, + { + "epoch": 0.14581334795121284, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23094937.6, + "logits/rejected": -14437781.714285715, + "logps/chosen": -498.45009765625, + "logps/rejected": -455.9053431919643, + "loss": 0.0336, + "rewards/chosen": 5.446422576904297, + "rewards/margins": 12.016939980643137, + "rewards/rejected": -6.570517403738839, + "step": 532 + }, + { + "epoch": 0.14608743319172263, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11883890.461538462, + "logits/rejected": -6878886.545454546, + "logps/chosen": -414.7500751201923, + "logps/rejected": -475.79691938920456, + "loss": 0.0348, + "rewards/chosen": 4.9576263427734375, + "rewards/margins": 11.900476629083807, + "rewards/rejected": -6.942850286310369, + "step": 533 + }, + { + "epoch": 0.1463615184322324, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35482716.44444445, + "logits/rejected": 96064.53333333334, + "logps/chosen": -480.9538845486111, + "logps/rejected": -512.857421875, + "loss": 0.0833, + "rewards/chosen": 6.2110544840494795, + "rewards/margins": 12.135884602864584, + "rewards/rejected": -5.9248301188151045, + "step": 534 + }, + { + "epoch": 0.14663560367274223, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20944864.0, + "logits/rejected": -7660868.363636363, + "logps/chosen": -486.3671875, + "logps/rejected": -622.7566583806819, + "loss": 0.0391, + "rewards/chosen": 4.723124284010667, + "rewards/margins": 13.733304670640639, + "rewards/rejected": -9.01018038662997, + "step": 535 + }, + { + "epoch": 0.146909688913252, + "grad_norm": 21.5, + "kl": 16.1439266204834, + "learning_rate": 5e-06, + "logits/chosen": 21130235.733333334, + "logits/rejected": -4159602.6666666665, + "logps/chosen": -455.7719401041667, + "logps/rejected": -483.6125217013889, + "loss": 0.1556, + "rewards/chosen": 4.39088134765625, + "rewards/margins": 7.254003228081597, + "rewards/rejected": -2.8631218804253473, + "step": 536 + }, + { + "epoch": 0.14718377415376183, + "grad_norm": 11.5, + "kl": 4.660311222076416, + "learning_rate": 5e-06, + "logits/chosen": -2897785.25, + "logits/rejected": -6043845.5, + "logps/chosen": -436.21087646484375, + "logps/rejected": -548.902099609375, + "loss": 0.0439, + "rewards/chosen": 5.428614616394043, + "rewards/margins": 13.121464729309082, + "rewards/rejected": -7.692850112915039, + "step": 537 + }, + { + "epoch": 0.1474578593942716, + "grad_norm": 10.8125, + "kl": 7.010030746459961, + "learning_rate": 5e-06, + "logits/chosen": -7538892.8, + "logits/rejected": -1829990.0, + "logps/chosen": -404.555908203125, + "logps/rejected": -446.54833984375, + "loss": 0.1056, + "rewards/chosen": 5.533795166015625, + "rewards/margins": 10.582305581229074, + "rewards/rejected": -5.048510415213449, + "step": 538 + }, + { + "epoch": 0.14773194463478143, + "grad_norm": 5.375, + "kl": 0.1082509383559227, + "learning_rate": 5e-06, + "logits/chosen": 5883347.692307692, + "logits/rejected": -1411653.4545454546, + "logps/chosen": -383.7883112980769, + "logps/rejected": -518.1652610085227, + "loss": 0.0367, + "rewards/chosen": 4.386815584622896, + "rewards/margins": 10.808527593012457, + "rewards/rejected": -6.42171200838956, + "step": 539 + }, + { + "epoch": 0.1480060298752912, + "grad_norm": 5.65625, + "kl": 1.6048038005828857, + "learning_rate": 5e-06, + "logits/chosen": -1097087.5, + "logits/rejected": -5067624.5, + "logps/chosen": -430.6459655761719, + "logps/rejected": -339.87213134765625, + "loss": 0.0569, + "rewards/chosen": 5.0838212966918945, + "rewards/margins": 10.008060455322266, + "rewards/rejected": -4.924239158630371, + "step": 540 + }, + { + "epoch": 0.14828011511580103, + "grad_norm": 7.75, + "kl": 1.2441266775131226, + "learning_rate": 5e-06, + "logits/chosen": -19783916.8, + "logits/rejected": 10914403.42857143, + "logps/chosen": -439.07099609375, + "logps/rejected": -508.6421595982143, + "loss": 0.0423, + "rewards/chosen": 5.303313827514648, + "rewards/margins": 10.71962045942034, + "rewards/rejected": -5.416306631905692, + "step": 541 + }, + { + "epoch": 0.1485542003563108, + "grad_norm": 9.5625, + "kl": 0.13557052612304688, + "learning_rate": 5e-06, + "logits/chosen": 4850716.666666667, + "logits/rejected": 23761082.666666668, + "logps/chosen": -522.2681884765625, + "logps/rejected": -312.97015380859375, + "loss": 0.0546, + "rewards/chosen": 5.885537465413411, + "rewards/margins": 10.701275825500488, + "rewards/rejected": -4.815738360087077, + "step": 542 + }, + { + "epoch": 0.1488282855968206, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13754506.0, + "logits/rejected": -26563936.0, + "logps/chosen": -466.5826721191406, + "logps/rejected": -561.2271728515625, + "loss": 0.053, + "rewards/chosen": 3.7308006286621094, + "rewards/margins": 11.10707139968872, + "rewards/rejected": -7.376270771026611, + "step": 543 + }, + { + "epoch": 0.1491023708373304, + "grad_norm": 12.75, + "kl": 8.343570709228516, + "learning_rate": 5e-06, + "logits/chosen": 26233704.0, + "logits/rejected": -11720745.333333334, + "logps/chosen": -432.2632649739583, + "logps/rejected": -479.3277994791667, + "loss": 0.0609, + "rewards/chosen": 6.085273742675781, + "rewards/margins": 12.366721471150715, + "rewards/rejected": -6.281447728474935, + "step": 544 + }, + { + "epoch": 0.1493764560778402, + "grad_norm": 11.8125, + "kl": 4.027656555175781, + "learning_rate": 5e-06, + "logits/chosen": -10705901.333333334, + "logits/rejected": 7647886.666666667, + "logps/chosen": -478.7068684895833, + "logps/rejected": -400.4776204427083, + "loss": 0.0507, + "rewards/chosen": 5.224957784016927, + "rewards/margins": 10.648794174194336, + "rewards/rejected": -5.423836390177409, + "step": 545 + }, + { + "epoch": 0.14965054131835, + "grad_norm": 9.0, + "kl": 1.295804738998413, + "learning_rate": 5e-06, + "logits/chosen": -12440628.923076924, + "logits/rejected": -17389416.727272727, + "logps/chosen": -465.13979867788464, + "logps/rejected": -471.15891335227275, + "loss": 0.0479, + "rewards/chosen": 5.119925865760217, + "rewards/margins": 12.503448593032944, + "rewards/rejected": -7.3835227272727275, + "step": 546 + }, + { + "epoch": 0.1499246265588598, + "grad_norm": 12.8125, + "kl": 7.0821685791015625, + "learning_rate": 5e-06, + "logits/chosen": 11182172.57142857, + "logits/rejected": 3200452.8, + "logps/chosen": -423.23866489955356, + "logps/rejected": -362.2056884765625, + "loss": 0.1065, + "rewards/chosen": 4.275881631033761, + "rewards/margins": 11.302140481131417, + "rewards/rejected": -7.026258850097657, + "step": 547 + }, + { + "epoch": 0.1501987117993696, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7906512.7272727275, + "logits/rejected": -3982436.0, + "logps/chosen": -517.5380415482955, + "logps/rejected": -377.9611628605769, + "loss": 0.0265, + "rewards/chosen": 5.592125632546165, + "rewards/margins": 12.90800075931149, + "rewards/rejected": -7.315875126765325, + "step": 548 + }, + { + "epoch": 0.1504727970398794, + "grad_norm": 6.5625, + "kl": 1.6002118587493896, + "learning_rate": 5e-06, + "logits/chosen": -18369440.0, + "logits/rejected": 15183968.0, + "logps/chosen": -409.7824041193182, + "logps/rejected": -530.9921123798077, + "loss": 0.036, + "rewards/chosen": 5.696862654252485, + "rewards/margins": 13.087767594344133, + "rewards/rejected": -7.390904940091646, + "step": 549 + }, + { + "epoch": 0.1507468822803892, + "grad_norm": 6.0625, + "kl": 1.3930957317352295, + "learning_rate": 5e-06, + "logits/chosen": -7324716.571428572, + "logits/rejected": -9934646.4, + "logps/chosen": -413.44349888392856, + "logps/rejected": -565.11875, + "loss": 0.0627, + "rewards/chosen": 5.489056723458426, + "rewards/margins": 13.724171774727957, + "rewards/rejected": -8.235115051269531, + "step": 550 + }, + { + "epoch": 0.151020967520899, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8004458.181818182, + "logits/rejected": -16233137.23076923, + "logps/chosen": -530.6799094460227, + "logps/rejected": -475.1277043269231, + "loss": 0.0394, + "rewards/chosen": 5.086612354625355, + "rewards/margins": 11.139715288068865, + "rewards/rejected": -6.05310293344351, + "step": 551 + }, + { + "epoch": 0.1512950527614088, + "grad_norm": 16.125, + "kl": 12.964876174926758, + "learning_rate": 5e-06, + "logits/chosen": -4152775.0, + "logits/rejected": -383553.5, + "logps/chosen": -457.89410400390625, + "logps/rejected": -419.9881896972656, + "loss": 0.1442, + "rewards/chosen": 5.157751560211182, + "rewards/margins": 9.325258731842041, + "rewards/rejected": -4.167507171630859, + "step": 552 + }, + { + "epoch": 0.1515691380019186, + "grad_norm": 4.53125, + "kl": 2.0241293907165527, + "learning_rate": 5e-06, + "logits/chosen": -19208674.666666668, + "logits/rejected": 897165.3333333334, + "logps/chosen": -493.575439453125, + "logps/rejected": -593.1337076822916, + "loss": 0.0395, + "rewards/chosen": 5.848843892415364, + "rewards/margins": 12.711115519205729, + "rewards/rejected": -6.862271626790364, + "step": 553 + }, + { + "epoch": 0.15184322324242838, + "grad_norm": 10.1875, + "kl": 0.7016042470932007, + "learning_rate": 5e-06, + "logits/chosen": -11979268.923076924, + "logits/rejected": -2416802.1818181816, + "logps/chosen": -449.6105769230769, + "logps/rejected": -458.43190696022725, + "loss": 0.0892, + "rewards/chosen": 6.096557030310998, + "rewards/margins": 12.756615485344733, + "rewards/rejected": -6.660058455033735, + "step": 554 + }, + { + "epoch": 0.1521173084829382, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 41501.4, + "logits/rejected": -25648.0, + "logps/chosen": -470.590234375, + "logps/rejected": -546.7232142857143, + "loss": 0.0375, + "rewards/chosen": 5.366466522216797, + "rewards/margins": 11.998538970947266, + "rewards/rejected": -6.632072448730469, + "step": 555 + }, + { + "epoch": 0.15239139372344798, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6421476.444444444, + "logits/rejected": -22378060.8, + "logps/chosen": -370.78724500868054, + "logps/rejected": -500.29749348958336, + "loss": 0.0484, + "rewards/chosen": 4.790027194552952, + "rewards/margins": 11.658258904351129, + "rewards/rejected": -6.868231709798177, + "step": 556 + }, + { + "epoch": 0.1526654789639578, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7189026.0, + "logits/rejected": 14818656.0, + "logps/chosen": -505.59527587890625, + "logps/rejected": -503.985595703125, + "loss": 0.0499, + "rewards/chosen": 5.115341663360596, + "rewards/margins": 11.082106113433838, + "rewards/rejected": -5.966764450073242, + "step": 557 + }, + { + "epoch": 0.15293956420446758, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2828444.6666666665, + "logits/rejected": 1565516.6666666667, + "logps/chosen": -418.5796305338542, + "logps/rejected": -471.9374186197917, + "loss": 0.0407, + "rewards/chosen": 4.862676620483398, + "rewards/margins": 11.50564193725586, + "rewards/rejected": -6.642965316772461, + "step": 558 + }, + { + "epoch": 0.1532136494449774, + "grad_norm": 13.875, + "kl": 4.758914947509766, + "learning_rate": 5e-06, + "logits/chosen": -17998400.0, + "logits/rejected": 12307283.0, + "logps/chosen": -534.3897094726562, + "logps/rejected": -460.31878662109375, + "loss": 0.0731, + "rewards/chosen": 4.416209697723389, + "rewards/margins": 11.044950008392334, + "rewards/rejected": -6.628740310668945, + "step": 559 + }, + { + "epoch": 0.15348773468548718, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1172859.2, + "logits/rejected": -10252682.666666666, + "logps/chosen": -498.25074869791666, + "logps/rejected": -531.81201171875, + "loss": 0.0514, + "rewards/chosen": 4.654095967610677, + "rewards/margins": 13.155108133951824, + "rewards/rejected": -8.501012166341146, + "step": 560 + }, + { + "epoch": 0.153761819925997, + "grad_norm": 7.5, + "kl": 1.7432136535644531, + "learning_rate": 5e-06, + "logits/chosen": -8073301.6, + "logits/rejected": -14402788.57142857, + "logps/chosen": -481.19384765625, + "logps/rejected": -451.77099609375, + "loss": 0.0895, + "rewards/chosen": 5.13271484375, + "rewards/margins": 11.427492850167411, + "rewards/rejected": -6.294778006417411, + "step": 561 + }, + { + "epoch": 0.15403590516650678, + "grad_norm": 10.6875, + "kl": 4.275628089904785, + "learning_rate": 5e-06, + "logits/chosen": -12825746.133333333, + "logits/rejected": 1192836.888888889, + "logps/chosen": -363.2466145833333, + "logps/rejected": -512.9204644097222, + "loss": 0.0796, + "rewards/chosen": 4.072491200764974, + "rewards/margins": 13.442607540554471, + "rewards/rejected": -9.370116339789497, + "step": 562 + }, + { + "epoch": 0.1543099904070166, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30121160.727272727, + "logits/rejected": 14967369.846153846, + "logps/chosen": -509.3460138494318, + "logps/rejected": -593.9396033653846, + "loss": 0.0197, + "rewards/chosen": 5.469707489013672, + "rewards/margins": 12.382302797757662, + "rewards/rejected": -6.91259530874399, + "step": 563 + }, + { + "epoch": 0.15458407564752638, + "grad_norm": 12.4375, + "kl": 5.761371612548828, + "learning_rate": 5e-06, + "logits/chosen": 12711840.0, + "logits/rejected": -16059068.8, + "logps/chosen": -413.27200753348217, + "logps/rejected": -502.84697265625, + "loss": 0.0553, + "rewards/chosen": 4.379270281110491, + "rewards/margins": 11.852776445661274, + "rewards/rejected": -7.473506164550781, + "step": 564 + }, + { + "epoch": 0.15485816088803617, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11215968.0, + "logits/rejected": -16592122.666666666, + "logps/chosen": -387.7265625, + "logps/rejected": -510.3687337239583, + "loss": 0.0961, + "rewards/chosen": 4.460521380106608, + "rewards/margins": 10.662376085917156, + "rewards/rejected": -6.201854705810547, + "step": 565 + }, + { + "epoch": 0.15513224612854598, + "grad_norm": 7.375, + "kl": 1.7528165578842163, + "learning_rate": 5e-06, + "logits/chosen": 16104426.181818182, + "logits/rejected": -10833811.692307692, + "logps/chosen": -325.55104758522725, + "logps/rejected": -366.97213040865387, + "loss": 0.0749, + "rewards/chosen": 4.878678408536044, + "rewards/margins": 9.99410469215233, + "rewards/rejected": -5.115426283616286, + "step": 566 + }, + { + "epoch": 0.15540633136905577, + "grad_norm": 17.125, + "kl": 3.4098422527313232, + "learning_rate": 5e-06, + "logits/chosen": 8593019.42857143, + "logits/rejected": -4272636.4, + "logps/chosen": -520.9862583705357, + "logps/rejected": -508.7240234375, + "loss": 0.0704, + "rewards/chosen": 5.530862535749163, + "rewards/margins": 12.385050310407365, + "rewards/rejected": -6.854187774658203, + "step": 567 + }, + { + "epoch": 0.15568041660956558, + "grad_norm": 12.9375, + "kl": 0.9093475341796875, + "learning_rate": 5e-06, + "logits/chosen": 26081213.333333332, + "logits/rejected": 2539883.3333333335, + "logps/chosen": -580.0585123697916, + "logps/rejected": -456.4210205078125, + "loss": 0.0811, + "rewards/chosen": 4.030539512634277, + "rewards/margins": 9.362327257792156, + "rewards/rejected": -5.331787745157878, + "step": 568 + }, + { + "epoch": 0.15595450185007537, + "grad_norm": 16.875, + "kl": 5.2902021408081055, + "learning_rate": 5e-06, + "logits/chosen": 12886873.6, + "logits/rejected": -5512829.714285715, + "logps/chosen": -495.08017578125, + "logps/rejected": -489.44217354910717, + "loss": 0.0659, + "rewards/chosen": 5.811245727539062, + "rewards/margins": 10.864979553222657, + "rewards/rejected": -5.053733825683594, + "step": 569 + }, + { + "epoch": 0.15622858709058518, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22796827.42857143, + "logits/rejected": 3473922.3529411764, + "logps/chosen": -405.1205357142857, + "logps/rejected": -668.552734375, + "loss": 0.0399, + "rewards/chosen": 5.352302006312779, + "rewards/margins": 14.205779035552208, + "rewards/rejected": -8.85347702923943, + "step": 570 + }, + { + "epoch": 0.15650267233109497, + "grad_norm": 15.6875, + "kl": 4.311643600463867, + "learning_rate": 5e-06, + "logits/chosen": -5352140.0, + "logits/rejected": 1022287.8333333334, + "logps/chosen": -416.6410319010417, + "logps/rejected": -515.9449055989584, + "loss": 0.0731, + "rewards/chosen": 5.077417055765788, + "rewards/margins": 9.832870165507, + "rewards/rejected": -4.755453109741211, + "step": 571 + }, + { + "epoch": 0.15677675757160478, + "grad_norm": 11.75, + "kl": 8.629748344421387, + "learning_rate": 5e-06, + "logits/chosen": -17553662.222222224, + "logits/rejected": -18324980.0, + "logps/chosen": -497.08653428819446, + "logps/rejected": -598.8126627604166, + "loss": 0.0595, + "rewards/chosen": 4.78342522515191, + "rewards/margins": 12.485023074679905, + "rewards/rejected": -7.701597849527995, + "step": 572 + }, + { + "epoch": 0.15705084281211457, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 17352387.555555556, + "logits/rejected": -11370734.933333334, + "logps/chosen": -442.3085666232639, + "logps/rejected": -461.78427734375, + "loss": 0.0346, + "rewards/chosen": 6.045078277587891, + "rewards/margins": 12.830903371175129, + "rewards/rejected": -6.785825093587239, + "step": 573 + }, + { + "epoch": 0.15732492805262438, + "grad_norm": 20.5, + "kl": 17.206520080566406, + "learning_rate": 5e-06, + "logits/chosen": -12345366.588235294, + "logits/rejected": -4391753.714285715, + "logps/chosen": -511.6578584558824, + "logps/rejected": -577.2845284598214, + "loss": 0.1183, + "rewards/chosen": 4.916720222024357, + "rewards/margins": 13.111989125484179, + "rewards/rejected": -8.195268903459821, + "step": 574 + }, + { + "epoch": 0.15759901329313417, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11385242.666666666, + "logits/rejected": 872564.8, + "logps/chosen": -399.36102973090277, + "logps/rejected": -399.1138020833333, + "loss": 0.0168, + "rewards/chosen": 5.810558742947048, + "rewards/margins": 11.79163835313585, + "rewards/rejected": -5.981079610188802, + "step": 575 + }, + { + "epoch": 0.15787309853364395, + "grad_norm": 8.25, + "kl": 1.7618205547332764, + "learning_rate": 5e-06, + "logits/chosen": -12381991.2, + "logits/rejected": -6964927.428571428, + "logps/chosen": -483.90439453125, + "logps/rejected": -318.56630161830356, + "loss": 0.0578, + "rewards/chosen": 5.67884407043457, + "rewards/margins": 10.486828558785575, + "rewards/rejected": -4.807984488351004, + "step": 576 + }, + { + "epoch": 0.15814718377415377, + "grad_norm": 14.1875, + "kl": 1.9587421417236328, + "learning_rate": 5e-06, + "logits/chosen": -5327485.142857143, + "logits/rejected": -1310499.2, + "logps/chosen": -468.7845982142857, + "logps/rejected": -519.53447265625, + "loss": 0.0602, + "rewards/chosen": 4.542179652622768, + "rewards/margins": 10.737486812046598, + "rewards/rejected": -6.1953071594238285, + "step": 577 + }, + { + "epoch": 0.15842126901466355, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6339002.0, + "logits/rejected": 581469.1666666666, + "logps/chosen": -521.9292805989584, + "logps/rejected": -486.0059407552083, + "loss": 0.0387, + "rewards/chosen": 4.194308598836263, + "rewards/margins": 11.413398106892902, + "rewards/rejected": -7.219089508056641, + "step": 578 + }, + { + "epoch": 0.15869535425517337, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 34170744.0, + "logits/rejected": -6131188.444444444, + "logps/chosen": -385.1976318359375, + "logps/rejected": -544.8289930555555, + "loss": 0.0946, + "rewards/chosen": 5.003133138020833, + "rewards/margins": 11.47082773844401, + "rewards/rejected": -6.467694600423177, + "step": 579 + }, + { + "epoch": 0.15896943949568315, + "grad_norm": 11.8125, + "kl": 6.543907165527344, + "learning_rate": 5e-06, + "logits/chosen": -10657586.0, + "logits/rejected": 10758698.0, + "logps/chosen": -369.29888916015625, + "logps/rejected": -596.492431640625, + "loss": 0.0901, + "rewards/chosen": 4.716132640838623, + "rewards/margins": 12.851670742034912, + "rewards/rejected": -8.135538101196289, + "step": 580 + }, + { + "epoch": 0.15924352473619297, + "grad_norm": 11.5, + "kl": 0.21943537890911102, + "learning_rate": 5e-06, + "logits/chosen": -16306348.57142857, + "logits/rejected": -4303826.4, + "logps/chosen": -434.2088099888393, + "logps/rejected": -778.4916015625, + "loss": 0.0651, + "rewards/chosen": 3.8467314583914622, + "rewards/margins": 12.133083234514508, + "rewards/rejected": -8.286351776123047, + "step": 581 + }, + { + "epoch": 0.15951760997670275, + "grad_norm": 6.53125, + "kl": 7.1478071212768555, + "learning_rate": 5e-06, + "logits/chosen": 6938957.0, + "logits/rejected": -10143872.0, + "logps/chosen": -512.41943359375, + "logps/rejected": -520.489990234375, + "loss": 0.0445, + "rewards/chosen": 6.454849720001221, + "rewards/margins": 13.70491647720337, + "rewards/rejected": -7.250066757202148, + "step": 582 + }, + { + "epoch": 0.15979169521721256, + "grad_norm": 12.25, + "kl": 10.34714412689209, + "learning_rate": 5e-06, + "logits/chosen": -11323276.235294119, + "logits/rejected": -4436993.142857143, + "logps/chosen": -449.60842715992646, + "logps/rejected": -572.3104073660714, + "loss": 0.0514, + "rewards/chosen": 5.8663428811466, + "rewards/margins": 13.543643470571823, + "rewards/rejected": -7.677300589425223, + "step": 583 + }, + { + "epoch": 0.16006578045772235, + "grad_norm": 9.625, + "kl": 1.4708786010742188, + "learning_rate": 5e-06, + "logits/chosen": -12581109.6, + "logits/rejected": -9056502.285714285, + "logps/chosen": -478.440380859375, + "logps/rejected": -578.5616978236607, + "loss": 0.0499, + "rewards/chosen": 5.401086044311524, + "rewards/margins": 16.342020361764092, + "rewards/rejected": -10.940934317452568, + "step": 584 + }, + { + "epoch": 0.16033986569823214, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9052035.636363637, + "logits/rejected": -3314979.6923076925, + "logps/chosen": -429.2013494318182, + "logps/rejected": -792.3548677884615, + "loss": 0.0109, + "rewards/chosen": 4.987159382213246, + "rewards/margins": 16.688621174205434, + "rewards/rejected": -11.701461791992188, + "step": 585 + }, + { + "epoch": 0.16061395093874195, + "grad_norm": 10.6875, + "kl": 2.2626407146453857, + "learning_rate": 5e-06, + "logits/chosen": -2724571.076923077, + "logits/rejected": -611249.0909090909, + "logps/chosen": -501.31494140625, + "logps/rejected": -517.1008522727273, + "loss": 0.0495, + "rewards/chosen": 5.267755361703726, + "rewards/margins": 11.36803196193455, + "rewards/rejected": -6.100276600230824, + "step": 586 + }, + { + "epoch": 0.16088803617925174, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15509230.0, + "logits/rejected": 7952665.0, + "logps/chosen": -591.14208984375, + "logps/rejected": -548.32080078125, + "loss": 0.0452, + "rewards/chosen": 5.308307647705078, + "rewards/margins": 13.20925235748291, + "rewards/rejected": -7.900944709777832, + "step": 587 + }, + { + "epoch": 0.16116212141976155, + "grad_norm": 7.3125, + "kl": 0.8684476613998413, + "learning_rate": 5e-06, + "logits/chosen": -21464094.545454547, + "logits/rejected": -20607389.53846154, + "logps/chosen": -420.55366654829544, + "logps/rejected": -470.59427584134613, + "loss": 0.0384, + "rewards/chosen": 4.096528833562678, + "rewards/margins": 9.385719859516705, + "rewards/rejected": -5.289191025954026, + "step": 588 + }, + { + "epoch": 0.16143620666027134, + "grad_norm": 9.9375, + "kl": 6.693058013916016, + "learning_rate": 5e-06, + "logits/chosen": 7993347.428571428, + "logits/rejected": -5490477.2, + "logps/chosen": -392.38180106026783, + "logps/rejected": -450.580908203125, + "loss": 0.0692, + "rewards/chosen": 5.129941667829241, + "rewards/margins": 10.285830797467913, + "rewards/rejected": -5.1558891296386715, + "step": 589 + }, + { + "epoch": 0.16171029190078115, + "grad_norm": 4.6875, + "kl": 6.798520088195801, + "learning_rate": 5e-06, + "logits/chosen": 1199547.6923076923, + "logits/rejected": -6975269.818181818, + "logps/chosen": -443.7477463942308, + "logps/rejected": -534.5969016335227, + "loss": 0.0478, + "rewards/chosen": 5.66654557448167, + "rewards/margins": 12.426325071108092, + "rewards/rejected": -6.759779496626421, + "step": 590 + }, + { + "epoch": 0.16198437714129094, + "grad_norm": 6.34375, + "kl": 3.3228163719177246, + "learning_rate": 5e-06, + "logits/chosen": -4859122.333333333, + "logits/rejected": -21014374.666666668, + "logps/chosen": -493.0299479166667, + "logps/rejected": -586.0147298177084, + "loss": 0.0195, + "rewards/chosen": 5.148767789204915, + "rewards/margins": 12.859822273254395, + "rewards/rejected": -7.7110544840494795, + "step": 591 + }, + { + "epoch": 0.16225846238180075, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21052065.6, + "logits/rejected": 12456104.0, + "logps/chosen": -477.59609375, + "logps/rejected": -456.83140345982144, + "loss": 0.0545, + "rewards/chosen": 5.6855918884277346, + "rewards/margins": 12.622719682965961, + "rewards/rejected": -6.937127794538226, + "step": 592 + }, + { + "epoch": 0.16253254762231054, + "grad_norm": 10.75, + "kl": 2.7291362285614014, + "learning_rate": 5e-06, + "logits/chosen": -11918392.8, + "logits/rejected": 10465554.285714285, + "logps/chosen": -381.458203125, + "logps/rejected": -606.2400948660714, + "loss": 0.0423, + "rewards/chosen": 5.796901702880859, + "rewards/margins": 11.56010273524693, + "rewards/rejected": -5.763201032366071, + "step": 593 + }, + { + "epoch": 0.16280663286282035, + "grad_norm": 12.0625, + "kl": 7.487538814544678, + "learning_rate": 5e-06, + "logits/chosen": -8001026.909090909, + "logits/rejected": 7078738.461538462, + "logps/chosen": -397.0167791193182, + "logps/rejected": -644.3128004807693, + "loss": 0.085, + "rewards/chosen": 3.7601852416992188, + "rewards/margins": 10.96469937838041, + "rewards/rejected": -7.20451413668119, + "step": 594 + }, + { + "epoch": 0.16308071810333014, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3745489.3333333335, + "logits/rejected": -10639032.666666666, + "logps/chosen": -530.18212890625, + "logps/rejected": -616.0508626302084, + "loss": 0.0058, + "rewards/chosen": 6.1151885986328125, + "rewards/margins": 14.470842361450195, + "rewards/rejected": -8.355653762817383, + "step": 595 + }, + { + "epoch": 0.16335480334383992, + "grad_norm": 7.9375, + "kl": 2.688185453414917, + "learning_rate": 5e-06, + "logits/chosen": -17976625.454545453, + "logits/rejected": -6369225.846153846, + "logps/chosen": -417.20432350852275, + "logps/rejected": -382.56629356971155, + "loss": 0.0695, + "rewards/chosen": 5.68178315596147, + "rewards/margins": 11.078763441606, + "rewards/rejected": -5.396980285644531, + "step": 596 + }, + { + "epoch": 0.16362888858434974, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -991958.3076923077, + "logits/rejected": -14542749.090909092, + "logps/chosen": -480.1695087139423, + "logps/rejected": -747.0297407670455, + "loss": 0.027, + "rewards/chosen": 4.728157336895283, + "rewards/margins": 14.16811746984095, + "rewards/rejected": -9.439960132945668, + "step": 597 + }, + { + "epoch": 0.16390297382485952, + "grad_norm": 12.5, + "kl": 5.9982171058654785, + "learning_rate": 5e-06, + "logits/chosen": 1032654.9333333333, + "logits/rejected": 1405691.111111111, + "logps/chosen": -506.73681640625, + "logps/rejected": -436.9919704861111, + "loss": 0.0302, + "rewards/chosen": 5.927615356445313, + "rewards/margins": 14.07066141764323, + "rewards/rejected": -8.143046061197916, + "step": 598 + }, + { + "epoch": 0.16417705906536934, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -975705.25, + "logits/rejected": -7442788.0, + "logps/chosen": -428.7931823730469, + "logps/rejected": -457.26446533203125, + "loss": 0.0212, + "rewards/chosen": 4.656890869140625, + "rewards/margins": 11.898754119873047, + "rewards/rejected": -7.241863250732422, + "step": 599 + }, + { + "epoch": 0.16445114430587912, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 31468435.2, + "logits/rejected": 71734345.14285715, + "logps/chosen": -466.204443359375, + "logps/rejected": -441.74574497767856, + "loss": 0.0429, + "rewards/chosen": 5.582163619995117, + "rewards/margins": 13.26701077052525, + "rewards/rejected": -7.684847150530134, + "step": 600 + }, + { + "epoch": 0.16472522954638893, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21082824.0, + "logits/rejected": 3564345.0, + "logps/chosen": -353.9439290364583, + "logps/rejected": -366.9765218098958, + "loss": 0.0888, + "rewards/chosen": 4.342309951782227, + "rewards/margins": 7.8715051015218105, + "rewards/rejected": -3.5291951497395835, + "step": 601 + }, + { + "epoch": 0.16499931478689872, + "grad_norm": 9.5, + "kl": 3.7175166606903076, + "learning_rate": 5e-06, + "logits/chosen": -9241935.272727273, + "logits/rejected": 1783704.6153846155, + "logps/chosen": -372.6930486505682, + "logps/rejected": -425.8650090144231, + "loss": 0.0955, + "rewards/chosen": 3.685084256258878, + "rewards/margins": 10.668220946838805, + "rewards/rejected": -6.9831366905799275, + "step": 602 + }, + { + "epoch": 0.16527340002740853, + "grad_norm": 12.25, + "kl": 0.018527984619140625, + "learning_rate": 5e-06, + "logits/chosen": -7693715.428571428, + "logits/rejected": -4969118.4, + "logps/chosen": -354.350830078125, + "logps/rejected": -588.23154296875, + "loss": 0.0544, + "rewards/chosen": 4.955654689243862, + "rewards/margins": 12.107929011753628, + "rewards/rejected": -7.152274322509766, + "step": 603 + }, + { + "epoch": 0.16554748526791832, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6869188.0, + "logits/rejected": -2536917.714285714, + "logps/chosen": -309.8878173828125, + "logps/rejected": -489.63246372767856, + "loss": 0.0406, + "rewards/chosen": 5.1540180206298825, + "rewards/margins": 11.837336022513252, + "rewards/rejected": -6.683318001883371, + "step": 604 + }, + { + "epoch": 0.16582157050842813, + "grad_norm": 12.9375, + "kl": 16.354469299316406, + "learning_rate": 5e-06, + "logits/chosen": -19429112.470588237, + "logits/rejected": -1491909.142857143, + "logps/chosen": -405.96961167279414, + "logps/rejected": -448.18540736607144, + "loss": 0.082, + "rewards/chosen": 5.253938562729779, + "rewards/margins": 12.099199727803718, + "rewards/rejected": -6.84526116507394, + "step": 605 + }, + { + "epoch": 0.16609565574893792, + "grad_norm": 8.625, + "kl": 0.2581399381160736, + "learning_rate": 5e-06, + "logits/chosen": -4891841.142857143, + "logits/rejected": -12961888.8, + "logps/chosen": -342.68355887276783, + "logps/rejected": -444.5822265625, + "loss": 0.0365, + "rewards/chosen": 4.993088858468192, + "rewards/margins": 11.781675284249442, + "rewards/rejected": -6.78858642578125, + "step": 606 + }, + { + "epoch": 0.1663697409894477, + "grad_norm": 13.125, + "kl": 0.00039227804518304765, + "learning_rate": 5e-06, + "logits/chosen": -4891206.857142857, + "logits/rejected": 4478147.2, + "logps/chosen": -400.7456752232143, + "logps/rejected": -675.07861328125, + "loss": 0.092, + "rewards/chosen": 4.722159794398716, + "rewards/margins": 14.735124424525669, + "rewards/rejected": -10.012964630126953, + "step": 607 + }, + { + "epoch": 0.16664382622995752, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34091712.0, + "logits/rejected": -25159462.0, + "logps/chosen": -361.41656494140625, + "logps/rejected": -461.3290100097656, + "loss": 0.0206, + "rewards/chosen": 5.284795761108398, + "rewards/margins": 11.552489757537842, + "rewards/rejected": -6.267693996429443, + "step": 608 + }, + { + "epoch": 0.1669179114704673, + "grad_norm": 10.5625, + "kl": 3.3623461723327637, + "learning_rate": 5e-06, + "logits/chosen": 6837292.923076923, + "logits/rejected": -1486046.5454545454, + "logps/chosen": -403.54781400240387, + "logps/rejected": -588.4688387784091, + "loss": 0.0462, + "rewards/chosen": 5.061622619628906, + "rewards/margins": 13.50637470592152, + "rewards/rejected": -8.444752086292613, + "step": 609 + }, + { + "epoch": 0.16719199671097712, + "grad_norm": 14.5, + "kl": 0.8822581171989441, + "learning_rate": 5e-06, + "logits/chosen": 13707438.4, + "logits/rejected": 4474153.714285715, + "logps/chosen": -369.165185546875, + "logps/rejected": -585.3683733258929, + "loss": 0.1197, + "rewards/chosen": 4.216021347045898, + "rewards/margins": 9.927723966326031, + "rewards/rejected": -5.711702619280134, + "step": 610 + }, + { + "epoch": 0.1674660819514869, + "grad_norm": 8.8125, + "kl": 1.596358060836792, + "learning_rate": 5e-06, + "logits/chosen": 26258974.0, + "logits/rejected": -7824429.0, + "logps/chosen": -474.74664306640625, + "logps/rejected": -491.9432373046875, + "loss": 0.0323, + "rewards/chosen": 5.283700466156006, + "rewards/margins": 11.705013275146484, + "rewards/rejected": -6.4213128089904785, + "step": 611 + }, + { + "epoch": 0.16774016719199672, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26799819.42857143, + "logits/rejected": -1542465.4117647058, + "logps/chosen": -592.5936104910714, + "logps/rejected": -425.9193474264706, + "loss": 0.0523, + "rewards/chosen": 6.027210235595703, + "rewards/margins": 12.46871073105756, + "rewards/rejected": -6.441500495461857, + "step": 612 + }, + { + "epoch": 0.1680142524325065, + "grad_norm": 7.84375, + "kl": 2.365126371383667, + "learning_rate": 5e-06, + "logits/chosen": -25530170.181818184, + "logits/rejected": 68952546.46153846, + "logps/chosen": -442.3885387073864, + "logps/rejected": -628.3334585336538, + "loss": 0.046, + "rewards/chosen": 5.079356800426137, + "rewards/margins": 12.586581250170727, + "rewards/rejected": -7.507224449744592, + "step": 613 + }, + { + "epoch": 0.16828833767301632, + "grad_norm": 10.75, + "kl": 0.2827568054199219, + "learning_rate": 5e-06, + "logits/chosen": -4128424.0, + "logits/rejected": -54651.71428571428, + "logps/chosen": -397.627392578125, + "logps/rejected": -557.4456263950893, + "loss": 0.0308, + "rewards/chosen": 6.433086395263672, + "rewards/margins": 13.218214852469309, + "rewards/rejected": -6.785128457205636, + "step": 614 + }, + { + "epoch": 0.1685624229135261, + "grad_norm": 11.8125, + "kl": 7.693988800048828, + "learning_rate": 5e-06, + "logits/chosen": -23898385.230769232, + "logits/rejected": 32550548.363636363, + "logps/chosen": -430.98550180288464, + "logps/rejected": -507.16415127840907, + "loss": 0.0643, + "rewards/chosen": 5.607218815730168, + "rewards/margins": 10.975924912032546, + "rewards/rejected": -5.368706096302379, + "step": 615 + }, + { + "epoch": 0.1688365081540359, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19332352.0, + "logits/rejected": 7757929.0, + "logps/chosen": -456.856689453125, + "logps/rejected": -488.0924987792969, + "loss": 0.0649, + "rewards/chosen": 4.695016860961914, + "rewards/margins": 10.000749588012695, + "rewards/rejected": -5.305732727050781, + "step": 616 + }, + { + "epoch": 0.1691105933945457, + "grad_norm": 13.1875, + "kl": 8.47950553894043, + "learning_rate": 5e-06, + "logits/chosen": -18805700.266666666, + "logits/rejected": -1108292.2222222222, + "logps/chosen": -447.11774088541665, + "logps/rejected": -417.92914496527777, + "loss": 0.0408, + "rewards/chosen": 5.531303914388021, + "rewards/margins": 12.833858574761285, + "rewards/rejected": -7.302554660373264, + "step": 617 + }, + { + "epoch": 0.1693846786350555, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24637088.0, + "logits/rejected": 25770372.0, + "logps/chosen": -467.15399169921875, + "logps/rejected": -585.050537109375, + "loss": 0.0609, + "rewards/chosen": 4.638179302215576, + "rewards/margins": 14.519019603729248, + "rewards/rejected": -9.880840301513672, + "step": 618 + }, + { + "epoch": 0.1696587638755653, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16919421.714285713, + "logits/rejected": -9626586.352941176, + "logps/chosen": -408.08370535714283, + "logps/rejected": -449.3669002757353, + "loss": 0.03, + "rewards/chosen": 5.362816946847098, + "rewards/margins": 12.131371634347097, + "rewards/rejected": -6.7685546875, + "step": 619 + }, + { + "epoch": 0.1699328491160751, + "grad_norm": 18.375, + "kl": 6.561077117919922, + "learning_rate": 5e-06, + "logits/chosen": -21778816.0, + "logits/rejected": -12777814.0, + "logps/chosen": -464.49786376953125, + "logps/rejected": -495.2750244140625, + "loss": 0.0716, + "rewards/chosen": 4.5224714279174805, + "rewards/margins": 11.292692184448242, + "rewards/rejected": -6.770220756530762, + "step": 620 + }, + { + "epoch": 0.1702069343565849, + "grad_norm": 9.375, + "kl": 1.5907809734344482, + "learning_rate": 5e-06, + "logits/chosen": -12429041.23076923, + "logits/rejected": -16140980.363636363, + "logps/chosen": -376.4668719951923, + "logps/rejected": -428.6189630681818, + "loss": 0.044, + "rewards/chosen": 4.267967224121094, + "rewards/margins": 13.596860712224787, + "rewards/rejected": -9.328893488103693, + "step": 621 + }, + { + "epoch": 0.1704810195970947, + "grad_norm": 5.28125, + "kl": 0.8825353384017944, + "learning_rate": 5e-06, + "logits/chosen": -34351638.4, + "logits/rejected": -13113014.857142856, + "logps/chosen": -520.41376953125, + "logps/rejected": -529.1662946428571, + "loss": 0.0181, + "rewards/chosen": 6.936149597167969, + "rewards/margins": 14.476635524204799, + "rewards/rejected": -7.540485927036831, + "step": 622 + }, + { + "epoch": 0.1707551048376045, + "grad_norm": 9.6875, + "kl": 1.3214330673217773, + "learning_rate": 5e-06, + "logits/chosen": 3666558.6666666665, + "logits/rejected": -17173086.222222224, + "logps/chosen": -317.87526448567706, + "logps/rejected": -444.1211208767361, + "loss": 0.0465, + "rewards/chosen": 4.317401885986328, + "rewards/margins": 11.369893815782335, + "rewards/rejected": -7.052491929796007, + "step": 623 + }, + { + "epoch": 0.1710291900781143, + "grad_norm": 9.5625, + "kl": 4.629144668579102, + "learning_rate": 5e-06, + "logits/chosen": 2427826.285714286, + "logits/rejected": 14464854.4, + "logps/chosen": -393.33095005580356, + "logps/rejected": -442.07490234375, + "loss": 0.0846, + "rewards/chosen": 4.473875318254743, + "rewards/margins": 9.392814527239118, + "rewards/rejected": -4.918939208984375, + "step": 624 + }, + { + "epoch": 0.1713032753186241, + "grad_norm": 12.0, + "kl": 0.21543249487876892, + "learning_rate": 5e-06, + "logits/chosen": -8418392.727272727, + "logits/rejected": 13623239.384615384, + "logps/chosen": -489.15207741477275, + "logps/rejected": -499.1727764423077, + "loss": 0.0736, + "rewards/chosen": 6.399031205610796, + "rewards/margins": 12.858609499631228, + "rewards/rejected": -6.4595782940204325, + "step": 625 + }, + { + "epoch": 0.1715773605591339, + "grad_norm": 8.125, + "kl": 0.034165702760219574, + "learning_rate": 5e-06, + "logits/chosen": 20084078.545454547, + "logits/rejected": -15428061.538461538, + "logps/chosen": -373.99746981534093, + "logps/rejected": -547.2461313100962, + "loss": 0.0924, + "rewards/chosen": 5.0749827298251065, + "rewards/margins": 12.989877420705515, + "rewards/rejected": -7.914894690880408, + "step": 626 + }, + { + "epoch": 0.17185144579964368, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11055938.181818182, + "logits/rejected": -12696649.846153846, + "logps/chosen": -425.85768821022725, + "logps/rejected": -339.84945913461536, + "loss": 0.043, + "rewards/chosen": 4.764149752530185, + "rewards/margins": 11.076504207157589, + "rewards/rejected": -6.312354454627404, + "step": 627 + }, + { + "epoch": 0.1721255310401535, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8353955.0, + "logits/rejected": -12929469.0, + "logps/chosen": -470.1595764160156, + "logps/rejected": -592.481201171875, + "loss": 0.0618, + "rewards/chosen": 5.096713542938232, + "rewards/margins": 13.576488971710205, + "rewards/rejected": -8.479775428771973, + "step": 628 + }, + { + "epoch": 0.17239961628066328, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 968077.4, + "logits/rejected": -16293500.631578946, + "logps/chosen": -472.464990234375, + "logps/rejected": -564.8239103618421, + "loss": 0.0257, + "rewards/chosen": 8.043956756591797, + "rewards/margins": 16.87747533697831, + "rewards/rejected": -8.833518580386514, + "step": 629 + }, + { + "epoch": 0.1726737015211731, + "grad_norm": 4.5625, + "kl": 4.010562419891357, + "learning_rate": 5e-06, + "logits/chosen": -3148150.6666666665, + "logits/rejected": -15031994.666666666, + "logps/chosen": -401.37158203125, + "logps/rejected": -481.8024088541667, + "loss": 0.0188, + "rewards/chosen": 5.410300572713216, + "rewards/margins": 11.441255569458008, + "rewards/rejected": -6.030954996744792, + "step": 630 + }, + { + "epoch": 0.17294778676168288, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19013426.90909091, + "logits/rejected": -16223379.692307692, + "logps/chosen": -396.63174715909093, + "logps/rejected": -641.5542367788462, + "loss": 0.0696, + "rewards/chosen": 4.992831143465909, + "rewards/margins": 14.756763004756476, + "rewards/rejected": -9.763931861290565, + "step": 631 + }, + { + "epoch": 0.1732218720021927, + "grad_norm": 4.1875, + "kl": 3.0601553916931152, + "learning_rate": 5e-06, + "logits/chosen": 13681845.818181818, + "logits/rejected": -31343463.384615384, + "logps/chosen": -523.9605823863636, + "logps/rejected": -419.8161808894231, + "loss": 0.02, + "rewards/chosen": 5.150731520219282, + "rewards/margins": 12.677018892514955, + "rewards/rejected": -7.526287372295673, + "step": 632 + }, + { + "epoch": 0.17349595724270248, + "grad_norm": 9.75, + "kl": 0.2849667966365814, + "learning_rate": 5e-06, + "logits/chosen": -12175045.818181818, + "logits/rejected": -2442920.153846154, + "logps/chosen": -407.50736860795456, + "logps/rejected": -473.5400390625, + "loss": 0.052, + "rewards/chosen": 5.11764873157848, + "rewards/margins": 12.648482636138276, + "rewards/rejected": -7.530833904559795, + "step": 633 + }, + { + "epoch": 0.1737700424832123, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9995310.0, + "logits/rejected": -8318003.5, + "logps/chosen": -379.6719665527344, + "logps/rejected": -532.6536865234375, + "loss": 0.06, + "rewards/chosen": 4.979827404022217, + "rewards/margins": 13.115407466888428, + "rewards/rejected": -8.135580062866211, + "step": 634 + }, + { + "epoch": 0.17404412772372208, + "grad_norm": 13.625, + "kl": 0.9264259338378906, + "learning_rate": 5e-06, + "logits/chosen": 9588302.0, + "logits/rejected": 50501514.666666664, + "logps/chosen": -468.8567301432292, + "logps/rejected": -655.6817626953125, + "loss": 0.0898, + "rewards/chosen": 5.253516515096028, + "rewards/margins": 15.904151916503906, + "rewards/rejected": -10.650635401407877, + "step": 635 + }, + { + "epoch": 0.1743182129642319, + "grad_norm": 11.75, + "kl": 10.274627685546875, + "learning_rate": 5e-06, + "logits/chosen": -10760670.11764706, + "logits/rejected": -24146541.714285713, + "logps/chosen": -454.4193474264706, + "logps/rejected": -283.66990443638394, + "loss": 0.0883, + "rewards/chosen": 5.011727276970358, + "rewards/margins": 10.053315651516954, + "rewards/rejected": -5.041588374546596, + "step": 636 + }, + { + "epoch": 0.17459229820474167, + "grad_norm": 8.5, + "kl": 0.7969335317611694, + "learning_rate": 5e-06, + "logits/chosen": -1974076.6666666667, + "logits/rejected": -37983381.333333336, + "logps/chosen": -371.3429361979167, + "logps/rejected": -539.6302083333334, + "loss": 0.0628, + "rewards/chosen": 4.60376951429579, + "rewards/margins": 12.085649702284071, + "rewards/rejected": -7.481880187988281, + "step": 637 + }, + { + "epoch": 0.17486638344525146, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4553660.0, + "logits/rejected": -12080604.444444444, + "logps/chosen": -492.7112630208333, + "logps/rejected": -470.0026584201389, + "loss": 0.0563, + "rewards/chosen": 6.685591379801433, + "rewards/margins": 13.051228841145834, + "rewards/rejected": -6.365637461344401, + "step": 638 + }, + { + "epoch": 0.17514046868576127, + "grad_norm": 17.125, + "kl": 1.79473876953125, + "learning_rate": 5e-06, + "logits/chosen": -24468142.545454547, + "logits/rejected": -610688.6153846154, + "logps/chosen": -523.5799893465909, + "logps/rejected": -557.6064077524038, + "loss": 0.0757, + "rewards/chosen": 5.368803544477983, + "rewards/margins": 12.44222531618772, + "rewards/rejected": -7.073421771709736, + "step": 639 + }, + { + "epoch": 0.17541455392627106, + "grad_norm": 9.125, + "kl": 4.310084342956543, + "learning_rate": 5e-06, + "logits/chosen": -15822255.05882353, + "logits/rejected": -22508685.714285713, + "logps/chosen": -472.41676240808823, + "logps/rejected": -481.84444754464283, + "loss": 0.0601, + "rewards/chosen": 4.725383534150965, + "rewards/margins": 11.680610432344324, + "rewards/rejected": -6.955226898193359, + "step": 640 + }, + { + "epoch": 0.17568863916678087, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25526708.0, + "logits/rejected": -4714817.5, + "logps/chosen": -592.0956420898438, + "logps/rejected": -493.6246032714844, + "loss": 0.0248, + "rewards/chosen": 5.870402812957764, + "rewards/margins": 14.203567028045654, + "rewards/rejected": -8.33316421508789, + "step": 641 + }, + { + "epoch": 0.17596272440729066, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12354376.0, + "logits/rejected": -19746139.42857143, + "logps/chosen": -629.892138671875, + "logps/rejected": -516.0794503348214, + "loss": 0.0267, + "rewards/chosen": 5.061809539794922, + "rewards/margins": 13.428099496023995, + "rewards/rejected": -8.366289956229073, + "step": 642 + }, + { + "epoch": 0.17623680964780047, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17058336.0, + "logits/rejected": -4285621.333333333, + "logps/chosen": -470.1681315104167, + "logps/rejected": -617.9827473958334, + "loss": 0.0382, + "rewards/chosen": 4.7091671625773115, + "rewards/margins": 12.296644528706869, + "rewards/rejected": -7.587477366129558, + "step": 643 + }, + { + "epoch": 0.17651089488831026, + "grad_norm": 11.4375, + "kl": 4.509880065917969, + "learning_rate": 5e-06, + "logits/chosen": -17476712.0, + "logits/rejected": -7050060.0, + "logps/chosen": -374.22332763671875, + "logps/rejected": -497.633544921875, + "loss": 0.1316, + "rewards/chosen": 3.506448268890381, + "rewards/margins": 11.660022258758545, + "rewards/rejected": -8.153573989868164, + "step": 644 + }, + { + "epoch": 0.17678498012882007, + "grad_norm": 3.046875, + "kl": 1.3474719524383545, + "learning_rate": 5e-06, + "logits/chosen": -4813805.0, + "logits/rejected": -6792294.666666667, + "logps/chosen": -452.0450846354167, + "logps/rejected": -574.9203287760416, + "loss": 0.0087, + "rewards/chosen": 8.35629145304362, + "rewards/margins": 15.954441706339518, + "rewards/rejected": -7.598150253295898, + "step": 645 + }, + { + "epoch": 0.17705906536932986, + "grad_norm": 10.6875, + "kl": 1.1362636089324951, + "learning_rate": 5e-06, + "logits/chosen": -16212571.636363637, + "logits/rejected": 3122461.5384615385, + "logps/chosen": -414.36430220170456, + "logps/rejected": -474.57019981971155, + "loss": 0.0521, + "rewards/chosen": 4.538210435347124, + "rewards/margins": 10.655191354818278, + "rewards/rejected": -6.116980919471154, + "step": 646 + }, + { + "epoch": 0.17733315060983967, + "grad_norm": 8.375, + "kl": 4.020722389221191, + "learning_rate": 5e-06, + "logits/chosen": -26785319.384615384, + "logits/rejected": -4535187.636363637, + "logps/chosen": -356.5911207932692, + "logps/rejected": -470.1048473011364, + "loss": 0.0511, + "rewards/chosen": 5.003856952373798, + "rewards/margins": 11.34448861075448, + "rewards/rejected": -6.340631658380682, + "step": 647 + }, + { + "epoch": 0.17760723585034946, + "grad_norm": 14.4375, + "kl": 7.32028341293335, + "learning_rate": 5e-06, + "logits/chosen": -5145808.0, + "logits/rejected": 6526503.111111111, + "logps/chosen": -411.61546223958334, + "logps/rejected": -469.951171875, + "loss": 0.1057, + "rewards/chosen": 5.406846618652343, + "rewards/margins": 12.76443125406901, + "rewards/rejected": -7.357584635416667, + "step": 648 + }, + { + "epoch": 0.17788132109085925, + "grad_norm": 16.625, + "kl": 3.731398344039917, + "learning_rate": 5e-06, + "logits/chosen": -14962496.0, + "logits/rejected": -36835926.4, + "logps/chosen": -463.05262974330356, + "logps/rejected": -545.154296875, + "loss": 0.0686, + "rewards/chosen": 4.5617634909493585, + "rewards/margins": 13.439546694074359, + "rewards/rejected": -8.877783203125, + "step": 649 + }, + { + "epoch": 0.17815540633136906, + "grad_norm": 10.875, + "kl": 5.982331275939941, + "learning_rate": 5e-06, + "logits/chosen": 16831413.818181816, + "logits/rejected": -4087279.3846153845, + "logps/chosen": -515.5729314630681, + "logps/rejected": -472.5354191706731, + "loss": 0.0662, + "rewards/chosen": 6.266550931063565, + "rewards/margins": 14.312198185420538, + "rewards/rejected": -8.045647254356972, + "step": 650 + }, + { + "epoch": 0.17842949157187885, + "grad_norm": 10.8125, + "kl": 2.4124507904052734, + "learning_rate": 5e-06, + "logits/chosen": -11410084.923076924, + "logits/rejected": -14445280.0, + "logps/chosen": -454.6585036057692, + "logps/rejected": -368.64945845170456, + "loss": 0.0602, + "rewards/chosen": 4.526606926551232, + "rewards/margins": 9.853844836041645, + "rewards/rejected": -5.327237909490412, + "step": 651 + }, + { + "epoch": 0.17870357681238866, + "grad_norm": 12.125, + "kl": 4.547926902770996, + "learning_rate": 5e-06, + "logits/chosen": -15997245.090909092, + "logits/rejected": 2310278.153846154, + "logps/chosen": -363.4921875, + "logps/rejected": -651.4320913461538, + "loss": 0.0759, + "rewards/chosen": 4.602999600497159, + "rewards/margins": 14.188006314364348, + "rewards/rejected": -9.585006713867188, + "step": 652 + }, + { + "epoch": 0.17897766205289845, + "grad_norm": 6.125, + "kl": 1.457867980003357, + "learning_rate": 5e-06, + "logits/chosen": -16924388.923076924, + "logits/rejected": 4215090.181818182, + "logps/chosen": -386.61083984375, + "logps/rejected": -601.1967329545455, + "loss": 0.0326, + "rewards/chosen": 5.171678396371695, + "rewards/margins": 13.74350706513945, + "rewards/rejected": -8.571828668767756, + "step": 653 + }, + { + "epoch": 0.17925174729340826, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23886685.333333332, + "logits/rejected": 15876461.333333334, + "logps/chosen": -452.3121337890625, + "logps/rejected": -528.6349690755209, + "loss": 0.0331, + "rewards/chosen": 5.482163747151692, + "rewards/margins": 12.260026931762695, + "rewards/rejected": -6.777863184611003, + "step": 654 + }, + { + "epoch": 0.17952583253391804, + "grad_norm": 16.625, + "kl": 3.020371913909912, + "learning_rate": 5e-06, + "logits/chosen": 6216066.5, + "logits/rejected": 36494708.0, + "logps/chosen": -473.69439697265625, + "logps/rejected": -546.4367065429688, + "loss": 0.1322, + "rewards/chosen": 4.707186222076416, + "rewards/margins": 11.705403804779053, + "rewards/rejected": -6.998217582702637, + "step": 655 + }, + { + "epoch": 0.17979991777442786, + "grad_norm": 15.25, + "kl": 3.056267499923706, + "learning_rate": 5e-06, + "logits/chosen": -3894609.1428571427, + "logits/rejected": -21451481.6, + "logps/chosen": -356.53885323660717, + "logps/rejected": -570.129443359375, + "loss": 0.1057, + "rewards/chosen": 4.014279229300363, + "rewards/margins": 11.567349297659739, + "rewards/rejected": -7.553070068359375, + "step": 656 + }, + { + "epoch": 0.18007400301493764, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30478187.42857143, + "logits/rejected": 1221798.1176470588, + "logps/chosen": -437.268310546875, + "logps/rejected": -584.4802389705883, + "loss": 0.0423, + "rewards/chosen": 5.009434836251395, + "rewards/margins": 10.995393160010586, + "rewards/rejected": -5.985958323759191, + "step": 657 + }, + { + "epoch": 0.18034808825544743, + "grad_norm": 4.4375, + "kl": 2.5984294414520264, + "learning_rate": 5e-06, + "logits/chosen": -25621408.0, + "logits/rejected": -14317317.818181818, + "logps/chosen": -504.54736328125, + "logps/rejected": -442.17520419034093, + "loss": 0.0141, + "rewards/chosen": 5.8022930438701925, + "rewards/margins": 12.47317067393056, + "rewards/rejected": -6.670877630060369, + "step": 658 + }, + { + "epoch": 0.18062217349595724, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4832799.111111111, + "logits/rejected": 13279668.266666668, + "logps/chosen": -340.9364963107639, + "logps/rejected": -534.7956705729167, + "loss": 0.0628, + "rewards/chosen": 5.769847869873047, + "rewards/margins": 11.672466786702474, + "rewards/rejected": -5.902618916829427, + "step": 659 + }, + { + "epoch": 0.18089625873646703, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7954560.0, + "logits/rejected": 33777670.4, + "logps/chosen": -442.03763253348217, + "logps/rejected": -826.02236328125, + "loss": 0.0262, + "rewards/chosen": 4.904408046177456, + "rewards/margins": 16.82922417776925, + "rewards/rejected": -11.924816131591797, + "step": 660 + }, + { + "epoch": 0.18117034397697684, + "grad_norm": 11.3125, + "kl": 2.773186445236206, + "learning_rate": 5e-06, + "logits/chosen": 19473920.0, + "logits/rejected": -6037664.0, + "logps/chosen": -381.4449462890625, + "logps/rejected": -668.2956194196429, + "loss": 0.0518, + "rewards/chosen": 5.156189727783203, + "rewards/margins": 12.537233843122209, + "rewards/rejected": -7.381044115339007, + "step": 661 + }, + { + "epoch": 0.18144442921748663, + "grad_norm": 18.75, + "kl": 0.21270053088665009, + "learning_rate": 5e-06, + "logits/chosen": -18838948.57142857, + "logits/rejected": 9693463.2, + "logps/chosen": -350.2017299107143, + "logps/rejected": -526.1, + "loss": 0.1215, + "rewards/chosen": 3.9127535138811385, + "rewards/margins": 8.854418781825474, + "rewards/rejected": -4.941665267944336, + "step": 662 + }, + { + "epoch": 0.18171851445799644, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35685810.666666664, + "logits/rejected": 6807774.222222222, + "logps/chosen": -490.149658203125, + "logps/rejected": -479.57275390625, + "loss": 0.0203, + "rewards/chosen": 6.119256973266602, + "rewards/margins": 12.930627822875977, + "rewards/rejected": -6.811370849609375, + "step": 663 + }, + { + "epoch": 0.18199259969850623, + "grad_norm": 11.625, + "kl": 9.57091236114502, + "learning_rate": 5e-06, + "logits/chosen": -12188646.0, + "logits/rejected": -10997197.0, + "logps/chosen": -408.8459167480469, + "logps/rejected": -491.891357421875, + "loss": 0.0635, + "rewards/chosen": 5.3844380378723145, + "rewards/margins": 13.442701816558838, + "rewards/rejected": -8.058263778686523, + "step": 664 + }, + { + "epoch": 0.18226668493901604, + "grad_norm": 9.25, + "kl": 0.8608468770980835, + "learning_rate": 5e-06, + "logits/chosen": 118912277.33333333, + "logits/rejected": -13355916.0, + "logps/chosen": -469.6709391276042, + "logps/rejected": -407.3434244791667, + "loss": 0.066, + "rewards/chosen": 5.318471272786458, + "rewards/margins": 10.814470926920572, + "rewards/rejected": -5.495999654134114, + "step": 665 + }, + { + "epoch": 0.18254077017952583, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 38526944.0, + "logits/rejected": -32757060.57142857, + "logps/chosen": -412.59296875, + "logps/rejected": -650.3498883928571, + "loss": 0.0737, + "rewards/chosen": 3.2014957427978517, + "rewards/margins": 12.405536270141601, + "rewards/rejected": -9.20404052734375, + "step": 666 + }, + { + "epoch": 0.18281485542003564, + "grad_norm": 7.09375, + "kl": 8.214457511901855, + "learning_rate": 5e-06, + "logits/chosen": -3771388.923076923, + "logits/rejected": 3673949.8181818184, + "logps/chosen": -415.4045597956731, + "logps/rejected": -531.1661044034091, + "loss": 0.0858, + "rewards/chosen": 5.785321162297175, + "rewards/margins": 14.031579291070258, + "rewards/rejected": -8.246258128773082, + "step": 667 + }, + { + "epoch": 0.18308894066054543, + "grad_norm": 15.25, + "kl": 11.542875289916992, + "learning_rate": 5e-06, + "logits/chosen": 5794169.6, + "logits/rejected": -9594542.222222222, + "logps/chosen": -464.01360677083335, + "logps/rejected": -657.1080186631945, + "loss": 0.055, + "rewards/chosen": 5.3656260172526045, + "rewards/margins": 13.288129170735678, + "rewards/rejected": -7.922503153483073, + "step": 668 + }, + { + "epoch": 0.18336302590105522, + "grad_norm": 15.125, + "kl": 13.120969772338867, + "learning_rate": 5e-06, + "logits/chosen": -13788983.272727273, + "logits/rejected": 28052258.46153846, + "logps/chosen": -534.2568359375, + "logps/rejected": -481.0540114182692, + "loss": 0.0665, + "rewards/chosen": 6.149587457830256, + "rewards/margins": 12.476706551505135, + "rewards/rejected": -6.32711909367488, + "step": 669 + }, + { + "epoch": 0.18363711114156503, + "grad_norm": 9.0, + "kl": 0.6274642944335938, + "learning_rate": 5e-06, + "logits/chosen": -9907148.0, + "logits/rejected": -1662651.3333333333, + "logps/chosen": -516.4973551432291, + "logps/rejected": -617.02197265625, + "loss": 0.0368, + "rewards/chosen": 6.550614674886067, + "rewards/margins": 14.693857192993164, + "rewards/rejected": -8.143242518107096, + "step": 670 + }, + { + "epoch": 0.18391119638207482, + "grad_norm": 8.0625, + "kl": 2.0946121215820312, + "learning_rate": 5e-06, + "logits/chosen": 6462261.818181818, + "logits/rejected": 10161086.153846154, + "logps/chosen": -370.83327414772725, + "logps/rejected": -646.6476862980769, + "loss": 0.039, + "rewards/chosen": 4.5743935324928975, + "rewards/margins": 12.211834140590854, + "rewards/rejected": -7.637440608097957, + "step": 671 + }, + { + "epoch": 0.18418528162258463, + "grad_norm": 11.75, + "kl": 9.384299278259277, + "learning_rate": 5e-06, + "logits/chosen": -15353504.0, + "logits/rejected": -12988388.0, + "logps/chosen": -507.3971354166667, + "logps/rejected": -484.0600992838542, + "loss": 0.0498, + "rewards/chosen": 6.222944895426433, + "rewards/margins": 12.37914784749349, + "rewards/rejected": -6.156202952067058, + "step": 672 + }, + { + "epoch": 0.18445936686309441, + "grad_norm": 16.0, + "kl": 1.6066831350326538, + "learning_rate": 5e-06, + "logits/chosen": -10940402.666666666, + "logits/rejected": -2078624.3333333333, + "logps/chosen": -344.9315185546875, + "logps/rejected": -482.9273274739583, + "loss": 0.0606, + "rewards/chosen": 4.9975935618082685, + "rewards/margins": 11.776281356811523, + "rewards/rejected": -6.778687795003255, + "step": 673 + }, + { + "epoch": 0.18473345210360423, + "grad_norm": 13.3125, + "kl": 6.260837554931641, + "learning_rate": 5e-06, + "logits/chosen": 5439182.769230769, + "logits/rejected": 2330095.272727273, + "logps/chosen": -428.56201171875, + "logps/rejected": -487.4564098011364, + "loss": 0.0964, + "rewards/chosen": 4.348545954777644, + "rewards/margins": 11.938914052256337, + "rewards/rejected": -7.590368097478693, + "step": 674 + }, + { + "epoch": 0.18500753734411401, + "grad_norm": 12.125, + "kl": 3.449218273162842, + "learning_rate": 5e-06, + "logits/chosen": -15805232.0, + "logits/rejected": 3055830.6666666665, + "logps/chosen": -451.4548746744792, + "logps/rejected": -495.1999918619792, + "loss": 0.0572, + "rewards/chosen": 4.746454238891602, + "rewards/margins": 9.908045768737793, + "rewards/rejected": -5.161591529846191, + "step": 675 + }, + { + "epoch": 0.18528162258462383, + "grad_norm": 8.4375, + "kl": 8.478172302246094, + "learning_rate": 5e-06, + "logits/chosen": -33232389.333333332, + "logits/rejected": 2388525.3333333335, + "logps/chosen": -455.060302734375, + "logps/rejected": -559.5874837239584, + "loss": 0.0375, + "rewards/chosen": 5.600802103678386, + "rewards/margins": 13.319561004638672, + "rewards/rejected": -7.718758900960286, + "step": 676 + }, + { + "epoch": 0.18555570782513361, + "grad_norm": 7.09375, + "kl": 11.30219841003418, + "learning_rate": 5e-06, + "logits/chosen": 4542419.692307692, + "logits/rejected": 6914904.7272727275, + "logps/chosen": -421.0422175480769, + "logps/rejected": -446.58203125, + "loss": 0.0465, + "rewards/chosen": 7.001137366661658, + "rewards/margins": 12.586546357695038, + "rewards/rejected": -5.585408991033381, + "step": 677 + }, + { + "epoch": 0.18582979306564343, + "grad_norm": 27.625, + "kl": 3.571605682373047, + "learning_rate": 5e-06, + "logits/chosen": -34220149.333333336, + "logits/rejected": 12536318.666666666, + "logps/chosen": -459.1047770182292, + "logps/rejected": -315.3266194661458, + "loss": 0.0813, + "rewards/chosen": 5.416045506795247, + "rewards/margins": 10.990102132161457, + "rewards/rejected": -5.574056625366211, + "step": 678 + }, + { + "epoch": 0.18610387830615321, + "grad_norm": 13.25, + "kl": 6.238208770751953, + "learning_rate": 5e-06, + "logits/chosen": 5504319.428571428, + "logits/rejected": -2798040.6, + "logps/chosen": -492.7675083705357, + "logps/rejected": -283.1787841796875, + "loss": 0.0526, + "rewards/chosen": 6.383028302873884, + "rewards/margins": 10.856230436052595, + "rewards/rejected": -4.473202133178711, + "step": 679 + }, + { + "epoch": 0.186377963546663, + "grad_norm": 12.4375, + "kl": 24.762876510620117, + "learning_rate": 5e-06, + "logits/chosen": -8330652.235294118, + "logits/rejected": 36335888.0, + "logps/chosen": -365.7442267922794, + "logps/rejected": -597.2920619419643, + "loss": 0.1758, + "rewards/chosen": 5.247539295869715, + "rewards/margins": 13.82821905312418, + "rewards/rejected": -8.580679757254464, + "step": 680 + }, + { + "epoch": 0.18665204878717281, + "grad_norm": 12.9375, + "kl": 0.7350603938102722, + "learning_rate": 5e-06, + "logits/chosen": -14394880.0, + "logits/rejected": -7032176.0, + "logps/chosen": -381.30984933035717, + "logps/rejected": -557.5697265625, + "loss": 0.0766, + "rewards/chosen": 4.136623382568359, + "rewards/margins": 10.568343353271484, + "rewards/rejected": -6.431719970703125, + "step": 681 + }, + { + "epoch": 0.1869261340276826, + "grad_norm": 14.375, + "kl": 9.99704647064209, + "learning_rate": 5e-06, + "logits/chosen": -21880710.85714286, + "logits/rejected": -20929283.2, + "logps/chosen": -329.3447265625, + "logps/rejected": -394.1942138671875, + "loss": 0.1477, + "rewards/chosen": 4.049205780029297, + "rewards/margins": 9.306630706787109, + "rewards/rejected": -5.257424926757812, + "step": 682 + }, + { + "epoch": 0.1872002192681924, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9460394.666666666, + "logits/rejected": 20073996.8, + "logps/chosen": -472.9391818576389, + "logps/rejected": -387.2791015625, + "loss": 0.0519, + "rewards/chosen": 4.979915195041233, + "rewards/margins": 10.718702019585503, + "rewards/rejected": -5.738786824544271, + "step": 683 + }, + { + "epoch": 0.1874743045087022, + "grad_norm": 16.375, + "kl": 5.977967262268066, + "learning_rate": 5e-06, + "logits/chosen": 25933114.181818184, + "logits/rejected": 1280499.3846153845, + "logps/chosen": -466.85249467329544, + "logps/rejected": -536.5161884014423, + "loss": 0.1256, + "rewards/chosen": 5.021444840864702, + "rewards/margins": 12.220500439197034, + "rewards/rejected": -7.199055598332332, + "step": 684 + }, + { + "epoch": 0.187748389749212, + "grad_norm": 15.3125, + "kl": 14.002288818359375, + "learning_rate": 5e-06, + "logits/chosen": -5936139.764705882, + "logits/rejected": -9890893.714285715, + "logps/chosen": -422.42790670955884, + "logps/rejected": -493.88633510044644, + "loss": 0.0885, + "rewards/chosen": 5.431978113511029, + "rewards/margins": 12.530173870695739, + "rewards/rejected": -7.09819575718471, + "step": 685 + }, + { + "epoch": 0.1880224749897218, + "grad_norm": 14.0625, + "kl": 8.123441696166992, + "learning_rate": 5e-06, + "logits/chosen": 9782142.0, + "logits/rejected": -11026819.555555556, + "logps/chosen": -540.3661702473959, + "logps/rejected": -452.0787760416667, + "loss": 0.0493, + "rewards/chosen": 7.512808481852214, + "rewards/margins": 13.000105539957683, + "rewards/rejected": -5.487297058105469, + "step": 686 + }, + { + "epoch": 0.1882965602302316, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21941006.222222224, + "logits/rejected": -18894329.6, + "logps/chosen": -452.3079427083333, + "logps/rejected": -579.7229817708334, + "loss": 0.0386, + "rewards/chosen": 5.02400885687934, + "rewards/margins": 12.823171657986112, + "rewards/rejected": -7.799162801106771, + "step": 687 + }, + { + "epoch": 0.1885706454707414, + "grad_norm": 11.0, + "kl": 12.096885681152344, + "learning_rate": 5e-06, + "logits/chosen": -6566683.0, + "logits/rejected": -5916447.5, + "logps/chosen": -410.5346984863281, + "logps/rejected": -535.7684326171875, + "loss": 0.0522, + "rewards/chosen": 6.431801795959473, + "rewards/margins": 13.071205139160156, + "rewards/rejected": -6.639403343200684, + "step": 688 + }, + { + "epoch": 0.18884473071125119, + "grad_norm": 4.3125, + "kl": 3.1340396404266357, + "learning_rate": 5e-06, + "logits/chosen": -18880801.6, + "logits/rejected": -7770425.142857143, + "logps/chosen": -464.6578125, + "logps/rejected": -484.50390625, + "loss": 0.0377, + "rewards/chosen": 6.190269851684571, + "rewards/margins": 12.427403422764371, + "rewards/rejected": -6.2371335710797995, + "step": 689 + }, + { + "epoch": 0.189118815951761, + "grad_norm": 12.1875, + "kl": 11.363642692565918, + "learning_rate": 5e-06, + "logits/chosen": -25526884.0, + "logits/rejected": 30449792.0, + "logps/chosen": -431.1676940917969, + "logps/rejected": -415.81304931640625, + "loss": 0.1039, + "rewards/chosen": 5.80319881439209, + "rewards/margins": 10.151955604553223, + "rewards/rejected": -4.348756790161133, + "step": 690 + }, + { + "epoch": 0.18939290119227079, + "grad_norm": 12.8125, + "kl": 8.589733123779297, + "learning_rate": 5e-06, + "logits/chosen": -14847632.0, + "logits/rejected": 4024552.0, + "logps/chosen": -414.2594517299107, + "logps/rejected": -321.86640625, + "loss": 0.0927, + "rewards/chosen": 5.063065120152065, + "rewards/margins": 9.441832515171598, + "rewards/rejected": -4.378767395019532, + "step": 691 + }, + { + "epoch": 0.1896669864327806, + "grad_norm": 3.71875, + "kl": 9.113994598388672, + "learning_rate": 5e-06, + "logits/chosen": -6806056.615384615, + "logits/rejected": 14492004.363636363, + "logps/chosen": -431.60884915865387, + "logps/rejected": -539.5665838068181, + "loss": 0.0317, + "rewards/chosen": 6.498329162597656, + "rewards/margins": 13.615422335537996, + "rewards/rejected": -7.117093172940341, + "step": 692 + }, + { + "epoch": 0.18994107167329038, + "grad_norm": 12.25, + "kl": 3.9564952850341797, + "learning_rate": 5e-06, + "logits/chosen": -21797067.636363637, + "logits/rejected": 16676704.0, + "logps/chosen": -399.5560191761364, + "logps/rejected": -431.26938100961536, + "loss": 0.0697, + "rewards/chosen": 5.3419199856844815, + "rewards/margins": 9.569448484407438, + "rewards/rejected": -4.227528498722957, + "step": 693 + }, + { + "epoch": 0.1902151569138002, + "grad_norm": 10.6875, + "kl": 5.5463786125183105, + "learning_rate": 5e-06, + "logits/chosen": -27680708.923076924, + "logits/rejected": -142730.54545454544, + "logps/chosen": -468.69775390625, + "logps/rejected": -463.18093039772725, + "loss": 0.028, + "rewards/chosen": 6.627674396221455, + "rewards/margins": 12.447045012787505, + "rewards/rejected": -5.819370616566051, + "step": 694 + }, + { + "epoch": 0.19048924215430998, + "grad_norm": 9.6875, + "kl": 7.48018741607666, + "learning_rate": 5e-06, + "logits/chosen": -20320516.57142857, + "logits/rejected": 8842942.4, + "logps/chosen": -431.44395228794644, + "logps/rejected": -451.68212890625, + "loss": 0.062, + "rewards/chosen": 5.639814104352679, + "rewards/margins": 11.818534197126116, + "rewards/rejected": -6.178720092773437, + "step": 695 + }, + { + "epoch": 0.1907633273948198, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4025269.1428571427, + "logits/rejected": 8232991.05882353, + "logps/chosen": -451.0279017857143, + "logps/rejected": -607.4477251838235, + "loss": 0.038, + "rewards/chosen": 5.103515080043247, + "rewards/margins": 13.78954574841411, + "rewards/rejected": -8.686030668370863, + "step": 696 + }, + { + "epoch": 0.19103741263532958, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36791669.333333336, + "logits/rejected": -10359975.333333334, + "logps/chosen": -379.1914469401042, + "logps/rejected": -469.575927734375, + "loss": 0.1036, + "rewards/chosen": 4.2799727121988935, + "rewards/margins": 11.186428705851238, + "rewards/rejected": -6.906455993652344, + "step": 697 + }, + { + "epoch": 0.1913114978758394, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21168508.0, + "logits/rejected": -7287431.0, + "logps/chosen": -383.39117431640625, + "logps/rejected": -932.850341796875, + "loss": 0.0292, + "rewards/chosen": 6.15700101852417, + "rewards/margins": 21.55512762069702, + "rewards/rejected": -15.398126602172852, + "step": 698 + }, + { + "epoch": 0.19158558311634918, + "grad_norm": 11.5, + "kl": 0.5297800898551941, + "learning_rate": 5e-06, + "logits/chosen": -25880888.0, + "logits/rejected": -1804576.3333333333, + "logps/chosen": -485.6241048177083, + "logps/rejected": -706.72705078125, + "loss": 0.0561, + "rewards/chosen": 6.000101725260417, + "rewards/margins": 13.67116673787435, + "rewards/rejected": -7.671065012613933, + "step": 699 + }, + { + "epoch": 0.19185966835685897, + "grad_norm": 7.84375, + "kl": 6.911484718322754, + "learning_rate": 5e-06, + "logits/chosen": -14384587.0, + "logits/rejected": -7100392.0, + "logps/chosen": -476.5574035644531, + "logps/rejected": -406.8119812011719, + "loss": 0.0709, + "rewards/chosen": 5.299774169921875, + "rewards/margins": 10.900629043579102, + "rewards/rejected": -5.600854873657227, + "step": 700 + }, + { + "epoch": 0.19213375359736878, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4824914.666666667, + "logits/rejected": 2091335.4666666666, + "logps/chosen": -421.04497612847223, + "logps/rejected": -477.21318359375, + "loss": 0.0329, + "rewards/chosen": 5.529703776041667, + "rewards/margins": 13.58799031575521, + "rewards/rejected": -8.058286539713542, + "step": 701 + }, + { + "epoch": 0.19240783883787857, + "grad_norm": 12.375, + "kl": 2.9964823722839355, + "learning_rate": 5e-06, + "logits/chosen": 1509169.4285714286, + "logits/rejected": -17006102.4, + "logps/chosen": -434.60658482142856, + "logps/rejected": -422.16484375, + "loss": 0.0492, + "rewards/chosen": 5.763410295758929, + "rewards/margins": 12.628524889264789, + "rewards/rejected": -6.86511459350586, + "step": 702 + }, + { + "epoch": 0.19268192407838838, + "grad_norm": 16.5, + "kl": 16.3209228515625, + "learning_rate": 5e-06, + "logits/chosen": 4892112.0, + "logits/rejected": 13432342.0, + "logps/chosen": -396.4866027832031, + "logps/rejected": -393.23394775390625, + "loss": 0.1409, + "rewards/chosen": 4.887618064880371, + "rewards/margins": 11.367178916931152, + "rewards/rejected": -6.479560852050781, + "step": 703 + }, + { + "epoch": 0.19295600931889817, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7859377.230769231, + "logits/rejected": 869598.1818181818, + "logps/chosen": -445.0465745192308, + "logps/rejected": -554.4255149147727, + "loss": 0.0282, + "rewards/chosen": 5.356487567608173, + "rewards/margins": 12.82353343830242, + "rewards/rejected": -7.467045870694247, + "step": 704 + }, + { + "epoch": 0.19323009455940798, + "grad_norm": 6.84375, + "kl": 16.127561569213867, + "learning_rate": 5e-06, + "logits/chosen": -14949243.076923076, + "logits/rejected": -19264093.09090909, + "logps/chosen": -492.65914212740387, + "logps/rejected": -525.2933238636364, + "loss": 0.0279, + "rewards/chosen": 7.3312835693359375, + "rewards/margins": 15.643851540305398, + "rewards/rejected": -8.31256797096946, + "step": 705 + }, + { + "epoch": 0.19350417979991777, + "grad_norm": 10.625, + "kl": 4.646829128265381, + "learning_rate": 5e-06, + "logits/chosen": 21552827.076923076, + "logits/rejected": -3521306.5454545454, + "logps/chosen": -501.8141526442308, + "logps/rejected": -419.15700461647725, + "loss": 0.0611, + "rewards/chosen": 5.34136962890625, + "rewards/margins": 12.064824884588068, + "rewards/rejected": -6.723455255681818, + "step": 706 + }, + { + "epoch": 0.19377826504042758, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24425287.111111112, + "logits/rejected": -458613.3333333333, + "logps/chosen": -475.10584852430554, + "logps/rejected": -442.6375, + "loss": 0.0863, + "rewards/chosen": 5.385944366455078, + "rewards/margins": 12.11588617960612, + "rewards/rejected": -6.729941813151042, + "step": 707 + }, + { + "epoch": 0.19405235028093737, + "grad_norm": 11.0625, + "kl": 1.2148100137710571, + "learning_rate": 5e-06, + "logits/chosen": -29926359.272727273, + "logits/rejected": 7947227.692307692, + "logps/chosen": -359.3543146306818, + "logps/rejected": -533.3845402644231, + "loss": 0.0702, + "rewards/chosen": 4.510412736372515, + "rewards/margins": 10.80978900402576, + "rewards/rejected": -6.299376267653245, + "step": 708 + }, + { + "epoch": 0.19432643552144718, + "grad_norm": 10.0625, + "kl": 4.838833332061768, + "learning_rate": 5e-06, + "logits/chosen": 6565408.571428572, + "logits/rejected": -8158547.2, + "logps/chosen": -303.1696079799107, + "logps/rejected": -618.14345703125, + "loss": 0.0799, + "rewards/chosen": 4.552840369088309, + "rewards/margins": 14.604834692818777, + "rewards/rejected": -10.051994323730469, + "step": 709 + }, + { + "epoch": 0.19460052076195697, + "grad_norm": 6.34375, + "kl": 1.5257911682128906, + "learning_rate": 5e-06, + "logits/chosen": -18494774.85714286, + "logits/rejected": -20748417.6, + "logps/chosen": -363.166748046875, + "logps/rejected": -447.96875, + "loss": 0.0267, + "rewards/chosen": 5.571037292480469, + "rewards/margins": 12.719467163085938, + "rewards/rejected": -7.148429870605469, + "step": 710 + }, + { + "epoch": 0.19487460600246675, + "grad_norm": 9.25, + "kl": 1.2903913259506226, + "learning_rate": 5e-06, + "logits/chosen": -1493215.6666666667, + "logits/rejected": 34964330.666666664, + "logps/chosen": -438.3917643229167, + "logps/rejected": -480.4703776041667, + "loss": 0.0517, + "rewards/chosen": 7.058909734090169, + "rewards/margins": 16.08990881178114, + "rewards/rejected": -9.030999077690971, + "step": 711 + }, + { + "epoch": 0.19514869124297657, + "grad_norm": 11.0, + "kl": 10.855573654174805, + "learning_rate": 5e-06, + "logits/chosen": -21955253.333333332, + "logits/rejected": -2516169.0, + "logps/chosen": -510.06201171875, + "logps/rejected": -445.5848795572917, + "loss": 0.0601, + "rewards/chosen": 6.576147079467773, + "rewards/margins": 13.267876942952473, + "rewards/rejected": -6.6917298634847, + "step": 712 + }, + { + "epoch": 0.19542277648348635, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11322525.538461538, + "logits/rejected": 18219720.727272727, + "logps/chosen": -287.1435546875, + "logps/rejected": -495.52854225852275, + "loss": 0.1024, + "rewards/chosen": 4.2139153113731975, + "rewards/margins": 10.29326378882348, + "rewards/rejected": -6.079348477450284, + "step": 713 + }, + { + "epoch": 0.19569686172399617, + "grad_norm": 9.625, + "kl": 10.15253734588623, + "learning_rate": 5e-06, + "logits/chosen": -10300219.692307692, + "logits/rejected": -7304046.545454546, + "logps/chosen": -482.4400165264423, + "logps/rejected": -683.3777521306819, + "loss": 0.0615, + "rewards/chosen": 6.556089547964243, + "rewards/margins": 15.314453445114456, + "rewards/rejected": -8.758363897150213, + "step": 714 + }, + { + "epoch": 0.19597094696450595, + "grad_norm": 13.9375, + "kl": 6.302914619445801, + "learning_rate": 5e-06, + "logits/chosen": -17704455.529411763, + "logits/rejected": -12773211.42857143, + "logps/chosen": -392.19381893382354, + "logps/rejected": -636.4645647321429, + "loss": 0.0461, + "rewards/chosen": 5.798630209530101, + "rewards/margins": 14.405655356014476, + "rewards/rejected": -8.607025146484375, + "step": 715 + }, + { + "epoch": 0.19624503220501577, + "grad_norm": 12.75, + "kl": 2.7957754135131836, + "learning_rate": 5e-06, + "logits/chosen": -17383355.076923076, + "logits/rejected": 1262679.4545454546, + "logps/chosen": -565.4750600961538, + "logps/rejected": -554.9922318892045, + "loss": 0.0484, + "rewards/chosen": 7.117357107309195, + "rewards/margins": 14.707727979113173, + "rewards/rejected": -7.5903708718039775, + "step": 716 + }, + { + "epoch": 0.19651911744552555, + "grad_norm": 6.75, + "kl": 7.880585193634033, + "learning_rate": 5e-06, + "logits/chosen": -11645642.666666666, + "logits/rejected": -17952689.777777776, + "logps/chosen": -552.3585286458333, + "logps/rejected": -425.1178385416667, + "loss": 0.0384, + "rewards/chosen": 6.785609944661458, + "rewards/margins": 12.551067521837023, + "rewards/rejected": -5.765457577175564, + "step": 717 + }, + { + "epoch": 0.19679320268603537, + "grad_norm": 8.5, + "kl": 5.3349409103393555, + "learning_rate": 5e-06, + "logits/chosen": -2866881.8181818184, + "logits/rejected": -25408327.384615384, + "logps/chosen": -350.8917347301136, + "logps/rejected": -575.5890925480769, + "loss": 0.0538, + "rewards/chosen": 6.735143488103693, + "rewards/margins": 12.620541765973286, + "rewards/rejected": -5.885398277869592, + "step": 718 + }, + { + "epoch": 0.19706728792654515, + "grad_norm": 5.5, + "kl": 3.4196105003356934, + "learning_rate": 5e-06, + "logits/chosen": -18115138.0, + "logits/rejected": 31079164.0, + "logps/chosen": -433.5357666015625, + "logps/rejected": -636.82177734375, + "loss": 0.0221, + "rewards/chosen": 5.885141372680664, + "rewards/margins": 14.104856491088867, + "rewards/rejected": -8.219715118408203, + "step": 719 + }, + { + "epoch": 0.19734137316705497, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20628540.0, + "logits/rejected": 16775558.666666666, + "logps/chosen": -495.9139811197917, + "logps/rejected": -602.0727132161459, + "loss": 0.0293, + "rewards/chosen": 6.132659276326497, + "rewards/margins": 13.494274139404297, + "rewards/rejected": -7.3616148630778, + "step": 720 + }, + { + "epoch": 0.19761545840756475, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -176410.625, + "logits/rejected": -27929874.0, + "logps/chosen": -437.7750244140625, + "logps/rejected": -567.614013671875, + "loss": 0.0487, + "rewards/chosen": 6.053165912628174, + "rewards/margins": 12.646437168121338, + "rewards/rejected": -6.593271255493164, + "step": 721 + }, + { + "epoch": 0.19788954364807454, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1163646.3333333333, + "logits/rejected": -13939045.333333334, + "logps/chosen": -461.3586018880208, + "logps/rejected": -436.9265950520833, + "loss": 0.0465, + "rewards/chosen": 5.901974995930989, + "rewards/margins": 11.133646647135416, + "rewards/rejected": -5.231671651204427, + "step": 722 + }, + { + "epoch": 0.19816362888858435, + "grad_norm": 10.1875, + "kl": 0.39740753173828125, + "learning_rate": 5e-06, + "logits/chosen": -4788304.0, + "logits/rejected": -9888308.57142857, + "logps/chosen": -500.192529296875, + "logps/rejected": -492.1509486607143, + "loss": 0.0498, + "rewards/chosen": 5.4329784393310545, + "rewards/margins": 12.11512096949986, + "rewards/rejected": -6.6821425301688055, + "step": 723 + }, + { + "epoch": 0.19843771412909414, + "grad_norm": 5.15625, + "kl": 6.531761646270752, + "learning_rate": 5e-06, + "logits/chosen": -9526487.384615384, + "logits/rejected": -35594504.72727273, + "logps/chosen": -461.8374774639423, + "logps/rejected": -483.3505859375, + "loss": 0.0384, + "rewards/chosen": 6.30718994140625, + "rewards/margins": 13.611536199396307, + "rewards/rejected": -7.304346257990057, + "step": 724 + }, + { + "epoch": 0.19871179936960395, + "grad_norm": 6.53125, + "kl": 0.8778683543205261, + "learning_rate": 5e-06, + "logits/chosen": -24103076.923076924, + "logits/rejected": -6501032.0, + "logps/chosen": -454.58657602163464, + "logps/rejected": -559.9384765625, + "loss": 0.0613, + "rewards/chosen": 4.76316422682542, + "rewards/margins": 10.678753992894311, + "rewards/rejected": -5.915589766068892, + "step": 725 + }, + { + "epoch": 0.19898588461011374, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41056296.0, + "logits/rejected": 9889978.0, + "logps/chosen": -453.27252197265625, + "logps/rejected": -431.15777587890625, + "loss": 0.0379, + "rewards/chosen": 6.120019912719727, + "rewards/margins": 12.189565181732178, + "rewards/rejected": -6.069545269012451, + "step": 726 + }, + { + "epoch": 0.19925996985062355, + "grad_norm": 9.9375, + "kl": 12.888936042785645, + "learning_rate": 5e-06, + "logits/chosen": -16324638.11764706, + "logits/rejected": 2564734.285714286, + "logps/chosen": -431.16819852941177, + "logps/rejected": -426.96707589285717, + "loss": 0.0804, + "rewards/chosen": 6.175759708180147, + "rewards/margins": 11.482111506101464, + "rewards/rejected": -5.306351797921317, + "step": 727 + }, + { + "epoch": 0.19953405509113334, + "grad_norm": 11.125, + "kl": 7.9905524253845215, + "learning_rate": 5e-06, + "logits/chosen": -33257634.666666668, + "logits/rejected": -9452610.0, + "logps/chosen": -475.915771484375, + "logps/rejected": -554.5096028645834, + "loss": 0.0588, + "rewards/chosen": 6.611539204915364, + "rewards/margins": 14.280527750651041, + "rewards/rejected": -7.668988545735677, + "step": 728 + }, + { + "epoch": 0.19980814033164315, + "grad_norm": 9.5625, + "kl": 0.2564353942871094, + "learning_rate": 5e-06, + "logits/chosen": -34153292.8, + "logits/rejected": -13536148.57142857, + "logps/chosen": -464.816943359375, + "logps/rejected": -519.8183244977679, + "loss": 0.0385, + "rewards/chosen": 5.735879516601562, + "rewards/margins": 10.560125514439175, + "rewards/rejected": -4.824245997837612, + "step": 729 + }, + { + "epoch": 0.20008222557215294, + "grad_norm": 14.0625, + "kl": 5.816828727722168, + "learning_rate": 5e-06, + "logits/chosen": -12909109.333333334, + "logits/rejected": 7265863.333333333, + "logps/chosen": -379.2515869140625, + "logps/rejected": -500.5321044921875, + "loss": 0.1002, + "rewards/chosen": 4.2757673263549805, + "rewards/margins": 10.059279441833496, + "rewards/rejected": -5.783512115478516, + "step": 730 + }, + { + "epoch": 0.20035631081266272, + "grad_norm": 16.875, + "kl": 14.327875137329102, + "learning_rate": 5e-06, + "logits/chosen": -17113934.222222224, + "logits/rejected": 47585536.0, + "logps/chosen": -505.7734375, + "logps/rejected": -658.74609375, + "loss": 0.1018, + "rewards/chosen": 6.186133490668403, + "rewards/margins": 11.777341630723742, + "rewards/rejected": -5.591208140055339, + "step": 731 + }, + { + "epoch": 0.20063039605317254, + "grad_norm": 7.75, + "kl": 5.974937438964844, + "learning_rate": 5e-06, + "logits/chosen": -35327635.692307696, + "logits/rejected": 640896.5454545454, + "logps/chosen": -417.02786959134613, + "logps/rejected": -556.1243341619319, + "loss": 0.0296, + "rewards/chosen": 6.063298738919771, + "rewards/margins": 14.588093577565013, + "rewards/rejected": -8.524794838645242, + "step": 732 + }, + { + "epoch": 0.20090448129368232, + "grad_norm": 13.125, + "kl": 8.090620040893555, + "learning_rate": 5e-06, + "logits/chosen": -8485298.0, + "logits/rejected": -5227593.333333333, + "logps/chosen": -374.4207356770833, + "logps/rejected": -398.7611490885417, + "loss": 0.0895, + "rewards/chosen": 5.50567626953125, + "rewards/margins": 11.67841402689616, + "rewards/rejected": -6.172737757364909, + "step": 733 + }, + { + "epoch": 0.20117856653419214, + "grad_norm": 8.125, + "kl": 3.775745391845703, + "learning_rate": 5e-06, + "logits/chosen": 19197398.153846152, + "logits/rejected": -4065925.8181818184, + "logps/chosen": -561.0159254807693, + "logps/rejected": -403.16317471590907, + "loss": 0.0278, + "rewards/chosen": 6.714637169471154, + "rewards/margins": 13.911894471495302, + "rewards/rejected": -7.1972573020241475, + "step": 734 + }, + { + "epoch": 0.20145265177470192, + "grad_norm": 9.25, + "kl": 2.6316299438476562, + "learning_rate": 5e-06, + "logits/chosen": 3244047.272727273, + "logits/rejected": 59480846.76923077, + "logps/chosen": -463.25142045454544, + "logps/rejected": -381.13022085336536, + "loss": 0.0418, + "rewards/chosen": 6.304079922762784, + "rewards/margins": 14.874507450557257, + "rewards/rejected": -8.570427527794472, + "step": 735 + }, + { + "epoch": 0.20172673701521174, + "grad_norm": 10.0, + "kl": 1.4329612255096436, + "learning_rate": 5e-06, + "logits/chosen": 14079408.0, + "logits/rejected": 18993603.2, + "logps/chosen": -473.90614536830356, + "logps/rejected": -662.75947265625, + "loss": 0.0518, + "rewards/chosen": 4.42357417515346, + "rewards/margins": 15.178289903913226, + "rewards/rejected": -10.754715728759766, + "step": 736 + }, + { + "epoch": 0.20200082225572152, + "grad_norm": 7.15625, + "kl": 4.102336883544922, + "learning_rate": 5e-06, + "logits/chosen": -16593812.923076924, + "logits/rejected": 917704.0909090909, + "logps/chosen": -417.0173903245192, + "logps/rejected": -478.88618607954544, + "loss": 0.0328, + "rewards/chosen": 4.877701392540565, + "rewards/margins": 11.527625237311517, + "rewards/rejected": -6.649923844770952, + "step": 737 + }, + { + "epoch": 0.20227490749623134, + "grad_norm": 11.125, + "kl": 1.0472171306610107, + "learning_rate": 5e-06, + "logits/chosen": 17280560.0, + "logits/rejected": 3819339.6923076925, + "logps/chosen": -343.26606889204544, + "logps/rejected": -499.2447040264423, + "loss": 0.0569, + "rewards/chosen": 4.4872072393243965, + "rewards/margins": 11.837170500855347, + "rewards/rejected": -7.34996326153095, + "step": 738 + }, + { + "epoch": 0.20254899273674112, + "grad_norm": 6.375, + "kl": 1.8321311473846436, + "learning_rate": 5e-06, + "logits/chosen": -30059128.0, + "logits/rejected": -7755562.0, + "logps/chosen": -509.92059326171875, + "logps/rejected": -594.15771484375, + "loss": 0.0429, + "rewards/chosen": 7.462072372436523, + "rewards/margins": 16.72804069519043, + "rewards/rejected": -9.265968322753906, + "step": 739 + }, + { + "epoch": 0.20282307797725094, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3071576.0, + "logits/rejected": -6016042.0, + "logps/chosen": -448.9353942871094, + "logps/rejected": -435.5020446777344, + "loss": 0.0167, + "rewards/chosen": 6.385833740234375, + "rewards/margins": 12.736654281616211, + "rewards/rejected": -6.350820541381836, + "step": 740 + }, + { + "epoch": 0.20309716321776072, + "grad_norm": 4.9375, + "kl": 4.263530731201172, + "learning_rate": 5e-06, + "logits/chosen": -17656371.42857143, + "logits/rejected": 4092460.0, + "logps/chosen": -408.49539620535717, + "logps/rejected": -529.051318359375, + "loss": 0.0204, + "rewards/chosen": 6.291803632463727, + "rewards/margins": 14.703752027239117, + "rewards/rejected": -8.41194839477539, + "step": 741 + }, + { + "epoch": 0.2033712484582705, + "grad_norm": 15.0625, + "kl": 12.441694259643555, + "learning_rate": 5e-06, + "logits/chosen": 27545934.769230768, + "logits/rejected": 6671768.0, + "logps/chosen": -597.1629356971154, + "logps/rejected": -550.0503373579545, + "loss": 0.0633, + "rewards/chosen": 6.650838998647837, + "rewards/margins": 14.940860721614811, + "rewards/rejected": -8.290021722966975, + "step": 742 + }, + { + "epoch": 0.20364533369878032, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14838953.333333334, + "logits/rejected": -4383512.333333333, + "logps/chosen": -368.6467692057292, + "logps/rejected": -533.6416829427084, + "loss": 0.0755, + "rewards/chosen": 3.978175163269043, + "rewards/margins": 11.553542137145996, + "rewards/rejected": -7.575366973876953, + "step": 743 + }, + { + "epoch": 0.2039194189392901, + "grad_norm": 6.15625, + "kl": 1.489454984664917, + "learning_rate": 5e-06, + "logits/chosen": -9509911.111111112, + "logits/rejected": -5558281.6, + "logps/chosen": -452.2099338107639, + "logps/rejected": -538.05439453125, + "loss": 0.043, + "rewards/chosen": 5.201030731201172, + "rewards/margins": 13.582696787516277, + "rewards/rejected": -8.381666056315105, + "step": 744 + }, + { + "epoch": 0.20419350417979992, + "grad_norm": 7.71875, + "kl": 7.205536842346191, + "learning_rate": 5e-06, + "logits/chosen": -31159853.17647059, + "logits/rejected": -17670499.42857143, + "logps/chosen": -561.5029871323529, + "logps/rejected": -303.63392857142856, + "loss": 0.0317, + "rewards/chosen": 5.73604538861443, + "rewards/margins": 12.770618534889543, + "rewards/rejected": -7.034573146275112, + "step": 745 + }, + { + "epoch": 0.2044675894203097, + "grad_norm": 6.53125, + "kl": 3.575793981552124, + "learning_rate": 5e-06, + "logits/chosen": -7594717.333333333, + "logits/rejected": -40512792.0, + "logps/chosen": -454.879150390625, + "logps/rejected": -477.807861328125, + "loss": 0.029, + "rewards/chosen": 6.412909825642903, + "rewards/margins": 13.247353235880533, + "rewards/rejected": -6.83444341023763, + "step": 746 + }, + { + "epoch": 0.20474167466081952, + "grad_norm": 15.8125, + "kl": 7.863116264343262, + "learning_rate": 5e-06, + "logits/chosen": -4591806.571428572, + "logits/rejected": -13552561.6, + "logps/chosen": -424.799560546875, + "logps/rejected": -523.66962890625, + "loss": 0.1562, + "rewards/chosen": 3.535649980817522, + "rewards/margins": 13.113875688825335, + "rewards/rejected": -9.578225708007812, + "step": 747 + }, + { + "epoch": 0.2050157599013293, + "grad_norm": 6.59375, + "kl": 2.9247474670410156, + "learning_rate": 5e-06, + "logits/chosen": -10858391.333333334, + "logits/rejected": 8013246.666666667, + "logps/chosen": -461.3739420572917, + "logps/rejected": -549.2900797526041, + "loss": 0.0241, + "rewards/chosen": 7.246022542317708, + "rewards/margins": 15.394222895304363, + "rewards/rejected": -8.148200352986654, + "step": 748 + }, + { + "epoch": 0.20528984514183912, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11773870.4, + "logits/rejected": -10903076.57142857, + "logps/chosen": -560.899658203125, + "logps/rejected": -398.9017857142857, + "loss": 0.0204, + "rewards/chosen": 7.184989166259766, + "rewards/margins": 13.252052634102958, + "rewards/rejected": -6.067063467843192, + "step": 749 + }, + { + "epoch": 0.2055639303823489, + "grad_norm": 11.625, + "kl": 3.0178604125976562, + "learning_rate": 5e-06, + "logits/chosen": -6680370.461538462, + "logits/rejected": -16510570.181818182, + "logps/chosen": -457.4727313701923, + "logps/rejected": -434.6300159801136, + "loss": 0.0479, + "rewards/chosen": 5.043279794546274, + "rewards/margins": 12.502676156850962, + "rewards/rejected": -7.4593963623046875, + "step": 750 + }, + { + "epoch": 0.20583801562285872, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8074276.266666667, + "logits/rejected": -3445545.777777778, + "logps/chosen": -384.05205078125, + "logps/rejected": -752.4237196180555, + "loss": 0.0422, + "rewards/chosen": 5.084151204427084, + "rewards/margins": 16.653291320800783, + "rewards/rejected": -11.569140116373697, + "step": 751 + }, + { + "epoch": 0.2061121008633685, + "grad_norm": 8.8125, + "kl": 3.126148223876953, + "learning_rate": 5e-06, + "logits/chosen": 8194274.666666667, + "logits/rejected": -14347042.666666666, + "logps/chosen": -616.1604410807291, + "logps/rejected": -397.2712809244792, + "loss": 0.0236, + "rewards/chosen": 5.871391296386719, + "rewards/margins": 13.262235005696613, + "rewards/rejected": -7.3908437093098955, + "step": 752 + }, + { + "epoch": 0.2063861861038783, + "grad_norm": 12.0, + "kl": 1.052136778831482, + "learning_rate": 5e-06, + "logits/chosen": -4264055.692307692, + "logits/rejected": 8075488.0, + "logps/chosen": -426.8695537860577, + "logps/rejected": -492.4988458806818, + "loss": 0.0405, + "rewards/chosen": 5.57391357421875, + "rewards/margins": 12.221940474076703, + "rewards/rejected": -6.648026899857954, + "step": 753 + }, + { + "epoch": 0.2066602713443881, + "grad_norm": 8.75, + "kl": 2.39020037651062, + "learning_rate": 5e-06, + "logits/chosen": -37935581.86666667, + "logits/rejected": -6925031.555555556, + "logps/chosen": -529.8494140625, + "logps/rejected": -600.7239583333334, + "loss": 0.026, + "rewards/chosen": 5.928585815429687, + "rewards/margins": 14.853634643554688, + "rewards/rejected": -8.925048828125, + "step": 754 + }, + { + "epoch": 0.2069343565848979, + "grad_norm": 15.25, + "kl": 4.490790843963623, + "learning_rate": 5e-06, + "logits/chosen": 6774462.666666667, + "logits/rejected": 29389141.333333332, + "logps/chosen": -355.2872721354167, + "logps/rejected": -537.0149739583334, + "loss": 0.1602, + "rewards/chosen": 3.940258344014486, + "rewards/margins": 11.683241844177246, + "rewards/rejected": -7.742983500162761, + "step": 755 + }, + { + "epoch": 0.2072084418254077, + "grad_norm": 5.65625, + "kl": 2.6066641807556152, + "learning_rate": 5e-06, + "logits/chosen": 110708.0, + "logits/rejected": -12107400.727272727, + "logps/chosen": -426.6787109375, + "logps/rejected": -537.2445845170455, + "loss": 0.044, + "rewards/chosen": 6.642066368689904, + "rewards/margins": 14.429367172134507, + "rewards/rejected": -7.7873008034446025, + "step": 756 + }, + { + "epoch": 0.2074825270659175, + "grad_norm": 6.625, + "kl": 3.8408076763153076, + "learning_rate": 5e-06, + "logits/chosen": -26145194.666666668, + "logits/rejected": -25599072.0, + "logps/chosen": -461.6640625, + "logps/rejected": -675.7373046875, + "loss": 0.0434, + "rewards/chosen": 6.228377024332683, + "rewards/margins": 15.096630096435547, + "rewards/rejected": -8.868253072102865, + "step": 757 + }, + { + "epoch": 0.2077566123064273, + "grad_norm": 8.5625, + "kl": 8.143548965454102, + "learning_rate": 5e-06, + "logits/chosen": -18938601.14285714, + "logits/rejected": 1240973.8, + "logps/chosen": -437.90757533482144, + "logps/rejected": -520.993896484375, + "loss": 0.0374, + "rewards/chosen": 6.317909785679409, + "rewards/margins": 13.628887721470424, + "rewards/rejected": -7.310977935791016, + "step": 758 + }, + { + "epoch": 0.2080306975469371, + "grad_norm": 13.6875, + "kl": 11.421082496643066, + "learning_rate": 5e-06, + "logits/chosen": -19957178.666666668, + "logits/rejected": -2915839.6666666665, + "logps/chosen": -469.1311848958333, + "logps/rejected": -485.0441080729167, + "loss": 0.1394, + "rewards/chosen": 5.400608062744141, + "rewards/margins": 11.084373474121094, + "rewards/rejected": -5.683765411376953, + "step": 759 + }, + { + "epoch": 0.2083047827874469, + "grad_norm": 10.1875, + "kl": 10.603328704833984, + "learning_rate": 5e-06, + "logits/chosen": 5204781.866666666, + "logits/rejected": 21218846.222222224, + "logps/chosen": -436.65709635416664, + "logps/rejected": -390.8458658854167, + "loss": 0.0398, + "rewards/chosen": 5.838493855794271, + "rewards/margins": 13.430958726671006, + "rewards/rejected": -7.592464870876736, + "step": 760 + }, + { + "epoch": 0.2085788680279567, + "grad_norm": 6.9375, + "kl": 3.8361754417419434, + "learning_rate": 5e-06, + "logits/chosen": -18129842.666666668, + "logits/rejected": 808959.6666666666, + "logps/chosen": -531.4022623697916, + "logps/rejected": -408.5040690104167, + "loss": 0.013, + "rewards/chosen": 6.396932601928711, + "rewards/margins": 13.449330012003582, + "rewards/rejected": -7.05239741007487, + "step": 761 + }, + { + "epoch": 0.2088529532684665, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27015506.0, + "logits/rejected": -1481068.625, + "logps/chosen": -450.9007568359375, + "logps/rejected": -384.69482421875, + "loss": 0.0566, + "rewards/chosen": 5.734736919403076, + "rewards/margins": 11.846766948699951, + "rewards/rejected": -6.112030029296875, + "step": 762 + }, + { + "epoch": 0.2091270385089763, + "grad_norm": 9.875, + "kl": 3.9314002990722656, + "learning_rate": 5e-06, + "logits/chosen": 683555.7142857143, + "logits/rejected": -14031371.2, + "logps/chosen": -460.1309291294643, + "logps/rejected": -501.671337890625, + "loss": 0.079, + "rewards/chosen": 4.521148136683872, + "rewards/margins": 11.39699957711356, + "rewards/rejected": -6.875851440429687, + "step": 763 + }, + { + "epoch": 0.20940112374948608, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3862527.75, + "logits/rejected": -11927978.0, + "logps/chosen": -433.6981506347656, + "logps/rejected": -569.9501953125, + "loss": 0.0357, + "rewards/chosen": 4.852145671844482, + "rewards/margins": 12.170495986938477, + "rewards/rejected": -7.318350315093994, + "step": 764 + }, + { + "epoch": 0.2096752089899959, + "grad_norm": 7.0625, + "kl": 1.7126191854476929, + "learning_rate": 5e-06, + "logits/chosen": -16204475.636363637, + "logits/rejected": -9703872.0, + "logps/chosen": -385.1491033380682, + "logps/rejected": -430.9963191105769, + "loss": 0.051, + "rewards/chosen": 5.436502283269709, + "rewards/margins": 12.030707699435574, + "rewards/rejected": -6.594205416165865, + "step": 765 + }, + { + "epoch": 0.20994929423050568, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36339733.333333336, + "logits/rejected": -20073753.6, + "logps/chosen": -394.6331380208333, + "logps/rejected": -645.8133463541667, + "loss": 0.0454, + "rewards/chosen": 5.843667348225911, + "rewards/margins": 13.59158198038737, + "rewards/rejected": -7.747914632161458, + "step": 766 + }, + { + "epoch": 0.2102233794710155, + "grad_norm": 14.0625, + "kl": 1.0650825500488281, + "learning_rate": 5e-06, + "logits/chosen": -3090517.3333333335, + "logits/rejected": 15116774.666666666, + "logps/chosen": -489.5052083333333, + "logps/rejected": -596.0078938802084, + "loss": 0.0533, + "rewards/chosen": 5.581108093261719, + "rewards/margins": 14.634136199951172, + "rewards/rejected": -9.053028106689453, + "step": 767 + }, + { + "epoch": 0.21049746471152528, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19592528.0, + "logits/rejected": 161873.7142857143, + "logps/chosen": -454.52294921875, + "logps/rejected": -495.260986328125, + "loss": 0.0169, + "rewards/chosen": 6.7058837890625, + "rewards/margins": 12.815356336321148, + "rewards/rejected": -6.109472547258649, + "step": 768 + }, + { + "epoch": 0.2107715499520351, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12524450.909090908, + "logits/rejected": 58064300.307692304, + "logps/chosen": -386.1804865056818, + "logps/rejected": -664.8931790865385, + "loss": 0.0722, + "rewards/chosen": 5.26461445201527, + "rewards/margins": 16.913541407018272, + "rewards/rejected": -11.648926955003004, + "step": 769 + }, + { + "epoch": 0.21104563519254488, + "grad_norm": 11.1875, + "kl": 7.252317428588867, + "learning_rate": 5e-06, + "logits/chosen": -28301553.454545453, + "logits/rejected": 3253078.769230769, + "logps/chosen": -427.97958096590907, + "logps/rejected": -459.4924128605769, + "loss": 0.0982, + "rewards/chosen": 4.799940629438921, + "rewards/margins": 11.803881505152564, + "rewards/rejected": -7.003940875713642, + "step": 770 + }, + { + "epoch": 0.2113197204330547, + "grad_norm": 8.75, + "kl": 1.2965075969696045, + "learning_rate": 5e-06, + "logits/chosen": 6539432.888888889, + "logits/rejected": -10789243.733333332, + "logps/chosen": -473.87049696180554, + "logps/rejected": -528.2665364583333, + "loss": 0.0305, + "rewards/chosen": 5.418751610649957, + "rewards/margins": 12.655171797010635, + "rewards/rejected": -7.236420186360677, + "step": 771 + }, + { + "epoch": 0.21159380567356448, + "grad_norm": 15.5, + "kl": 6.0154829025268555, + "learning_rate": 5e-06, + "logits/chosen": -5039543.466666667, + "logits/rejected": -13939399.111111112, + "logps/chosen": -405.1870442708333, + "logps/rejected": -411.8894314236111, + "loss": 0.0901, + "rewards/chosen": 4.617605590820313, + "rewards/margins": 8.562693447536892, + "rewards/rejected": -3.94508785671658, + "step": 772 + }, + { + "epoch": 0.21186789091407426, + "grad_norm": 16.375, + "kl": 16.90564727783203, + "learning_rate": 5e-06, + "logits/chosen": -3277237.0, + "logits/rejected": -8283089.0, + "logps/chosen": -481.71783447265625, + "logps/rejected": -411.66265869140625, + "loss": 0.0977, + "rewards/chosen": 6.275279998779297, + "rewards/margins": 11.795593738555908, + "rewards/rejected": -5.520313739776611, + "step": 773 + }, + { + "epoch": 0.21214197615458408, + "grad_norm": 12.1875, + "kl": 14.139385223388672, + "learning_rate": 5e-06, + "logits/chosen": -10913260.235294119, + "logits/rejected": 141905225.14285713, + "logps/chosen": -448.9217313878676, + "logps/rejected": -559.6861397879464, + "loss": 0.067, + "rewards/chosen": 5.830348744111903, + "rewards/margins": 17.722461476045495, + "rewards/rejected": -11.892112731933594, + "step": 774 + }, + { + "epoch": 0.21241606139509386, + "grad_norm": 6.65625, + "kl": 2.413379669189453, + "learning_rate": 5e-06, + "logits/chosen": -5452568.666666667, + "logits/rejected": -22210302.666666668, + "logps/chosen": -447.5837809244792, + "logps/rejected": -522.9932047526041, + "loss": 0.021, + "rewards/chosen": 5.910860697428386, + "rewards/margins": 12.90816370646159, + "rewards/rejected": -6.997303009033203, + "step": 775 + }, + { + "epoch": 0.21269014663560368, + "grad_norm": 9.1875, + "kl": 7.352179527282715, + "learning_rate": 5e-06, + "logits/chosen": -5022522.461538462, + "logits/rejected": -14616078.545454545, + "logps/chosen": -340.37886868990387, + "logps/rejected": -513.9074928977273, + "loss": 0.0847, + "rewards/chosen": 5.00821040226863, + "rewards/margins": 12.542116125146826, + "rewards/rejected": -7.533905722878196, + "step": 776 + }, + { + "epoch": 0.21296423187611346, + "grad_norm": 15.375, + "kl": 15.624171257019043, + "learning_rate": 5e-06, + "logits/chosen": -11162440.0, + "logits/rejected": -2609219.8, + "logps/chosen": -486.7987583705357, + "logps/rejected": -372.30908203125, + "loss": 0.1484, + "rewards/chosen": 5.644995553152902, + "rewards/margins": 12.762373788016184, + "rewards/rejected": -7.117378234863281, + "step": 777 + }, + { + "epoch": 0.21323831711662328, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2274862.8571428573, + "logits/rejected": -22301392.0, + "logps/chosen": -376.0298549107143, + "logps/rejected": -580.19453125, + "loss": 0.0208, + "rewards/chosen": 5.653802054268973, + "rewards/margins": 15.724290030343191, + "rewards/rejected": -10.070487976074219, + "step": 778 + }, + { + "epoch": 0.21351240235713306, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43235232.0, + "logits/rejected": -1844086.0, + "logps/chosen": -658.7725219726562, + "logps/rejected": -447.2777099609375, + "loss": 0.0378, + "rewards/chosen": 7.31680965423584, + "rewards/margins": 12.496614933013916, + "rewards/rejected": -5.179805278778076, + "step": 779 + }, + { + "epoch": 0.21378648759764288, + "grad_norm": 6.75, + "kl": 4.125914573669434, + "learning_rate": 5e-06, + "logits/chosen": 49798352.0, + "logits/rejected": -879772.25, + "logps/chosen": -522.0180053710938, + "logps/rejected": -448.9217529296875, + "loss": 0.031, + "rewards/chosen": 6.138671398162842, + "rewards/margins": 12.214663028717041, + "rewards/rejected": -6.075991630554199, + "step": 780 + }, + { + "epoch": 0.21406057283815266, + "grad_norm": 6.21875, + "kl": 1.8067386150360107, + "learning_rate": 5e-06, + "logits/chosen": -1925317.4545454546, + "logits/rejected": -7771922.461538462, + "logps/chosen": -466.9610706676136, + "logps/rejected": -364.44632662259613, + "loss": 0.0339, + "rewards/chosen": 6.350019281560725, + "rewards/margins": 12.885754191792095, + "rewards/rejected": -6.53573491023137, + "step": 781 + }, + { + "epoch": 0.21433465807866248, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12427677.538461538, + "logits/rejected": -2806401.4545454546, + "logps/chosen": -456.7385817307692, + "logps/rejected": -658.8488103693181, + "loss": 0.0403, + "rewards/chosen": 6.015933697040264, + "rewards/margins": 14.734149639423077, + "rewards/rejected": -8.718215942382812, + "step": 782 + }, + { + "epoch": 0.21460874331917226, + "grad_norm": 11.875, + "kl": 2.139094114303589, + "learning_rate": 5e-06, + "logits/chosen": -7487238.666666667, + "logits/rejected": -10436422.0, + "logps/chosen": -354.6562093098958, + "logps/rejected": -390.2307942708333, + "loss": 0.0901, + "rewards/chosen": 3.337937672932943, + "rewards/margins": 8.396622975667318, + "rewards/rejected": -5.058685302734375, + "step": 783 + }, + { + "epoch": 0.21488282855968205, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15334228.8, + "logits/rejected": -14738584.0, + "logps/chosen": -513.75966796875, + "logps/rejected": -531.1060267857143, + "loss": 0.0135, + "rewards/chosen": 7.004496765136719, + "rewards/margins": 14.720783015659878, + "rewards/rejected": -7.716286250523159, + "step": 784 + }, + { + "epoch": 0.21515691380019186, + "grad_norm": 10.875, + "kl": 1.4460923671722412, + "learning_rate": 5e-06, + "logits/chosen": 13255237.333333334, + "logits/rejected": -14462961.333333334, + "logps/chosen": -407.7301432291667, + "logps/rejected": -540.2618001302084, + "loss": 0.0613, + "rewards/chosen": 5.259487787882487, + "rewards/margins": 11.784413655598959, + "rewards/rejected": -6.524925867716472, + "step": 785 + }, + { + "epoch": 0.21543099904070165, + "grad_norm": 13.1875, + "kl": 8.36020278930664, + "learning_rate": 5e-06, + "logits/chosen": -9509896.533333333, + "logits/rejected": -5673616.888888889, + "logps/chosen": -466.8964518229167, + "logps/rejected": -412.00816514756946, + "loss": 0.0686, + "rewards/chosen": 5.936824544270833, + "rewards/margins": 10.944032626681857, + "rewards/rejected": -5.007208082411024, + "step": 786 + }, + { + "epoch": 0.21570508428121146, + "grad_norm": 14.5625, + "kl": 15.448874473571777, + "learning_rate": 5e-06, + "logits/chosen": -26908597.333333332, + "logits/rejected": -979102.6666666666, + "logps/chosen": -456.72389322916666, + "logps/rejected": -502.7439236111111, + "loss": 0.0592, + "rewards/chosen": 6.203707377115886, + "rewards/margins": 14.351029290093315, + "rewards/rejected": -8.14732191297743, + "step": 787 + }, + { + "epoch": 0.21597916952172125, + "grad_norm": 11.4375, + "kl": 0.4601237177848816, + "learning_rate": 5e-06, + "logits/chosen": -14746101.818181818, + "logits/rejected": 12521933.538461538, + "logps/chosen": -486.6290838068182, + "logps/rejected": -530.7475210336538, + "loss": 0.0442, + "rewards/chosen": 5.7650136080655185, + "rewards/margins": 12.20903967477225, + "rewards/rejected": -6.444026066706731, + "step": 788 + }, + { + "epoch": 0.21625325476223106, + "grad_norm": 7.90625, + "kl": 5.412566184997559, + "learning_rate": 5e-06, + "logits/chosen": -8369248.7272727275, + "logits/rejected": 16714823.384615384, + "logps/chosen": -376.16872336647725, + "logps/rejected": -580.6371694711538, + "loss": 0.029, + "rewards/chosen": 6.25532323663885, + "rewards/margins": 14.336659037983502, + "rewards/rejected": -8.081335801344652, + "step": 789 + }, + { + "epoch": 0.21652734000274085, + "grad_norm": 13.375, + "kl": 3.6281979084014893, + "learning_rate": 5e-06, + "logits/chosen": -15883492.57142857, + "logits/rejected": -22757379.2, + "logps/chosen": -394.3607700892857, + "logps/rejected": -470.15390625, + "loss": 0.0944, + "rewards/chosen": 5.054285321916852, + "rewards/margins": 10.450627408708844, + "rewards/rejected": -5.396342086791992, + "step": 790 + }, + { + "epoch": 0.21680142524325066, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 22199918.0, + "logits/rejected": 347031.5, + "logps/chosen": -478.6980895996094, + "logps/rejected": -371.31634521484375, + "loss": 0.0324, + "rewards/chosen": 5.348327159881592, + "rewards/margins": 11.169523239135742, + "rewards/rejected": -5.82119607925415, + "step": 791 + }, + { + "epoch": 0.21707551048376045, + "grad_norm": 9.5, + "kl": 3.9613208770751953, + "learning_rate": 5e-06, + "logits/chosen": 3375050.769230769, + "logits/rejected": -16863291.636363637, + "logps/chosen": -536.2646484375, + "logps/rejected": -516.8904474431819, + "loss": 0.0549, + "rewards/chosen": 5.846859271709736, + "rewards/margins": 12.592699170946242, + "rewards/rejected": -6.745839899236506, + "step": 792 + }, + { + "epoch": 0.21734959572427026, + "grad_norm": 9.1875, + "kl": 1.864248275756836, + "learning_rate": 5e-06, + "logits/chosen": -1880824.6666666667, + "logits/rejected": -5009074.0, + "logps/chosen": -421.4808756510417, + "logps/rejected": -610.680419921875, + "loss": 0.0505, + "rewards/chosen": 4.074622472127278, + "rewards/margins": 13.100934982299805, + "rewards/rejected": -9.026312510172525, + "step": 793 + }, + { + "epoch": 0.21762368096478005, + "grad_norm": 11.75, + "kl": 1.3024375438690186, + "learning_rate": 5e-06, + "logits/chosen": -17476727.466666665, + "logits/rejected": -19119276.444444444, + "logps/chosen": -340.6991861979167, + "logps/rejected": -455.9899088541667, + "loss": 0.0714, + "rewards/chosen": 4.486894226074218, + "rewards/margins": 9.774479166666666, + "rewards/rejected": -5.287584940592448, + "step": 794 + }, + { + "epoch": 0.21789776620528983, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13838947.555555556, + "logits/rejected": -13920452.266666668, + "logps/chosen": -480.13536241319446, + "logps/rejected": -475.4839192708333, + "loss": 0.0157, + "rewards/chosen": 5.9218860202365455, + "rewards/margins": 13.273551771375868, + "rewards/rejected": -7.351665751139323, + "step": 795 + }, + { + "epoch": 0.21817185144579965, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22990919.272727273, + "logits/rejected": -16966534.153846152, + "logps/chosen": -517.8690518465909, + "logps/rejected": -515.9424579326923, + "loss": 0.0422, + "rewards/chosen": 5.47568789395419, + "rewards/margins": 12.092770929936762, + "rewards/rejected": -6.6170830359825725, + "step": 796 + }, + { + "epoch": 0.21844593668630943, + "grad_norm": 7.3125, + "kl": 4.938290596008301, + "learning_rate": 5e-06, + "logits/chosen": -10226675.42857143, + "logits/rejected": -931422.4, + "logps/chosen": -503.98653738839283, + "logps/rejected": -431.5666015625, + "loss": 0.0621, + "rewards/chosen": 6.058909824916294, + "rewards/margins": 11.766854694911412, + "rewards/rejected": -5.707944869995117, + "step": 797 + }, + { + "epoch": 0.21872002192681925, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20675685.818181816, + "logits/rejected": -5755533.538461538, + "logps/chosen": -541.4126420454545, + "logps/rejected": -490.55453725961536, + "loss": 0.0225, + "rewards/chosen": 5.373708204789595, + "rewards/margins": 13.086327105968982, + "rewards/rejected": -7.712618901179387, + "step": 798 + }, + { + "epoch": 0.21899410716732903, + "grad_norm": 11.6875, + "kl": 9.616449356079102, + "learning_rate": 5e-06, + "logits/chosen": -5517784.533333333, + "logits/rejected": -14818889.777777778, + "logps/chosen": -419.859375, + "logps/rejected": -370.29918077256946, + "loss": 0.09, + "rewards/chosen": 5.198553466796875, + "rewards/margins": 9.908153279622397, + "rewards/rejected": -4.7095998128255205, + "step": 799 + }, + { + "epoch": 0.21926819240783885, + "grad_norm": 10.4375, + "kl": 13.885015487670898, + "learning_rate": 5e-06, + "logits/chosen": -19560950.0, + "logits/rejected": 2832783.0, + "logps/chosen": -525.465087890625, + "logps/rejected": -727.39111328125, + "loss": 0.0519, + "rewards/chosen": 6.72096061706543, + "rewards/margins": 16.010310173034668, + "rewards/rejected": -9.289349555969238, + "step": 800 + }, + { + "epoch": 0.21954227764834863, + "grad_norm": 4.5, + "kl": 0.8050836324691772, + "learning_rate": 5e-06, + "logits/chosen": -228709.5, + "logits/rejected": 598701.6, + "logps/chosen": -398.02755301339283, + "logps/rejected": -538.45947265625, + "loss": 0.0564, + "rewards/chosen": 4.699580601283482, + "rewards/margins": 12.03304007393973, + "rewards/rejected": -7.33345947265625, + "step": 801 + }, + { + "epoch": 0.21981636288885845, + "grad_norm": 7.53125, + "kl": 16.348541259765625, + "learning_rate": 5e-06, + "logits/chosen": -14802203.2, + "logits/rejected": -4229220.285714285, + "logps/chosen": -438.68115234375, + "logps/rejected": -509.82376534598217, + "loss": 0.0285, + "rewards/chosen": 7.43970947265625, + "rewards/margins": 15.192590659005301, + "rewards/rejected": -7.752881186349051, + "step": 802 + }, + { + "epoch": 0.22009044812936823, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15771357.090909092, + "logits/rejected": -6195572.307692308, + "logps/chosen": -470.28067294034093, + "logps/rejected": -445.89107572115387, + "loss": 0.018, + "rewards/chosen": 5.302019292658025, + "rewards/margins": 12.859784959913133, + "rewards/rejected": -7.557765667255108, + "step": 803 + }, + { + "epoch": 0.22036453336987802, + "grad_norm": 10.875, + "kl": 5.66575813293457, + "learning_rate": 5e-06, + "logits/chosen": -18620745.6, + "logits/rejected": 6343512.0, + "logps/chosen": -449.19765625, + "logps/rejected": -398.8115234375, + "loss": 0.0503, + "rewards/chosen": 6.566942596435547, + "rewards/margins": 11.810203661237445, + "rewards/rejected": -5.243261064801898, + "step": 804 + }, + { + "epoch": 0.22063861861038783, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2722838.0, + "logits/rejected": 4748530.133333334, + "logps/chosen": -315.577392578125, + "logps/rejected": -532.30966796875, + "loss": 0.0702, + "rewards/chosen": 4.971497429741754, + "rewards/margins": 13.77307824028863, + "rewards/rejected": -8.801580810546875, + "step": 805 + }, + { + "epoch": 0.22091270385089762, + "grad_norm": 7.34375, + "kl": 5.366238594055176, + "learning_rate": 5e-06, + "logits/chosen": -11429568.888888888, + "logits/rejected": 3661066.6666666665, + "logps/chosen": -341.68519422743054, + "logps/rejected": -619.441162109375, + "loss": 0.0587, + "rewards/chosen": 4.518801795111762, + "rewards/margins": 13.106998231675888, + "rewards/rejected": -8.588196436564127, + "step": 806 + }, + { + "epoch": 0.22118678909140743, + "grad_norm": 8.9375, + "kl": 5.439787864685059, + "learning_rate": 5e-06, + "logits/chosen": -31843468.8, + "logits/rejected": 27567611.42857143, + "logps/chosen": -455.722705078125, + "logps/rejected": -484.18966238839283, + "loss": 0.0259, + "rewards/chosen": 6.9479927062988285, + "rewards/margins": 13.248453521728516, + "rewards/rejected": -6.3004608154296875, + "step": 807 + }, + { + "epoch": 0.22146087433191722, + "grad_norm": 4.9375, + "kl": 1.182965636253357, + "learning_rate": 5e-06, + "logits/chosen": -17608854.4, + "logits/rejected": 25340061.714285713, + "logps/chosen": -518.547607421875, + "logps/rejected": -488.6659458705357, + "loss": 0.0328, + "rewards/chosen": 5.450493621826172, + "rewards/margins": 14.149022456577846, + "rewards/rejected": -8.698528834751674, + "step": 808 + }, + { + "epoch": 0.22173495957242703, + "grad_norm": 4.375, + "kl": 4.537447929382324, + "learning_rate": 5e-06, + "logits/chosen": -2281173.714285714, + "logits/rejected": -6736168.0, + "logps/chosen": -416.07400948660717, + "logps/rejected": -534.679443359375, + "loss": 0.0219, + "rewards/chosen": 5.8769345964704245, + "rewards/margins": 15.361314937046597, + "rewards/rejected": -9.484380340576172, + "step": 809 + }, + { + "epoch": 0.22200904481293682, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19360893.714285713, + "logits/rejected": -6470221.647058823, + "logps/chosen": -549.5157645089286, + "logps/rejected": -457.0021541819853, + "loss": 0.0537, + "rewards/chosen": 7.590850285121372, + "rewards/margins": 15.087336275757862, + "rewards/rejected": -7.496485990636489, + "step": 810 + }, + { + "epoch": 0.22228313005344663, + "grad_norm": 10.25, + "kl": 6.14879846572876, + "learning_rate": 5e-06, + "logits/chosen": -13819918.933333334, + "logits/rejected": 31246193.777777776, + "logps/chosen": -475.95009765625, + "logps/rejected": -384.916015625, + "loss": 0.0522, + "rewards/chosen": 6.043702189127604, + "rewards/margins": 12.703101433648003, + "rewards/rejected": -6.659399244520399, + "step": 811 + }, + { + "epoch": 0.22255721529395642, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1886207.7142857143, + "logits/rejected": -18539627.2, + "logps/chosen": -430.6611328125, + "logps/rejected": -633.483154296875, + "loss": 0.0549, + "rewards/chosen": 5.298552376883371, + "rewards/margins": 16.49320798601423, + "rewards/rejected": -11.194655609130859, + "step": 812 + }, + { + "epoch": 0.22283130053446623, + "grad_norm": 11.3125, + "kl": 4.3205976486206055, + "learning_rate": 5e-06, + "logits/chosen": 3930466.4615384615, + "logits/rejected": -7295194.909090909, + "logps/chosen": -411.21615835336536, + "logps/rejected": -448.92764559659093, + "loss": 0.0572, + "rewards/chosen": 5.479726938100962, + "rewards/margins": 12.788760418658491, + "rewards/rejected": -7.309033480557528, + "step": 813 + }, + { + "epoch": 0.22310538577497602, + "grad_norm": 10.8125, + "kl": 2.710541009902954, + "learning_rate": 5e-06, + "logits/chosen": -1771884.0, + "logits/rejected": -17250840.0, + "logps/chosen": -405.97119140625, + "logps/rejected": -572.5330200195312, + "loss": 0.0485, + "rewards/chosen": 5.29817533493042, + "rewards/margins": 14.265697002410889, + "rewards/rejected": -8.967521667480469, + "step": 814 + }, + { + "epoch": 0.2233794710154858, + "grad_norm": 11.375, + "kl": 2.5503196716308594, + "learning_rate": 5e-06, + "logits/chosen": -6752412.363636363, + "logits/rejected": -7237039.384615385, + "logps/chosen": -399.80961470170456, + "logps/rejected": -469.5404522235577, + "loss": 0.0631, + "rewards/chosen": 4.89246957952326, + "rewards/margins": 12.958711504102586, + "rewards/rejected": -8.066241924579327, + "step": 815 + }, + { + "epoch": 0.22365355625599562, + "grad_norm": 5.84375, + "kl": 7.282975673675537, + "learning_rate": 5e-06, + "logits/chosen": -19381006.933333334, + "logits/rejected": -25690888.888888888, + "logps/chosen": -386.33704427083336, + "logps/rejected": -423.8784993489583, + "loss": 0.0613, + "rewards/chosen": 5.49405771891276, + "rewards/margins": 11.416936832004122, + "rewards/rejected": -5.922879113091363, + "step": 816 + }, + { + "epoch": 0.2239276414965054, + "grad_norm": 7.90625, + "kl": 1.9397945404052734, + "learning_rate": 5e-06, + "logits/chosen": -26491810.285714287, + "logits/rejected": -3768008.705882353, + "logps/chosen": -389.41573660714283, + "logps/rejected": -551.8060661764706, + "loss": 0.0249, + "rewards/chosen": 5.859434945242746, + "rewards/margins": 12.902331440388656, + "rewards/rejected": -7.04289649514591, + "step": 817 + }, + { + "epoch": 0.22420172673701522, + "grad_norm": 7.0625, + "kl": 6.762911796569824, + "learning_rate": 5e-06, + "logits/chosen": -22964233.6, + "logits/rejected": 3309326.8571428573, + "logps/chosen": -401.66513671875, + "logps/rejected": -606.759765625, + "loss": 0.0543, + "rewards/chosen": 5.7550914764404295, + "rewards/margins": 14.93085310799735, + "rewards/rejected": -9.17576163155692, + "step": 818 + }, + { + "epoch": 0.224475811977525, + "grad_norm": 8.625, + "kl": 8.852701187133789, + "learning_rate": 5e-06, + "logits/chosen": -6928333.866666666, + "logits/rejected": -25770796.444444444, + "logps/chosen": -460.76402994791664, + "logps/rejected": -557.2111545138889, + "loss": 0.0303, + "rewards/chosen": 7.055112711588541, + "rewards/margins": 14.767147148980033, + "rewards/rejected": -7.712034437391493, + "step": 819 + }, + { + "epoch": 0.22474989721803482, + "grad_norm": 9.625, + "kl": 4.297152996063232, + "learning_rate": 5e-06, + "logits/chosen": -9633650.4, + "logits/rejected": -11117885.714285715, + "logps/chosen": -378.98720703125, + "logps/rejected": -460.8983677455357, + "loss": 0.041, + "rewards/chosen": 4.318521118164062, + "rewards/margins": 12.48156018938337, + "rewards/rejected": -8.163039071219307, + "step": 820 + }, + { + "epoch": 0.2250239824585446, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3644907.7333333334, + "logits/rejected": 1838000.4444444445, + "logps/chosen": -389.21803385416666, + "logps/rejected": -411.3120388454861, + "loss": 0.0597, + "rewards/chosen": 5.54043935139974, + "rewards/margins": 12.052359856499567, + "rewards/rejected": -6.511920505099827, + "step": 821 + }, + { + "epoch": 0.22529806769905442, + "grad_norm": 7.03125, + "kl": 4.115140914916992, + "learning_rate": 5e-06, + "logits/chosen": -3832325.5384615385, + "logits/rejected": 45581716.36363637, + "logps/chosen": -423.9743464543269, + "logps/rejected": -549.5117631392045, + "loss": 0.0269, + "rewards/chosen": 5.354648883526142, + "rewards/margins": 12.963849101033244, + "rewards/rejected": -7.6092002175071025, + "step": 822 + }, + { + "epoch": 0.2255721529395642, + "grad_norm": 6.75, + "kl": 1.2415618896484375, + "learning_rate": 5e-06, + "logits/chosen": 6387709.6, + "logits/rejected": -16107942.857142856, + "logps/chosen": -486.3078125, + "logps/rejected": -321.22670200892856, + "loss": 0.0321, + "rewards/chosen": 5.934611129760742, + "rewards/margins": 11.814936447143555, + "rewards/rejected": -5.8803253173828125, + "step": 823 + }, + { + "epoch": 0.22584623818007402, + "grad_norm": 7.78125, + "kl": 3.3480162620544434, + "learning_rate": 5e-06, + "logits/chosen": -9403510.4, + "logits/rejected": -10638764.57142857, + "logps/chosen": -466.937451171875, + "logps/rejected": -557.6642717633929, + "loss": 0.0388, + "rewards/chosen": 5.875347137451172, + "rewards/margins": 14.652554648263115, + "rewards/rejected": -8.777207510811943, + "step": 824 + }, + { + "epoch": 0.2261203234205838, + "grad_norm": 6.3125, + "kl": 7.0272440910339355, + "learning_rate": 5e-06, + "logits/chosen": 3942095.272727273, + "logits/rejected": -18973264.0, + "logps/chosen": -406.7698863636364, + "logps/rejected": -535.8597506009615, + "loss": 0.0362, + "rewards/chosen": 5.810187946666371, + "rewards/margins": 12.375477370682297, + "rewards/rejected": -6.565289424015925, + "step": 825 + }, + { + "epoch": 0.2263944086610936, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9365670.4, + "logits/rejected": -13169491.42857143, + "logps/chosen": -557.17099609375, + "logps/rejected": -528.6283133370536, + "loss": 0.0474, + "rewards/chosen": 6.54110107421875, + "rewards/margins": 13.583388083321708, + "rewards/rejected": -7.0422870091029575, + "step": 826 + }, + { + "epoch": 0.2266684939016034, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2509744.3636363638, + "logits/rejected": -2630145.230769231, + "logps/chosen": -384.45565518465907, + "logps/rejected": -475.34555288461536, + "loss": 0.0823, + "rewards/chosen": 4.410513444380327, + "rewards/margins": 10.859631838498416, + "rewards/rejected": -6.449118394118089, + "step": 827 + }, + { + "epoch": 0.2269425791421132, + "grad_norm": 7.8125, + "kl": 1.7063522338867188, + "learning_rate": 5e-06, + "logits/chosen": -13149235.692307692, + "logits/rejected": 23127741.09090909, + "logps/chosen": -406.3348858173077, + "logps/rejected": -502.53151633522725, + "loss": 0.0424, + "rewards/chosen": 5.391617408165565, + "rewards/margins": 13.13696902615207, + "rewards/rejected": -7.745351617986506, + "step": 828 + }, + { + "epoch": 0.227216664382623, + "grad_norm": 12.375, + "kl": 0.6445509791374207, + "learning_rate": 5e-06, + "logits/chosen": -8874888.615384616, + "logits/rejected": -1484684.3636363635, + "logps/chosen": -322.11268028846155, + "logps/rejected": -424.9056285511364, + "loss": 0.0947, + "rewards/chosen": 4.345704885629507, + "rewards/margins": 10.466566979468286, + "rewards/rejected": -6.120862093838778, + "step": 829 + }, + { + "epoch": 0.2274907496231328, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21789399.111111112, + "logits/rejected": 11621902.933333334, + "logps/chosen": -376.5714518229167, + "logps/rejected": -587.4615234375, + "loss": 0.0312, + "rewards/chosen": 4.899071163601345, + "rewards/margins": 15.090054236518013, + "rewards/rejected": -10.190983072916667, + "step": 830 + }, + { + "epoch": 0.2277648348636426, + "grad_norm": 5.1875, + "kl": 6.841798782348633, + "learning_rate": 5e-06, + "logits/chosen": -741924.6666666666, + "logits/rejected": -22391357.333333332, + "logps/chosen": -455.0276692708333, + "logps/rejected": -507.3132731119792, + "loss": 0.0249, + "rewards/chosen": 5.8507639567057295, + "rewards/margins": 12.74376932779948, + "rewards/rejected": -6.89300537109375, + "step": 831 + }, + { + "epoch": 0.2280389201041524, + "grad_norm": 7.625, + "kl": 5.603376388549805, + "learning_rate": 5e-06, + "logits/chosen": -14874090.666666666, + "logits/rejected": 1814346.6666666667, + "logps/chosen": -427.7555338541667, + "logps/rejected": -572.5993245442709, + "loss": 0.0734, + "rewards/chosen": 4.628363291422526, + "rewards/margins": 14.726048787434895, + "rewards/rejected": -10.09768549601237, + "step": 832 + }, + { + "epoch": 0.2283130053446622, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9078902.4, + "logits/rejected": -8200260.0, + "logps/chosen": -504.258935546875, + "logps/rejected": -487.64341517857144, + "loss": 0.0438, + "rewards/chosen": 5.383628082275391, + "rewards/margins": 12.725254930768696, + "rewards/rejected": -7.341626848493304, + "step": 833 + }, + { + "epoch": 0.228587090585172, + "grad_norm": 7.25, + "kl": 12.277402877807617, + "learning_rate": 5e-06, + "logits/chosen": -29135682.0, + "logits/rejected": -25316868.0, + "logps/chosen": -446.307861328125, + "logps/rejected": -466.001708984375, + "loss": 0.0463, + "rewards/chosen": 6.411397457122803, + "rewards/margins": 14.00470495223999, + "rewards/rejected": -7.5933074951171875, + "step": 834 + }, + { + "epoch": 0.2288611758256818, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14304940.444444444, + "logits/rejected": -14484059.733333332, + "logps/chosen": -501.45220269097223, + "logps/rejected": -457.49811197916665, + "loss": 0.0266, + "rewards/chosen": 5.408864339192708, + "rewards/margins": 12.365062459309897, + "rewards/rejected": -6.956198120117188, + "step": 835 + }, + { + "epoch": 0.2291352610661916, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1367470.0, + "logits/rejected": -7555722.285714285, + "logps/chosen": -434.8716796875, + "logps/rejected": -539.647705078125, + "loss": 0.0369, + "rewards/chosen": 5.503591156005859, + "rewards/margins": 15.223096466064453, + "rewards/rejected": -9.719505310058594, + "step": 836 + }, + { + "epoch": 0.22940934630670137, + "grad_norm": 9.5, + "kl": 1.6526902914047241, + "learning_rate": 5e-06, + "logits/chosen": -8014273.714285715, + "logits/rejected": 27290144.0, + "logps/chosen": -339.30283900669644, + "logps/rejected": -452.34443359375, + "loss": 0.0713, + "rewards/chosen": 4.129632677350726, + "rewards/margins": 11.169831957135882, + "rewards/rejected": -7.040199279785156, + "step": 837 + }, + { + "epoch": 0.2296834315472112, + "grad_norm": 6.25, + "kl": 2.0782599449157715, + "learning_rate": 5e-06, + "logits/chosen": -4750218.133333334, + "logits/rejected": 10421428.444444444, + "logps/chosen": -455.10768229166666, + "logps/rejected": -545.9123263888889, + "loss": 0.0458, + "rewards/chosen": 6.895383707682291, + "rewards/margins": 13.2638181898329, + "rewards/rejected": -6.368434482150608, + "step": 838 + }, + { + "epoch": 0.22995751678772097, + "grad_norm": 8.4375, + "kl": 9.331491470336914, + "learning_rate": 5e-06, + "logits/chosen": -13010771.764705881, + "logits/rejected": -16067992.0, + "logps/chosen": -380.3355066636029, + "logps/rejected": -625.9729352678571, + "loss": 0.0916, + "rewards/chosen": 4.750011668485754, + "rewards/margins": 15.706105592871914, + "rewards/rejected": -10.956093924386161, + "step": 839 + }, + { + "epoch": 0.2302316020282308, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15415754.181818182, + "logits/rejected": -7267485.538461538, + "logps/chosen": -418.92391690340907, + "logps/rejected": -434.18310546875, + "loss": 0.0304, + "rewards/chosen": 6.664093711159446, + "rewards/margins": 14.567186315576514, + "rewards/rejected": -7.9030926044170675, + "step": 840 + }, + { + "epoch": 0.23050568726874057, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7049078.285714285, + "logits/rejected": -9818782.11764706, + "logps/chosen": -399.5365513392857, + "logps/rejected": -525.4549632352941, + "loss": 0.0318, + "rewards/chosen": 4.99981198992048, + "rewards/margins": 12.433035041103844, + "rewards/rejected": -7.433223051183364, + "step": 841 + }, + { + "epoch": 0.23077977250925039, + "grad_norm": 13.4375, + "kl": 2.628507137298584, + "learning_rate": 5e-06, + "logits/chosen": -3693788.3636363638, + "logits/rejected": 6916694.769230769, + "logps/chosen": -397.1531427556818, + "logps/rejected": -567.6492638221154, + "loss": 0.0837, + "rewards/chosen": 4.833805431019176, + "rewards/margins": 11.013066271801929, + "rewards/rejected": -6.179260840782752, + "step": 842 + }, + { + "epoch": 0.23105385774976017, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8401435.333333334, + "logits/rejected": 30409528.0, + "logps/chosen": -346.2425130208333, + "logps/rejected": -715.8655598958334, + "loss": 0.0505, + "rewards/chosen": 3.380221684773763, + "rewards/margins": 14.07386334737142, + "rewards/rejected": -10.693641662597656, + "step": 843 + }, + { + "epoch": 0.23132794299026999, + "grad_norm": 4.9375, + "kl": 2.6917293071746826, + "learning_rate": 5e-06, + "logits/chosen": -26597280.0, + "logits/rejected": -23587973.333333332, + "logps/chosen": -420.4447428385417, + "logps/rejected": -455.9367268880208, + "loss": 0.0454, + "rewards/chosen": 6.278371175130208, + "rewards/margins": 14.02878189086914, + "rewards/rejected": -7.750410715738933, + "step": 844 + }, + { + "epoch": 0.23160202823077977, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3715337.1428571427, + "logits/rejected": 44092883.2, + "logps/chosen": -414.736572265625, + "logps/rejected": -546.480322265625, + "loss": 0.0261, + "rewards/chosen": 5.7176938738141745, + "rewards/margins": 16.136469813755582, + "rewards/rejected": -10.418775939941407, + "step": 845 + }, + { + "epoch": 0.23187611347128956, + "grad_norm": 10.6875, + "kl": 14.612869262695312, + "learning_rate": 5e-06, + "logits/chosen": -10613205.333333334, + "logits/rejected": -36024458.666666664, + "logps/chosen": -573.4852213541667, + "logps/rejected": -521.3101671006945, + "loss": 0.0426, + "rewards/chosen": 7.659720357259115, + "rewards/margins": 13.315475718180338, + "rewards/rejected": -5.655755360921224, + "step": 846 + }, + { + "epoch": 0.23215019871179937, + "grad_norm": 6.09375, + "kl": 0.1102396696805954, + "learning_rate": 5e-06, + "logits/chosen": 12241957.333333334, + "logits/rejected": -14462712.0, + "logps/chosen": -453.8828125, + "logps/rejected": -367.6239420572917, + "loss": 0.0214, + "rewards/chosen": 5.992277145385742, + "rewards/margins": 12.533732096354168, + "rewards/rejected": -6.541454950968425, + "step": 847 + }, + { + "epoch": 0.23242428395230916, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2463405.090909091, + "logits/rejected": 9108174.76923077, + "logps/chosen": -485.3443714488636, + "logps/rejected": -463.6696965144231, + "loss": 0.0162, + "rewards/chosen": 7.034216447310015, + "rewards/margins": 15.52078284416999, + "rewards/rejected": -8.486566396859976, + "step": 848 + }, + { + "epoch": 0.23269836919281897, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5763527.2727272725, + "logits/rejected": -9133715.076923076, + "logps/chosen": -478.24240944602275, + "logps/rejected": -411.2191631610577, + "loss": 0.0127, + "rewards/chosen": 5.459336020729759, + "rewards/margins": 11.193102429796767, + "rewards/rejected": -5.733766409067007, + "step": 849 + }, + { + "epoch": 0.23297245443332876, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10685323.0, + "logits/rejected": 19510504.0, + "logps/chosen": -472.3848571777344, + "logps/rejected": -462.69207763671875, + "loss": 0.0483, + "rewards/chosen": 5.107512474060059, + "rewards/margins": 11.861164569854736, + "rewards/rejected": -6.753652095794678, + "step": 850 + }, + { + "epoch": 0.23324653967383857, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6298588.666666667, + "logits/rejected": 5180108.666666667, + "logps/chosen": -313.38828531901044, + "logps/rejected": -432.7322998046875, + "loss": 0.0358, + "rewards/chosen": 6.31338373819987, + "rewards/margins": 13.359962463378906, + "rewards/rejected": -7.046578725179036, + "step": 851 + }, + { + "epoch": 0.23352062491434836, + "grad_norm": 9.6875, + "kl": 0.9980697631835938, + "learning_rate": 5e-06, + "logits/chosen": -11001760.0, + "logits/rejected": -5872745.090909091, + "logps/chosen": -425.8888972355769, + "logps/rejected": -456.88503196022725, + "loss": 0.0499, + "rewards/chosen": 5.94000244140625, + "rewards/margins": 9.860009626908736, + "rewards/rejected": -3.920007185502486, + "step": 852 + }, + { + "epoch": 0.23379471015485817, + "grad_norm": 11.8125, + "kl": 1.5001157522201538, + "learning_rate": 5e-06, + "logits/chosen": -5993779.2, + "logits/rejected": -17484995.555555556, + "logps/chosen": -438.0805989583333, + "logps/rejected": -553.3753255208334, + "loss": 0.0492, + "rewards/chosen": 4.777060953776042, + "rewards/margins": 14.218423122829861, + "rewards/rejected": -9.44136216905382, + "step": 853 + }, + { + "epoch": 0.23406879539536796, + "grad_norm": 11.6875, + "kl": 6.219813346862793, + "learning_rate": 5e-06, + "logits/chosen": -10982214.0, + "logits/rejected": -11653192.0, + "logps/chosen": -406.85626220703125, + "logps/rejected": -539.385986328125, + "loss": 0.0572, + "rewards/chosen": 6.004615783691406, + "rewards/margins": 14.016836166381836, + "rewards/rejected": -8.01222038269043, + "step": 854 + }, + { + "epoch": 0.23434288063587777, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21399753.6, + "logits/rejected": -5781651.428571428, + "logps/chosen": -486.142529296875, + "logps/rejected": -569.9812709263393, + "loss": 0.0168, + "rewards/chosen": 6.310839462280273, + "rewards/margins": 15.23527248927525, + "rewards/rejected": -8.924433026994977, + "step": 855 + }, + { + "epoch": 0.23461696587638756, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13221537.0, + "logits/rejected": 80187616.0, + "logps/chosen": -374.51287841796875, + "logps/rejected": -535.7410888671875, + "loss": 0.0295, + "rewards/chosen": 4.993886947631836, + "rewards/margins": 13.549175262451172, + "rewards/rejected": -8.555288314819336, + "step": 856 + }, + { + "epoch": 0.23489105111689734, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3183168.533333333, + "logits/rejected": -507579.55555555556, + "logps/chosen": -351.6521484375, + "logps/rejected": -429.3649088541667, + "loss": 0.039, + "rewards/chosen": 5.284422810872396, + "rewards/margins": 12.303526814778646, + "rewards/rejected": -7.01910400390625, + "step": 857 + }, + { + "epoch": 0.23516513635740716, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11551660.0, + "logits/rejected": -4113224.3333333335, + "logps/chosen": -363.8453776041667, + "logps/rejected": -483.569580078125, + "loss": 0.098, + "rewards/chosen": 4.33900260925293, + "rewards/margins": 10.507422129313152, + "rewards/rejected": -6.168419520060222, + "step": 858 + }, + { + "epoch": 0.23543922159791694, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11004660.8, + "logits/rejected": -11528645.714285715, + "logps/chosen": -397.067578125, + "logps/rejected": -531.1529715401786, + "loss": 0.0287, + "rewards/chosen": 6.7886909484863285, + "rewards/margins": 14.73840833391462, + "rewards/rejected": -7.9497173854282925, + "step": 859 + }, + { + "epoch": 0.23571330683842676, + "grad_norm": 12.6875, + "kl": 6.069465160369873, + "learning_rate": 5e-06, + "logits/chosen": -23867997.53846154, + "logits/rejected": -20229099.636363637, + "logps/chosen": -394.72445913461536, + "logps/rejected": -498.8902698863636, + "loss": 0.0677, + "rewards/chosen": 4.605005997877854, + "rewards/margins": 11.642970558646676, + "rewards/rejected": -7.037964560768821, + "step": 860 + }, + { + "epoch": 0.23598739207893654, + "grad_norm": 3.09375, + "kl": 2.1913630962371826, + "learning_rate": 5e-06, + "logits/chosen": -7051408.533333333, + "logits/rejected": -9108555.555555556, + "logps/chosen": -420.20283203125, + "logps/rejected": -683.4813368055555, + "loss": 0.0118, + "rewards/chosen": 6.3790135701497395, + "rewards/margins": 15.89761488172743, + "rewards/rejected": -9.51860131157769, + "step": 861 + }, + { + "epoch": 0.23626147731944636, + "grad_norm": 9.375, + "kl": 1.1645368337631226, + "learning_rate": 5e-06, + "logits/chosen": -11806162.181818182, + "logits/rejected": 12422446.76923077, + "logps/chosen": -436.21835049715907, + "logps/rejected": -687.3269981971154, + "loss": 0.0456, + "rewards/chosen": 5.256955927068537, + "rewards/margins": 16.163410160091374, + "rewards/rejected": -10.906454233022837, + "step": 862 + }, + { + "epoch": 0.23653556255995614, + "grad_norm": 7.21875, + "kl": 4.468235969543457, + "learning_rate": 5e-06, + "logits/chosen": -3313508.923076923, + "logits/rejected": -10538740.363636363, + "logps/chosen": -425.85415414663464, + "logps/rejected": -599.3912020596591, + "loss": 0.0158, + "rewards/chosen": 6.50668217585637, + "rewards/margins": 12.600884390877678, + "rewards/rejected": -6.094202215021307, + "step": 863 + }, + { + "epoch": 0.23680964780046596, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19293977.333333332, + "logits/rejected": 9448739.333333334, + "logps/chosen": -411.5407307942708, + "logps/rejected": -601.5840657552084, + "loss": 0.0109, + "rewards/chosen": 6.8816986083984375, + "rewards/margins": 14.870519002278645, + "rewards/rejected": -7.988820393880208, + "step": 864 + }, + { + "epoch": 0.23708373304097574, + "grad_norm": 14.875, + "kl": 3.966329574584961, + "learning_rate": 5e-06, + "logits/chosen": -10584011.733333332, + "logits/rejected": -23760718.222222224, + "logps/chosen": -472.82434895833336, + "logps/rejected": -454.72056749131946, + "loss": 0.0795, + "rewards/chosen": 4.874595133463542, + "rewards/margins": 10.270219082302518, + "rewards/rejected": -5.395623948838976, + "step": 865 + }, + { + "epoch": 0.23735781828148556, + "grad_norm": 4.46875, + "kl": 5.4783525466918945, + "learning_rate": 5e-06, + "logits/chosen": -13866308.266666668, + "logits/rejected": 9327029.333333334, + "logps/chosen": -394.29518229166666, + "logps/rejected": -559.1533203125, + "loss": 0.0194, + "rewards/chosen": 5.881537882486979, + "rewards/margins": 13.649051581488715, + "rewards/rejected": -7.767513699001736, + "step": 866 + }, + { + "epoch": 0.23763190352199534, + "grad_norm": 4.3125, + "kl": 2.953699827194214, + "learning_rate": 5e-06, + "logits/chosen": -1245680.2307692308, + "logits/rejected": -2079248.7272727273, + "logps/chosen": -422.2362530048077, + "logps/rejected": -442.12362393465907, + "loss": 0.0495, + "rewards/chosen": 6.007197453425481, + "rewards/margins": 11.758277359542312, + "rewards/rejected": -5.751079906116832, + "step": 867 + }, + { + "epoch": 0.23790598876250513, + "grad_norm": 11.8125, + "kl": 11.470787048339844, + "learning_rate": 5e-06, + "logits/chosen": -3365308.0, + "logits/rejected": -5892390.5, + "logps/chosen": -348.96038818359375, + "logps/rejected": -326.9132080078125, + "loss": 0.0731, + "rewards/chosen": 5.0208635330200195, + "rewards/margins": 11.66939640045166, + "rewards/rejected": -6.648532867431641, + "step": 868 + }, + { + "epoch": 0.23818007400301494, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7220483.636363637, + "logits/rejected": -8120369.230769231, + "logps/chosen": -439.23508522727275, + "logps/rejected": -361.92300180288464, + "loss": 0.0365, + "rewards/chosen": 7.108888799493963, + "rewards/margins": 14.812320149028217, + "rewards/rejected": -7.703431349534255, + "step": 869 + }, + { + "epoch": 0.23845415924352473, + "grad_norm": 9.875, + "kl": 9.25204849243164, + "learning_rate": 5e-06, + "logits/chosen": -25950004.57142857, + "logits/rejected": 2384699.6, + "logps/chosen": -496.029052734375, + "logps/rejected": -510.03291015625, + "loss": 0.0454, + "rewards/chosen": 6.020174843924386, + "rewards/margins": 13.560754067557198, + "rewards/rejected": -7.540579223632813, + "step": 870 + }, + { + "epoch": 0.23872824448403454, + "grad_norm": 5.90625, + "kl": 2.4701271057128906, + "learning_rate": 5e-06, + "logits/chosen": 4990902.285714285, + "logits/rejected": -6435178.0, + "logps/chosen": -470.89132254464283, + "logps/rejected": -433.12958984375, + "loss": 0.0227, + "rewards/chosen": 6.56896482195173, + "rewards/margins": 12.813875634329658, + "rewards/rejected": -6.244910812377929, + "step": 871 + }, + { + "epoch": 0.23900232972454433, + "grad_norm": 5.84375, + "kl": 1.4306056499481201, + "learning_rate": 5e-06, + "logits/chosen": 14552681.142857144, + "logits/rejected": -20051905.6, + "logps/chosen": -370.39090401785717, + "logps/rejected": -457.485595703125, + "loss": 0.0472, + "rewards/chosen": 4.845833914620536, + "rewards/margins": 13.901859610421315, + "rewards/rejected": -9.05602569580078, + "step": 872 + }, + { + "epoch": 0.23927641496505414, + "grad_norm": 8.875, + "kl": 4.464672088623047, + "learning_rate": 5e-06, + "logits/chosen": 4895942.285714285, + "logits/rejected": -6036876.4, + "logps/chosen": -371.23447963169644, + "logps/rejected": -452.418408203125, + "loss": 0.0543, + "rewards/chosen": 4.55328859601702, + "rewards/margins": 12.490703146798271, + "rewards/rejected": -7.93741455078125, + "step": 873 + }, + { + "epoch": 0.23955050020556393, + "grad_norm": 16.375, + "kl": 3.9444408416748047, + "learning_rate": 5e-06, + "logits/chosen": -18040964.0, + "logits/rejected": -11666229.333333334, + "logps/chosen": -394.443115234375, + "logps/rejected": -465.3247884114583, + "loss": 0.0292, + "rewards/chosen": 5.607100168863933, + "rewards/margins": 13.94128926595052, + "rewards/rejected": -8.334189097086588, + "step": 874 + }, + { + "epoch": 0.23982458544607374, + "grad_norm": 8.3125, + "kl": 4.6769561767578125, + "learning_rate": 5e-06, + "logits/chosen": -10324859.333333334, + "logits/rejected": 3537171.3333333335, + "logps/chosen": -456.6743977864583, + "logps/rejected": -569.1338297526041, + "loss": 0.0274, + "rewards/chosen": 5.548547108968099, + "rewards/margins": 14.42900276184082, + "rewards/rejected": -8.88045565287272, + "step": 875 + }, + { + "epoch": 0.24009867068658353, + "grad_norm": 13.25, + "kl": 3.0734939575195312, + "learning_rate": 5e-06, + "logits/chosen": -19067107.42857143, + "logits/rejected": 2106588.4, + "logps/chosen": -424.40157645089283, + "logps/rejected": -543.92490234375, + "loss": 0.0454, + "rewards/chosen": 5.685338156563895, + "rewards/margins": 11.476146262032646, + "rewards/rejected": -5.79080810546875, + "step": 876 + }, + { + "epoch": 0.2403727559270933, + "grad_norm": 10.1875, + "kl": 6.518584728240967, + "learning_rate": 5e-06, + "logits/chosen": -2253815.3846153845, + "logits/rejected": -2996663.8181818184, + "logps/chosen": -390.07117638221155, + "logps/rejected": -475.89706143465907, + "loss": 0.044, + "rewards/chosen": 5.592864403357873, + "rewards/margins": 13.244854426884151, + "rewards/rejected": -7.651990023526278, + "step": 877 + }, + { + "epoch": 0.24064684116760313, + "grad_norm": 9.3125, + "kl": 4.6004133224487305, + "learning_rate": 5e-06, + "logits/chosen": -5341806.117647059, + "logits/rejected": 8821469.714285715, + "logps/chosen": -352.7598230698529, + "logps/rejected": -570.0659877232143, + "loss": 0.0528, + "rewards/chosen": 4.8933868408203125, + "rewards/margins": 16.146034240722656, + "rewards/rejected": -11.252647399902344, + "step": 878 + }, + { + "epoch": 0.2409209264081129, + "grad_norm": 11.5625, + "kl": 3.9827208518981934, + "learning_rate": 5e-06, + "logits/chosen": -42888944.0, + "logits/rejected": -5705435.333333333, + "logps/chosen": -421.3941243489583, + "logps/rejected": -390.9328206380208, + "loss": 0.0641, + "rewards/chosen": 5.503225326538086, + "rewards/margins": 13.037641525268555, + "rewards/rejected": -7.534416198730469, + "step": 879 + }, + { + "epoch": 0.24119501164862273, + "grad_norm": 5.875, + "kl": 1.0286357402801514, + "learning_rate": 5e-06, + "logits/chosen": -4672495.333333333, + "logits/rejected": -20750872.0, + "logps/chosen": -466.2459309895833, + "logps/rejected": -491.5945638020833, + "loss": 0.021, + "rewards/chosen": 5.598375956217448, + "rewards/margins": 11.390850067138672, + "rewards/rejected": -5.792474110921224, + "step": 880 + }, + { + "epoch": 0.2414690968891325, + "grad_norm": 7.5625, + "kl": 2.0841147899627686, + "learning_rate": 5e-06, + "logits/chosen": 17015794.666666668, + "logits/rejected": -19451601.333333332, + "logps/chosen": -476.5810953776042, + "logps/rejected": -448.53076171875, + "loss": 0.0287, + "rewards/chosen": 5.2958634694417315, + "rewards/margins": 12.929672876993815, + "rewards/rejected": -7.633809407552083, + "step": 881 + }, + { + "epoch": 0.24174318212964233, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11155120.888888888, + "logits/rejected": -9350574.933333334, + "logps/chosen": -374.6775716145833, + "logps/rejected": -501.93951822916665, + "loss": 0.0327, + "rewards/chosen": 6.834637112087673, + "rewards/margins": 12.752752346462673, + "rewards/rejected": -5.918115234375, + "step": 882 + }, + { + "epoch": 0.2420172673701521, + "grad_norm": 9.75, + "kl": 2.4677138328552246, + "learning_rate": 5e-06, + "logits/chosen": 963511.7142857143, + "logits/rejected": -10347238.4, + "logps/chosen": -487.35707310267856, + "logps/rejected": -471.41796875, + "loss": 0.0659, + "rewards/chosen": 5.651353018624442, + "rewards/margins": 14.863739340645925, + "rewards/rejected": -9.212386322021484, + "step": 883 + }, + { + "epoch": 0.24229135261066193, + "grad_norm": 7.71875, + "kl": 2.6126277446746826, + "learning_rate": 5e-06, + "logits/chosen": -17941994.666666668, + "logits/rejected": 12105354.666666666, + "logps/chosen": -570.7707112630209, + "logps/rejected": -636.6070556640625, + "loss": 0.055, + "rewards/chosen": 6.329636891682942, + "rewards/margins": 15.800102233886719, + "rewards/rejected": -9.470465342203775, + "step": 884 + }, + { + "epoch": 0.2425654378511717, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21772342.153846152, + "logits/rejected": -18218276.363636363, + "logps/chosen": -412.3405198317308, + "logps/rejected": -598.7698863636364, + "loss": 0.0186, + "rewards/chosen": 5.983622624323918, + "rewards/margins": 16.012707263439687, + "rewards/rejected": -10.029084639115768, + "step": 885 + }, + { + "epoch": 0.24283952309168153, + "grad_norm": 7.34375, + "kl": 0.8917821645736694, + "learning_rate": 5e-06, + "logits/chosen": -5120956.333333333, + "logits/rejected": -13650992.0, + "logps/chosen": -438.1419677734375, + "logps/rejected": -446.6163736979167, + "loss": 0.0488, + "rewards/chosen": 5.668552398681641, + "rewards/margins": 11.684336980183918, + "rewards/rejected": -6.015784581502278, + "step": 886 + }, + { + "epoch": 0.2431136083321913, + "grad_norm": 6.34375, + "kl": 1.4322662353515625, + "learning_rate": 5e-06, + "logits/chosen": -5776847.05882353, + "logits/rejected": -4654774.285714285, + "logps/chosen": -478.38792509191177, + "logps/rejected": -465.9013671875, + "loss": 0.0343, + "rewards/chosen": 5.947717105641084, + "rewards/margins": 14.71735734699153, + "rewards/rejected": -8.769640241350446, + "step": 887 + }, + { + "epoch": 0.2433876935727011, + "grad_norm": 13.1875, + "kl": 3.878490447998047, + "learning_rate": 5e-06, + "logits/chosen": 1161720.6153846155, + "logits/rejected": -10451024.727272727, + "logps/chosen": -428.4821213942308, + "logps/rejected": -398.30934836647725, + "loss": 0.1065, + "rewards/chosen": 5.836492685171274, + "rewards/margins": 12.316798443560835, + "rewards/rejected": -6.48030575838956, + "step": 888 + }, + { + "epoch": 0.2436617788132109, + "grad_norm": 6.96875, + "kl": 0.18441645801067352, + "learning_rate": 5e-06, + "logits/chosen": 500479.1111111111, + "logits/rejected": 8820059.733333332, + "logps/chosen": -455.00005425347223, + "logps/rejected": -467.6315104166667, + "loss": 0.0403, + "rewards/chosen": 6.93484624226888, + "rewards/margins": 13.043108876546224, + "rewards/rejected": -6.108262634277343, + "step": 889 + }, + { + "epoch": 0.2439358640537207, + "grad_norm": 11.625, + "kl": 13.631585121154785, + "learning_rate": 5e-06, + "logits/chosen": -10566150.0, + "logits/rejected": 2417871.0, + "logps/chosen": -451.55584716796875, + "logps/rejected": -634.0518188476562, + "loss": 0.0753, + "rewards/chosen": 6.989587306976318, + "rewards/margins": 14.261556625366211, + "rewards/rejected": -7.271969318389893, + "step": 890 + }, + { + "epoch": 0.2442099492942305, + "grad_norm": 12.4375, + "kl": 0.5342572927474976, + "learning_rate": 5e-06, + "logits/chosen": 3818223.4285714286, + "logits/rejected": -65124.9, + "logps/chosen": -385.6034458705357, + "logps/rejected": -377.2816650390625, + "loss": 0.0669, + "rewards/chosen": 6.423535483224051, + "rewards/margins": 12.09806627546038, + "rewards/rejected": -5.674530792236328, + "step": 891 + }, + { + "epoch": 0.2444840345347403, + "grad_norm": 5.25, + "kl": 0.5142968893051147, + "learning_rate": 5e-06, + "logits/chosen": -5920954.0, + "logits/rejected": -3485066.3333333335, + "logps/chosen": -471.6937255859375, + "logps/rejected": -383.9448649088542, + "loss": 0.0195, + "rewards/chosen": 6.093478520711263, + "rewards/margins": 12.809310277303059, + "rewards/rejected": -6.715831756591797, + "step": 892 + }, + { + "epoch": 0.2447581197752501, + "grad_norm": 10.3125, + "kl": 11.916531562805176, + "learning_rate": 5e-06, + "logits/chosen": 1576177.6, + "logits/rejected": 7609912.888888889, + "logps/chosen": -514.5260416666666, + "logps/rejected": -483.81488715277777, + "loss": 0.0949, + "rewards/chosen": 6.379988098144532, + "rewards/margins": 14.30866427951389, + "rewards/rejected": -7.928676181369358, + "step": 893 + }, + { + "epoch": 0.2450322050157599, + "grad_norm": 3.984375, + "kl": 4.396402359008789, + "learning_rate": 5e-06, + "logits/chosen": -7265622.4, + "logits/rejected": -7037759.428571428, + "logps/chosen": -438.014599609375, + "logps/rejected": -528.0247628348214, + "loss": 0.015, + "rewards/chosen": 7.045130920410156, + "rewards/margins": 15.603974805559432, + "rewards/rejected": -8.558843885149274, + "step": 894 + }, + { + "epoch": 0.2453062902562697, + "grad_norm": 12.0, + "kl": 7.803592205047607, + "learning_rate": 5e-06, + "logits/chosen": 4820914.8, + "logits/rejected": 331545.25, + "logps/chosen": -404.36357421875, + "logps/rejected": -395.761474609375, + "loss": 0.056, + "rewards/chosen": 5.558138656616211, + "rewards/margins": 11.229707935878208, + "rewards/rejected": -5.671569279261997, + "step": 895 + }, + { + "epoch": 0.2455803754967795, + "grad_norm": 10.75, + "kl": 8.456311225891113, + "learning_rate": 5e-06, + "logits/chosen": 3609885.1764705884, + "logits/rejected": -25494605.714285713, + "logps/chosen": -368.40349264705884, + "logps/rejected": -655.7732979910714, + "loss": 0.1357, + "rewards/chosen": 4.552709691664752, + "rewards/margins": 14.29209393413127, + "rewards/rejected": -9.739384242466517, + "step": 896 + }, + { + "epoch": 0.2458544607372893, + "grad_norm": 11.875, + "kl": 0.22890345752239227, + "learning_rate": 5e-06, + "logits/chosen": 2480576.2, + "logits/rejected": -4218635.428571428, + "logps/chosen": -321.97138671875, + "logps/rejected": -498.01210239955356, + "loss": 0.0737, + "rewards/chosen": 3.866904067993164, + "rewards/margins": 10.917189952305385, + "rewards/rejected": -7.050285884312221, + "step": 897 + }, + { + "epoch": 0.2461285459777991, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 822912.6, + "logits/rejected": -201781.85714285713, + "logps/chosen": -300.694384765625, + "logps/rejected": -484.88295200892856, + "loss": 0.1059, + "rewards/chosen": 3.2070392608642577, + "rewards/margins": 9.053189250401088, + "rewards/rejected": -5.846149989536831, + "step": 898 + }, + { + "epoch": 0.24640263121830888, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2828576.3333333335, + "logits/rejected": -7794964.0, + "logps/chosen": -343.1846110026042, + "logps/rejected": -340.7328287760417, + "loss": 0.0848, + "rewards/chosen": 5.4699045817057295, + "rewards/margins": 9.893821398417156, + "rewards/rejected": -4.423916816711426, + "step": 899 + }, + { + "epoch": 0.2466767164588187, + "grad_norm": 6.625, + "kl": 3.162677764892578, + "learning_rate": 5e-06, + "logits/chosen": -9911075.2, + "logits/rejected": -17805722.666666668, + "logps/chosen": -394.2755859375, + "logps/rejected": -454.52826605902777, + "loss": 0.0339, + "rewards/chosen": 6.296118672688802, + "rewards/margins": 12.12198452419705, + "rewards/rejected": -5.825865851508246, + "step": 900 + }, + { + "epoch": 0.24695080169932848, + "grad_norm": 5.875, + "kl": 3.4440486431121826, + "learning_rate": 5e-06, + "logits/chosen": -11467348.57142857, + "logits/rejected": -14295361.6, + "logps/chosen": -395.06637137276783, + "logps/rejected": -474.60205078125, + "loss": 0.045, + "rewards/chosen": 5.409457615443638, + "rewards/margins": 10.031068638392856, + "rewards/rejected": -4.621611022949219, + "step": 901 + }, + { + "epoch": 0.2472248869398383, + "grad_norm": 3.0, + "kl": 1.7582563161849976, + "learning_rate": 5e-06, + "logits/chosen": -18748304.0, + "logits/rejected": -18660110.85714286, + "logps/chosen": -325.14931640625, + "logps/rejected": -573.3818708147321, + "loss": 0.0216, + "rewards/chosen": 5.821567535400391, + "rewards/margins": 13.930174691336495, + "rewards/rejected": -8.108607155936104, + "step": 902 + }, + { + "epoch": 0.24749897218034808, + "grad_norm": 6.34375, + "kl": 0.4022468030452728, + "learning_rate": 5e-06, + "logits/chosen": -9868430.545454545, + "logits/rejected": 3265392.0, + "logps/chosen": -461.02059659090907, + "logps/rejected": -606.2365534855769, + "loss": 0.0333, + "rewards/chosen": 6.155001553622159, + "rewards/margins": 14.899684452510382, + "rewards/rejected": -8.744682898888222, + "step": 903 + }, + { + "epoch": 0.2477730574208579, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18596872.0, + "logits/rejected": -24515113.6, + "logps/chosen": -373.15157645089283, + "logps/rejected": -443.896533203125, + "loss": 0.0426, + "rewards/chosen": 4.868895394461496, + "rewards/margins": 11.96216321672712, + "rewards/rejected": -7.093267822265625, + "step": 904 + }, + { + "epoch": 0.24804714266136768, + "grad_norm": 9.875, + "kl": 1.2324079275131226, + "learning_rate": 5e-06, + "logits/chosen": -5615712.0, + "logits/rejected": 28971086.769230768, + "logps/chosen": -473.41335227272725, + "logps/rejected": -590.9322415865385, + "loss": 0.028, + "rewards/chosen": 5.248553882945668, + "rewards/margins": 13.586131115893384, + "rewards/rejected": -8.337577232947716, + "step": 905 + }, + { + "epoch": 0.2483212279018775, + "grad_norm": 7.53125, + "kl": 6.313266754150391, + "learning_rate": 5e-06, + "logits/chosen": -4732.846153846154, + "logits/rejected": 20555780.363636363, + "logps/chosen": -383.08882962740387, + "logps/rejected": -378.05282315340907, + "loss": 0.0587, + "rewards/chosen": 5.67813227726863, + "rewards/margins": 10.993088942307693, + "rewards/rejected": -5.3149566650390625, + "step": 906 + }, + { + "epoch": 0.24859531314238728, + "grad_norm": 5.71875, + "kl": 2.7646586894989014, + "learning_rate": 5e-06, + "logits/chosen": 2788652.0, + "logits/rejected": -8502876.666666666, + "logps/chosen": -484.665771484375, + "logps/rejected": -618.5754801432291, + "loss": 0.0373, + "rewards/chosen": 5.906757990519206, + "rewards/margins": 14.281230926513672, + "rewards/rejected": -8.374472935994467, + "step": 907 + }, + { + "epoch": 0.2488693983828971, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6585628.0, + "logits/rejected": 109823945.14285715, + "logps/chosen": -357.176318359375, + "logps/rejected": -491.6630859375, + "loss": 0.0507, + "rewards/chosen": 4.7701984405517575, + "rewards/margins": 12.312666920253207, + "rewards/rejected": -7.5424684797014505, + "step": 908 + }, + { + "epoch": 0.24914348362340688, + "grad_norm": 14.0625, + "kl": 5.3483734130859375, + "learning_rate": 5e-06, + "logits/chosen": -8292202.461538462, + "logits/rejected": -9132279.272727273, + "logps/chosen": -432.00304236778845, + "logps/rejected": -372.68319424715907, + "loss": 0.0859, + "rewards/chosen": 5.593698354867788, + "rewards/margins": 11.776576328944493, + "rewards/rejected": -6.182877974076704, + "step": 909 + }, + { + "epoch": 0.24941756886391667, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17409973.333333332, + "logits/rejected": -12914493.866666667, + "logps/chosen": -527.0060763888889, + "logps/rejected": -535.7731770833333, + "loss": 0.0313, + "rewards/chosen": 7.964708116319445, + "rewards/margins": 14.871660529242622, + "rewards/rejected": -6.906952412923177, + "step": 910 + }, + { + "epoch": 0.24969165410442648, + "grad_norm": 13.8125, + "kl": 4.882532119750977, + "learning_rate": 5e-06, + "logits/chosen": -18888985.14285714, + "logits/rejected": 30271404.8, + "logps/chosen": -495.67372349330356, + "logps/rejected": -543.234814453125, + "loss": 0.0408, + "rewards/chosen": 6.301877702985491, + "rewards/margins": 13.817504991803851, + "rewards/rejected": -7.51562728881836, + "step": 911 + }, + { + "epoch": 0.24996573934493627, + "grad_norm": 8.3125, + "kl": 6.9503045082092285, + "learning_rate": 5e-06, + "logits/chosen": -10076728.0, + "logits/rejected": -13727842.0, + "logps/chosen": -463.40985107421875, + "logps/rejected": -469.47760009765625, + "loss": 0.0726, + "rewards/chosen": 5.730602264404297, + "rewards/margins": 12.40587854385376, + "rewards/rejected": -6.675276279449463, + "step": 912 + }, + { + "epoch": 0.2502398245854461, + "grad_norm": 1.4609375, + "kl": 0.29437255859375, + "learning_rate": 5e-06, + "logits/chosen": 6196024.0, + "logits/rejected": -9337963.333333334, + "logps/chosen": -436.1181640625, + "logps/rejected": -603.20166015625, + "loss": 0.0051, + "rewards/chosen": 5.8333485921223955, + "rewards/margins": 13.59798494974772, + "rewards/rejected": -7.764636357625325, + "step": 913 + }, + { + "epoch": 0.2505139098259559, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 53867667.2, + "logits/rejected": -8786059.42857143, + "logps/chosen": -465.022607421875, + "logps/rejected": -433.970458984375, + "loss": 0.0464, + "rewards/chosen": 6.6525115966796875, + "rewards/margins": 14.307448250906809, + "rewards/rejected": -7.654936654227121, + "step": 914 + }, + { + "epoch": 0.25078799506646565, + "grad_norm": 6.84375, + "kl": 3.7618823051452637, + "learning_rate": 5e-06, + "logits/chosen": -13159069.333333334, + "logits/rejected": -22417904.0, + "logps/chosen": -524.9901936848959, + "logps/rejected": -395.2134195963542, + "loss": 0.018, + "rewards/chosen": 7.9787336985270185, + "rewards/margins": 14.535828272501629, + "rewards/rejected": -6.557094573974609, + "step": 915 + }, + { + "epoch": 0.25106208030697547, + "grad_norm": 8.4375, + "kl": 7.156263828277588, + "learning_rate": 5e-06, + "logits/chosen": -17660606.0, + "logits/rejected": -11516666.0, + "logps/chosen": -346.79144287109375, + "logps/rejected": -533.2935791015625, + "loss": 0.0556, + "rewards/chosen": 5.673281192779541, + "rewards/margins": 11.974663734436035, + "rewards/rejected": -6.301382541656494, + "step": 916 + }, + { + "epoch": 0.2513361655474853, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4414292.8, + "logits/rejected": -22099988.57142857, + "logps/chosen": -433.259765625, + "logps/rejected": -427.9249790736607, + "loss": 0.0548, + "rewards/chosen": 4.791815185546875, + "rewards/margins": 12.418247331891742, + "rewards/rejected": -7.626432146344866, + "step": 917 + }, + { + "epoch": 0.2516102507879951, + "grad_norm": 11.75, + "kl": 0.24113211035728455, + "learning_rate": 5e-06, + "logits/chosen": -37867.73333333333, + "logits/rejected": 96504632.8888889, + "logps/chosen": -385.068359375, + "logps/rejected": -564.9070638020834, + "loss": 0.0602, + "rewards/chosen": 6.2832275390625, + "rewards/margins": 15.637029690212673, + "rewards/rejected": -9.353802151150173, + "step": 918 + }, + { + "epoch": 0.25188433602850485, + "grad_norm": 8.875, + "kl": 1.7308566570281982, + "learning_rate": 5e-06, + "logits/chosen": -7336364.307692308, + "logits/rejected": 86564986.18181819, + "logps/chosen": -475.1014873798077, + "logps/rejected": -531.0253018465909, + "loss": 0.0681, + "rewards/chosen": 4.829886803260217, + "rewards/margins": 14.628054638842602, + "rewards/rejected": -9.798167835582387, + "step": 919 + }, + { + "epoch": 0.25215842126901467, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7530956.666666667, + "logits/rejected": -12078447.111111112, + "logps/chosen": -327.9090576171875, + "logps/rejected": -370.92637803819446, + "loss": 0.03, + "rewards/chosen": 6.435161590576172, + "rewards/margins": 12.497562408447266, + "rewards/rejected": -6.062400817871094, + "step": 920 + }, + { + "epoch": 0.2524325065095245, + "grad_norm": 11.4375, + "kl": 4.517287254333496, + "learning_rate": 5e-06, + "logits/chosen": -9347672.666666666, + "logits/rejected": -2737953.3333333335, + "logps/chosen": -427.499267578125, + "logps/rejected": -537.3381754557291, + "loss": 0.0547, + "rewards/chosen": 5.07589594523112, + "rewards/margins": 11.91788164774577, + "rewards/rejected": -6.841985702514648, + "step": 921 + }, + { + "epoch": 0.25270659175003424, + "grad_norm": 12.9375, + "kl": 8.73958683013916, + "learning_rate": 5e-06, + "logits/chosen": -6200136.0, + "logits/rejected": -10963647.2, + "logps/chosen": -486.68648856026783, + "logps/rejected": -620.94697265625, + "loss": 0.061, + "rewards/chosen": 6.070579528808594, + "rewards/margins": 14.079050445556641, + "rewards/rejected": -8.008470916748047, + "step": 922 + }, + { + "epoch": 0.25298067699054405, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14990013.090909092, + "logits/rejected": -11253555.692307692, + "logps/chosen": -421.90580610795456, + "logps/rejected": -401.7461688701923, + "loss": 0.0561, + "rewards/chosen": 6.623251481489702, + "rewards/margins": 12.239715362762238, + "rewards/rejected": -5.616463881272536, + "step": 923 + }, + { + "epoch": 0.25325476223105386, + "grad_norm": 2.1875, + "kl": 2.2463645935058594, + "learning_rate": 5e-06, + "logits/chosen": -13219335.466666667, + "logits/rejected": -21139651.555555556, + "logps/chosen": -407.77161458333336, + "logps/rejected": -457.65668402777777, + "loss": 0.011, + "rewards/chosen": 5.629483032226562, + "rewards/margins": 12.574112277560765, + "rewards/rejected": -6.944629245334202, + "step": 924 + }, + { + "epoch": 0.2535288474715637, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13847524.0, + "logits/rejected": -6613991.0, + "logps/chosen": -492.1877136230469, + "logps/rejected": -429.9212646484375, + "loss": 0.0627, + "rewards/chosen": 4.666096210479736, + "rewards/margins": 10.960054874420166, + "rewards/rejected": -6.29395866394043, + "step": 925 + }, + { + "epoch": 0.25380293271207344, + "grad_norm": 4.5, + "kl": 0.01488494873046875, + "learning_rate": 5e-06, + "logits/chosen": -10508772.0, + "logits/rejected": 22788652.0, + "logps/chosen": -442.5178527832031, + "logps/rejected": -452.7877197265625, + "loss": 0.0151, + "rewards/chosen": 6.099307060241699, + "rewards/margins": 14.387831687927246, + "rewards/rejected": -8.288524627685547, + "step": 926 + }, + { + "epoch": 0.25407701795258325, + "grad_norm": 7.0, + "kl": 0.38301214575767517, + "learning_rate": 5e-06, + "logits/chosen": 76005767.1111111, + "logits/rejected": -2757578.6666666665, + "logps/chosen": -529.3077256944445, + "logps/rejected": -470.68916015625, + "loss": 0.0298, + "rewards/chosen": 6.033032735188802, + "rewards/margins": 13.695370992024738, + "rewards/rejected": -7.6623382568359375, + "step": 927 + }, + { + "epoch": 0.25435110319309306, + "grad_norm": 7.65625, + "kl": 12.514925956726074, + "learning_rate": 5e-06, + "logits/chosen": -13094809.6, + "logits/rejected": 21575957.333333332, + "logps/chosen": -452.8899739583333, + "logps/rejected": -416.86382378472223, + "loss": 0.0373, + "rewards/chosen": 7.530692036946615, + "rewards/margins": 12.830259365505643, + "rewards/rejected": -5.299567328559028, + "step": 928 + }, + { + "epoch": 0.2546251884336029, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17092971.636363637, + "logits/rejected": -6762188.923076923, + "logps/chosen": -498.63045987215907, + "logps/rejected": -508.0510817307692, + "loss": 0.0421, + "rewards/chosen": 6.120948097922585, + "rewards/margins": 13.715166425371503, + "rewards/rejected": -7.594218327448918, + "step": 929 + }, + { + "epoch": 0.25489927367411264, + "grad_norm": 14.8125, + "kl": 13.12739086151123, + "learning_rate": 5e-06, + "logits/chosen": -7100736.470588235, + "logits/rejected": -19221458.285714287, + "logps/chosen": -402.6879308363971, + "logps/rejected": -338.19754464285717, + "loss": 0.0843, + "rewards/chosen": 5.8213635612936585, + "rewards/margins": 13.522131078383502, + "rewards/rejected": -7.700767517089844, + "step": 930 + }, + { + "epoch": 0.25517335891462245, + "grad_norm": 8.0625, + "kl": 1.4539146423339844, + "learning_rate": 5e-06, + "logits/chosen": -13405831.384615384, + "logits/rejected": -1545909.8181818181, + "logps/chosen": -439.3039738581731, + "logps/rejected": -433.34419389204544, + "loss": 0.0452, + "rewards/chosen": 5.321765606219952, + "rewards/margins": 11.661070443533518, + "rewards/rejected": -6.339304837313565, + "step": 931 + }, + { + "epoch": 0.25544744415513226, + "grad_norm": 10.0625, + "kl": 3.399259090423584, + "learning_rate": 5e-06, + "logits/chosen": -8413127.272727273, + "logits/rejected": -27279332.923076924, + "logps/chosen": -391.43581321022725, + "logps/rejected": -502.1301457331731, + "loss": 0.0445, + "rewards/chosen": 6.071232188831676, + "rewards/margins": 12.855521035361123, + "rewards/rejected": -6.7842888465294475, + "step": 932 + }, + { + "epoch": 0.255721529395642, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2944584.3333333335, + "logits/rejected": 7941820.0, + "logps/chosen": -428.3954264322917, + "logps/rejected": -517.614501953125, + "loss": 0.0097, + "rewards/chosen": 7.481566111246745, + "rewards/margins": 15.449105580647787, + "rewards/rejected": -7.967539469401042, + "step": 933 + }, + { + "epoch": 0.25599561463615184, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19035586.666666668, + "logits/rejected": -23632624.0, + "logps/chosen": -423.1187337239583, + "logps/rejected": -536.0687255859375, + "loss": 0.0633, + "rewards/chosen": 4.428627014160156, + "rewards/margins": 13.217323303222656, + "rewards/rejected": -8.7886962890625, + "step": 934 + }, + { + "epoch": 0.25626969987666165, + "grad_norm": 9.0, + "kl": 4.850799560546875, + "learning_rate": 5e-06, + "logits/chosen": -26773099.2, + "logits/rejected": 957733.5714285715, + "logps/chosen": -374.823876953125, + "logps/rejected": -399.3833705357143, + "loss": 0.0815, + "rewards/chosen": 4.907474517822266, + "rewards/margins": 9.542278289794922, + "rewards/rejected": -4.634803771972656, + "step": 935 + }, + { + "epoch": 0.25654378511717146, + "grad_norm": 10.125, + "kl": 1.4296506643295288, + "learning_rate": 5e-06, + "logits/chosen": 7132537.333333333, + "logits/rejected": -16720717.333333334, + "logps/chosen": -461.3999430338542, + "logps/rejected": -410.0968017578125, + "loss": 0.0239, + "rewards/chosen": 7.1636098225911455, + "rewards/margins": 13.945627212524414, + "rewards/rejected": -6.7820173899332685, + "step": 936 + }, + { + "epoch": 0.2568178703576812, + "grad_norm": 5.78125, + "kl": 0.2263285368680954, + "learning_rate": 5e-06, + "logits/chosen": -26801489.454545453, + "logits/rejected": -10626057.846153846, + "logps/chosen": -473.2156427556818, + "logps/rejected": -485.4734074519231, + "loss": 0.0334, + "rewards/chosen": 6.293356461958452, + "rewards/margins": 12.83051689521416, + "rewards/rejected": -6.537160433255709, + "step": 937 + }, + { + "epoch": 0.25709195559819104, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25814518.85714286, + "logits/rejected": 16868707.76470588, + "logps/chosen": -533.4038434709821, + "logps/rejected": -456.38634535845586, + "loss": 0.0855, + "rewards/chosen": 7.523799896240234, + "rewards/margins": 12.844219207763672, + "rewards/rejected": -5.3204193115234375, + "step": 938 + }, + { + "epoch": 0.25736604083870085, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21786222.545454547, + "logits/rejected": 8213337.230769231, + "logps/chosen": -404.57803622159093, + "logps/rejected": -502.85013521634613, + "loss": 0.0426, + "rewards/chosen": 6.61287411776456, + "rewards/margins": 15.423713844139257, + "rewards/rejected": -8.810839726374699, + "step": 939 + }, + { + "epoch": 0.2576401260792106, + "grad_norm": 11.5625, + "kl": 1.5645307302474976, + "learning_rate": 5e-06, + "logits/chosen": 2765695.3846153845, + "logits/rejected": 14542126.545454545, + "logps/chosen": -408.25863882211536, + "logps/rejected": -648.1149680397727, + "loss": 0.0527, + "rewards/chosen": 4.792485750638521, + "rewards/margins": 16.15737402855933, + "rewards/rejected": -11.36488827792081, + "step": 940 + }, + { + "epoch": 0.2579142113197204, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9941264.0, + "logits/rejected": -10818266.0, + "logps/chosen": -357.4606119791667, + "logps/rejected": -486.0686442057292, + "loss": 0.036, + "rewards/chosen": 5.202293395996094, + "rewards/margins": 13.249788284301758, + "rewards/rejected": -8.047494888305664, + "step": 941 + }, + { + "epoch": 0.25818829656023023, + "grad_norm": 8.1875, + "kl": 2.035142421722412, + "learning_rate": 5e-06, + "logits/chosen": -10975740.0, + "logits/rejected": -948173.4285714285, + "logps/chosen": -412.54267578125, + "logps/rejected": -449.9885951450893, + "loss": 0.0368, + "rewards/chosen": 6.060214996337891, + "rewards/margins": 12.400022779192243, + "rewards/rejected": -6.339807782854352, + "step": 942 + }, + { + "epoch": 0.25846238180074005, + "grad_norm": 4.6875, + "kl": 5.574146270751953, + "learning_rate": 5e-06, + "logits/chosen": -5063740.0, + "logits/rejected": 17181104.0, + "logps/chosen": -451.06549072265625, + "logps/rejected": -635.756591796875, + "loss": 0.0191, + "rewards/chosen": 5.601768493652344, + "rewards/margins": 16.67848300933838, + "rewards/rejected": -11.076714515686035, + "step": 943 + }, + { + "epoch": 0.2587364670412498, + "grad_norm": 10.5, + "kl": 0.9868850708007812, + "learning_rate": 5e-06, + "logits/chosen": 3458957.5384615385, + "logits/rejected": 23426443.636363637, + "logps/chosen": -466.24181189903845, + "logps/rejected": -457.88960404829544, + "loss": 0.0445, + "rewards/chosen": 5.7142486572265625, + "rewards/margins": 11.384352597323332, + "rewards/rejected": -5.6701039400967685, + "step": 944 + }, + { + "epoch": 0.2590105522817596, + "grad_norm": 3.578125, + "kl": 1.4286088943481445, + "learning_rate": 5e-06, + "logits/chosen": -15297911.272727273, + "logits/rejected": -21364731.076923076, + "logps/chosen": -427.86075106534093, + "logps/rejected": -445.92931189903845, + "loss": 0.0139, + "rewards/chosen": 6.472465515136719, + "rewards/margins": 14.033012390136719, + "rewards/rejected": -7.560546875, + "step": 945 + }, + { + "epoch": 0.25928463752226943, + "grad_norm": 14.8125, + "kl": 9.275156021118164, + "learning_rate": 5e-06, + "logits/chosen": -17854344.0, + "logits/rejected": -8699460.8, + "logps/chosen": -442.778564453125, + "logps/rejected": -437.7201171875, + "loss": 0.1009, + "rewards/chosen": 6.077787126813616, + "rewards/margins": 12.359316362653459, + "rewards/rejected": -6.281529235839844, + "step": 946 + }, + { + "epoch": 0.25955872276277925, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3239731.5555555555, + "logits/rejected": -11263851.733333332, + "logps/chosen": -307.06377495659723, + "logps/rejected": -653.6268880208333, + "loss": 0.0812, + "rewards/chosen": 5.745368957519531, + "rewards/margins": 18.10077667236328, + "rewards/rejected": -12.35540771484375, + "step": 947 + }, + { + "epoch": 0.259832808003289, + "grad_norm": 10.1875, + "kl": 4.156101226806641, + "learning_rate": 5e-06, + "logits/chosen": -5595590.666666667, + "logits/rejected": -6885563.333333333, + "logps/chosen": -435.846435546875, + "logps/rejected": -295.63063557942706, + "loss": 0.0498, + "rewards/chosen": 5.35194714864095, + "rewards/margins": 10.460397402445475, + "rewards/rejected": -5.108450253804524, + "step": 948 + }, + { + "epoch": 0.2601068932437988, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18743166.545454547, + "logits/rejected": 18522653.53846154, + "logps/chosen": -437.55317826704544, + "logps/rejected": -432.8920147235577, + "loss": 0.0446, + "rewards/chosen": 5.558706456964666, + "rewards/margins": 11.530129679433117, + "rewards/rejected": -5.97142322246845, + "step": 949 + }, + { + "epoch": 0.26038097848430863, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17414065.6, + "logits/rejected": 41745490.28571428, + "logps/chosen": -442.70380859375, + "logps/rejected": -640.3610491071429, + "loss": 0.0177, + "rewards/chosen": 6.492620849609375, + "rewards/margins": 14.203569248744419, + "rewards/rejected": -7.710948399135044, + "step": 950 + }, + { + "epoch": 0.2606550637248184, + "grad_norm": 8.0, + "kl": 9.705123901367188, + "learning_rate": 5e-06, + "logits/chosen": -21389548.307692308, + "logits/rejected": -4786125.090909091, + "logps/chosen": -479.0183293269231, + "logps/rejected": -502.0965021306818, + "loss": 0.0744, + "rewards/chosen": 5.6826031024639425, + "rewards/margins": 12.391062436403928, + "rewards/rejected": -6.708459333939985, + "step": 951 + }, + { + "epoch": 0.2609291489653282, + "grad_norm": 6.5, + "kl": 1.7347755432128906, + "learning_rate": 5e-06, + "logits/chosen": -24822039.272727273, + "logits/rejected": -21956561.230769232, + "logps/chosen": -503.6298828125, + "logps/rejected": -387.9125225360577, + "loss": 0.0408, + "rewards/chosen": 6.578715931285512, + "rewards/margins": 12.98420886059741, + "rewards/rejected": -6.405492929311899, + "step": 952 + }, + { + "epoch": 0.261203234205838, + "grad_norm": 4.0625, + "kl": 5.236913204193115, + "learning_rate": 5e-06, + "logits/chosen": -23705184.0, + "logits/rejected": -2028384.888888889, + "logps/chosen": -360.6419921875, + "logps/rejected": -516.6667751736111, + "loss": 0.0158, + "rewards/chosen": 5.900875854492187, + "rewards/margins": 12.827637905544705, + "rewards/rejected": -6.926762051052517, + "step": 953 + }, + { + "epoch": 0.26147731944634783, + "grad_norm": 8.6875, + "kl": 0.9404615163803101, + "learning_rate": 5e-06, + "logits/chosen": -18541736.615384616, + "logits/rejected": -10328807.272727273, + "logps/chosen": -398.70229867788464, + "logps/rejected": -507.57759232954544, + "loss": 0.0481, + "rewards/chosen": 6.301127507136418, + "rewards/margins": 14.371227771252187, + "rewards/rejected": -8.070100264115768, + "step": 954 + }, + { + "epoch": 0.2617514046868576, + "grad_norm": 7.875, + "kl": 0.39804649353027344, + "learning_rate": 5e-06, + "logits/chosen": 13279889.6, + "logits/rejected": -24020688.0, + "logps/chosen": -401.1297607421875, + "logps/rejected": -577.0579659598214, + "loss": 0.0441, + "rewards/chosen": 4.8051410675048825, + "rewards/margins": 13.83436655317034, + "rewards/rejected": -9.029225485665458, + "step": 955 + }, + { + "epoch": 0.2620254899273674, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3189024.0, + "logits/rejected": -25619140.57142857, + "logps/chosen": -437.06552734375, + "logps/rejected": -421.9520786830357, + "loss": 0.0253, + "rewards/chosen": 6.554267883300781, + "rewards/margins": 15.914881025041852, + "rewards/rejected": -9.360613141741071, + "step": 956 + }, + { + "epoch": 0.2622995751678772, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3373718.6666666665, + "logits/rejected": -26068816.0, + "logps/chosen": -416.0738932291667, + "logps/rejected": -521.2947591145834, + "loss": 0.0347, + "rewards/chosen": 5.346377690633138, + "rewards/margins": 14.520790735880535, + "rewards/rejected": -9.174413045247396, + "step": 957 + }, + { + "epoch": 0.26257366040838703, + "grad_norm": 9.375, + "kl": 2.0879924297332764, + "learning_rate": 5e-06, + "logits/chosen": -12908710.153846154, + "logits/rejected": -8553201.454545455, + "logps/chosen": -473.35494290865387, + "logps/rejected": -459.91020063920456, + "loss": 0.0334, + "rewards/chosen": 6.431424654447115, + "rewards/margins": 13.793816733193564, + "rewards/rejected": -7.362392078746449, + "step": 958 + }, + { + "epoch": 0.2628477456488968, + "grad_norm": 2.296875, + "kl": 2.938140869140625, + "learning_rate": 5e-06, + "logits/chosen": -10537907.2, + "logits/rejected": -11319142.222222222, + "logps/chosen": -489.4039713541667, + "logps/rejected": -541.1126844618055, + "loss": 0.0102, + "rewards/chosen": 6.293031311035156, + "rewards/margins": 15.245240783691406, + "rewards/rejected": -8.95220947265625, + "step": 959 + }, + { + "epoch": 0.2631218308894066, + "grad_norm": 9.5, + "kl": 5.80673885345459, + "learning_rate": 5e-06, + "logits/chosen": -5391376.94117647, + "logits/rejected": 1361037.857142857, + "logps/chosen": -493.67176011029414, + "logps/rejected": -582.36767578125, + "loss": 0.0504, + "rewards/chosen": 5.294521107393153, + "rewards/margins": 14.139217697271778, + "rewards/rejected": -8.844696589878627, + "step": 960 + }, + { + "epoch": 0.2633959161299164, + "grad_norm": 4.21875, + "kl": 5.678752899169922, + "learning_rate": 5e-06, + "logits/chosen": -4585215.2, + "logits/rejected": -18316629.714285713, + "logps/chosen": -419.330859375, + "logps/rejected": -368.00830078125, + "loss": 0.0212, + "rewards/chosen": 5.6047416687011715, + "rewards/margins": 11.84775913783482, + "rewards/rejected": -6.243017469133649, + "step": 961 + }, + { + "epoch": 0.2636700013704262, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11604454.666666666, + "logits/rejected": -426368.0, + "logps/chosen": -365.679443359375, + "logps/rejected": -419.4940185546875, + "loss": 0.0922, + "rewards/chosen": 3.6728665033976235, + "rewards/margins": 11.11265786488851, + "rewards/rejected": -7.439791361490886, + "step": 962 + }, + { + "epoch": 0.263944086610936, + "grad_norm": 3.4375, + "kl": 1.3374608755111694, + "learning_rate": 5e-06, + "logits/chosen": -17885184.0, + "logits/rejected": 32626876.0, + "logps/chosen": -398.31536865234375, + "logps/rejected": -672.5433349609375, + "loss": 0.0106, + "rewards/chosen": 5.379430294036865, + "rewards/margins": 15.035218715667725, + "rewards/rejected": -9.65578842163086, + "step": 963 + }, + { + "epoch": 0.2642181718514458, + "grad_norm": 10.5625, + "kl": 0.6282132863998413, + "learning_rate": 5e-06, + "logits/chosen": -7376353.230769231, + "logits/rejected": 20584650.181818184, + "logps/chosen": -288.2746769831731, + "logps/rejected": -564.7292258522727, + "loss": 0.0494, + "rewards/chosen": 5.300265972430889, + "rewards/margins": 11.28464929540674, + "rewards/rejected": -5.9843833229758525, + "step": 964 + }, + { + "epoch": 0.2644922570919556, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37104252.0, + "logits/rejected": -28856902.0, + "logps/chosen": -463.9609680175781, + "logps/rejected": -454.84942626953125, + "loss": 0.0458, + "rewards/chosen": 6.546067714691162, + "rewards/margins": 13.237388134002686, + "rewards/rejected": -6.691320419311523, + "step": 965 + }, + { + "epoch": 0.2647663423324654, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 20605065.333333332, + "logits/rejected": -4952044.666666667, + "logps/chosen": -372.472900390625, + "logps/rejected": -479.3736979166667, + "loss": 0.0289, + "rewards/chosen": 4.921517690022786, + "rewards/margins": 14.148375193277996, + "rewards/rejected": -9.226857503255209, + "step": 966 + }, + { + "epoch": 0.2650404275729752, + "grad_norm": 11.75, + "kl": 2.2142951488494873, + "learning_rate": 5e-06, + "logits/chosen": 8088939.692307692, + "logits/rejected": -7275341.090909091, + "logps/chosen": -488.35096153846155, + "logps/rejected": -409.36328125, + "loss": 0.0469, + "rewards/chosen": 5.299001253568209, + "rewards/margins": 12.64659022618007, + "rewards/rejected": -7.34758897261186, + "step": 967 + }, + { + "epoch": 0.265314512813485, + "grad_norm": 9.25, + "kl": 2.3059210777282715, + "learning_rate": 5e-06, + "logits/chosen": -33436126.0, + "logits/rejected": -13831083.0, + "logps/chosen": -371.44189453125, + "logps/rejected": -502.6777038574219, + "loss": 0.0398, + "rewards/chosen": 5.133700370788574, + "rewards/margins": 12.68954610824585, + "rewards/rejected": -7.555845737457275, + "step": 968 + }, + { + "epoch": 0.2655885980539948, + "grad_norm": 8.5, + "kl": 1.1195144653320312, + "learning_rate": 5e-06, + "logits/chosen": -45799740.44444445, + "logits/rejected": -22784825.6, + "logps/chosen": -475.7770182291667, + "logps/rejected": -551.4458333333333, + "loss": 0.023, + "rewards/chosen": 6.021810743543837, + "rewards/margins": 14.768728468153212, + "rewards/rejected": -8.746917724609375, + "step": 969 + }, + { + "epoch": 0.2658626832945046, + "grad_norm": 3.484375, + "kl": 1.4408175945281982, + "learning_rate": 5e-06, + "logits/chosen": -13356653.090909092, + "logits/rejected": -16705595.076923076, + "logps/chosen": -428.93887606534093, + "logps/rejected": -451.0604717548077, + "loss": 0.0155, + "rewards/chosen": 6.848553744229403, + "rewards/margins": 14.518089454490822, + "rewards/rejected": -7.669535710261418, + "step": 970 + }, + { + "epoch": 0.2661367685350144, + "grad_norm": 9.375, + "kl": 1.857261061668396, + "learning_rate": 5e-06, + "logits/chosen": -21292941.714285713, + "logits/rejected": -29703740.8, + "logps/chosen": -485.36495535714283, + "logps/rejected": -519.17587890625, + "loss": 0.0344, + "rewards/chosen": 6.820086342947824, + "rewards/margins": 15.830750710623605, + "rewards/rejected": -9.010664367675782, + "step": 971 + }, + { + "epoch": 0.2664108537755242, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8251118.4, + "logits/rejected": -28313005.714285713, + "logps/chosen": -460.565966796875, + "logps/rejected": -516.0386788504464, + "loss": 0.0352, + "rewards/chosen": 5.210797882080078, + "rewards/margins": 13.488185228620257, + "rewards/rejected": -8.277387346540179, + "step": 972 + }, + { + "epoch": 0.26668493901603396, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3071187.6666666665, + "logits/rejected": 1678408.6666666667, + "logps/chosen": -376.8160400390625, + "logps/rejected": -677.2494303385416, + "loss": 0.0959, + "rewards/chosen": 4.310688018798828, + "rewards/margins": 14.445522944132486, + "rewards/rejected": -10.134834925333658, + "step": 973 + }, + { + "epoch": 0.2669590242565438, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10560091.636363637, + "logits/rejected": -887594.1538461539, + "logps/chosen": -397.77450284090907, + "logps/rejected": -511.9929387019231, + "loss": 0.0818, + "rewards/chosen": 5.70974384654652, + "rewards/margins": 12.299604029088588, + "rewards/rejected": -6.5898601825420675, + "step": 974 + }, + { + "epoch": 0.2672331094970536, + "grad_norm": 4.53125, + "kl": 0.11975988000631332, + "learning_rate": 5e-06, + "logits/chosen": -24671945.14285714, + "logits/rejected": -14682316.8, + "logps/chosen": -442.70082310267856, + "logps/rejected": -454.61201171875, + "loss": 0.014, + "rewards/chosen": 5.8400726318359375, + "rewards/margins": 13.801502990722657, + "rewards/rejected": -7.961430358886719, + "step": 975 + }, + { + "epoch": 0.2675071947375634, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2447245.3333333335, + "logits/rejected": -18352693.333333332, + "logps/chosen": -418.7146809895833, + "logps/rejected": -451.53033854166665, + "loss": 0.019, + "rewards/chosen": 5.473955790201823, + "rewards/margins": 12.943558247884114, + "rewards/rejected": -7.469602457682291, + "step": 976 + }, + { + "epoch": 0.26778127997807316, + "grad_norm": 5.46875, + "kl": 1.3988406658172607, + "learning_rate": 5e-06, + "logits/chosen": -33867656.72727273, + "logits/rejected": -7088601.846153846, + "logps/chosen": -451.82492897727275, + "logps/rejected": -462.0129957932692, + "loss": 0.0326, + "rewards/chosen": 6.2246315696022725, + "rewards/margins": 12.480298449109483, + "rewards/rejected": -6.255666879507212, + "step": 977 + }, + { + "epoch": 0.268055365218583, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27969584.0, + "logits/rejected": 1814046.3333333333, + "logps/chosen": -499.2157389322917, + "logps/rejected": -465.3012288411458, + "loss": 0.0118, + "rewards/chosen": 7.1764787038167315, + "rewards/margins": 16.1704896291097, + "rewards/rejected": -8.994010925292969, + "step": 978 + }, + { + "epoch": 0.2683294504590928, + "grad_norm": 10.8125, + "kl": 11.131546974182129, + "learning_rate": 5e-06, + "logits/chosen": -13004112.0, + "logits/rejected": -2973310.8, + "logps/chosen": -424.23158482142856, + "logps/rejected": -426.372119140625, + "loss": 0.0584, + "rewards/chosen": 5.962975093296596, + "rewards/margins": 11.274077769688198, + "rewards/rejected": -5.311102676391601, + "step": 979 + }, + { + "epoch": 0.2686035356996026, + "grad_norm": 5.25, + "kl": 3.9341049194335938, + "learning_rate": 5e-06, + "logits/chosen": -19889006.933333334, + "logits/rejected": -13125095.111111112, + "logps/chosen": -362.65358072916666, + "logps/rejected": -715.7421875, + "loss": 0.0841, + "rewards/chosen": 4.897745259602865, + "rewards/margins": 15.166280449761285, + "rewards/rejected": -10.26853519015842, + "step": 980 + }, + { + "epoch": 0.26887762094011236, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3219412.8571428573, + "logits/rejected": -11993333.6, + "logps/chosen": -423.2242954799107, + "logps/rejected": -354.755419921875, + "loss": 0.0063, + "rewards/chosen": 5.759278433663504, + "rewards/margins": 12.85308336530413, + "rewards/rejected": -7.093804931640625, + "step": 981 + }, + { + "epoch": 0.2691517061806222, + "grad_norm": 11.0625, + "kl": 2.3579013347625732, + "learning_rate": 5e-06, + "logits/chosen": -26604315.42857143, + "logits/rejected": 19238147.2, + "logps/chosen": -442.20486886160717, + "logps/rejected": -653.826904296875, + "loss": 0.0341, + "rewards/chosen": 6.585381099155971, + "rewards/margins": 20.38740441458566, + "rewards/rejected": -13.802023315429688, + "step": 982 + }, + { + "epoch": 0.269425791421132, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30009024.0, + "logits/rejected": 25519003.2, + "logps/chosen": -419.47781808035717, + "logps/rejected": -632.65966796875, + "loss": 0.0277, + "rewards/chosen": 5.76971435546875, + "rewards/margins": 15.481197357177734, + "rewards/rejected": -9.711483001708984, + "step": 983 + }, + { + "epoch": 0.26969987666164175, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29348556.8, + "logits/rejected": -10097856.0, + "logps/chosen": -474.005908203125, + "logps/rejected": -583.9505440848214, + "loss": 0.0264, + "rewards/chosen": 5.45623550415039, + "rewards/margins": 14.396884264264788, + "rewards/rejected": -8.940648760114398, + "step": 984 + }, + { + "epoch": 0.26997396190215156, + "grad_norm": 10.8125, + "kl": 4.431816101074219, + "learning_rate": 5e-06, + "logits/chosen": -12552480.0, + "logits/rejected": -10850514.285714285, + "logps/chosen": -354.415380859375, + "logps/rejected": -665.3643275669643, + "loss": 0.0628, + "rewards/chosen": 4.3610382080078125, + "rewards/margins": 13.503692626953125, + "rewards/rejected": -9.142654418945312, + "step": 985 + }, + { + "epoch": 0.2702480471426614, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19739834.0, + "logits/rejected": -22817160.0, + "logps/chosen": -425.3714599609375, + "logps/rejected": -595.1431884765625, + "loss": 0.0471, + "rewards/chosen": 4.480156421661377, + "rewards/margins": 16.289021968841553, + "rewards/rejected": -11.808865547180176, + "step": 986 + }, + { + "epoch": 0.2705221323831712, + "grad_norm": 8.3125, + "kl": 1.653464674949646, + "learning_rate": 5e-06, + "logits/chosen": -33412304.0, + "logits/rejected": 6628528.0, + "logps/chosen": -579.78076171875, + "logps/rejected": -354.072021484375, + "loss": 0.0376, + "rewards/chosen": 6.098360061645508, + "rewards/margins": 12.937888463338215, + "rewards/rejected": -6.839528401692708, + "step": 987 + }, + { + "epoch": 0.27079621762368095, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14282215.384615384, + "logits/rejected": -11826376.727272727, + "logps/chosen": -380.7237079326923, + "logps/rejected": -523.0771484375, + "loss": 0.0748, + "rewards/chosen": 4.104017404409555, + "rewards/margins": 14.942578028965663, + "rewards/rejected": -10.838560624556107, + "step": 988 + }, + { + "epoch": 0.27107030286419076, + "grad_norm": 7.09375, + "kl": 5.267779350280762, + "learning_rate": 5e-06, + "logits/chosen": -21226935.466666665, + "logits/rejected": -20124282.666666668, + "logps/chosen": -548.986328125, + "logps/rejected": -367.38921440972223, + "loss": 0.0536, + "rewards/chosen": 7.164665730794271, + "rewards/margins": 14.685030280219184, + "rewards/rejected": -7.520364549424913, + "step": 989 + }, + { + "epoch": 0.2713443881047006, + "grad_norm": 7.40625, + "kl": 0.8214542269706726, + "learning_rate": 5e-06, + "logits/chosen": -22585513.846153848, + "logits/rejected": -28622507.636363637, + "logps/chosen": -510.9357346754808, + "logps/rejected": -514.99609375, + "loss": 0.036, + "rewards/chosen": 6.538578913762019, + "rewards/margins": 18.513403298971536, + "rewards/rejected": -11.974824385209518, + "step": 990 + }, + { + "epoch": 0.2716184733452104, + "grad_norm": 6.59375, + "kl": 1.976820945739746, + "learning_rate": 5e-06, + "logits/chosen": -23198592.0, + "logits/rejected": -19692679.384615384, + "logps/chosen": -360.877197265625, + "logps/rejected": -383.13724459134613, + "loss": 0.0562, + "rewards/chosen": 5.607992345636541, + "rewards/margins": 11.706487829034979, + "rewards/rejected": -6.0984954833984375, + "step": 991 + }, + { + "epoch": 0.27189255858572015, + "grad_norm": 11.3125, + "kl": 5.621987819671631, + "learning_rate": 5e-06, + "logits/chosen": -22548907.42857143, + "logits/rejected": 11118212.0, + "logps/chosen": -425.5569545200893, + "logps/rejected": -503.700439453125, + "loss": 0.0512, + "rewards/chosen": 5.738159724644253, + "rewards/margins": 14.939941951206752, + "rewards/rejected": -9.2017822265625, + "step": 992 + }, + { + "epoch": 0.27216664382622996, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8767764.0, + "logits/rejected": -4957152.666666667, + "logps/chosen": -414.1932779947917, + "logps/rejected": -672.245849609375, + "loss": 0.0243, + "rewards/chosen": 4.566397984822591, + "rewards/margins": 13.29069709777832, + "rewards/rejected": -8.724299112955729, + "step": 993 + }, + { + "epoch": 0.2724407290667398, + "grad_norm": 4.8125, + "kl": 0.08488655090332031, + "learning_rate": 5e-06, + "logits/chosen": -17605513.333333332, + "logits/rejected": 16053541.333333334, + "logps/chosen": -470.0772298177083, + "logps/rejected": -475.3201497395833, + "loss": 0.0228, + "rewards/chosen": 6.6449400583903, + "rewards/margins": 14.797927220662434, + "rewards/rejected": -8.152987162272135, + "step": 994 + }, + { + "epoch": 0.27271481430724953, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7045323.333333333, + "logits/rejected": -18671653.333333332, + "logps/chosen": -386.0382486979167, + "logps/rejected": -517.8528238932291, + "loss": 0.0163, + "rewards/chosen": 5.070969263712565, + "rewards/margins": 14.558516820271809, + "rewards/rejected": -9.487547556559244, + "step": 995 + }, + { + "epoch": 0.27298889954775934, + "grad_norm": 11.0, + "kl": 1.0233535766601562, + "learning_rate": 5e-06, + "logits/chosen": -5502548.923076923, + "logits/rejected": -3396532.3636363638, + "logps/chosen": -381.20030799278845, + "logps/rejected": -386.7265625, + "loss": 0.0878, + "rewards/chosen": 5.228311978853666, + "rewards/margins": 10.523257435618582, + "rewards/rejected": -5.294945456764915, + "step": 996 + }, + { + "epoch": 0.27326298478826916, + "grad_norm": 4.5, + "kl": 4.741988658905029, + "learning_rate": 5e-06, + "logits/chosen": -20020589.333333332, + "logits/rejected": -14770521.333333334, + "logps/chosen": -451.7237548828125, + "logps/rejected": -437.9883626302083, + "loss": 0.0381, + "rewards/chosen": 5.643041610717773, + "rewards/margins": 12.191490809122723, + "rewards/rejected": -6.548449198404948, + "step": 997 + }, + { + "epoch": 0.273537070028779, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19967657.14285714, + "logits/rejected": -21280025.6, + "logps/chosen": -479.8775111607143, + "logps/rejected": -523.888818359375, + "loss": 0.0458, + "rewards/chosen": 6.200954437255859, + "rewards/margins": 16.394202423095702, + "rewards/rejected": -10.193247985839843, + "step": 998 + }, + { + "epoch": 0.27381115526928873, + "grad_norm": 2.796875, + "kl": 1.0272510051727295, + "learning_rate": 5e-06, + "logits/chosen": -34588681.14285714, + "logits/rejected": -33578627.2, + "logps/chosen": -558.9356166294643, + "logps/rejected": -542.04912109375, + "loss": 0.0083, + "rewards/chosen": 6.846635001046317, + "rewards/margins": 16.55622591291155, + "rewards/rejected": -9.709590911865234, + "step": 999 + }, + { + "epoch": 0.27408524050979854, + "grad_norm": 5.40625, + "kl": 3.860377788543701, + "learning_rate": 5e-06, + "logits/chosen": -15805382.4, + "logits/rejected": -26299078.85714286, + "logps/chosen": -512.5771484375, + "logps/rejected": -485.08377511160717, + "loss": 0.0107, + "rewards/chosen": 6.135759735107422, + "rewards/margins": 15.47008525303432, + "rewards/rejected": -9.334325517926898, + "step": 1000 + }, + { + "epoch": 0.27435932575030836, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7708006.4, + "logits/rejected": 5997503.111111111, + "logps/chosen": -400.18079427083336, + "logps/rejected": -456.10053168402777, + "loss": 0.0335, + "rewards/chosen": 7.4956720987955725, + "rewards/margins": 13.610594516330295, + "rewards/rejected": -6.114922417534722, + "step": 1001 + }, + { + "epoch": 0.27463341099081817, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11117831.272727273, + "logits/rejected": 4834847.076923077, + "logps/chosen": -299.36330344460225, + "logps/rejected": -569.62890625, + "loss": 0.0305, + "rewards/chosen": 5.209623856977983, + "rewards/margins": 13.365825146228286, + "rewards/rejected": -8.156201289250301, + "step": 1002 + }, + { + "epoch": 0.27490749623132793, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14826829.714285715, + "logits/rejected": -32552480.0, + "logps/chosen": -465.4224330357143, + "logps/rejected": -558.6013671875, + "loss": 0.0165, + "rewards/chosen": 6.533670697893415, + "rewards/margins": 16.270319257463726, + "rewards/rejected": -9.736648559570312, + "step": 1003 + }, + { + "epoch": 0.27518158147183774, + "grad_norm": 11.375, + "kl": 0.4152107238769531, + "learning_rate": 5e-06, + "logits/chosen": -18087392.0, + "logits/rejected": -20402593.454545453, + "logps/chosen": -419.5105543870192, + "logps/rejected": -584.8737127130681, + "loss": 0.0736, + "rewards/chosen": 5.411284813514123, + "rewards/margins": 11.807409913389833, + "rewards/rejected": -6.39612509987571, + "step": 1004 + }, + { + "epoch": 0.27545566671234756, + "grad_norm": 7.09375, + "kl": 3.9716577529907227, + "learning_rate": 5e-06, + "logits/chosen": -25971106.90909091, + "logits/rejected": 27839881.846153848, + "logps/chosen": -466.18026455965907, + "logps/rejected": -565.5613356370193, + "loss": 0.0334, + "rewards/chosen": 5.311735326593572, + "rewards/margins": 16.446032624144657, + "rewards/rejected": -11.134297297551083, + "step": 1005 + }, + { + "epoch": 0.2757297519528573, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30462441.846153848, + "logits/rejected": -6716441.454545454, + "logps/chosen": -393.65902944711536, + "logps/rejected": -616.2245649857955, + "loss": 0.0745, + "rewards/chosen": 4.7719257061298075, + "rewards/margins": 13.450955691037478, + "rewards/rejected": -8.67902998490767, + "step": 1006 + }, + { + "epoch": 0.27600383719336713, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6681082.461538462, + "logits/rejected": -1856813.8181818181, + "logps/chosen": -398.69076772836536, + "logps/rejected": -484.8663884943182, + "loss": 0.0444, + "rewards/chosen": 5.324650691105769, + "rewards/margins": 10.93513125973148, + "rewards/rejected": -5.61048056862571, + "step": 1007 + }, + { + "epoch": 0.27627792243387694, + "grad_norm": 5.625, + "kl": 4.992013454437256, + "learning_rate": 5e-06, + "logits/chosen": -16124926.76923077, + "logits/rejected": -637090.9090909091, + "logps/chosen": -490.6535832331731, + "logps/rejected": -751.0204190340909, + "loss": 0.0159, + "rewards/chosen": 6.189952556903545, + "rewards/margins": 18.99440562641704, + "rewards/rejected": -12.804453069513494, + "step": 1008 + }, + { + "epoch": 0.27655200767438676, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25684523.636363637, + "logits/rejected": 2932078.153846154, + "logps/chosen": -370.8082830255682, + "logps/rejected": -481.65767728365387, + "loss": 0.0364, + "rewards/chosen": 5.543337041681463, + "rewards/margins": 11.617421983838915, + "rewards/rejected": -6.074084942157452, + "step": 1009 + }, + { + "epoch": 0.2768260929148965, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8211702.769230769, + "logits/rejected": -3464994.909090909, + "logps/chosen": -455.84525240384613, + "logps/rejected": -652.8102805397727, + "loss": 0.051, + "rewards/chosen": 6.031146709735577, + "rewards/margins": 16.07746268319083, + "rewards/rejected": -10.046315973455256, + "step": 1010 + }, + { + "epoch": 0.27710017815540633, + "grad_norm": 6.96875, + "kl": 9.899712562561035, + "learning_rate": 5e-06, + "logits/chosen": -45657072.0, + "logits/rejected": -14067370.666666666, + "logps/chosen": -506.8863932291667, + "logps/rejected": -589.3668619791666, + "loss": 0.0248, + "rewards/chosen": 7.091495513916016, + "rewards/margins": 16.63838768005371, + "rewards/rejected": -9.546892166137695, + "step": 1011 + }, + { + "epoch": 0.27737426339591614, + "grad_norm": 10.3125, + "kl": 11.389968872070312, + "learning_rate": 5e-06, + "logits/chosen": -8008195.764705882, + "logits/rejected": -21932246.85714286, + "logps/chosen": -490.2385684742647, + "logps/rejected": -456.27218191964283, + "loss": 0.0904, + "rewards/chosen": 6.639415067784927, + "rewards/margins": 15.04119805728688, + "rewards/rejected": -8.401782989501953, + "step": 1012 + }, + { + "epoch": 0.2776483486364259, + "grad_norm": 6.5625, + "kl": 3.612496852874756, + "learning_rate": 5e-06, + "logits/chosen": -15081585.777777778, + "logits/rejected": 2115933.6, + "logps/chosen": -503.76768663194446, + "logps/rejected": -399.16285807291666, + "loss": 0.0432, + "rewards/chosen": 6.2352489895290795, + "rewards/margins": 13.983489142523872, + "rewards/rejected": -7.748240152994792, + "step": 1013 + }, + { + "epoch": 0.2779224338769357, + "grad_norm": 7.03125, + "kl": 0.9009103775024414, + "learning_rate": 5e-06, + "logits/chosen": 2301244.3333333335, + "logits/rejected": -9967433.333333334, + "logps/chosen": -362.6772054036458, + "logps/rejected": -366.228515625, + "loss": 0.0696, + "rewards/chosen": 4.864762941996257, + "rewards/margins": 10.801440874735516, + "rewards/rejected": -5.936677932739258, + "step": 1014 + }, + { + "epoch": 0.27819651911744553, + "grad_norm": 4.71875, + "kl": 1.807373046875, + "learning_rate": 5e-06, + "logits/chosen": -8368467.333333333, + "logits/rejected": -25375160.0, + "logps/chosen": -488.4856770833333, + "logps/rejected": -398.3640950520833, + "loss": 0.02, + "rewards/chosen": 6.684074401855469, + "rewards/margins": 14.289117177327473, + "rewards/rejected": -7.605042775472005, + "step": 1015 + }, + { + "epoch": 0.27847060435795534, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18586404.923076924, + "logits/rejected": -7847058.181818182, + "logps/chosen": -408.17330228365387, + "logps/rejected": -474.54927201704544, + "loss": 0.0341, + "rewards/chosen": 5.805907029371995, + "rewards/margins": 13.35485445035921, + "rewards/rejected": -7.548947420987216, + "step": 1016 + }, + { + "epoch": 0.2787446895984651, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1851314.5, + "logits/rejected": -22344561.777777776, + "logps/chosen": -571.4340006510416, + "logps/rejected": -509.1574435763889, + "loss": 0.0837, + "rewards/chosen": 5.746174494425456, + "rewards/margins": 12.603613747490776, + "rewards/rejected": -6.857439253065321, + "step": 1017 + }, + { + "epoch": 0.2790187748389749, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12357066.666666666, + "logits/rejected": -32571981.333333332, + "logps/chosen": -391.3145345052083, + "logps/rejected": -525.4727783203125, + "loss": 0.0277, + "rewards/chosen": 4.827047665913899, + "rewards/margins": 13.94763406117757, + "rewards/rejected": -9.120586395263672, + "step": 1018 + }, + { + "epoch": 0.27929286007948473, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33898921.84615385, + "logits/rejected": -19487517.09090909, + "logps/chosen": -418.3359375, + "logps/rejected": -442.34561434659093, + "loss": 0.0197, + "rewards/chosen": 6.123816856971154, + "rewards/margins": 14.477632109101837, + "rewards/rejected": -8.353815252130682, + "step": 1019 + }, + { + "epoch": 0.27956694531999454, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19898533.333333332, + "logits/rejected": -8300217.333333333, + "logps/chosen": -432.1309407552083, + "logps/rejected": -509.2061767578125, + "loss": 0.0372, + "rewards/chosen": 6.292348225911458, + "rewards/margins": 13.916521708170572, + "rewards/rejected": -7.624173482259114, + "step": 1020 + }, + { + "epoch": 0.2798410305605043, + "grad_norm": 8.5625, + "kl": 3.6256484985351562, + "learning_rate": 5e-06, + "logits/chosen": -20317110.85714286, + "logits/rejected": 12318951.2, + "logps/chosen": -450.3526088169643, + "logps/rejected": -439.718701171875, + "loss": 0.0626, + "rewards/chosen": 4.772733960832868, + "rewards/margins": 11.297489820207868, + "rewards/rejected": -6.524755859375, + "step": 1021 + }, + { + "epoch": 0.2801151158010141, + "grad_norm": 10.875, + "kl": 5.631248474121094, + "learning_rate": 5e-06, + "logits/chosen": -14596987.294117646, + "logits/rejected": -16595910.857142856, + "logps/chosen": -464.2237764246324, + "logps/rejected": -456.35609654017856, + "loss": 0.0601, + "rewards/chosen": 6.117100883932674, + "rewards/margins": 15.68095455650522, + "rewards/rejected": -9.563853672572545, + "step": 1022 + }, + { + "epoch": 0.28038920104152393, + "grad_norm": 13.25, + "kl": 8.821293830871582, + "learning_rate": 5e-06, + "logits/chosen": -13515932.235294119, + "logits/rejected": -16695419.42857143, + "logps/chosen": -402.56603285845586, + "logps/rejected": -407.7859584263393, + "loss": 0.0639, + "rewards/chosen": 5.663322897518382, + "rewards/margins": 14.805657426850134, + "rewards/rejected": -9.142334529331752, + "step": 1023 + }, + { + "epoch": 0.2806632862820337, + "grad_norm": 8.0625, + "kl": 5.594987392425537, + "learning_rate": 5e-06, + "logits/chosen": -34416795.428571425, + "logits/rejected": -25783809.6, + "logps/chosen": -460.832275390625, + "logps/rejected": -460.39375, + "loss": 0.0303, + "rewards/chosen": 5.920329502650669, + "rewards/margins": 12.507434300013951, + "rewards/rejected": -6.587104797363281, + "step": 1024 + }, + { + "epoch": 0.2809373715225435, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23935588.57142857, + "logits/rejected": -6905974.117647059, + "logps/chosen": -466.9895717075893, + "logps/rejected": -621.2115119485294, + "loss": 0.0094, + "rewards/chosen": 7.17201178414481, + "rewards/margins": 17.105433199585985, + "rewards/rejected": -9.933421415441176, + "step": 1025 + }, + { + "epoch": 0.2812114567630533, + "grad_norm": 9.9375, + "kl": 4.986053466796875, + "learning_rate": 5e-06, + "logits/chosen": -15462363.636363637, + "logits/rejected": -1763191.3846153845, + "logps/chosen": -457.94380326704544, + "logps/rejected": -499.35460486778845, + "loss": 0.0594, + "rewards/chosen": 6.302353598854759, + "rewards/margins": 15.246554554759207, + "rewards/rejected": -8.944200955904448, + "step": 1026 + }, + { + "epoch": 0.2814855420035631, + "grad_norm": 12.875, + "kl": 2.2345938682556152, + "learning_rate": 5e-06, + "logits/chosen": -27547015.529411763, + "logits/rejected": -55258.0, + "logps/chosen": -429.12327665441177, + "logps/rejected": -591.3708844866071, + "loss": 0.0748, + "rewards/chosen": 6.848017973058364, + "rewards/margins": 15.531502186751165, + "rewards/rejected": -8.683484213692802, + "step": 1027 + }, + { + "epoch": 0.2817596272440729, + "grad_norm": 4.28125, + "kl": 0.0613301619887352, + "learning_rate": 5e-06, + "logits/chosen": -18968784.0, + "logits/rejected": -21105027.2, + "logps/chosen": -425.473876953125, + "logps/rejected": -616.11572265625, + "loss": 0.0176, + "rewards/chosen": 5.183281489780971, + "rewards/margins": 16.001516505650113, + "rewards/rejected": -10.81823501586914, + "step": 1028 + }, + { + "epoch": 0.2820337124845827, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32651392.0, + "logits/rejected": -16293186.133333333, + "logps/chosen": -649.8935004340278, + "logps/rejected": -508.3138020833333, + "loss": 0.0195, + "rewards/chosen": 8.658390469021267, + "rewards/margins": 17.575484381781685, + "rewards/rejected": -8.917093912760416, + "step": 1029 + }, + { + "epoch": 0.2823077977250925, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16276788.0, + "logits/rejected": 26987510.0, + "logps/chosen": -340.35546875, + "logps/rejected": -681.496337890625, + "loss": 0.0527, + "rewards/chosen": 4.903312683105469, + "rewards/margins": 15.753532409667969, + "rewards/rejected": -10.8502197265625, + "step": 1030 + }, + { + "epoch": 0.2825818829656023, + "grad_norm": 6.1875, + "kl": 4.352703094482422, + "learning_rate": 5e-06, + "logits/chosen": -10524502.857142856, + "logits/rejected": 5332264.8, + "logps/chosen": -448.61526925223217, + "logps/rejected": -635.91923828125, + "loss": 0.0241, + "rewards/chosen": 4.971036093575614, + "rewards/margins": 13.388267844063893, + "rewards/rejected": -8.41723175048828, + "step": 1031 + }, + { + "epoch": 0.2828559682061121, + "grad_norm": 7.5625, + "kl": 0.734167754650116, + "learning_rate": 5e-06, + "logits/chosen": -15503662.4, + "logits/rejected": 34483782.85714286, + "logps/chosen": -531.09775390625, + "logps/rejected": -451.7713099888393, + "loss": 0.0231, + "rewards/chosen": 6.637080383300781, + "rewards/margins": 14.053517259870258, + "rewards/rejected": -7.416436876569476, + "step": 1032 + }, + { + "epoch": 0.2831300534466219, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22897517.714285713, + "logits/rejected": 12459880.8, + "logps/chosen": -377.11732700892856, + "logps/rejected": -440.646337890625, + "loss": 0.0465, + "rewards/chosen": 6.1419492449079245, + "rewards/margins": 12.590690394810268, + "rewards/rejected": -6.448741149902344, + "step": 1033 + }, + { + "epoch": 0.2834041386871317, + "grad_norm": 10.6875, + "kl": 4.83935546875, + "learning_rate": 5e-06, + "logits/chosen": 2727267.0, + "logits/rejected": -21644440.0, + "logps/chosen": -410.34466552734375, + "logps/rejected": -508.0398254394531, + "loss": 0.0434, + "rewards/chosen": 4.421614170074463, + "rewards/margins": 12.467484951019287, + "rewards/rejected": -8.045870780944824, + "step": 1034 + }, + { + "epoch": 0.28367822392764147, + "grad_norm": 8.0, + "kl": 7.326183795928955, + "learning_rate": 5e-06, + "logits/chosen": -27136601.6, + "logits/rejected": -6415240.0, + "logps/chosen": -499.423828125, + "logps/rejected": -416.54052734375, + "loss": 0.0298, + "rewards/chosen": 6.5901845296223955, + "rewards/margins": 12.002118428548176, + "rewards/rejected": -5.411933898925781, + "step": 1035 + }, + { + "epoch": 0.2839523091681513, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20273792.0, + "logits/rejected": 16015890.666666666, + "logps/chosen": -344.8353678385417, + "logps/rejected": -738.7303059895834, + "loss": 0.0237, + "rewards/chosen": 5.180613199869792, + "rewards/margins": 15.617181142171223, + "rewards/rejected": -10.436567942301432, + "step": 1036 + }, + { + "epoch": 0.2842263944086611, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21548668.444444444, + "logits/rejected": -14511101.866666667, + "logps/chosen": -425.8210177951389, + "logps/rejected": -581.9740234375, + "loss": 0.0151, + "rewards/chosen": 5.401647355821398, + "rewards/margins": 14.373034074571398, + "rewards/rejected": -8.97138671875, + "step": 1037 + }, + { + "epoch": 0.2845004796491709, + "grad_norm": 9.125, + "kl": 1.6272634267807007, + "learning_rate": 5e-06, + "logits/chosen": -4011469.090909091, + "logits/rejected": -13776806.153846154, + "logps/chosen": -305.3956853693182, + "logps/rejected": -323.38852163461536, + "loss": 0.0512, + "rewards/chosen": 6.201800953258168, + "rewards/margins": 11.167413511476317, + "rewards/rejected": -4.965612558218149, + "step": 1038 + }, + { + "epoch": 0.28477456488968067, + "grad_norm": 6.375, + "kl": 7.809026718139648, + "learning_rate": 5e-06, + "logits/chosen": -12320982.4, + "logits/rejected": -11223996.444444444, + "logps/chosen": -357.33131510416666, + "logps/rejected": -393.53266059027777, + "loss": 0.0292, + "rewards/chosen": 6.3090662638346355, + "rewards/margins": 14.172460259331597, + "rewards/rejected": -7.863393995496962, + "step": 1039 + }, + { + "epoch": 0.2850486501301905, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21275539.692307692, + "logits/rejected": -13634042.181818182, + "logps/chosen": -347.28958834134613, + "logps/rejected": -398.72953657670456, + "loss": 0.0894, + "rewards/chosen": 3.9158935546875, + "rewards/margins": 11.239140597256746, + "rewards/rejected": -7.323247042569247, + "step": 1040 + }, + { + "epoch": 0.2853227353707003, + "grad_norm": 8.875, + "kl": 5.999659538269043, + "learning_rate": 5e-06, + "logits/chosen": -23463956.363636363, + "logits/rejected": -13993112.615384616, + "logps/chosen": -531.0636541193181, + "logps/rejected": -499.4268329326923, + "loss": 0.0383, + "rewards/chosen": 6.697977239435369, + "rewards/margins": 14.290571679602136, + "rewards/rejected": -7.592594440166767, + "step": 1041 + }, + { + "epoch": 0.2855968206112101, + "grad_norm": 9.375, + "kl": 2.4499335289001465, + "learning_rate": 5e-06, + "logits/chosen": -14163729.23076923, + "logits/rejected": -15152349.090909092, + "logps/chosen": -491.7963115985577, + "logps/rejected": -424.6080433238636, + "loss": 0.07, + "rewards/chosen": 6.226189246544471, + "rewards/margins": 12.470556672636445, + "rewards/rejected": -6.244367426091975, + "step": 1042 + }, + { + "epoch": 0.28587090585171987, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30352160.0, + "logits/rejected": -11951236.0, + "logps/chosen": -481.19476318359375, + "logps/rejected": -431.94024658203125, + "loss": 0.0471, + "rewards/chosen": 6.449286460876465, + "rewards/margins": 13.613616466522217, + "rewards/rejected": -7.164330005645752, + "step": 1043 + }, + { + "epoch": 0.2861449910922297, + "grad_norm": 10.5625, + "kl": 8.123210906982422, + "learning_rate": 5e-06, + "logits/chosen": -26301561.14285714, + "logits/rejected": -18455566.4, + "logps/chosen": -448.93101283482144, + "logps/rejected": -472.445849609375, + "loss": 0.0659, + "rewards/chosen": 4.612923758370536, + "rewards/margins": 13.62155565534319, + "rewards/rejected": -9.008631896972656, + "step": 1044 + }, + { + "epoch": 0.2864190763327395, + "grad_norm": 10.5625, + "kl": 1.443238615989685, + "learning_rate": 5e-06, + "logits/chosen": -14198080.0, + "logits/rejected": -16394100.923076924, + "logps/chosen": -384.86123934659093, + "logps/rejected": -604.0172776442307, + "loss": 0.0422, + "rewards/chosen": 4.822819796475497, + "rewards/margins": 12.231283494642565, + "rewards/rejected": -7.4084636981670675, + "step": 1045 + }, + { + "epoch": 0.28669316157324926, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11034453.0, + "logits/rejected": -1201113.5, + "logps/chosen": -517.672607421875, + "logps/rejected": -479.0350341796875, + "loss": 0.0134, + "rewards/chosen": 6.930972099304199, + "rewards/margins": 14.210758209228516, + "rewards/rejected": -7.279786109924316, + "step": 1046 + }, + { + "epoch": 0.28696724681375907, + "grad_norm": 11.625, + "kl": 3.6272189617156982, + "learning_rate": 5e-06, + "logits/chosen": -27503059.692307692, + "logits/rejected": -17905413.818181816, + "logps/chosen": -505.3182842548077, + "logps/rejected": -451.25656960227275, + "loss": 0.0387, + "rewards/chosen": 6.806201641376202, + "rewards/margins": 13.643287712043815, + "rewards/rejected": -6.837086070667613, + "step": 1047 + }, + { + "epoch": 0.2872413320542689, + "grad_norm": 5.6875, + "kl": 1.2566936016082764, + "learning_rate": 5e-06, + "logits/chosen": -15148939.636363637, + "logits/rejected": -1871178.7692307692, + "logps/chosen": -375.53151633522725, + "logps/rejected": -432.6023137019231, + "loss": 0.0296, + "rewards/chosen": 5.013955549760298, + "rewards/margins": 12.099335517083015, + "rewards/rejected": -7.085379967322717, + "step": 1048 + }, + { + "epoch": 0.2875154172947787, + "grad_norm": 3.46875, + "kl": 1.2203433513641357, + "learning_rate": 5e-06, + "logits/chosen": -2114926.8, + "logits/rejected": -25927108.57142857, + "logps/chosen": -504.67314453125, + "logps/rejected": -426.61387416294644, + "loss": 0.0478, + "rewards/chosen": 7.112785339355469, + "rewards/margins": 13.9005001613072, + "rewards/rejected": -6.78771482195173, + "step": 1049 + }, + { + "epoch": 0.28778950253528846, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14335989.333333334, + "logits/rejected": -17752946.666666668, + "logps/chosen": -361.5531819661458, + "logps/rejected": -410.8125813802083, + "loss": 0.0319, + "rewards/chosen": 5.563000996907552, + "rewards/margins": 13.638468424479168, + "rewards/rejected": -8.075467427571615, + "step": 1050 + }, + { + "epoch": 0.28806358777579827, + "grad_norm": 4.5, + "kl": 0.7651647329330444, + "learning_rate": 5e-06, + "logits/chosen": -24913780.0, + "logits/rejected": -11836820.0, + "logps/chosen": -385.7103576660156, + "logps/rejected": -479.47235107421875, + "loss": 0.0231, + "rewards/chosen": 4.7784576416015625, + "rewards/margins": 11.89432954788208, + "rewards/rejected": -7.115871906280518, + "step": 1051 + }, + { + "epoch": 0.2883376730163081, + "grad_norm": 8.6875, + "kl": 4.783511161804199, + "learning_rate": 5e-06, + "logits/chosen": -8366705.230769231, + "logits/rejected": -21262398.545454547, + "logps/chosen": -404.19471153846155, + "logps/rejected": -653.4149502840909, + "loss": 0.0451, + "rewards/chosen": 6.038523160494291, + "rewards/margins": 15.793534979120002, + "rewards/rejected": -9.75501181862571, + "step": 1052 + }, + { + "epoch": 0.2886117582568179, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17017168.0, + "logits/rejected": -17867234.666666668, + "logps/chosen": -374.813720703125, + "logps/rejected": -623.4566243489584, + "loss": 0.0511, + "rewards/chosen": 5.069204330444336, + "rewards/margins": 16.847750981648765, + "rewards/rejected": -11.778546651204428, + "step": 1053 + }, + { + "epoch": 0.28888584349732765, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1209590.4, + "logits/rejected": -692758.2857142857, + "logps/chosen": -409.03916015625, + "logps/rejected": -522.5687779017857, + "loss": 0.0349, + "rewards/chosen": 5.010184478759766, + "rewards/margins": 13.48134536743164, + "rewards/rejected": -8.471160888671875, + "step": 1054 + }, + { + "epoch": 0.28915992873783747, + "grad_norm": 9.5, + "kl": 1.2458966970443726, + "learning_rate": 5e-06, + "logits/chosen": -31012428.8, + "logits/rejected": -4998890.857142857, + "logps/chosen": -410.692236328125, + "logps/rejected": -417.02915736607144, + "loss": 0.0408, + "rewards/chosen": 6.262042617797851, + "rewards/margins": 13.982600239345004, + "rewards/rejected": -7.720557621547154, + "step": 1055 + }, + { + "epoch": 0.2894340139783473, + "grad_norm": 11.25, + "kl": 8.454475402832031, + "learning_rate": 5e-06, + "logits/chosen": -17003448.0, + "logits/rejected": -17515996.8, + "logps/chosen": -387.80064174107144, + "logps/rejected": -495.625830078125, + "loss": 0.0956, + "rewards/chosen": 5.443414960588727, + "rewards/margins": 13.534266553606306, + "rewards/rejected": -8.090851593017579, + "step": 1056 + }, + { + "epoch": 0.28970809921885704, + "grad_norm": 3.109375, + "kl": 2.396523952484131, + "learning_rate": 5e-06, + "logits/chosen": -16329322.666666666, + "logits/rejected": 5603040.0, + "logps/chosen": -502.1051432291667, + "logps/rejected": -515.252197265625, + "loss": 0.0099, + "rewards/chosen": 5.879538218180339, + "rewards/margins": 15.340593973795574, + "rewards/rejected": -9.461055755615234, + "step": 1057 + }, + { + "epoch": 0.28998218445936685, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2380156.6666666665, + "logits/rejected": -22600373.333333332, + "logps/chosen": -553.546630859375, + "logps/rejected": -432.5323486328125, + "loss": 0.0134, + "rewards/chosen": 5.283844947814941, + "rewards/margins": 14.104009310404459, + "rewards/rejected": -8.820164362589518, + "step": 1058 + }, + { + "epoch": 0.29025626969987667, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14871890.285714285, + "logits/rejected": -19869477.647058822, + "logps/chosen": -424.50537109375, + "logps/rejected": -638.6605009191177, + "loss": 0.0045, + "rewards/chosen": 6.789881569998605, + "rewards/margins": 16.971602079247226, + "rewards/rejected": -10.18172050924862, + "step": 1059 + }, + { + "epoch": 0.2905303549403865, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32406213.818181816, + "logits/rejected": -10939591.384615384, + "logps/chosen": -451.45432350852275, + "logps/rejected": -473.54574819711536, + "loss": 0.0354, + "rewards/chosen": 5.203213778409091, + "rewards/margins": 13.881738089181326, + "rewards/rejected": -8.678524310772236, + "step": 1060 + }, + { + "epoch": 0.29080444018089624, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5505919.0, + "logits/rejected": -30723940.0, + "logps/chosen": -471.20684814453125, + "logps/rejected": -508.80206298828125, + "loss": 0.0211, + "rewards/chosen": 6.311337471008301, + "rewards/margins": 13.786884307861328, + "rewards/rejected": -7.475546836853027, + "step": 1061 + }, + { + "epoch": 0.29107852542140605, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32339125.333333332, + "logits/rejected": -30703978.666666668, + "logps/chosen": -458.0292561848958, + "logps/rejected": -563.4398193359375, + "loss": 0.0131, + "rewards/chosen": 7.351245880126953, + "rewards/margins": 15.702267328898111, + "rewards/rejected": -8.351021448771158, + "step": 1062 + }, + { + "epoch": 0.29135261066191587, + "grad_norm": 3.515625, + "kl": 2.6981561183929443, + "learning_rate": 5e-06, + "logits/chosen": -18537581.333333332, + "logits/rejected": -12362966.666666666, + "logps/chosen": -452.6629638671875, + "logps/rejected": -396.4302571614583, + "loss": 0.0164, + "rewards/chosen": 5.954792022705078, + "rewards/margins": 11.829280853271484, + "rewards/rejected": -5.874488830566406, + "step": 1063 + }, + { + "epoch": 0.2916266959024257, + "grad_norm": 2.828125, + "kl": 3.318840742111206, + "learning_rate": 5e-06, + "logits/chosen": -9820822.857142856, + "logits/rejected": -20950278.4, + "logps/chosen": -472.4717494419643, + "logps/rejected": -518.106640625, + "loss": 0.0125, + "rewards/chosen": 6.551551273890904, + "rewards/margins": 15.169544437953405, + "rewards/rejected": -8.6179931640625, + "step": 1064 + }, + { + "epoch": 0.29190078114293544, + "grad_norm": 13.3125, + "kl": 1.8733642101287842, + "learning_rate": 5e-06, + "logits/chosen": -24955288.470588237, + "logits/rejected": -9010952.57142857, + "logps/chosen": -396.58645450367646, + "logps/rejected": -541.9744001116071, + "loss": 0.0932, + "rewards/chosen": 5.1887525670668655, + "rewards/margins": 13.528749994870996, + "rewards/rejected": -8.33999742780413, + "step": 1065 + }, + { + "epoch": 0.29217486638344525, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15002837.818181818, + "logits/rejected": 627769.2307692308, + "logps/chosen": -409.5984552556818, + "logps/rejected": -557.6712364783654, + "loss": 0.0621, + "rewards/chosen": 5.061722495339134, + "rewards/margins": 15.744879609221346, + "rewards/rejected": -10.683157113882212, + "step": 1066 + }, + { + "epoch": 0.29244895162395507, + "grad_norm": 7.5, + "kl": 1.7091584205627441, + "learning_rate": 5e-06, + "logits/chosen": -24869195.42857143, + "logits/rejected": -26840172.8, + "logps/chosen": -358.55252511160717, + "logps/rejected": -390.28671875, + "loss": 0.0487, + "rewards/chosen": 5.7441302708217075, + "rewards/margins": 11.884954016549248, + "rewards/rejected": -6.140823745727539, + "step": 1067 + }, + { + "epoch": 0.2927230368644648, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9634794.666666666, + "logits/rejected": 1009096.8888888889, + "logps/chosen": -405.4883626302083, + "logps/rejected": -508.52533637152777, + "loss": 0.0272, + "rewards/chosen": 7.490355809529622, + "rewards/margins": 14.689340591430664, + "rewards/rejected": -7.198984781901042, + "step": 1068 + }, + { + "epoch": 0.29299712210497464, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17215108.0, + "logits/rejected": -15416437.333333334, + "logps/chosen": -451.6673177083333, + "logps/rejected": -400.9019368489583, + "loss": 0.0459, + "rewards/chosen": 5.178770701090495, + "rewards/margins": 13.202402114868164, + "rewards/rejected": -8.02363141377767, + "step": 1069 + }, + { + "epoch": 0.29327120734548445, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5949926.181818182, + "logits/rejected": -24896526.769230768, + "logps/chosen": -435.1437322443182, + "logps/rejected": -560.3046875, + "loss": 0.0384, + "rewards/chosen": 6.8018341064453125, + "rewards/margins": 16.085703923152042, + "rewards/rejected": -9.28386981670673, + "step": 1070 + }, + { + "epoch": 0.29354529258599427, + "grad_norm": 7.21875, + "kl": 0.16980235278606415, + "learning_rate": 5e-06, + "logits/chosen": -20083329.454545453, + "logits/rejected": -12497774.76923077, + "logps/chosen": -502.06338778409093, + "logps/rejected": -635.7085336538462, + "loss": 0.0234, + "rewards/chosen": 5.924740184437145, + "rewards/margins": 19.461408228307334, + "rewards/rejected": -13.536668043870192, + "step": 1071 + }, + { + "epoch": 0.293819377826504, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11031816.888888888, + "logits/rejected": -32099972.266666666, + "logps/chosen": -488.98828125, + "logps/rejected": -458.9402669270833, + "loss": 0.0382, + "rewards/chosen": 6.118995666503906, + "rewards/margins": 13.058966573079427, + "rewards/rejected": -6.939970906575521, + "step": 1072 + }, + { + "epoch": 0.29409346306701384, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6884333.818181818, + "logits/rejected": -35201634.461538464, + "logps/chosen": -393.03329190340907, + "logps/rejected": -581.3615534855769, + "loss": 0.0144, + "rewards/chosen": 6.209661310369318, + "rewards/margins": 14.164065701144558, + "rewards/rejected": -7.95440439077524, + "step": 1073 + }, + { + "epoch": 0.29436754830752365, + "grad_norm": 6.21875, + "kl": 6.744792938232422, + "learning_rate": 5e-06, + "logits/chosen": -15124625.6, + "logits/rejected": -3091367.1428571427, + "logps/chosen": -509.141162109375, + "logps/rejected": -534.9545549665179, + "loss": 0.0604, + "rewards/chosen": 6.743038940429687, + "rewards/margins": 15.220724051339285, + "rewards/rejected": -8.477685110909599, + "step": 1074 + }, + { + "epoch": 0.29464163354803347, + "grad_norm": 13.6875, + "kl": 5.341987609863281, + "learning_rate": 5e-06, + "logits/chosen": -13127037.538461538, + "logits/rejected": -19059591.272727273, + "logps/chosen": -563.2160832331731, + "logps/rejected": -408.50297407670456, + "loss": 0.0749, + "rewards/chosen": 6.3085773174579325, + "rewards/margins": 12.374953743461129, + "rewards/rejected": -6.066376426003196, + "step": 1075 + }, + { + "epoch": 0.2949157187885432, + "grad_norm": 7.21875, + "kl": 8.58033275604248, + "learning_rate": 5e-06, + "logits/chosen": -2816062.153846154, + "logits/rejected": -19647082.181818184, + "logps/chosen": -530.2874849759615, + "logps/rejected": -585.5297407670455, + "loss": 0.0456, + "rewards/chosen": 6.20426999605619, + "rewards/margins": 15.714654989175862, + "rewards/rejected": -9.510384993119674, + "step": 1076 + }, + { + "epoch": 0.29518980402905304, + "grad_norm": 6.375, + "kl": 4.17081356048584, + "learning_rate": 5e-06, + "logits/chosen": -25350200.0, + "logits/rejected": -20847030.666666668, + "logps/chosen": -534.4053548177084, + "logps/rejected": -494.622314453125, + "loss": 0.047, + "rewards/chosen": 5.750114440917969, + "rewards/margins": 15.110954284667969, + "rewards/rejected": -9.36083984375, + "step": 1077 + }, + { + "epoch": 0.29546388926956285, + "grad_norm": 10.5625, + "kl": 8.079852104187012, + "learning_rate": 5e-06, + "logits/chosen": -14046033.23076923, + "logits/rejected": -25527479.272727273, + "logps/chosen": -434.4167668269231, + "logps/rejected": -393.33327414772725, + "loss": 0.0623, + "rewards/chosen": 4.9113910381610575, + "rewards/margins": 12.985945481520432, + "rewards/rejected": -8.074554443359375, + "step": 1078 + }, + { + "epoch": 0.2957379745100726, + "grad_norm": 11.0625, + "kl": 10.608439445495605, + "learning_rate": 5e-06, + "logits/chosen": -25649683.2, + "logits/rejected": 37360160.0, + "logps/chosen": -482.2635091145833, + "logps/rejected": -630.8063151041666, + "loss": 0.0371, + "rewards/chosen": 7.016084798177084, + "rewards/margins": 20.572950914171006, + "rewards/rejected": -13.556866115993923, + "step": 1079 + }, + { + "epoch": 0.2960120597505824, + "grad_norm": 5.90625, + "kl": 0.19478607177734375, + "learning_rate": 5e-06, + "logits/chosen": -16388784.0, + "logits/rejected": -5076516.0, + "logps/chosen": -422.02469308035717, + "logps/rejected": -591.952880859375, + "loss": 0.022, + "rewards/chosen": 5.273594992501395, + "rewards/margins": 14.180884116036552, + "rewards/rejected": -8.907289123535156, + "step": 1080 + }, + { + "epoch": 0.29628614499109224, + "grad_norm": 12.4375, + "kl": 1.0446605682373047, + "learning_rate": 5e-06, + "logits/chosen": 6267887.2727272725, + "logits/rejected": -14359136.0, + "logps/chosen": -479.1066228693182, + "logps/rejected": -519.7198768028846, + "loss": 0.0382, + "rewards/chosen": 4.598660208962181, + "rewards/margins": 12.640118338844992, + "rewards/rejected": -8.041458129882812, + "step": 1081 + }, + { + "epoch": 0.29656023023160205, + "grad_norm": 11.875, + "kl": 5.391844749450684, + "learning_rate": 5e-06, + "logits/chosen": -14120670.857142856, + "logits/rejected": -18279454.4, + "logps/chosen": -445.18603515625, + "logps/rejected": -414.27236328125, + "loss": 0.0771, + "rewards/chosen": 4.615233830043247, + "rewards/margins": 10.571694782802037, + "rewards/rejected": -5.956460952758789, + "step": 1082 + }, + { + "epoch": 0.2968343154721118, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27751571.692307692, + "logits/rejected": -7902020.363636363, + "logps/chosen": -440.64197716346155, + "logps/rejected": -591.2212801846591, + "loss": 0.022, + "rewards/chosen": 5.421122037447416, + "rewards/margins": 13.685796137456293, + "rewards/rejected": -8.264674100008877, + "step": 1083 + }, + { + "epoch": 0.2971084007126216, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38197222.4, + "logits/rejected": -22924128.0, + "logps/chosen": -440.285986328125, + "logps/rejected": -445.1131068638393, + "loss": 0.0194, + "rewards/chosen": 7.108845520019531, + "rewards/margins": 14.604392133440289, + "rewards/rejected": -7.495546613420759, + "step": 1084 + }, + { + "epoch": 0.29738248595313144, + "grad_norm": 14.625, + "kl": 1.947662353515625, + "learning_rate": 5e-06, + "logits/chosen": -27492865.777777776, + "logits/rejected": 852710.4, + "logps/chosen": -419.2498372395833, + "logps/rejected": -514.1194986979167, + "loss": 0.1182, + "rewards/chosen": 4.868280198838976, + "rewards/margins": 11.866567145453558, + "rewards/rejected": -6.998286946614583, + "step": 1085 + }, + { + "epoch": 0.2976565711936412, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18991745.454545453, + "logits/rejected": -15528653.538461538, + "logps/chosen": -537.1700994318181, + "logps/rejected": -519.7867337740385, + "loss": 0.0121, + "rewards/chosen": 6.882707075639204, + "rewards/margins": 16.300579577892808, + "rewards/rejected": -9.417872502253605, + "step": 1086 + }, + { + "epoch": 0.297930656434151, + "grad_norm": 6.0, + "kl": 2.4870707988739014, + "learning_rate": 5e-06, + "logits/chosen": -28695914.666666668, + "logits/rejected": -11593565.333333334, + "logps/chosen": -388.7858072916667, + "logps/rejected": -691.775146484375, + "loss": 0.0218, + "rewards/chosen": 5.29501469930013, + "rewards/margins": 16.264420827229817, + "rewards/rejected": -10.969406127929688, + "step": 1087 + }, + { + "epoch": 0.2982047416746608, + "grad_norm": 3.46875, + "kl": 5.706443786621094, + "learning_rate": 5e-06, + "logits/chosen": -15538813.866666667, + "logits/rejected": -19515484.444444444, + "logps/chosen": -428.5836588541667, + "logps/rejected": -437.85438368055554, + "loss": 0.1, + "rewards/chosen": 5.017101033528646, + "rewards/margins": 14.73963097466363, + "rewards/rejected": -9.722529941134983, + "step": 1088 + }, + { + "epoch": 0.29847882691517064, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9325378.285714285, + "logits/rejected": -31410041.6, + "logps/chosen": -370.256103515625, + "logps/rejected": -580.913427734375, + "loss": 0.0355, + "rewards/chosen": 5.489479064941406, + "rewards/margins": 13.690165710449218, + "rewards/rejected": -8.200686645507812, + "step": 1089 + }, + { + "epoch": 0.2987529121556804, + "grad_norm": 2.9375, + "kl": 6.512880802154541, + "learning_rate": 5e-06, + "logits/chosen": -22667570.666666668, + "logits/rejected": -17383560.0, + "logps/chosen": -443.2462972005208, + "logps/rejected": -479.2784830729167, + "loss": 0.0112, + "rewards/chosen": 7.684643427530925, + "rewards/margins": 14.769641240437826, + "rewards/rejected": -7.084997812906901, + "step": 1090 + }, + { + "epoch": 0.2990269973961902, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19108027.636363637, + "logits/rejected": -13640694.153846154, + "logps/chosen": -490.7666015625, + "logps/rejected": -496.0592698317308, + "loss": 0.0617, + "rewards/chosen": 5.911700855601918, + "rewards/margins": 14.251769966178841, + "rewards/rejected": -8.340069110576923, + "step": 1091 + }, + { + "epoch": 0.2993010826367, + "grad_norm": 5.90625, + "kl": 5.34882926940918, + "learning_rate": 5e-06, + "logits/chosen": -10878946.0, + "logits/rejected": -28544346.0, + "logps/chosen": -464.678955078125, + "logps/rejected": -375.37664794921875, + "loss": 0.0313, + "rewards/chosen": 6.705815315246582, + "rewards/margins": 14.166656494140625, + "rewards/rejected": -7.460841178894043, + "step": 1092 + }, + { + "epoch": 0.29957516787720984, + "grad_norm": 12.5, + "kl": 18.68014144897461, + "learning_rate": 5e-06, + "logits/chosen": -9616103.111111112, + "logits/rejected": 10151607.333333334, + "logps/chosen": -468.7623697916667, + "logps/rejected": -459.9700520833333, + "loss": 0.0842, + "rewards/chosen": 6.068935818142361, + "rewards/margins": 11.580194897121853, + "rewards/rejected": -5.511259078979492, + "step": 1093 + }, + { + "epoch": 0.2998492531177196, + "grad_norm": 8.0625, + "kl": 0.23200353980064392, + "learning_rate": 5e-06, + "logits/chosen": -8954899.076923076, + "logits/rejected": -15458192.0, + "logps/chosen": -438.33188100961536, + "logps/rejected": -544.5822088068181, + "loss": 0.0396, + "rewards/chosen": 5.914050762469952, + "rewards/margins": 14.054758992228475, + "rewards/rejected": -8.140708229758523, + "step": 1094 + }, + { + "epoch": 0.3001233383582294, + "grad_norm": 4.0, + "kl": 1.9282376766204834, + "learning_rate": 5e-06, + "logits/chosen": -27572057.6, + "logits/rejected": -19553472.0, + "logps/chosen": -507.26526692708336, + "logps/rejected": -418.2261013454861, + "loss": 0.0314, + "rewards/chosen": 6.777867126464844, + "rewards/margins": 13.642784457736546, + "rewards/rejected": -6.864917331271702, + "step": 1095 + }, + { + "epoch": 0.3003974235987392, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34849160.53333333, + "logits/rejected": -35274492.44444445, + "logps/chosen": -473.1590169270833, + "logps/rejected": -361.50086805555554, + "loss": 0.0167, + "rewards/chosen": 6.4980412801106775, + "rewards/margins": 13.608519575330947, + "rewards/rejected": -7.110478295220269, + "step": 1096 + }, + { + "epoch": 0.300671508839249, + "grad_norm": 4.875, + "kl": 4.424947738647461, + "learning_rate": 5e-06, + "logits/chosen": -21203251.692307692, + "logits/rejected": -14793579.636363637, + "logps/chosen": -425.73745492788464, + "logps/rejected": -429.84224076704544, + "loss": 0.0212, + "rewards/chosen": 6.34053215613732, + "rewards/margins": 13.352931309413243, + "rewards/rejected": -7.012399153275923, + "step": 1097 + }, + { + "epoch": 0.3009455940797588, + "grad_norm": 9.8125, + "kl": 3.948117733001709, + "learning_rate": 5e-06, + "logits/chosen": -16994389.714285713, + "logits/rejected": 20260006.4, + "logps/chosen": -335.0615931919643, + "logps/rejected": -589.384521484375, + "loss": 0.0825, + "rewards/chosen": 4.18879154750279, + "rewards/margins": 12.486162458147321, + "rewards/rejected": -8.297370910644531, + "step": 1098 + }, + { + "epoch": 0.3012196793202686, + "grad_norm": 5.9375, + "kl": 1.8686473369598389, + "learning_rate": 5e-06, + "logits/chosen": -35991478.15384615, + "logits/rejected": -22958000.0, + "logps/chosen": -542.4319411057693, + "logps/rejected": -473.32954545454544, + "loss": 0.0201, + "rewards/chosen": 7.028864933894231, + "rewards/margins": 13.908925303212413, + "rewards/rejected": -6.880060369318182, + "step": 1099 + }, + { + "epoch": 0.3014937645607784, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6031541.090909091, + "logits/rejected": 2938423.6923076925, + "logps/chosen": -457.18954190340907, + "logps/rejected": -717.6520432692307, + "loss": 0.0269, + "rewards/chosen": 7.409467523748225, + "rewards/margins": 19.386979589929112, + "rewards/rejected": -11.97751206618089, + "step": 1100 + }, + { + "epoch": 0.3017678498012882, + "grad_norm": 7.125, + "kl": 0.8637079000473022, + "learning_rate": 5e-06, + "logits/chosen": -28092068.923076924, + "logits/rejected": -3085385.090909091, + "logps/chosen": -377.3088191105769, + "logps/rejected": -295.3263494318182, + "loss": 0.0616, + "rewards/chosen": 5.493105961726262, + "rewards/margins": 10.273266345470935, + "rewards/rejected": -4.780160383744673, + "step": 1101 + }, + { + "epoch": 0.302041935041798, + "grad_norm": 4.65625, + "kl": 5.014363765716553, + "learning_rate": 5e-06, + "logits/chosen": -16456837.333333334, + "logits/rejected": -19389546.666666668, + "logps/chosen": -430.40797526041666, + "logps/rejected": -496.99175347222223, + "loss": 0.0444, + "rewards/chosen": 5.986407470703125, + "rewards/margins": 13.23175523546007, + "rewards/rejected": -7.245347764756945, + "step": 1102 + }, + { + "epoch": 0.3023160202823078, + "grad_norm": 10.1875, + "kl": 1.3516795635223389, + "learning_rate": 5e-06, + "logits/chosen": -7081457.230769231, + "logits/rejected": -11371383.272727273, + "logps/chosen": -474.31531700721155, + "logps/rejected": -430.0437677556818, + "loss": 0.0701, + "rewards/chosen": 5.885730449969952, + "rewards/margins": 12.511324155580748, + "rewards/rejected": -6.625593705610796, + "step": 1103 + }, + { + "epoch": 0.3025901055228176, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3352704.0, + "logits/rejected": 1318257.4285714286, + "logps/chosen": -531.1283203125, + "logps/rejected": -460.05057198660717, + "loss": 0.0713, + "rewards/chosen": 4.3938850402832035, + "rewards/margins": 11.883385358537947, + "rewards/rejected": -7.489500318254743, + "step": 1104 + }, + { + "epoch": 0.3028641907633274, + "grad_norm": 6.84375, + "kl": 5.0400238037109375, + "learning_rate": 5e-06, + "logits/chosen": -26644864.0, + "logits/rejected": -13971832.0, + "logps/chosen": -438.41376953125, + "logps/rejected": -469.74720982142856, + "loss": 0.0279, + "rewards/chosen": 5.338924407958984, + "rewards/margins": 14.50471660069057, + "rewards/rejected": -9.165792192731585, + "step": 1105 + }, + { + "epoch": 0.3031382760038372, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15594854.4, + "logits/rejected": -24808946.285714287, + "logps/chosen": -401.133251953125, + "logps/rejected": -584.4102260044643, + "loss": 0.0137, + "rewards/chosen": 5.9389698028564455, + "rewards/margins": 14.065623201642719, + "rewards/rejected": -8.126653398786273, + "step": 1106 + }, + { + "epoch": 0.303412361244347, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41553069.333333336, + "logits/rejected": -16113961.333333334, + "logps/chosen": -327.7891438802083, + "logps/rejected": -740.14599609375, + "loss": 0.0502, + "rewards/chosen": 5.245719909667969, + "rewards/margins": 16.48113250732422, + "rewards/rejected": -11.23541259765625, + "step": 1107 + }, + { + "epoch": 0.30368644648485676, + "grad_norm": 8.75, + "kl": 5.278210639953613, + "learning_rate": 5e-06, + "logits/chosen": -27744164.57142857, + "logits/rejected": -11980136.0, + "logps/chosen": -328.1917201450893, + "logps/rejected": -556.1958984375, + "loss": 0.0802, + "rewards/chosen": 5.312091827392578, + "rewards/margins": 10.576219177246093, + "rewards/rejected": -5.264127349853515, + "step": 1108 + }, + { + "epoch": 0.3039605317253666, + "grad_norm": 4.84375, + "kl": 4.635922908782959, + "learning_rate": 5e-06, + "logits/chosen": -16893806.933333334, + "logits/rejected": -5052448.888888889, + "logps/chosen": -452.74261067708335, + "logps/rejected": -548.5677083333334, + "loss": 0.0153, + "rewards/chosen": 7.021608988444011, + "rewards/margins": 14.54437255859375, + "rewards/rejected": -7.522763570149739, + "step": 1109 + }, + { + "epoch": 0.3042346169658764, + "grad_norm": 7.53125, + "kl": 3.9188945293426514, + "learning_rate": 5e-06, + "logits/chosen": -37166240.0, + "logits/rejected": -31446748.8, + "logps/chosen": -446.92738560267856, + "logps/rejected": -562.90361328125, + "loss": 0.0821, + "rewards/chosen": 5.226675306047712, + "rewards/margins": 13.841503034319196, + "rewards/rejected": -8.614827728271484, + "step": 1110 + }, + { + "epoch": 0.3045087022063862, + "grad_norm": 14.8125, + "kl": 10.944302558898926, + "learning_rate": 5e-06, + "logits/chosen": -6210255.5, + "logits/rejected": -35833408.0, + "logps/chosen": -472.58660888671875, + "logps/rejected": -375.86749267578125, + "loss": 0.053, + "rewards/chosen": 6.941847801208496, + "rewards/margins": 13.216641426086426, + "rewards/rejected": -6.27479362487793, + "step": 1111 + }, + { + "epoch": 0.30478278744689596, + "grad_norm": 0.96484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5444774.8, + "logits/rejected": -7245877.142857143, + "logps/chosen": -540.574365234375, + "logps/rejected": -568.0019182477679, + "loss": 0.0025, + "rewards/chosen": 7.858541870117188, + "rewards/margins": 18.300874546595985, + "rewards/rejected": -10.442332676478795, + "step": 1112 + }, + { + "epoch": 0.3050568726874058, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24649878.4, + "logits/rejected": -20868141.714285713, + "logps/chosen": -480.0603515625, + "logps/rejected": -444.13260323660717, + "loss": 0.019, + "rewards/chosen": 6.729631042480468, + "rewards/margins": 14.462582070486885, + "rewards/rejected": -7.7329510280064175, + "step": 1113 + }, + { + "epoch": 0.3053309579279156, + "grad_norm": 12.625, + "kl": 9.827690124511719, + "learning_rate": 5e-06, + "logits/chosen": -29150296.615384616, + "logits/rejected": -10360420.363636363, + "logps/chosen": -473.0603215144231, + "logps/rejected": -569.5564630681819, + "loss": 0.0615, + "rewards/chosen": 6.2190716083233175, + "rewards/margins": 15.068694161368416, + "rewards/rejected": -8.8496225530451, + "step": 1114 + }, + { + "epoch": 0.3056050431684254, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4732742.769230769, + "logits/rejected": 11744700.363636363, + "logps/chosen": -424.76370943509613, + "logps/rejected": -590.1255326704545, + "loss": 0.0197, + "rewards/chosen": 5.981447073129507, + "rewards/margins": 14.154662072241724, + "rewards/rejected": -8.173214999112217, + "step": 1115 + }, + { + "epoch": 0.30587912840893516, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26557486.545454547, + "logits/rejected": -37149907.692307696, + "logps/chosen": -454.08225319602275, + "logps/rejected": -527.5900691105769, + "loss": 0.0375, + "rewards/chosen": 5.985993818803267, + "rewards/margins": 13.461940098475743, + "rewards/rejected": -7.475946279672476, + "step": 1116 + }, + { + "epoch": 0.306153213649445, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4081049.6666666665, + "logits/rejected": 4081008.0, + "logps/chosen": -458.5350748697917, + "logps/rejected": -520.7919514973959, + "loss": 0.0214, + "rewards/chosen": 6.024700164794922, + "rewards/margins": 15.293402353922525, + "rewards/rejected": -9.268702189127604, + "step": 1117 + }, + { + "epoch": 0.3064272988899548, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15540549.333333334, + "logits/rejected": 5576996.666666667, + "logps/chosen": -469.4650065104167, + "logps/rejected": -810.830322265625, + "loss": 0.0513, + "rewards/chosen": 5.644620259602864, + "rewards/margins": 16.225018819173176, + "rewards/rejected": -10.580398559570312, + "step": 1118 + }, + { + "epoch": 0.30670138413046455, + "grad_norm": 5.71875, + "kl": 0.6377710103988647, + "learning_rate": 5e-06, + "logits/chosen": -12035082.4, + "logits/rejected": -36782317.71428572, + "logps/chosen": -386.558837890625, + "logps/rejected": -618.1937779017857, + "loss": 0.029, + "rewards/chosen": 4.778298187255859, + "rewards/margins": 13.055484444754462, + "rewards/rejected": -8.277186257498604, + "step": 1119 + }, + { + "epoch": 0.30697546937097436, + "grad_norm": 5.71875, + "kl": 0.059848152101039886, + "learning_rate": 5e-06, + "logits/chosen": -4329021.142857143, + "logits/rejected": 1868146.9411764706, + "logps/chosen": -363.5145786830357, + "logps/rejected": -422.4412626378676, + "loss": 0.0225, + "rewards/chosen": 6.3840130397251675, + "rewards/margins": 13.100170840736197, + "rewards/rejected": -6.716157801011029, + "step": 1120 + }, + { + "epoch": 0.3072495546114842, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14929307.2, + "logits/rejected": -39248109.71428572, + "logps/chosen": -520.995947265625, + "logps/rejected": -515.9207938058036, + "loss": 0.0188, + "rewards/chosen": 6.51365966796875, + "rewards/margins": 15.609920283726282, + "rewards/rejected": -9.096260615757533, + "step": 1121 + }, + { + "epoch": 0.307523639851994, + "grad_norm": 10.6875, + "kl": 4.210796356201172, + "learning_rate": 5e-06, + "logits/chosen": -6896560.0, + "logits/rejected": -30107827.2, + "logps/chosen": -446.34444754464283, + "logps/rejected": -471.1564453125, + "loss": 0.0522, + "rewards/chosen": 6.8186830793108255, + "rewards/margins": 12.978336552211216, + "rewards/rejected": -6.1596534729003904, + "step": 1122 + }, + { + "epoch": 0.30779772509250375, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14698108.57142857, + "logits/rejected": -1829652.0, + "logps/chosen": -430.99490792410717, + "logps/rejected": -486.8235294117647, + "loss": 0.0132, + "rewards/chosen": 6.43873051234654, + "rewards/margins": 15.55770418824268, + "rewards/rejected": -9.11897367589614, + "step": 1123 + }, + { + "epoch": 0.30807181033301356, + "grad_norm": 12.6875, + "kl": 6.803936004638672, + "learning_rate": 5e-06, + "logits/chosen": -24908819.2, + "logits/rejected": -6831124.444444444, + "logps/chosen": -424.61640625, + "logps/rejected": -677.0373806423611, + "loss": 0.0579, + "rewards/chosen": 5.829174296061198, + "rewards/margins": 16.531361728244356, + "rewards/rejected": -10.702187432183159, + "step": 1124 + }, + { + "epoch": 0.3083458955735234, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28876547.2, + "logits/rejected": -11089851.42857143, + "logps/chosen": -374.8903564453125, + "logps/rejected": -718.9300362723214, + "loss": 0.0434, + "rewards/chosen": 5.724866485595703, + "rewards/margins": 19.45114778791155, + "rewards/rejected": -13.726281302315849, + "step": 1125 + }, + { + "epoch": 0.3086199808140332, + "grad_norm": 9.25, + "kl": 6.458516597747803, + "learning_rate": 5e-06, + "logits/chosen": -27177826.666666668, + "logits/rejected": -16957969.333333332, + "logps/chosen": -433.68896484375, + "logps/rejected": -445.2791341145833, + "loss": 0.0604, + "rewards/chosen": 5.69816525777181, + "rewards/margins": 13.222309112548828, + "rewards/rejected": -7.5241438547770185, + "step": 1126 + }, + { + "epoch": 0.30889406605454295, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12285541.6, + "logits/rejected": -25067698.285714287, + "logps/chosen": -522.24833984375, + "logps/rejected": -579.6522391183036, + "loss": 0.0093, + "rewards/chosen": 7.550592041015625, + "rewards/margins": 18.986725289481026, + "rewards/rejected": -11.436133248465401, + "step": 1127 + }, + { + "epoch": 0.30916815129505276, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26853388.8, + "logits/rejected": -22417142.85714286, + "logps/chosen": -500.7642578125, + "logps/rejected": -527.8025251116071, + "loss": 0.0554, + "rewards/chosen": 4.376620864868164, + "rewards/margins": 14.514084025791712, + "rewards/rejected": -10.137463160923549, + "step": 1128 + }, + { + "epoch": 0.3094422365355626, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19761684.363636363, + "logits/rejected": -20397795.692307692, + "logps/chosen": -382.1985973011364, + "logps/rejected": -504.7409480168269, + "loss": 0.1015, + "rewards/chosen": 4.027758858420632, + "rewards/margins": 14.000074720049238, + "rewards/rejected": -9.972315861628605, + "step": 1129 + }, + { + "epoch": 0.30971632177607233, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17062910.545454547, + "logits/rejected": -18836011.076923076, + "logps/chosen": -515.5007990056819, + "logps/rejected": -462.1457331730769, + "loss": 0.0311, + "rewards/chosen": 7.242712541060015, + "rewards/margins": 16.808415713010135, + "rewards/rejected": -9.56570317195012, + "step": 1130 + }, + { + "epoch": 0.30999040701658215, + "grad_norm": 13.625, + "kl": 7.98950719833374, + "learning_rate": 5e-06, + "logits/chosen": -26201685.333333332, + "logits/rejected": -38847125.333333336, + "logps/chosen": -478.8164876302083, + "logps/rejected": -548.8184814453125, + "loss": 0.0561, + "rewards/chosen": 4.826607386271159, + "rewards/margins": 15.14499346415202, + "rewards/rejected": -10.31838607788086, + "step": 1131 + }, + { + "epoch": 0.31026449225709196, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37136998.4, + "logits/rejected": -32405074.285714287, + "logps/chosen": -559.0873046875, + "logps/rejected": -398.1171177455357, + "loss": 0.0194, + "rewards/chosen": 7.2521202087402346, + "rewards/margins": 16.099533952985492, + "rewards/rejected": -8.847413744245257, + "step": 1132 + }, + { + "epoch": 0.3105385774976018, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 36402940.44444445, + "logits/rejected": -4598411.2, + "logps/chosen": -526.6043294270834, + "logps/rejected": -527.4513020833333, + "loss": 0.0528, + "rewards/chosen": 5.774631924099392, + "rewards/margins": 12.982745530870226, + "rewards/rejected": -7.208113606770834, + "step": 1133 + }, + { + "epoch": 0.31081266273811153, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22183154.666666668, + "logits/rejected": -27090005.333333332, + "logps/chosen": -404.3594156901042, + "logps/rejected": -669.7356770833334, + "loss": 0.0085, + "rewards/chosen": 6.2249806722005205, + "rewards/margins": 18.799392700195312, + "rewards/rejected": -12.574412027994791, + "step": 1134 + }, + { + "epoch": 0.31108674797862135, + "grad_norm": 15.1875, + "kl": 11.777234077453613, + "learning_rate": 5e-06, + "logits/chosen": -15224299.789473685, + "logits/rejected": -4054100.8, + "logps/chosen": -482.1708470394737, + "logps/rejected": -533.596142578125, + "loss": 0.0649, + "rewards/chosen": 6.104137219880757, + "rewards/margins": 13.780396069978412, + "rewards/rejected": -7.676258850097656, + "step": 1135 + }, + { + "epoch": 0.31136083321913116, + "grad_norm": 13.0625, + "kl": 5.362753868103027, + "learning_rate": 5e-06, + "logits/chosen": -10899944.0, + "logits/rejected": -4430133.142857143, + "logps/chosen": -492.382421875, + "logps/rejected": -431.46010044642856, + "loss": 0.0561, + "rewards/chosen": 6.355989837646485, + "rewards/margins": 14.646747153145927, + "rewards/rejected": -8.290757315499443, + "step": 1136 + }, + { + "epoch": 0.311634918459641, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8372220.0, + "logits/rejected": -15667513.333333334, + "logps/chosen": -355.4342447916667, + "logps/rejected": -419.4850667317708, + "loss": 0.0927, + "rewards/chosen": 4.725744247436523, + "rewards/margins": 13.69633928934733, + "rewards/rejected": -8.970595041910807, + "step": 1137 + }, + { + "epoch": 0.31190900370015073, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24273705.14285714, + "logits/rejected": -30262032.0, + "logps/chosen": -356.79990931919644, + "logps/rejected": -622.3302734375, + "loss": 0.0333, + "rewards/chosen": 5.3051959446498325, + "rewards/margins": 15.648598044259206, + "rewards/rejected": -10.343402099609374, + "step": 1138 + }, + { + "epoch": 0.31218308894066055, + "grad_norm": 5.9375, + "kl": 7.575168609619141, + "learning_rate": 5e-06, + "logits/chosen": -17733692.307692308, + "logits/rejected": -11937655.272727273, + "logps/chosen": -396.5891676682692, + "logps/rejected": -685.6845259232955, + "loss": 0.0216, + "rewards/chosen": 6.830788832444411, + "rewards/margins": 20.215283960729213, + "rewards/rejected": -13.3844951282848, + "step": 1139 + }, + { + "epoch": 0.31245717418117036, + "grad_norm": 8.125, + "kl": 0.36707115173339844, + "learning_rate": 5e-06, + "logits/chosen": -22980775.111111112, + "logits/rejected": -7483971.2, + "logps/chosen": -414.5800509982639, + "logps/rejected": -522.08681640625, + "loss": 0.0339, + "rewards/chosen": 5.329615698920356, + "rewards/margins": 13.497943539089626, + "rewards/rejected": -8.16832784016927, + "step": 1140 + }, + { + "epoch": 0.3127312594216801, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5628656.0, + "logits/rejected": -28622709.333333332, + "logps/chosen": -390.4486490885417, + "logps/rejected": -561.354248046875, + "loss": 0.0398, + "rewards/chosen": 5.95725949605306, + "rewards/margins": 13.520312627156574, + "rewards/rejected": -7.563053131103516, + "step": 1141 + }, + { + "epoch": 0.31300534466218993, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1750119.3846153845, + "logits/rejected": -13354810.181818182, + "logps/chosen": -475.03087439903845, + "logps/rejected": -551.2029030539773, + "loss": 0.0448, + "rewards/chosen": 6.735497107872596, + "rewards/margins": 16.007895623053702, + "rewards/rejected": -9.272398515181107, + "step": 1142 + }, + { + "epoch": 0.31327942990269975, + "grad_norm": 7.53125, + "kl": 13.705860137939453, + "learning_rate": 5e-06, + "logits/chosen": -42123177.14285714, + "logits/rejected": -28303692.8, + "logps/chosen": -439.58297293526783, + "logps/rejected": -451.337451171875, + "loss": 0.0211, + "rewards/chosen": 6.2049745832170755, + "rewards/margins": 16.425059727260045, + "rewards/rejected": -10.220085144042969, + "step": 1143 + }, + { + "epoch": 0.31355351514320956, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12708219.42857143, + "logits/rejected": -6455980.8, + "logps/chosen": -434.91796875, + "logps/rejected": -442.08388671875, + "loss": 0.0827, + "rewards/chosen": 5.00634275163923, + "rewards/margins": 15.360912758963448, + "rewards/rejected": -10.354570007324218, + "step": 1144 + }, + { + "epoch": 0.3138276003837193, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26256867.555555556, + "logits/rejected": -11869030.4, + "logps/chosen": -538.5508897569445, + "logps/rejected": -492.8033203125, + "loss": 0.0353, + "rewards/chosen": 5.790754106309679, + "rewards/margins": 15.462355465359158, + "rewards/rejected": -9.671601359049479, + "step": 1145 + }, + { + "epoch": 0.31410168562422913, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15936115.692307692, + "logits/rejected": -13732283.636363637, + "logps/chosen": -431.0832331730769, + "logps/rejected": -420.4944513494318, + "loss": 0.0364, + "rewards/chosen": 5.07518298809345, + "rewards/margins": 14.975565983698917, + "rewards/rejected": -9.900382995605469, + "step": 1146 + }, + { + "epoch": 0.31437577086473895, + "grad_norm": 9.125, + "kl": 8.274698257446289, + "learning_rate": 5e-06, + "logits/chosen": -8364762.352941177, + "logits/rejected": -4672750.285714285, + "logps/chosen": -361.94450827205884, + "logps/rejected": -514.7030552455357, + "loss": 0.0875, + "rewards/chosen": 5.5512856876148895, + "rewards/margins": 12.671623037642792, + "rewards/rejected": -7.120337350027902, + "step": 1147 + }, + { + "epoch": 0.31464985610524876, + "grad_norm": 10.5, + "kl": 0.43775051832199097, + "learning_rate": 5e-06, + "logits/chosen": -14476633.846153846, + "logits/rejected": -6956013.090909091, + "logps/chosen": -452.74158653846155, + "logps/rejected": -522.5855823863636, + "loss": 0.0334, + "rewards/chosen": 4.967149587777945, + "rewards/margins": 17.044529681439165, + "rewards/rejected": -12.07738009366122, + "step": 1148 + }, + { + "epoch": 0.3149239413457585, + "grad_norm": 9.5, + "kl": 6.221829414367676, + "learning_rate": 5e-06, + "logits/chosen": -9924899.0, + "logits/rejected": -26917100.0, + "logps/chosen": -313.40533447265625, + "logps/rejected": -396.9111022949219, + "loss": 0.0729, + "rewards/chosen": 5.033756732940674, + "rewards/margins": 13.36721658706665, + "rewards/rejected": -8.333459854125977, + "step": 1149 + }, + { + "epoch": 0.31519802658626833, + "grad_norm": 11.6875, + "kl": 9.071938514709473, + "learning_rate": 5e-06, + "logits/chosen": 65293013.333333336, + "logits/rejected": -17332666.666666668, + "logps/chosen": -479.1280924479167, + "logps/rejected": -462.8512912326389, + "loss": 0.0416, + "rewards/chosen": 7.838744099934896, + "rewards/margins": 15.974749077690973, + "rewards/rejected": -8.136004977756077, + "step": 1150 + }, + { + "epoch": 0.31547211182677815, + "grad_norm": 10.5625, + "kl": 7.703329086303711, + "learning_rate": 5e-06, + "logits/chosen": -22556315.42857143, + "logits/rejected": -11899918.4, + "logps/chosen": -493.83583286830356, + "logps/rejected": -467.24814453125, + "loss": 0.0424, + "rewards/chosen": 6.358802250453404, + "rewards/margins": 12.117681721278599, + "rewards/rejected": -5.758879470825195, + "step": 1151 + }, + { + "epoch": 0.3157461970672879, + "grad_norm": 4.9375, + "kl": 4.577509880065918, + "learning_rate": 5e-06, + "logits/chosen": -2467461.3333333335, + "logits/rejected": 2349169.0, + "logps/chosen": -372.2451985677083, + "logps/rejected": -506.5409342447917, + "loss": 0.0515, + "rewards/chosen": 6.364952087402344, + "rewards/margins": 13.255651473999023, + "rewards/rejected": -6.89069938659668, + "step": 1152 + }, + { + "epoch": 0.3160202823077977, + "grad_norm": 7.21875, + "kl": 1.3168342113494873, + "learning_rate": 5e-06, + "logits/chosen": -6725494.0, + "logits/rejected": -32180148.0, + "logps/chosen": -502.929931640625, + "logps/rejected": -595.3538818359375, + "loss": 0.0424, + "rewards/chosen": 6.250344753265381, + "rewards/margins": 16.57170534133911, + "rewards/rejected": -10.32136058807373, + "step": 1153 + }, + { + "epoch": 0.31629436754830753, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17354492.8, + "logits/rejected": 2194358.8571428573, + "logps/chosen": -429.759326171875, + "logps/rejected": -518.90673828125, + "loss": 0.0538, + "rewards/chosen": 5.85051383972168, + "rewards/margins": 12.264706584385465, + "rewards/rejected": -6.414192744663784, + "step": 1154 + }, + { + "epoch": 0.31656845278881734, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9298368.0, + "logits/rejected": -31593867.636363637, + "logps/chosen": -440.1586162860577, + "logps/rejected": -511.07297585227275, + "loss": 0.0478, + "rewards/chosen": 5.76588146503155, + "rewards/margins": 14.465150846467985, + "rewards/rejected": -8.699269381436435, + "step": 1155 + }, + { + "epoch": 0.3168425380293271, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23880467.692307692, + "logits/rejected": -3423785.4545454546, + "logps/chosen": -447.4307391826923, + "logps/rejected": -541.3955965909091, + "loss": 0.0092, + "rewards/chosen": 6.466619638296274, + "rewards/margins": 15.361125919368718, + "rewards/rejected": -8.894506281072443, + "step": 1156 + }, + { + "epoch": 0.3171166232698369, + "grad_norm": 5.25, + "kl": 2.1809730529785156, + "learning_rate": 5e-06, + "logits/chosen": -21716980.0, + "logits/rejected": 8338357.333333333, + "logps/chosen": -392.3838704427083, + "logps/rejected": -682.95654296875, + "loss": 0.0212, + "rewards/chosen": 5.402378082275391, + "rewards/margins": 16.67902628580729, + "rewards/rejected": -11.2766482035319, + "step": 1157 + }, + { + "epoch": 0.31739070851034673, + "grad_norm": 11.8125, + "kl": 11.737926483154297, + "learning_rate": 5e-06, + "logits/chosen": -7441884.0, + "logits/rejected": -2892612.6666666665, + "logps/chosen": -422.9606119791667, + "logps/rejected": -500.9922688802083, + "loss": 0.1305, + "rewards/chosen": 5.026755650838216, + "rewards/margins": 12.21358553568522, + "rewards/rejected": -7.186829884847005, + "step": 1158 + }, + { + "epoch": 0.3176647937508565, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29603122.666666668, + "logits/rejected": -19160085.333333332, + "logps/chosen": -407.663818359375, + "logps/rejected": -512.3623860677084, + "loss": 0.0538, + "rewards/chosen": 6.235420227050781, + "rewards/margins": 14.011001586914062, + "rewards/rejected": -7.775581359863281, + "step": 1159 + }, + { + "epoch": 0.3179388789913663, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 15095870.666666666, + "logits/rejected": -22269488.0, + "logps/chosen": -364.275634765625, + "logps/rejected": -513.6405436197916, + "loss": 0.0559, + "rewards/chosen": 5.396472930908203, + "rewards/margins": 12.659716288248699, + "rewards/rejected": -7.263243357340495, + "step": 1160 + }, + { + "epoch": 0.3182129642318761, + "grad_norm": 5.40625, + "kl": 9.279672622680664, + "learning_rate": 5e-06, + "logits/chosen": -11623190.857142856, + "logits/rejected": -8413583.2, + "logps/chosen": -507.645751953125, + "logps/rejected": -586.5779296875, + "loss": 0.0132, + "rewards/chosen": 7.289750235421317, + "rewards/margins": 14.292203085763113, + "rewards/rejected": -7.002452850341797, + "step": 1161 + }, + { + "epoch": 0.31848704947238593, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12235054.857142856, + "logits/rejected": -20937072.0, + "logps/chosen": -433.22140066964283, + "logps/rejected": -589.396826171875, + "loss": 0.0528, + "rewards/chosen": 5.0459153311593195, + "rewards/margins": 13.420089830671039, + "rewards/rejected": -8.374174499511719, + "step": 1162 + }, + { + "epoch": 0.3187611347128957, + "grad_norm": 5.03125, + "kl": 0.33131250739097595, + "learning_rate": 5e-06, + "logits/chosen": -9182752.0, + "logits/rejected": -11226645.090909092, + "logps/chosen": -305.6901292067308, + "logps/rejected": -544.9593394886364, + "loss": 0.0524, + "rewards/chosen": 4.891591585599459, + "rewards/margins": 12.744571072238308, + "rewards/rejected": -7.85297948663885, + "step": 1163 + }, + { + "epoch": 0.3190352199534055, + "grad_norm": 7.90625, + "kl": 14.131340980529785, + "learning_rate": 5e-06, + "logits/chosen": -31129692.444444444, + "logits/rejected": -23249680.0, + "logps/chosen": -507.21185980902777, + "logps/rejected": -538.7060139973959, + "loss": 0.0778, + "rewards/chosen": 7.067195468478733, + "rewards/margins": 12.793799930148655, + "rewards/rejected": -5.726604461669922, + "step": 1164 + }, + { + "epoch": 0.3193093051939153, + "grad_norm": 8.25, + "kl": 6.756797790527344, + "learning_rate": 5e-06, + "logits/chosen": 829686.5882352941, + "logits/rejected": -7055081.142857143, + "logps/chosen": -447.29210707720586, + "logps/rejected": -720.0415736607143, + "loss": 0.0619, + "rewards/chosen": 6.437074549057904, + "rewards/margins": 15.577076663490102, + "rewards/rejected": -9.140002114432198, + "step": 1165 + }, + { + "epoch": 0.31958339043442513, + "grad_norm": 5.5, + "kl": 1.960250973701477, + "learning_rate": 5e-06, + "logits/chosen": -16460730.285714285, + "logits/rejected": -3975685.2, + "logps/chosen": -432.08067103794644, + "logps/rejected": -594.32265625, + "loss": 0.0198, + "rewards/chosen": 6.866404942103794, + "rewards/margins": 14.411066654750279, + "rewards/rejected": -7.5446617126464846, + "step": 1166 + }, + { + "epoch": 0.3198574756749349, + "grad_norm": 11.4375, + "kl": 1.6100267171859741, + "learning_rate": 5e-06, + "logits/chosen": -8692463.272727273, + "logits/rejected": 11477192.615384616, + "logps/chosen": -458.4069158380682, + "logps/rejected": -479.3219651442308, + "loss": 0.062, + "rewards/chosen": 5.5694580078125, + "rewards/margins": 11.199720529409555, + "rewards/rejected": -5.630262521597055, + "step": 1167 + }, + { + "epoch": 0.3201315609154447, + "grad_norm": 5.46875, + "kl": 1.4560165405273438, + "learning_rate": 5e-06, + "logits/chosen": 2303709.8, + "logits/rejected": 8425676.57142857, + "logps/chosen": -491.8908203125, + "logps/rejected": -465.68624441964283, + "loss": 0.0171, + "rewards/chosen": 7.9257354736328125, + "rewards/margins": 14.806507110595703, + "rewards/rejected": -6.880771636962891, + "step": 1168 + }, + { + "epoch": 0.3204056461559545, + "grad_norm": 5.5625, + "kl": 6.092613220214844, + "learning_rate": 5e-06, + "logits/chosen": -8744929.714285715, + "logits/rejected": -13399074.4, + "logps/chosen": -329.21397181919644, + "logps/rejected": -598.3533203125, + "loss": 0.0875, + "rewards/chosen": 5.373543875558036, + "rewards/margins": 13.651844351632253, + "rewards/rejected": -8.278300476074218, + "step": 1169 + }, + { + "epoch": 0.3206797313964643, + "grad_norm": 5.46875, + "kl": 3.053645133972168, + "learning_rate": 5e-06, + "logits/chosen": -21026009.6, + "logits/rejected": -6414764.444444444, + "logps/chosen": -456.8955403645833, + "logps/rejected": -422.5910915798611, + "loss": 0.0312, + "rewards/chosen": 6.026104736328125, + "rewards/margins": 14.366967434353299, + "rewards/rejected": -8.340862698025173, + "step": 1170 + }, + { + "epoch": 0.3209538166369741, + "grad_norm": 8.75, + "kl": 1.2905133962631226, + "learning_rate": 5e-06, + "logits/chosen": -11321156.363636363, + "logits/rejected": -1434900.6153846155, + "logps/chosen": -471.4459339488636, + "logps/rejected": -591.0265925480769, + "loss": 0.0243, + "rewards/chosen": 5.98569765957919, + "rewards/margins": 13.28016059715431, + "rewards/rejected": -7.29446293757512, + "step": 1171 + }, + { + "epoch": 0.3212279018774839, + "grad_norm": 1.53125, + "kl": 0.08454259485006332, + "learning_rate": 5e-06, + "logits/chosen": 15376942.4, + "logits/rejected": -14523124.57142857, + "logps/chosen": -505.79921875, + "logps/rejected": -701.3679547991071, + "loss": 0.0061, + "rewards/chosen": 6.9858558654785154, + "rewards/margins": 17.133606719970704, + "rewards/rejected": -10.147750854492188, + "step": 1172 + }, + { + "epoch": 0.3215019871179937, + "grad_norm": 6.75, + "kl": 0.3732573390007019, + "learning_rate": 5e-06, + "logits/chosen": -5336387.733333333, + "logits/rejected": -13616911.111111112, + "logps/chosen": -357.4832356770833, + "logps/rejected": -375.6852213541667, + "loss": 0.0377, + "rewards/chosen": 6.381930541992188, + "rewards/margins": 13.58549296061198, + "rewards/rejected": -7.203562418619792, + "step": 1173 + }, + { + "epoch": 0.3217760723585035, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15811042.461538462, + "logits/rejected": -1010605.4545454546, + "logps/chosen": -417.6706355168269, + "logps/rejected": -329.10074129971593, + "loss": 0.0473, + "rewards/chosen": 5.559964106633113, + "rewards/margins": 11.75258897901415, + "rewards/rejected": -6.192624872381037, + "step": 1174 + }, + { + "epoch": 0.3220501575990133, + "grad_norm": 8.125, + "kl": 0.08994357287883759, + "learning_rate": 5e-06, + "logits/chosen": 2618691.2, + "logits/rejected": -22097147.42857143, + "logps/chosen": -444.18076171875, + "logps/rejected": -534.7527204241071, + "loss": 0.03, + "rewards/chosen": 5.637625122070313, + "rewards/margins": 13.540726906912667, + "rewards/rejected": -7.903101784842355, + "step": 1175 + }, + { + "epoch": 0.3223242428395231, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5552379.692307692, + "logits/rejected": -17981589.818181816, + "logps/chosen": -456.2801983173077, + "logps/rejected": -680.3558238636364, + "loss": 0.0138, + "rewards/chosen": 6.37734867976262, + "rewards/margins": 16.22558753807228, + "rewards/rejected": -9.848238858309658, + "step": 1176 + }, + { + "epoch": 0.3225983280800329, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16245242.666666666, + "logits/rejected": -25798634.666666668, + "logps/chosen": -399.1853841145833, + "logps/rejected": -558.6941731770834, + "loss": 0.0754, + "rewards/chosen": 4.876808802286784, + "rewards/margins": 14.10021146138509, + "rewards/rejected": -9.223402659098307, + "step": 1177 + }, + { + "epoch": 0.3228724133205427, + "grad_norm": 5.84375, + "kl": 0.3276354670524597, + "learning_rate": 5e-06, + "logits/chosen": -9778686.666666666, + "logits/rejected": 1758652.6666666667, + "logps/chosen": -435.9910888671875, + "logps/rejected": -428.4689534505208, + "loss": 0.0444, + "rewards/chosen": 6.060552597045898, + "rewards/margins": 12.655286153157551, + "rewards/rejected": -6.594733556111653, + "step": 1178 + }, + { + "epoch": 0.3231464985610525, + "grad_norm": 8.375, + "kl": 13.017390251159668, + "learning_rate": 5e-06, + "logits/chosen": -8752453.866666667, + "logits/rejected": -8682142.222222222, + "logps/chosen": -460.2688802083333, + "logps/rejected": -410.3181966145833, + "loss": 0.0309, + "rewards/chosen": 6.629913330078125, + "rewards/margins": 14.555469936794704, + "rewards/rejected": -7.9255566067165795, + "step": 1179 + }, + { + "epoch": 0.3234205838015623, + "grad_norm": 4.5625, + "kl": 8.250862121582031, + "learning_rate": 5e-06, + "logits/chosen": -37483648.0, + "logits/rejected": -9787162.666666666, + "logps/chosen": -402.93662109375, + "logps/rejected": -531.7704535590278, + "loss": 0.0498, + "rewards/chosen": 6.700354512532552, + "rewards/margins": 14.521513875325521, + "rewards/rejected": -7.821159362792969, + "step": 1180 + }, + { + "epoch": 0.32369466904207206, + "grad_norm": 13.4375, + "kl": 10.156219482421875, + "learning_rate": 5e-06, + "logits/chosen": -2263015.3333333335, + "logits/rejected": 5843202.666666667, + "logps/chosen": -464.8185628255208, + "logps/rejected": -539.9193522135416, + "loss": 0.1001, + "rewards/chosen": 6.347422917683919, + "rewards/margins": 12.04693857828776, + "rewards/rejected": -5.699515660603841, + "step": 1181 + }, + { + "epoch": 0.32396875428258187, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3428658.888888889, + "logits/rejected": -16983432.533333335, + "logps/chosen": -374.43500434027777, + "logps/rejected": -651.7907552083333, + "loss": 0.0451, + "rewards/chosen": 5.398522694905599, + "rewards/margins": 13.393142445882162, + "rewards/rejected": -7.994619750976563, + "step": 1182 + }, + { + "epoch": 0.3242428395230917, + "grad_norm": 7.96875, + "kl": 1.2424037456512451, + "learning_rate": 5e-06, + "logits/chosen": 2346592.8, + "logits/rejected": 1676519.857142857, + "logps/chosen": -347.45908203125, + "logps/rejected": -598.1847098214286, + "loss": 0.0437, + "rewards/chosen": 5.641571807861328, + "rewards/margins": 14.52960662841797, + "rewards/rejected": -8.88803482055664, + "step": 1183 + }, + { + "epoch": 0.3245169247636015, + "grad_norm": 9.125, + "kl": 3.5108134746551514, + "learning_rate": 5e-06, + "logits/chosen": -23310902.153846152, + "logits/rejected": -21246859.636363637, + "logps/chosen": -506.07992788461536, + "logps/rejected": -458.53835227272725, + "loss": 0.0346, + "rewards/chosen": 5.9981830303485575, + "rewards/margins": 15.133690787362053, + "rewards/rejected": -9.135507757013494, + "step": 1184 + }, + { + "epoch": 0.32479101000411126, + "grad_norm": 12.0, + "kl": 0.776556670665741, + "learning_rate": 5e-06, + "logits/chosen": -20993942.666666668, + "logits/rejected": -12629253.333333334, + "logps/chosen": -398.3246663411458, + "logps/rejected": -405.6863606770833, + "loss": 0.0685, + "rewards/chosen": 6.175860087076823, + "rewards/margins": 12.362746556599935, + "rewards/rejected": -6.186886469523112, + "step": 1185 + }, + { + "epoch": 0.32506509524462107, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20629932.8, + "logits/rejected": -8316620.0, + "logps/chosen": -531.394482421875, + "logps/rejected": -593.3517020089286, + "loss": 0.009, + "rewards/chosen": 6.257732772827149, + "rewards/margins": 14.014148548671177, + "rewards/rejected": -7.756415775844029, + "step": 1186 + }, + { + "epoch": 0.3253391804851309, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16314566.153846154, + "logits/rejected": -10519380.363636363, + "logps/chosen": -419.4812199519231, + "logps/rejected": -789.44921875, + "loss": 0.0356, + "rewards/chosen": 6.7898418719951925, + "rewards/margins": 18.01457193014505, + "rewards/rejected": -11.224730058149857, + "step": 1187 + }, + { + "epoch": 0.3256132657256407, + "grad_norm": 8.5, + "kl": 4.6957292556762695, + "learning_rate": 5e-06, + "logits/chosen": -32607236.0, + "logits/rejected": -21783618.0, + "logps/chosen": -527.1728515625, + "logps/rejected": -517.7452392578125, + "loss": 0.0185, + "rewards/chosen": 7.889807224273682, + "rewards/margins": 15.424062252044678, + "rewards/rejected": -7.534255027770996, + "step": 1188 + }, + { + "epoch": 0.32588735096615046, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10582266.181818182, + "logits/rejected": 55864644.92307692, + "logps/chosen": -313.92001065340907, + "logps/rejected": -639.5745192307693, + "loss": 0.0204, + "rewards/chosen": 6.040659817782315, + "rewards/margins": 16.59304441438688, + "rewards/rejected": -10.552384596604567, + "step": 1189 + }, + { + "epoch": 0.32616143620666027, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8842232.727272727, + "logits/rejected": 51223168.0, + "logps/chosen": -419.49032315340907, + "logps/rejected": -630.9265324519231, + "loss": 0.0636, + "rewards/chosen": 4.6804986433549365, + "rewards/margins": 15.662627080103736, + "rewards/rejected": -10.982128436748798, + "step": 1190 + }, + { + "epoch": 0.3264355214471701, + "grad_norm": 7.71875, + "kl": 1.9881821870803833, + "learning_rate": 5e-06, + "logits/chosen": -41177614.76923077, + "logits/rejected": -14505559.272727273, + "logps/chosen": -373.75792518028845, + "logps/rejected": -545.7056107954545, + "loss": 0.0549, + "rewards/chosen": 4.701269589937651, + "rewards/margins": 14.54410336901258, + "rewards/rejected": -9.84283377907493, + "step": 1191 + }, + { + "epoch": 0.32670960668767984, + "grad_norm": 25.875, + "kl": 3.7162349224090576, + "learning_rate": 5e-06, + "logits/chosen": 8560597.333333334, + "logits/rejected": -9689158.222222222, + "logps/chosen": -477.4779296875, + "logps/rejected": -512.9380967881945, + "loss": 0.0757, + "rewards/chosen": 5.362704467773438, + "rewards/margins": 13.413135613335502, + "rewards/rejected": -8.050431145562065, + "step": 1192 + }, + { + "epoch": 0.32698369192818966, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 350639.8076923077, + "logits/rejected": 24119421.09090909, + "logps/chosen": -462.45590444711536, + "logps/rejected": -665.71484375, + "loss": 0.0451, + "rewards/chosen": 5.872995229867788, + "rewards/margins": 17.070287210957986, + "rewards/rejected": -11.1972919810902, + "step": 1193 + }, + { + "epoch": 0.32725777716869947, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13231088.0, + "logits/rejected": -21208189.53846154, + "logps/chosen": -386.9197443181818, + "logps/rejected": -531.6293194110577, + "loss": 0.0255, + "rewards/chosen": 6.386638294566762, + "rewards/margins": 15.49598101635913, + "rewards/rejected": -9.109342721792368, + "step": 1194 + }, + { + "epoch": 0.3275318624092093, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6177538.857142857, + "logits/rejected": -14275531.2, + "logps/chosen": -400.182861328125, + "logps/rejected": -489.24365234375, + "loss": 0.0502, + "rewards/chosen": 3.91058840070452, + "rewards/margins": 11.756761496407645, + "rewards/rejected": -7.846173095703125, + "step": 1195 + }, + { + "epoch": 0.32780594764971904, + "grad_norm": 9.125, + "kl": 1.8942363262176514, + "learning_rate": 5e-06, + "logits/chosen": -3066000.6153846155, + "logits/rejected": 52575749.81818182, + "logps/chosen": -438.84217247596155, + "logps/rejected": -509.70467862215907, + "loss": 0.0362, + "rewards/chosen": 6.329108018141526, + "rewards/margins": 13.518912095289963, + "rewards/rejected": -7.1898040771484375, + "step": 1196 + }, + { + "epoch": 0.32808003289022886, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6300993.230769231, + "logits/rejected": -12380727.272727273, + "logps/chosen": -358.85659555288464, + "logps/rejected": -496.0945933948864, + "loss": 0.0156, + "rewards/chosen": 6.222024184006911, + "rewards/margins": 13.228989127632621, + "rewards/rejected": -7.00696494362571, + "step": 1197 + }, + { + "epoch": 0.32835411813073867, + "grad_norm": 8.75, + "kl": 2.3236682415008545, + "learning_rate": 5e-06, + "logits/chosen": -30072752.0, + "logits/rejected": -27078258.666666668, + "logps/chosen": -385.8374837239583, + "logps/rejected": -400.2322184244792, + "loss": 0.0432, + "rewards/chosen": 5.3966725667317705, + "rewards/margins": 13.283398310343424, + "rewards/rejected": -7.886725743611653, + "step": 1198 + }, + { + "epoch": 0.3286282033712485, + "grad_norm": 16.625, + "kl": 9.980278015136719, + "learning_rate": 5e-06, + "logits/chosen": -20035949.17647059, + "logits/rejected": -71834.28571428571, + "logps/chosen": -390.58185891544116, + "logps/rejected": -609.9865373883929, + "loss": 0.228, + "rewards/chosen": 4.0956932516659, + "rewards/margins": 12.359976568141906, + "rewards/rejected": -8.264283316476005, + "step": 1199 + }, + { + "epoch": 0.32890228861175824, + "grad_norm": 26.25, + "kl": 2.7658839225769043, + "learning_rate": 5e-06, + "logits/chosen": -2437580.3333333335, + "logits/rejected": -38730010.666666664, + "logps/chosen": -350.1627604166667, + "logps/rejected": -462.8528238932292, + "loss": 0.0689, + "rewards/chosen": 5.1633256276448565, + "rewards/margins": 12.883314768473307, + "rewards/rejected": -7.71998914082845, + "step": 1200 + }, + { + "epoch": 0.32917637385226806, + "grad_norm": 10.5625, + "kl": 4.23895263671875, + "learning_rate": 5e-06, + "logits/chosen": -27449652.363636363, + "logits/rejected": -16687508.923076924, + "logps/chosen": -494.7429865056818, + "logps/rejected": -474.8124248798077, + "loss": 0.0327, + "rewards/chosen": 6.80999755859375, + "rewards/margins": 13.831446721003605, + "rewards/rejected": -7.021449162409856, + "step": 1201 + }, + { + "epoch": 0.32945045909277787, + "grad_norm": 7.34375, + "kl": 2.2956137657165527, + "learning_rate": 5e-06, + "logits/chosen": -14003965.333333334, + "logits/rejected": 6040162.666666667, + "logps/chosen": -401.5078938802083, + "logps/rejected": -543.28759765625, + "loss": 0.0245, + "rewards/chosen": 5.706839243570964, + "rewards/margins": 13.346028010050457, + "rewards/rejected": -7.639188766479492, + "step": 1202 + }, + { + "epoch": 0.32972454433328763, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13439644.8, + "logits/rejected": -7219532.571428572, + "logps/chosen": -478.514501953125, + "logps/rejected": -491.261474609375, + "loss": 0.0059, + "rewards/chosen": 7.041134643554687, + "rewards/margins": 15.3087890625, + "rewards/rejected": -8.267654418945312, + "step": 1203 + }, + { + "epoch": 0.32999862957379744, + "grad_norm": 9.25, + "kl": 8.120466232299805, + "learning_rate": 5e-06, + "logits/chosen": -17117996.307692308, + "logits/rejected": -27815709.09090909, + "logps/chosen": -490.23189603365387, + "logps/rejected": -375.9583629261364, + "loss": 0.0489, + "rewards/chosen": 7.1848930945763225, + "rewards/margins": 13.07268204055466, + "rewards/rejected": -5.887788945978338, + "step": 1204 + }, + { + "epoch": 0.33027271481430726, + "grad_norm": 9.4375, + "kl": 5.146360874176025, + "learning_rate": 5e-06, + "logits/chosen": -26410899.2, + "logits/rejected": -26770181.333333332, + "logps/chosen": -411.0533203125, + "logps/rejected": -532.7998589409722, + "loss": 0.0291, + "rewards/chosen": 6.613288879394531, + "rewards/margins": 14.24958970811632, + "rewards/rejected": -7.636300828721788, + "step": 1205 + }, + { + "epoch": 0.33054680005481707, + "grad_norm": 8.875, + "kl": 0.6698926091194153, + "learning_rate": 5e-06, + "logits/chosen": -24407264.0, + "logits/rejected": -4042576.0, + "logps/chosen": -412.0711263020833, + "logps/rejected": -420.8854573567708, + "loss": 0.0475, + "rewards/chosen": 7.169291814168294, + "rewards/margins": 13.774815241495768, + "rewards/rejected": -6.605523427327474, + "step": 1206 + }, + { + "epoch": 0.3308208852953268, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3838175.6363636362, + "logits/rejected": -28434030.769230768, + "logps/chosen": -455.58735795454544, + "logps/rejected": -501.15268179086536, + "loss": 0.0358, + "rewards/chosen": 5.351837851784446, + "rewards/margins": 13.471432719197306, + "rewards/rejected": -8.11959486741286, + "step": 1207 + }, + { + "epoch": 0.33109497053583664, + "grad_norm": 5.9375, + "kl": 3.0373053550720215, + "learning_rate": 5e-06, + "logits/chosen": -16680700.307692308, + "logits/rejected": 11844421.090909092, + "logps/chosen": -474.7907527043269, + "logps/rejected": -501.40145596590907, + "loss": 0.0279, + "rewards/chosen": 7.270541851337139, + "rewards/margins": 17.022012243737706, + "rewards/rejected": -9.751470392400568, + "step": 1208 + }, + { + "epoch": 0.33136905577634646, + "grad_norm": 5.5, + "kl": 2.2150230407714844, + "learning_rate": 5e-06, + "logits/chosen": -17223036.0, + "logits/rejected": 15182428.0, + "logps/chosen": -358.6787109375, + "logps/rejected": -491.5517578125, + "loss": 0.0475, + "rewards/chosen": 5.241259574890137, + "rewards/margins": 13.420369148254395, + "rewards/rejected": -8.179109573364258, + "step": 1209 + }, + { + "epoch": 0.33164314101685627, + "grad_norm": 5.25, + "kl": 4.423098564147949, + "learning_rate": 5e-06, + "logits/chosen": -7126712.0, + "logits/rejected": -15287925.333333334, + "logps/chosen": -512.468359375, + "logps/rejected": -401.09228515625, + "loss": 0.0201, + "rewards/chosen": 7.392661539713542, + "rewards/margins": 15.382902696397569, + "rewards/rejected": -7.990241156684028, + "step": 1210 + }, + { + "epoch": 0.331917226257366, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21417255.384615384, + "logits/rejected": -23973511.272727273, + "logps/chosen": -362.3595628004808, + "logps/rejected": -353.5812322443182, + "loss": 0.0371, + "rewards/chosen": 5.959391080416166, + "rewards/margins": 11.834482633150541, + "rewards/rejected": -5.875091552734375, + "step": 1211 + }, + { + "epoch": 0.33219131149787584, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39593990.4, + "logits/rejected": -17771506.285714287, + "logps/chosen": -327.827978515625, + "logps/rejected": -549.6358119419643, + "loss": 0.0623, + "rewards/chosen": 4.5672649383544925, + "rewards/margins": 12.332192938668388, + "rewards/rejected": -7.764928000313895, + "step": 1212 + }, + { + "epoch": 0.33246539673838565, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6216574.857142857, + "logits/rejected": -26888011.29411765, + "logps/chosen": -418.67236328125, + "logps/rejected": -561.7405215992648, + "loss": 0.0354, + "rewards/chosen": 6.582426343645368, + "rewards/margins": 14.850221681995551, + "rewards/rejected": -8.267795338350183, + "step": 1213 + }, + { + "epoch": 0.3327394819788954, + "grad_norm": 7.28125, + "kl": 2.5900371074676514, + "learning_rate": 5e-06, + "logits/chosen": -16680496.94117647, + "logits/rejected": -18034596.57142857, + "logps/chosen": -393.7159639246324, + "logps/rejected": -426.09901646205356, + "loss": 0.0413, + "rewards/chosen": 5.865534165326287, + "rewards/margins": 13.36280774669487, + "rewards/rejected": -7.4972735813685825, + "step": 1214 + }, + { + "epoch": 0.3330135672194052, + "grad_norm": 6.0, + "kl": 1.3270480632781982, + "learning_rate": 5e-06, + "logits/chosen": 8571613.818181818, + "logits/rejected": -27333137.230769232, + "logps/chosen": -424.51962002840907, + "logps/rejected": -621.3281625600962, + "loss": 0.0222, + "rewards/chosen": 6.209842335094105, + "rewards/margins": 15.447550206751258, + "rewards/rejected": -9.237707871657152, + "step": 1215 + }, + { + "epoch": 0.33328765245991504, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17425413.333333332, + "logits/rejected": -7751737.777777778, + "logps/chosen": -342.398828125, + "logps/rejected": -456.8992513020833, + "loss": 0.0369, + "rewards/chosen": 5.859694417317709, + "rewards/margins": 15.67277340359158, + "rewards/rejected": -9.813078986273872, + "step": 1216 + }, + { + "epoch": 0.33356173770042485, + "grad_norm": 7.875, + "kl": 1.8747692108154297, + "learning_rate": 5e-06, + "logits/chosen": -25668549.818181816, + "logits/rejected": -2268948.923076923, + "logps/chosen": -550.1069779829545, + "logps/rejected": -496.68712439903845, + "loss": 0.0367, + "rewards/chosen": 6.660763827237216, + "rewards/margins": 13.437271865097792, + "rewards/rejected": -6.776508037860577, + "step": 1217 + }, + { + "epoch": 0.3338358229409346, + "grad_norm": 2.15625, + "kl": 0.18373362720012665, + "learning_rate": 5e-06, + "logits/chosen": 2253104.0, + "logits/rejected": -10821824.0, + "logps/chosen": -524.5254720052084, + "logps/rejected": -283.0256754557292, + "loss": 0.0274, + "rewards/chosen": 7.29104741414388, + "rewards/margins": 13.33014170328776, + "rewards/rejected": -6.03909428914388, + "step": 1218 + }, + { + "epoch": 0.3341099081814444, + "grad_norm": 2.109375, + "kl": 3.139235258102417, + "learning_rate": 5e-06, + "logits/chosen": -33475532.8, + "logits/rejected": 16758605.714285715, + "logps/chosen": -484.395361328125, + "logps/rejected": -448.20772879464283, + "loss": 0.0108, + "rewards/chosen": 6.757251739501953, + "rewards/margins": 14.131597791399274, + "rewards/rejected": -7.374346051897321, + "step": 1219 + }, + { + "epoch": 0.33438399342195424, + "grad_norm": 5.0625, + "kl": 6.057814598083496, + "learning_rate": 5e-06, + "logits/chosen": 1713587.0, + "logits/rejected": -29002624.0, + "logps/chosen": -564.0206705729166, + "logps/rejected": -411.810546875, + "loss": 0.017, + "rewards/chosen": 7.454323450724284, + "rewards/margins": 16.042112350463867, + "rewards/rejected": -8.587788899739584, + "step": 1220 + }, + { + "epoch": 0.33465807866246405, + "grad_norm": 9.875, + "kl": 11.155864715576172, + "learning_rate": 5e-06, + "logits/chosen": -8895784.470588235, + "logits/rejected": -18061864.0, + "logps/chosen": -401.45392922794116, + "logps/rejected": -323.546875, + "loss": 0.0742, + "rewards/chosen": 5.523937449735754, + "rewards/margins": 13.568368863658744, + "rewards/rejected": -8.044431413922991, + "step": 1221 + }, + { + "epoch": 0.3349321639029738, + "grad_norm": 3.84375, + "kl": 0.014163970947265625, + "learning_rate": 5e-06, + "logits/chosen": -2635105.0, + "logits/rejected": -16375635.0, + "logps/chosen": -478.462890625, + "logps/rejected": -502.5587158203125, + "loss": 0.0254, + "rewards/chosen": 6.547120094299316, + "rewards/margins": 12.959866523742676, + "rewards/rejected": -6.412746429443359, + "step": 1222 + }, + { + "epoch": 0.3352062491434836, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34490160.0, + "logits/rejected": -29317440.0, + "logps/chosen": -543.1142578125, + "logps/rejected": -554.0764508928571, + "loss": 0.0187, + "rewards/chosen": 6.397125244140625, + "rewards/margins": 15.962221418108259, + "rewards/rejected": -9.565096173967634, + "step": 1223 + }, + { + "epoch": 0.33548033438399344, + "grad_norm": 14.8125, + "kl": 2.9220595359802246, + "learning_rate": 5e-06, + "logits/chosen": -30054137.6, + "logits/rejected": -12470257.142857144, + "logps/chosen": -481.53681640625, + "logps/rejected": -468.2173549107143, + "loss": 0.0551, + "rewards/chosen": 7.220896911621094, + "rewards/margins": 13.366917528424946, + "rewards/rejected": -6.146020616803851, + "step": 1224 + }, + { + "epoch": 0.3357544196245032, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24458290.285714287, + "logits/rejected": -1968883.5294117648, + "logps/chosen": -434.9697963169643, + "logps/rejected": -462.8405330882353, + "loss": 0.0552, + "rewards/chosen": 6.641635894775391, + "rewards/margins": 13.420854680678424, + "rewards/rejected": -6.7792187859030335, + "step": 1225 + }, + { + "epoch": 0.336028504865013, + "grad_norm": 6.3125, + "kl": 1.4849942922592163, + "learning_rate": 5e-06, + "logits/chosen": -38027434.666666664, + "logits/rejected": -16773892.0, + "logps/chosen": -400.3790690104167, + "logps/rejected": -641.7018636067709, + "loss": 0.0488, + "rewards/chosen": 5.829662958780925, + "rewards/margins": 16.001946767171223, + "rewards/rejected": -10.172283808390299, + "step": 1226 + }, + { + "epoch": 0.3363025901055228, + "grad_norm": 6.625, + "kl": 1.1540145874023438, + "learning_rate": 5e-06, + "logits/chosen": -15745933.714285715, + "logits/rejected": -11831040.0, + "logps/chosen": -398.12953404017856, + "logps/rejected": -447.395947265625, + "loss": 0.0451, + "rewards/chosen": 5.938357761928013, + "rewards/margins": 13.683405521937779, + "rewards/rejected": -7.745047760009766, + "step": 1227 + }, + { + "epoch": 0.33657667534603264, + "grad_norm": 6.84375, + "kl": 7.233365058898926, + "learning_rate": 5e-06, + "logits/chosen": -30817.6, + "logits/rejected": -37493504.0, + "logps/chosen": -480.29925130208335, + "logps/rejected": -521.1045464409722, + "loss": 0.0251, + "rewards/chosen": 6.4085240681966145, + "rewards/margins": 16.554313320583766, + "rewards/rejected": -10.145789252387154, + "step": 1228 + }, + { + "epoch": 0.3368507605865424, + "grad_norm": 8.125, + "kl": 0.4291045069694519, + "learning_rate": 5e-06, + "logits/chosen": -7190679.384615385, + "logits/rejected": 1353330.1818181819, + "logps/chosen": -379.36177884615387, + "logps/rejected": -438.89084694602275, + "loss": 0.0341, + "rewards/chosen": 7.482193580040565, + "rewards/margins": 16.629796701711374, + "rewards/rejected": -9.14760312167081, + "step": 1229 + }, + { + "epoch": 0.3371248458270522, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4388480.0, + "logits/rejected": -27243822.933333334, + "logps/chosen": -343.29009331597223, + "logps/rejected": -452.8907877604167, + "loss": 0.0193, + "rewards/chosen": 5.874403211805555, + "rewards/margins": 14.983387586805556, + "rewards/rejected": -9.108984375, + "step": 1230 + }, + { + "epoch": 0.337398931067562, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1485042.0, + "logits/rejected": -43511381.333333336, + "logps/chosen": -437.70703125, + "logps/rejected": -556.7176106770834, + "loss": 0.0266, + "rewards/chosen": 6.309226989746094, + "rewards/margins": 14.758533477783203, + "rewards/rejected": -8.44930648803711, + "step": 1231 + }, + { + "epoch": 0.3376730163080718, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26410978.285714287, + "logits/rejected": 66259494.4, + "logps/chosen": -368.96041434151783, + "logps/rejected": -607.14462890625, + "loss": 0.0188, + "rewards/chosen": 5.5965745108468195, + "rewards/margins": 17.617347063337053, + "rewards/rejected": -12.020772552490234, + "step": 1232 + }, + { + "epoch": 0.3379471015485816, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33956986.18181818, + "logits/rejected": 41840059.07692308, + "logps/chosen": -363.83993252840907, + "logps/rejected": -622.8477313701923, + "loss": 0.0293, + "rewards/chosen": 4.715938221324574, + "rewards/margins": 15.020665628926738, + "rewards/rejected": -10.304727407602163, + "step": 1233 + }, + { + "epoch": 0.3382211867890914, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39248147.692307696, + "logits/rejected": 5536930.909090909, + "logps/chosen": -413.15978064903845, + "logps/rejected": -870.0567294034091, + "loss": 0.0161, + "rewards/chosen": 6.6099067101111775, + "rewards/margins": 24.920883605530214, + "rewards/rejected": -18.310976895419035, + "step": 1234 + }, + { + "epoch": 0.3384952720296012, + "grad_norm": 4.25, + "kl": 4.4235758781433105, + "learning_rate": 5e-06, + "logits/chosen": -22588784.0, + "logits/rejected": -13207412.8, + "logps/chosen": -423.654541015625, + "logps/rejected": -378.888720703125, + "loss": 0.0166, + "rewards/chosen": 5.8980222429547995, + "rewards/margins": 12.720809718540737, + "rewards/rejected": -6.822787475585938, + "step": 1235 + }, + { + "epoch": 0.338769357270111, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17226638.545454547, + "logits/rejected": -22389902.769230768, + "logps/chosen": -429.4274236505682, + "logps/rejected": -605.8745868389423, + "loss": 0.0153, + "rewards/chosen": 5.83513086492365, + "rewards/margins": 16.601361521474132, + "rewards/rejected": -10.76623065655048, + "step": 1236 + }, + { + "epoch": 0.3390434425106208, + "grad_norm": 7.96875, + "kl": 5.500591278076172, + "learning_rate": 5e-06, + "logits/chosen": -25090208.0, + "logits/rejected": -4657427.2, + "logps/chosen": -381.8555385044643, + "logps/rejected": -604.9166015625, + "loss": 0.0579, + "rewards/chosen": 6.466400146484375, + "rewards/margins": 18.680760192871094, + "rewards/rejected": -12.21436004638672, + "step": 1237 + }, + { + "epoch": 0.3393175277511306, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27828952.615384616, + "logits/rejected": -13589579.636363637, + "logps/chosen": -431.16736778846155, + "logps/rejected": -570.2627840909091, + "loss": 0.0185, + "rewards/chosen": 5.611135042630709, + "rewards/margins": 13.319583279269558, + "rewards/rejected": -7.70844823663885, + "step": 1238 + }, + { + "epoch": 0.3395916129916404, + "grad_norm": 5.9375, + "kl": 3.9764468669891357, + "learning_rate": 5e-06, + "logits/chosen": 5328744.0, + "logits/rejected": -3010482.285714286, + "logps/chosen": -434.7072265625, + "logps/rejected": -555.9549386160714, + "loss": 0.0101, + "rewards/chosen": 6.644402313232422, + "rewards/margins": 16.021378326416016, + "rewards/rejected": -9.376976013183594, + "step": 1239 + }, + { + "epoch": 0.3398656982321502, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39494516.36363637, + "logits/rejected": -18571436.307692308, + "logps/chosen": -407.84237393465907, + "logps/rejected": -459.5261042668269, + "loss": 0.023, + "rewards/chosen": 4.947659232399681, + "rewards/margins": 12.903738568712782, + "rewards/rejected": -7.956079336313101, + "step": 1240 + }, + { + "epoch": 0.34013978347266, + "grad_norm": 7.96875, + "kl": 1.0451158285140991, + "learning_rate": 5e-06, + "logits/chosen": 5439735.0, + "logits/rejected": -23171412.0, + "logps/chosen": -410.4218444824219, + "logps/rejected": -400.0116882324219, + "loss": 0.0449, + "rewards/chosen": 5.493742942810059, + "rewards/margins": 14.057731628417969, + "rewards/rejected": -8.56398868560791, + "step": 1241 + }, + { + "epoch": 0.3404138687131698, + "grad_norm": 4.59375, + "kl": 2.079237699508667, + "learning_rate": 5e-06, + "logits/chosen": 684347.4285714285, + "logits/rejected": -30386025.6, + "logps/chosen": -392.0712890625, + "logps/rejected": -449.209814453125, + "loss": 0.0211, + "rewards/chosen": 5.472547258649554, + "rewards/margins": 13.294048418317523, + "rewards/rejected": -7.821501159667969, + "step": 1242 + }, + { + "epoch": 0.34068795395367957, + "grad_norm": 10.9375, + "kl": 1.346996784210205, + "learning_rate": 5e-06, + "logits/chosen": 70852.45454545454, + "logits/rejected": -18389676.307692308, + "logps/chosen": -430.74391867897725, + "logps/rejected": -300.50184044471155, + "loss": 0.0444, + "rewards/chosen": 5.883422157981179, + "rewards/margins": 12.592322022764833, + "rewards/rejected": -6.708899864783654, + "step": 1243 + }, + { + "epoch": 0.3409620391941894, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20621638.4, + "logits/rejected": -35717390.222222224, + "logps/chosen": -289.30638020833334, + "logps/rejected": -397.9353841145833, + "loss": 0.0706, + "rewards/chosen": 4.409767659505208, + "rewards/margins": 11.35365007188585, + "rewards/rejected": -6.943882412380642, + "step": 1244 + }, + { + "epoch": 0.3412361244346992, + "grad_norm": 4.375, + "kl": 0.45884960889816284, + "learning_rate": 5e-06, + "logits/chosen": -597441.1666666666, + "logits/rejected": 20748196.0, + "logps/chosen": -387.3460286458333, + "logps/rejected": -851.9964192708334, + "loss": 0.0153, + "rewards/chosen": 5.4947160085042315, + "rewards/margins": 19.704863866170246, + "rewards/rejected": -14.210147857666016, + "step": 1245 + }, + { + "epoch": 0.341510209675209, + "grad_norm": 8.25, + "kl": 1.7123897075653076, + "learning_rate": 5e-06, + "logits/chosen": -31803944.0, + "logits/rejected": -16072602.0, + "logps/chosen": -382.8211975097656, + "logps/rejected": -392.42889404296875, + "loss": 0.0654, + "rewards/chosen": 4.705574035644531, + "rewards/margins": 9.837162017822266, + "rewards/rejected": -5.131587982177734, + "step": 1246 + }, + { + "epoch": 0.34178429491571877, + "grad_norm": 8.75, + "kl": 3.7896108627319336, + "learning_rate": 5e-06, + "logits/chosen": -12929213.090909092, + "logits/rejected": -15735536.0, + "logps/chosen": -421.2516424005682, + "logps/rejected": -603.4812199519231, + "loss": 0.0361, + "rewards/chosen": 4.884756608442827, + "rewards/margins": 11.926088559877623, + "rewards/rejected": -7.041331951434795, + "step": 1247 + }, + { + "epoch": 0.3420583801562286, + "grad_norm": 10.5625, + "kl": 12.278522491455078, + "learning_rate": 5e-06, + "logits/chosen": -22058148.57142857, + "logits/rejected": 6152788.4, + "logps/chosen": -372.1862095424107, + "logps/rejected": -391.2426513671875, + "loss": 0.0596, + "rewards/chosen": 5.827069418770926, + "rewards/margins": 11.17784914289202, + "rewards/rejected": -5.350779724121094, + "step": 1248 + }, + { + "epoch": 0.3423324653967384, + "grad_norm": 7.53125, + "kl": 6.018805027008057, + "learning_rate": 5e-06, + "logits/chosen": -22310756.0, + "logits/rejected": -22058478.0, + "logps/chosen": -483.52874755859375, + "logps/rejected": -574.6767578125, + "loss": 0.046, + "rewards/chosen": 6.355218887329102, + "rewards/margins": 14.106407165527344, + "rewards/rejected": -7.751188278198242, + "step": 1249 + }, + { + "epoch": 0.3426065506372482, + "grad_norm": 6.25, + "kl": 5.14984130859375, + "learning_rate": 5e-06, + "logits/chosen": 12068906.666666666, + "logits/rejected": 43393853.333333336, + "logps/chosen": -443.9790445963542, + "logps/rejected": -779.3678385416666, + "loss": 0.0166, + "rewards/chosen": 6.326105117797852, + "rewards/margins": 16.827573776245117, + "rewards/rejected": -10.501468658447266, + "step": 1250 + }, + { + "epoch": 0.34288063587775797, + "grad_norm": 7.1875, + "kl": 0.8134326934814453, + "learning_rate": 5e-06, + "logits/chosen": 121450333.0909091, + "logits/rejected": -19215328.0, + "logps/chosen": -441.66677024147725, + "logps/rejected": -484.9346454326923, + "loss": 0.0535, + "rewards/chosen": 4.9761834578080615, + "rewards/margins": 12.474657578901812, + "rewards/rejected": -7.49847412109375, + "step": 1251 + }, + { + "epoch": 0.3431547211182678, + "grad_norm": 12.3125, + "kl": 3.5763092041015625, + "learning_rate": 5e-06, + "logits/chosen": -29359342.222222224, + "logits/rejected": 78147106.13333334, + "logps/chosen": -463.03960503472223, + "logps/rejected": -431.93092447916666, + "loss": 0.0657, + "rewards/chosen": 5.798042721218533, + "rewards/margins": 10.873333655463323, + "rewards/rejected": -5.075290934244792, + "step": 1252 + }, + { + "epoch": 0.3434288063587776, + "grad_norm": 13.6875, + "kl": 10.048030853271484, + "learning_rate": 5e-06, + "logits/chosen": 25359542.85714286, + "logits/rejected": -28719052.8, + "logps/chosen": -545.0076729910714, + "logps/rejected": -491.243359375, + "loss": 0.0437, + "rewards/chosen": 7.314601898193359, + "rewards/margins": 12.439236450195313, + "rewards/rejected": -5.124634552001953, + "step": 1253 + }, + { + "epoch": 0.34370289159928735, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21128974.222222224, + "logits/rejected": -12144360.533333333, + "logps/chosen": -439.1780056423611, + "logps/rejected": -528.5660481770833, + "loss": 0.0394, + "rewards/chosen": 5.507199181450738, + "rewards/margins": 13.276402367485893, + "rewards/rejected": -7.769203186035156, + "step": 1254 + }, + { + "epoch": 0.34397697683979717, + "grad_norm": 10.0625, + "kl": 1.182486891746521, + "learning_rate": 5e-06, + "logits/chosen": -18095720.0, + "logits/rejected": -22321104.0, + "logps/chosen": -472.0280064174107, + "logps/rejected": -531.26640625, + "loss": 0.0584, + "rewards/chosen": 7.239185333251953, + "rewards/margins": 16.559611511230468, + "rewards/rejected": -9.320426177978515, + "step": 1255 + }, + { + "epoch": 0.344251062080307, + "grad_norm": 2.34375, + "kl": 5.579519271850586, + "learning_rate": 5e-06, + "logits/chosen": 2878060.6666666665, + "logits/rejected": 5018867.333333333, + "logps/chosen": -326.13970947265625, + "logps/rejected": -501.0694173177083, + "loss": 0.0227, + "rewards/chosen": 5.993811289469401, + "rewards/margins": 13.87100601196289, + "rewards/rejected": -7.877194722493489, + "step": 1256 + }, + { + "epoch": 0.3445251473208168, + "grad_norm": 11.625, + "kl": 4.639813423156738, + "learning_rate": 5e-06, + "logits/chosen": -37861749.333333336, + "logits/rejected": -14926389.333333334, + "logps/chosen": -470.19384765625, + "logps/rejected": -442.448974609375, + "loss": 0.047, + "rewards/chosen": 5.994414647420247, + "rewards/margins": 11.961570103963215, + "rewards/rejected": -5.967155456542969, + "step": 1257 + }, + { + "epoch": 0.34479923256132655, + "grad_norm": 9.5625, + "kl": 8.764667510986328, + "learning_rate": 5e-06, + "logits/chosen": -19795396.0, + "logits/rejected": 3200943.5, + "logps/chosen": -494.7484130859375, + "logps/rejected": -535.1976928710938, + "loss": 0.0351, + "rewards/chosen": 6.143914222717285, + "rewards/margins": 15.743852615356445, + "rewards/rejected": -9.59993839263916, + "step": 1258 + }, + { + "epoch": 0.34507331780183637, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55769051.428571425, + "logits/rejected": -22123802.352941178, + "logps/chosen": -543.223876953125, + "logps/rejected": -352.78831571691177, + "loss": 0.0921, + "rewards/chosen": 5.605388096400669, + "rewards/margins": 11.228049013794971, + "rewards/rejected": -5.622660917394302, + "step": 1259 + }, + { + "epoch": 0.3453474030423462, + "grad_norm": 3.484375, + "kl": 1.7140382528305054, + "learning_rate": 5e-06, + "logits/chosen": 9496301.714285715, + "logits/rejected": -11398445.176470589, + "logps/chosen": -444.5281459263393, + "logps/rejected": -528.5434857536765, + "loss": 0.0252, + "rewards/chosen": 5.082436152866909, + "rewards/margins": 12.584315676649078, + "rewards/rejected": -7.501879523782169, + "step": 1260 + }, + { + "epoch": 0.345621488282856, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21938100.0, + "logits/rejected": -30708272.0, + "logps/chosen": -317.86236572265625, + "logps/rejected": -564.5863647460938, + "loss": 0.0099, + "rewards/chosen": 6.581609725952148, + "rewards/margins": 15.669561386108398, + "rewards/rejected": -9.08795166015625, + "step": 1261 + }, + { + "epoch": 0.34589557352336575, + "grad_norm": 5.0625, + "kl": 2.511996030807495, + "learning_rate": 5e-06, + "logits/chosen": -21495342.933333334, + "logits/rejected": 95678158.22222222, + "logps/chosen": -514.0337890625, + "logps/rejected": -590.9471028645834, + "loss": 0.017, + "rewards/chosen": 6.664604695638021, + "rewards/margins": 15.025859069824218, + "rewards/rejected": -8.361254374186197, + "step": 1262 + }, + { + "epoch": 0.34616965876387557, + "grad_norm": 10.8125, + "kl": 0.3677825927734375, + "learning_rate": 5e-06, + "logits/chosen": 64952265.14285714, + "logits/rejected": -8453396.8, + "logps/chosen": -498.4266880580357, + "logps/rejected": -428.1337890625, + "loss": 0.0563, + "rewards/chosen": 6.522565024239676, + "rewards/margins": 14.290187399727959, + "rewards/rejected": -7.767622375488282, + "step": 1263 + }, + { + "epoch": 0.3464437440043854, + "grad_norm": 11.8125, + "kl": 1.3236020803451538, + "learning_rate": 5e-06, + "logits/chosen": -11866116.0, + "logits/rejected": -6028621.5, + "logps/chosen": -437.9803466796875, + "logps/rejected": -489.28802490234375, + "loss": 0.0568, + "rewards/chosen": 6.732024192810059, + "rewards/margins": 14.043596744537354, + "rewards/rejected": -7.311572551727295, + "step": 1264 + }, + { + "epoch": 0.34671782924489514, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28284038.0, + "logits/rejected": -9753722.0, + "logps/chosen": -385.1573791503906, + "logps/rejected": -503.81207275390625, + "loss": 0.0262, + "rewards/chosen": 5.118495941162109, + "rewards/margins": 13.432450294494629, + "rewards/rejected": -8.31395435333252, + "step": 1265 + }, + { + "epoch": 0.34699191448540495, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -247224.66666666666, + "logits/rejected": -24684392.0, + "logps/chosen": -458.1810709635417, + "logps/rejected": -536.4454752604166, + "loss": 0.0739, + "rewards/chosen": 5.153723398844401, + "rewards/margins": 12.875935872395834, + "rewards/rejected": -7.722212473551433, + "step": 1266 + }, + { + "epoch": 0.34726599972591476, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25667357.09090909, + "logits/rejected": -23449873.230769232, + "logps/chosen": -401.7415216619318, + "logps/rejected": -613.2364032451923, + "loss": 0.0093, + "rewards/chosen": 6.078629927201704, + "rewards/margins": 17.136386791309278, + "rewards/rejected": -11.057756864107573, + "step": 1267 + }, + { + "epoch": 0.3475400849664246, + "grad_norm": 15.0625, + "kl": 23.92203712463379, + "learning_rate": 5e-06, + "logits/chosen": -25815585.684210528, + "logits/rejected": 37112809.6, + "logps/chosen": -499.5717516447368, + "logps/rejected": -475.18759765625, + "loss": 0.0852, + "rewards/chosen": 7.059334202816612, + "rewards/margins": 13.712972661068566, + "rewards/rejected": -6.653638458251953, + "step": 1268 + }, + { + "epoch": 0.34781417020693434, + "grad_norm": 4.6875, + "kl": 1.0004127025604248, + "learning_rate": 5e-06, + "logits/chosen": -32182565.818181816, + "logits/rejected": -295054.76923076925, + "logps/chosen": -480.2322887073864, + "logps/rejected": -501.82534555288464, + "loss": 0.0131, + "rewards/chosen": 6.082162336869673, + "rewards/margins": 14.608731996763002, + "rewards/rejected": -8.52656965989333, + "step": 1269 + }, + { + "epoch": 0.34808825544744415, + "grad_norm": 12.5625, + "kl": 8.940969467163086, + "learning_rate": 5e-06, + "logits/chosen": -4587033.454545454, + "logits/rejected": -13618036.923076924, + "logps/chosen": -468.63045987215907, + "logps/rejected": -391.04285606971155, + "loss": 0.064, + "rewards/chosen": 6.498510187322443, + "rewards/margins": 12.940226308115712, + "rewards/rejected": -6.441716120793269, + "step": 1270 + }, + { + "epoch": 0.34836234068795396, + "grad_norm": 9.8125, + "kl": 3.536268472671509, + "learning_rate": 5e-06, + "logits/chosen": -49845521.45454545, + "logits/rejected": -28157287.384615384, + "logps/chosen": -529.9746537642045, + "logps/rejected": -507.81820913461536, + "loss": 0.0252, + "rewards/chosen": 6.956392461603338, + "rewards/margins": 14.594820782854839, + "rewards/rejected": -7.638428321251502, + "step": 1271 + }, + { + "epoch": 0.3486364259284638, + "grad_norm": 5.4375, + "kl": 6.030410289764404, + "learning_rate": 5e-06, + "logits/chosen": -1422952.3636363635, + "logits/rejected": -6381428.307692308, + "logps/chosen": -420.2405894886364, + "logps/rejected": -366.4655949519231, + "loss": 0.0255, + "rewards/chosen": 5.621126695112749, + "rewards/margins": 12.958489131260585, + "rewards/rejected": -7.337362436147837, + "step": 1272 + }, + { + "epoch": 0.34891051116897354, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20067429.333333332, + "logits/rejected": -10536909.866666667, + "logps/chosen": -459.3997395833333, + "logps/rejected": -525.700390625, + "loss": 0.0285, + "rewards/chosen": 6.251608106825087, + "rewards/margins": 15.306894768608942, + "rewards/rejected": -9.055286661783855, + "step": 1273 + }, + { + "epoch": 0.34918459640948335, + "grad_norm": 9.125, + "kl": 1.0714213848114014, + "learning_rate": 5e-06, + "logits/chosen": -30317140.363636363, + "logits/rejected": -29982867.692307692, + "logps/chosen": -474.99564985795456, + "logps/rejected": -581.5750826322115, + "loss": 0.0384, + "rewards/chosen": 6.4201507568359375, + "rewards/margins": 15.769912719726562, + "rewards/rejected": -9.349761962890625, + "step": 1274 + }, + { + "epoch": 0.34945868164999316, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27983162.181818184, + "logits/rejected": -7270101.538461538, + "logps/chosen": -529.8025568181819, + "logps/rejected": -575.8052884615385, + "loss": 0.0465, + "rewards/chosen": 6.788722645152699, + "rewards/margins": 16.14351531342193, + "rewards/rejected": -9.35479266826923, + "step": 1275 + }, + { + "epoch": 0.3497327668905029, + "grad_norm": 11.5625, + "kl": 3.8622679710388184, + "learning_rate": 5e-06, + "logits/chosen": -12276425.333333334, + "logits/rejected": -18188286.666666668, + "logps/chosen": -576.5846354166666, + "logps/rejected": -309.48944091796875, + "loss": 0.046, + "rewards/chosen": 7.082334518432617, + "rewards/margins": 13.89383379618327, + "rewards/rejected": -6.811499277750651, + "step": 1276 + }, + { + "epoch": 0.35000685213101274, + "grad_norm": 7.90625, + "kl": 2.726546049118042, + "learning_rate": 5e-06, + "logits/chosen": -21331565.333333332, + "logits/rejected": -4252472.0, + "logps/chosen": -467.8134358723958, + "logps/rejected": -484.5453287760417, + "loss": 0.0314, + "rewards/chosen": 6.42411994934082, + "rewards/margins": 13.401754379272461, + "rewards/rejected": -6.977634429931641, + "step": 1277 + }, + { + "epoch": 0.35028093737152255, + "grad_norm": 8.6875, + "kl": 3.4084017276763916, + "learning_rate": 5e-06, + "logits/chosen": 13934672.0, + "logits/rejected": 6640946.0, + "logps/chosen": -443.3578796386719, + "logps/rejected": -603.2515258789062, + "loss": 0.0589, + "rewards/chosen": 5.740791320800781, + "rewards/margins": 11.554264068603516, + "rewards/rejected": -5.813472747802734, + "step": 1278 + }, + { + "epoch": 0.35055502261203236, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 52591699.2, + "logits/rejected": -25850893.714285713, + "logps/chosen": -449.125634765625, + "logps/rejected": -508.21023995535717, + "loss": 0.0219, + "rewards/chosen": 6.053909683227539, + "rewards/margins": 15.584746606009347, + "rewards/rejected": -9.530836922781807, + "step": 1279 + }, + { + "epoch": 0.3508291078525421, + "grad_norm": 3.390625, + "kl": 3.964183807373047, + "learning_rate": 5e-06, + "logits/chosen": -22173121.6, + "logits/rejected": -8593845.714285715, + "logps/chosen": -470.144921875, + "logps/rejected": -400.59915597098217, + "loss": 0.0114, + "rewards/chosen": 7.118508148193359, + "rewards/margins": 15.22127500261579, + "rewards/rejected": -8.102766854422432, + "step": 1280 + }, + { + "epoch": 0.35110319309305194, + "grad_norm": 7.75, + "kl": 2.018568754196167, + "learning_rate": 5e-06, + "logits/chosen": -20129236.363636363, + "logits/rejected": -36807724.307692304, + "logps/chosen": -424.83988813920456, + "logps/rejected": -522.8200871394231, + "loss": 0.0225, + "rewards/chosen": 5.767672798850319, + "rewards/margins": 16.836012220049238, + "rewards/rejected": -11.068339421198917, + "step": 1281 + }, + { + "epoch": 0.35137727833356175, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3082195.777777778, + "logits/rejected": -23800405.333333332, + "logps/chosen": -386.9546169704861, + "logps/rejected": -496.0756510416667, + "loss": 0.0144, + "rewards/chosen": 5.993258582221137, + "rewards/margins": 14.455346086290149, + "rewards/rejected": -8.462087504069011, + "step": 1282 + }, + { + "epoch": 0.35165136357407156, + "grad_norm": 11.75, + "kl": 1.0808709859848022, + "learning_rate": 5e-06, + "logits/chosen": -24212796.0, + "logits/rejected": -29993568.0, + "logps/chosen": -426.8018798828125, + "logps/rejected": -517.6547241210938, + "loss": 0.0832, + "rewards/chosen": 5.666676044464111, + "rewards/margins": 13.630431652069092, + "rewards/rejected": -7.9637556076049805, + "step": 1283 + }, + { + "epoch": 0.3519254488145813, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26887806.0, + "logits/rejected": -16336717.0, + "logps/chosen": -346.2093811035156, + "logps/rejected": -572.4217529296875, + "loss": 0.076, + "rewards/chosen": 5.168123722076416, + "rewards/margins": 12.926959991455078, + "rewards/rejected": -7.758836269378662, + "step": 1284 + }, + { + "epoch": 0.35219953405509113, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23217163.636363637, + "logits/rejected": -19921065.846153848, + "logps/chosen": -453.4841974431818, + "logps/rejected": -404.0202448918269, + "loss": 0.0308, + "rewards/chosen": 6.903073397549716, + "rewards/margins": 14.95590402029611, + "rewards/rejected": -8.052830622746395, + "step": 1285 + }, + { + "epoch": 0.35247361929560095, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5985430.0, + "logits/rejected": 22515308.0, + "logps/chosen": -439.6016540527344, + "logps/rejected": -660.1099853515625, + "loss": 0.0387, + "rewards/chosen": 5.528641223907471, + "rewards/margins": 15.703888416290283, + "rewards/rejected": -10.175247192382812, + "step": 1286 + }, + { + "epoch": 0.3527477045361107, + "grad_norm": 12.75, + "kl": 9.942333221435547, + "learning_rate": 5e-06, + "logits/chosen": -32867526.85714286, + "logits/rejected": -25858120.0, + "logps/chosen": -435.10899135044644, + "logps/rejected": -446.90546875, + "loss": 0.0682, + "rewards/chosen": 5.556978498186384, + "rewards/margins": 14.139021955217634, + "rewards/rejected": -8.58204345703125, + "step": 1287 + }, + { + "epoch": 0.3530217897766205, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18492801.333333332, + "logits/rejected": -19816968.0, + "logps/chosen": -392.7293701171875, + "logps/rejected": -435.1166585286458, + "loss": 0.0172, + "rewards/chosen": 6.396155039469401, + "rewards/margins": 15.138716379801433, + "rewards/rejected": -8.742561340332031, + "step": 1288 + }, + { + "epoch": 0.35329587501713033, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 762718.0, + "logits/rejected": -16257786.666666666, + "logps/chosen": -532.9734700520834, + "logps/rejected": -545.8428955078125, + "loss": 0.0386, + "rewards/chosen": 6.855381011962891, + "rewards/margins": 17.061293284098305, + "rewards/rejected": -10.205912272135416, + "step": 1289 + }, + { + "epoch": 0.35356996025764015, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4128529.4545454546, + "logits/rejected": -33583480.615384616, + "logps/chosen": -458.08371803977275, + "logps/rejected": -362.1698467548077, + "loss": 0.0157, + "rewards/chosen": 6.409679066051137, + "rewards/margins": 13.845279933689358, + "rewards/rejected": -7.435600867638221, + "step": 1290 + }, + { + "epoch": 0.3538440454981499, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35717127.11111111, + "logits/rejected": -4986755.2, + "logps/chosen": -506.0832248263889, + "logps/rejected": -453.01539713541666, + "loss": 0.0168, + "rewards/chosen": 6.179176330566406, + "rewards/margins": 14.51216074625651, + "rewards/rejected": -8.332984415690104, + "step": 1291 + }, + { + "epoch": 0.3541181307386597, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10608055.333333334, + "logits/rejected": -21984636.0, + "logps/chosen": -412.2329915364583, + "logps/rejected": -511.6921793619792, + "loss": 0.0099, + "rewards/chosen": 7.159761428833008, + "rewards/margins": 14.249600728352863, + "rewards/rejected": -7.0898392995198565, + "step": 1292 + }, + { + "epoch": 0.35439221597916953, + "grad_norm": 13.125, + "kl": 6.420409679412842, + "learning_rate": 5e-06, + "logits/chosen": -16684484.0, + "logits/rejected": -11186486.666666666, + "logps/chosen": -406.9567057291667, + "logps/rejected": -520.1183675130209, + "loss": 0.126, + "rewards/chosen": 6.5984446207682295, + "rewards/margins": 16.006303787231445, + "rewards/rejected": -9.407859166463217, + "step": 1293 + }, + { + "epoch": 0.35466630121967935, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10702108.8, + "logits/rejected": -37735364.571428575, + "logps/chosen": -440.56904296875, + "logps/rejected": -430.7725306919643, + "loss": 0.0322, + "rewards/chosen": 6.184767150878907, + "rewards/margins": 13.684653799874443, + "rewards/rejected": -7.499886648995536, + "step": 1294 + }, + { + "epoch": 0.3549403864601891, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 27865605.333333332, + "logits/rejected": 3253661.3333333335, + "logps/chosen": -432.7342936197917, + "logps/rejected": -450.2485677083333, + "loss": 0.0301, + "rewards/chosen": 5.478259616427952, + "rewards/margins": 13.355972629123265, + "rewards/rejected": -7.877713012695312, + "step": 1295 + }, + { + "epoch": 0.3552144717006989, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9333519.272727273, + "logits/rejected": -35939062.15384615, + "logps/chosen": -484.45028409090907, + "logps/rejected": -580.9374248798077, + "loss": 0.0037, + "rewards/chosen": 8.44615450772372, + "rewards/margins": 19.578347266137182, + "rewards/rejected": -11.132192758413462, + "step": 1296 + }, + { + "epoch": 0.35548855694120873, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17375426.90909091, + "logits/rejected": 12664857.846153846, + "logps/chosen": -573.8337180397727, + "logps/rejected": -604.5164513221154, + "loss": 0.0295, + "rewards/chosen": 6.536183443936435, + "rewards/margins": 17.810158522812635, + "rewards/rejected": -11.273975078876202, + "step": 1297 + }, + { + "epoch": 0.3557626421817185, + "grad_norm": 5.09375, + "kl": 4.143932342529297, + "learning_rate": 5e-06, + "logits/chosen": -31900462.222222224, + "logits/rejected": -27771767.466666665, + "logps/chosen": -415.7805989583333, + "logps/rejected": -482.64283854166666, + "loss": 0.0182, + "rewards/chosen": 5.759411282009548, + "rewards/margins": 14.93718024359809, + "rewards/rejected": -9.177768961588542, + "step": 1298 + }, + { + "epoch": 0.3560367274222283, + "grad_norm": 11.8125, + "kl": 4.610648155212402, + "learning_rate": 5e-06, + "logits/chosen": -43050269.538461536, + "logits/rejected": -11778701.090909092, + "logps/chosen": -465.49008413461536, + "logps/rejected": -408.39106889204544, + "loss": 0.0272, + "rewards/chosen": 7.130151015061599, + "rewards/margins": 14.056853341055916, + "rewards/rejected": -6.926702325994318, + "step": 1299 + }, + { + "epoch": 0.3563108126627381, + "grad_norm": 6.03125, + "kl": 1.3630321025848389, + "learning_rate": 5e-06, + "logits/chosen": -37713810.28571428, + "logits/rejected": -21411318.4, + "logps/chosen": -517.4641810825893, + "logps/rejected": -494.392236328125, + "loss": 0.0558, + "rewards/chosen": 6.015328543526786, + "rewards/margins": 13.658880179268973, + "rewards/rejected": -7.643551635742187, + "step": 1300 + }, + { + "epoch": 0.35658489790324793, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23903618.285714287, + "logits/rejected": -28081756.8, + "logps/chosen": -445.42867606026783, + "logps/rejected": -688.71748046875, + "loss": 0.0341, + "rewards/chosen": 5.393154689243862, + "rewards/margins": 19.133147975376673, + "rewards/rejected": -13.739993286132812, + "step": 1301 + }, + { + "epoch": 0.3568589831437577, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27504354.90909091, + "logits/rejected": -14870432.0, + "logps/chosen": -403.61501242897725, + "logps/rejected": -487.60738431490387, + "loss": 0.025, + "rewards/chosen": 6.670066833496094, + "rewards/margins": 16.93485142634465, + "rewards/rejected": -10.264784592848558, + "step": 1302 + }, + { + "epoch": 0.3571330683842675, + "grad_norm": 11.25, + "kl": 0.9948209524154663, + "learning_rate": 5e-06, + "logits/chosen": -32690249.6, + "logits/rejected": 6021069.714285715, + "logps/chosen": -496.0615234375, + "logps/rejected": -436.96219308035717, + "loss": 0.0531, + "rewards/chosen": 6.707853698730469, + "rewards/margins": 14.730636160714287, + "rewards/rejected": -8.022782461983818, + "step": 1303 + }, + { + "epoch": 0.3574071536247773, + "grad_norm": 7.8125, + "kl": 5.054374694824219, + "learning_rate": 5e-06, + "logits/chosen": -3172121.6, + "logits/rejected": -29711868.444444444, + "logps/chosen": -465.14541015625, + "logps/rejected": -616.58154296875, + "loss": 0.0504, + "rewards/chosen": 6.325357055664062, + "rewards/margins": 15.467130703396267, + "rewards/rejected": -9.141773647732204, + "step": 1304 + }, + { + "epoch": 0.3576812388652871, + "grad_norm": 12.125, + "kl": 1.0861448049545288, + "learning_rate": 5e-06, + "logits/chosen": -18089294.4, + "logits/rejected": -6496945.142857143, + "logps/chosen": -330.7667236328125, + "logps/rejected": -438.0565708705357, + "loss": 0.0809, + "rewards/chosen": 4.689807891845703, + "rewards/margins": 10.761029379708425, + "rewards/rejected": -6.071221487862723, + "step": 1305 + }, + { + "epoch": 0.3579553241057969, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41831384.0, + "logits/rejected": -27157701.333333332, + "logps/chosen": -381.8842366536458, + "logps/rejected": -466.1717936197917, + "loss": 0.0577, + "rewards/chosen": 4.797765731811523, + "rewards/margins": 14.289328893025717, + "rewards/rejected": -9.491563161214193, + "step": 1306 + }, + { + "epoch": 0.3582294093463067, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12849508.0, + "logits/rejected": -7583174.666666667, + "logps/chosen": -423.2957356770833, + "logps/rejected": -578.3371175130209, + "loss": 0.0306, + "rewards/chosen": 4.575792948404948, + "rewards/margins": 16.19078318277995, + "rewards/rejected": -11.614990234375, + "step": 1307 + }, + { + "epoch": 0.3585034945868165, + "grad_norm": 11.375, + "kl": 11.734859466552734, + "learning_rate": 5e-06, + "logits/chosen": 9186558.153846154, + "logits/rejected": -21167319.272727273, + "logps/chosen": -283.85509314903845, + "logps/rejected": -653.0593927556819, + "loss": 0.0598, + "rewards/chosen": 5.761540339543269, + "rewards/margins": 16.54813507720307, + "rewards/rejected": -10.7865947376598, + "step": 1308 + }, + { + "epoch": 0.3587775798273263, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22383260.8, + "logits/rejected": -32022848.0, + "logps/chosen": -260.2913818359375, + "logps/rejected": -528.0099748883929, + "loss": 0.0233, + "rewards/chosen": 5.100243759155274, + "rewards/margins": 14.766826139177596, + "rewards/rejected": -9.666582380022321, + "step": 1309 + }, + { + "epoch": 0.3590516650678361, + "grad_norm": 7.84375, + "kl": 6.815630912780762, + "learning_rate": 5e-06, + "logits/chosen": -4721747.076923077, + "logits/rejected": -20130903.272727273, + "logps/chosen": -437.1066706730769, + "logps/rejected": -303.9165704900568, + "loss": 0.0299, + "rewards/chosen": 6.16632080078125, + "rewards/margins": 12.707740090110086, + "rewards/rejected": -6.541419289328835, + "step": 1310 + }, + { + "epoch": 0.3593257503083459, + "grad_norm": 9.25, + "kl": 5.709621429443359, + "learning_rate": 5e-06, + "logits/chosen": -30522522.181818184, + "logits/rejected": -19481489.230769232, + "logps/chosen": -500.94180575284093, + "logps/rejected": -698.640625, + "loss": 0.0336, + "rewards/chosen": 7.874920931729403, + "rewards/margins": 21.157253131999838, + "rewards/rejected": -13.282332200270433, + "step": 1311 + }, + { + "epoch": 0.3595998355488557, + "grad_norm": 9.25, + "kl": 0.2712481915950775, + "learning_rate": 5e-06, + "logits/chosen": -40382870.85714286, + "logits/rejected": -19543784.0, + "logps/chosen": -441.33775111607144, + "logps/rejected": -469.541357421875, + "loss": 0.0426, + "rewards/chosen": 5.598931993756976, + "rewards/margins": 12.396519579206196, + "rewards/rejected": -6.797587585449219, + "step": 1312 + }, + { + "epoch": 0.3598739207893655, + "grad_norm": 4.875, + "kl": 9.364509582519531, + "learning_rate": 5e-06, + "logits/chosen": -13006277.333333334, + "logits/rejected": -22428958.222222224, + "logps/chosen": -452.94423828125, + "logps/rejected": -677.0086805555555, + "loss": 0.0144, + "rewards/chosen": 7.7840627034505205, + "rewards/margins": 18.464282565646702, + "rewards/rejected": -10.68021986219618, + "step": 1313 + }, + { + "epoch": 0.3601480060298753, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 31358734.769230768, + "logits/rejected": -33038609.454545453, + "logps/chosen": -445.72280649038464, + "logps/rejected": -636.6590465198864, + "loss": 0.0468, + "rewards/chosen": 6.730492811936599, + "rewards/margins": 15.706851399028217, + "rewards/rejected": -8.97635858709162, + "step": 1314 + }, + { + "epoch": 0.3604220912703851, + "grad_norm": 10.0625, + "kl": 5.853211879730225, + "learning_rate": 5e-06, + "logits/chosen": -36339108.571428575, + "logits/rejected": -21485129.6, + "logps/chosen": -348.01171875, + "logps/rejected": -370.750830078125, + "loss": 0.0793, + "rewards/chosen": 5.1577301025390625, + "rewards/margins": 12.436305236816406, + "rewards/rejected": -7.278575134277344, + "step": 1315 + }, + { + "epoch": 0.36069617651089486, + "grad_norm": 10.375, + "kl": 2.0093703269958496, + "learning_rate": 5e-06, + "logits/chosen": -41900278.85714286, + "logits/rejected": -18907080.0, + "logps/chosen": -540.6643415178571, + "logps/rejected": -319.0109619140625, + "loss": 0.0465, + "rewards/chosen": 6.819254193987165, + "rewards/margins": 12.67639912196568, + "rewards/rejected": -5.857144927978515, + "step": 1316 + }, + { + "epoch": 0.3609702617514047, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14321528.0, + "logits/rejected": -36782116.571428575, + "logps/chosen": -495.96142578125, + "logps/rejected": -556.3396344866071, + "loss": 0.0622, + "rewards/chosen": 5.918019866943359, + "rewards/margins": 15.344394029889788, + "rewards/rejected": -9.426374162946429, + "step": 1317 + }, + { + "epoch": 0.3612443469919145, + "grad_norm": 6.90625, + "kl": 0.8402189016342163, + "learning_rate": 5e-06, + "logits/chosen": -54924160.0, + "logits/rejected": -31828740.923076924, + "logps/chosen": -435.63680752840907, + "logps/rejected": -521.4894831730769, + "loss": 0.0263, + "rewards/chosen": 6.22345664284446, + "rewards/margins": 14.001089136083642, + "rewards/rejected": -7.7776324932391825, + "step": 1318 + }, + { + "epoch": 0.3615184322324243, + "grad_norm": 7.96875, + "kl": 2.3883540630340576, + "learning_rate": 5e-06, + "logits/chosen": -43834096.0, + "logits/rejected": 64513930.666666664, + "logps/chosen": -557.210693359375, + "logps/rejected": -550.3265787760416, + "loss": 0.0147, + "rewards/chosen": 7.8451188405354815, + "rewards/margins": 18.076984405517578, + "rewards/rejected": -10.231865564982096, + "step": 1319 + }, + { + "epoch": 0.36179251747293406, + "grad_norm": 8.0625, + "kl": 6.100249290466309, + "learning_rate": 5e-06, + "logits/chosen": -17344240.0, + "logits/rejected": -22285942.4, + "logps/chosen": -492.3059779575893, + "logps/rejected": -450.86787109375, + "loss": 0.0558, + "rewards/chosen": 6.75000980922154, + "rewards/margins": 16.376309095110212, + "rewards/rejected": -9.626299285888672, + "step": 1320 + }, + { + "epoch": 0.3620666027134439, + "grad_norm": 6.9375, + "kl": 9.57080364227295, + "learning_rate": 5e-06, + "logits/chosen": -36577878.15384615, + "logits/rejected": -16014388.363636363, + "logps/chosen": -405.5471754807692, + "logps/rejected": -669.9740767045455, + "loss": 0.0266, + "rewards/chosen": 5.745517437274639, + "rewards/margins": 15.05219274134069, + "rewards/rejected": -9.30667530406605, + "step": 1321 + }, + { + "epoch": 0.3623406879539537, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25470229.333333332, + "logits/rejected": -6783238.666666667, + "logps/chosen": -381.9912923177083, + "logps/rejected": -561.4071451822916, + "loss": 0.0513, + "rewards/chosen": 5.411099116007487, + "rewards/margins": 16.811756769816082, + "rewards/rejected": -11.400657653808594, + "step": 1322 + }, + { + "epoch": 0.3626147731944635, + "grad_norm": 5.65625, + "kl": 3.8147010803222656, + "learning_rate": 5e-06, + "logits/chosen": -6007678.857142857, + "logits/rejected": -13911926.4, + "logps/chosen": -489.59933035714283, + "logps/rejected": -622.409375, + "loss": 0.0221, + "rewards/chosen": 7.675437927246094, + "rewards/margins": 17.146231842041015, + "rewards/rejected": -9.470793914794921, + "step": 1323 + }, + { + "epoch": 0.36288885843497326, + "grad_norm": 7.28125, + "kl": 2.002584457397461, + "learning_rate": 5e-06, + "logits/chosen": -10316900.0, + "logits/rejected": -24719522.285714287, + "logps/chosen": -668.27333984375, + "logps/rejected": -569.5201241629464, + "loss": 0.0243, + "rewards/chosen": 7.305685424804688, + "rewards/margins": 15.383961922781808, + "rewards/rejected": -8.07827649797712, + "step": 1324 + }, + { + "epoch": 0.3631629436754831, + "grad_norm": 11.875, + "kl": 5.34709358215332, + "learning_rate": 5e-06, + "logits/chosen": -38493464.0, + "logits/rejected": -28808568.0, + "logps/chosen": -478.2115885416667, + "logps/rejected": -458.6781412760417, + "loss": 0.0506, + "rewards/chosen": 6.829026540120442, + "rewards/margins": 12.93684196472168, + "rewards/rejected": -6.107815424601237, + "step": 1325 + }, + { + "epoch": 0.3634370289159929, + "grad_norm": 9.8125, + "kl": 1.26107919216156, + "learning_rate": 5e-06, + "logits/chosen": -30977610.666666668, + "logits/rejected": -17471784.0, + "logps/chosen": -384.3649495442708, + "logps/rejected": -447.6797281901042, + "loss": 0.0741, + "rewards/chosen": 5.723009745279948, + "rewards/margins": 12.88739840189616, + "rewards/rejected": -7.164388656616211, + "step": 1326 + }, + { + "epoch": 0.36371111415650265, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29047138.0, + "logits/rejected": -15421160.0, + "logps/chosen": -418.77301025390625, + "logps/rejected": -597.17626953125, + "loss": 0.0147, + "rewards/chosen": 5.837831497192383, + "rewards/margins": 15.897794723510742, + "rewards/rejected": -10.05996322631836, + "step": 1327 + }, + { + "epoch": 0.36398519939701246, + "grad_norm": 9.6875, + "kl": 1.5181134939193726, + "learning_rate": 5e-06, + "logits/chosen": -33980556.307692304, + "logits/rejected": -8399060.363636363, + "logps/chosen": -388.17964993990387, + "logps/rejected": -587.7167080965909, + "loss": 0.0439, + "rewards/chosen": 6.914930490347055, + "rewards/margins": 14.170903612683702, + "rewards/rejected": -7.2559731223366475, + "step": 1328 + }, + { + "epoch": 0.3642592846375223, + "grad_norm": 13.5, + "kl": 5.242144584655762, + "learning_rate": 5e-06, + "logits/chosen": -17161778.0, + "logits/rejected": -29613894.0, + "logps/chosen": -348.9447937011719, + "logps/rejected": -320.2668762207031, + "loss": 0.0915, + "rewards/chosen": 4.60166072845459, + "rewards/margins": 11.523388862609863, + "rewards/rejected": -6.921728134155273, + "step": 1329 + }, + { + "epoch": 0.3645333698780321, + "grad_norm": 6.21875, + "kl": 8.600996017456055, + "learning_rate": 5e-06, + "logits/chosen": -26692068.57142857, + "logits/rejected": 9186982.4, + "logps/chosen": -610.27783203125, + "logps/rejected": -335.0466796875, + "loss": 0.023, + "rewards/chosen": 6.715667724609375, + "rewards/margins": 14.025341796875, + "rewards/rejected": -7.309674072265625, + "step": 1330 + }, + { + "epoch": 0.36480745511854185, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34753081.6, + "logits/rejected": -31398886.85714286, + "logps/chosen": -419.83388671875, + "logps/rejected": -500.0185546875, + "loss": 0.0605, + "rewards/chosen": 6.813080596923828, + "rewards/margins": 15.64086445399693, + "rewards/rejected": -8.827783857073102, + "step": 1331 + }, + { + "epoch": 0.36508154035905166, + "grad_norm": 13.9375, + "kl": 2.5543341636657715, + "learning_rate": 5e-06, + "logits/chosen": -38640743.384615384, + "logits/rejected": -15202846.545454545, + "logps/chosen": -464.4075270432692, + "logps/rejected": -397.57839133522725, + "loss": 0.0646, + "rewards/chosen": 5.463929396409255, + "rewards/margins": 12.175578510844623, + "rewards/rejected": -6.711649114435369, + "step": 1332 + }, + { + "epoch": 0.3653556255995615, + "grad_norm": 9.25, + "kl": 4.818833827972412, + "learning_rate": 5e-06, + "logits/chosen": -21652912.94117647, + "logits/rejected": -23707344.0, + "logps/chosen": -355.64430147058823, + "logps/rejected": -560.6383928571429, + "loss": 0.0596, + "rewards/chosen": 5.611994126263787, + "rewards/margins": 17.947263509285552, + "rewards/rejected": -12.335269383021764, + "step": 1333 + }, + { + "epoch": 0.3656297108400713, + "grad_norm": 2.484375, + "kl": 0.7742919921875, + "learning_rate": 5e-06, + "logits/chosen": -12666893.0, + "logits/rejected": -2695983.5, + "logps/chosen": -546.8629760742188, + "logps/rejected": -516.3068237304688, + "loss": 0.006, + "rewards/chosen": 7.597806453704834, + "rewards/margins": 16.69279432296753, + "rewards/rejected": -9.094987869262695, + "step": 1334 + }, + { + "epoch": 0.36590379608058105, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22029728.0, + "logits/rejected": -4588336.0, + "logps/chosen": -420.8055889423077, + "logps/rejected": -433.0086115056818, + "loss": 0.0716, + "rewards/chosen": 5.376212486853967, + "rewards/margins": 11.60070811451732, + "rewards/rejected": -6.2244956276633525, + "step": 1335 + }, + { + "epoch": 0.36617788132109086, + "grad_norm": 13.4375, + "kl": 14.16617202758789, + "learning_rate": 5e-06, + "logits/chosen": -13624747.555555556, + "logits/rejected": -19007910.666666668, + "logps/chosen": -524.9396158854166, + "logps/rejected": -547.08984375, + "loss": 0.0813, + "rewards/chosen": 6.521681891547309, + "rewards/margins": 15.992014990912544, + "rewards/rejected": -9.470333099365234, + "step": 1336 + }, + { + "epoch": 0.3664519665616007, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14086905.6, + "logits/rejected": -32529961.14285714, + "logps/chosen": -311.9728515625, + "logps/rejected": -543.7139718191964, + "loss": 0.0287, + "rewards/chosen": 6.002128982543946, + "rewards/margins": 13.106008638654437, + "rewards/rejected": -7.103879656110491, + "step": 1337 + }, + { + "epoch": 0.36672605180211043, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14628756.363636363, + "logits/rejected": -14302305.23076923, + "logps/chosen": -409.42524857954544, + "logps/rejected": -482.17394080528845, + "loss": 0.0572, + "rewards/chosen": 6.726418928666548, + "rewards/margins": 14.125341188657533, + "rewards/rejected": -7.398922259990986, + "step": 1338 + }, + { + "epoch": 0.36700013704262024, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5255850.333333333, + "logits/rejected": -12894789.333333334, + "logps/chosen": -472.4346516927083, + "logps/rejected": -596.9496256510416, + "loss": 0.0394, + "rewards/chosen": 5.601472854614258, + "rewards/margins": 14.68197504679362, + "rewards/rejected": -9.080502192179361, + "step": 1339 + }, + { + "epoch": 0.36727422228313006, + "grad_norm": 10.625, + "kl": 5.201087951660156, + "learning_rate": 5e-06, + "logits/chosen": -12326772.0, + "logits/rejected": -41313914.666666664, + "logps/chosen": -404.2187093098958, + "logps/rejected": -576.4457194010416, + "loss": 0.0458, + "rewards/chosen": 5.89248784383138, + "rewards/margins": 15.154366811116535, + "rewards/rejected": -9.261878967285156, + "step": 1340 + }, + { + "epoch": 0.36754830752363987, + "grad_norm": 13.3125, + "kl": 4.881062984466553, + "learning_rate": 5e-06, + "logits/chosen": -21207163.076923076, + "logits/rejected": -14283694.545454545, + "logps/chosen": -517.8909630408654, + "logps/rejected": -464.77885298295456, + "loss": 0.053, + "rewards/chosen": 7.4721538837139425, + "rewards/margins": 14.944844119198674, + "rewards/rejected": -7.47269023548473, + "step": 1341 + }, + { + "epoch": 0.36782239276414963, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21533912.888888888, + "logits/rejected": -36796381.86666667, + "logps/chosen": -508.58550347222223, + "logps/rejected": -518.9287434895833, + "loss": 0.018, + "rewards/chosen": 6.709846496582031, + "rewards/margins": 16.024578348795572, + "rewards/rejected": -9.314731852213542, + "step": 1342 + }, + { + "epoch": 0.36809647800465944, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28966682.181818184, + "logits/rejected": -18942816.0, + "logps/chosen": -461.1251775568182, + "logps/rejected": -488.08800330528845, + "loss": 0.0216, + "rewards/chosen": 7.002656416459517, + "rewards/margins": 14.10326524214311, + "rewards/rejected": -7.100608825683594, + "step": 1343 + }, + { + "epoch": 0.36837056324516926, + "grad_norm": 12.8125, + "kl": 8.113641738891602, + "learning_rate": 5e-06, + "logits/chosen": -24313412.266666666, + "logits/rejected": -16114282.666666666, + "logps/chosen": -390.80071614583335, + "logps/rejected": -446.17450629340277, + "loss": 0.0769, + "rewards/chosen": 5.332761637369791, + "rewards/margins": 13.47234819200304, + "rewards/rejected": -8.139586554633247, + "step": 1344 + }, + { + "epoch": 0.36864464848567907, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40756936.0, + "logits/rejected": -6486666.0, + "logps/chosen": -353.58599853515625, + "logps/rejected": -572.01025390625, + "loss": 0.0104, + "rewards/chosen": 6.601680278778076, + "rewards/margins": 17.377156734466553, + "rewards/rejected": -10.775476455688477, + "step": 1345 + }, + { + "epoch": 0.36891873372618883, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16314056.0, + "logits/rejected": -1064486.0, + "logps/chosen": -392.78729248046875, + "logps/rejected": -533.464599609375, + "loss": 0.0341, + "rewards/chosen": 5.21478271484375, + "rewards/margins": 16.008170127868652, + "rewards/rejected": -10.793387413024902, + "step": 1346 + }, + { + "epoch": 0.36919281896669864, + "grad_norm": 13.375, + "kl": 8.712431907653809, + "learning_rate": 5e-06, + "logits/chosen": -32512925.333333332, + "logits/rejected": -29165002.666666668, + "logps/chosen": -373.238525390625, + "logps/rejected": -544.400146484375, + "loss": 0.0409, + "rewards/chosen": 6.612119038899739, + "rewards/margins": 13.796839396158854, + "rewards/rejected": -7.184720357259114, + "step": 1347 + }, + { + "epoch": 0.36946690420720846, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46394944.0, + "logits/rejected": -19369697.454545453, + "logps/chosen": -421.2126652644231, + "logps/rejected": -451.87473366477275, + "loss": 0.0283, + "rewards/chosen": 6.013519287109375, + "rewards/margins": 15.543168501420455, + "rewards/rejected": -9.52964921431108, + "step": 1348 + }, + { + "epoch": 0.3697409894477182, + "grad_norm": 1.9921875, + "kl": 6.517764091491699, + "learning_rate": 5e-06, + "logits/chosen": -28197072.0, + "logits/rejected": -18243488.0, + "logps/chosen": -483.9344482421875, + "logps/rejected": -399.2773132324219, + "loss": 0.0086, + "rewards/chosen": 6.47272253036499, + "rewards/margins": 13.5272798538208, + "rewards/rejected": -7.0545573234558105, + "step": 1349 + }, + { + "epoch": 0.37001507468822803, + "grad_norm": 8.0625, + "kl": 1.350947380065918, + "learning_rate": 5e-06, + "logits/chosen": -23262821.333333332, + "logits/rejected": -9536202.666666666, + "logps/chosen": -435.9119059244792, + "logps/rejected": -446.5491943359375, + "loss": 0.0557, + "rewards/chosen": 5.531155904134114, + "rewards/margins": 10.774600346883137, + "rewards/rejected": -5.243444442749023, + "step": 1350 + }, + { + "epoch": 0.37028915992873784, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20476981.333333332, + "logits/rejected": -20783402.666666668, + "logps/chosen": -479.5096028645833, + "logps/rejected": -453.2215983072917, + "loss": 0.0203, + "rewards/chosen": 6.991508483886719, + "rewards/margins": 18.298458099365234, + "rewards/rejected": -11.306949615478516, + "step": 1351 + }, + { + "epoch": 0.37056324516924766, + "grad_norm": 13.125, + "kl": 11.53343391418457, + "learning_rate": 5e-06, + "logits/chosen": -31289747.692307692, + "logits/rejected": -23894474.181818184, + "logps/chosen": -379.7421875, + "logps/rejected": -619.6512784090909, + "loss": 0.0585, + "rewards/chosen": 5.971141521747295, + "rewards/margins": 16.978200792432666, + "rewards/rejected": -11.00705927068537, + "step": 1352 + }, + { + "epoch": 0.3708373304097574, + "grad_norm": 10.375, + "kl": 5.800472259521484, + "learning_rate": 5e-06, + "logits/chosen": -7409966.545454546, + "logits/rejected": -18418816.0, + "logps/chosen": -375.73215553977275, + "logps/rejected": -552.8592623197115, + "loss": 0.0257, + "rewards/chosen": 6.080924987792969, + "rewards/margins": 14.06476064828726, + "rewards/rejected": -7.983835660494291, + "step": 1353 + }, + { + "epoch": 0.37111141565026723, + "grad_norm": 8.8125, + "kl": 0.18428167700767517, + "learning_rate": 5e-06, + "logits/chosen": -21571378.285714287, + "logits/rejected": -23533732.8, + "logps/chosen": -328.05008370535717, + "logps/rejected": -570.90224609375, + "loss": 0.0837, + "rewards/chosen": 4.795946938650949, + "rewards/margins": 13.722739846365794, + "rewards/rejected": -8.926792907714844, + "step": 1354 + }, + { + "epoch": 0.37138550089077704, + "grad_norm": 11.3125, + "kl": 8.792890548706055, + "learning_rate": 5e-06, + "logits/chosen": -17316034.46153846, + "logits/rejected": -20969166.545454547, + "logps/chosen": -412.92127403846155, + "logps/rejected": -474.11234907670456, + "loss": 0.1087, + "rewards/chosen": 5.153356698843149, + "rewards/margins": 15.654392909336757, + "rewards/rejected": -10.501036210493607, + "step": 1355 + }, + { + "epoch": 0.37165958613128686, + "grad_norm": 5.46875, + "kl": 2.325054168701172, + "learning_rate": 5e-06, + "logits/chosen": -8730145.6, + "logits/rejected": -9192875.42857143, + "logps/chosen": -421.088671875, + "logps/rejected": -557.3135463169643, + "loss": 0.0252, + "rewards/chosen": 6.443943023681641, + "rewards/margins": 14.80918938773019, + "rewards/rejected": -8.365246364048549, + "step": 1356 + }, + { + "epoch": 0.3719336713717966, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24187081.846153848, + "logits/rejected": -29516834.90909091, + "logps/chosen": -456.4699894831731, + "logps/rejected": -656.2371715198864, + "loss": 0.0314, + "rewards/chosen": 6.957057659442608, + "rewards/margins": 16.79329548015461, + "rewards/rejected": -9.836237820712002, + "step": 1357 + }, + { + "epoch": 0.37220775661230643, + "grad_norm": 20.25, + "kl": 12.240156173706055, + "learning_rate": 5e-06, + "logits/chosen": -4085222.588235294, + "logits/rejected": -17976052.57142857, + "logps/chosen": -388.50109145220586, + "logps/rejected": -469.46732003348217, + "loss": 0.1402, + "rewards/chosen": 4.42716261919807, + "rewards/margins": 12.227441964029264, + "rewards/rejected": -7.8002793448311945, + "step": 1358 + }, + { + "epoch": 0.37248184185281624, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4691790.545454546, + "logits/rejected": -22464556.307692308, + "logps/chosen": -449.96262428977275, + "logps/rejected": -661.587890625, + "loss": 0.0086, + "rewards/chosen": 6.193917707963423, + "rewards/margins": 17.772883115114865, + "rewards/rejected": -11.578965407151442, + "step": 1359 + }, + { + "epoch": 0.372755927093326, + "grad_norm": 13.5, + "kl": 7.822842597961426, + "learning_rate": 5e-06, + "logits/chosen": -25519687.111111112, + "logits/rejected": 2859292.0, + "logps/chosen": -429.31255425347223, + "logps/rejected": -471.4455973307292, + "loss": 0.1126, + "rewards/chosen": 5.375203874376085, + "rewards/margins": 11.127295600043404, + "rewards/rejected": -5.752091725667317, + "step": 1360 + }, + { + "epoch": 0.3730300123338358, + "grad_norm": 10.9375, + "kl": 7.173015594482422, + "learning_rate": 5e-06, + "logits/chosen": -17772228.923076924, + "logits/rejected": -45256526.54545455, + "logps/chosen": -395.1916691706731, + "logps/rejected": -442.22305575284093, + "loss": 0.0502, + "rewards/chosen": 7.150512108435998, + "rewards/margins": 16.54223157976057, + "rewards/rejected": -9.391719471324574, + "step": 1361 + }, + { + "epoch": 0.37330409757434563, + "grad_norm": 7.03125, + "kl": 4.963908672332764, + "learning_rate": 5e-06, + "logits/chosen": -20737308.8, + "logits/rejected": -32698262.85714286, + "logps/chosen": -427.07255859375, + "logps/rejected": -612.3273577008929, + "loss": 0.0339, + "rewards/chosen": 6.004665374755859, + "rewards/margins": 16.832601819719585, + "rewards/rejected": -10.827936444963727, + "step": 1362 + }, + { + "epoch": 0.37357818281485544, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40703926.15384615, + "logits/rejected": -19191863.272727273, + "logps/chosen": -465.59731820913464, + "logps/rejected": -480.9312855113636, + "loss": 0.0643, + "rewards/chosen": 6.849513714130108, + "rewards/margins": 13.332285847697225, + "rewards/rejected": -6.482772133567116, + "step": 1363 + }, + { + "epoch": 0.3738522680553652, + "grad_norm": 8.0625, + "kl": 6.128045558929443, + "learning_rate": 5e-06, + "logits/chosen": -15175236.363636363, + "logits/rejected": -15011224.615384616, + "logps/chosen": -372.8523615056818, + "logps/rejected": -582.0616736778846, + "loss": 0.058, + "rewards/chosen": 5.613186922940341, + "rewards/margins": 14.920761268455664, + "rewards/rejected": -9.307574345515324, + "step": 1364 + }, + { + "epoch": 0.374126353295875, + "grad_norm": 3.875, + "kl": 1.0866343975067139, + "learning_rate": 5e-06, + "logits/chosen": -19945410.46153846, + "logits/rejected": -31464459.636363637, + "logps/chosen": -426.2310321514423, + "logps/rejected": -649.7448952414773, + "loss": 0.0067, + "rewards/chosen": 7.876290541428786, + "rewards/margins": 19.568312691641854, + "rewards/rejected": -11.692022150213068, + "step": 1365 + }, + { + "epoch": 0.3744004385363848, + "grad_norm": 9.5, + "kl": 3.170278549194336, + "learning_rate": 5e-06, + "logits/chosen": -14971720.0, + "logits/rejected": -23046512.0, + "logps/chosen": -390.8773193359375, + "logps/rejected": -539.6959228515625, + "loss": 0.0389, + "rewards/chosen": 5.581034342447917, + "rewards/margins": 15.993284225463867, + "rewards/rejected": -10.412249883015951, + "step": 1366 + }, + { + "epoch": 0.37467452377689464, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26175864.0, + "logits/rejected": -12746666.666666666, + "logps/chosen": -484.115478515625, + "logps/rejected": -506.0573323567708, + "loss": 0.012, + "rewards/chosen": 5.342023213704427, + "rewards/margins": 14.353556315104168, + "rewards/rejected": -9.01153310139974, + "step": 1367 + }, + { + "epoch": 0.3749486090174044, + "grad_norm": 8.9375, + "kl": 5.3463454246521, + "learning_rate": 5e-06, + "logits/chosen": -27990606.933333334, + "logits/rejected": 7252065.333333333, + "logps/chosen": -391.03502604166664, + "logps/rejected": -540.2826605902778, + "loss": 0.0387, + "rewards/chosen": 5.848204549153646, + "rewards/margins": 16.38685760498047, + "rewards/rejected": -10.538653055826822, + "step": 1368 + }, + { + "epoch": 0.3752226942579142, + "grad_norm": 3.515625, + "kl": 2.1995444297790527, + "learning_rate": 5e-06, + "logits/chosen": -22740668.444444444, + "logits/rejected": -15242888.533333333, + "logps/chosen": -356.75726996527777, + "logps/rejected": -487.87421875, + "loss": 0.0368, + "rewards/chosen": 6.148828294542101, + "rewards/margins": 15.101002163357204, + "rewards/rejected": -8.952173868815104, + "step": 1369 + }, + { + "epoch": 0.375496779498424, + "grad_norm": 5.34375, + "kl": 3.834341526031494, + "learning_rate": 5e-06, + "logits/chosen": -17311448.0, + "logits/rejected": -23720056.0, + "logps/chosen": -387.4996337890625, + "logps/rejected": -329.1541442871094, + "loss": 0.0556, + "rewards/chosen": 6.472764492034912, + "rewards/margins": 11.150480270385742, + "rewards/rejected": -4.67771577835083, + "step": 1370 + }, + { + "epoch": 0.3757708647389338, + "grad_norm": 10.1875, + "kl": 2.4680233001708984, + "learning_rate": 5e-06, + "logits/chosen": -23193893.333333332, + "logits/rejected": -22049678.666666668, + "logps/chosen": -473.3784993489583, + "logps/rejected": -551.5137939453125, + "loss": 0.0527, + "rewards/chosen": 5.980538050333659, + "rewards/margins": 15.94972038269043, + "rewards/rejected": -9.969182332356771, + "step": 1371 + }, + { + "epoch": 0.3760449499794436, + "grad_norm": 7.75, + "kl": 5.320826530456543, + "learning_rate": 5e-06, + "logits/chosen": -24369036.307692308, + "logits/rejected": -29353600.0, + "logps/chosen": -529.43212890625, + "logps/rejected": -578.52734375, + "loss": 0.0341, + "rewards/chosen": 6.14867929311899, + "rewards/margins": 16.09266011698263, + "rewards/rejected": -9.943980823863637, + "step": 1372 + }, + { + "epoch": 0.3763190352199534, + "grad_norm": 8.1875, + "kl": 2.913560390472412, + "learning_rate": 5e-06, + "logits/chosen": 42676160.0, + "logits/rejected": -26703561.14285714, + "logps/chosen": -426.602099609375, + "logps/rejected": -672.1701311383929, + "loss": 0.0574, + "rewards/chosen": 5.629318618774414, + "rewards/margins": 17.218637030465263, + "rewards/rejected": -11.589318411690849, + "step": 1373 + }, + { + "epoch": 0.3765931204604632, + "grad_norm": 14.5, + "kl": 3.1393322944641113, + "learning_rate": 5e-06, + "logits/chosen": -24517693.714285713, + "logits/rejected": -22183401.6, + "logps/chosen": -399.96358816964283, + "logps/rejected": -513.32119140625, + "loss": 0.0916, + "rewards/chosen": 5.446321759905134, + "rewards/margins": 13.67736576625279, + "rewards/rejected": -8.231044006347656, + "step": 1374 + }, + { + "epoch": 0.376867205700973, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19839670.85714286, + "logits/rejected": -21188291.76470588, + "logps/chosen": -518.0590122767857, + "logps/rejected": -570.8956801470588, + "loss": 0.0801, + "rewards/chosen": 6.295253753662109, + "rewards/margins": 14.410270017736098, + "rewards/rejected": -8.115016264073988, + "step": 1375 + }, + { + "epoch": 0.3771412909414828, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20146128.0, + "logits/rejected": -23185107.692307692, + "logps/chosen": -456.4454900568182, + "logps/rejected": -450.89148888221155, + "loss": 0.0242, + "rewards/chosen": 6.681622591885653, + "rewards/margins": 13.615015043245329, + "rewards/rejected": -6.933392451359675, + "step": 1376 + }, + { + "epoch": 0.3774153761819926, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28711020.8, + "logits/rejected": -447470.85714285716, + "logps/chosen": -377.9548828125, + "logps/rejected": -476.52968052455356, + "loss": 0.0596, + "rewards/chosen": 5.9559326171875, + "rewards/margins": 12.511324746268137, + "rewards/rejected": -6.555392129080636, + "step": 1377 + }, + { + "epoch": 0.37768946142250237, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17720616.0, + "logits/rejected": -11930717.714285715, + "logps/chosen": -322.306591796875, + "logps/rejected": -379.26806640625, + "loss": 0.0442, + "rewards/chosen": 4.535403442382813, + "rewards/margins": 12.624801417759485, + "rewards/rejected": -8.089397975376674, + "step": 1378 + }, + { + "epoch": 0.3779635466630122, + "grad_norm": 6.59375, + "kl": 4.842496395111084, + "learning_rate": 5e-06, + "logits/chosen": -17207272.533333335, + "logits/rejected": -29896711.111111112, + "logps/chosen": -357.76178385416665, + "logps/rejected": -597.9447699652778, + "loss": 0.0663, + "rewards/chosen": 4.686810811360677, + "rewards/margins": 15.42571512858073, + "rewards/rejected": -10.738904317220053, + "step": 1379 + }, + { + "epoch": 0.378237631903522, + "grad_norm": 10.5, + "kl": 8.89041805267334, + "learning_rate": 5e-06, + "logits/chosen": -22908275.2, + "logits/rejected": -15090704.0, + "logps/chosen": -397.74895833333335, + "logps/rejected": -474.81846788194446, + "loss": 0.0681, + "rewards/chosen": 5.940164693196615, + "rewards/margins": 11.84436535305447, + "rewards/rejected": -5.904200659857856, + "step": 1380 + }, + { + "epoch": 0.3785117171440318, + "grad_norm": 12.375, + "kl": 5.787055969238281, + "learning_rate": 5e-06, + "logits/chosen": -25085703.529411763, + "logits/rejected": -30077053.714285713, + "logps/chosen": -430.9404296875, + "logps/rejected": -524.4429757254464, + "loss": 0.0526, + "rewards/chosen": 6.537899690515855, + "rewards/margins": 14.344460751830029, + "rewards/rejected": -7.8065610613141745, + "step": 1381 + }, + { + "epoch": 0.37878580238454157, + "grad_norm": 4.5625, + "kl": 0.07213084399700165, + "learning_rate": 5e-06, + "logits/chosen": -36316472.615384616, + "logits/rejected": -22782487.272727273, + "logps/chosen": -447.64896334134613, + "logps/rejected": -874.220703125, + "loss": 0.0107, + "rewards/chosen": 6.842833298903245, + "rewards/margins": 20.307765827312338, + "rewards/rejected": -13.464932528409092, + "step": 1382 + }, + { + "epoch": 0.3790598876250514, + "grad_norm": 7.46875, + "kl": 1.2391414642333984, + "learning_rate": 5e-06, + "logits/chosen": -10205222.857142856, + "logits/rejected": -9450368.0, + "logps/chosen": -385.673828125, + "logps/rejected": -441.63857421875, + "loss": 0.0427, + "rewards/chosen": 6.154412405831473, + "rewards/margins": 13.412180655343192, + "rewards/rejected": -7.257768249511718, + "step": 1383 + }, + { + "epoch": 0.3793339728655612, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33911207.11111111, + "logits/rejected": -13058011.733333332, + "logps/chosen": -400.8806966145833, + "logps/rejected": -427.82255859375, + "loss": 0.0143, + "rewards/chosen": 7.18104722764757, + "rewards/margins": 17.738263617621527, + "rewards/rejected": -10.557216389973958, + "step": 1384 + }, + { + "epoch": 0.379608058106071, + "grad_norm": 9.4375, + "kl": 0.0013427734375, + "learning_rate": 5e-06, + "logits/chosen": -15569785.6, + "logits/rejected": -11483112.0, + "logps/chosen": -404.3706787109375, + "logps/rejected": -402.08642578125, + "loss": 0.0501, + "rewards/chosen": 6.231559371948242, + "rewards/margins": 13.765058408464704, + "rewards/rejected": -7.533499036516462, + "step": 1385 + }, + { + "epoch": 0.37988214334658077, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2147298.4615384615, + "logits/rejected": -16964609.454545453, + "logps/chosen": -378.64633413461536, + "logps/rejected": -603.8248401988636, + "loss": 0.0267, + "rewards/chosen": 5.670162494365986, + "rewards/margins": 15.652236191542832, + "rewards/rejected": -9.982073697176846, + "step": 1386 + }, + { + "epoch": 0.3801562285870906, + "grad_norm": 18.125, + "kl": 8.916831016540527, + "learning_rate": 5e-06, + "logits/chosen": -24932721.454545453, + "logits/rejected": 55292219.07692308, + "logps/chosen": -388.6292613636364, + "logps/rejected": -586.9148137019231, + "loss": 0.1303, + "rewards/chosen": 5.100325150923296, + "rewards/margins": 15.713935051764643, + "rewards/rejected": -10.613609900841347, + "step": 1387 + }, + { + "epoch": 0.3804303138276004, + "grad_norm": 8.125, + "kl": 4.023049831390381, + "learning_rate": 5e-06, + "logits/chosen": -33439646.11764706, + "logits/rejected": -12284657.142857144, + "logps/chosen": -527.9079733455883, + "logps/rejected": -607.30224609375, + "loss": 0.051, + "rewards/chosen": 7.876656924977022, + "rewards/margins": 16.607018863453582, + "rewards/rejected": -8.730361938476562, + "step": 1388 + }, + { + "epoch": 0.38070439906811016, + "grad_norm": 6.625, + "kl": 2.3660855293273926, + "learning_rate": 5e-06, + "logits/chosen": -22813740.307692308, + "logits/rejected": -13124648.727272727, + "logps/chosen": -387.0304987980769, + "logps/rejected": -509.43412642045456, + "loss": 0.0861, + "rewards/chosen": 6.212790269118089, + "rewards/margins": 16.17036459329245, + "rewards/rejected": -9.957574324174361, + "step": 1389 + }, + { + "epoch": 0.38097848430861997, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11514995.692307692, + "logits/rejected": -8128618.181818182, + "logps/chosen": -364.83529897836536, + "logps/rejected": -401.13725142045456, + "loss": 0.0185, + "rewards/chosen": 6.449990492600661, + "rewards/margins": 16.177859006228147, + "rewards/rejected": -9.727868513627486, + "step": 1390 + }, + { + "epoch": 0.3812525695491298, + "grad_norm": 12.125, + "kl": 4.009106636047363, + "learning_rate": 5e-06, + "logits/chosen": 48565058.90909091, + "logits/rejected": -45311271.384615384, + "logps/chosen": -494.5110973011364, + "logps/rejected": -435.9802809495192, + "loss": 0.0496, + "rewards/chosen": 6.272986672141335, + "rewards/margins": 13.433048435024448, + "rewards/rejected": -7.160061762883113, + "step": 1391 + }, + { + "epoch": 0.3815266547896396, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25986397.333333332, + "logits/rejected": -11705772.0, + "logps/chosen": -319.29233805338544, + "logps/rejected": -414.7865397135417, + "loss": 0.0442, + "rewards/chosen": 6.384930928548177, + "rewards/margins": 13.787354787190754, + "rewards/rejected": -7.402423858642578, + "step": 1392 + }, + { + "epoch": 0.38180074003014935, + "grad_norm": 5.46875, + "kl": 9.212395668029785, + "learning_rate": 5e-06, + "logits/chosen": -17544302.933333334, + "logits/rejected": -38258378.666666664, + "logps/chosen": -512.2736002604166, + "logps/rejected": -572.9869791666666, + "loss": 0.0939, + "rewards/chosen": 5.774121602376302, + "rewards/margins": 15.125226338704426, + "rewards/rejected": -9.351104736328125, + "step": 1393 + }, + { + "epoch": 0.38207482527065917, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28904883.2, + "logits/rejected": -36532700.44444445, + "logps/chosen": -485.22447916666664, + "logps/rejected": -563.7375759548611, + "loss": 0.0156, + "rewards/chosen": 6.482096354166667, + "rewards/margins": 18.07865227593316, + "rewards/rejected": -11.596555921766493, + "step": 1394 + }, + { + "epoch": 0.382348910511169, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10705168.666666666, + "logits/rejected": -10249010.0, + "logps/chosen": -441.7935384114583, + "logps/rejected": -628.5597330729166, + "loss": 0.016, + "rewards/chosen": 7.558779398600261, + "rewards/margins": 15.805034637451172, + "rewards/rejected": -8.246255238850912, + "step": 1395 + }, + { + "epoch": 0.3826229957516788, + "grad_norm": 8.75, + "kl": 7.114432334899902, + "learning_rate": 5e-06, + "logits/chosen": -42048029.538461536, + "logits/rejected": -18811610.181818184, + "logps/chosen": -326.1633112980769, + "logps/rejected": -493.24507279829544, + "loss": 0.0377, + "rewards/chosen": 7.006379934457632, + "rewards/margins": 15.437968540858556, + "rewards/rejected": -8.431588606400924, + "step": 1396 + }, + { + "epoch": 0.38289708099218855, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21719678.222222224, + "logits/rejected": -23319820.8, + "logps/chosen": -419.68861219618054, + "logps/rejected": -646.9729817708334, + "loss": 0.0037, + "rewards/chosen": 7.737933688693577, + "rewards/margins": 19.3145511203342, + "rewards/rejected": -11.576617431640624, + "step": 1397 + }, + { + "epoch": 0.38317116623269837, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11433850.0, + "logits/rejected": -1957782.125, + "logps/chosen": -350.4543762207031, + "logps/rejected": -609.0047607421875, + "loss": 0.0213, + "rewards/chosen": 5.920838832855225, + "rewards/margins": 14.65678358078003, + "rewards/rejected": -8.735944747924805, + "step": 1398 + }, + { + "epoch": 0.3834452514732082, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3641027.5555555555, + "logits/rejected": -7396843.733333333, + "logps/chosen": -303.7442220052083, + "logps/rejected": -724.3945963541667, + "loss": 0.0228, + "rewards/chosen": 5.072980244954427, + "rewards/margins": 17.202144877115884, + "rewards/rejected": -12.129164632161459, + "step": 1399 + }, + { + "epoch": 0.38371933671371794, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20140606.222222224, + "logits/rejected": -15172292.266666668, + "logps/chosen": -562.4630533854166, + "logps/rejected": -479.6453450520833, + "loss": 0.0699, + "rewards/chosen": 6.297349294026692, + "rewards/margins": 13.762322235107423, + "rewards/rejected": -7.464972941080729, + "step": 1400 + }, + { + "epoch": 0.38399342195422775, + "grad_norm": 14.25, + "kl": 1.3475902080535889, + "learning_rate": 5e-06, + "logits/chosen": 16433543.272727273, + "logits/rejected": -608617.8461538461, + "logps/chosen": -352.2791193181818, + "logps/rejected": -440.7605543870192, + "loss": 0.0852, + "rewards/chosen": 3.7790308865633877, + "rewards/margins": 11.704532783348244, + "rewards/rejected": -7.925501896784856, + "step": 1401 + }, + { + "epoch": 0.38426750719473757, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9733805.714285715, + "logits/rejected": -24229008.0, + "logps/chosen": -422.49776785714283, + "logps/rejected": -435.386669921875, + "loss": 0.0317, + "rewards/chosen": 4.980261121477399, + "rewards/margins": 12.659009443010603, + "rewards/rejected": -7.678748321533203, + "step": 1402 + }, + { + "epoch": 0.3845415924352474, + "grad_norm": 5.84375, + "kl": 1.2645753622055054, + "learning_rate": 5e-06, + "logits/chosen": -17009901.09090909, + "logits/rejected": -22763913.846153848, + "logps/chosen": -341.24174360795456, + "logps/rejected": -588.1577899639423, + "loss": 0.0403, + "rewards/chosen": 5.793030478737571, + "rewards/margins": 15.367644783500193, + "rewards/rejected": -9.57461430476262, + "step": 1403 + }, + { + "epoch": 0.38481567767575714, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31081606.4, + "logits/rejected": -14136518.857142856, + "logps/chosen": -438.66708984375, + "logps/rejected": -461.45567103794644, + "loss": 0.0284, + "rewards/chosen": 5.904724884033203, + "rewards/margins": 13.803697531563895, + "rewards/rejected": -7.898972647530692, + "step": 1404 + }, + { + "epoch": 0.38508976291626695, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31141169.454545453, + "logits/rejected": -31546112.0, + "logps/chosen": -470.2242542613636, + "logps/rejected": -516.6022385817307, + "loss": 0.0318, + "rewards/chosen": 6.536177201704546, + "rewards/margins": 18.61775068803267, + "rewards/rejected": -12.081573486328125, + "step": 1405 + }, + { + "epoch": 0.38536384815677677, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14639452.8, + "logits/rejected": 26190873.14285714, + "logps/chosen": -381.5597412109375, + "logps/rejected": -513.1459612165179, + "loss": 0.0498, + "rewards/chosen": 5.216278457641602, + "rewards/margins": 15.561525998796736, + "rewards/rejected": -10.345247541155134, + "step": 1406 + }, + { + "epoch": 0.3856379333972866, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20138573.714285713, + "logits/rejected": -25199166.11764706, + "logps/chosen": -419.7700892857143, + "logps/rejected": -490.9120519301471, + "loss": 0.0412, + "rewards/chosen": 5.785412924630301, + "rewards/margins": 14.967668389071939, + "rewards/rejected": -9.182255464441637, + "step": 1407 + }, + { + "epoch": 0.38591201863779634, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28348374.4, + "logits/rejected": -19506324.57142857, + "logps/chosen": -591.6673828125, + "logps/rejected": -440.2857142857143, + "loss": 0.0296, + "rewards/chosen": 7.85906982421875, + "rewards/margins": 16.827218191964285, + "rewards/rejected": -8.968148367745536, + "step": 1408 + }, + { + "epoch": 0.38618610387830615, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7159299.428571428, + "logits/rejected": -19971766.588235293, + "logps/chosen": -489.5111607142857, + "logps/rejected": -512.6785960477941, + "loss": 0.039, + "rewards/chosen": 5.497883387974331, + "rewards/margins": 13.902515956333705, + "rewards/rejected": -8.404632568359375, + "step": 1409 + }, + { + "epoch": 0.38646018911881597, + "grad_norm": 11.0625, + "kl": 8.206087112426758, + "learning_rate": 5e-06, + "logits/chosen": -33406006.0, + "logits/rejected": -7040979.5, + "logps/chosen": -398.5121154785156, + "logps/rejected": -559.1600952148438, + "loss": 0.0661, + "rewards/chosen": 6.292716979980469, + "rewards/margins": 13.778829574584961, + "rewards/rejected": -7.486112594604492, + "step": 1410 + }, + { + "epoch": 0.3867342743593257, + "grad_norm": 9.3125, + "kl": 6.8136115074157715, + "learning_rate": 5e-06, + "logits/chosen": -19231750.4, + "logits/rejected": -16003850.666666666, + "logps/chosen": -343.5164388020833, + "logps/rejected": -518.9685872395834, + "loss": 0.0617, + "rewards/chosen": 5.851529947916666, + "rewards/margins": 13.387879774305556, + "rewards/rejected": -7.536349826388889, + "step": 1411 + }, + { + "epoch": 0.38700835959983554, + "grad_norm": 12.1875, + "kl": 12.79141902923584, + "learning_rate": 5e-06, + "logits/chosen": -12059554.133333333, + "logits/rejected": -29063504.0, + "logps/chosen": -377.70625, + "logps/rejected": -592.2458767361111, + "loss": 0.0584, + "rewards/chosen": 6.876853434244792, + "rewards/margins": 17.14130859375, + "rewards/rejected": -10.264455159505209, + "step": 1412 + }, + { + "epoch": 0.38728244484034535, + "grad_norm": 3.203125, + "kl": 1.6815261840820312, + "learning_rate": 5e-06, + "logits/chosen": -58631099.428571425, + "logits/rejected": -36261612.8, + "logps/chosen": -450.61586216517856, + "logps/rejected": -618.961767578125, + "loss": 0.0119, + "rewards/chosen": 6.43743896484375, + "rewards/margins": 16.019895172119142, + "rewards/rejected": -9.58245620727539, + "step": 1413 + }, + { + "epoch": 0.38755653008085517, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15847315.2, + "logits/rejected": -27325570.285714287, + "logps/chosen": -342.9879150390625, + "logps/rejected": -616.8536551339286, + "loss": 0.0274, + "rewards/chosen": 6.1737007141113285, + "rewards/margins": 15.54806420462472, + "rewards/rejected": -9.374363490513392, + "step": 1414 + }, + { + "epoch": 0.3878306153213649, + "grad_norm": 8.6875, + "kl": 8.707789421081543, + "learning_rate": 5e-06, + "logits/chosen": -19554310.85714286, + "logits/rejected": 3619711.2, + "logps/chosen": -442.80873325892856, + "logps/rejected": -423.546630859375, + "loss": 0.064, + "rewards/chosen": 6.673303876604352, + "rewards/margins": 13.897506604875836, + "rewards/rejected": -7.224202728271484, + "step": 1415 + }, + { + "epoch": 0.38810470056187474, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7865761.142857143, + "logits/rejected": -12082646.4, + "logps/chosen": -541.0916922433036, + "logps/rejected": -528.327001953125, + "loss": 0.019, + "rewards/chosen": 7.124297550746372, + "rewards/margins": 15.25155508858817, + "rewards/rejected": -8.127257537841796, + "step": 1416 + }, + { + "epoch": 0.38837878580238455, + "grad_norm": 9.75, + "kl": 5.503966808319092, + "learning_rate": 5e-06, + "logits/chosen": -47028122.18181818, + "logits/rejected": -22637587.692307692, + "logps/chosen": -474.98393110795456, + "logps/rejected": -493.7075946514423, + "loss": 0.0895, + "rewards/chosen": 6.810101595791903, + "rewards/margins": 12.153839271385353, + "rewards/rejected": -5.34373767559345, + "step": 1417 + }, + { + "epoch": 0.38865287104289437, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22519178.666666668, + "logits/rejected": -14820861.333333334, + "logps/chosen": -531.2034098307291, + "logps/rejected": -636.7543131510416, + "loss": 0.0101, + "rewards/chosen": 6.227511723836263, + "rewards/margins": 16.882418314615887, + "rewards/rejected": -10.654906590779623, + "step": 1418 + }, + { + "epoch": 0.3889269562834041, + "grad_norm": 5.46875, + "kl": 1.9965922832489014, + "learning_rate": 5e-06, + "logits/chosen": -9062376.0, + "logits/rejected": -6835682.0, + "logps/chosen": -395.8440348307292, + "logps/rejected": -521.3203125, + "loss": 0.0191, + "rewards/chosen": 6.174262364705403, + "rewards/margins": 13.873225529988606, + "rewards/rejected": -7.698963165283203, + "step": 1419 + }, + { + "epoch": 0.38920104152391394, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22347016.533333335, + "logits/rejected": -31988721.777777776, + "logps/chosen": -360.98984375, + "logps/rejected": -406.25474717881946, + "loss": 0.044, + "rewards/chosen": 5.7815800984700525, + "rewards/margins": 15.438178507486981, + "rewards/rejected": -9.656598409016928, + "step": 1420 + }, + { + "epoch": 0.38947512676442375, + "grad_norm": 7.90625, + "kl": 4.616863250732422, + "learning_rate": 5e-06, + "logits/chosen": -30900057.14285714, + "logits/rejected": 20677763.2, + "logps/chosen": -436.92794363839283, + "logps/rejected": -547.849951171875, + "loss": 0.0322, + "rewards/chosen": 6.030820574079241, + "rewards/margins": 14.604752458844866, + "rewards/rejected": -8.573931884765624, + "step": 1421 + }, + { + "epoch": 0.3897492120049335, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16453453.866666667, + "logits/rejected": -26936094.222222224, + "logps/chosen": -479.99290364583334, + "logps/rejected": -573.5221354166666, + "loss": 0.0033, + "rewards/chosen": 7.086372884114583, + "rewards/margins": 19.061302693684894, + "rewards/rejected": -11.974929809570312, + "step": 1422 + }, + { + "epoch": 0.3900232972454433, + "grad_norm": 11.25, + "kl": 2.436372756958008, + "learning_rate": 5e-06, + "logits/chosen": -6977084.0, + "logits/rejected": -34996189.333333336, + "logps/chosen": -447.3065592447917, + "logps/rejected": -422.8838704427083, + "loss": 0.0303, + "rewards/chosen": 6.295797983805339, + "rewards/margins": 14.933187484741211, + "rewards/rejected": -8.637389500935873, + "step": 1423 + }, + { + "epoch": 0.39029738248595314, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32476570.181818184, + "logits/rejected": -3153703.3846153845, + "logps/chosen": -433.28231534090907, + "logps/rejected": -675.8703425480769, + "loss": 0.0293, + "rewards/chosen": 6.067646373401988, + "rewards/margins": 15.876363954343995, + "rewards/rejected": -9.808717580942007, + "step": 1424 + }, + { + "epoch": 0.39057146772646295, + "grad_norm": 17.75, + "kl": 1.9658699035644531, + "learning_rate": 5e-06, + "logits/chosen": -12233398.153846154, + "logits/rejected": 32754670.545454547, + "logps/chosen": -582.0058218149038, + "logps/rejected": -558.1749378551136, + "loss": 0.0454, + "rewards/chosen": 5.968829815204327, + "rewards/margins": 16.03259213320859, + "rewards/rejected": -10.063762318004262, + "step": 1425 + }, + { + "epoch": 0.3908455529669727, + "grad_norm": 8.0, + "kl": 9.317862510681152, + "learning_rate": 5e-06, + "logits/chosen": -19722677.333333332, + "logits/rejected": -32237528.888888888, + "logps/chosen": -525.9533203125, + "logps/rejected": -459.78868272569446, + "loss": 0.0503, + "rewards/chosen": 7.321242268880209, + "rewards/margins": 15.89762437608507, + "rewards/rejected": -8.57638210720486, + "step": 1426 + }, + { + "epoch": 0.3911196382074825, + "grad_norm": 8.6875, + "kl": 8.646275520324707, + "learning_rate": 5e-06, + "logits/chosen": -26821293.17647059, + "logits/rejected": -13473360.0, + "logps/chosen": -454.1703239889706, + "logps/rejected": -393.92947823660717, + "loss": 0.0316, + "rewards/chosen": 6.1007223690257355, + "rewards/margins": 13.657848550491973, + "rewards/rejected": -7.557126181466239, + "step": 1427 + }, + { + "epoch": 0.39139372344799234, + "grad_norm": 10.4375, + "kl": 2.6792781352996826, + "learning_rate": 5e-06, + "logits/chosen": -28234188.307692308, + "logits/rejected": -37368037.81818182, + "logps/chosen": -423.0764723557692, + "logps/rejected": -706.1218039772727, + "loss": 0.0417, + "rewards/chosen": 5.288563654972957, + "rewards/margins": 15.350997071166137, + "rewards/rejected": -10.062433416193182, + "step": 1428 + }, + { + "epoch": 0.39166780868850215, + "grad_norm": 7.625, + "kl": 2.2425589561462402, + "learning_rate": 5e-06, + "logits/chosen": -1235513.142857143, + "logits/rejected": -13479128.470588235, + "logps/chosen": -473.5034877232143, + "logps/rejected": -491.32077205882354, + "loss": 0.023, + "rewards/chosen": 7.6078289576939175, + "rewards/margins": 16.325391464874524, + "rewards/rejected": -8.717562507180606, + "step": 1429 + }, + { + "epoch": 0.3919418939290119, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29087056.0, + "logits/rejected": -6828335.0, + "logps/chosen": -385.7794189453125, + "logps/rejected": -546.1878051757812, + "loss": 0.0306, + "rewards/chosen": 5.247454643249512, + "rewards/margins": 14.41151237487793, + "rewards/rejected": -9.164057731628418, + "step": 1430 + }, + { + "epoch": 0.3922159791695217, + "grad_norm": 6.40625, + "kl": 0.22409455478191376, + "learning_rate": 5e-06, + "logits/chosen": -20071770.181818184, + "logits/rejected": 25014614.153846152, + "logps/chosen": -416.4949396306818, + "logps/rejected": -558.9255934495193, + "loss": 0.0337, + "rewards/chosen": 5.883283441716975, + "rewards/margins": 16.937212697275868, + "rewards/rejected": -11.053929255558895, + "step": 1431 + }, + { + "epoch": 0.39249006441003154, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6168787.076923077, + "logits/rejected": -24468468.363636363, + "logps/chosen": -432.57132662259613, + "logps/rejected": -499.22554154829544, + "loss": 0.0362, + "rewards/chosen": 6.501321645883413, + "rewards/margins": 16.599509179175318, + "rewards/rejected": -10.098187533291904, + "step": 1432 + }, + { + "epoch": 0.3927641496505413, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16884940.0, + "logits/rejected": -14892088.0, + "logps/chosen": -368.162353515625, + "logps/rejected": -387.3690999348958, + "loss": 0.0209, + "rewards/chosen": 6.306844711303711, + "rewards/margins": 13.33728535970052, + "rewards/rejected": -7.03044064839681, + "step": 1433 + }, + { + "epoch": 0.3930382348910511, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17802244.0, + "logits/rejected": -19308810.0, + "logps/chosen": -460.7379455566406, + "logps/rejected": -510.0834045410156, + "loss": 0.0137, + "rewards/chosen": 6.377381324768066, + "rewards/margins": 15.60499095916748, + "rewards/rejected": -9.227609634399414, + "step": 1434 + }, + { + "epoch": 0.3933123201315609, + "grad_norm": 55.5, + "kl": 1.450218915939331, + "learning_rate": 5e-06, + "logits/chosen": -23854823.111111112, + "logits/rejected": -6188038.666666667, + "logps/chosen": -410.9397786458333, + "logps/rejected": -627.9956868489584, + "loss": 0.057, + "rewards/chosen": 5.33441162109375, + "rewards/margins": 11.596467971801758, + "rewards/rejected": -6.262056350708008, + "step": 1435 + }, + { + "epoch": 0.39358640537207074, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18941240.888888888, + "logits/rejected": -6673364.8, + "logps/chosen": -433.24004448784723, + "logps/rejected": -559.98359375, + "loss": 0.0504, + "rewards/chosen": 6.035040537516276, + "rewards/margins": 16.0218630472819, + "rewards/rejected": -9.986822509765625, + "step": 1436 + }, + { + "epoch": 0.3938604906125805, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12548608.0, + "logits/rejected": -34081949.09090909, + "logps/chosen": -459.91248497596155, + "logps/rejected": -491.638671875, + "loss": 0.0137, + "rewards/chosen": 6.2793438251201925, + "rewards/margins": 15.476913238738799, + "rewards/rejected": -9.197569413618607, + "step": 1437 + }, + { + "epoch": 0.3941345758530903, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14534262.0, + "logits/rejected": -15378716.0, + "logps/chosen": -373.912109375, + "logps/rejected": -461.9368591308594, + "loss": 0.0106, + "rewards/chosen": 6.300682544708252, + "rewards/margins": 14.191205501556396, + "rewards/rejected": -7.8905229568481445, + "step": 1438 + }, + { + "epoch": 0.3944086610936001, + "grad_norm": 4.5, + "kl": 2.3463454246520996, + "learning_rate": 5e-06, + "logits/chosen": -15791783.111111112, + "logits/rejected": -23825480.533333335, + "logps/chosen": -546.93505859375, + "logps/rejected": -489.9173177083333, + "loss": 0.0109, + "rewards/chosen": 6.098035176595052, + "rewards/margins": 14.933752950032552, + "rewards/rejected": -8.8357177734375, + "step": 1439 + }, + { + "epoch": 0.39468274633410994, + "grad_norm": 2.59375, + "kl": 9.409567832946777, + "learning_rate": 5e-06, + "logits/chosen": -12557673.846153846, + "logits/rejected": 15575792.0, + "logps/chosen": -422.6071589543269, + "logps/rejected": -383.36452414772725, + "loss": 0.0409, + "rewards/chosen": 7.500178997333233, + "rewards/margins": 15.378815444199354, + "rewards/rejected": -7.878636446866122, + "step": 1440 + }, + { + "epoch": 0.3949568315746197, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7068266.0, + "logits/rejected": -28445317.333333332, + "logps/chosen": -408.2491861979167, + "logps/rejected": -403.8436686197917, + "loss": 0.0254, + "rewards/chosen": 6.308460871378581, + "rewards/margins": 13.265715281168621, + "rewards/rejected": -6.957254409790039, + "step": 1441 + }, + { + "epoch": 0.3952309168151295, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34314231.27272727, + "logits/rejected": -29510390.153846152, + "logps/chosen": -412.84956498579544, + "logps/rejected": -573.2711087740385, + "loss": 0.0336, + "rewards/chosen": 6.0770111083984375, + "rewards/margins": 15.802318866436298, + "rewards/rejected": -9.72530775803786, + "step": 1442 + }, + { + "epoch": 0.3955050020556393, + "grad_norm": 3.28125, + "kl": 5.1361541748046875, + "learning_rate": 5e-06, + "logits/chosen": -15218531.2, + "logits/rejected": -34328086.85714286, + "logps/chosen": -382.60498046875, + "logps/rejected": -538.9275948660714, + "loss": 0.0121, + "rewards/chosen": 7.175220489501953, + "rewards/margins": 17.16269302368164, + "rewards/rejected": -9.987472534179688, + "step": 1443 + }, + { + "epoch": 0.3957790872961491, + "grad_norm": 1.1640625, + "kl": 6.151179313659668, + "learning_rate": 5e-06, + "logits/chosen": -39765316.92307692, + "logits/rejected": -17213381.818181816, + "logps/chosen": -460.6415264423077, + "logps/rejected": -649.7746803977273, + "loss": 0.004, + "rewards/chosen": 7.516977750338041, + "rewards/margins": 17.8018752945053, + "rewards/rejected": -10.284897544167258, + "step": 1444 + }, + { + "epoch": 0.3960531725366589, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21204821.818181816, + "logits/rejected": 496978.76923076925, + "logps/chosen": -406.0511363636364, + "logps/rejected": -513.6083233173077, + "loss": 0.0292, + "rewards/chosen": 6.10753700949929, + "rewards/margins": 13.484957875071707, + "rewards/rejected": -7.377420865572416, + "step": 1445 + }, + { + "epoch": 0.3963272577771687, + "grad_norm": 9.625, + "kl": 4.7978034019470215, + "learning_rate": 5e-06, + "logits/chosen": -23328679.384615384, + "logits/rejected": -34616273.45454545, + "logps/chosen": -394.95519080528845, + "logps/rejected": -510.11039595170456, + "loss": 0.0709, + "rewards/chosen": 5.499660785381611, + "rewards/margins": 13.846459075287505, + "rewards/rejected": -8.346798289905895, + "step": 1446 + }, + { + "epoch": 0.3966013430176785, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1411829.4285714286, + "logits/rejected": -2949816.0, + "logps/chosen": -447.2909458705357, + "logps/rejected": -506.774169921875, + "loss": 0.1075, + "rewards/chosen": 4.000745500837054, + "rewards/margins": 15.395313371930804, + "rewards/rejected": -11.39456787109375, + "step": 1447 + }, + { + "epoch": 0.3968754282581883, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1754096.6666666667, + "logits/rejected": -19670812.444444444, + "logps/chosen": -556.821044921875, + "logps/rejected": -558.7339409722222, + "loss": 0.0084, + "rewards/chosen": 7.009188334147136, + "rewards/margins": 16.81033155653212, + "rewards/rejected": -9.801143222384983, + "step": 1448 + }, + { + "epoch": 0.3971495134986981, + "grad_norm": 5.09375, + "kl": 12.061675071716309, + "learning_rate": 5e-06, + "logits/chosen": -22063474.0, + "logits/rejected": -24597092.0, + "logps/chosen": -359.2086486816406, + "logps/rejected": -385.9973449707031, + "loss": 0.0639, + "rewards/chosen": 6.684747695922852, + "rewards/margins": 16.034339904785156, + "rewards/rejected": -9.349592208862305, + "step": 1449 + }, + { + "epoch": 0.3974235987392079, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7487710.545454546, + "logits/rejected": -25872679.384615384, + "logps/chosen": -421.15926846590907, + "logps/rejected": -502.6572265625, + "loss": 0.0208, + "rewards/chosen": 6.651611328125, + "rewards/margins": 16.94122783954327, + "rewards/rejected": -10.28961651141827, + "step": 1450 + }, + { + "epoch": 0.3976976839797177, + "grad_norm": 2.8125, + "kl": 0.4695243835449219, + "learning_rate": 5e-06, + "logits/chosen": -21610115.2, + "logits/rejected": -4513142.285714285, + "logps/chosen": -414.314208984375, + "logps/rejected": -399.8384486607143, + "loss": 0.0095, + "rewards/chosen": 6.281748962402344, + "rewards/margins": 13.931868198939732, + "rewards/rejected": -7.650119236537388, + "step": 1451 + }, + { + "epoch": 0.3979717692202275, + "grad_norm": 9.0, + "kl": 6.9360175132751465, + "learning_rate": 5e-06, + "logits/chosen": 8568599.384615384, + "logits/rejected": -23706859.636363637, + "logps/chosen": -424.1638746995192, + "logps/rejected": -531.8884055397727, + "loss": 0.0647, + "rewards/chosen": 5.8361640343299275, + "rewards/margins": 12.902629238742215, + "rewards/rejected": -7.066465204412287, + "step": 1452 + }, + { + "epoch": 0.3982458544607373, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25886621.09090909, + "logits/rejected": -9216925.538461538, + "logps/chosen": -396.11430220170456, + "logps/rejected": -620.1935096153846, + "loss": 0.0084, + "rewards/chosen": 6.38185605135831, + "rewards/margins": 18.712037466622732, + "rewards/rejected": -12.330181415264423, + "step": 1453 + }, + { + "epoch": 0.3985199397012471, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -646973.5384615385, + "logits/rejected": -11282706.909090908, + "logps/chosen": -413.7139423076923, + "logps/rejected": -451.2281605113636, + "loss": 0.0087, + "rewards/chosen": 5.46990966796875, + "rewards/margins": 13.383010864257812, + "rewards/rejected": -7.9131011962890625, + "step": 1454 + }, + { + "epoch": 0.39879402494175686, + "grad_norm": 7.96875, + "kl": 8.33979606628418, + "learning_rate": 5e-06, + "logits/chosen": -22244800.0, + "logits/rejected": -24258067.2, + "logps/chosen": -515.4043666294643, + "logps/rejected": -580.246875, + "loss": 0.0503, + "rewards/chosen": 8.21474838256836, + "rewards/margins": 19.101691436767577, + "rewards/rejected": -10.886943054199218, + "step": 1455 + }, + { + "epoch": 0.3990681101822667, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49840153.6, + "logits/rejected": 5585209.142857143, + "logps/chosen": -458.6103515625, + "logps/rejected": -358.1227329799107, + "loss": 0.0336, + "rewards/chosen": 6.82957763671875, + "rewards/margins": 13.573753683907645, + "rewards/rejected": -6.744176047188895, + "step": 1456 + }, + { + "epoch": 0.3993421954227765, + "grad_norm": 8.6875, + "kl": 0.7789306640625, + "learning_rate": 5e-06, + "logits/chosen": -17194382.666666668, + "logits/rejected": -22968320.0, + "logps/chosen": -497.4375813802083, + "logps/rejected": -624.2766520182291, + "loss": 0.0178, + "rewards/chosen": 6.036853790283203, + "rewards/margins": 17.87268956502279, + "rewards/rejected": -11.835835774739584, + "step": 1457 + }, + { + "epoch": 0.3996162806632863, + "grad_norm": 13.0, + "kl": 5.994259834289551, + "learning_rate": 5e-06, + "logits/chosen": 3536754.909090909, + "logits/rejected": -18464868.923076924, + "logps/chosen": -408.98876953125, + "logps/rejected": -404.9997746394231, + "loss": 0.0374, + "rewards/chosen": 5.698380556973544, + "rewards/margins": 16.849491866318495, + "rewards/rejected": -11.151111309344952, + "step": 1458 + }, + { + "epoch": 0.39989036590379606, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32262365.09090909, + "logits/rejected": -10437982.76923077, + "logps/chosen": -392.85542436079544, + "logps/rejected": -472.2626201923077, + "loss": 0.0723, + "rewards/chosen": 4.708522103049538, + "rewards/margins": 13.660202213100622, + "rewards/rejected": -8.951680110051083, + "step": 1459 + }, + { + "epoch": 0.4001644511443059, + "grad_norm": 9.4375, + "kl": 0.9056529998779297, + "learning_rate": 5e-06, + "logits/chosen": -17837296.0, + "logits/rejected": -22912341.333333332, + "logps/chosen": -299.2662353515625, + "logps/rejected": -448.9315592447917, + "loss": 0.0694, + "rewards/chosen": 4.599858283996582, + "rewards/margins": 14.52905241648356, + "rewards/rejected": -9.929194132486979, + "step": 1460 + }, + { + "epoch": 0.4004385363848157, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2834306.0, + "logits/rejected": -33643650.13333333, + "logps/chosen": -443.73499891493054, + "logps/rejected": -584.9385416666667, + "loss": 0.017, + "rewards/chosen": 4.902130550808376, + "rewards/margins": 17.25531556871202, + "rewards/rejected": -12.353185017903646, + "step": 1461 + }, + { + "epoch": 0.40071262162532545, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12723380.57142857, + "logits/rejected": 3826140.0, + "logps/chosen": -311.38441685267856, + "logps/rejected": -631.50673828125, + "loss": 0.0819, + "rewards/chosen": 4.947787693568638, + "rewards/margins": 11.385347965785435, + "rewards/rejected": -6.437560272216797, + "step": 1462 + }, + { + "epoch": 0.40098670686583526, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9677306.666666666, + "logits/rejected": -13461651.2, + "logps/chosen": -425.11078559027777, + "logps/rejected": -459.22486979166666, + "loss": 0.0402, + "rewards/chosen": 5.371241675482856, + "rewards/margins": 14.55075725979275, + "rewards/rejected": -9.179515584309895, + "step": 1463 + }, + { + "epoch": 0.4012607921063451, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29695066.666666668, + "logits/rejected": -6879804.8, + "logps/chosen": -440.9660915798611, + "logps/rejected": -550.978515625, + "loss": 0.0201, + "rewards/chosen": 7.373270670572917, + "rewards/margins": 17.880476888020834, + "rewards/rejected": -10.507206217447917, + "step": 1464 + }, + { + "epoch": 0.4015348773468549, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21727227.076923076, + "logits/rejected": -24089861.818181816, + "logps/chosen": -515.1975661057693, + "logps/rejected": -403.83598188920456, + "loss": 0.0531, + "rewards/chosen": 6.092559227576623, + "rewards/margins": 14.058143722427475, + "rewards/rejected": -7.9655844948508525, + "step": 1465 + }, + { + "epoch": 0.40180896258736465, + "grad_norm": 3.703125, + "kl": 1.713127851486206, + "learning_rate": 5e-06, + "logits/chosen": -14022167.384615384, + "logits/rejected": -16301797.818181818, + "logps/chosen": -368.4157151442308, + "logps/rejected": -565.6483931107955, + "loss": 0.0227, + "rewards/chosen": 5.291444631723257, + "rewards/margins": 14.111966593282206, + "rewards/rejected": -8.82052196155895, + "step": 1466 + }, + { + "epoch": 0.40208304782787446, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10145942.76923077, + "logits/rejected": -14381547.636363637, + "logps/chosen": -375.4387770432692, + "logps/rejected": -491.78959517045456, + "loss": 0.0704, + "rewards/chosen": 4.754027733435998, + "rewards/margins": 13.056617149939903, + "rewards/rejected": -8.302589416503906, + "step": 1467 + }, + { + "epoch": 0.4023571330683843, + "grad_norm": 7.03125, + "kl": 8.001840591430664, + "learning_rate": 5e-06, + "logits/chosen": -8703178.666666666, + "logits/rejected": -1943296.0, + "logps/chosen": -452.34534505208336, + "logps/rejected": -670.1793619791666, + "loss": 0.0296, + "rewards/chosen": 7.0906824747721355, + "rewards/margins": 17.51414269341363, + "rewards/rejected": -10.423460218641493, + "step": 1468 + }, + { + "epoch": 0.4026312183088941, + "grad_norm": 4.28125, + "kl": 3.2544476985931396, + "learning_rate": 5e-06, + "logits/chosen": -2026193.3333333333, + "logits/rejected": -20799697.333333332, + "logps/chosen": -441.83203125, + "logps/rejected": -389.2533365885417, + "loss": 0.0279, + "rewards/chosen": 7.457075754801433, + "rewards/margins": 15.907121022542317, + "rewards/rejected": -8.450045267740885, + "step": 1469 + }, + { + "epoch": 0.40290530354940385, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24432529.066666666, + "logits/rejected": -31654762.666666668, + "logps/chosen": -483.0033854166667, + "logps/rejected": -527.1343315972222, + "loss": 0.0273, + "rewards/chosen": 6.5596923828125, + "rewards/margins": 15.178263346354166, + "rewards/rejected": -8.618570963541666, + "step": 1470 + }, + { + "epoch": 0.40317938878991366, + "grad_norm": 5.59375, + "kl": 2.2591662406921387, + "learning_rate": 5e-06, + "logits/chosen": -12075089.066666666, + "logits/rejected": -15396216.888888888, + "logps/chosen": -407.08141276041664, + "logps/rejected": -423.15863715277777, + "loss": 0.0252, + "rewards/chosen": 7.035185241699219, + "rewards/margins": 16.038982984754774, + "rewards/rejected": -9.003797743055555, + "step": 1471 + }, + { + "epoch": 0.4034534740304235, + "grad_norm": 3.015625, + "kl": 3.760878324508667, + "learning_rate": 5e-06, + "logits/chosen": -19786685.53846154, + "logits/rejected": -7276115.636363637, + "logps/chosen": -428.9979717548077, + "logps/rejected": -501.9641779119318, + "loss": 0.0095, + "rewards/chosen": 5.708309467022236, + "rewards/margins": 15.015982834609238, + "rewards/rejected": -9.307673367587002, + "step": 1472 + }, + { + "epoch": 0.40372755927093323, + "grad_norm": 6.84375, + "kl": 1.3622945547103882, + "learning_rate": 5e-06, + "logits/chosen": -37337422.76923077, + "logits/rejected": -8294416.0, + "logps/chosen": -471.9011793870192, + "logps/rejected": -687.3347389914773, + "loss": 0.0163, + "rewards/chosen": 6.989632239708533, + "rewards/margins": 18.310638321029558, + "rewards/rejected": -11.321006081321023, + "step": 1473 + }, + { + "epoch": 0.40400164451144305, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11554678.666666666, + "logits/rejected": -37342432.0, + "logps/chosen": -484.5870361328125, + "logps/rejected": -583.251953125, + "loss": 0.0156, + "rewards/chosen": 5.775282541910808, + "rewards/margins": 16.623162587483723, + "rewards/rejected": -10.847880045572916, + "step": 1474 + }, + { + "epoch": 0.40427572975195286, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26104448.0, + "logits/rejected": 14657998.666666666, + "logps/chosen": -421.9873046875, + "logps/rejected": -673.696533203125, + "loss": 0.0214, + "rewards/chosen": 7.211241404215495, + "rewards/margins": 20.237263997395832, + "rewards/rejected": -13.026022593180338, + "step": 1475 + }, + { + "epoch": 0.4045498149924627, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24809742.545454547, + "logits/rejected": -19864475.076923076, + "logps/chosen": -431.6033824573864, + "logps/rejected": -449.24744591346155, + "loss": 0.0198, + "rewards/chosen": 6.759264859286222, + "rewards/margins": 15.053899164800043, + "rewards/rejected": -8.294634305513823, + "step": 1476 + }, + { + "epoch": 0.40482390023297243, + "grad_norm": 7.78125, + "kl": 1.7629013061523438, + "learning_rate": 5e-06, + "logits/chosen": 15727074.461538462, + "logits/rejected": -18217482.181818184, + "logps/chosen": -315.05618990384613, + "logps/rejected": -458.24360795454544, + "loss": 0.0329, + "rewards/chosen": 4.903113145094651, + "rewards/margins": 13.611186154238826, + "rewards/rejected": -8.708073009144176, + "step": 1477 + }, + { + "epoch": 0.40509798547348225, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13248793.6, + "logits/rejected": -16671290.666666666, + "logps/chosen": -411.79072265625, + "logps/rejected": -399.70684136284723, + "loss": 0.0356, + "rewards/chosen": 5.835392761230469, + "rewards/margins": 12.593606228298611, + "rewards/rejected": -6.758213467068142, + "step": 1478 + }, + { + "epoch": 0.40537207071399206, + "grad_norm": 7.53125, + "kl": 3.5350100994110107, + "learning_rate": 5e-06, + "logits/chosen": 1542597.6363636365, + "logits/rejected": -1824450.4615384615, + "logps/chosen": -368.49101118607956, + "logps/rejected": -541.4662710336538, + "loss": 0.0354, + "rewards/chosen": 6.519221912730824, + "rewards/margins": 14.773482636138276, + "rewards/rejected": -8.254260723407452, + "step": 1479 + }, + { + "epoch": 0.4056461559545019, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4878536.363636363, + "logits/rejected": -20892512.0, + "logps/chosen": -420.54398970170456, + "logps/rejected": -611.9679612379807, + "loss": 0.0129, + "rewards/chosen": 6.449182683771307, + "rewards/margins": 17.816755174756885, + "rewards/rejected": -11.367572490985577, + "step": 1480 + }, + { + "epoch": 0.40592024119501163, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -966379.5, + "logits/rejected": -6144220.0, + "logps/chosen": -266.6871337890625, + "logps/rejected": -547.8576049804688, + "loss": 0.0296, + "rewards/chosen": 4.181737899780273, + "rewards/margins": 14.400148391723633, + "rewards/rejected": -10.21841049194336, + "step": 1481 + }, + { + "epoch": 0.40619432643552145, + "grad_norm": 9.375, + "kl": 3.554900884628296, + "learning_rate": 5e-06, + "logits/chosen": -21535918.0, + "logits/rejected": -28316708.0, + "logps/chosen": -478.109130859375, + "logps/rejected": -551.4567260742188, + "loss": 0.0211, + "rewards/chosen": 6.583198547363281, + "rewards/margins": 15.032108306884766, + "rewards/rejected": -8.448909759521484, + "step": 1482 + }, + { + "epoch": 0.40646841167603126, + "grad_norm": 6.53125, + "kl": 4.1687445640563965, + "learning_rate": 5e-06, + "logits/chosen": -29411104.0, + "logits/rejected": 13486664.0, + "logps/chosen": -496.2179361979167, + "logps/rejected": -661.8578694661459, + "loss": 0.023, + "rewards/chosen": 6.555273691813151, + "rewards/margins": 19.48746617635091, + "rewards/rejected": -12.93219248453776, + "step": 1483 + }, + { + "epoch": 0.406742496916541, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9548754.857142856, + "logits/rejected": -12293316.705882354, + "logps/chosen": -468.99142020089283, + "logps/rejected": -707.2580422794117, + "loss": 0.0161, + "rewards/chosen": 6.234718322753906, + "rewards/margins": 16.49249132941751, + "rewards/rejected": -10.257773006663603, + "step": 1484 + }, + { + "epoch": 0.40701658215705083, + "grad_norm": 8.125, + "kl": 1.4382604360580444, + "learning_rate": 5e-06, + "logits/chosen": -26548958.11764706, + "logits/rejected": -11544485.714285715, + "logps/chosen": -450.20651424632354, + "logps/rejected": -513.9938616071429, + "loss": 0.0389, + "rewards/chosen": 6.181376737706802, + "rewards/margins": 13.13003636207901, + "rewards/rejected": -6.94865962437221, + "step": 1485 + }, + { + "epoch": 0.40729066739756065, + "grad_norm": 6.125, + "kl": 0.13907623291015625, + "learning_rate": 5e-06, + "logits/chosen": -18245578.285714287, + "logits/rejected": -30096371.2, + "logps/chosen": -426.79209681919644, + "logps/rejected": -309.5555419921875, + "loss": 0.03, + "rewards/chosen": 5.954307556152344, + "rewards/margins": 13.038497161865234, + "rewards/rejected": -7.084189605712891, + "step": 1486 + }, + { + "epoch": 0.40756475263807046, + "grad_norm": 7.15625, + "kl": 6.838308334350586, + "learning_rate": 5e-06, + "logits/chosen": -22732484.57142857, + "logits/rejected": -35827315.2, + "logps/chosen": -399.14390345982144, + "logps/rejected": -506.925, + "loss": 0.0238, + "rewards/chosen": 7.1687180655343195, + "rewards/margins": 15.067055620465961, + "rewards/rejected": -7.898337554931641, + "step": 1487 + }, + { + "epoch": 0.4078388378785802, + "grad_norm": 3.625, + "kl": 3.84302020072937, + "learning_rate": 5e-06, + "logits/chosen": -27730457.6, + "logits/rejected": -28825831.111111112, + "logps/chosen": -371.37513020833336, + "logps/rejected": -512.0144314236111, + "loss": 0.0221, + "rewards/chosen": 6.535305786132812, + "rewards/margins": 14.960977681477864, + "rewards/rejected": -8.425671895345053, + "step": 1488 + }, + { + "epoch": 0.40811292311909003, + "grad_norm": 6.28125, + "kl": 19.619991302490234, + "learning_rate": 5e-06, + "logits/chosen": -22912812.0, + "logits/rejected": -47752260.0, + "logps/chosen": -590.8394165039062, + "logps/rejected": -510.35308837890625, + "loss": 0.0327, + "rewards/chosen": 8.189374923706055, + "rewards/margins": 16.432146072387695, + "rewards/rejected": -8.24277114868164, + "step": 1489 + }, + { + "epoch": 0.40838700835959985, + "grad_norm": 8.5625, + "kl": 1.9583232402801514, + "learning_rate": 5e-06, + "logits/chosen": -32802346.666666668, + "logits/rejected": -25454204.444444444, + "logps/chosen": -406.66412760416665, + "logps/rejected": -522.8013237847222, + "loss": 0.0369, + "rewards/chosen": 5.968517049153646, + "rewards/margins": 15.763225301106772, + "rewards/rejected": -9.794708251953125, + "step": 1490 + }, + { + "epoch": 0.40866109360010966, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29992251.076923076, + "logits/rejected": -11183658.181818182, + "logps/chosen": -490.65478515625, + "logps/rejected": -564.2613636363636, + "loss": 0.0504, + "rewards/chosen": 6.267730126014123, + "rewards/margins": 14.67549143971263, + "rewards/rejected": -8.407761313698508, + "step": 1491 + }, + { + "epoch": 0.4089351788406194, + "grad_norm": 1.5390625, + "kl": 0.7082545161247253, + "learning_rate": 5e-06, + "logits/chosen": -20883075.555555556, + "logits/rejected": -10106859.733333332, + "logps/chosen": -437.23963758680554, + "logps/rejected": -509.076953125, + "loss": 0.0049, + "rewards/chosen": 5.979043748643663, + "rewards/margins": 14.628331671820746, + "rewards/rejected": -8.649287923177083, + "step": 1492 + }, + { + "epoch": 0.40920926408112923, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39260433.777777776, + "logits/rejected": -12391978.666666666, + "logps/chosen": -475.2283528645833, + "logps/rejected": -485.5102864583333, + "loss": 0.0186, + "rewards/chosen": 8.768072340223524, + "rewards/margins": 17.70952741834852, + "rewards/rejected": -8.941455078125, + "step": 1493 + }, + { + "epoch": 0.40948334932163905, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7358335.5, + "logits/rejected": -37390452.0, + "logps/chosen": -345.9652404785156, + "logps/rejected": -407.53759765625, + "loss": 0.0051, + "rewards/chosen": 7.745540142059326, + "rewards/margins": 16.702767848968506, + "rewards/rejected": -8.95722770690918, + "step": 1494 + }, + { + "epoch": 0.4097574345621488, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17997334.153846152, + "logits/rejected": 22877950.545454547, + "logps/chosen": -411.39107572115387, + "logps/rejected": -775.2912819602273, + "loss": 0.0589, + "rewards/chosen": 5.380047137920673, + "rewards/margins": 18.568201878687717, + "rewards/rejected": -13.188154740767045, + "step": 1495 + }, + { + "epoch": 0.4100315198026586, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10091312.0, + "logits/rejected": -30931919.05882353, + "logps/chosen": -419.42923409598217, + "logps/rejected": -580.7510340073529, + "loss": 0.0069, + "rewards/chosen": 5.6043597630092075, + "rewards/margins": 14.940406767260125, + "rewards/rejected": -9.336047004250918, + "step": 1496 + }, + { + "epoch": 0.41030560504316843, + "grad_norm": 10.8125, + "kl": 24.467987060546875, + "learning_rate": 5e-06, + "logits/chosen": -27822548.210526317, + "logits/rejected": -57046400.0, + "logps/chosen": -404.20703125, + "logps/rejected": -479.74892578125, + "loss": 0.1638, + "rewards/chosen": 6.576140554327714, + "rewards/margins": 15.41725307263826, + "rewards/rejected": -8.841112518310547, + "step": 1497 + }, + { + "epoch": 0.41057969028367824, + "grad_norm": 7.0625, + "kl": 4.12349271774292, + "learning_rate": 5e-06, + "logits/chosen": -19401602.46153846, + "logits/rejected": 4737989.818181818, + "logps/chosen": -362.48959585336536, + "logps/rejected": -616.3498757102273, + "loss": 0.0383, + "rewards/chosen": 6.122836773212139, + "rewards/margins": 15.679570338109157, + "rewards/rejected": -9.556733564897018, + "step": 1498 + }, + { + "epoch": 0.410853775524188, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19378572.307692308, + "logits/rejected": -1236792.7272727273, + "logps/chosen": -412.8322566105769, + "logps/rejected": -567.1394708806819, + "loss": 0.0329, + "rewards/chosen": 5.464794452373798, + "rewards/margins": 15.144147646177066, + "rewards/rejected": -9.679353193803268, + "step": 1499 + }, + { + "epoch": 0.4111278607646978, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 43300372.0, + "logits/rejected": -3350165.0, + "logps/chosen": -314.4950866699219, + "logps/rejected": -479.03204345703125, + "loss": 0.0425, + "rewards/chosen": 5.32094669342041, + "rewards/margins": 14.228443145751953, + "rewards/rejected": -8.907496452331543, + "step": 1500 + }, + { + "epoch": 0.41140194600520763, + "grad_norm": 4.90625, + "kl": 4.028920650482178, + "learning_rate": 5e-06, + "logits/chosen": -19877596.0, + "logits/rejected": -22079952.0, + "logps/chosen": -386.90496826171875, + "logps/rejected": -554.68994140625, + "loss": 0.0415, + "rewards/chosen": 6.13759183883667, + "rewards/margins": 19.0024094581604, + "rewards/rejected": -12.86481761932373, + "step": 1501 + }, + { + "epoch": 0.41167603124571744, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14974318.76923077, + "logits/rejected": -1621306.5454545454, + "logps/chosen": -497.54770132211536, + "logps/rejected": -655.5644975142045, + "loss": 0.0254, + "rewards/chosen": 6.851729759803185, + "rewards/margins": 16.111745074078755, + "rewards/rejected": -9.260015314275568, + "step": 1502 + }, + { + "epoch": 0.4119501164862272, + "grad_norm": 5.71875, + "kl": 4.905412197113037, + "learning_rate": 5e-06, + "logits/chosen": 2976966.4, + "logits/rejected": -22766444.444444444, + "logps/chosen": -342.87298177083335, + "logps/rejected": -566.9479166666666, + "loss": 0.0485, + "rewards/chosen": 5.4793650309244795, + "rewards/margins": 14.509288363986546, + "rewards/rejected": -9.029923333062065, + "step": 1503 + }, + { + "epoch": 0.412224201726737, + "grad_norm": 6.5, + "kl": 0.9857572317123413, + "learning_rate": 5e-06, + "logits/chosen": -11935246.76923077, + "logits/rejected": -9187056.727272727, + "logps/chosen": -405.27249849759613, + "logps/rejected": -377.3447265625, + "loss": 0.0606, + "rewards/chosen": 5.332678574782151, + "rewards/margins": 10.871634610049373, + "rewards/rejected": -5.538956035267223, + "step": 1504 + }, + { + "epoch": 0.41249828696724683, + "grad_norm": 7.09375, + "kl": 3.498997926712036, + "learning_rate": 5e-06, + "logits/chosen": -17043323.733333334, + "logits/rejected": 19309212.444444444, + "logps/chosen": -385.4990234375, + "logps/rejected": -610.4763454861111, + "loss": 0.042, + "rewards/chosen": 6.086223347981771, + "rewards/margins": 17.150648498535155, + "rewards/rejected": -11.064425150553385, + "step": 1505 + }, + { + "epoch": 0.4127723722077566, + "grad_norm": 9.3125, + "kl": 3.7721385955810547, + "learning_rate": 5e-06, + "logits/chosen": -16225155.0, + "logits/rejected": 2870178.75, + "logps/chosen": -301.61871337890625, + "logps/rejected": -507.9135437011719, + "loss": 0.0898, + "rewards/chosen": 5.108923435211182, + "rewards/margins": 12.910522937774658, + "rewards/rejected": -7.801599502563477, + "step": 1506 + }, + { + "epoch": 0.4130464574482664, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11265241.846153846, + "logits/rejected": -40194164.36363637, + "logps/chosen": -399.3532527043269, + "logps/rejected": -685.3951526988636, + "loss": 0.0233, + "rewards/chosen": 6.226615905761719, + "rewards/margins": 17.16845633766868, + "rewards/rejected": -10.94184043190696, + "step": 1507 + }, + { + "epoch": 0.4133205426887762, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9563902.153846154, + "logits/rejected": -2692076.0, + "logps/chosen": -483.16312349759613, + "logps/rejected": -568.1131036931819, + "loss": 0.0195, + "rewards/chosen": 5.880338228665865, + "rewards/margins": 15.491435391085965, + "rewards/rejected": -9.6110971624201, + "step": 1508 + }, + { + "epoch": 0.41359462792928603, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19665776.0, + "logits/rejected": 42146592.0, + "logps/chosen": -390.9180908203125, + "logps/rejected": -755.7329915364584, + "loss": 0.0123, + "rewards/chosen": 5.982678731282552, + "rewards/margins": 22.25913365681966, + "rewards/rejected": -16.27645492553711, + "step": 1509 + }, + { + "epoch": 0.4138687131697958, + "grad_norm": 7.125, + "kl": 0.5511068105697632, + "learning_rate": 5e-06, + "logits/chosen": 559068.3636363636, + "logits/rejected": -25917304.615384616, + "logps/chosen": -451.97749467329544, + "logps/rejected": -488.25304236778845, + "loss": 0.0384, + "rewards/chosen": 6.067348133433949, + "rewards/margins": 14.870933479362435, + "rewards/rejected": -8.803585345928486, + "step": 1510 + }, + { + "epoch": 0.4141427984103056, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4478400.307692308, + "logits/rejected": 29143296.0, + "logps/chosen": -477.3567082331731, + "logps/rejected": -563.5612571022727, + "loss": 0.0824, + "rewards/chosen": 6.380731435922476, + "rewards/margins": 18.13871039543952, + "rewards/rejected": -11.757978959517045, + "step": 1511 + }, + { + "epoch": 0.4144168836508154, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11917360.0, + "logits/rejected": -11555668.0, + "logps/chosen": -467.65011160714283, + "logps/rejected": -548.305078125, + "loss": 0.0077, + "rewards/chosen": 7.38763918195452, + "rewards/margins": 16.255243246895926, + "rewards/rejected": -8.867604064941407, + "step": 1512 + }, + { + "epoch": 0.41469096889132523, + "grad_norm": 8.75, + "kl": 2.128582000732422, + "learning_rate": 5e-06, + "logits/chosen": -7216981.333333333, + "logits/rejected": -17470984.0, + "logps/chosen": -393.8875325520833, + "logps/rejected": -451.8509928385417, + "loss": 0.0275, + "rewards/chosen": 6.308730443318685, + "rewards/margins": 13.404128392537434, + "rewards/rejected": -7.09539794921875, + "step": 1513 + }, + { + "epoch": 0.414965054131835, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10821109.714285715, + "logits/rejected": -20953388.8, + "logps/chosen": -364.69918387276783, + "logps/rejected": -487.754638671875, + "loss": 0.0173, + "rewards/chosen": 6.099061148507254, + "rewards/margins": 14.400100272042412, + "rewards/rejected": -8.301039123535157, + "step": 1514 + }, + { + "epoch": 0.4152391393723448, + "grad_norm": 14.1875, + "kl": 5.584901809692383, + "learning_rate": 5e-06, + "logits/chosen": -38928392.0, + "logits/rejected": -23322200.0, + "logps/chosen": -643.406982421875, + "logps/rejected": -411.2641296386719, + "loss": 0.0431, + "rewards/chosen": 7.77677583694458, + "rewards/margins": 17.172772884368896, + "rewards/rejected": -9.395997047424316, + "step": 1515 + }, + { + "epoch": 0.4155132246128546, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22475081.14285714, + "logits/rejected": -13483364.705882354, + "logps/chosen": -435.79603794642856, + "logps/rejected": -439.60061465992646, + "loss": 0.0099, + "rewards/chosen": 6.608120509556362, + "rewards/margins": 15.054312633867024, + "rewards/rejected": -8.446192124310661, + "step": 1516 + }, + { + "epoch": 0.4157873098533644, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6479731.692307692, + "logits/rejected": -22550773.818181816, + "logps/chosen": -343.9499699519231, + "logps/rejected": -628.3488991477273, + "loss": 0.0511, + "rewards/chosen": 5.263839134803185, + "rewards/margins": 17.39558906821938, + "rewards/rejected": -12.131749933416193, + "step": 1517 + }, + { + "epoch": 0.4160613950938742, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15653361.333333334, + "logits/rejected": -2495365.0, + "logps/chosen": -383.0970458984375, + "logps/rejected": -692.6321614583334, + "loss": 0.016, + "rewards/chosen": 6.279998143513997, + "rewards/margins": 16.005977630615234, + "rewards/rejected": -9.725979487101236, + "step": 1518 + }, + { + "epoch": 0.416335480334384, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5363676.7272727275, + "logits/rejected": 9905513.846153846, + "logps/chosen": -451.9601384943182, + "logps/rejected": -409.0895432692308, + "loss": 0.0447, + "rewards/chosen": 6.084763960404829, + "rewards/margins": 12.914734100128387, + "rewards/rejected": -6.8299701397235575, + "step": 1519 + }, + { + "epoch": 0.4166095655748938, + "grad_norm": 7.375, + "kl": 1.7349803447723389, + "learning_rate": 5e-06, + "logits/chosen": -2002430.0, + "logits/rejected": -25306748.8, + "logps/chosen": -393.39488002232144, + "logps/rejected": -520.854150390625, + "loss": 0.0236, + "rewards/chosen": 5.899818965366909, + "rewards/margins": 15.790530177525113, + "rewards/rejected": -9.890711212158203, + "step": 1520 + }, + { + "epoch": 0.4168836508154036, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7214011.636363637, + "logits/rejected": -17185025.230769232, + "logps/chosen": -367.2659357244318, + "logps/rejected": -476.76412259615387, + "loss": 0.0304, + "rewards/chosen": 6.266568270596591, + "rewards/margins": 14.203864037573755, + "rewards/rejected": -7.937295766977163, + "step": 1521 + }, + { + "epoch": 0.4171577360559134, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18128888.615384616, + "logits/rejected": -27855115.636363637, + "logps/chosen": -407.41515174278845, + "logps/rejected": -576.9181019176136, + "loss": 0.0819, + "rewards/chosen": 4.783447852501502, + "rewards/margins": 13.971576237178347, + "rewards/rejected": -9.188128384676846, + "step": 1522 + }, + { + "epoch": 0.4174318212964232, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14389132.444444444, + "logits/rejected": 208934.66666666666, + "logps/chosen": -358.13970269097223, + "logps/rejected": -605.3841145833334, + "loss": 0.0504, + "rewards/chosen": 5.708697848849827, + "rewards/margins": 15.97867668999566, + "rewards/rejected": -10.269978841145834, + "step": 1523 + }, + { + "epoch": 0.417705906536933, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8771157.538461538, + "logits/rejected": -23733809.454545453, + "logps/chosen": -439.9362980769231, + "logps/rejected": -596.68798828125, + "loss": 0.041, + "rewards/chosen": 7.441099900465745, + "rewards/margins": 17.36575200007512, + "rewards/rejected": -9.924652099609375, + "step": 1524 + }, + { + "epoch": 0.41797999177744277, + "grad_norm": 10.0625, + "kl": 5.265926361083984, + "learning_rate": 5e-06, + "logits/chosen": -29577064.727272727, + "logits/rejected": -15021830.153846154, + "logps/chosen": -470.3328746448864, + "logps/rejected": -410.3132136418269, + "loss": 0.0346, + "rewards/chosen": 8.790005770596592, + "rewards/margins": 16.50916364976576, + "rewards/rejected": -7.71915787916917, + "step": 1525 + }, + { + "epoch": 0.4182540770179526, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 41148643.55555555, + "logits/rejected": 4779276.8, + "logps/chosen": -552.0703667534722, + "logps/rejected": -629.9243489583333, + "loss": 0.0095, + "rewards/chosen": 6.657067616780599, + "rewards/margins": 19.003048451741535, + "rewards/rejected": -12.345980834960937, + "step": 1526 + }, + { + "epoch": 0.4185281622584624, + "grad_norm": 7.875, + "kl": 4.155570983886719, + "learning_rate": 5e-06, + "logits/chosen": -31699589.818181816, + "logits/rejected": -10352164.923076924, + "logps/chosen": -484.9675958806818, + "logps/rejected": -558.7210036057693, + "loss": 0.0301, + "rewards/chosen": 6.803590947931463, + "rewards/margins": 16.236626578377678, + "rewards/rejected": -9.433035630446215, + "step": 1527 + }, + { + "epoch": 0.41880224749897216, + "grad_norm": 13.6875, + "kl": 10.14826774597168, + "learning_rate": 5e-06, + "logits/chosen": -2949269.846153846, + "logits/rejected": 3963788.3636363638, + "logps/chosen": -355.0744816706731, + "logps/rejected": -486.1534534801136, + "loss": 0.1219, + "rewards/chosen": 5.580959613506611, + "rewards/margins": 14.338928862885162, + "rewards/rejected": -8.75796924937855, + "step": 1528 + }, + { + "epoch": 0.41907633273948197, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3103815.3846153845, + "logits/rejected": -23953794.90909091, + "logps/chosen": -396.3659855769231, + "logps/rejected": -652.7680220170455, + "loss": 0.0188, + "rewards/chosen": 7.042715219350962, + "rewards/margins": 16.974785864769995, + "rewards/rejected": -9.932070645419033, + "step": 1529 + }, + { + "epoch": 0.4193504179799918, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34180800.0, + "logits/rejected": -32232066.666666668, + "logps/chosen": -329.18017578125, + "logps/rejected": -532.3974609375, + "loss": 0.007, + "rewards/chosen": 6.306168874104817, + "rewards/margins": 15.654330571492512, + "rewards/rejected": -9.348161697387695, + "step": 1530 + }, + { + "epoch": 0.4196245032205016, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -129105.77777777778, + "logits/rejected": 8079260.8, + "logps/chosen": -586.6809353298611, + "logps/rejected": -533.5797526041666, + "loss": 0.0544, + "rewards/chosen": 6.875941806369358, + "rewards/margins": 17.07572207980686, + "rewards/rejected": -10.1997802734375, + "step": 1531 + }, + { + "epoch": 0.41989858846101136, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26282437.818181816, + "logits/rejected": 72607350.15384616, + "logps/chosen": -416.50577059659093, + "logps/rejected": -671.8360877403846, + "loss": 0.0071, + "rewards/chosen": 6.39859355579723, + "rewards/margins": 19.85296038647632, + "rewards/rejected": -13.454366830679087, + "step": 1532 + }, + { + "epoch": 0.42017267370152117, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 36929521.23076923, + "logits/rejected": -11025502.545454545, + "logps/chosen": -495.0349308894231, + "logps/rejected": -596.2455610795455, + "loss": 0.063, + "rewards/chosen": 5.008951040414663, + "rewards/margins": 15.239949686543925, + "rewards/rejected": -10.230998646129262, + "step": 1533 + }, + { + "epoch": 0.420446758942031, + "grad_norm": 7.1875, + "kl": 6.9141340255737305, + "learning_rate": 5e-06, + "logits/chosen": 24656373.333333332, + "logits/rejected": -9297787.733333332, + "logps/chosen": -581.9512261284722, + "logps/rejected": -613.9294921875, + "loss": 0.0428, + "rewards/chosen": 7.181610955132379, + "rewards/margins": 14.871751742892794, + "rewards/rejected": -7.690140787760416, + "step": 1534 + }, + { + "epoch": 0.42072084418254074, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8804599.384615384, + "logits/rejected": -27936471.272727273, + "logps/chosen": -485.7540940504808, + "logps/rejected": -486.20485617897725, + "loss": 0.0283, + "rewards/chosen": 5.834888164813702, + "rewards/margins": 14.079300673691543, + "rewards/rejected": -8.244412508877842, + "step": 1535 + }, + { + "epoch": 0.42099492942305056, + "grad_norm": 11.0, + "kl": 5.880061149597168, + "learning_rate": 5e-06, + "logits/chosen": 25260430.769230768, + "logits/rejected": -27633367.272727273, + "logps/chosen": -461.5110051081731, + "logps/rejected": -425.02974076704544, + "loss": 0.0836, + "rewards/chosen": 5.5699638953575725, + "rewards/margins": 12.50929356288243, + "rewards/rejected": -6.939329667524858, + "step": 1536 + }, + { + "epoch": 0.42126901466356037, + "grad_norm": 7.0625, + "kl": 6.828696250915527, + "learning_rate": 5e-06, + "logits/chosen": -36197970.666666664, + "logits/rejected": -22511424.0, + "logps/chosen": -477.1971028645833, + "logps/rejected": -509.8732503255208, + "loss": 0.0365, + "rewards/chosen": 6.798379262288411, + "rewards/margins": 16.467273076375324, + "rewards/rejected": -9.668893814086914, + "step": 1537 + }, + { + "epoch": 0.4215430999040702, + "grad_norm": 8.0625, + "kl": 7.717883110046387, + "learning_rate": 5e-06, + "logits/chosen": -42604224.0, + "logits/rejected": -33150080.0, + "logps/chosen": -479.38162667410717, + "logps/rejected": -471.97060546875, + "loss": 0.0628, + "rewards/chosen": 7.218311309814453, + "rewards/margins": 16.33539810180664, + "rewards/rejected": -9.117086791992188, + "step": 1538 + }, + { + "epoch": 0.42181718514457994, + "grad_norm": 6.8125, + "kl": 6.664118766784668, + "learning_rate": 5e-06, + "logits/chosen": -13320206.545454545, + "logits/rejected": -16031707.076923076, + "logps/chosen": -345.1785333806818, + "logps/rejected": -425.72547325721155, + "loss": 0.0339, + "rewards/chosen": 5.876386469060725, + "rewards/margins": 11.174193428946541, + "rewards/rejected": -5.2978069598858175, + "step": 1539 + }, + { + "epoch": 0.42209127038508976, + "grad_norm": 10.8125, + "kl": 4.454648017883301, + "learning_rate": 5e-06, + "logits/chosen": -17350840.470588237, + "logits/rejected": -10353230.857142856, + "logps/chosen": -408.49046415441177, + "logps/rejected": -409.13145228794644, + "loss": 0.0404, + "rewards/chosen": 6.552255069508272, + "rewards/margins": 15.165470924698004, + "rewards/rejected": -8.613215855189733, + "step": 1540 + }, + { + "epoch": 0.42236535562559957, + "grad_norm": 4.84375, + "kl": 0.8794390559196472, + "learning_rate": 5e-06, + "logits/chosen": -3727641.8181818184, + "logits/rejected": -6911583.384615385, + "logps/chosen": -357.27903053977275, + "logps/rejected": -440.46837439903845, + "loss": 0.0185, + "rewards/chosen": 6.032072587446733, + "rewards/margins": 13.407716977846373, + "rewards/rejected": -7.375644390399639, + "step": 1541 + }, + { + "epoch": 0.4226394408661094, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17418865.333333332, + "logits/rejected": -32748922.666666668, + "logps/chosen": -397.7556966145833, + "logps/rejected": -578.1730143229166, + "loss": 0.0582, + "rewards/chosen": 4.7894948323567705, + "rewards/margins": 16.398882548014324, + "rewards/rejected": -11.609387715657553, + "step": 1542 + }, + { + "epoch": 0.42291352610661914, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36096128.0, + "logits/rejected": -13789058.0, + "logps/chosen": -309.9322509765625, + "logps/rejected": -575.154052734375, + "loss": 0.0497, + "rewards/chosen": 4.656304836273193, + "rewards/margins": 13.237362384796143, + "rewards/rejected": -8.58105754852295, + "step": 1543 + }, + { + "epoch": 0.42318761134712896, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29605749.333333332, + "logits/rejected": -21375784.0, + "logps/chosen": -495.8406982421875, + "logps/rejected": -526.9629720052084, + "loss": 0.0129, + "rewards/chosen": 6.906485239664714, + "rewards/margins": 17.06740125020345, + "rewards/rejected": -10.160916010538736, + "step": 1544 + }, + { + "epoch": 0.42346169658763877, + "grad_norm": 9.4375, + "kl": 1.1882305145263672, + "learning_rate": 5e-06, + "logits/chosen": -15522327.272727273, + "logits/rejected": -37734500.92307692, + "logps/chosen": -433.25887784090907, + "logps/rejected": -568.2223557692307, + "loss": 0.0316, + "rewards/chosen": 6.3574350530451, + "rewards/margins": 18.016937522621422, + "rewards/rejected": -11.659502469576323, + "step": 1545 + }, + { + "epoch": 0.42373578182814853, + "grad_norm": 3.515625, + "kl": 7.600671291351318, + "learning_rate": 5e-06, + "logits/chosen": -23304267.42857143, + "logits/rejected": -23849673.6, + "logps/chosen": -515.9459751674107, + "logps/rejected": -450.842041015625, + "loss": 0.0107, + "rewards/chosen": 8.595849173409599, + "rewards/margins": 16.356603567940848, + "rewards/rejected": -7.76075439453125, + "step": 1546 + }, + { + "epoch": 0.42400986706865834, + "grad_norm": 0.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20179577.6, + "logits/rejected": -12882137.142857144, + "logps/chosen": -462.02509765625, + "logps/rejected": -656.7594866071429, + "loss": 0.0013, + "rewards/chosen": 7.603567504882813, + "rewards/margins": 20.255046953473773, + "rewards/rejected": -12.65147944859096, + "step": 1547 + }, + { + "epoch": 0.42428395230916816, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7147912.8, + "logits/rejected": -13027499.42857143, + "logps/chosen": -530.90205078125, + "logps/rejected": -538.2541852678571, + "loss": 0.0385, + "rewards/chosen": 6.426943969726563, + "rewards/margins": 17.764019339425225, + "rewards/rejected": -11.337075369698661, + "step": 1548 + }, + { + "epoch": 0.42455803754967797, + "grad_norm": 13.0, + "kl": 1.2523658275604248, + "learning_rate": 5e-06, + "logits/chosen": -902864.4615384615, + "logits/rejected": -26773082.181818184, + "logps/chosen": -422.50939002403845, + "logps/rejected": -414.38449928977275, + "loss": 0.0566, + "rewards/chosen": 5.604836097130408, + "rewards/margins": 16.27329590270569, + "rewards/rejected": -10.668459805575283, + "step": 1549 + }, + { + "epoch": 0.4248321227901877, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14996006.857142856, + "logits/rejected": -22476702.4, + "logps/chosen": -434.0245884486607, + "logps/rejected": -437.3607421875, + "loss": 0.0347, + "rewards/chosen": 7.004281180245536, + "rewards/margins": 15.497458975655693, + "rewards/rejected": -8.493177795410157, + "step": 1550 + }, + { + "epoch": 0.42510620803069754, + "grad_norm": 10.3125, + "kl": 8.679424285888672, + "learning_rate": 5e-06, + "logits/chosen": -13035430.857142856, + "logits/rejected": -26929472.0, + "logps/chosen": -404.72178431919644, + "logps/rejected": -436.378125, + "loss": 0.0432, + "rewards/chosen": 6.378510611397879, + "rewards/margins": 14.26559328351702, + "rewards/rejected": -7.88708267211914, + "step": 1551 + }, + { + "epoch": 0.42538029327120735, + "grad_norm": 10.8125, + "kl": 1.219626784324646, + "learning_rate": 5e-06, + "logits/chosen": -22833469.333333332, + "logits/rejected": -25671.333333333332, + "logps/chosen": -440.1573486328125, + "logps/rejected": -453.225830078125, + "loss": 0.0502, + "rewards/chosen": 5.702290852864583, + "rewards/margins": 14.237310409545898, + "rewards/rejected": -8.535019556681315, + "step": 1552 + }, + { + "epoch": 0.42565437851171717, + "grad_norm": 5.96875, + "kl": 0.5507545471191406, + "learning_rate": 5e-06, + "logits/chosen": -20875306.0, + "logits/rejected": -14836620.0, + "logps/chosen": -415.559814453125, + "logps/rejected": -571.2532958984375, + "loss": 0.0169, + "rewards/chosen": 6.394080638885498, + "rewards/margins": 18.272799015045166, + "rewards/rejected": -11.878718376159668, + "step": 1553 + }, + { + "epoch": 0.4259284637522269, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25842684.444444444, + "logits/rejected": -24498816.0, + "logps/chosen": -430.46630859375, + "logps/rejected": -391.51116536458335, + "loss": 0.0205, + "rewards/chosen": 5.49709235297309, + "rewards/margins": 13.227211168077257, + "rewards/rejected": -7.730118815104166, + "step": 1554 + }, + { + "epoch": 0.42620254899273674, + "grad_norm": 9.8125, + "kl": 1.4395898580551147, + "learning_rate": 5e-06, + "logits/chosen": -8721655.384615384, + "logits/rejected": 10477371.636363637, + "logps/chosen": -482.04447115384613, + "logps/rejected": -559.9357688210227, + "loss": 0.0275, + "rewards/chosen": 5.972091087928185, + "rewards/margins": 16.511533670492106, + "rewards/rejected": -10.53944258256392, + "step": 1555 + }, + { + "epoch": 0.42647663423324655, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21717734.85714286, + "logits/rejected": -33201635.2, + "logps/chosen": -462.30747767857144, + "logps/rejected": -448.46484375, + "loss": 0.0265, + "rewards/chosen": 5.7779966081891745, + "rewards/margins": 15.035906764439176, + "rewards/rejected": -9.25791015625, + "step": 1556 + }, + { + "epoch": 0.4267507194737563, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3719223.272727273, + "logits/rejected": -33130646.153846152, + "logps/chosen": -535.8848544034091, + "logps/rejected": -486.3874699519231, + "loss": 0.0477, + "rewards/chosen": 8.232005726207387, + "rewards/margins": 16.519433775148194, + "rewards/rejected": -8.287428048940805, + "step": 1557 + }, + { + "epoch": 0.4270248047142661, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20912887.111111112, + "logits/rejected": -25442094.933333334, + "logps/chosen": -289.104736328125, + "logps/rejected": -583.4145182291667, + "loss": 0.0447, + "rewards/chosen": 4.333204905192058, + "rewards/margins": 15.025232696533202, + "rewards/rejected": -10.692027791341145, + "step": 1558 + }, + { + "epoch": 0.42729888995477594, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29526067.2, + "logits/rejected": -23657299.555555556, + "logps/chosen": -496.2544270833333, + "logps/rejected": -914.4874131944445, + "loss": 0.0127, + "rewards/chosen": 6.553559366861979, + "rewards/margins": 21.393811713324652, + "rewards/rejected": -14.840252346462673, + "step": 1559 + }, + { + "epoch": 0.42757297519528575, + "grad_norm": 6.375, + "kl": 1.7694952487945557, + "learning_rate": 5e-06, + "logits/chosen": -11552968.888888888, + "logits/rejected": -24912558.933333334, + "logps/chosen": -368.04155815972223, + "logps/rejected": -435.56064453125, + "loss": 0.0442, + "rewards/chosen": 6.150076548258464, + "rewards/margins": 14.718037160237632, + "rewards/rejected": -8.567960611979167, + "step": 1560 + }, + { + "epoch": 0.4278470604357955, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20692624.0, + "logits/rejected": -26528910.4, + "logps/chosen": -437.61328125, + "logps/rejected": -654.0646484375, + "loss": 0.0274, + "rewards/chosen": 6.245622907366071, + "rewards/margins": 16.753797040666853, + "rewards/rejected": -10.50817413330078, + "step": 1561 + }, + { + "epoch": 0.4281211456763053, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30765444.923076924, + "logits/rejected": -1327318.6363636365, + "logps/chosen": -423.52110877403845, + "logps/rejected": -454.47274502840907, + "loss": 0.0447, + "rewards/chosen": 5.297383235051082, + "rewards/margins": 13.21655881654966, + "rewards/rejected": -7.919175581498579, + "step": 1562 + }, + { + "epoch": 0.42839523091681514, + "grad_norm": 15.25, + "kl": 2.7215564250946045, + "learning_rate": 5e-06, + "logits/chosen": -20699884.8, + "logits/rejected": -8199564.444444444, + "logps/chosen": -438.71875, + "logps/rejected": -667.3834635416666, + "loss": 0.047, + "rewards/chosen": 5.93561757405599, + "rewards/margins": 16.331015184190537, + "rewards/rejected": -10.395397610134548, + "step": 1563 + }, + { + "epoch": 0.42866931615732495, + "grad_norm": 7.5625, + "kl": 1.8192590475082397, + "learning_rate": 5e-06, + "logits/chosen": -20246765.333333332, + "logits/rejected": -11181466.0, + "logps/chosen": -385.1337076822917, + "logps/rejected": -568.6243489583334, + "loss": 0.0686, + "rewards/chosen": 5.4161726633707685, + "rewards/margins": 12.986188888549805, + "rewards/rejected": -7.570016225179036, + "step": 1564 + }, + { + "epoch": 0.4289434013978347, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6769580.444444444, + "logits/rejected": -11017386.666666666, + "logps/chosen": -455.2392578125, + "logps/rejected": -462.1536458333333, + "loss": 0.0444, + "rewards/chosen": 5.913819207085504, + "rewards/margins": 13.481346978081596, + "rewards/rejected": -7.567527770996094, + "step": 1565 + }, + { + "epoch": 0.4292174866383445, + "grad_norm": 9.5625, + "kl": 4.43835973739624, + "learning_rate": 5e-06, + "logits/chosen": -14110095.0, + "logits/rejected": -8997012.0, + "logps/chosen": -532.1517333984375, + "logps/rejected": -464.2311706542969, + "loss": 0.0586, + "rewards/chosen": 6.498110294342041, + "rewards/margins": 14.127247333526611, + "rewards/rejected": -7.62913703918457, + "step": 1566 + }, + { + "epoch": 0.42949157187885434, + "grad_norm": 7.1875, + "kl": 1.230910062789917, + "learning_rate": 5e-06, + "logits/chosen": -8651596.57142857, + "logits/rejected": -16544830.4, + "logps/chosen": -416.76597377232144, + "logps/rejected": -434.637451171875, + "loss": 0.0281, + "rewards/chosen": 5.276154654366629, + "rewards/margins": 14.276842825753349, + "rewards/rejected": -9.000688171386718, + "step": 1567 + }, + { + "epoch": 0.4297656571193641, + "grad_norm": 3.140625, + "kl": 3.337705135345459, + "learning_rate": 5e-06, + "logits/chosen": -28913317.333333332, + "logits/rejected": -8583816.0, + "logps/chosen": -375.8260498046875, + "logps/rejected": -551.7837727864584, + "loss": 0.0113, + "rewards/chosen": 6.486595153808594, + "rewards/margins": 15.02840487162272, + "rewards/rejected": -8.541809717814127, + "step": 1568 + }, + { + "epoch": 0.4300397423598739, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30468480.0, + "logits/rejected": -14224932.923076924, + "logps/chosen": -417.07594992897725, + "logps/rejected": -340.54356971153845, + "loss": 0.0738, + "rewards/chosen": 5.6888427734375, + "rewards/margins": 12.522584181565506, + "rewards/rejected": -6.833741408128005, + "step": 1569 + }, + { + "epoch": 0.4303138276003837, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10058890.909090908, + "logits/rejected": -15214660.923076924, + "logps/chosen": -427.70023970170456, + "logps/rejected": -611.5141225961538, + "loss": 0.0081, + "rewards/chosen": 7.318155462091619, + "rewards/margins": 16.024436097045047, + "rewards/rejected": -8.706280634953426, + "step": 1570 + }, + { + "epoch": 0.43058791284089354, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10097241.846153846, + "logits/rejected": 11669538.909090908, + "logps/chosen": -385.2795222355769, + "logps/rejected": -588.7693093039773, + "loss": 0.0319, + "rewards/chosen": 4.551521888146033, + "rewards/margins": 18.70226901394504, + "rewards/rejected": -14.150747125799006, + "step": 1571 + }, + { + "epoch": 0.4308619980814033, + "grad_norm": 12.4375, + "kl": 1.3918012380599976, + "learning_rate": 5e-06, + "logits/chosen": -4073968.5714285714, + "logits/rejected": -9014650.4, + "logps/chosen": -353.56187220982144, + "logps/rejected": -602.799609375, + "loss": 0.134, + "rewards/chosen": 3.9151733943394254, + "rewards/margins": 11.6842590876988, + "rewards/rejected": -7.769085693359375, + "step": 1572 + }, + { + "epoch": 0.4311360833219131, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25475835.42857143, + "logits/rejected": -16630923.294117646, + "logps/chosen": -323.25571986607144, + "logps/rejected": -591.9396829044117, + "loss": 0.0184, + "rewards/chosen": 6.9311948503766745, + "rewards/margins": 16.565570927467668, + "rewards/rejected": -9.634376077090993, + "step": 1573 + }, + { + "epoch": 0.4314101685624229, + "grad_norm": 7.46875, + "kl": 0.5056228637695312, + "learning_rate": 5e-06, + "logits/chosen": -21206793.846153848, + "logits/rejected": -1977051.4545454546, + "logps/chosen": -363.48497596153845, + "logps/rejected": -559.69140625, + "loss": 0.0259, + "rewards/chosen": 7.0742011437049275, + "rewards/margins": 14.355299942976945, + "rewards/rejected": -7.281098799272017, + "step": 1574 + }, + { + "epoch": 0.43168425380293274, + "grad_norm": 17.875, + "kl": 5.256450176239014, + "learning_rate": 5e-06, + "logits/chosen": -17159120.94117647, + "logits/rejected": -16076352.0, + "logps/chosen": -394.5378848805147, + "logps/rejected": -739.6941964285714, + "loss": 0.14, + "rewards/chosen": 4.6304621976964615, + "rewards/margins": 14.0065946178276, + "rewards/rejected": -9.376132420131139, + "step": 1575 + }, + { + "epoch": 0.4319583390434425, + "grad_norm": 9.625, + "kl": 4.344319820404053, + "learning_rate": 5e-06, + "logits/chosen": -26530278.4, + "logits/rejected": 28303500.444444444, + "logps/chosen": -482.57018229166664, + "logps/rejected": -750.7344835069445, + "loss": 0.0698, + "rewards/chosen": 5.96619618733724, + "rewards/margins": 19.952626037597657, + "rewards/rejected": -13.986429850260416, + "step": 1576 + }, + { + "epoch": 0.4322324242839523, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -424526.22222222225, + "logits/rejected": -12842584.533333333, + "logps/chosen": -313.5080837673611, + "logps/rejected": -427.5298177083333, + "loss": 0.0371, + "rewards/chosen": 5.234290652804905, + "rewards/margins": 13.049124230278863, + "rewards/rejected": -7.814833577473959, + "step": 1577 + }, + { + "epoch": 0.4325065095244621, + "grad_norm": 7.59375, + "kl": 2.7760372161865234, + "learning_rate": 5e-06, + "logits/chosen": -13555025.142857144, + "logits/rejected": -18968347.2, + "logps/chosen": -427.65945870535717, + "logps/rejected": -552.38154296875, + "loss": 0.0255, + "rewards/chosen": 6.15326908656529, + "rewards/margins": 17.83082798549107, + "rewards/rejected": -11.677558898925781, + "step": 1578 + }, + { + "epoch": 0.4327805947649719, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36134136.615384616, + "logits/rejected": 2205537.090909091, + "logps/chosen": -360.97329477163464, + "logps/rejected": -518.2091619318181, + "loss": 0.0343, + "rewards/chosen": 5.180013803335337, + "rewards/margins": 15.521375802847055, + "rewards/rejected": -10.341361999511719, + "step": 1579 + }, + { + "epoch": 0.4330546800054817, + "grad_norm": 6.96875, + "kl": 4.137892723083496, + "learning_rate": 5e-06, + "logits/chosen": -9889382.4, + "logits/rejected": -10085556.57142857, + "logps/chosen": -515.211279296875, + "logps/rejected": -406.03738839285717, + "loss": 0.0329, + "rewards/chosen": 5.934810638427734, + "rewards/margins": 12.077788761683873, + "rewards/rejected": -6.142978123256138, + "step": 1580 + }, + { + "epoch": 0.4333287652459915, + "grad_norm": 8.5, + "kl": 6.2965192794799805, + "learning_rate": 5e-06, + "logits/chosen": -9774653.714285715, + "logits/rejected": 7323669.6, + "logps/chosen": -446.53724888392856, + "logps/rejected": -575.860302734375, + "loss": 0.0332, + "rewards/chosen": 5.845665522984096, + "rewards/margins": 16.591905757359097, + "rewards/rejected": -10.746240234375, + "step": 1581 + }, + { + "epoch": 0.4336028504865013, + "grad_norm": 1.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1105105.0, + "logits/rejected": -32466576.0, + "logps/chosen": -472.293701171875, + "logps/rejected": -580.6322631835938, + "loss": 0.0058, + "rewards/chosen": 7.21293830871582, + "rewards/margins": 16.623648643493652, + "rewards/rejected": -9.410710334777832, + "step": 1582 + }, + { + "epoch": 0.4338769357270111, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23046554.666666668, + "logits/rejected": -11823264.0, + "logps/chosen": -419.2908528645833, + "logps/rejected": -382.1836751302083, + "loss": 0.0224, + "rewards/chosen": 4.619925498962402, + "rewards/margins": 13.129349708557129, + "rewards/rejected": -8.509424209594727, + "step": 1583 + }, + { + "epoch": 0.4341510209675209, + "grad_norm": 15.1875, + "kl": 9.030853271484375, + "learning_rate": 5e-06, + "logits/chosen": -14699395.2, + "logits/rejected": -22527586.285714287, + "logps/chosen": -411.340234375, + "logps/rejected": -500.1363002232143, + "loss": 0.0801, + "rewards/chosen": 6.032492065429688, + "rewards/margins": 13.947268458775113, + "rewards/rejected": -7.9147763933454245, + "step": 1584 + }, + { + "epoch": 0.4344251062080307, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10538158.222222222, + "logits/rejected": -5921980.8, + "logps/chosen": -475.8049045138889, + "logps/rejected": -568.1458333333334, + "loss": 0.0096, + "rewards/chosen": 6.532720777723524, + "rewards/margins": 15.846026441786025, + "rewards/rejected": -9.3133056640625, + "step": 1585 + }, + { + "epoch": 0.4346991914485405, + "grad_norm": 7.8125, + "kl": 2.0229835510253906, + "learning_rate": 5e-06, + "logits/chosen": -7294612.0, + "logits/rejected": -8344828.0, + "logps/chosen": -451.2098911830357, + "logps/rejected": -624.97900390625, + "loss": 0.033, + "rewards/chosen": 5.6775327410016745, + "rewards/margins": 15.443360682896206, + "rewards/rejected": -9.765827941894532, + "step": 1586 + }, + { + "epoch": 0.4349732766890503, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13978626.285714285, + "logits/rejected": -13349896.0, + "logps/chosen": -395.0674525669643, + "logps/rejected": -491.29462890625, + "loss": 0.0452, + "rewards/chosen": 5.499007088797433, + "rewards/margins": 16.71218763078962, + "rewards/rejected": -11.213180541992188, + "step": 1587 + }, + { + "epoch": 0.4352473619295601, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2072552.6153846155, + "logits/rejected": -20443649.454545453, + "logps/chosen": -337.6418269230769, + "logps/rejected": -472.9308416193182, + "loss": 0.0396, + "rewards/chosen": 4.185869363638071, + "rewards/margins": 12.784905840466907, + "rewards/rejected": -8.599036476828836, + "step": 1588 + }, + { + "epoch": 0.4355214471700699, + "grad_norm": 4.71875, + "kl": 1.0556329488754272, + "learning_rate": 5e-06, + "logits/chosen": 6471694.285714285, + "logits/rejected": 29640513.88235294, + "logps/chosen": -350.63204520089283, + "logps/rejected": -572.3281824448529, + "loss": 0.0566, + "rewards/chosen": 8.143637520926339, + "rewards/margins": 18.516521934701615, + "rewards/rejected": -10.372884413775276, + "step": 1589 + }, + { + "epoch": 0.43579553241057967, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27497158.4, + "logits/rejected": -16338928.0, + "logps/chosen": -546.7263671875, + "logps/rejected": -474.30465262276783, + "loss": 0.0206, + "rewards/chosen": 7.434725952148438, + "rewards/margins": 16.715448434012277, + "rewards/rejected": -9.280722481863839, + "step": 1590 + }, + { + "epoch": 0.4360696176510895, + "grad_norm": 7.375, + "kl": 0.036284130066633224, + "learning_rate": 5e-06, + "logits/chosen": 9180269.333333334, + "logits/rejected": -28020896.0, + "logps/chosen": -377.82568359375, + "logps/rejected": -372.40930989583336, + "loss": 0.0489, + "rewards/chosen": 5.5637622409396705, + "rewards/margins": 12.080713229709202, + "rewards/rejected": -6.516950988769532, + "step": 1591 + }, + { + "epoch": 0.4363437028915993, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12558744.8, + "logits/rejected": -1304006.857142857, + "logps/chosen": -364.173486328125, + "logps/rejected": -509.8643275669643, + "loss": 0.0622, + "rewards/chosen": 5.0812427520751955, + "rewards/margins": 16.352591977800643, + "rewards/rejected": -11.271349225725446, + "step": 1592 + }, + { + "epoch": 0.4366177881321091, + "grad_norm": 8.5, + "kl": 3.9487390518188477, + "learning_rate": 5e-06, + "logits/chosen": -18162534.85714286, + "logits/rejected": -29886540.8, + "logps/chosen": -414.6497279575893, + "logps/rejected": -505.68251953125, + "loss": 0.0273, + "rewards/chosen": 6.518254961286273, + "rewards/margins": 15.583546556745256, + "rewards/rejected": -9.065291595458984, + "step": 1593 + }, + { + "epoch": 0.43689187337261887, + "grad_norm": 3.953125, + "kl": 1.8361270427703857, + "learning_rate": 5e-06, + "logits/chosen": -20345783.111111112, + "logits/rejected": -26725632.0, + "logps/chosen": -444.0055338541667, + "logps/rejected": -666.4371744791666, + "loss": 0.008, + "rewards/chosen": 7.1822967529296875, + "rewards/margins": 17.07911071777344, + "rewards/rejected": -9.89681396484375, + "step": 1594 + }, + { + "epoch": 0.4371659586131287, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17922396.444444444, + "logits/rejected": -18206350.933333334, + "logps/chosen": -417.0534396701389, + "logps/rejected": -468.61350911458334, + "loss": 0.0221, + "rewards/chosen": 6.047190348307292, + "rewards/margins": 14.934466552734374, + "rewards/rejected": -8.887276204427083, + "step": 1595 + }, + { + "epoch": 0.4374400438536385, + "grad_norm": 7.09375, + "kl": 6.582607269287109, + "learning_rate": 5e-06, + "logits/chosen": -27982576.0, + "logits/rejected": 14815844.0, + "logps/chosen": -401.638427734375, + "logps/rejected": -727.9290364583334, + "loss": 0.0169, + "rewards/chosen": 6.889856338500977, + "rewards/margins": 18.57656796773275, + "rewards/rejected": -11.686711629231771, + "step": 1596 + }, + { + "epoch": 0.4377141290941483, + "grad_norm": 2.90625, + "kl": 4.336249351501465, + "learning_rate": 5e-06, + "logits/chosen": -19859305.846153848, + "logits/rejected": -29676445.09090909, + "logps/chosen": -428.9484675480769, + "logps/rejected": -592.6337002840909, + "loss": 0.0073, + "rewards/chosen": 6.669932438777043, + "rewards/margins": 15.723300747104457, + "rewards/rejected": -9.053368308327414, + "step": 1597 + }, + { + "epoch": 0.43798821433465807, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8397837.333333334, + "logits/rejected": -17573800.0, + "logps/chosen": -336.4859619140625, + "logps/rejected": -511.5308430989583, + "loss": 0.0272, + "rewards/chosen": 5.384997685750325, + "rewards/margins": 16.19845136006673, + "rewards/rejected": -10.813453674316406, + "step": 1598 + }, + { + "epoch": 0.4382622995751679, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24738984.727272727, + "logits/rejected": -19300173.53846154, + "logps/chosen": -369.61689897017044, + "logps/rejected": -574.8435246394231, + "loss": 0.0457, + "rewards/chosen": 6.369357022372159, + "rewards/margins": 16.788028850422037, + "rewards/rejected": -10.41867182804988, + "step": 1599 + }, + { + "epoch": 0.4385363848156777, + "grad_norm": 12.625, + "kl": 5.976953029632568, + "learning_rate": 5e-06, + "logits/chosen": -12034653.538461538, + "logits/rejected": -41405765.81818182, + "logps/chosen": -380.30333533653845, + "logps/rejected": -613.7848011363636, + "loss": 0.0403, + "rewards/chosen": 6.570397597092849, + "rewards/margins": 16.242716489138303, + "rewards/rejected": -9.672318892045455, + "step": 1600 + }, + { + "epoch": 0.43881047005618745, + "grad_norm": 11.4375, + "kl": 5.164576530456543, + "learning_rate": 5e-06, + "logits/chosen": -51269477.333333336, + "logits/rejected": -36268021.333333336, + "logps/chosen": -350.6538899739583, + "logps/rejected": -439.0476888020833, + "loss": 0.1183, + "rewards/chosen": 5.8839467366536455, + "rewards/margins": 13.428728739420572, + "rewards/rejected": -7.544782002766927, + "step": 1601 + }, + { + "epoch": 0.43908455529669727, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28004548.923076924, + "logits/rejected": -10725927.272727273, + "logps/chosen": -351.61485877403845, + "logps/rejected": -333.95439009232956, + "loss": 0.0365, + "rewards/chosen": 4.602257361778846, + "rewards/margins": 11.469885285917695, + "rewards/rejected": -6.86762792413885, + "step": 1602 + }, + { + "epoch": 0.4393586405372071, + "grad_norm": 7.875, + "kl": 1.4268290996551514, + "learning_rate": 5e-06, + "logits/chosen": -16060489.6, + "logits/rejected": -21778964.57142857, + "logps/chosen": -431.728076171875, + "logps/rejected": -517.9756905691964, + "loss": 0.0493, + "rewards/chosen": 5.308440399169922, + "rewards/margins": 14.477192796979633, + "rewards/rejected": -9.16875239780971, + "step": 1603 + }, + { + "epoch": 0.4396327257777169, + "grad_norm": 13.25, + "kl": 0.9417349696159363, + "learning_rate": 5e-06, + "logits/chosen": -8169249.230769231, + "logits/rejected": -23015761.454545453, + "logps/chosen": -301.14013671875, + "logps/rejected": -594.4637784090909, + "loss": 0.0514, + "rewards/chosen": 5.947084280160757, + "rewards/margins": 15.255504634830501, + "rewards/rejected": -9.308420354669744, + "step": 1604 + }, + { + "epoch": 0.43990681101822665, + "grad_norm": 4.59375, + "kl": 0.8764635920524597, + "learning_rate": 5e-06, + "logits/chosen": -36448085.333333336, + "logits/rejected": -12963608.888888888, + "logps/chosen": -437.87877604166664, + "logps/rejected": -490.3688151041667, + "loss": 0.0144, + "rewards/chosen": 6.5087336222330725, + "rewards/margins": 17.89163089328342, + "rewards/rejected": -11.382897271050346, + "step": 1605 + }, + { + "epoch": 0.44018089625873646, + "grad_norm": 12.3125, + "kl": 9.698051452636719, + "learning_rate": 5e-06, + "logits/chosen": -7615410.133333334, + "logits/rejected": -17646840.888888888, + "logps/chosen": -540.3231119791667, + "logps/rejected": -424.1411404079861, + "loss": 0.0621, + "rewards/chosen": 8.096905517578126, + "rewards/margins": 17.086988152398003, + "rewards/rejected": -8.990082634819878, + "step": 1606 + }, + { + "epoch": 0.4404549814992463, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50491337.84615385, + "logits/rejected": 4594315.636363637, + "logps/chosen": -330.6887770432692, + "logps/rejected": -425.76558061079544, + "loss": 0.0698, + "rewards/chosen": 5.220531757061298, + "rewards/margins": 13.502035607824793, + "rewards/rejected": -8.281503850763494, + "step": 1607 + }, + { + "epoch": 0.44072906673975604, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15257862.857142856, + "logits/rejected": 39383228.23529412, + "logps/chosen": -346.62088448660717, + "logps/rejected": -546.9765050551471, + "loss": 0.035, + "rewards/chosen": 7.4061110360281805, + "rewards/margins": 19.35273130400842, + "rewards/rejected": -11.946620267980238, + "step": 1608 + }, + { + "epoch": 0.44100315198026585, + "grad_norm": 2.625, + "kl": 4.7509260177612305, + "learning_rate": 5e-06, + "logits/chosen": -35992005.81818182, + "logits/rejected": -17769737.846153848, + "logps/chosen": -413.6678355823864, + "logps/rejected": -416.7546198918269, + "loss": 0.0073, + "rewards/chosen": 6.766878995028409, + "rewards/margins": 16.34737732360413, + "rewards/rejected": -9.580498328575722, + "step": 1609 + }, + { + "epoch": 0.44127723722077566, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24966777.14285714, + "logits/rejected": -8148878.4, + "logps/chosen": -475.26021902901783, + "logps/rejected": -639.0001953125, + "loss": 0.0107, + "rewards/chosen": 5.919411250523159, + "rewards/margins": 20.986913081577846, + "rewards/rejected": -15.067501831054688, + "step": 1610 + }, + { + "epoch": 0.4415513224612855, + "grad_norm": 7.5625, + "kl": 3.6646170616149902, + "learning_rate": 5e-06, + "logits/chosen": -8263650.461538462, + "logits/rejected": -14555076.363636363, + "logps/chosen": -545.4847130408654, + "logps/rejected": -457.32151100852275, + "loss": 0.0527, + "rewards/chosen": 5.945017887995793, + "rewards/margins": 14.76159892048869, + "rewards/rejected": -8.816581032492898, + "step": 1611 + }, + { + "epoch": 0.44182540770179524, + "grad_norm": 9.5625, + "kl": 3.506129741668701, + "learning_rate": 5e-06, + "logits/chosen": -21597806.545454547, + "logits/rejected": -22176270.769230768, + "logps/chosen": -437.64657315340907, + "logps/rejected": -523.5126577524038, + "loss": 0.0434, + "rewards/chosen": 6.445435957475142, + "rewards/margins": 16.033998556070394, + "rewards/rejected": -9.588562598595253, + "step": 1612 + }, + { + "epoch": 0.44209949294230505, + "grad_norm": 0.6953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27436484.923076924, + "logits/rejected": 8069934.545454546, + "logps/chosen": -499.4791917067308, + "logps/rejected": -538.3486772017045, + "loss": 0.0023, + "rewards/chosen": 7.95752187875601, + "rewards/margins": 19.195862776749617, + "rewards/rejected": -11.238340897993607, + "step": 1613 + }, + { + "epoch": 0.44237357818281486, + "grad_norm": 13.375, + "kl": 0.4750315546989441, + "learning_rate": 5e-06, + "logits/chosen": -14526560.0, + "logits/rejected": -18913761.6, + "logps/chosen": -353.45703125, + "logps/rejected": -471.638623046875, + "loss": 0.0456, + "rewards/chosen": 6.09002685546875, + "rewards/margins": 14.259207153320313, + "rewards/rejected": -8.169180297851563, + "step": 1614 + }, + { + "epoch": 0.4426476634233247, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11926714.352941176, + "logits/rejected": -17116421.714285713, + "logps/chosen": -407.3124425551471, + "logps/rejected": -643.5661272321429, + "loss": 0.0068, + "rewards/chosen": 7.40140084659352, + "rewards/margins": 17.3354054939847, + "rewards/rejected": -9.934004647391182, + "step": 1615 + }, + { + "epoch": 0.44292174866383444, + "grad_norm": 9.625, + "kl": 0.8935505747795105, + "learning_rate": 5e-06, + "logits/chosen": -11578714.285714285, + "logits/rejected": -24899270.4, + "logps/chosen": -432.53271484375, + "logps/rejected": -547.163916015625, + "loss": 0.0483, + "rewards/chosen": 5.193505423409598, + "rewards/margins": 17.47497591291155, + "rewards/rejected": -12.281470489501952, + "step": 1616 + }, + { + "epoch": 0.44319583390434425, + "grad_norm": 13.125, + "kl": 9.52022933959961, + "learning_rate": 5e-06, + "logits/chosen": -24764077.714285713, + "logits/rejected": -20368204.8, + "logps/chosen": -426.6773158482143, + "logps/rejected": -448.876416015625, + "loss": 0.0998, + "rewards/chosen": 6.779391697474888, + "rewards/margins": 17.181907108851842, + "rewards/rejected": -10.402515411376953, + "step": 1617 + }, + { + "epoch": 0.44346991914485406, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35039989.333333336, + "logits/rejected": -32510370.133333333, + "logps/chosen": -445.3374837239583, + "logps/rejected": -403.90406901041666, + "loss": 0.042, + "rewards/chosen": 6.415061526828342, + "rewards/margins": 15.133513471815322, + "rewards/rejected": -8.71845194498698, + "step": 1618 + }, + { + "epoch": 0.4437440043853638, + "grad_norm": 2.84375, + "kl": 7.0674896240234375, + "learning_rate": 5e-06, + "logits/chosen": -4537326.461538462, + "logits/rejected": -33879360.0, + "logps/chosen": -458.28061147836536, + "logps/rejected": -509.18013139204544, + "loss": 0.0106, + "rewards/chosen": 8.038973881648136, + "rewards/margins": 17.361817953469867, + "rewards/rejected": -9.322844071821732, + "step": 1619 + }, + { + "epoch": 0.44401808962587364, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8616354.823529411, + "logits/rejected": 66077645.71428572, + "logps/chosen": -445.77277688419116, + "logps/rejected": -639.4210379464286, + "loss": 0.0088, + "rewards/chosen": 6.960884543026195, + "rewards/margins": 20.351590196625523, + "rewards/rejected": -13.39070565359933, + "step": 1620 + }, + { + "epoch": 0.44429217486638345, + "grad_norm": 11.9375, + "kl": 4.877805709838867, + "learning_rate": 5e-06, + "logits/chosen": -16048120.888888888, + "logits/rejected": -5844683.733333333, + "logps/chosen": -509.4775390625, + "logps/rejected": -449.883984375, + "loss": 0.0261, + "rewards/chosen": 7.203847249348958, + "rewards/margins": 16.277764892578126, + "rewards/rejected": -9.073917643229167, + "step": 1621 + }, + { + "epoch": 0.44456626010689326, + "grad_norm": 4.1875, + "kl": 0.9618561863899231, + "learning_rate": 5e-06, + "logits/chosen": 5030148.307692308, + "logits/rejected": -19122234.181818184, + "logps/chosen": -470.98937049278845, + "logps/rejected": -554.8478781960227, + "loss": 0.0184, + "rewards/chosen": 6.558130117563101, + "rewards/margins": 15.124824897392646, + "rewards/rejected": -8.566694779829545, + "step": 1622 + }, + { + "epoch": 0.444840345347403, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2900054.0, + "logits/rejected": -11607685.714285715, + "logps/chosen": -366.6426513671875, + "logps/rejected": -572.0793805803571, + "loss": 0.0037, + "rewards/chosen": 7.732093811035156, + "rewards/margins": 18.361781529017858, + "rewards/rejected": -10.629687717982701, + "step": 1623 + }, + { + "epoch": 0.44511443058791283, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10483862.0, + "logits/rejected": -5503631.0, + "logps/chosen": -445.8854166666667, + "logps/rejected": -501.3751627604167, + "loss": 0.0154, + "rewards/chosen": 6.492940266927083, + "rewards/margins": 15.429641723632812, + "rewards/rejected": -8.936701456705729, + "step": 1624 + }, + { + "epoch": 0.44538851582842265, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 19543508.8, + "logits/rejected": -28528944.0, + "logps/chosen": -368.5140869140625, + "logps/rejected": -530.5874720982143, + "loss": 0.0623, + "rewards/chosen": 5.4311073303222654, + "rewards/margins": 14.702875082833426, + "rewards/rejected": -9.271767752511161, + "step": 1625 + }, + { + "epoch": 0.44566260106893246, + "grad_norm": 6.625, + "kl": 0.3709462583065033, + "learning_rate": 5e-06, + "logits/chosen": 6602111.384615385, + "logits/rejected": -23935028.363636363, + "logps/chosen": -433.13269981971155, + "logps/rejected": -405.8216441761364, + "loss": 0.0252, + "rewards/chosen": 5.85598872258113, + "rewards/margins": 14.173687808163518, + "rewards/rejected": -8.317699085582387, + "step": 1626 + }, + { + "epoch": 0.4459366863094422, + "grad_norm": 10.4375, + "kl": 19.157194137573242, + "learning_rate": 5e-06, + "logits/chosen": -26535949.17647059, + "logits/rejected": -34204208.0, + "logps/chosen": -491.8421415441176, + "logps/rejected": -488.80440848214283, + "loss": 0.0653, + "rewards/chosen": 7.5327301025390625, + "rewards/margins": 16.28550556727818, + "rewards/rejected": -8.752775464739118, + "step": 1627 + }, + { + "epoch": 0.44621077154995203, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13101413.333333334, + "logits/rejected": -7527658.666666667, + "logps/chosen": -385.9886067708333, + "logps/rejected": -491.7929280598958, + "loss": 0.0573, + "rewards/chosen": 5.628131866455078, + "rewards/margins": 15.343802134195963, + "rewards/rejected": -9.715670267740885, + "step": 1628 + }, + { + "epoch": 0.44648485679046185, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11132960.0, + "logits/rejected": -22993906.285714287, + "logps/chosen": -375.987841796875, + "logps/rejected": -521.00341796875, + "loss": 0.0345, + "rewards/chosen": 6.91351318359375, + "rewards/margins": 15.45094462803432, + "rewards/rejected": -8.53743144444057, + "step": 1629 + }, + { + "epoch": 0.4467589420309716, + "grad_norm": 7.75, + "kl": 7.083909034729004, + "learning_rate": 5e-06, + "logits/chosen": -20764034.285714287, + "logits/rejected": 33358393.6, + "logps/chosen": -370.28299386160717, + "logps/rejected": -556.053662109375, + "loss": 0.0556, + "rewards/chosen": 6.4532961164202005, + "rewards/margins": 16.902691868373324, + "rewards/rejected": -10.449395751953125, + "step": 1630 + }, + { + "epoch": 0.4470330272714814, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5396382.769230769, + "logits/rejected": -19794064.0, + "logps/chosen": -397.78722205528845, + "logps/rejected": -564.6499467329545, + "loss": 0.0067, + "rewards/chosen": 6.864127525916467, + "rewards/margins": 19.28014453807911, + "rewards/rejected": -12.416017012162643, + "step": 1631 + }, + { + "epoch": 0.44730711251199123, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 19407182.545454547, + "logits/rejected": -17738809.846153848, + "logps/chosen": -450.73193359375, + "logps/rejected": -702.9397536057693, + "loss": 0.0049, + "rewards/chosen": 7.135676990855824, + "rewards/margins": 17.263367232742844, + "rewards/rejected": -10.12769024188702, + "step": 1632 + }, + { + "epoch": 0.44758119775250105, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17481745.6, + "logits/rejected": -11392288.0, + "logps/chosen": -334.5301025390625, + "logps/rejected": -555.6188616071429, + "loss": 0.0367, + "rewards/chosen": 4.077671051025391, + "rewards/margins": 14.48890849522182, + "rewards/rejected": -10.411237444196429, + "step": 1633 + }, + { + "epoch": 0.4478552829930108, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7127992.0, + "logits/rejected": 96839891.2, + "logps/chosen": -417.3936244419643, + "logps/rejected": -547.777392578125, + "loss": 0.0537, + "rewards/chosen": 5.947007315499442, + "rewards/margins": 14.024064200265066, + "rewards/rejected": -8.077056884765625, + "step": 1634 + }, + { + "epoch": 0.4481293682335206, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29806142.222222224, + "logits/rejected": -12480049.066666666, + "logps/chosen": -503.5505099826389, + "logps/rejected": -471.7436848958333, + "loss": 0.022, + "rewards/chosen": 7.056818220350477, + "rewards/margins": 16.224021996392146, + "rewards/rejected": -9.167203776041667, + "step": 1635 + }, + { + "epoch": 0.44840345347403043, + "grad_norm": 2.0625, + "kl": 7.586144924163818, + "learning_rate": 5e-06, + "logits/chosen": -8740294.857142856, + "logits/rejected": -9576947.2, + "logps/chosen": -505.3546665736607, + "logps/rejected": -496.8560546875, + "loss": 0.0416, + "rewards/chosen": 7.514672415597098, + "rewards/margins": 17.675181143624442, + "rewards/rejected": -10.160508728027343, + "step": 1636 + }, + { + "epoch": 0.44867753871454025, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10192892.8, + "logits/rejected": -19619812.57142857, + "logps/chosen": -403.9939208984375, + "logps/rejected": -609.0163225446429, + "loss": 0.0067, + "rewards/chosen": 7.485856628417968, + "rewards/margins": 18.678268868582588, + "rewards/rejected": -11.19241224016462, + "step": 1637 + }, + { + "epoch": 0.44895162395505, + "grad_norm": 8.1875, + "kl": 2.938762664794922, + "learning_rate": 5e-06, + "logits/chosen": 19759197.866666667, + "logits/rejected": -6147216.444444444, + "logps/chosen": -474.1456705729167, + "logps/rejected": -754.3939887152778, + "loss": 0.0529, + "rewards/chosen": 6.011942545572917, + "rewards/margins": 17.841898600260418, + "rewards/rejected": -11.8299560546875, + "step": 1638 + }, + { + "epoch": 0.4492257091955598, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15194781.333333334, + "logits/rejected": -25793173.333333332, + "logps/chosen": -326.8212076822917, + "logps/rejected": -716.6625162760416, + "loss": 0.0467, + "rewards/chosen": 5.539063771565755, + "rewards/margins": 16.822612762451172, + "rewards/rejected": -11.283548990885416, + "step": 1639 + }, + { + "epoch": 0.44949979443606963, + "grad_norm": 5.8125, + "kl": 1.4932245016098022, + "learning_rate": 5e-06, + "logits/chosen": -13225469.714285715, + "logits/rejected": -4212331.6, + "logps/chosen": -329.97813197544644, + "logps/rejected": -611.74267578125, + "loss": 0.0315, + "rewards/chosen": 5.574873788016183, + "rewards/margins": 15.420550973074779, + "rewards/rejected": -9.845677185058594, + "step": 1640 + }, + { + "epoch": 0.4497738796765794, + "grad_norm": 3.15625, + "kl": 8.383417129516602, + "learning_rate": 5e-06, + "logits/chosen": -23534653.53846154, + "logits/rejected": -30002752.0, + "logps/chosen": -573.2593900240385, + "logps/rejected": -455.76518110795456, + "loss": 0.0102, + "rewards/chosen": 8.192196185772236, + "rewards/margins": 18.63432237318346, + "rewards/rejected": -10.44212618741122, + "step": 1641 + }, + { + "epoch": 0.4500479649170892, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -405986.4, + "logits/rejected": -13343293.714285715, + "logps/chosen": -359.9531494140625, + "logps/rejected": -465.36903599330356, + "loss": 0.0893, + "rewards/chosen": 5.042927169799805, + "rewards/margins": 13.030072729928154, + "rewards/rejected": -7.987145560128348, + "step": 1642 + }, + { + "epoch": 0.450322050157599, + "grad_norm": 3.453125, + "kl": 3.5321948528289795, + "learning_rate": 5e-06, + "logits/chosen": -42219625.14285714, + "logits/rejected": 73464972.8, + "logps/chosen": -499.80186244419644, + "logps/rejected": -615.669091796875, + "loss": 0.0078, + "rewards/chosen": 7.024814060756138, + "rewards/margins": 25.635254124232702, + "rewards/rejected": -18.610440063476563, + "step": 1643 + }, + { + "epoch": 0.45059613539810883, + "grad_norm": 5.4375, + "kl": 3.9668173789978027, + "learning_rate": 5e-06, + "logits/chosen": -11940672.0, + "logits/rejected": -1345260.1818181819, + "logps/chosen": -407.501953125, + "logps/rejected": -656.0498934659091, + "loss": 0.0184, + "rewards/chosen": 7.9962158203125, + "rewards/margins": 17.293535405939274, + "rewards/rejected": -9.297319585626775, + "step": 1644 + }, + { + "epoch": 0.4508702206386186, + "grad_norm": 10.0, + "kl": 8.886152267456055, + "learning_rate": 5e-06, + "logits/chosen": -24678589.53846154, + "logits/rejected": -25159598.545454547, + "logps/chosen": -393.21529447115387, + "logps/rejected": -484.6580255681818, + "loss": 0.0462, + "rewards/chosen": 5.907256493201623, + "rewards/margins": 14.68162824724104, + "rewards/rejected": -8.774371754039418, + "step": 1645 + }, + { + "epoch": 0.4511443058791284, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 453955.5, + "logits/rejected": -20130612.8, + "logps/chosen": -678.9776611328125, + "logps/rejected": -442.9744140625, + "loss": 0.0896, + "rewards/chosen": 7.095569610595703, + "rewards/margins": 13.935923767089843, + "rewards/rejected": -6.840354156494141, + "step": 1646 + }, + { + "epoch": 0.4514183911196382, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10254125.6, + "logits/rejected": -13035568.0, + "logps/chosen": -368.7681884765625, + "logps/rejected": -540.6209542410714, + "loss": 0.0125, + "rewards/chosen": 5.310494613647461, + "rewards/margins": 14.431333323887415, + "rewards/rejected": -9.120838710239955, + "step": 1647 + }, + { + "epoch": 0.45169247636014803, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 41226500.0, + "logits/rejected": -7846700.0, + "logps/chosen": -363.8887634277344, + "logps/rejected": -618.3214111328125, + "loss": 0.0215, + "rewards/chosen": 4.966091632843018, + "rewards/margins": 14.06819772720337, + "rewards/rejected": -9.102106094360352, + "step": 1648 + }, + { + "epoch": 0.4519665616006578, + "grad_norm": 9.3125, + "kl": 3.6866555213928223, + "learning_rate": 5e-06, + "logits/chosen": -7984409.142857143, + "logits/rejected": -16671241.6, + "logps/chosen": -346.58663504464283, + "logps/rejected": -505.9958984375, + "loss": 0.0624, + "rewards/chosen": 6.141032627650669, + "rewards/margins": 13.301349094935826, + "rewards/rejected": -7.160316467285156, + "step": 1649 + }, + { + "epoch": 0.4522406468411676, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24967350.4, + "logits/rejected": 5856523.428571428, + "logps/chosen": -333.070458984375, + "logps/rejected": -448.32352120535717, + "loss": 0.0333, + "rewards/chosen": 5.883818817138672, + "rewards/margins": 14.15088849748884, + "rewards/rejected": -8.267069680350167, + "step": 1650 + }, + { + "epoch": 0.4525147320816774, + "grad_norm": 10.1875, + "kl": 1.9765117168426514, + "learning_rate": 5e-06, + "logits/chosen": -11081370.666666666, + "logits/rejected": -27542986.666666668, + "logps/chosen": -377.0992838541667, + "logps/rejected": -460.6643473307292, + "loss": 0.0585, + "rewards/chosen": 5.377266565958659, + "rewards/margins": 12.936354955037435, + "rewards/rejected": -7.559088389078776, + "step": 1651 + }, + { + "epoch": 0.4527888173221872, + "grad_norm": 7.71875, + "kl": 9.872448921203613, + "learning_rate": 5e-06, + "logits/chosen": -8909801.142857144, + "logits/rejected": -4694728.8, + "logps/chosen": -323.0625, + "logps/rejected": -420.320166015625, + "loss": 0.0626, + "rewards/chosen": 6.288860321044922, + "rewards/margins": 14.23117446899414, + "rewards/rejected": -7.942314147949219, + "step": 1652 + }, + { + "epoch": 0.453062902562697, + "grad_norm": 10.6875, + "kl": 7.067468166351318, + "learning_rate": 5e-06, + "logits/chosen": -10801121.23076923, + "logits/rejected": -10053328.0, + "logps/chosen": -407.5549504206731, + "logps/rejected": -362.89284446022725, + "loss": 0.0648, + "rewards/chosen": 7.071421109713041, + "rewards/margins": 12.913342135769504, + "rewards/rejected": -5.841921026056463, + "step": 1653 + }, + { + "epoch": 0.4533369878032068, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26776538.666666668, + "logits/rejected": -10423369.333333334, + "logps/chosen": -535.9851888020834, + "logps/rejected": -538.9908040364584, + "loss": 0.0696, + "rewards/chosen": 6.019396464029948, + "rewards/margins": 15.443354924519856, + "rewards/rejected": -9.423958460489908, + "step": 1654 + }, + { + "epoch": 0.4536110730437166, + "grad_norm": 11.125, + "kl": 3.67706298828125, + "learning_rate": 5e-06, + "logits/chosen": -4080334.3333333335, + "logits/rejected": -4359416.0, + "logps/chosen": -308.0704752604167, + "logps/rejected": -367.8809000651042, + "loss": 0.1081, + "rewards/chosen": 4.6833070119222, + "rewards/margins": 10.344533920288086, + "rewards/rejected": -5.661226908365886, + "step": 1655 + }, + { + "epoch": 0.4538851582842264, + "grad_norm": 0.8984375, + "kl": 6.170504093170166, + "learning_rate": 5e-06, + "logits/chosen": -8672504.0, + "logits/rejected": 17395504.0, + "logps/chosen": -429.67578125, + "logps/rejected": -685.2325439453125, + "loss": 0.003, + "rewards/chosen": 8.003698348999023, + "rewards/margins": 19.800585746765137, + "rewards/rejected": -11.796887397766113, + "step": 1656 + }, + { + "epoch": 0.4541592435247362, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17426732.0, + "logits/rejected": -8971247.0, + "logps/chosen": -486.8130798339844, + "logps/rejected": -543.311767578125, + "loss": 0.0117, + "rewards/chosen": 6.186143398284912, + "rewards/margins": 15.482910633087158, + "rewards/rejected": -9.296767234802246, + "step": 1657 + }, + { + "epoch": 0.454433328765246, + "grad_norm": 9.75, + "kl": 3.4021365642547607, + "learning_rate": 5e-06, + "logits/chosen": -21958126.933333334, + "logits/rejected": -7450010.666666667, + "logps/chosen": -342.11858723958335, + "logps/rejected": -634.3654513888889, + "loss": 0.0818, + "rewards/chosen": 6.399081420898438, + "rewards/margins": 18.174235534667968, + "rewards/rejected": -11.775154113769531, + "step": 1658 + }, + { + "epoch": 0.4547074140057558, + "grad_norm": 4.5, + "kl": 2.1718270778656006, + "learning_rate": 5e-06, + "logits/chosen": -23876661.333333332, + "logits/rejected": 28407739.733333334, + "logps/chosen": -342.56792534722223, + "logps/rejected": -782.2903645833334, + "loss": 0.0385, + "rewards/chosen": 5.544088999430339, + "rewards/margins": 23.50750249226888, + "rewards/rejected": -17.96341349283854, + "step": 1659 + }, + { + "epoch": 0.4549814992462656, + "grad_norm": 8.1875, + "kl": 6.910480976104736, + "learning_rate": 5e-06, + "logits/chosen": -17576480.0, + "logits/rejected": -29124672.0, + "logps/chosen": -322.58019080528845, + "logps/rejected": -486.3767755681818, + "loss": 0.0266, + "rewards/chosen": 6.523874136117788, + "rewards/margins": 15.915561009120275, + "rewards/rejected": -9.391686873002486, + "step": 1660 + }, + { + "epoch": 0.4552555844867754, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27467160.615384616, + "logits/rejected": -34172177.45454545, + "logps/chosen": -429.1819411057692, + "logps/rejected": -388.6783558238636, + "loss": 0.0526, + "rewards/chosen": 6.275446965144231, + "rewards/margins": 13.631985217541247, + "rewards/rejected": -7.356538252397017, + "step": 1661 + }, + { + "epoch": 0.4555296697272852, + "grad_norm": 8.375, + "kl": 11.749387741088867, + "learning_rate": 5e-06, + "logits/chosen": -24009166.933333334, + "logits/rejected": -6718025.777777778, + "logps/chosen": -417.01604817708335, + "logps/rejected": -417.8064236111111, + "loss": 0.0313, + "rewards/chosen": 6.679830932617188, + "rewards/margins": 14.568787638346354, + "rewards/rejected": -7.888956705729167, + "step": 1662 + }, + { + "epoch": 0.45580375496779496, + "grad_norm": 10.1875, + "kl": 0.4525540769100189, + "learning_rate": 5e-06, + "logits/chosen": -17109940.0, + "logits/rejected": 15071477.333333334, + "logps/chosen": -319.3381754557292, + "logps/rejected": -708.2626953125, + "loss": 0.0463, + "rewards/chosen": 5.832986195882161, + "rewards/margins": 14.342573801676433, + "rewards/rejected": -8.509587605794271, + "step": 1663 + }, + { + "epoch": 0.4560778402083048, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 998358.9, + "logits/rejected": -4212055.142857143, + "logps/chosen": -478.3154296875, + "logps/rejected": -441.92201450892856, + "loss": 0.0028, + "rewards/chosen": 7.805268859863281, + "rewards/margins": 16.36726531982422, + "rewards/rejected": -8.561996459960938, + "step": 1664 + }, + { + "epoch": 0.4563519254488146, + "grad_norm": 3.109375, + "kl": 5.793788909912109, + "learning_rate": 5e-06, + "logits/chosen": -15877393.333333334, + "logits/rejected": -28144000.0, + "logps/chosen": -417.4274088541667, + "logps/rejected": -503.7835286458333, + "loss": 0.0139, + "rewards/chosen": 6.937294006347656, + "rewards/margins": 19.01272964477539, + "rewards/rejected": -12.075435638427734, + "step": 1665 + }, + { + "epoch": 0.4566260106893244, + "grad_norm": 8.25, + "kl": 0.32558950781822205, + "learning_rate": 5e-06, + "logits/chosen": -31651800.615384616, + "logits/rejected": 2107970.727272727, + "logps/chosen": -430.36959134615387, + "logps/rejected": -557.9992453835227, + "loss": 0.0708, + "rewards/chosen": 4.91006352351262, + "rewards/margins": 14.11682982544799, + "rewards/rejected": -9.20676630193537, + "step": 1666 + }, + { + "epoch": 0.45690009592983416, + "grad_norm": 17.625, + "kl": 7.495529651641846, + "learning_rate": 5e-06, + "logits/chosen": -16611636.266666668, + "logits/rejected": -6260107.111111111, + "logps/chosen": -425.1860026041667, + "logps/rejected": -519.3334418402778, + "loss": 0.0682, + "rewards/chosen": 6.476250712076823, + "rewards/margins": 14.29713372124566, + "rewards/rejected": -7.820883009168837, + "step": 1667 + }, + { + "epoch": 0.457174181170344, + "grad_norm": 8.0625, + "kl": 3.6604013442993164, + "learning_rate": 5e-06, + "logits/chosen": -7275998.588235294, + "logits/rejected": -6406930.285714285, + "logps/chosen": -620.8793658088235, + "logps/rejected": -491.03763253348217, + "loss": 0.0446, + "rewards/chosen": 6.910154454848346, + "rewards/margins": 13.102085498200745, + "rewards/rejected": -6.191931043352399, + "step": 1668 + }, + { + "epoch": 0.4574482664108538, + "grad_norm": 12.5, + "kl": 7.367467880249023, + "learning_rate": 5e-06, + "logits/chosen": -27970533.818181816, + "logits/rejected": -9707217.23076923, + "logps/chosen": -431.25883345170456, + "logps/rejected": -591.4619140625, + "loss": 0.0444, + "rewards/chosen": 5.689906033602628, + "rewards/margins": 16.720557419570177, + "rewards/rejected": -11.030651385967548, + "step": 1669 + }, + { + "epoch": 0.4577223516513636, + "grad_norm": 8.4375, + "kl": 3.4151358604431152, + "learning_rate": 5e-06, + "logits/chosen": -9808922.666666666, + "logits/rejected": 1718593.3333333333, + "logps/chosen": -374.3586832682292, + "logps/rejected": -573.3936767578125, + "loss": 0.0331, + "rewards/chosen": 6.177549362182617, + "rewards/margins": 15.556535720825195, + "rewards/rejected": -9.378986358642578, + "step": 1670 + }, + { + "epoch": 0.45799643689187336, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15295173.333333334, + "logits/rejected": -28531904.0, + "logps/chosen": -400.40098741319446, + "logps/rejected": -526.2947591145834, + "loss": 0.0553, + "rewards/chosen": 7.846744961208767, + "rewards/margins": 15.302373419867621, + "rewards/rejected": -7.455628458658854, + "step": 1671 + }, + { + "epoch": 0.4582705221323832, + "grad_norm": 3.515625, + "kl": 3.4124019145965576, + "learning_rate": 5e-06, + "logits/chosen": -25529612.307692308, + "logits/rejected": -3541481.8181818184, + "logps/chosen": -487.5212590144231, + "logps/rejected": -635.6708984375, + "loss": 0.0382, + "rewards/chosen": 6.04890617957482, + "rewards/margins": 16.29345254964762, + "rewards/rejected": -10.244546370072799, + "step": 1672 + }, + { + "epoch": 0.458544607372893, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17195749.333333332, + "logits/rejected": 17806536.533333335, + "logps/chosen": -456.90375434027777, + "logps/rejected": -524.9536458333333, + "loss": 0.0645, + "rewards/chosen": 5.99097654554579, + "rewards/margins": 16.255053287082248, + "rewards/rejected": -10.264076741536458, + "step": 1673 + }, + { + "epoch": 0.45881869261340275, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9824670.222222222, + "logits/rejected": -15499771.733333332, + "logps/chosen": -419.7868381076389, + "logps/rejected": -416.55579427083336, + "loss": 0.0324, + "rewards/chosen": 5.751608106825087, + "rewards/margins": 14.639554680718316, + "rewards/rejected": -8.88794657389323, + "step": 1674 + }, + { + "epoch": 0.45909277785391256, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 39730606.54545455, + "logits/rejected": 10120450.461538462, + "logps/chosen": -443.51376065340907, + "logps/rejected": -590.6284930889423, + "loss": 0.0202, + "rewards/chosen": 5.153550581498579, + "rewards/margins": 15.22309427328043, + "rewards/rejected": -10.06954369178185, + "step": 1675 + }, + { + "epoch": 0.4593668630944224, + "grad_norm": 14.375, + "kl": 4.643971920013428, + "learning_rate": 5e-06, + "logits/chosen": 3026720.4444444445, + "logits/rejected": -6319134.933333334, + "logps/chosen": -358.44476996527777, + "logps/rejected": -569.3490885416667, + "loss": 0.0745, + "rewards/chosen": 6.8018616570366754, + "rewards/margins": 17.46179207695855, + "rewards/rejected": -10.659930419921874, + "step": 1676 + }, + { + "epoch": 0.4596409483349322, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2710994.909090909, + "logits/rejected": -2109954.153846154, + "logps/chosen": -391.14897017045456, + "logps/rejected": -589.61962890625, + "loss": 0.014, + "rewards/chosen": 6.503354159268466, + "rewards/margins": 18.265018916630243, + "rewards/rejected": -11.761664757361778, + "step": 1677 + }, + { + "epoch": 0.45991503357544194, + "grad_norm": 3.578125, + "kl": 2.1121902465820312, + "learning_rate": 5e-06, + "logits/chosen": -31023896.615384616, + "logits/rejected": -14545905.454545455, + "logps/chosen": -513.7229942908654, + "logps/rejected": -472.2059215198864, + "loss": 0.0123, + "rewards/chosen": 7.115669837364783, + "rewards/margins": 17.71613786604021, + "rewards/rejected": -10.600468028675426, + "step": 1678 + }, + { + "epoch": 0.46018911881595176, + "grad_norm": 4.71875, + "kl": 2.4123740196228027, + "learning_rate": 5e-06, + "logits/chosen": -16045000.0, + "logits/rejected": -22676274.285714287, + "logps/chosen": -449.972607421875, + "logps/rejected": -497.6011439732143, + "loss": 0.028, + "rewards/chosen": 6.625751495361328, + "rewards/margins": 15.715674264090401, + "rewards/rejected": -9.089922768729073, + "step": 1679 + }, + { + "epoch": 0.4604632040564616, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4224403.692307692, + "logits/rejected": -31058740.363636363, + "logps/chosen": -364.01663912259613, + "logps/rejected": -540.4870383522727, + "loss": 0.0144, + "rewards/chosen": 7.314696678748498, + "rewards/margins": 16.639172053837278, + "rewards/rejected": -9.32447537508878, + "step": 1680 + }, + { + "epoch": 0.46073728929697133, + "grad_norm": 7.40625, + "kl": 5.1912946701049805, + "learning_rate": 5e-06, + "logits/chosen": -44700824.615384616, + "logits/rejected": -27330894.545454547, + "logps/chosen": -419.81482872596155, + "logps/rejected": -533.6186079545455, + "loss": 0.0473, + "rewards/chosen": 6.046580387995793, + "rewards/margins": 17.83260084032179, + "rewards/rejected": -11.786020452325994, + "step": 1681 + }, + { + "epoch": 0.46101137453748114, + "grad_norm": 3.9375, + "kl": 2.178518295288086, + "learning_rate": 5e-06, + "logits/chosen": -9626916.705882354, + "logits/rejected": -17525276.57142857, + "logps/chosen": -411.64148667279414, + "logps/rejected": -410.66831752232144, + "loss": 0.0132, + "rewards/chosen": 7.163823296042049, + "rewards/margins": 16.341194729845064, + "rewards/rejected": -9.177371433803014, + "step": 1682 + }, + { + "epoch": 0.46128545977799096, + "grad_norm": 3.171875, + "kl": 1.6191076040267944, + "learning_rate": 5e-06, + "logits/chosen": -13751496.727272727, + "logits/rejected": -21142217.846153848, + "logps/chosen": -354.58200905539775, + "logps/rejected": -442.36910306490387, + "loss": 0.0315, + "rewards/chosen": 4.665382038463246, + "rewards/margins": 13.805455374550986, + "rewards/rejected": -9.14007333608774, + "step": 1683 + }, + { + "epoch": 0.46155954501850077, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29768448.0, + "logits/rejected": -26296806.4, + "logps/chosen": -397.05132378472223, + "logps/rejected": -504.0786458333333, + "loss": 0.0194, + "rewards/chosen": 7.10087415907118, + "rewards/margins": 15.62848646375868, + "rewards/rejected": -8.5276123046875, + "step": 1684 + }, + { + "epoch": 0.46183363025901053, + "grad_norm": 9.0, + "kl": 6.346858978271484, + "learning_rate": 5e-06, + "logits/chosen": -9726483.333333334, + "logits/rejected": -23424117.333333332, + "logps/chosen": -384.3035888671875, + "logps/rejected": -487.3333333333333, + "loss": 0.035, + "rewards/chosen": 6.78436279296875, + "rewards/margins": 18.847468058268227, + "rewards/rejected": -12.063105265299479, + "step": 1685 + }, + { + "epoch": 0.46210771549952034, + "grad_norm": 9.1875, + "kl": 12.066595077514648, + "learning_rate": 5e-06, + "logits/chosen": -28407683.555555556, + "logits/rejected": -92274.0, + "logps/chosen": -437.7995876736111, + "logps/rejected": -476.4726969401042, + "loss": 0.0295, + "rewards/chosen": 6.991308000352648, + "rewards/margins": 17.289805518256294, + "rewards/rejected": -10.298497517903646, + "step": 1686 + }, + { + "epoch": 0.46238180074003016, + "grad_norm": 3.203125, + "kl": 3.8043861389160156, + "learning_rate": 5e-06, + "logits/chosen": -15123502.857142856, + "logits/rejected": -9638312.0, + "logps/chosen": -483.7584751674107, + "logps/rejected": -549.671533203125, + "loss": 0.0268, + "rewards/chosen": 6.622264317103794, + "rewards/margins": 19.14669974190848, + "rewards/rejected": -12.524435424804688, + "step": 1687 + }, + { + "epoch": 0.46265588598053997, + "grad_norm": 4.71875, + "kl": 1.5119298696517944, + "learning_rate": 5e-06, + "logits/chosen": -16942726.4, + "logits/rejected": -17508851.555555556, + "logps/chosen": -358.5192057291667, + "logps/rejected": -442.4229329427083, + "loss": 0.0141, + "rewards/chosen": 5.889860534667969, + "rewards/margins": 15.631549580891928, + "rewards/rejected": -9.741689046223959, + "step": 1688 + }, + { + "epoch": 0.46292997122104973, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8961638.222222222, + "logits/rejected": -11250312.533333333, + "logps/chosen": -451.10389539930554, + "logps/rejected": -486.91578776041666, + "loss": 0.0371, + "rewards/chosen": 8.140796237521702, + "rewards/margins": 17.39507276746962, + "rewards/rejected": -9.254276529947917, + "step": 1689 + }, + { + "epoch": 0.46320405646155954, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22575352.0, + "logits/rejected": -33758248.0, + "logps/chosen": -315.0330505371094, + "logps/rejected": -480.6097106933594, + "loss": 0.0159, + "rewards/chosen": 6.096626281738281, + "rewards/margins": 15.455322265625, + "rewards/rejected": -9.358695983886719, + "step": 1690 + }, + { + "epoch": 0.46347814170206936, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30828918.4, + "logits/rejected": -40345165.71428572, + "logps/chosen": -491.60400390625, + "logps/rejected": -497.85585239955356, + "loss": 0.028, + "rewards/chosen": 6.542910766601563, + "rewards/margins": 15.3159907749721, + "rewards/rejected": -8.773080008370536, + "step": 1691 + }, + { + "epoch": 0.4637522269425791, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7056406.4, + "logits/rejected": -13818329.777777778, + "logps/chosen": -418.9163411458333, + "logps/rejected": -548.7373046875, + "loss": 0.0069, + "rewards/chosen": 6.457537841796875, + "rewards/margins": 16.708722093370227, + "rewards/rejected": -10.251184251573351, + "step": 1692 + }, + { + "epoch": 0.46402631218308893, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12750016.0, + "logits/rejected": -16764871.0, + "logps/chosen": -360.90802001953125, + "logps/rejected": -447.96612548828125, + "loss": 0.0454, + "rewards/chosen": 6.860402584075928, + "rewards/margins": 16.18801259994507, + "rewards/rejected": -9.32761001586914, + "step": 1693 + }, + { + "epoch": 0.46430039742359874, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12547288.615384616, + "logits/rejected": -24998301.09090909, + "logps/chosen": -327.8277118389423, + "logps/rejected": -546.2495561079545, + "loss": 0.0198, + "rewards/chosen": 5.97627669114333, + "rewards/margins": 16.907631560639068, + "rewards/rejected": -10.931354869495738, + "step": 1694 + }, + { + "epoch": 0.46457448266410856, + "grad_norm": 5.15625, + "kl": 10.120944023132324, + "learning_rate": 5e-06, + "logits/chosen": -28861140.57142857, + "logits/rejected": -31471942.4, + "logps/chosen": -440.75537109375, + "logps/rejected": -512.94384765625, + "loss": 0.0187, + "rewards/chosen": 6.952501569475446, + "rewards/margins": 16.427845655168806, + "rewards/rejected": -9.47534408569336, + "step": 1695 + }, + { + "epoch": 0.4648485679046183, + "grad_norm": 2.734375, + "kl": 1.9487762451171875, + "learning_rate": 5e-06, + "logits/chosen": -27876937.846153848, + "logits/rejected": -26681885.09090909, + "logps/chosen": -517.7034254807693, + "logps/rejected": -583.6222478693181, + "loss": 0.0086, + "rewards/chosen": 7.234901428222656, + "rewards/margins": 16.612262379039418, + "rewards/rejected": -9.377360950816762, + "step": 1696 + }, + { + "epoch": 0.46512265314512813, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17521928.615384616, + "logits/rejected": -15317056.0, + "logps/chosen": -365.98580228365387, + "logps/rejected": -670.5622336647727, + "loss": 0.0767, + "rewards/chosen": 4.622768108661358, + "rewards/margins": 15.317984200857735, + "rewards/rejected": -10.695216092196377, + "step": 1697 + }, + { + "epoch": 0.46539673838563794, + "grad_norm": 8.5625, + "kl": 6.266646862030029, + "learning_rate": 5e-06, + "logits/chosen": -10905769.142857144, + "logits/rejected": -20460694.4, + "logps/chosen": -475.68038504464283, + "logps/rejected": -434.183935546875, + "loss": 0.0688, + "rewards/chosen": 6.765536717006138, + "rewards/margins": 12.914122990199498, + "rewards/rejected": -6.148586273193359, + "step": 1698 + }, + { + "epoch": 0.46567082362614776, + "grad_norm": 7.90625, + "kl": 3.460441827774048, + "learning_rate": 5e-06, + "logits/chosen": -37908524.0, + "logits/rejected": -2656849.0, + "logps/chosen": -483.4461669921875, + "logps/rejected": -510.94000244140625, + "loss": 0.0246, + "rewards/chosen": 6.889766693115234, + "rewards/margins": 14.275043964385986, + "rewards/rejected": -7.385277271270752, + "step": 1699 + }, + { + "epoch": 0.4659449088666575, + "grad_norm": 12.6875, + "kl": 7.921131134033203, + "learning_rate": 5e-06, + "logits/chosen": -8370612.571428572, + "logits/rejected": 69966233.6, + "logps/chosen": -462.45797293526783, + "logps/rejected": -811.92294921875, + "loss": 0.0728, + "rewards/chosen": 6.194572448730469, + "rewards/margins": 23.808604431152343, + "rewards/rejected": -17.614031982421874, + "step": 1700 + }, + { + "epoch": 0.46621899410716733, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8374808.888888889, + "logits/rejected": -17345011.2, + "logps/chosen": -389.7767740885417, + "logps/rejected": -533.6797200520833, + "loss": 0.0645, + "rewards/chosen": 5.238567776150173, + "rewards/margins": 15.07330084906684, + "rewards/rejected": -9.834733072916666, + "step": 1701 + }, + { + "epoch": 0.46649307934767714, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30879545.14285714, + "logits/rejected": -15246166.4, + "logps/chosen": -430.812744140625, + "logps/rejected": -457.98857421875, + "loss": 0.032, + "rewards/chosen": 5.609842572893415, + "rewards/margins": 14.478392682756697, + "rewards/rejected": -8.868550109863282, + "step": 1702 + }, + { + "epoch": 0.4667671645881869, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21201356.8, + "logits/rejected": -10524163.42857143, + "logps/chosen": -450.152099609375, + "logps/rejected": -442.1358119419643, + "loss": 0.0249, + "rewards/chosen": 8.022964477539062, + "rewards/margins": 15.498146057128906, + "rewards/rejected": -7.475181579589844, + "step": 1703 + }, + { + "epoch": 0.4670412498286967, + "grad_norm": 8.625, + "kl": 12.218151092529297, + "learning_rate": 5e-06, + "logits/chosen": -16933184.0, + "logits/rejected": -5793397.6, + "logps/chosen": -479.412841796875, + "logps/rejected": -651.31826171875, + "loss": 0.0749, + "rewards/chosen": 5.957964760916574, + "rewards/margins": 21.624017007010323, + "rewards/rejected": -15.66605224609375, + "step": 1704 + }, + { + "epoch": 0.46731533506920653, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28486350.769230768, + "logits/rejected": -14592904.727272727, + "logps/chosen": -368.64633413461536, + "logps/rejected": -517.8259943181819, + "loss": 0.0495, + "rewards/chosen": 5.982340299166166, + "rewards/margins": 14.455531300364674, + "rewards/rejected": -8.473191001198508, + "step": 1705 + }, + { + "epoch": 0.46758942030971634, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20912817.230769232, + "logits/rejected": -15670466.909090908, + "logps/chosen": -284.78091195913464, + "logps/rejected": -539.6136807528409, + "loss": 0.0606, + "rewards/chosen": 5.1031329815204325, + "rewards/margins": 16.378137975305943, + "rewards/rejected": -11.275004993785512, + "step": 1706 + }, + { + "epoch": 0.4678635055502261, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25888332.444444444, + "logits/rejected": -20506301.866666667, + "logps/chosen": -365.67206488715277, + "logps/rejected": -610.8548828125, + "loss": 0.0317, + "rewards/chosen": 5.631873236762153, + "rewards/margins": 17.454921129014757, + "rewards/rejected": -11.823047892252605, + "step": 1707 + }, + { + "epoch": 0.4681375907907359, + "grad_norm": 2.5, + "kl": 0.9259414672851562, + "learning_rate": 5e-06, + "logits/chosen": -20356174.769230768, + "logits/rejected": -11810011.636363637, + "logps/chosen": -469.22543569711536, + "logps/rejected": -580.1753373579545, + "loss": 0.0087, + "rewards/chosen": 6.462041414701021, + "rewards/margins": 14.239171541654146, + "rewards/rejected": -7.777130126953125, + "step": 1708 + }, + { + "epoch": 0.4684116760312457, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24580057.14285714, + "logits/rejected": -12401569.88235294, + "logps/chosen": -405.86181640625, + "logps/rejected": -554.1746323529412, + "loss": 0.0274, + "rewards/chosen": 6.433249882289341, + "rewards/margins": 15.36135405853015, + "rewards/rejected": -8.928104176240808, + "step": 1709 + }, + { + "epoch": 0.46868576127175554, + "grad_norm": 2.359375, + "kl": 2.2583796977996826, + "learning_rate": 5e-06, + "logits/chosen": -14685120.0, + "logits/rejected": -7635593.142857143, + "logps/chosen": -485.022314453125, + "logps/rejected": -525.01806640625, + "loss": 0.0054, + "rewards/chosen": 8.168260955810547, + "rewards/margins": 18.577594321114677, + "rewards/rejected": -10.40933336530413, + "step": 1710 + }, + { + "epoch": 0.4689598465122653, + "grad_norm": 2.359375, + "kl": 0.043883007019758224, + "learning_rate": 5e-06, + "logits/chosen": -15452819.2, + "logits/rejected": -24974393.14285714, + "logps/chosen": -525.10888671875, + "logps/rejected": -607.0219029017857, + "loss": 0.0189, + "rewards/chosen": 7.411550140380859, + "rewards/margins": 19.1441529410226, + "rewards/rejected": -11.732602800641741, + "step": 1711 + }, + { + "epoch": 0.4692339317527751, + "grad_norm": 4.90625, + "kl": 2.2185516357421875, + "learning_rate": 5e-06, + "logits/chosen": -11490661.333333334, + "logits/rejected": -25797226.666666668, + "logps/chosen": -439.6583658854167, + "logps/rejected": -459.13916015625, + "loss": 0.0673, + "rewards/chosen": 5.716467115614149, + "rewards/margins": 13.766057544284397, + "rewards/rejected": -8.049590428670248, + "step": 1712 + }, + { + "epoch": 0.4695080169932849, + "grad_norm": 8.125, + "kl": 0.7700144648551941, + "learning_rate": 5e-06, + "logits/chosen": 3892979.4285714286, + "logits/rejected": -30021548.8, + "logps/chosen": -473.704833984375, + "logps/rejected": -445.43486328125, + "loss": 0.0515, + "rewards/chosen": 4.963799612862723, + "rewards/margins": 14.501913016183035, + "rewards/rejected": -9.538113403320313, + "step": 1713 + }, + { + "epoch": 0.4697821022337947, + "grad_norm": 5.75, + "kl": 4.9347381591796875, + "learning_rate": 5e-06, + "logits/chosen": -10010000.0, + "logits/rejected": 50952014.76923077, + "logps/chosen": -448.5056818181818, + "logps/rejected": -546.8798076923077, + "loss": 0.0208, + "rewards/chosen": 6.73655215176669, + "rewards/margins": 19.233326678509478, + "rewards/rejected": -12.496774526742788, + "step": 1714 + }, + { + "epoch": 0.4700561874743045, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8735749.6, + "logits/rejected": -15509334.857142856, + "logps/chosen": -338.249853515625, + "logps/rejected": -580.1738978794643, + "loss": 0.0198, + "rewards/chosen": 4.979413223266602, + "rewards/margins": 13.568934358869281, + "rewards/rejected": -8.589521135602679, + "step": 1715 + }, + { + "epoch": 0.4703302727148143, + "grad_norm": 13.25, + "kl": 4.573616981506348, + "learning_rate": 5e-06, + "logits/chosen": -19202152.727272727, + "logits/rejected": 2290331.076923077, + "logps/chosen": -310.43814364346593, + "logps/rejected": -408.9820087139423, + "loss": 0.1011, + "rewards/chosen": 4.24181435324929, + "rewards/margins": 13.496539882846644, + "rewards/rejected": -9.254725529597355, + "step": 1716 + }, + { + "epoch": 0.4706043579553241, + "grad_norm": 16.625, + "kl": 4.853233337402344, + "learning_rate": 5e-06, + "logits/chosen": -6814181.866666666, + "logits/rejected": -24235681.777777776, + "logps/chosen": -474.4140625, + "logps/rejected": -473.2750651041667, + "loss": 0.1028, + "rewards/chosen": 5.99122060139974, + "rewards/margins": 12.235826195610894, + "rewards/rejected": -6.244605594211155, + "step": 1717 + }, + { + "epoch": 0.4708784431958339, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27249122.285714287, + "logits/rejected": -31007382.588235293, + "logps/chosen": -300.43948800223217, + "logps/rejected": -581.6171300551471, + "loss": 0.0627, + "rewards/chosen": 4.458695002964565, + "rewards/margins": 11.283843192733638, + "rewards/rejected": -6.825148189769072, + "step": 1718 + }, + { + "epoch": 0.4711525284363437, + "grad_norm": 7.03125, + "kl": 4.356175422668457, + "learning_rate": 5e-06, + "logits/chosen": -29919904.0, + "logits/rejected": 12088084.0, + "logps/chosen": -376.7576904296875, + "logps/rejected": -477.5568033854167, + "loss": 0.0239, + "rewards/chosen": 5.41218630472819, + "rewards/margins": 14.567485173543293, + "rewards/rejected": -9.155298868815104, + "step": 1719 + }, + { + "epoch": 0.4714266136768535, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29162051.2, + "logits/rejected": -21730441.14285714, + "logps/chosen": -494.8162109375, + "logps/rejected": -557.0776018415179, + "loss": 0.0131, + "rewards/chosen": 6.485673522949218, + "rewards/margins": 15.36318849836077, + "rewards/rejected": -8.877514975411552, + "step": 1720 + }, + { + "epoch": 0.4717006989173633, + "grad_norm": 6.5, + "kl": 5.710989952087402, + "learning_rate": 5e-06, + "logits/chosen": -6248456.666666667, + "logits/rejected": -8080106.0, + "logps/chosen": -353.8678385416667, + "logps/rejected": -462.4733479817708, + "loss": 0.0717, + "rewards/chosen": 6.696165720621745, + "rewards/margins": 16.093798955281574, + "rewards/rejected": -9.39763323465983, + "step": 1721 + }, + { + "epoch": 0.4719747841578731, + "grad_norm": 4.6875, + "kl": 2.9611613750457764, + "learning_rate": 5e-06, + "logits/chosen": -31069926.4, + "logits/rejected": -23353648.0, + "logps/chosen": -430.9833658854167, + "logps/rejected": -561.7768012152778, + "loss": 0.0413, + "rewards/chosen": 6.2404052734375, + "rewards/margins": 16.722451612684463, + "rewards/rejected": -10.482046339246962, + "step": 1722 + }, + { + "epoch": 0.4722488693983829, + "grad_norm": 13.3125, + "kl": 2.153919219970703, + "learning_rate": 5e-06, + "logits/chosen": -26730660.57142857, + "logits/rejected": -19227302.4, + "logps/chosen": -438.88462611607144, + "logps/rejected": -528.17666015625, + "loss": 0.0234, + "rewards/chosen": 8.788391658238002, + "rewards/margins": 20.604691096714564, + "rewards/rejected": -11.816299438476562, + "step": 1723 + }, + { + "epoch": 0.4725229546388927, + "grad_norm": 5.28125, + "kl": 8.103209495544434, + "learning_rate": 5e-06, + "logits/chosen": -23273424.94117647, + "logits/rejected": -6795886.857142857, + "logps/chosen": -389.1346220128676, + "logps/rejected": -590.9950474330357, + "loss": 0.0171, + "rewards/chosen": 7.719419591567096, + "rewards/margins": 17.121781870096672, + "rewards/rejected": -9.402362278529576, + "step": 1724 + }, + { + "epoch": 0.47279703987940247, + "grad_norm": 4.625, + "kl": 3.171070098876953, + "learning_rate": 5e-06, + "logits/chosen": -35072192.0, + "logits/rejected": -3654680.0, + "logps/chosen": -446.74776785714283, + "logps/rejected": -346.900537109375, + "loss": 0.0158, + "rewards/chosen": 6.928188868931362, + "rewards/margins": 14.853886958530971, + "rewards/rejected": -7.925698089599609, + "step": 1725 + }, + { + "epoch": 0.4730711251199123, + "grad_norm": 8.0625, + "kl": 0.2731831967830658, + "learning_rate": 5e-06, + "logits/chosen": -4667737.6, + "logits/rejected": -46049581.71428572, + "logps/chosen": -602.6849609375, + "logps/rejected": -691.4379185267857, + "loss": 0.0177, + "rewards/chosen": 7.016824340820312, + "rewards/margins": 18.332723781040734, + "rewards/rejected": -11.315899440220424, + "step": 1726 + }, + { + "epoch": 0.4733452103604221, + "grad_norm": 4.5625, + "kl": 3.666719436645508, + "learning_rate": 5e-06, + "logits/chosen": 9549667.636363637, + "logits/rejected": -24935884.307692308, + "logps/chosen": -497.79647549715907, + "logps/rejected": -404.20804537259613, + "loss": 0.0208, + "rewards/chosen": 8.073626431551846, + "rewards/margins": 15.631741290325884, + "rewards/rejected": -7.558114858774038, + "step": 1727 + }, + { + "epoch": 0.4736192956009319, + "grad_norm": 1.1171875, + "kl": 3.095902919769287, + "learning_rate": 5e-06, + "logits/chosen": -9678567.272727273, + "logits/rejected": -19049222.153846152, + "logps/chosen": -377.5650745738636, + "logps/rejected": -294.47171724759613, + "loss": 0.0039, + "rewards/chosen": 8.276838822798295, + "rewards/margins": 14.808806572760734, + "rewards/rejected": -6.53196774996244, + "step": 1728 + }, + { + "epoch": 0.47389338084144167, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21679616.0, + "logits/rejected": -2558334.6153846155, + "logps/chosen": -588.8974165482955, + "logps/rejected": -595.4732947716346, + "loss": 0.0126, + "rewards/chosen": 8.039434259588068, + "rewards/margins": 18.798513239080258, + "rewards/rejected": -10.759078979492188, + "step": 1729 + }, + { + "epoch": 0.4741674660819515, + "grad_norm": 6.8125, + "kl": 4.439382076263428, + "learning_rate": 5e-06, + "logits/chosen": -33666214.4, + "logits/rejected": 1251648.0, + "logps/chosen": -430.1591796875, + "logps/rejected": -551.0348074776786, + "loss": 0.0286, + "rewards/chosen": 5.9394981384277346, + "rewards/margins": 15.079901885986327, + "rewards/rejected": -9.140403747558594, + "step": 1730 + }, + { + "epoch": 0.4744415513224613, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35950540.8, + "logits/rejected": -24483264.0, + "logps/chosen": -451.118603515625, + "logps/rejected": -535.5440848214286, + "loss": 0.0164, + "rewards/chosen": 6.484437561035156, + "rewards/margins": 18.143871198381696, + "rewards/rejected": -11.65943363734654, + "step": 1731 + }, + { + "epoch": 0.4747156365629711, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4037740.5714285714, + "logits/rejected": -13294748.8, + "logps/chosen": -386.86655970982144, + "logps/rejected": -434.4720703125, + "loss": 0.0253, + "rewards/chosen": 5.585097176688058, + "rewards/margins": 13.711100442068918, + "rewards/rejected": -8.12600326538086, + "step": 1732 + }, + { + "epoch": 0.47498972180348087, + "grad_norm": 6.46875, + "kl": 6.7461957931518555, + "learning_rate": 5e-06, + "logits/chosen": -27108872.0, + "logits/rejected": -11637613.0, + "logps/chosen": -555.9730224609375, + "logps/rejected": -513.642333984375, + "loss": 0.05, + "rewards/chosen": 8.204333305358887, + "rewards/margins": 16.63151264190674, + "rewards/rejected": -8.427179336547852, + "step": 1733 + }, + { + "epoch": 0.4752638070439907, + "grad_norm": 6.71875, + "kl": 5.792693138122559, + "learning_rate": 5e-06, + "logits/chosen": -36192064.0, + "logits/rejected": -17755990.4, + "logps/chosen": -446.2562779017857, + "logps/rejected": -404.3393798828125, + "loss": 0.0358, + "rewards/chosen": 7.077086857386997, + "rewards/margins": 14.093060520717074, + "rewards/rejected": -7.015973663330078, + "step": 1734 + }, + { + "epoch": 0.4755378922845005, + "grad_norm": 12.75, + "kl": 3.104572296142578, + "learning_rate": 5e-06, + "logits/chosen": -17702146.46153846, + "logits/rejected": 3938207.6363636362, + "logps/chosen": -414.87094350961536, + "logps/rejected": -445.5094549005682, + "loss": 0.0719, + "rewards/chosen": 5.019219031700721, + "rewards/margins": 13.122806095576784, + "rewards/rejected": -8.103587063876065, + "step": 1735 + }, + { + "epoch": 0.47581197752501025, + "grad_norm": 4.34375, + "kl": 7.778056621551514, + "learning_rate": 5e-06, + "logits/chosen": -27889886.11764706, + "logits/rejected": -37008740.571428575, + "logps/chosen": -440.0104549632353, + "logps/rejected": -332.8716517857143, + "loss": 0.027, + "rewards/chosen": 5.8062842873966, + "rewards/margins": 13.563672811043363, + "rewards/rejected": -7.757388523646763, + "step": 1736 + }, + { + "epoch": 0.47608606276552007, + "grad_norm": 18.125, + "kl": 6.1865034103393555, + "learning_rate": 5e-06, + "logits/chosen": -14725274.666666666, + "logits/rejected": -17034149.333333332, + "logps/chosen": -513.524658203125, + "logps/rejected": -441.9706217447917, + "loss": 0.077, + "rewards/chosen": 5.434621810913086, + "rewards/margins": 11.689544677734375, + "rewards/rejected": -6.254922866821289, + "step": 1737 + }, + { + "epoch": 0.4763601480060299, + "grad_norm": 5.71875, + "kl": 0.34301885962486267, + "learning_rate": 5e-06, + "logits/chosen": -27259341.333333332, + "logits/rejected": -5174554.0, + "logps/chosen": -477.2246907552083, + "logps/rejected": -479.990478515625, + "loss": 0.0211, + "rewards/chosen": 6.786771138509114, + "rewards/margins": 15.512872695922852, + "rewards/rejected": -8.726101557413736, + "step": 1738 + }, + { + "epoch": 0.4766342332465397, + "grad_norm": 5.625, + "kl": 7.602664947509766, + "learning_rate": 5e-06, + "logits/chosen": -20719444.0, + "logits/rejected": -21218222.0, + "logps/chosen": -430.06549072265625, + "logps/rejected": -493.4507751464844, + "loss": 0.0249, + "rewards/chosen": 6.737005710601807, + "rewards/margins": 14.590097904205322, + "rewards/rejected": -7.853092193603516, + "step": 1739 + }, + { + "epoch": 0.47690831848704945, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6343762.857142857, + "logits/rejected": -14142745.6, + "logps/chosen": -425.992431640625, + "logps/rejected": -544.41064453125, + "loss": 0.0467, + "rewards/chosen": 6.036842891148159, + "rewards/margins": 17.258165522984097, + "rewards/rejected": -11.221322631835937, + "step": 1740 + }, + { + "epoch": 0.47718240372755927, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30742061.333333332, + "logits/rejected": 7463903.333333333, + "logps/chosen": -431.1814778645833, + "logps/rejected": -460.7071533203125, + "loss": 0.0377, + "rewards/chosen": 7.320431391398112, + "rewards/margins": 16.227760950724285, + "rewards/rejected": -8.907329559326172, + "step": 1741 + }, + { + "epoch": 0.4774564889680691, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29444497.454545453, + "logits/rejected": -16265868.307692308, + "logps/chosen": -418.0491832386364, + "logps/rejected": -636.1583533653846, + "loss": 0.0376, + "rewards/chosen": 6.305139021439985, + "rewards/margins": 16.02702059445681, + "rewards/rejected": -9.721881573016827, + "step": 1742 + }, + { + "epoch": 0.4777305742085789, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7733428.666666667, + "logits/rejected": -14062988.0, + "logps/chosen": -447.41259765625, + "logps/rejected": -624.1127522786459, + "loss": 0.048, + "rewards/chosen": 5.816048940022786, + "rewards/margins": 18.700150807698567, + "rewards/rejected": -12.884101867675781, + "step": 1743 + }, + { + "epoch": 0.47800465944908865, + "grad_norm": 7.8125, + "kl": 0.2375590056180954, + "learning_rate": 5e-06, + "logits/chosen": -23543787.2, + "logits/rejected": -20054875.42857143, + "logps/chosen": -526.951171875, + "logps/rejected": -485.21714564732144, + "loss": 0.0164, + "rewards/chosen": 6.54974136352539, + "rewards/margins": 15.982974352155413, + "rewards/rejected": -9.433232988630023, + "step": 1744 + }, + { + "epoch": 0.47827874468959847, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19345233.454545453, + "logits/rejected": -4492041.230769231, + "logps/chosen": -400.50346235795456, + "logps/rejected": -653.8682391826923, + "loss": 0.0462, + "rewards/chosen": 6.657537286931818, + "rewards/margins": 16.357398079825447, + "rewards/rejected": -9.69986079289363, + "step": 1745 + }, + { + "epoch": 0.4785528299301083, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38074478.76923077, + "logits/rejected": -26336680.727272727, + "logps/chosen": -431.2440655048077, + "logps/rejected": -668.8501864346591, + "loss": 0.0314, + "rewards/chosen": 6.641356248121995, + "rewards/margins": 16.61587961903819, + "rewards/rejected": -9.974523370916193, + "step": 1746 + }, + { + "epoch": 0.47882691517061804, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18870440.0, + "logits/rejected": -28200805.333333332, + "logps/chosen": -385.0126139322917, + "logps/rejected": -773.9786783854166, + "loss": 0.0197, + "rewards/chosen": 6.617798487345378, + "rewards/margins": 18.33506202697754, + "rewards/rejected": -11.717263539632162, + "step": 1747 + }, + { + "epoch": 0.47910100041112785, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3567980.4444444445, + "logits/rejected": -21074660.266666666, + "logps/chosen": -455.39659288194446, + "logps/rejected": -386.39765625, + "loss": 0.0201, + "rewards/chosen": 4.895331064860026, + "rewards/margins": 14.299363454182942, + "rewards/rejected": -9.404032389322916, + "step": 1748 + }, + { + "epoch": 0.47937508565163767, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7929053.866666666, + "logits/rejected": -23250912.0, + "logps/chosen": -577.4915364583334, + "logps/rejected": -428.67369249131946, + "loss": 0.0204, + "rewards/chosen": 6.82713623046875, + "rewards/margins": 15.856824747721355, + "rewards/rejected": -9.029688517252604, + "step": 1749 + }, + { + "epoch": 0.4796491708921475, + "grad_norm": 6.1875, + "kl": 12.175565719604492, + "learning_rate": 5e-06, + "logits/chosen": 7249922.823529412, + "logits/rejected": -17282372.57142857, + "logps/chosen": -529.9791475183823, + "logps/rejected": -406.8190220424107, + "loss": 0.079, + "rewards/chosen": 6.499926847570083, + "rewards/margins": 15.574907447109702, + "rewards/rejected": -9.07498059953962, + "step": 1750 + }, + { + "epoch": 0.47992325613265724, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17163776.0, + "logits/rejected": -9804270.76923077, + "logps/chosen": -309.0297185724432, + "logps/rejected": -609.0321138822115, + "loss": 0.0272, + "rewards/chosen": 5.039549047296697, + "rewards/margins": 16.950825057663284, + "rewards/rejected": -11.911276010366587, + "step": 1751 + }, + { + "epoch": 0.48019734137316705, + "grad_norm": 3.265625, + "kl": 0.030724843963980675, + "learning_rate": 5e-06, + "logits/chosen": -9821353.142857144, + "logits/rejected": -45598825.6, + "logps/chosen": -444.2739955357143, + "logps/rejected": -527.3685546875, + "loss": 0.0105, + "rewards/chosen": 5.94829341343471, + "rewards/margins": 15.90647212437221, + "rewards/rejected": -9.9581787109375, + "step": 1752 + }, + { + "epoch": 0.48047142661367687, + "grad_norm": 4.125, + "kl": 2.2532706260681152, + "learning_rate": 5e-06, + "logits/chosen": -38587063.46666667, + "logits/rejected": -11429121.777777778, + "logps/chosen": -438.7443033854167, + "logps/rejected": -459.2112630208333, + "loss": 0.0426, + "rewards/chosen": 6.2213389078776045, + "rewards/margins": 16.98258548312717, + "rewards/rejected": -10.761246575249565, + "step": 1753 + }, + { + "epoch": 0.4807455118541866, + "grad_norm": 7.84375, + "kl": 9.252123832702637, + "learning_rate": 5e-06, + "logits/chosen": 12737844.0, + "logits/rejected": -39653834.666666664, + "logps/chosen": -380.3117268880208, + "logps/rejected": -506.115234375, + "loss": 0.0804, + "rewards/chosen": 7.022637685139974, + "rewards/margins": 16.828820546468098, + "rewards/rejected": -9.806182861328125, + "step": 1754 + }, + { + "epoch": 0.48101959709469644, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12432654.4, + "logits/rejected": 66974898.28571428, + "logps/chosen": -449.4849609375, + "logps/rejected": -510.22813197544644, + "loss": 0.0301, + "rewards/chosen": 6.58228759765625, + "rewards/margins": 17.989154488699775, + "rewards/rejected": -11.406866891043526, + "step": 1755 + }, + { + "epoch": 0.48129368233520625, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49823093.333333336, + "logits/rejected": -17027302.666666668, + "logps/chosen": -458.7109781901042, + "logps/rejected": -493.8128662109375, + "loss": 0.0555, + "rewards/chosen": 5.439146677652995, + "rewards/margins": 13.929757436116535, + "rewards/rejected": -8.490610758463541, + "step": 1756 + }, + { + "epoch": 0.48156776757571607, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19180732.0, + "logits/rejected": -19656290.0, + "logps/chosen": -355.6199951171875, + "logps/rejected": -549.4818115234375, + "loss": 0.0319, + "rewards/chosen": 5.0023698806762695, + "rewards/margins": 15.163084983825684, + "rewards/rejected": -10.160715103149414, + "step": 1757 + }, + { + "epoch": 0.4818418528162258, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11025736.888888888, + "logits/rejected": -28984834.133333333, + "logps/chosen": -550.59619140625, + "logps/rejected": -455.3085611979167, + "loss": 0.0133, + "rewards/chosen": 9.22160169813368, + "rewards/margins": 17.972566053602428, + "rewards/rejected": -8.75096435546875, + "step": 1758 + }, + { + "epoch": 0.48211593805673564, + "grad_norm": 2.78125, + "kl": 1.1189759969711304, + "learning_rate": 5e-06, + "logits/chosen": -8539960.0, + "logits/rejected": -20810587.42857143, + "logps/chosen": -566.9564453125, + "logps/rejected": -624.3837890625, + "loss": 0.0088, + "rewards/chosen": 6.439105224609375, + "rewards/margins": 16.1263185773577, + "rewards/rejected": -9.687213352748326, + "step": 1759 + }, + { + "epoch": 0.48239002329724545, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7294793.714285715, + "logits/rejected": -12141376.8, + "logps/chosen": -405.24874441964283, + "logps/rejected": -390.2250732421875, + "loss": 0.0319, + "rewards/chosen": 6.4775559561593195, + "rewards/margins": 14.549434770856585, + "rewards/rejected": -8.071878814697266, + "step": 1760 + }, + { + "epoch": 0.48266410853775527, + "grad_norm": 7.6875, + "kl": 10.363985061645508, + "learning_rate": 5e-06, + "logits/chosen": 1193031.0, + "logits/rejected": -20749960.0, + "logps/chosen": -396.3492736816406, + "logps/rejected": -483.1043701171875, + "loss": 0.0356, + "rewards/chosen": 7.0453901290893555, + "rewards/margins": 18.367045402526855, + "rewards/rejected": -11.3216552734375, + "step": 1761 + }, + { + "epoch": 0.482938193778265, + "grad_norm": 9.5, + "kl": 5.896111488342285, + "learning_rate": 5e-06, + "logits/chosen": -33080827.076923076, + "logits/rejected": 1285000.7272727273, + "logps/chosen": -448.76600060096155, + "logps/rejected": -612.9714133522727, + "loss": 0.0235, + "rewards/chosen": 6.746042691744291, + "rewards/margins": 16.744232444496422, + "rewards/rejected": -9.99818975275213, + "step": 1762 + }, + { + "epoch": 0.48321227901877484, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38528996.92307692, + "logits/rejected": -2705815.6363636362, + "logps/chosen": -413.0025165264423, + "logps/rejected": -679.2234108664773, + "loss": 0.0464, + "rewards/chosen": 5.77316166804387, + "rewards/margins": 20.41125509622214, + "rewards/rejected": -14.638093428178268, + "step": 1763 + }, + { + "epoch": 0.48348636425928465, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6499113.333333333, + "logits/rejected": -36666650.666666664, + "logps/chosen": -427.7484130859375, + "logps/rejected": -567.00634765625, + "loss": 0.0113, + "rewards/chosen": 6.396520614624023, + "rewards/margins": 16.856106440226235, + "rewards/rejected": -10.459585825602213, + "step": 1764 + }, + { + "epoch": 0.4837604494997944, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13893102.857142856, + "logits/rejected": -33835027.2, + "logps/chosen": -354.59437779017856, + "logps/rejected": -492.558544921875, + "loss": 0.032, + "rewards/chosen": 6.822902134486607, + "rewards/margins": 16.611228397914342, + "rewards/rejected": -9.788326263427734, + "step": 1765 + }, + { + "epoch": 0.4840345347403042, + "grad_norm": 4.125, + "kl": 5.847678184509277, + "learning_rate": 5e-06, + "logits/chosen": -12912740.363636363, + "logits/rejected": -4621408.615384615, + "logps/chosen": -437.208984375, + "logps/rejected": -496.39693509615387, + "loss": 0.0145, + "rewards/chosen": 7.201310591264204, + "rewards/margins": 15.007785717090528, + "rewards/rejected": -7.8064751258263225, + "step": 1766 + }, + { + "epoch": 0.48430861998081404, + "grad_norm": 12.0625, + "kl": 8.803018569946289, + "learning_rate": 5e-06, + "logits/chosen": -13636448.888888888, + "logits/rejected": 81331968.0, + "logps/chosen": -382.5050998263889, + "logps/rejected": -639.1169026692709, + "loss": 0.0739, + "rewards/chosen": 5.821333991156684, + "rewards/margins": 20.38214662339952, + "rewards/rejected": -14.560812632242838, + "step": 1767 + }, + { + "epoch": 0.48458270522132385, + "grad_norm": 4.5625, + "kl": 3.1757960319519043, + "learning_rate": 5e-06, + "logits/chosen": -17400568.533333335, + "logits/rejected": -15886288.0, + "logps/chosen": -410.6228515625, + "logps/rejected": -468.07183159722223, + "loss": 0.0281, + "rewards/chosen": 6.762692260742187, + "rewards/margins": 18.001434665256077, + "rewards/rejected": -11.23874240451389, + "step": 1768 + }, + { + "epoch": 0.4848567904618336, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13196359.272727273, + "logits/rejected": -30540105.846153848, + "logps/chosen": -594.3966175426136, + "logps/rejected": -680.6549729567307, + "loss": 0.0277, + "rewards/chosen": 5.919022993607954, + "rewards/margins": 17.836759340513, + "rewards/rejected": -11.917736346905048, + "step": 1769 + }, + { + "epoch": 0.4851308757023434, + "grad_norm": 9.75, + "kl": 1.8847808837890625, + "learning_rate": 5e-06, + "logits/chosen": -27459899.076923076, + "logits/rejected": -9223883.636363637, + "logps/chosen": -409.4921875, + "logps/rejected": -722.6735174005681, + "loss": 0.0309, + "rewards/chosen": 6.713321392352764, + "rewards/margins": 19.581411935232737, + "rewards/rejected": -12.86809054287997, + "step": 1770 + }, + { + "epoch": 0.48540496094285324, + "grad_norm": 8.8125, + "kl": 9.634231567382812, + "learning_rate": 5e-06, + "logits/chosen": -8032254.769230769, + "logits/rejected": -12141349.818181818, + "logps/chosen": -411.1780348557692, + "logps/rejected": -578.5405717329545, + "loss": 0.0451, + "rewards/chosen": 7.250727726862981, + "rewards/margins": 17.8469330047394, + "rewards/rejected": -10.59620527787642, + "step": 1771 + }, + { + "epoch": 0.48567904618336305, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7173357.714285715, + "logits/rejected": -46676156.23529412, + "logps/chosen": -351.55392020089283, + "logps/rejected": -469.9995978860294, + "loss": 0.0299, + "rewards/chosen": 5.535685947963169, + "rewards/margins": 15.6855694426208, + "rewards/rejected": -10.14988349465763, + "step": 1772 + }, + { + "epoch": 0.4859531314238728, + "grad_norm": 17.0, + "kl": 19.262712478637695, + "learning_rate": 5e-06, + "logits/chosen": -13251339.294117646, + "logits/rejected": -10916339.42857143, + "logps/chosen": -394.5244140625, + "logps/rejected": -448.7244349888393, + "loss": 0.1239, + "rewards/chosen": 6.423341638901654, + "rewards/margins": 14.032816910944064, + "rewards/rejected": -7.609475272042411, + "step": 1773 + }, + { + "epoch": 0.4862272166643826, + "grad_norm": 10.25, + "kl": 7.473109245300293, + "learning_rate": 5e-06, + "logits/chosen": -7601178.285714285, + "logits/rejected": 1772707.2, + "logps/chosen": -535.5997488839286, + "logps/rejected": -440.668896484375, + "loss": 0.0414, + "rewards/chosen": 7.758562360491071, + "rewards/margins": 14.14832409449986, + "rewards/rejected": -6.389761734008789, + "step": 1774 + }, + { + "epoch": 0.48650130190489244, + "grad_norm": 7.5625, + "kl": 12.39700984954834, + "learning_rate": 5e-06, + "logits/chosen": -21081348.57142857, + "logits/rejected": -32771580.8, + "logps/chosen": -434.640869140625, + "logps/rejected": -508.609619140625, + "loss": 0.0454, + "rewards/chosen": 6.209149496895926, + "rewards/margins": 15.97211946759905, + "rewards/rejected": -9.762969970703125, + "step": 1775 + }, + { + "epoch": 0.4867753871454022, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2236808.0, + "logits/rejected": -12231328.0, + "logps/chosen": -322.4861572265625, + "logps/rejected": -525.6944056919643, + "loss": 0.0164, + "rewards/chosen": 6.618618774414062, + "rewards/margins": 14.929071916852678, + "rewards/rejected": -8.310453142438616, + "step": 1776 + }, + { + "epoch": 0.487049472385912, + "grad_norm": 7.6875, + "kl": 1.1159381866455078, + "learning_rate": 5e-06, + "logits/chosen": -3692841.714285714, + "logits/rejected": -20519174.4, + "logps/chosen": -451.77197265625, + "logps/rejected": -591.741748046875, + "loss": 0.0373, + "rewards/chosen": 6.078469957624163, + "rewards/margins": 16.16288768223354, + "rewards/rejected": -10.084417724609375, + "step": 1777 + }, + { + "epoch": 0.4873235576264218, + "grad_norm": 5.71875, + "kl": 0.987372636795044, + "learning_rate": 5e-06, + "logits/chosen": -21940395.2, + "logits/rejected": 14402618.285714285, + "logps/chosen": -520.73564453125, + "logps/rejected": -431.87437220982144, + "loss": 0.0464, + "rewards/chosen": 6.190509414672851, + "rewards/margins": 12.403749356951032, + "rewards/rejected": -6.2132399422781805, + "step": 1778 + }, + { + "epoch": 0.48759764286693164, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7143600.571428572, + "logits/rejected": -24578701.17647059, + "logps/chosen": -426.01011439732144, + "logps/rejected": -404.3736213235294, + "loss": 0.0508, + "rewards/chosen": 6.981184823172433, + "rewards/margins": 15.379625913475742, + "rewards/rejected": -8.398441090303308, + "step": 1779 + }, + { + "epoch": 0.4878717281074414, + "grad_norm": 6.34375, + "kl": 4.548999786376953, + "learning_rate": 5e-06, + "logits/chosen": -7529917.714285715, + "logits/rejected": 2496992.0, + "logps/chosen": -464.6592494419643, + "logps/rejected": -700.513720703125, + "loss": 0.0222, + "rewards/chosen": 7.1822509765625, + "rewards/margins": 17.972817993164064, + "rewards/rejected": -10.790567016601562, + "step": 1780 + }, + { + "epoch": 0.4881458133479512, + "grad_norm": 15.625, + "kl": 8.042892456054688, + "learning_rate": 5e-06, + "logits/chosen": -18310933.714285713, + "logits/rejected": -10444678.4, + "logps/chosen": -365.3692103794643, + "logps/rejected": -464.9951171875, + "loss": 0.0922, + "rewards/chosen": 6.2843813214983255, + "rewards/margins": 12.628118351527622, + "rewards/rejected": -6.343737030029297, + "step": 1781 + }, + { + "epoch": 0.488419898588461, + "grad_norm": 6.1875, + "kl": 10.517126083374023, + "learning_rate": 5e-06, + "logits/chosen": -28771372.307692308, + "logits/rejected": -40340928.0, + "logps/chosen": -429.46371694711536, + "logps/rejected": -431.26509232954544, + "loss": 0.0393, + "rewards/chosen": 7.775269728440505, + "rewards/margins": 14.741013746995193, + "rewards/rejected": -6.9657440185546875, + "step": 1782 + }, + { + "epoch": 0.48869398382897083, + "grad_norm": 1.53125, + "kl": 2.7481181621551514, + "learning_rate": 5e-06, + "logits/chosen": -2861492.4444444445, + "logits/rejected": 30862222.933333334, + "logps/chosen": -523.4061957465278, + "logps/rejected": -472.4748046875, + "loss": 0.0066, + "rewards/chosen": 8.23838636610243, + "rewards/margins": 17.44798312717014, + "rewards/rejected": -9.209596761067708, + "step": 1783 + }, + { + "epoch": 0.4889680690694806, + "grad_norm": 10.0, + "kl": 0.05387115478515625, + "learning_rate": 5e-06, + "logits/chosen": 21088487.384615384, + "logits/rejected": -26625960.727272727, + "logps/chosen": -385.3213641826923, + "logps/rejected": -413.4191228693182, + "loss": 0.0369, + "rewards/chosen": 7.828392615685096, + "rewards/margins": 14.475603170328206, + "rewards/rejected": -6.64721055464311, + "step": 1784 + }, + { + "epoch": 0.4892421543099904, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19190988.8, + "logits/rejected": -18204683.42857143, + "logps/chosen": -428.993505859375, + "logps/rejected": -457.1895228794643, + "loss": 0.0498, + "rewards/chosen": 6.439765167236328, + "rewards/margins": 17.247156742640904, + "rewards/rejected": -10.807391575404576, + "step": 1785 + }, + { + "epoch": 0.4895162395505002, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7540512.0, + "logits/rejected": -24668721.454545453, + "logps/chosen": -470.34093299278845, + "logps/rejected": -514.8166725852273, + "loss": 0.0098, + "rewards/chosen": 7.193491422213041, + "rewards/margins": 18.36974569467398, + "rewards/rejected": -11.176254272460938, + "step": 1786 + }, + { + "epoch": 0.48979032479101, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30974768.0, + "logits/rejected": -25107849.14285714, + "logps/chosen": -420.82568359375, + "logps/rejected": -497.42759486607144, + "loss": 0.0102, + "rewards/chosen": 6.145232391357422, + "rewards/margins": 14.967936161586216, + "rewards/rejected": -8.822703770228795, + "step": 1787 + }, + { + "epoch": 0.4900644100315198, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20968493.333333332, + "logits/rejected": -5345209.333333333, + "logps/chosen": -519.5088704427084, + "logps/rejected": -413.2561848958333, + "loss": 0.046, + "rewards/chosen": 7.387864430745442, + "rewards/margins": 15.872656504313152, + "rewards/rejected": -8.484792073567709, + "step": 1788 + }, + { + "epoch": 0.4903384952720296, + "grad_norm": 7.84375, + "kl": 15.06696891784668, + "learning_rate": 5e-06, + "logits/chosen": -15060384.0, + "logits/rejected": -25948043.42857143, + "logps/chosen": -516.8956227022059, + "logps/rejected": -639.4048549107143, + "loss": 0.0459, + "rewards/chosen": 7.561774758731618, + "rewards/margins": 19.642165688907397, + "rewards/rejected": -12.080390930175781, + "step": 1789 + }, + { + "epoch": 0.4906125805125394, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36242612.36363637, + "logits/rejected": -21226717.53846154, + "logps/chosen": -390.39204545454544, + "logps/rejected": -588.6441180889423, + "loss": 0.0089, + "rewards/chosen": 7.2524330832741475, + "rewards/margins": 17.29671643663953, + "rewards/rejected": -10.044283353365385, + "step": 1790 + }, + { + "epoch": 0.4908866657530492, + "grad_norm": 7.125, + "kl": 1.9116218090057373, + "learning_rate": 5e-06, + "logits/chosen": -14989486.666666666, + "logits/rejected": 42692237.333333336, + "logps/chosen": -478.9889729817708, + "logps/rejected": -443.2635904947917, + "loss": 0.0359, + "rewards/chosen": 5.837038675944011, + "rewards/margins": 12.345006942749023, + "rewards/rejected": -6.507968266805013, + "step": 1791 + }, + { + "epoch": 0.491160750993559, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53935402.666666664, + "logits/rejected": -34930513.06666667, + "logps/chosen": -539.0223524305555, + "logps/rejected": -579.10078125, + "loss": 0.0213, + "rewards/chosen": 8.24419911702474, + "rewards/margins": 21.686575826009115, + "rewards/rejected": -13.442376708984375, + "step": 1792 + }, + { + "epoch": 0.4914348362340688, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13299793.0, + "logits/rejected": 43039984.0, + "logps/chosen": -510.55133056640625, + "logps/rejected": -688.0726318359375, + "loss": 0.0376, + "rewards/chosen": 6.17347526550293, + "rewards/margins": 17.769323348999023, + "rewards/rejected": -11.595848083496094, + "step": 1793 + }, + { + "epoch": 0.4917089214745786, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8955514.181818182, + "logits/rejected": -20371619.692307692, + "logps/chosen": -387.78346946022725, + "logps/rejected": -542.4397911658654, + "loss": 0.0375, + "rewards/chosen": 6.8514321067116475, + "rewards/margins": 15.744498512961648, + "rewards/rejected": -8.89306640625, + "step": 1794 + }, + { + "epoch": 0.4919830067150884, + "grad_norm": 4.9375, + "kl": 6.049198150634766, + "learning_rate": 5e-06, + "logits/chosen": -20064402.666666668, + "logits/rejected": -35968629.333333336, + "logps/chosen": -456.1905110677083, + "logps/rejected": -389.5896402994792, + "loss": 0.0209, + "rewards/chosen": 7.572961171468099, + "rewards/margins": 15.456235249837238, + "rewards/rejected": -7.883274078369141, + "step": 1795 + }, + { + "epoch": 0.4922570919555982, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31097696.0, + "logits/rejected": -35302218.666666664, + "logps/chosen": -489.4864095052083, + "logps/rejected": -548.312744140625, + "loss": 0.0284, + "rewards/chosen": 7.743176142374675, + "rewards/margins": 16.94200897216797, + "rewards/rejected": -9.198832829793295, + "step": 1796 + }, + { + "epoch": 0.492531177196108, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32802934.4, + "logits/rejected": -19181040.842105262, + "logps/chosen": -374.7264892578125, + "logps/rejected": -571.057462993421, + "loss": 0.0498, + "rewards/chosen": 4.924801635742187, + "rewards/margins": 16.165395796926397, + "rewards/rejected": -11.24059416118421, + "step": 1797 + }, + { + "epoch": 0.49280526243661776, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7968988.8, + "logits/rejected": 7498640.0, + "logps/chosen": -387.3123372395833, + "logps/rejected": -680.4517144097222, + "loss": 0.014, + "rewards/chosen": 6.274061075846354, + "rewards/margins": 21.725726318359374, + "rewards/rejected": -15.451665242513021, + "step": 1798 + }, + { + "epoch": 0.4930793476771276, + "grad_norm": 2.421875, + "kl": 3.057717800140381, + "learning_rate": 5e-06, + "logits/chosen": -25781888.0, + "logits/rejected": -29800940.0, + "logps/chosen": -564.26123046875, + "logps/rejected": -495.64862060546875, + "loss": 0.0059, + "rewards/chosen": 7.116621017456055, + "rewards/margins": 19.313379287719727, + "rewards/rejected": -12.196758270263672, + "step": 1799 + }, + { + "epoch": 0.4933534329176374, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22772544.0, + "logits/rejected": -28937643.42857143, + "logps/chosen": -543.90908203125, + "logps/rejected": -584.10205078125, + "loss": 0.0106, + "rewards/chosen": 6.895449829101563, + "rewards/margins": 16.569644165039062, + "rewards/rejected": -9.6741943359375, + "step": 1800 + }, + { + "epoch": 0.4936275181581472, + "grad_norm": 14.6875, + "kl": 3.2846522331237793, + "learning_rate": 5e-06, + "logits/chosen": -39731185.777777776, + "logits/rejected": -21724569.6, + "logps/chosen": -345.7128634982639, + "logps/rejected": -624.5638671875, + "loss": 0.0218, + "rewards/chosen": 5.8255157470703125, + "rewards/margins": 17.99734903971354, + "rewards/rejected": -12.17183329264323, + "step": 1801 + }, + { + "epoch": 0.49390160339865696, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11421006.545454545, + "logits/rejected": -22220199.384615384, + "logps/chosen": -312.6653941761364, + "logps/rejected": -539.0269681490385, + "loss": 0.0476, + "rewards/chosen": 4.285158677534624, + "rewards/margins": 17.13784563291323, + "rewards/rejected": -12.852686955378605, + "step": 1802 + }, + { + "epoch": 0.4941756886391668, + "grad_norm": 15.5625, + "kl": 5.812921047210693, + "learning_rate": 5e-06, + "logits/chosen": -19305181.866666667, + "logits/rejected": -24361299.555555556, + "logps/chosen": -375.70777994791666, + "logps/rejected": -548.1682400173611, + "loss": 0.0537, + "rewards/chosen": 5.77024180094401, + "rewards/margins": 17.155239698621962, + "rewards/rejected": -11.384997897677952, + "step": 1803 + }, + { + "epoch": 0.4944497738796766, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11050916.0, + "logits/rejected": -16396470.0, + "logps/chosen": -387.3955078125, + "logps/rejected": -863.58935546875, + "loss": 0.0137, + "rewards/chosen": 5.3177032470703125, + "rewards/margins": 20.287123680114746, + "rewards/rejected": -14.969420433044434, + "step": 1804 + }, + { + "epoch": 0.4947238591201864, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18526462.666666668, + "logits/rejected": 16376365.333333334, + "logps/chosen": -435.9605712890625, + "logps/rejected": -731.5984700520834, + "loss": 0.052, + "rewards/chosen": 6.001984278361003, + "rewards/margins": 20.137678146362305, + "rewards/rejected": -14.135693868001303, + "step": 1805 + }, + { + "epoch": 0.49499794436069616, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8786562.666666666, + "logits/rejected": -13024266.666666666, + "logps/chosen": -383.3684488932292, + "logps/rejected": -669.991943359375, + "loss": 0.0649, + "rewards/chosen": 4.987113952636719, + "rewards/margins": 15.555606842041016, + "rewards/rejected": -10.568492889404297, + "step": 1806 + }, + { + "epoch": 0.495272029601206, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18776821.333333332, + "logits/rejected": -3645256.8, + "logps/chosen": -324.6200358072917, + "logps/rejected": -497.0045572916667, + "loss": 0.0622, + "rewards/chosen": 5.783656650119358, + "rewards/margins": 15.637192620171442, + "rewards/rejected": -9.853535970052084, + "step": 1807 + }, + { + "epoch": 0.4955461148417158, + "grad_norm": 3.734375, + "kl": 6.440056800842285, + "learning_rate": 5e-06, + "logits/chosen": -23790279.384615384, + "logits/rejected": -27859159.272727273, + "logps/chosen": -439.1162860576923, + "logps/rejected": -411.94340376420456, + "loss": 0.0125, + "rewards/chosen": 6.818398695725661, + "rewards/margins": 17.535454970139725, + "rewards/rejected": -10.717056274414062, + "step": 1808 + }, + { + "epoch": 0.49582020008222555, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14330466.909090908, + "logits/rejected": -14577104.0, + "logps/chosen": -322.2854669744318, + "logps/rejected": -451.2936823918269, + "loss": 0.0296, + "rewards/chosen": 6.235917524857954, + "rewards/margins": 14.114596653651525, + "rewards/rejected": -7.87867912879357, + "step": 1809 + }, + { + "epoch": 0.49609428532273536, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9922471.111111112, + "logits/rejected": -10870819.2, + "logps/chosen": -550.3799370659722, + "logps/rejected": -557.6625, + "loss": 0.0032, + "rewards/chosen": 6.913510216606988, + "rewards/margins": 16.32842517428928, + "rewards/rejected": -9.414914957682292, + "step": 1810 + }, + { + "epoch": 0.4963683705632452, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4732532.4, + "logits/rejected": -28183858.285714287, + "logps/chosen": -458.14384765625, + "logps/rejected": -460.00791713169644, + "loss": 0.0268, + "rewards/chosen": 6.871623992919922, + "rewards/margins": 15.24711172921317, + "rewards/rejected": -8.375487736293248, + "step": 1811 + }, + { + "epoch": 0.496642455803755, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20855200.0, + "logits/rejected": -24575352.0, + "logps/chosen": -459.9456787109375, + "logps/rejected": -548.0222574869791, + "loss": 0.0477, + "rewards/chosen": 5.77895991007487, + "rewards/margins": 18.87765884399414, + "rewards/rejected": -13.098698933919271, + "step": 1812 + }, + { + "epoch": 0.49691654104426475, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29765326.769230768, + "logits/rejected": -23818696.727272727, + "logps/chosen": -489.3439378004808, + "logps/rejected": -600.7195933948864, + "loss": 0.0174, + "rewards/chosen": 7.148534334622896, + "rewards/margins": 15.96160584563142, + "rewards/rejected": -8.813071511008523, + "step": 1813 + }, + { + "epoch": 0.49719062628477456, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14640888.615384616, + "logits/rejected": -29845120.0, + "logps/chosen": -329.1374323918269, + "logps/rejected": -432.92041015625, + "loss": 0.0343, + "rewards/chosen": 7.258965125450721, + "rewards/margins": 16.513166841093476, + "rewards/rejected": -9.254201715642756, + "step": 1814 + }, + { + "epoch": 0.4974647115252844, + "grad_norm": 1.3671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24367556.0, + "logits/rejected": -19287996.0, + "logps/chosen": -468.44464111328125, + "logps/rejected": -712.5546875, + "loss": 0.0042, + "rewards/chosen": 6.6940789222717285, + "rewards/margins": 19.19316816329956, + "rewards/rejected": -12.499089241027832, + "step": 1815 + }, + { + "epoch": 0.4977387967657942, + "grad_norm": 11.625, + "kl": 1.2004716396331787, + "learning_rate": 5e-06, + "logits/chosen": -10769569.142857144, + "logits/rejected": -22461260.8, + "logps/chosen": -488.51607840401783, + "logps/rejected": -621.862890625, + "loss": 0.0554, + "rewards/chosen": 6.690207890101841, + "rewards/margins": 14.356658390590123, + "rewards/rejected": -7.666450500488281, + "step": 1816 + }, + { + "epoch": 0.49801288200630395, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8668510.0, + "logits/rejected": -32930832.0, + "logps/chosen": -418.108154296875, + "logps/rejected": -550.1099853515625, + "loss": 0.0533, + "rewards/chosen": 5.765256881713867, + "rewards/margins": 16.202472686767578, + "rewards/rejected": -10.437215805053711, + "step": 1817 + }, + { + "epoch": 0.49828696724681376, + "grad_norm": 1.578125, + "kl": 1.611250638961792, + "learning_rate": 5e-06, + "logits/chosen": -19865902.545454547, + "logits/rejected": -18625083.076923076, + "logps/chosen": -488.65482954545456, + "logps/rejected": -562.1663912259615, + "loss": 0.005, + "rewards/chosen": 6.486215764825994, + "rewards/margins": 16.060727366200695, + "rewards/rejected": -9.574511601374699, + "step": 1818 + }, + { + "epoch": 0.4985610524873236, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20944004.0, + "logits/rejected": -3264879.75, + "logps/chosen": -443.02392578125, + "logps/rejected": -600.4041748046875, + "loss": 0.0034, + "rewards/chosen": 6.1015753746032715, + "rewards/margins": 17.85908079147339, + "rewards/rejected": -11.757505416870117, + "step": 1819 + }, + { + "epoch": 0.49883513772783333, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21839620.363636363, + "logits/rejected": -16965442.46153846, + "logps/chosen": -495.56547407670456, + "logps/rejected": -526.857421875, + "loss": 0.0337, + "rewards/chosen": 6.907652421431108, + "rewards/margins": 15.751348428792888, + "rewards/rejected": -8.843696007361778, + "step": 1820 + }, + { + "epoch": 0.49910922296834315, + "grad_norm": 6.125, + "kl": 2.268970012664795, + "learning_rate": 5e-06, + "logits/chosen": -17550619.42857143, + "logits/rejected": -15596236.8, + "logps/chosen": -361.1845005580357, + "logps/rejected": -401.243994140625, + "loss": 0.0539, + "rewards/chosen": 5.743246895926339, + "rewards/margins": 14.230321175711495, + "rewards/rejected": -8.487074279785157, + "step": 1821 + }, + { + "epoch": 0.49938330820885296, + "grad_norm": 7.03125, + "kl": 0.5063247680664062, + "learning_rate": 5e-06, + "logits/chosen": 6362903.2727272725, + "logits/rejected": -6346172.307692308, + "logps/chosen": -344.89262251420456, + "logps/rejected": -632.1823167067307, + "loss": 0.038, + "rewards/chosen": 5.135928414084694, + "rewards/margins": 16.576504820710294, + "rewards/rejected": -11.4405764066256, + "step": 1822 + }, + { + "epoch": 0.4996573934493628, + "grad_norm": 7.625, + "kl": 1.1982269287109375, + "learning_rate": 5e-06, + "logits/chosen": -28694520.888888888, + "logits/rejected": -29844565.333333332, + "logps/chosen": -448.0064290364583, + "logps/rejected": -449.56569010416666, + "loss": 0.0278, + "rewards/chosen": 6.236981709798177, + "rewards/margins": 14.007654317220052, + "rewards/rejected": -7.770672607421875, + "step": 1823 + }, + { + "epoch": 0.49993147868987253, + "grad_norm": 6.40625, + "kl": 0.3127174377441406, + "learning_rate": 5e-06, + "logits/chosen": -13294824.888888888, + "logits/rejected": -15832359.466666667, + "logps/chosen": -490.8831380208333, + "logps/rejected": -418.34381510416665, + "loss": 0.0248, + "rewards/chosen": 6.886906517876519, + "rewards/margins": 15.282404157850479, + "rewards/rejected": -8.395497639973959, + "step": 1824 + }, + { + "epoch": 0.5002055639303824, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2244525.0, + "logits/rejected": 19706854.666666668, + "logps/chosen": -415.6193033854167, + "logps/rejected": -590.5792236328125, + "loss": 0.0834, + "rewards/chosen": 5.650423685709636, + "rewards/margins": 20.31116485595703, + "rewards/rejected": -14.660741170247396, + "step": 1825 + }, + { + "epoch": 0.5004796491708922, + "grad_norm": 1.15625, + "kl": 9.546701431274414, + "learning_rate": 5e-06, + "logits/chosen": -47386484.36363637, + "logits/rejected": -10039006.153846154, + "logps/chosen": -535.4002130681819, + "logps/rejected": -604.6746544471154, + "loss": 0.0031, + "rewards/chosen": 7.414317737926137, + "rewards/margins": 18.394547042313157, + "rewards/rejected": -10.98022930438702, + "step": 1826 + }, + { + "epoch": 0.5007537344114019, + "grad_norm": 5.90625, + "kl": 2.253997802734375, + "learning_rate": 5e-06, + "logits/chosen": -7143191.2727272725, + "logits/rejected": -27050880.0, + "logps/chosen": -314.0930841619318, + "logps/rejected": -633.3143780048077, + "loss": 0.0238, + "rewards/chosen": 5.48170956698331, + "rewards/margins": 16.622956656075857, + "rewards/rejected": -11.141247089092548, + "step": 1827 + }, + { + "epoch": 0.5010278196519118, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16392288.0, + "logits/rejected": -12536241.6, + "logps/chosen": -583.5862165178571, + "logps/rejected": -558.447802734375, + "loss": 0.0171, + "rewards/chosen": 7.005336216517857, + "rewards/margins": 17.807517460414342, + "rewards/rejected": -10.802181243896484, + "step": 1828 + }, + { + "epoch": 0.5013019048924215, + "grad_norm": 8.0625, + "kl": 4.29071569442749, + "learning_rate": 5e-06, + "logits/chosen": 4890479.333333333, + "logits/rejected": -13329170.666666666, + "logps/chosen": -426.7911376953125, + "logps/rejected": -521.0794270833334, + "loss": 0.0719, + "rewards/chosen": 6.906386057535808, + "rewards/margins": 14.318745930989584, + "rewards/rejected": -7.412359873453776, + "step": 1829 + }, + { + "epoch": 0.5015759901329313, + "grad_norm": 5.8125, + "kl": 5.934690475463867, + "learning_rate": 5e-06, + "logits/chosen": -19689520.94117647, + "logits/rejected": 8819801.714285715, + "logps/chosen": -429.3717256433824, + "logps/rejected": -534.5111607142857, + "loss": 0.0254, + "rewards/chosen": 7.7367383171530335, + "rewards/margins": 16.239664189955768, + "rewards/rejected": -8.502925872802734, + "step": 1830 + }, + { + "epoch": 0.5018500753734412, + "grad_norm": 5.125, + "kl": 1.1667487621307373, + "learning_rate": 5e-06, + "logits/chosen": -35719658.666666664, + "logits/rejected": 31622680.0, + "logps/chosen": -344.3074544270833, + "logps/rejected": -448.3525797526042, + "loss": 0.0239, + "rewards/chosen": 6.1328074137369795, + "rewards/margins": 15.52423604329427, + "rewards/rejected": -9.391428629557291, + "step": 1831 + }, + { + "epoch": 0.5021241606139509, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14899998.545454545, + "logits/rejected": -15318818.461538462, + "logps/chosen": -511.4967151988636, + "logps/rejected": -527.5749323918269, + "loss": 0.0257, + "rewards/chosen": 7.173296841708097, + "rewards/margins": 16.46032405399776, + "rewards/rejected": -9.287027212289663, + "step": 1832 + }, + { + "epoch": 0.5023982458544607, + "grad_norm": 8.75, + "kl": 1.407135009765625, + "learning_rate": 5e-06, + "logits/chosen": -24930925.333333332, + "logits/rejected": -23417162.666666668, + "logps/chosen": -497.2250162760417, + "logps/rejected": -557.2020670572916, + "loss": 0.0258, + "rewards/chosen": 7.69239616394043, + "rewards/margins": 17.03074073791504, + "rewards/rejected": -9.33834457397461, + "step": 1833 + }, + { + "epoch": 0.5026723310949706, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6636776.0, + "logits/rejected": 19669465.14285714, + "logps/chosen": -513.41708984375, + "logps/rejected": -584.7953055245536, + "loss": 0.0338, + "rewards/chosen": 7.859170532226562, + "rewards/margins": 17.113591221400668, + "rewards/rejected": -9.254420689174108, + "step": 1834 + }, + { + "epoch": 0.5029464163354803, + "grad_norm": 10.5625, + "kl": 7.7207560539245605, + "learning_rate": 5e-06, + "logits/chosen": -23108189.714285713, + "logits/rejected": -25421233.6, + "logps/chosen": -413.3676060267857, + "logps/rejected": -318.79267578125, + "loss": 0.0438, + "rewards/chosen": 7.418357304164341, + "rewards/margins": 13.943886021205357, + "rewards/rejected": -6.5255287170410154, + "step": 1835 + }, + { + "epoch": 0.5032205015759902, + "grad_norm": 12.875, + "kl": 16.2327880859375, + "learning_rate": 5e-06, + "logits/chosen": -12490603.42857143, + "logits/rejected": -5929892.4, + "logps/chosen": -421.4756556919643, + "logps/rejected": -540.295166015625, + "loss": 0.1043, + "rewards/chosen": 7.031656537737165, + "rewards/margins": 17.331101880754744, + "rewards/rejected": -10.299445343017577, + "step": 1836 + }, + { + "epoch": 0.5034945868165, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43066429.09090909, + "logits/rejected": -21591158.153846152, + "logps/chosen": -327.3441051136364, + "logps/rejected": -673.1984675480769, + "loss": 0.0207, + "rewards/chosen": 5.577524358575994, + "rewards/margins": 18.77634168504835, + "rewards/rejected": -13.198817326472355, + "step": 1837 + }, + { + "epoch": 0.5037686720570097, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9317134.4, + "logits/rejected": -4035559.1428571427, + "logps/chosen": -290.25556640625, + "logps/rejected": -327.90042550223217, + "loss": 0.0303, + "rewards/chosen": 5.409018707275391, + "rewards/margins": 14.109584045410156, + "rewards/rejected": -8.700565338134766, + "step": 1838 + }, + { + "epoch": 0.5040427572975196, + "grad_norm": 7.46875, + "kl": 6.564234733581543, + "learning_rate": 5e-06, + "logits/chosen": -16975453.866666667, + "logits/rejected": -28283537.777777776, + "logps/chosen": -426.565234375, + "logps/rejected": -578.0475260416666, + "loss": 0.0187, + "rewards/chosen": 7.375102742513021, + "rewards/margins": 16.98038330078125, + "rewards/rejected": -9.605280558268229, + "step": 1839 + }, + { + "epoch": 0.5043168425380293, + "grad_norm": 14.375, + "kl": 0.9146105647087097, + "learning_rate": 5e-06, + "logits/chosen": 5948198.153846154, + "logits/rejected": -15579632.0, + "logps/chosen": -311.1503342848558, + "logps/rejected": -407.27294921875, + "loss": 0.0637, + "rewards/chosen": 5.3130317101111775, + "rewards/margins": 11.816049482438947, + "rewards/rejected": -6.50301777232777, + "step": 1840 + }, + { + "epoch": 0.5045909277785391, + "grad_norm": 12.1875, + "kl": 7.996652603149414, + "learning_rate": 5e-06, + "logits/chosen": -8856080.666666666, + "logits/rejected": -10440989.333333334, + "logps/chosen": -432.0009765625, + "logps/rejected": -641.4774169921875, + "loss": 0.0385, + "rewards/chosen": 6.493803024291992, + "rewards/margins": 16.25209744771322, + "rewards/rejected": -9.758294423421225, + "step": 1841 + }, + { + "epoch": 0.504865013019049, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6377368.7272727275, + "logits/rejected": -19187670.153846152, + "logps/chosen": -388.5872247869318, + "logps/rejected": -531.1043419471154, + "loss": 0.0406, + "rewards/chosen": 6.680498296564275, + "rewards/margins": 15.271374949208505, + "rewards/rejected": -8.59087665264423, + "step": 1842 + }, + { + "epoch": 0.5051390982595587, + "grad_norm": 5.4375, + "kl": 11.320798873901367, + "learning_rate": 5e-06, + "logits/chosen": -21605917.53846154, + "logits/rejected": 12298122.181818182, + "logps/chosen": -489.14002403846155, + "logps/rejected": -511.2191051136364, + "loss": 0.0457, + "rewards/chosen": 7.774100083571214, + "rewards/margins": 16.787499381112053, + "rewards/rejected": -9.013399297540838, + "step": 1843 + }, + { + "epoch": 0.5054131835000685, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17401188.923076924, + "logits/rejected": 14707752.727272727, + "logps/chosen": -474.62015474759613, + "logps/rejected": -635.4600053267045, + "loss": 0.0125, + "rewards/chosen": 7.3022308349609375, + "rewards/margins": 18.247997630726207, + "rewards/rejected": -10.94576679576527, + "step": 1844 + }, + { + "epoch": 0.5056872687405783, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12476951.272727273, + "logits/rejected": -17058363.076923076, + "logps/chosen": -442.06893643465907, + "logps/rejected": -701.1896033653846, + "loss": 0.0136, + "rewards/chosen": 5.847994717684659, + "rewards/margins": 19.407363144667833, + "rewards/rejected": -13.559368426983173, + "step": 1845 + }, + { + "epoch": 0.5059613539810881, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37980704.0, + "logits/rejected": -19261977.6, + "logps/chosen": -546.6724175347222, + "logps/rejected": -561.6716145833333, + "loss": 0.0302, + "rewards/chosen": 6.490666283501519, + "rewards/margins": 15.987691836886935, + "rewards/rejected": -9.497025553385416, + "step": 1846 + }, + { + "epoch": 0.506235439221598, + "grad_norm": 8.6875, + "kl": 1.4716072082519531, + "learning_rate": 5e-06, + "logits/chosen": -13067552.0, + "logits/rejected": 18799556.923076924, + "logps/chosen": -441.9396306818182, + "logps/rejected": -680.6442307692307, + "loss": 0.0397, + "rewards/chosen": 7.6706626198508525, + "rewards/margins": 18.677318973141116, + "rewards/rejected": -11.006656353290264, + "step": 1847 + }, + { + "epoch": 0.5065095244621077, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5786814.4, + "logits/rejected": -21739744.0, + "logps/chosen": -410.9658203125, + "logps/rejected": -488.5762416294643, + "loss": 0.0084, + "rewards/chosen": 5.985995101928711, + "rewards/margins": 14.96154943193708, + "rewards/rejected": -8.97555433000837, + "step": 1848 + }, + { + "epoch": 0.5067836097026175, + "grad_norm": 4.625, + "kl": 1.3896090984344482, + "learning_rate": 5e-06, + "logits/chosen": -13775640.0, + "logits/rejected": -13976825.142857144, + "logps/chosen": -523.57451171875, + "logps/rejected": -486.962890625, + "loss": 0.0252, + "rewards/chosen": 8.149481964111327, + "rewards/margins": 17.0084362574986, + "rewards/rejected": -8.858954293387276, + "step": 1849 + }, + { + "epoch": 0.5070576949431274, + "grad_norm": 5.71875, + "kl": 3.7046284675598145, + "learning_rate": 5e-06, + "logits/chosen": -26023432.0, + "logits/rejected": -7549308.5, + "logps/chosen": -350.8645935058594, + "logps/rejected": -305.64801025390625, + "loss": 0.0538, + "rewards/chosen": 6.324007987976074, + "rewards/margins": 12.380768775939941, + "rewards/rejected": -6.056760787963867, + "step": 1850 + }, + { + "epoch": 0.5073317801836371, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31139502.545454547, + "logits/rejected": -18866414.769230768, + "logps/chosen": -493.5810546875, + "logps/rejected": -394.91800631009613, + "loss": 0.0071, + "rewards/chosen": 8.179509943181818, + "rewards/margins": 17.582653018978093, + "rewards/rejected": -9.403143075796274, + "step": 1851 + }, + { + "epoch": 0.5076058654241469, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2426065.8181818184, + "logits/rejected": -27305331.692307692, + "logps/chosen": -308.933837890625, + "logps/rejected": -464.7083082932692, + "loss": 0.0212, + "rewards/chosen": 5.084782340309837, + "rewards/margins": 13.309036068149378, + "rewards/rejected": -8.224253727839542, + "step": 1852 + }, + { + "epoch": 0.5078799506646567, + "grad_norm": 5.65625, + "kl": 5.825991630554199, + "learning_rate": 5e-06, + "logits/chosen": 2317210.4, + "logits/rejected": -9120069.714285715, + "logps/chosen": -682.869677734375, + "logps/rejected": -619.9910714285714, + "loss": 0.0101, + "rewards/chosen": 9.2813232421875, + "rewards/margins": 23.153476388113837, + "rewards/rejected": -13.872153145926339, + "step": 1853 + }, + { + "epoch": 0.5081540359051665, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24541768.727272727, + "logits/rejected": -29532908.307692308, + "logps/chosen": -340.01027610085225, + "logps/rejected": -621.6083984375, + "loss": 0.0316, + "rewards/chosen": 5.113174091685902, + "rewards/margins": 17.774576867377007, + "rewards/rejected": -12.661402775691105, + "step": 1854 + }, + { + "epoch": 0.5084281211456763, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1346028.6666666667, + "logits/rejected": -18049666.133333333, + "logps/chosen": -387.9413248697917, + "logps/rejected": -490.11474609375, + "loss": 0.0312, + "rewards/chosen": 5.639376322428386, + "rewards/margins": 16.667349751790365, + "rewards/rejected": -11.02797342936198, + "step": 1855 + }, + { + "epoch": 0.5087022063861861, + "grad_norm": 4.375, + "kl": 4.291277885437012, + "learning_rate": 5e-06, + "logits/chosen": -10434742.545454545, + "logits/rejected": -19137699.692307692, + "logps/chosen": -386.36399147727275, + "logps/rejected": -451.5416917067308, + "loss": 0.0231, + "rewards/chosen": 6.99334023215554, + "rewards/margins": 15.965567475432284, + "rewards/rejected": -8.972227243276743, + "step": 1856 + }, + { + "epoch": 0.5089762916266959, + "grad_norm": 7.5, + "kl": 4.054513454437256, + "learning_rate": 5e-06, + "logits/chosen": -16110304.0, + "logits/rejected": -15190083.692307692, + "logps/chosen": -493.87801846590907, + "logps/rejected": -483.08642578125, + "loss": 0.0248, + "rewards/chosen": 6.855175365101207, + "rewards/margins": 16.926067699085582, + "rewards/rejected": -10.070892333984375, + "step": 1857 + }, + { + "epoch": 0.5092503768672058, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29680240.0, + "logits/rejected": -13522218.0, + "logps/chosen": -397.03765869140625, + "logps/rejected": -555.6334228515625, + "loss": 0.0412, + "rewards/chosen": 5.07309627532959, + "rewards/margins": 13.647006034851074, + "rewards/rejected": -8.573909759521484, + "step": 1858 + }, + { + "epoch": 0.5095244621077155, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22144178.666666668, + "logits/rejected": -30529136.0, + "logps/chosen": -485.1957194010417, + "logps/rejected": -408.407470703125, + "loss": 0.0228, + "rewards/chosen": 7.819050470987956, + "rewards/margins": 18.65289878845215, + "rewards/rejected": -10.833848317464193, + "step": 1859 + }, + { + "epoch": 0.5097985473482253, + "grad_norm": 5.25, + "kl": 3.126948118209839, + "learning_rate": 5e-06, + "logits/chosen": -13707560.0, + "logits/rejected": -13848467.2, + "logps/chosen": -496.69029017857144, + "logps/rejected": -555.46171875, + "loss": 0.0185, + "rewards/chosen": 7.6481203351702005, + "rewards/margins": 18.640830448695592, + "rewards/rejected": -10.99271011352539, + "step": 1860 + }, + { + "epoch": 0.5100726325887351, + "grad_norm": 4.5625, + "kl": 2.2093162536621094, + "learning_rate": 5e-06, + "logits/chosen": -36180757.333333336, + "logits/rejected": -7962110.666666667, + "logps/chosen": -384.9470621744792, + "logps/rejected": -673.7259114583334, + "loss": 0.0504, + "rewards/chosen": 6.589188893636067, + "rewards/margins": 19.64163335164388, + "rewards/rejected": -13.052444458007812, + "step": 1861 + }, + { + "epoch": 0.5103467178292449, + "grad_norm": 10.1875, + "kl": 10.615885734558105, + "learning_rate": 5e-06, + "logits/chosen": -12669504.0, + "logits/rejected": -29217835.636363637, + "logps/chosen": -433.02982271634613, + "logps/rejected": -521.3946200284091, + "loss": 0.0404, + "rewards/chosen": 7.235099792480469, + "rewards/margins": 17.50708215886896, + "rewards/rejected": -10.271982366388494, + "step": 1862 + }, + { + "epoch": 0.5106208030697547, + "grad_norm": 7.65625, + "kl": 0.06767463684082031, + "learning_rate": 5e-06, + "logits/chosen": -196577.6, + "logits/rejected": -4355159.714285715, + "logps/chosen": -465.3033203125, + "logps/rejected": -381.4557407924107, + "loss": 0.0396, + "rewards/chosen": 7.2331291198730465, + "rewards/margins": 14.528933824811663, + "rewards/rejected": -7.295804704938616, + "step": 1863 + }, + { + "epoch": 0.5108948883102645, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24114486.0, + "logits/rejected": -29719798.0, + "logps/chosen": -408.23822021484375, + "logps/rejected": -499.03668212890625, + "loss": 0.0087, + "rewards/chosen": 7.062049865722656, + "rewards/margins": 17.779264450073242, + "rewards/rejected": -10.717214584350586, + "step": 1864 + }, + { + "epoch": 0.5111689735507743, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22327145.14285714, + "logits/rejected": -4800770.823529412, + "logps/chosen": -444.88424246651783, + "logps/rejected": -502.23902803308823, + "loss": 0.0034, + "rewards/chosen": 8.982838221958705, + "rewards/margins": 19.19243115336955, + "rewards/rejected": -10.209592931410846, + "step": 1865 + }, + { + "epoch": 0.511443058791284, + "grad_norm": 1.2109375, + "kl": 0.3737386167049408, + "learning_rate": 5e-06, + "logits/chosen": -40184822.4, + "logits/rejected": -23760438.85714286, + "logps/chosen": -383.3470458984375, + "logps/rejected": -593.2956194196429, + "loss": 0.0042, + "rewards/chosen": 6.650343322753907, + "rewards/margins": 17.944960021972655, + "rewards/rejected": -11.29461669921875, + "step": 1866 + }, + { + "epoch": 0.5117171440317939, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9492217.454545455, + "logits/rejected": -29253252.923076924, + "logps/chosen": -335.85391512784093, + "logps/rejected": -512.2162710336538, + "loss": 0.0317, + "rewards/chosen": 6.361948186700994, + "rewards/margins": 18.119322876830203, + "rewards/rejected": -11.757374690129208, + "step": 1867 + }, + { + "epoch": 0.5119912292723037, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9996129.6, + "logits/rejected": -19254921.14285714, + "logps/chosen": -356.5142333984375, + "logps/rejected": -480.8333217075893, + "loss": 0.0451, + "rewards/chosen": 5.988904571533203, + "rewards/margins": 17.89064995901925, + "rewards/rejected": -11.901745387486049, + "step": 1868 + }, + { + "epoch": 0.5122653145128134, + "grad_norm": 6.59375, + "kl": 1.5992724895477295, + "learning_rate": 5e-06, + "logits/chosen": -10774525.333333334, + "logits/rejected": -19462570.666666668, + "logps/chosen": -372.4517415364583, + "logps/rejected": -401.4600423177083, + "loss": 0.0322, + "rewards/chosen": 6.548372268676758, + "rewards/margins": 14.996452967325846, + "rewards/rejected": -8.448080698649088, + "step": 1869 + }, + { + "epoch": 0.5125393997533233, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24890174.222222224, + "logits/rejected": -9863232.0, + "logps/chosen": -316.618896484375, + "logps/rejected": -554.4638671875, + "loss": 0.0549, + "rewards/chosen": 4.672776116265191, + "rewards/margins": 14.876859368218316, + "rewards/rejected": -10.204083251953126, + "step": 1870 + }, + { + "epoch": 0.5128134849938331, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6290909.090909091, + "logits/rejected": -16708454.153846154, + "logps/chosen": -417.93701171875, + "logps/rejected": -439.56798377403845, + "loss": 0.0292, + "rewards/chosen": 4.836382779208097, + "rewards/margins": 14.376426536720118, + "rewards/rejected": -9.54004375751202, + "step": 1871 + }, + { + "epoch": 0.5130875702343429, + "grad_norm": 6.71875, + "kl": 0.11554718017578125, + "learning_rate": 5e-06, + "logits/chosen": -23324204.307692308, + "logits/rejected": -16367937.454545455, + "logps/chosen": -418.36117788461536, + "logps/rejected": -413.5341796875, + "loss": 0.0515, + "rewards/chosen": 6.923530578613281, + "rewards/margins": 15.94886502352628, + "rewards/rejected": -9.025334444912998, + "step": 1872 + }, + { + "epoch": 0.5133616554748527, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11474756.0, + "logits/rejected": 9831116.57142857, + "logps/chosen": -361.42431640625, + "logps/rejected": -579.5355050223214, + "loss": 0.0215, + "rewards/chosen": 6.149270629882812, + "rewards/margins": 17.621534511021203, + "rewards/rejected": -11.472263881138392, + "step": 1873 + }, + { + "epoch": 0.5136357407153624, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8456848.0, + "logits/rejected": 11262454.4, + "logps/chosen": -364.95458984375, + "logps/rejected": -454.610302734375, + "loss": 0.0802, + "rewards/chosen": 4.635556357247489, + "rewards/margins": 12.359119742257255, + "rewards/rejected": -7.723563385009766, + "step": 1874 + }, + { + "epoch": 0.5139098259558723, + "grad_norm": 9.25, + "kl": 1.9262620210647583, + "learning_rate": 5e-06, + "logits/chosen": -23867776.0, + "logits/rejected": -11089408.0, + "logps/chosen": -532.9763055098684, + "logps/rejected": -351.346044921875, + "loss": 0.0434, + "rewards/chosen": 6.3782605622944075, + "rewards/margins": 10.843373248451634, + "rewards/rejected": -4.465112686157227, + "step": 1875 + }, + { + "epoch": 0.5141839111963821, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11570776.615384616, + "logits/rejected": -13851825.454545455, + "logps/chosen": -451.7795973557692, + "logps/rejected": -573.1290838068181, + "loss": 0.0286, + "rewards/chosen": 5.472148014948918, + "rewards/margins": 14.107252107633578, + "rewards/rejected": -8.635104092684658, + "step": 1876 + }, + { + "epoch": 0.5144579964368918, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 21883036.444444444, + "logits/rejected": 4446074.666666667, + "logps/chosen": -549.728515625, + "logps/rejected": -756.2244140625, + "loss": 0.0106, + "rewards/chosen": 8.419158087836372, + "rewards/margins": 22.26202918158637, + "rewards/rejected": -13.84287109375, + "step": 1877 + }, + { + "epoch": 0.5147320816774017, + "grad_norm": 5.75, + "kl": 5.933967590332031, + "learning_rate": 5e-06, + "logits/chosen": -30933358.545454547, + "logits/rejected": -30025144.615384616, + "logps/chosen": -472.5095880681818, + "logps/rejected": -627.7722355769231, + "loss": 0.0189, + "rewards/chosen": 7.3738181374289775, + "rewards/margins": 18.589600889832823, + "rewards/rejected": -11.215782752403847, + "step": 1878 + }, + { + "epoch": 0.5150061669179115, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9517795.076923076, + "logits/rejected": -20263906.90909091, + "logps/chosen": -473.28695913461536, + "logps/rejected": -557.4214311079545, + "loss": 0.0187, + "rewards/chosen": 6.013870826134315, + "rewards/margins": 22.04481287602778, + "rewards/rejected": -16.030942049893465, + "step": 1879 + }, + { + "epoch": 0.5152802521584212, + "grad_norm": 7.34375, + "kl": 2.782278537750244, + "learning_rate": 5e-06, + "logits/chosen": -35362485.333333336, + "logits/rejected": -27677058.133333333, + "logps/chosen": -417.2151692708333, + "logps/rejected": -470.187109375, + "loss": 0.0129, + "rewards/chosen": 7.338962131076389, + "rewards/margins": 16.576948377821182, + "rewards/rejected": -9.237986246744791, + "step": 1880 + }, + { + "epoch": 0.5155543373989311, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19535282.90909091, + "logits/rejected": -24124553.846153848, + "logps/chosen": -393.5009765625, + "logps/rejected": -555.7692683293269, + "loss": 0.0195, + "rewards/chosen": 6.298825350674716, + "rewards/margins": 19.943094853754644, + "rewards/rejected": -13.644269503079927, + "step": 1881 + }, + { + "epoch": 0.5158284226394408, + "grad_norm": 6.9375, + "kl": 0.29446539282798767, + "learning_rate": 5e-06, + "logits/chosen": -33452550.4, + "logits/rejected": -26904429.714285713, + "logps/chosen": -445.69482421875, + "logps/rejected": -479.6420200892857, + "loss": 0.054, + "rewards/chosen": 4.327595138549805, + "rewards/margins": 11.77233216421945, + "rewards/rejected": -7.444737025669643, + "step": 1882 + }, + { + "epoch": 0.5161025078799507, + "grad_norm": 1.21875, + "kl": 0.8824717402458191, + "learning_rate": 5e-06, + "logits/chosen": -4706523.692307692, + "logits/rejected": -28326045.09090909, + "logps/chosen": -485.96292818509613, + "logps/rejected": -601.6827503551136, + "loss": 0.0041, + "rewards/chosen": 6.929058955265925, + "rewards/margins": 21.600723106544333, + "rewards/rejected": -14.671664151278408, + "step": 1883 + }, + { + "epoch": 0.5163765931204605, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 920874.9090909091, + "logits/rejected": -14946050.461538462, + "logps/chosen": -275.7028142755682, + "logps/rejected": -519.90771484375, + "loss": 0.0607, + "rewards/chosen": 5.1031060652299365, + "rewards/margins": 13.798762474860347, + "rewards/rejected": -8.69565640963041, + "step": 1884 + }, + { + "epoch": 0.5166506783609702, + "grad_norm": 10.125, + "kl": 5.090258598327637, + "learning_rate": 5e-06, + "logits/chosen": -16397608.0, + "logits/rejected": 5057922.0, + "logps/chosen": -382.042724609375, + "logps/rejected": -639.0416259765625, + "loss": 0.0678, + "rewards/chosen": 6.9650492668151855, + "rewards/margins": 15.941833972930908, + "rewards/rejected": -8.976784706115723, + "step": 1885 + }, + { + "epoch": 0.5169247636014801, + "grad_norm": 6.1875, + "kl": 1.1638858318328857, + "learning_rate": 5e-06, + "logits/chosen": -20332306.46153846, + "logits/rejected": -20958282.181818184, + "logps/chosen": -345.8996394230769, + "logps/rejected": -717.759765625, + "loss": 0.0599, + "rewards/chosen": 4.58966064453125, + "rewards/margins": 19.013531771573156, + "rewards/rejected": -14.423871127041904, + "step": 1886 + }, + { + "epoch": 0.5171988488419899, + "grad_norm": 4.0625, + "kl": 2.4165968894958496, + "learning_rate": 5e-06, + "logits/chosen": -5599796.0, + "logits/rejected": -21969350.4, + "logps/chosen": -490.3152553013393, + "logps/rejected": -550.84326171875, + "loss": 0.011, + "rewards/chosen": 6.665384565080915, + "rewards/margins": 17.901951490129743, + "rewards/rejected": -11.236566925048828, + "step": 1887 + }, + { + "epoch": 0.5174729340824996, + "grad_norm": 7.21875, + "kl": 2.7020556926727295, + "learning_rate": 5e-06, + "logits/chosen": -35967717.81818182, + "logits/rejected": -6111399.384615385, + "logps/chosen": -306.97017045454544, + "logps/rejected": -490.77576622596155, + "loss": 0.0368, + "rewards/chosen": 5.951403531161222, + "rewards/margins": 14.242986078862543, + "rewards/rejected": -8.291582547701323, + "step": 1888 + }, + { + "epoch": 0.5177470193230095, + "grad_norm": 4.375, + "kl": 3.43537974357605, + "learning_rate": 5e-06, + "logits/chosen": -22748465.230769232, + "logits/rejected": 5871506.909090909, + "logps/chosen": -346.5784254807692, + "logps/rejected": -565.7806729403409, + "loss": 0.0356, + "rewards/chosen": 6.065809396597055, + "rewards/margins": 18.55947838629876, + "rewards/rejected": -12.493668989701705, + "step": 1889 + }, + { + "epoch": 0.5180211045635192, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26216538.666666668, + "logits/rejected": 3147421.3333333335, + "logps/chosen": -365.5530598958333, + "logps/rejected": -619.2451985677084, + "loss": 0.054, + "rewards/chosen": 6.075915018717448, + "rewards/margins": 17.12962595621745, + "rewards/rejected": -11.0537109375, + "step": 1890 + }, + { + "epoch": 0.518295189804029, + "grad_norm": 7.75, + "kl": 3.3984744548797607, + "learning_rate": 5e-06, + "logits/chosen": -16677794.666666666, + "logits/rejected": -25121280.0, + "logps/chosen": -381.7273356119792, + "logps/rejected": -561.4903971354166, + "loss": 0.0361, + "rewards/chosen": 5.727361679077148, + "rewards/margins": 16.480399449666344, + "rewards/rejected": -10.753037770589193, + "step": 1891 + }, + { + "epoch": 0.5185692750445389, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14567098.666666666, + "logits/rejected": -41658442.666666664, + "logps/chosen": -372.1287841796875, + "logps/rejected": -457.3775227864583, + "loss": 0.0165, + "rewards/chosen": 6.148770650227864, + "rewards/margins": 13.212933222452799, + "rewards/rejected": -7.064162572224935, + "step": 1892 + }, + { + "epoch": 0.5188433602850486, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23222789.818181816, + "logits/rejected": -15932787.692307692, + "logps/chosen": -330.8794611150568, + "logps/rejected": -511.9949293870192, + "loss": 0.0216, + "rewards/chosen": 5.588517622514204, + "rewards/margins": 14.948146153163243, + "rewards/rejected": -9.359628530649038, + "step": 1893 + }, + { + "epoch": 0.5191174455255585, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23355680.0, + "logits/rejected": -12955002.666666666, + "logps/chosen": -390.064697265625, + "logps/rejected": -699.907470703125, + "loss": 0.0486, + "rewards/chosen": 5.150793393452962, + "rewards/margins": 14.431588172912598, + "rewards/rejected": -9.280794779459635, + "step": 1894 + }, + { + "epoch": 0.5193915307660683, + "grad_norm": 4.125, + "kl": 5.222983360290527, + "learning_rate": 5e-06, + "logits/chosen": -28341582.933333334, + "logits/rejected": 30767872.0, + "logps/chosen": -465.3265625, + "logps/rejected": -619.8627387152778, + "loss": 0.0113, + "rewards/chosen": 6.150596110026042, + "rewards/margins": 24.383075629340276, + "rewards/rejected": -18.232479519314236, + "step": 1895 + }, + { + "epoch": 0.519665616006578, + "grad_norm": 1.3515625, + "kl": 2.327498197555542, + "learning_rate": 5e-06, + "logits/chosen": -15522684.57142857, + "logits/rejected": 19005848.0, + "logps/chosen": -496.7705078125, + "logps/rejected": -562.9626953125, + "loss": 0.004, + "rewards/chosen": 8.366912841796875, + "rewards/margins": 21.338568115234374, + "rewards/rejected": -12.9716552734375, + "step": 1896 + }, + { + "epoch": 0.5199397012470879, + "grad_norm": 2.984375, + "kl": 3.4733095169067383, + "learning_rate": 5e-06, + "logits/chosen": -25662582.85714286, + "logits/rejected": -13304514.4, + "logps/chosen": -484.7562779017857, + "logps/rejected": -407.0048095703125, + "loss": 0.0094, + "rewards/chosen": 7.347662789481027, + "rewards/margins": 15.879676491873607, + "rewards/rejected": -8.532013702392579, + "step": 1897 + }, + { + "epoch": 0.5202137864875976, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18621313.230769232, + "logits/rejected": -31088221.09090909, + "logps/chosen": -414.8876953125, + "logps/rejected": -517.861328125, + "loss": 0.0404, + "rewards/chosen": 7.7758636474609375, + "rewards/margins": 17.05133195356889, + "rewards/rejected": -9.275468306107955, + "step": 1898 + }, + { + "epoch": 0.5204878717281074, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5492266.5, + "logits/rejected": 15640038.0, + "logps/chosen": -306.3226318359375, + "logps/rejected": -474.39373779296875, + "loss": 0.033, + "rewards/chosen": 4.177394866943359, + "rewards/margins": 15.207413673400879, + "rewards/rejected": -11.03001880645752, + "step": 1899 + }, + { + "epoch": 0.5207619569686173, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3305295.6666666665, + "logits/rejected": 11371114.666666666, + "logps/chosen": -381.8365885416667, + "logps/rejected": -598.28173828125, + "loss": 0.0596, + "rewards/chosen": 5.81479008992513, + "rewards/margins": 16.378529866536457, + "rewards/rejected": -10.563739776611328, + "step": 1900 + }, + { + "epoch": 0.521036042209127, + "grad_norm": 1.1640625, + "kl": 3.0917296409606934, + "learning_rate": 5e-06, + "logits/chosen": -13124094.4, + "logits/rejected": -28532601.14285714, + "logps/chosen": -458.81748046875, + "logps/rejected": -670.4451032366071, + "loss": 0.0035, + "rewards/chosen": 8.215762329101562, + "rewards/margins": 18.724253845214843, + "rewards/rejected": -10.508491516113281, + "step": 1901 + }, + { + "epoch": 0.5213101274496368, + "grad_norm": 4.625, + "kl": 3.601628065109253, + "learning_rate": 5e-06, + "logits/chosen": -29730502.85714286, + "logits/rejected": -54440435.2, + "logps/chosen": -499.4915248325893, + "logps/rejected": -514.359130859375, + "loss": 0.0136, + "rewards/chosen": 7.848878043038504, + "rewards/margins": 18.167118399483815, + "rewards/rejected": -10.318240356445312, + "step": 1902 + }, + { + "epoch": 0.5215842126901467, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14197552.0, + "logits/rejected": -37051234.28571428, + "logps/chosen": -466.6361328125, + "logps/rejected": -501.86097935267856, + "loss": 0.0051, + "rewards/chosen": 6.280129241943359, + "rewards/margins": 14.516395241873603, + "rewards/rejected": -8.236265999930245, + "step": 1903 + }, + { + "epoch": 0.5218582979306564, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21656476.0, + "logits/rejected": -27574797.333333332, + "logps/chosen": -472.9267578125, + "logps/rejected": -805.6966959635416, + "loss": 0.0116, + "rewards/chosen": 7.136797587076823, + "rewards/margins": 24.64711634318034, + "rewards/rejected": -17.510318756103516, + "step": 1904 + }, + { + "epoch": 0.5221323831711663, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19279808.0, + "logits/rejected": -15402080.0, + "logps/chosen": -488.46591796875, + "logps/rejected": -455.00962611607144, + "loss": 0.0106, + "rewards/chosen": 6.445304107666016, + "rewards/margins": 16.390562111990793, + "rewards/rejected": -9.945258004324776, + "step": 1905 + }, + { + "epoch": 0.522406468411676, + "grad_norm": 7.40625, + "kl": 5.653787136077881, + "learning_rate": 5e-06, + "logits/chosen": -17539414.0, + "logits/rejected": -53599880.0, + "logps/chosen": -320.7252197265625, + "logps/rejected": -653.7774047851562, + "loss": 0.0393, + "rewards/chosen": 5.6380791664123535, + "rewards/margins": 19.029886722564697, + "rewards/rejected": -13.391807556152344, + "step": 1906 + }, + { + "epoch": 0.5226805536521858, + "grad_norm": 5.8125, + "kl": 1.1112544536590576, + "learning_rate": 5e-06, + "logits/chosen": -7801216.0, + "logits/rejected": -19331365.818181816, + "logps/chosen": -487.8351862980769, + "logps/rejected": -719.3346946022727, + "loss": 0.031, + "rewards/chosen": 7.580371563251202, + "rewards/margins": 20.28796877560916, + "rewards/rejected": -12.707597212357955, + "step": 1907 + }, + { + "epoch": 0.5229546388926957, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 58382460.44444445, + "logits/rejected": 12896291.2, + "logps/chosen": -592.8708767361111, + "logps/rejected": -417.68828125, + "loss": 0.0178, + "rewards/chosen": 6.623093075222439, + "rewards/margins": 15.845030127631293, + "rewards/rejected": -9.221937052408855, + "step": 1908 + }, + { + "epoch": 0.5232287241332054, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25105072.0, + "logits/rejected": -29630294.85714286, + "logps/chosen": -459.02587890625, + "logps/rejected": -428.25174386160717, + "loss": 0.0285, + "rewards/chosen": 4.965755844116211, + "rewards/margins": 15.362646756853376, + "rewards/rejected": -10.396890912737165, + "step": 1909 + }, + { + "epoch": 0.5235028093737152, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6964464.0, + "logits/rejected": -7611987.636363637, + "logps/chosen": -335.3508112980769, + "logps/rejected": -612.77734375, + "loss": 0.0228, + "rewards/chosen": 5.282379150390625, + "rewards/margins": 16.125749067826703, + "rewards/rejected": -10.84336991743608, + "step": 1910 + }, + { + "epoch": 0.523776894614225, + "grad_norm": 13.125, + "kl": 11.4205961227417, + "learning_rate": 5e-06, + "logits/chosen": -20914045.866666667, + "logits/rejected": -25641429.333333332, + "logps/chosen": -414.65087890625, + "logps/rejected": -450.92803276909723, + "loss": 0.0646, + "rewards/chosen": 7.178647867838541, + "rewards/margins": 18.031497192382812, + "rewards/rejected": -10.852849324544271, + "step": 1911 + }, + { + "epoch": 0.5240509798547348, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11062888.0, + "logits/rejected": -8875125.0, + "logps/chosen": -503.30072021484375, + "logps/rejected": -577.806640625, + "loss": 0.0353, + "rewards/chosen": 6.1350321769714355, + "rewards/margins": 15.135215282440186, + "rewards/rejected": -9.00018310546875, + "step": 1912 + }, + { + "epoch": 0.5243250650952446, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73087013.33333333, + "logits/rejected": -33477482.666666668, + "logps/chosen": -408.7508544921875, + "logps/rejected": -621.7999267578125, + "loss": 0.0456, + "rewards/chosen": 5.904436747233073, + "rewards/margins": 20.747337341308594, + "rewards/rejected": -14.842900594075521, + "step": 1913 + }, + { + "epoch": 0.5245991503357544, + "grad_norm": 2.734375, + "kl": 0.38163504004478455, + "learning_rate": 5e-06, + "logits/chosen": 2442800.727272727, + "logits/rejected": -31625309.53846154, + "logps/chosen": -555.0970348011364, + "logps/rejected": -637.1727764423077, + "loss": 0.0052, + "rewards/chosen": 7.717886491255327, + "rewards/margins": 20.172212240579245, + "rewards/rejected": -12.454325749323917, + "step": 1914 + }, + { + "epoch": 0.5248732355762642, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1268871.5555555555, + "logits/rejected": -6796360.0, + "logps/chosen": -393.11962890625, + "logps/rejected": -488.2543619791667, + "loss": 0.0391, + "rewards/chosen": 7.170398288302952, + "rewards/margins": 15.955574883355036, + "rewards/rejected": -8.785176595052084, + "step": 1915 + }, + { + "epoch": 0.5251473208167741, + "grad_norm": 5.75, + "kl": 1.0884017944335938, + "learning_rate": 5e-06, + "logits/chosen": -19784699.636363637, + "logits/rejected": -23876049.230769232, + "logps/chosen": -468.9749644886364, + "logps/rejected": -575.9841871995193, + "loss": 0.0321, + "rewards/chosen": 6.497587724165483, + "rewards/margins": 16.8198333953644, + "rewards/rejected": -10.322245671198917, + "step": 1916 + }, + { + "epoch": 0.5254214060572838, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11869862.545454545, + "logits/rejected": -6168193.230769231, + "logps/chosen": -434.3722034801136, + "logps/rejected": -497.6319110576923, + "loss": 0.0155, + "rewards/chosen": 7.442346746271307, + "rewards/margins": 17.917126315457004, + "rewards/rejected": -10.474779569185698, + "step": 1917 + }, + { + "epoch": 0.5256954912977936, + "grad_norm": 3.6875, + "kl": 2.708667755126953, + "learning_rate": 5e-06, + "logits/chosen": -20165664.0, + "logits/rejected": -49794850.461538464, + "logps/chosen": -413.93559126420456, + "logps/rejected": -709.1010366586538, + "loss": 0.0114, + "rewards/chosen": 6.214147394353693, + "rewards/margins": 19.989187067205258, + "rewards/rejected": -13.775039672851562, + "step": 1918 + }, + { + "epoch": 0.5259695765383035, + "grad_norm": 11.1875, + "kl": 1.8729941844940186, + "learning_rate": 5e-06, + "logits/chosen": -11441124.57142857, + "logits/rejected": 106726496.0, + "logps/chosen": -477.22666713169644, + "logps/rejected": -681.40947265625, + "loss": 0.0323, + "rewards/chosen": 6.967890058244977, + "rewards/margins": 29.13556583949498, + "rewards/rejected": -22.16767578125, + "step": 1919 + }, + { + "epoch": 0.5262436617788132, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8745707.692307692, + "logits/rejected": -20061040.0, + "logps/chosen": -327.0386493389423, + "logps/rejected": -431.38623046875, + "loss": 0.0507, + "rewards/chosen": 5.541495689978967, + "rewards/margins": 14.874469516994235, + "rewards/rejected": -9.33297382701527, + "step": 1920 + }, + { + "epoch": 0.526517747019323, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41799416.88888889, + "logits/rejected": -16172872.533333333, + "logps/chosen": -529.3423394097222, + "logps/rejected": -563.8809895833333, + "loss": 0.0356, + "rewards/chosen": 7.070701599121094, + "rewards/margins": 17.671947733561197, + "rewards/rejected": -10.601246134440105, + "step": 1921 + }, + { + "epoch": 0.5267918322598328, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28969390.545454547, + "logits/rejected": 7503457.230769231, + "logps/chosen": -517.2676669034091, + "logps/rejected": -492.0358698918269, + "loss": 0.0281, + "rewards/chosen": 6.236834439364347, + "rewards/margins": 18.676611520193674, + "rewards/rejected": -12.439777080829327, + "step": 1922 + }, + { + "epoch": 0.5270659175003426, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70145952.0, + "logits/rejected": 27549692.444444444, + "logps/chosen": -501.6156412760417, + "logps/rejected": -504.2727864583333, + "loss": 0.0049, + "rewards/chosen": 7.216528574625651, + "rewards/margins": 17.471658918592667, + "rewards/rejected": -10.255130343967014, + "step": 1923 + }, + { + "epoch": 0.5273400027408524, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38176669.333333336, + "logits/rejected": -4263320.888888889, + "logps/chosen": -536.963623046875, + "logps/rejected": -556.6449110243055, + "loss": 0.0051, + "rewards/chosen": 8.188591639200846, + "rewards/margins": 20.428456412421333, + "rewards/rejected": -12.239864773220486, + "step": 1924 + }, + { + "epoch": 0.5276140879813622, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11025019.076923076, + "logits/rejected": -30439653.818181816, + "logps/chosen": -375.61177884615387, + "logps/rejected": -425.6194513494318, + "loss": 0.0098, + "rewards/chosen": 6.99001957820012, + "rewards/margins": 16.946147091738826, + "rewards/rejected": -9.956127513538707, + "step": 1925 + }, + { + "epoch": 0.527888173221872, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14120872.533333333, + "logits/rejected": 26543475.555555556, + "logps/chosen": -360.65869140625, + "logps/rejected": -561.5819769965278, + "loss": 0.0122, + "rewards/chosen": 5.283993530273437, + "rewards/margins": 16.960387674967446, + "rewards/rejected": -11.67639414469401, + "step": 1926 + }, + { + "epoch": 0.5281622584623819, + "grad_norm": 7.90625, + "kl": 4.2393622398376465, + "learning_rate": 5e-06, + "logits/chosen": -22117465.333333332, + "logits/rejected": -10250964.666666666, + "logps/chosen": -420.5400797526042, + "logps/rejected": -457.1145833333333, + "loss": 0.0671, + "rewards/chosen": 5.436605453491211, + "rewards/margins": 13.630580266316732, + "rewards/rejected": -8.193974812825521, + "step": 1927 + }, + { + "epoch": 0.5284363437028916, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24334280.727272727, + "logits/rejected": -14612368.0, + "logps/chosen": -483.17928799715907, + "logps/rejected": -389.4661207932692, + "loss": 0.0542, + "rewards/chosen": 5.655465906316584, + "rewards/margins": 13.956480733164542, + "rewards/rejected": -8.301014826847958, + "step": 1928 + }, + { + "epoch": 0.5287104289434014, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31178488.0, + "logits/rejected": -9482198.0, + "logps/chosen": -400.1265869140625, + "logps/rejected": -618.1717529296875, + "loss": 0.0576, + "rewards/chosen": 6.874124526977539, + "rewards/margins": 21.47116152445475, + "rewards/rejected": -14.597036997477213, + "step": 1929 + }, + { + "epoch": 0.5289845141839112, + "grad_norm": 6.46875, + "kl": 2.061540126800537, + "learning_rate": 5e-06, + "logits/chosen": -10773184.0, + "logits/rejected": -56181456.0, + "logps/chosen": -427.9606119791667, + "logps/rejected": -528.2788899739584, + "loss": 0.0187, + "rewards/chosen": 5.620031992594401, + "rewards/margins": 20.075814565022785, + "rewards/rejected": -14.455782572428385, + "step": 1930 + }, + { + "epoch": 0.529258599424421, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10368522.4, + "logits/rejected": -8080509.714285715, + "logps/chosen": -411.1603515625, + "logps/rejected": -658.4390345982143, + "loss": 0.0156, + "rewards/chosen": 6.576564788818359, + "rewards/margins": 16.68648910522461, + "rewards/rejected": -10.10992431640625, + "step": 1931 + }, + { + "epoch": 0.5295326846649308, + "grad_norm": 6.625, + "kl": 1.5108928680419922, + "learning_rate": 5e-06, + "logits/chosen": -17895856.0, + "logits/rejected": -14099390.76923077, + "logps/chosen": -493.08860085227275, + "logps/rejected": -492.0447340745192, + "loss": 0.0227, + "rewards/chosen": 7.391075827858665, + "rewards/margins": 17.477111923111067, + "rewards/rejected": -10.086036095252403, + "step": 1932 + }, + { + "epoch": 0.5298067699054406, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30319116.0, + "logits/rejected": -22291616.0, + "logps/chosen": -552.658203125, + "logps/rejected": -538.4310913085938, + "loss": 0.0059, + "rewards/chosen": 5.961715221405029, + "rewards/margins": 17.51321840286255, + "rewards/rejected": -11.55150318145752, + "step": 1933 + }, + { + "epoch": 0.5300808551459504, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64222900.0, + "logits/rejected": -5946120.0, + "logps/chosen": -445.2442321777344, + "logps/rejected": -400.1105651855469, + "loss": 0.0115, + "rewards/chosen": 7.6532392501831055, + "rewards/margins": 15.924601554870605, + "rewards/rejected": -8.2713623046875, + "step": 1934 + }, + { + "epoch": 0.5303549403864601, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -97563.57142857143, + "logits/rejected": -26494393.6, + "logps/chosen": -423.52601841517856, + "logps/rejected": -506.59521484375, + "loss": 0.0497, + "rewards/chosen": 5.208802359444754, + "rewards/margins": 15.333851187569753, + "rewards/rejected": -10.125048828125, + "step": 1935 + }, + { + "epoch": 0.53062902562697, + "grad_norm": 5.90625, + "kl": 3.0983316898345947, + "learning_rate": 5e-06, + "logits/chosen": -29262356.363636363, + "logits/rejected": 3105154.153846154, + "logps/chosen": -446.00319602272725, + "logps/rejected": -586.9841496394231, + "loss": 0.0185, + "rewards/chosen": 7.324712579900568, + "rewards/margins": 18.87095492703098, + "rewards/rejected": -11.54624234713041, + "step": 1936 + }, + { + "epoch": 0.5309031108674798, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28840556.8, + "logits/rejected": -28762845.714285713, + "logps/chosen": -470.2947265625, + "logps/rejected": -499.3191615513393, + "loss": 0.0173, + "rewards/chosen": 8.194895172119141, + "rewards/margins": 19.851217106410438, + "rewards/rejected": -11.656321934291295, + "step": 1937 + }, + { + "epoch": 0.5311771961079896, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16683504.0, + "logits/rejected": -25992532.57142857, + "logps/chosen": -472.56455078125, + "logps/rejected": -474.30594308035717, + "loss": 0.0361, + "rewards/chosen": 6.691064453125, + "rewards/margins": 17.55208740234375, + "rewards/rejected": -10.86102294921875, + "step": 1938 + }, + { + "epoch": 0.5314512813484994, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20632753.066666666, + "logits/rejected": -19657960.888888888, + "logps/chosen": -456.39752604166665, + "logps/rejected": -756.5608723958334, + "loss": 0.0344, + "rewards/chosen": 6.146915181477865, + "rewards/margins": 23.124688212076823, + "rewards/rejected": -16.977773030598957, + "step": 1939 + }, + { + "epoch": 0.5317253665890092, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2770749.8333333335, + "logits/rejected": -12642730.666666666, + "logps/chosen": -344.0498046875, + "logps/rejected": -422.1036376953125, + "loss": 0.0752, + "rewards/chosen": 4.468405087788899, + "rewards/margins": 14.569931983947754, + "rewards/rejected": -10.101526896158854, + "step": 1940 + }, + { + "epoch": 0.531999451829519, + "grad_norm": 11.6875, + "kl": 3.52075457572937, + "learning_rate": 5e-06, + "logits/chosen": 2609126.0, + "logits/rejected": -15036250.666666666, + "logps/chosen": -499.1206461588542, + "logps/rejected": -804.7942708333334, + "loss": 0.0451, + "rewards/chosen": 7.43520991007487, + "rewards/margins": 20.70528793334961, + "rewards/rejected": -13.27007802327474, + "step": 1941 + }, + { + "epoch": 0.5322735370700288, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13955456.0, + "logits/rejected": -27073744.0, + "logps/chosen": -457.776953125, + "logps/rejected": -535.7732282366071, + "loss": 0.0302, + "rewards/chosen": 6.433529663085937, + "rewards/margins": 18.112530517578126, + "rewards/rejected": -11.679000854492188, + "step": 1942 + }, + { + "epoch": 0.5325476223105385, + "grad_norm": 4.59375, + "kl": 2.151803493499756, + "learning_rate": 5e-06, + "logits/chosen": -22342408.533333335, + "logits/rejected": -13536631.111111112, + "logps/chosen": -347.60406901041665, + "logps/rejected": -602.9264865451389, + "loss": 0.0187, + "rewards/chosen": 5.834704081217448, + "rewards/margins": 16.14789326985677, + "rewards/rejected": -10.313189188639322, + "step": 1943 + }, + { + "epoch": 0.5328217075510484, + "grad_norm": 10.375, + "kl": 6.313737392425537, + "learning_rate": 5e-06, + "logits/chosen": -2434903.5, + "logits/rejected": -17463016.0, + "logps/chosen": -466.65289306640625, + "logps/rejected": -357.4850158691406, + "loss": 0.0244, + "rewards/chosen": 7.675724983215332, + "rewards/margins": 18.322507858276367, + "rewards/rejected": -10.646782875061035, + "step": 1944 + }, + { + "epoch": 0.5330957927915582, + "grad_norm": 11.0625, + "kl": 11.716662406921387, + "learning_rate": 5e-06, + "logits/chosen": -10787420.235294119, + "logits/rejected": -15083069.714285715, + "logps/chosen": -480.22144990808823, + "logps/rejected": -665.3152204241071, + "loss": 0.0738, + "rewards/chosen": 6.8426379035500915, + "rewards/margins": 19.173914933405, + "rewards/rejected": -12.331277029854911, + "step": 1945 + }, + { + "epoch": 0.5333698780320679, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9322467.692307692, + "logits/rejected": -34686609.45454545, + "logps/chosen": -488.49962439903845, + "logps/rejected": -637.0181551846591, + "loss": 0.0327, + "rewards/chosen": 6.7118999774639425, + "rewards/margins": 19.84977007245684, + "rewards/rejected": -13.137870094992898, + "step": 1946 + }, + { + "epoch": 0.5336439632725778, + "grad_norm": 3.59375, + "kl": 0.0745900496840477, + "learning_rate": 5e-06, + "logits/chosen": -17357123.2, + "logits/rejected": -19952384.0, + "logps/chosen": -333.0138671875, + "logps/rejected": -529.2732979910714, + "loss": 0.0195, + "rewards/chosen": 5.966352844238282, + "rewards/margins": 16.46481606619699, + "rewards/rejected": -10.498463221958705, + "step": 1947 + }, + { + "epoch": 0.5339180485130876, + "grad_norm": 4.6875, + "kl": 4.096301078796387, + "learning_rate": 5e-06, + "logits/chosen": -18110118.666666668, + "logits/rejected": -24199002.666666668, + "logps/chosen": -491.6464029947917, + "logps/rejected": -588.3450520833334, + "loss": 0.0288, + "rewards/chosen": 7.316490809122722, + "rewards/margins": 18.301964441935223, + "rewards/rejected": -10.9854736328125, + "step": 1948 + }, + { + "epoch": 0.5341921337535974, + "grad_norm": 5.71875, + "kl": 4.816805839538574, + "learning_rate": 5e-06, + "logits/chosen": -17770659.555555556, + "logits/rejected": -12079312.0, + "logps/chosen": -459.4990234375, + "logps/rejected": -807.9728190104166, + "loss": 0.0212, + "rewards/chosen": 6.193811204698351, + "rewards/margins": 24.112455156114365, + "rewards/rejected": -17.918643951416016, + "step": 1949 + }, + { + "epoch": 0.5344662189941072, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25580068.923076924, + "logits/rejected": -5554976.363636363, + "logps/chosen": -368.9768629807692, + "logps/rejected": -425.81569602272725, + "loss": 0.0545, + "rewards/chosen": 4.349890488844651, + "rewards/margins": 13.241449049302748, + "rewards/rejected": -8.891558560458096, + "step": 1950 + }, + { + "epoch": 0.5347403042346169, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45346528.0, + "logits/rejected": -1767925.2222222222, + "logps/chosen": -262.2280680338542, + "logps/rejected": -539.8885091145834, + "loss": 0.0307, + "rewards/chosen": 5.899993896484375, + "rewards/margins": 15.31266360812717, + "rewards/rejected": -9.412669711642796, + "step": 1951 + }, + { + "epoch": 0.5350143894751268, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11023790.545454545, + "logits/rejected": -14269948.307692308, + "logps/chosen": -325.12031693892044, + "logps/rejected": -559.7769681490385, + "loss": 0.0883, + "rewards/chosen": 4.952693939208984, + "rewards/margins": 17.034349588247444, + "rewards/rejected": -12.081655649038462, + "step": 1952 + }, + { + "epoch": 0.5352884747156366, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17112700.0, + "logits/rejected": -14826806.666666666, + "logps/chosen": -454.5099690755208, + "logps/rejected": -511.2619222005208, + "loss": 0.0209, + "rewards/chosen": 7.5735626220703125, + "rewards/margins": 17.384615580240883, + "rewards/rejected": -9.811052958170572, + "step": 1953 + }, + { + "epoch": 0.5355625599561463, + "grad_norm": 16.625, + "kl": 7.780580043792725, + "learning_rate": 5e-06, + "logits/chosen": -16542333.866666667, + "logits/rejected": -28814944.0, + "logps/chosen": -360.0992838541667, + "logps/rejected": -593.0552300347222, + "loss": 0.0927, + "rewards/chosen": 5.3006739298502605, + "rewards/margins": 19.115872192382813, + "rewards/rejected": -13.815198262532553, + "step": 1954 + }, + { + "epoch": 0.5358366451966562, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15391741.714285715, + "logits/rejected": -4872432.8, + "logps/chosen": -370.77587890625, + "logps/rejected": -434.694921875, + "loss": 0.0224, + "rewards/chosen": 5.378004891531808, + "rewards/margins": 12.909436471121651, + "rewards/rejected": -7.531431579589844, + "step": 1955 + }, + { + "epoch": 0.536110730437166, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29384226.666666668, + "logits/rejected": 192702.66666666666, + "logps/chosen": -373.0771484375, + "logps/rejected": -751.8419596354166, + "loss": 0.047, + "rewards/chosen": 8.282699584960938, + "rewards/margins": 21.722654978434242, + "rewards/rejected": -13.439955393473307, + "step": 1956 + }, + { + "epoch": 0.5363848156776757, + "grad_norm": 9.0625, + "kl": 0.36511868238449097, + "learning_rate": 5e-06, + "logits/chosen": -22393760.0, + "logits/rejected": 152436.46153846153, + "logps/chosen": -510.8670543323864, + "logps/rejected": -721.4514723557693, + "loss": 0.0253, + "rewards/chosen": 7.67048436945135, + "rewards/margins": 23.693743458994618, + "rewards/rejected": -16.02325908954327, + "step": 1957 + }, + { + "epoch": 0.5366589009181856, + "grad_norm": 4.375, + "kl": 2.56642484664917, + "learning_rate": 5e-06, + "logits/chosen": -19478350.933333334, + "logits/rejected": -36351690.666666664, + "logps/chosen": -405.8378580729167, + "logps/rejected": -428.1500651041667, + "loss": 0.027, + "rewards/chosen": 6.3802032470703125, + "rewards/margins": 15.22413804796007, + "rewards/rejected": -8.843934800889757, + "step": 1958 + }, + { + "epoch": 0.5369329861586953, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8575765.333333334, + "logits/rejected": -25319496.533333335, + "logps/chosen": -368.2090115017361, + "logps/rejected": -476.01263020833335, + "loss": 0.0081, + "rewards/chosen": 5.872817569308811, + "rewards/margins": 15.252132754855687, + "rewards/rejected": -9.379315185546876, + "step": 1959 + }, + { + "epoch": 0.5372070713992052, + "grad_norm": 6.90625, + "kl": 1.8429229259490967, + "learning_rate": 5e-06, + "logits/chosen": -1284690.1818181819, + "logits/rejected": -15751155.692307692, + "logps/chosen": -451.19802024147725, + "logps/rejected": -373.9114332932692, + "loss": 0.0369, + "rewards/chosen": 6.247333873401988, + "rewards/margins": 14.818925764177228, + "rewards/rejected": -8.57159189077524, + "step": 1960 + }, + { + "epoch": 0.537481156639715, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40356059.07692308, + "logits/rejected": -21112907.636363637, + "logps/chosen": -524.7218299278846, + "logps/rejected": -496.6012073863636, + "loss": 0.0348, + "rewards/chosen": 7.799296452448918, + "rewards/margins": 18.420734218784144, + "rewards/rejected": -10.621437766335227, + "step": 1961 + }, + { + "epoch": 0.5377552418802247, + "grad_norm": 11.0, + "kl": 7.162234306335449, + "learning_rate": 5e-06, + "logits/chosen": -36175218.666666664, + "logits/rejected": -24737245.333333332, + "logps/chosen": -452.5108642578125, + "logps/rejected": -549.0267740885416, + "loss": 0.0468, + "rewards/chosen": 6.665901819864909, + "rewards/margins": 18.578575770060223, + "rewards/rejected": -11.912673950195312, + "step": 1962 + }, + { + "epoch": 0.5380293271207346, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21129294.769230768, + "logits/rejected": -32269352.727272727, + "logps/chosen": -402.28564453125, + "logps/rejected": -435.9655095880682, + "loss": 0.0091, + "rewards/chosen": 6.376314016488882, + "rewards/margins": 14.574995934546411, + "rewards/rejected": -8.19868191805753, + "step": 1963 + }, + { + "epoch": 0.5383034123612443, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11381751.384615384, + "logits/rejected": -7538855.2727272725, + "logps/chosen": -333.07970252403845, + "logps/rejected": -594.9216086647727, + "loss": 0.0314, + "rewards/chosen": 5.624699519230769, + "rewards/margins": 19.437643411276223, + "rewards/rejected": -13.812943892045455, + "step": 1964 + }, + { + "epoch": 0.5385774976017541, + "grad_norm": 5.0, + "kl": 1.2152055501937866, + "learning_rate": 5e-06, + "logits/chosen": 9866650.666666666, + "logits/rejected": -31132634.666666668, + "logps/chosen": -477.2919921875, + "logps/rejected": -482.5303548177083, + "loss": 0.0212, + "rewards/chosen": 6.407098134358724, + "rewards/margins": 17.715740203857422, + "rewards/rejected": -11.308642069498697, + "step": 1965 + }, + { + "epoch": 0.538851582842264, + "grad_norm": 6.4375, + "kl": 0.7388496398925781, + "learning_rate": 5e-06, + "logits/chosen": -21367293.714285713, + "logits/rejected": -12239399.2, + "logps/chosen": -320.51621791294644, + "logps/rejected": -622.8015625, + "loss": 0.0247, + "rewards/chosen": 5.26696286882673, + "rewards/margins": 19.26608396257673, + "rewards/rejected": -13.99912109375, + "step": 1966 + }, + { + "epoch": 0.5391256680827737, + "grad_norm": 1.7578125, + "kl": 2.1680362224578857, + "learning_rate": 5e-06, + "logits/chosen": -40532710.4, + "logits/rejected": 2849061.4285714286, + "logps/chosen": -467.892724609375, + "logps/rejected": -481.31417410714283, + "loss": 0.006, + "rewards/chosen": 6.896630096435547, + "rewards/margins": 14.979842267717633, + "rewards/rejected": -8.083212171282087, + "step": 1967 + }, + { + "epoch": 0.5393997533232835, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23783382.153846152, + "logits/rejected": 18659864.727272727, + "logps/chosen": -418.6029522235577, + "logps/rejected": -606.7215909090909, + "loss": 0.0373, + "rewards/chosen": 6.234024634728065, + "rewards/margins": 15.815543328131827, + "rewards/rejected": -9.581518693403764, + "step": 1968 + }, + { + "epoch": 0.5396738385637934, + "grad_norm": 4.4375, + "kl": 1.3812065124511719, + "learning_rate": 5e-06, + "logits/chosen": -47369002.666666664, + "logits/rejected": -20478418.666666668, + "logps/chosen": -379.4788818359375, + "logps/rejected": -658.6854654947916, + "loss": 0.0333, + "rewards/chosen": 5.3607133229573565, + "rewards/margins": 19.109718958536785, + "rewards/rejected": -13.749005635579428, + "step": 1969 + }, + { + "epoch": 0.5399479238043031, + "grad_norm": 8.4375, + "kl": 6.053230285644531, + "learning_rate": 5e-06, + "logits/chosen": -27791926.153846152, + "logits/rejected": -29703060.363636363, + "logps/chosen": -360.7521784855769, + "logps/rejected": -623.2814719460227, + "loss": 0.056, + "rewards/chosen": 6.214083158052885, + "rewards/margins": 20.021564403613965, + "rewards/rejected": -13.80748124556108, + "step": 1970 + }, + { + "epoch": 0.540222009044813, + "grad_norm": 5.125, + "kl": 0.6773898005485535, + "learning_rate": 5e-06, + "logits/chosen": 22986106.181818184, + "logits/rejected": -23007069.53846154, + "logps/chosen": -389.48251065340907, + "logps/rejected": -547.5648287259615, + "loss": 0.0172, + "rewards/chosen": 5.322064139626243, + "rewards/margins": 17.380810477516867, + "rewards/rejected": -12.058746337890625, + "step": 1971 + }, + { + "epoch": 0.5404960942853227, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9449712.0, + "logits/rejected": -15105815.272727273, + "logps/chosen": -453.7942457932692, + "logps/rejected": -706.8162286931819, + "loss": 0.0098, + "rewards/chosen": 7.13252199613131, + "rewards/margins": 20.995398194639833, + "rewards/rejected": -13.862876198508523, + "step": 1972 + }, + { + "epoch": 0.5407701795258325, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7216758.153846154, + "logits/rejected": -21023970.90909091, + "logps/chosen": -274.46762319711536, + "logps/rejected": -503.69429154829544, + "loss": 0.0707, + "rewards/chosen": 5.892066368689904, + "rewards/margins": 16.26732992959189, + "rewards/rejected": -10.375263560901988, + "step": 1973 + }, + { + "epoch": 0.5410442647663424, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27627069.09090909, + "logits/rejected": 5981154.461538462, + "logps/chosen": -419.23317649147725, + "logps/rejected": -548.1864483173077, + "loss": 0.0443, + "rewards/chosen": 5.88132754239169, + "rewards/margins": 15.616172043593615, + "rewards/rejected": -9.734844501201923, + "step": 1974 + }, + { + "epoch": 0.5413183500068521, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36841524.36363637, + "logits/rejected": 17826752.0, + "logps/chosen": -500.71342329545456, + "logps/rejected": -546.9864783653846, + "loss": 0.0147, + "rewards/chosen": 7.060117548162287, + "rewards/margins": 16.811374637630436, + "rewards/rejected": -9.75125708946815, + "step": 1975 + }, + { + "epoch": 0.5415924352473619, + "grad_norm": 6.03125, + "kl": 1.3003578186035156, + "learning_rate": 5e-06, + "logits/chosen": -12279453.538461538, + "logits/rejected": -35798190.54545455, + "logps/chosen": -446.8046123798077, + "logps/rejected": -557.5252574573864, + "loss": 0.023, + "rewards/chosen": 6.842075054462139, + "rewards/margins": 17.88008677876079, + "rewards/rejected": -11.03801172429865, + "step": 1976 + }, + { + "epoch": 0.5418665204878718, + "grad_norm": 6.09375, + "kl": 2.4943742752075195, + "learning_rate": 5e-06, + "logits/chosen": -12445145.6, + "logits/rejected": -22604787.555555556, + "logps/chosen": -353.2705403645833, + "logps/rejected": -546.7139756944445, + "loss": 0.0894, + "rewards/chosen": 5.2796071370442705, + "rewards/margins": 15.434792412651909, + "rewards/rejected": -10.15518527560764, + "step": 1977 + }, + { + "epoch": 0.5421406057283815, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16530350.857142856, + "logits/rejected": 4020416.0, + "logps/chosen": -453.28421456473217, + "logps/rejected": -452.41259765625, + "loss": 0.0179, + "rewards/chosen": 6.58710697719029, + "rewards/margins": 16.03478306361607, + "rewards/rejected": -9.447676086425782, + "step": 1978 + }, + { + "epoch": 0.5424146909688913, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22152110.769230768, + "logits/rejected": -3269194.909090909, + "logps/chosen": -485.97964242788464, + "logps/rejected": -476.9859730113636, + "loss": 0.0162, + "rewards/chosen": 5.829129732572115, + "rewards/margins": 16.611796025629644, + "rewards/rejected": -10.78266629305753, + "step": 1979 + }, + { + "epoch": 0.5426887762094011, + "grad_norm": 0.1123046875, + "kl": 0.35151419043540955, + "learning_rate": 5e-06, + "logits/chosen": -18356870.666666668, + "logits/rejected": -36956442.666666664, + "logps/chosen": -398.2459309895833, + "logps/rejected": -637.340087890625, + "loss": 0.0003, + "rewards/chosen": 8.705663681030273, + "rewards/margins": 21.358545303344727, + "rewards/rejected": -12.652881622314453, + "step": 1980 + }, + { + "epoch": 0.5429628614499109, + "grad_norm": 1.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34858764.8, + "logits/rejected": -28057394.285714287, + "logps/chosen": -360.54296875, + "logps/rejected": -565.1248953683036, + "loss": 0.0052, + "rewards/chosen": 7.013519287109375, + "rewards/margins": 17.836090087890625, + "rewards/rejected": -10.82257080078125, + "step": 1981 + }, + { + "epoch": 0.5432369466904208, + "grad_norm": 8.3125, + "kl": 0.17630133032798767, + "learning_rate": 5e-06, + "logits/chosen": -20594290.90909091, + "logits/rejected": -18691708.307692308, + "logps/chosen": -545.8963955965909, + "logps/rejected": -511.82534555288464, + "loss": 0.0483, + "rewards/chosen": 7.334488608620384, + "rewards/margins": 16.91841077471113, + "rewards/rejected": -9.583922166090746, + "step": 1982 + }, + { + "epoch": 0.5435110319309305, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7958555.636363637, + "logits/rejected": -35194158.76923077, + "logps/chosen": -424.89102450284093, + "logps/rejected": -511.74658203125, + "loss": 0.0138, + "rewards/chosen": 6.107039711692116, + "rewards/margins": 19.03239958436339, + "rewards/rejected": -12.925359872671274, + "step": 1983 + }, + { + "epoch": 0.5437851171714403, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24852425.846153848, + "logits/rejected": -12955815.272727273, + "logps/chosen": -378.79916616586536, + "logps/rejected": -704.1368075284091, + "loss": 0.0153, + "rewards/chosen": 6.468044574444111, + "rewards/margins": 18.95840795556982, + "rewards/rejected": -12.49036338112571, + "step": 1984 + }, + { + "epoch": 0.5440592024119502, + "grad_norm": 5.9375, + "kl": 9.188819885253906, + "learning_rate": 5e-06, + "logits/chosen": -13647424.0, + "logits/rejected": 72262636.8, + "logps/chosen": -400.63058035714283, + "logps/rejected": -543.56875, + "loss": 0.0523, + "rewards/chosen": 6.1377378191266745, + "rewards/margins": 18.45145775931222, + "rewards/rejected": -12.313719940185546, + "step": 1985 + }, + { + "epoch": 0.5443332876524599, + "grad_norm": 5.96875, + "kl": 6.900744438171387, + "learning_rate": 5e-06, + "logits/chosen": -34747333.333333336, + "logits/rejected": -11934705.333333334, + "logps/chosen": -473.878173828125, + "logps/rejected": -595.72265625, + "loss": 0.0217, + "rewards/chosen": 7.693979263305664, + "rewards/margins": 18.66578229268392, + "rewards/rejected": -10.971803029378256, + "step": 1986 + }, + { + "epoch": 0.5446073728929697, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33546022.4, + "logits/rejected": -29638073.14285714, + "logps/chosen": -390.997705078125, + "logps/rejected": -566.0798688616071, + "loss": 0.0331, + "rewards/chosen": 5.299746704101563, + "rewards/margins": 16.321087210518975, + "rewards/rejected": -11.021340506417411, + "step": 1987 + }, + { + "epoch": 0.5448814581334795, + "grad_norm": 6.5625, + "kl": 0.05865923687815666, + "learning_rate": 5e-06, + "logits/chosen": -9310103.2, + "logits/rejected": -24667369.14285714, + "logps/chosen": -361.86455078125, + "logps/rejected": -651.2667410714286, + "loss": 0.0355, + "rewards/chosen": 6.918270874023437, + "rewards/margins": 19.15074506487165, + "rewards/rejected": -12.232474190848214, + "step": 1988 + }, + { + "epoch": 0.5451555433739893, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20576426.0, + "logits/rejected": -31373104.0, + "logps/chosen": -421.66552734375, + "logps/rejected": -472.98565673828125, + "loss": 0.0588, + "rewards/chosen": 6.403536319732666, + "rewards/margins": 16.577624797821045, + "rewards/rejected": -10.174088478088379, + "step": 1989 + }, + { + "epoch": 0.5454296286144991, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20569653.333333332, + "logits/rejected": -8977031.466666667, + "logps/chosen": -411.32318793402777, + "logps/rejected": -477.9525390625, + "loss": 0.0233, + "rewards/chosen": 5.814556545681423, + "rewards/margins": 13.878995429144965, + "rewards/rejected": -8.064438883463541, + "step": 1990 + }, + { + "epoch": 0.5457037138550089, + "grad_norm": 12.4375, + "kl": 3.1332836151123047, + "learning_rate": 5e-06, + "logits/chosen": -24891572.0, + "logits/rejected": -27789712.0, + "logps/chosen": -382.2705078125, + "logps/rejected": -617.1807861328125, + "loss": 0.0455, + "rewards/chosen": 5.094465255737305, + "rewards/margins": 21.263795852661133, + "rewards/rejected": -16.169330596923828, + "step": 1991 + }, + { + "epoch": 0.5459777990955187, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28403353.14285714, + "logits/rejected": -28005522.82352941, + "logps/chosen": -304.4720982142857, + "logps/rejected": -530.8615004595588, + "loss": 0.0065, + "rewards/chosen": 7.455472128731864, + "rewards/margins": 19.077648355179473, + "rewards/rejected": -11.62217622644761, + "step": 1992 + }, + { + "epoch": 0.5462518843360286, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -440609.5, + "logits/rejected": -20984838.0, + "logps/chosen": -441.7107849121094, + "logps/rejected": -491.2418212890625, + "loss": 0.0166, + "rewards/chosen": 6.9824323654174805, + "rewards/margins": 18.423850059509277, + "rewards/rejected": -11.441417694091797, + "step": 1993 + }, + { + "epoch": 0.5465259695765383, + "grad_norm": 11.0, + "kl": 3.999202251434326, + "learning_rate": 5e-06, + "logits/chosen": -5941496.0, + "logits/rejected": -24071390.4, + "logps/chosen": -448.23228236607144, + "logps/rejected": -404.8801513671875, + "loss": 0.0358, + "rewards/chosen": 7.237390790666852, + "rewards/margins": 15.328091321672712, + "rewards/rejected": -8.09070053100586, + "step": 1994 + }, + { + "epoch": 0.5468000548170481, + "grad_norm": 6.46875, + "kl": 4.966086387634277, + "learning_rate": 5e-06, + "logits/chosen": -24066023.272727273, + "logits/rejected": -24364711.384615384, + "logps/chosen": -440.66170987215907, + "logps/rejected": -506.0388371394231, + "loss": 0.0179, + "rewards/chosen": 7.449176441539418, + "rewards/margins": 17.440627998405404, + "rewards/rejected": -9.991451556865986, + "step": 1995 + }, + { + "epoch": 0.547074140057558, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11662485.818181818, + "logits/rejected": -16985289.846153848, + "logps/chosen": -391.37935014204544, + "logps/rejected": -596.7431640625, + "loss": 0.0665, + "rewards/chosen": 5.484852183948863, + "rewards/margins": 20.324636072545616, + "rewards/rejected": -14.839783888596754, + "step": 1996 + }, + { + "epoch": 0.5473482252980677, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1351426.6666666667, + "logits/rejected": 6659866.133333334, + "logps/chosen": -288.8828125, + "logps/rejected": -631.4895182291667, + "loss": 0.0344, + "rewards/chosen": 6.364362080891927, + "rewards/margins": 16.854852803548177, + "rewards/rejected": -10.49049072265625, + "step": 1997 + }, + { + "epoch": 0.5476223105385775, + "grad_norm": 3.65625, + "kl": 1.9465503692626953, + "learning_rate": 5e-06, + "logits/chosen": -40519776.0, + "logits/rejected": -36212230.85714286, + "logps/chosen": -380.105908203125, + "logps/rejected": -549.4310477120536, + "loss": 0.024, + "rewards/chosen": 6.667947387695312, + "rewards/margins": 19.87635192871094, + "rewards/rejected": -13.208404541015625, + "step": 1998 + }, + { + "epoch": 0.5478963957790873, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6837592.533333333, + "logits/rejected": -33303793.777777776, + "logps/chosen": -413.36399739583334, + "logps/rejected": -518.0972764756945, + "loss": 0.0462, + "rewards/chosen": 5.534806315104166, + "rewards/margins": 14.876236131456164, + "rewards/rejected": -9.341429816351997, + "step": 1999 + }, + { + "epoch": 0.5481704810195971, + "grad_norm": 12.4375, + "kl": 3.919995069503784, + "learning_rate": 5e-06, + "logits/chosen": 21176918.153846152, + "logits/rejected": -24603816.727272727, + "logps/chosen": -388.97171724759613, + "logps/rejected": -407.07328657670456, + "loss": 0.0379, + "rewards/chosen": 6.078194251427283, + "rewards/margins": 14.486630419751148, + "rewards/rejected": -8.408436168323863, + "step": 2000 + }, + { + "epoch": 0.5484445662601068, + "grad_norm": 8.5, + "kl": 2.1002821922302246, + "learning_rate": 5e-06, + "logits/chosen": -22748801.88235294, + "logits/rejected": -8577352.57142857, + "logps/chosen": -419.35765165441177, + "logps/rejected": -586.9787946428571, + "loss": 0.0304, + "rewards/chosen": 6.951938853544347, + "rewards/margins": 16.81557977500082, + "rewards/rejected": -9.863640921456474, + "step": 2001 + }, + { + "epoch": 0.5487186515006167, + "grad_norm": 3.71875, + "kl": 4.389316558837891, + "learning_rate": 5e-06, + "logits/chosen": -14304073.142857144, + "logits/rejected": -33064582.4, + "logps/chosen": -508.11251395089283, + "logps/rejected": -597.62529296875, + "loss": 0.014, + "rewards/chosen": 7.789364950997489, + "rewards/margins": 21.08175997052874, + "rewards/rejected": -13.29239501953125, + "step": 2002 + }, + { + "epoch": 0.5489927367411265, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24377016.727272727, + "logits/rejected": -21745846.153846152, + "logps/chosen": -594.9388760653409, + "logps/rejected": -512.9338566706731, + "loss": 0.0375, + "rewards/chosen": 6.956851612437855, + "rewards/margins": 20.297070963399392, + "rewards/rejected": -13.340219350961538, + "step": 2003 + }, + { + "epoch": 0.5492668219816363, + "grad_norm": 14.3125, + "kl": 12.712512969970703, + "learning_rate": 5e-06, + "logits/chosen": -41771628.307692304, + "logits/rejected": -28309992.727272727, + "logps/chosen": -453.0762469951923, + "logps/rejected": -446.5817205255682, + "loss": 0.1082, + "rewards/chosen": 7.46902583195613, + "rewards/margins": 17.120673919891143, + "rewards/rejected": -9.651648087935014, + "step": 2004 + }, + { + "epoch": 0.5495409072221461, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28413352.0, + "logits/rejected": -29819712.0, + "logps/chosen": -526.0605061848959, + "logps/rejected": -534.8649088541666, + "loss": 0.0211, + "rewards/chosen": 7.336350758870442, + "rewards/margins": 18.21841557820638, + "rewards/rejected": -10.882064819335938, + "step": 2005 + }, + { + "epoch": 0.5498149924626559, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29634304.0, + "logits/rejected": -17759052.0, + "logps/chosen": -474.9342956542969, + "logps/rejected": -510.7363586425781, + "loss": 0.0172, + "rewards/chosen": 7.7972307205200195, + "rewards/margins": 18.031460762023926, + "rewards/rejected": -10.234230041503906, + "step": 2006 + }, + { + "epoch": 0.5500890777031657, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43483120.0, + "logits/rejected": -11455236.0, + "logps/chosen": -513.032470703125, + "logps/rejected": -562.996337890625, + "loss": 0.0064, + "rewards/chosen": 6.053489685058594, + "rewards/margins": 16.30825901031494, + "rewards/rejected": -10.254769325256348, + "step": 2007 + }, + { + "epoch": 0.5503631629436755, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6623742.545454546, + "logits/rejected": -11147616.0, + "logps/chosen": -376.6277521306818, + "logps/rejected": -580.9522986778846, + "loss": 0.0254, + "rewards/chosen": 6.469759160822088, + "rewards/margins": 18.07069957172954, + "rewards/rejected": -11.600940410907452, + "step": 2008 + }, + { + "epoch": 0.5506372481841852, + "grad_norm": 12.6875, + "kl": 17.819456100463867, + "learning_rate": 5e-06, + "logits/chosen": -24232265.14285714, + "logits/rejected": -24391321.6, + "logps/chosen": -604.0933314732143, + "logps/rejected": -676.730712890625, + "loss": 0.036, + "rewards/chosen": 7.188533782958984, + "rewards/margins": 19.157642364501953, + "rewards/rejected": -11.969108581542969, + "step": 2009 + }, + { + "epoch": 0.5509113334246951, + "grad_norm": 3.375, + "kl": 3.971749782562256, + "learning_rate": 5e-06, + "logits/chosen": -17507332.363636363, + "logits/rejected": -15790077.538461538, + "logps/chosen": -454.14501953125, + "logps/rejected": -440.35648287259613, + "loss": 0.0503, + "rewards/chosen": 5.758055253462358, + "rewards/margins": 14.230697418426299, + "rewards/rejected": -8.472642164963942, + "step": 2010 + }, + { + "epoch": 0.5511854186652049, + "grad_norm": 2.3125, + "kl": 1.3431930541992188, + "learning_rate": 5e-06, + "logits/chosen": -9262873.142857144, + "logits/rejected": -17143435.2, + "logps/chosen": -444.26988002232144, + "logps/rejected": -467.150537109375, + "loss": 0.0283, + "rewards/chosen": 6.786708286830357, + "rewards/margins": 15.7215822492327, + "rewards/rejected": -8.934873962402344, + "step": 2011 + }, + { + "epoch": 0.5514595039057146, + "grad_norm": 6.1875, + "kl": 1.2001241445541382, + "learning_rate": 5e-06, + "logits/chosen": -4275160.533333333, + "logits/rejected": -9279510.222222222, + "logps/chosen": -471.874609375, + "logps/rejected": -373.954833984375, + "loss": 0.0595, + "rewards/chosen": 5.795854187011718, + "rewards/margins": 12.48376702202691, + "rewards/rejected": -6.687912835015191, + "step": 2012 + }, + { + "epoch": 0.5517335891462245, + "grad_norm": 9.9375, + "kl": 5.4280548095703125, + "learning_rate": 5e-06, + "logits/chosen": -18144305.230769232, + "logits/rejected": -11201975.272727273, + "logps/chosen": -494.38912259615387, + "logps/rejected": -517.2211026278409, + "loss": 0.0742, + "rewards/chosen": 8.248563913198618, + "rewards/margins": 15.803231299340307, + "rewards/rejected": -7.55466738614169, + "step": 2013 + }, + { + "epoch": 0.5520076743867343, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29858752.0, + "logits/rejected": -929911.5294117647, + "logps/chosen": -345.7225864955357, + "logps/rejected": -532.9274471507352, + "loss": 0.0208, + "rewards/chosen": 7.510665348597935, + "rewards/margins": 18.788842946541408, + "rewards/rejected": -11.278177597943474, + "step": 2014 + }, + { + "epoch": 0.552281759627244, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18150323.692307692, + "logits/rejected": -4917349.818181818, + "logps/chosen": -404.5233623798077, + "logps/rejected": -608.3477450284091, + "loss": 0.0046, + "rewards/chosen": 7.727083646334135, + "rewards/margins": 22.257857849547914, + "rewards/rejected": -14.53077420321378, + "step": 2015 + }, + { + "epoch": 0.5525558448677539, + "grad_norm": 5.6875, + "kl": 4.643553733825684, + "learning_rate": 5e-06, + "logits/chosen": -9903636.0, + "logits/rejected": 1795622.6666666667, + "logps/chosen": -358.3850504557292, + "logps/rejected": -513.1094156901041, + "loss": 0.0391, + "rewards/chosen": 6.256624857584636, + "rewards/margins": 18.909868876139324, + "rewards/rejected": -12.653244018554688, + "step": 2016 + }, + { + "epoch": 0.5528299301082636, + "grad_norm": 5.625, + "kl": 3.7463455200195312, + "learning_rate": 5e-06, + "logits/chosen": -17307492.57142857, + "logits/rejected": -16076336.0, + "logps/chosen": -525.8926478794643, + "logps/rejected": -378.65205078125, + "loss": 0.022, + "rewards/chosen": 7.165495736258371, + "rewards/margins": 17.155009896414622, + "rewards/rejected": -9.98951416015625, + "step": 2017 + }, + { + "epoch": 0.5531040153487735, + "grad_norm": 5.40625, + "kl": 10.359808921813965, + "learning_rate": 5e-06, + "logits/chosen": -24406514.82352941, + "logits/rejected": -8153515.428571428, + "logps/chosen": -492.51953125, + "logps/rejected": -519.904296875, + "loss": 0.0151, + "rewards/chosen": 7.565406350528493, + "rewards/margins": 16.806045083438647, + "rewards/rejected": -9.240638732910156, + "step": 2018 + }, + { + "epoch": 0.5533781005892833, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19840677.333333332, + "logits/rejected": -22812433.066666666, + "logps/chosen": -363.25718858506946, + "logps/rejected": -567.8939453125, + "loss": 0.0436, + "rewards/chosen": 6.060742696126302, + "rewards/margins": 18.126434834798175, + "rewards/rejected": -12.065692138671874, + "step": 2019 + }, + { + "epoch": 0.553652185829793, + "grad_norm": 8.5, + "kl": 1.388490080833435, + "learning_rate": 5e-06, + "logits/chosen": 31485188.923076924, + "logits/rejected": -19169920.0, + "logps/chosen": -415.6407001201923, + "logps/rejected": -445.01318359375, + "loss": 0.0273, + "rewards/chosen": 6.720120943509615, + "rewards/margins": 15.20251230093149, + "rewards/rejected": -8.482391357421875, + "step": 2020 + }, + { + "epoch": 0.5539262710703029, + "grad_norm": 9.0625, + "kl": 7.324720859527588, + "learning_rate": 5e-06, + "logits/chosen": -26734882.46153846, + "logits/rejected": -10638718.545454545, + "logps/chosen": -443.71029897836536, + "logps/rejected": -546.6664595170455, + "loss": 0.0713, + "rewards/chosen": 6.382366473858173, + "rewards/margins": 18.598412893868826, + "rewards/rejected": -12.216046420010654, + "step": 2021 + }, + { + "epoch": 0.5542003563108127, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39982300.44444445, + "logits/rejected": -11128388.266666668, + "logps/chosen": -506.1388346354167, + "logps/rejected": -546.3864583333333, + "loss": 0.0129, + "rewards/chosen": 8.745025634765625, + "rewards/margins": 18.563565063476563, + "rewards/rejected": -9.818539428710938, + "step": 2022 + }, + { + "epoch": 0.5544744415513224, + "grad_norm": 7.8125, + "kl": 2.730870008468628, + "learning_rate": 5e-06, + "logits/chosen": -18940093.53846154, + "logits/rejected": 8162820.363636363, + "logps/chosen": -369.37439903846155, + "logps/rejected": -807.0591264204545, + "loss": 0.0254, + "rewards/chosen": 7.690899188701923, + "rewards/margins": 21.1404536320613, + "rewards/rejected": -13.449554443359375, + "step": 2023 + }, + { + "epoch": 0.5547485267918323, + "grad_norm": 4.46875, + "kl": 3.093989849090576, + "learning_rate": 5e-06, + "logits/chosen": 12762149.333333334, + "logits/rejected": -33797077.333333336, + "logps/chosen": -398.7218831380208, + "logps/rejected": -507.0123697916667, + "loss": 0.0152, + "rewards/chosen": 6.406976064046224, + "rewards/margins": 15.745165506998699, + "rewards/rejected": -9.338189442952475, + "step": 2024 + }, + { + "epoch": 0.555022612032342, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2560317.8181818184, + "logits/rejected": -18956657.230769232, + "logps/chosen": -377.05868252840907, + "logps/rejected": -517.3298527644231, + "loss": 0.082, + "rewards/chosen": 4.940584009343928, + "rewards/margins": 15.305813689331908, + "rewards/rejected": -10.36522967998798, + "step": 2025 + }, + { + "epoch": 0.5552966972728518, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9723437.6, + "logits/rejected": -33313435.42857143, + "logps/chosen": -350.398291015625, + "logps/rejected": -626.0630580357143, + "loss": 0.0358, + "rewards/chosen": 5.316434097290039, + "rewards/margins": 21.1143248966762, + "rewards/rejected": -15.797890799386161, + "step": 2026 + }, + { + "epoch": 0.5555707825133617, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6109403.5, + "logits/rejected": -20219062.0, + "logps/chosen": -344.7581787109375, + "logps/rejected": -506.08123779296875, + "loss": 0.0213, + "rewards/chosen": 7.274290084838867, + "rewards/margins": 16.290401458740234, + "rewards/rejected": -9.016111373901367, + "step": 2027 + }, + { + "epoch": 0.5558448677538714, + "grad_norm": 9.5625, + "kl": 12.859132766723633, + "learning_rate": 5e-06, + "logits/chosen": -18913673.846153848, + "logits/rejected": -5219924.363636363, + "logps/chosen": -413.19302133413464, + "logps/rejected": -483.62717507102275, + "loss": 0.0525, + "rewards/chosen": 6.9937896728515625, + "rewards/margins": 17.36113947088068, + "rewards/rejected": -10.36734979802912, + "step": 2028 + }, + { + "epoch": 0.5561189529943813, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33190821.818181816, + "logits/rejected": -16883672.615384616, + "logps/chosen": -414.34761186079544, + "logps/rejected": -689.8000300480769, + "loss": 0.0071, + "rewards/chosen": 6.761186773126775, + "rewards/margins": 19.368177880774013, + "rewards/rejected": -12.606991107647236, + "step": 2029 + }, + { + "epoch": 0.5563930382348911, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21189642.666666668, + "logits/rejected": -21940536.0, + "logps/chosen": -425.83642578125, + "logps/rejected": -564.0718180338541, + "loss": 0.015, + "rewards/chosen": 5.437908172607422, + "rewards/margins": 16.38923772176107, + "rewards/rejected": -10.951329549153646, + "step": 2030 + }, + { + "epoch": 0.5566671234754008, + "grad_norm": 2.59375, + "kl": 3.8025107383728027, + "learning_rate": 5e-06, + "logits/chosen": -4889770.0, + "logits/rejected": -15182988.0, + "logps/chosen": -499.7201741536458, + "logps/rejected": -558.3907877604166, + "loss": 0.0074, + "rewards/chosen": 7.622974395751953, + "rewards/margins": 18.72271728515625, + "rewards/rejected": -11.099742889404297, + "step": 2031 + }, + { + "epoch": 0.5569412087159107, + "grad_norm": 3.421875, + "kl": 1.4474284648895264, + "learning_rate": 5e-06, + "logits/chosen": -18885019.636363637, + "logits/rejected": -18024500.923076924, + "logps/chosen": -382.92911044034093, + "logps/rejected": -554.4886568509615, + "loss": 0.0225, + "rewards/chosen": 7.239145452325994, + "rewards/margins": 17.393023297503277, + "rewards/rejected": -10.153877845177284, + "step": 2032 + }, + { + "epoch": 0.5572152939564204, + "grad_norm": 9.8125, + "kl": 5.306757926940918, + "learning_rate": 5e-06, + "logits/chosen": 23703492.923076924, + "logits/rejected": -10440535.272727273, + "logps/chosen": -468.0413161057692, + "logps/rejected": -440.60258345170456, + "loss": 0.0296, + "rewards/chosen": 7.113986088679387, + "rewards/margins": 17.28528680334558, + "rewards/rejected": -10.171300714666193, + "step": 2033 + }, + { + "epoch": 0.5574893791969302, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 19875656.888888888, + "logits/rejected": -25590792.533333335, + "logps/chosen": -424.5439724392361, + "logps/rejected": -533.959375, + "loss": 0.0198, + "rewards/chosen": 6.3433727688259545, + "rewards/margins": 16.014266120062935, + "rewards/rejected": -9.670893351236979, + "step": 2034 + }, + { + "epoch": 0.5577634644374401, + "grad_norm": 7.125, + "kl": 7.219174385070801, + "learning_rate": 5e-06, + "logits/chosen": -11531810.461538462, + "logits/rejected": -7196197.090909091, + "logps/chosen": -371.02869591346155, + "logps/rejected": -471.3123668323864, + "loss": 0.0275, + "rewards/chosen": 6.1788189227764425, + "rewards/margins": 15.091445015860604, + "rewards/rejected": -8.912626093084162, + "step": 2035 + }, + { + "epoch": 0.5580375496779498, + "grad_norm": 7.6875, + "kl": 5.4024658203125, + "learning_rate": 5e-06, + "logits/chosen": -20943251.2, + "logits/rejected": -13999336.888888888, + "logps/chosen": -378.15514322916664, + "logps/rejected": -547.5056423611111, + "loss": 0.0279, + "rewards/chosen": 6.889598592122396, + "rewards/margins": 18.63487854003906, + "rewards/rejected": -11.745279947916666, + "step": 2036 + }, + { + "epoch": 0.5583116349184596, + "grad_norm": 2.234375, + "kl": 2.8902359008789062, + "learning_rate": 5e-06, + "logits/chosen": 19320336.0, + "logits/rejected": -30016698.666666668, + "logps/chosen": -550.89794921875, + "logps/rejected": -527.8994954427084, + "loss": 0.0066, + "rewards/chosen": 7.920032501220703, + "rewards/margins": 18.33830897013346, + "rewards/rejected": -10.41827646891276, + "step": 2037 + }, + { + "epoch": 0.5585857201589695, + "grad_norm": 11.0625, + "kl": 10.563787460327148, + "learning_rate": 5e-06, + "logits/chosen": -23710602.666666668, + "logits/rejected": -17736092.444444444, + "logps/chosen": -506.75166015625, + "logps/rejected": -555.8828667534722, + "loss": 0.0566, + "rewards/chosen": 7.844805908203125, + "rewards/margins": 20.78945041232639, + "rewards/rejected": -12.944644504123264, + "step": 2038 + }, + { + "epoch": 0.5588598053994792, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9721255.272727273, + "logits/rejected": 12484025.846153846, + "logps/chosen": -341.35091885653407, + "logps/rejected": -571.6024639423077, + "loss": 0.0076, + "rewards/chosen": 6.881196455522017, + "rewards/margins": 19.979002945906633, + "rewards/rejected": -13.097806490384615, + "step": 2039 + }, + { + "epoch": 0.5591338906399891, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5810537.714285715, + "logits/rejected": -27473395.2, + "logps/chosen": -483.91231863839283, + "logps/rejected": -432.962353515625, + "loss": 0.0181, + "rewards/chosen": 6.506486075265067, + "rewards/margins": 16.45742220197405, + "rewards/rejected": -9.950936126708985, + "step": 2040 + }, + { + "epoch": 0.5594079758804988, + "grad_norm": 7.65625, + "kl": 2.7337374687194824, + "learning_rate": 5e-06, + "logits/chosen": 3998499.6363636362, + "logits/rejected": -1312556.923076923, + "logps/chosen": -471.07266512784093, + "logps/rejected": -640.7967247596154, + "loss": 0.0238, + "rewards/chosen": 6.662114923650568, + "rewards/margins": 14.56688599486451, + "rewards/rejected": -7.9047710712139425, + "step": 2041 + }, + { + "epoch": 0.5596820611210086, + "grad_norm": 4.84375, + "kl": 0.49485844373703003, + "learning_rate": 5e-06, + "logits/chosen": -35175982.54545455, + "logits/rejected": -14944483.692307692, + "logps/chosen": -481.3671875, + "logps/rejected": -619.9342698317307, + "loss": 0.0215, + "rewards/chosen": 8.081215598366477, + "rewards/margins": 19.871400206239073, + "rewards/rejected": -11.790184607872597, + "step": 2042 + }, + { + "epoch": 0.5599561463615185, + "grad_norm": 6.0625, + "kl": 2.2590346336364746, + "learning_rate": 5e-06, + "logits/chosen": -9188406.4, + "logits/rejected": -21621028.57142857, + "logps/chosen": -465.292919921875, + "logps/rejected": -490.5645228794643, + "loss": 0.0187, + "rewards/chosen": 8.309645843505859, + "rewards/margins": 17.660303715297154, + "rewards/rejected": -9.350657871791295, + "step": 2043 + }, + { + "epoch": 0.5602302316020282, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18853852.0, + "logits/rejected": -33794260.0, + "logps/chosen": -331.5677490234375, + "logps/rejected": -444.5323791503906, + "loss": 0.0401, + "rewards/chosen": 6.604351997375488, + "rewards/margins": 14.793074607849121, + "rewards/rejected": -8.188722610473633, + "step": 2044 + }, + { + "epoch": 0.560504316842538, + "grad_norm": 13.875, + "kl": 5.342080593109131, + "learning_rate": 5e-06, + "logits/chosen": -21493942.666666668, + "logits/rejected": -9117653.333333334, + "logps/chosen": -453.4437662760417, + "logps/rejected": -510.4639078776042, + "loss": 0.0309, + "rewards/chosen": 7.207461675008138, + "rewards/margins": 16.65392239888509, + "rewards/rejected": -9.446460723876953, + "step": 2045 + }, + { + "epoch": 0.5607784020830479, + "grad_norm": 6.4375, + "kl": 2.7111170291900635, + "learning_rate": 5e-06, + "logits/chosen": -12296162.285714285, + "logits/rejected": 9307250.4, + "logps/chosen": -415.09287806919644, + "logps/rejected": -547.463623046875, + "loss": 0.0329, + "rewards/chosen": 6.7943235124860495, + "rewards/margins": 17.042554255894252, + "rewards/rejected": -10.248230743408204, + "step": 2046 + }, + { + "epoch": 0.5610524873235576, + "grad_norm": 7.84375, + "kl": 3.5440711975097656, + "learning_rate": 5e-06, + "logits/chosen": -12938108.307692308, + "logits/rejected": -20840106.181818184, + "logps/chosen": -396.99500450721155, + "logps/rejected": -411.3558238636364, + "loss": 0.0569, + "rewards/chosen": 8.58245849609375, + "rewards/margins": 15.978175076571379, + "rewards/rejected": -7.395716580477628, + "step": 2047 + }, + { + "epoch": 0.5613265725640674, + "grad_norm": 4.90625, + "kl": 0.6454347372055054, + "learning_rate": 5e-06, + "logits/chosen": -10289237.6, + "logits/rejected": -43038907.428571425, + "logps/chosen": -418.886083984375, + "logps/rejected": -506.46484375, + "loss": 0.0112, + "rewards/chosen": 6.091970062255859, + "rewards/margins": 16.380999864850725, + "rewards/rejected": -10.289029802594866, + "step": 2048 + }, + { + "epoch": 0.5616006578045772, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18708484.8, + "logits/rejected": -19295536.0, + "logps/chosen": -285.0521240234375, + "logps/rejected": -451.0864955357143, + "loss": 0.0633, + "rewards/chosen": 4.541218566894531, + "rewards/margins": 14.29973624093192, + "rewards/rejected": -9.758517674037389, + "step": 2049 + }, + { + "epoch": 0.561874743045087, + "grad_norm": 4.8125, + "kl": 7.612316131591797, + "learning_rate": 5e-06, + "logits/chosen": -22582970.181818184, + "logits/rejected": -20109723.076923076, + "logps/chosen": -361.4546564275568, + "logps/rejected": -602.7472956730769, + "loss": 0.0164, + "rewards/chosen": 6.975404912775213, + "rewards/margins": 17.735629475200092, + "rewards/rejected": -10.76022456242488, + "step": 2050 + }, + { + "epoch": 0.5621488282855969, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12632060.307692308, + "logits/rejected": -667941.7272727273, + "logps/chosen": -388.11583533653845, + "logps/rejected": -714.5502041903409, + "loss": 0.0235, + "rewards/chosen": 6.783875685471755, + "rewards/margins": 18.89615124255627, + "rewards/rejected": -12.112275557084518, + "step": 2051 + }, + { + "epoch": 0.5624229135261066, + "grad_norm": 8.8125, + "kl": 4.119449615478516, + "learning_rate": 5e-06, + "logits/chosen": -4432965.818181818, + "logits/rejected": 3089712.6153846155, + "logps/chosen": -447.39399857954544, + "logps/rejected": -507.2043269230769, + "loss": 0.0243, + "rewards/chosen": 6.984277898615057, + "rewards/margins": 16.612635419085308, + "rewards/rejected": -9.628357520470253, + "step": 2052 + }, + { + "epoch": 0.5626969987666164, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9941067.636363637, + "logits/rejected": -12247873.23076923, + "logps/chosen": -310.43430397727275, + "logps/rejected": -617.3185847355769, + "loss": 0.0368, + "rewards/chosen": 5.308880199085582, + "rewards/margins": 16.242042007979812, + "rewards/rejected": -10.93316180889423, + "step": 2053 + }, + { + "epoch": 0.5629710840071263, + "grad_norm": 5.84375, + "kl": 4.203070640563965, + "learning_rate": 5e-06, + "logits/chosen": -19880276.363636363, + "logits/rejected": -4916661.846153846, + "logps/chosen": -417.80522017045456, + "logps/rejected": -391.92397836538464, + "loss": 0.0281, + "rewards/chosen": 7.09552626176314, + "rewards/margins": 16.464615241630927, + "rewards/rejected": -9.369088979867788, + "step": 2054 + }, + { + "epoch": 0.563245169247636, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19920024.727272727, + "logits/rejected": -20132883.692307692, + "logps/chosen": -466.7145330255682, + "logps/rejected": -518.2913161057693, + "loss": 0.0155, + "rewards/chosen": 6.9775779030539775, + "rewards/margins": 19.1888135363172, + "rewards/rejected": -12.211235633263222, + "step": 2055 + }, + { + "epoch": 0.5635192544881458, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9207120.0, + "logits/rejected": -22704650.666666668, + "logps/chosen": -462.3973795572917, + "logps/rejected": -587.6271158854166, + "loss": 0.0559, + "rewards/chosen": 5.008958498636882, + "rewards/margins": 14.491937319437664, + "rewards/rejected": -9.482978820800781, + "step": 2056 + }, + { + "epoch": 0.5637933397286556, + "grad_norm": 2.3125, + "kl": 2.9892630577087402, + "learning_rate": 5e-06, + "logits/chosen": -14475642.285714285, + "logits/rejected": -9022905.6, + "logps/chosen": -412.09877232142856, + "logps/rejected": -472.3427734375, + "loss": 0.0339, + "rewards/chosen": 6.996598379952567, + "rewards/margins": 16.97983202253069, + "rewards/rejected": -9.983233642578124, + "step": 2057 + }, + { + "epoch": 0.5640674249691654, + "grad_norm": 12.3125, + "kl": 4.231632232666016, + "learning_rate": 5e-06, + "logits/chosen": -10706216.615384616, + "logits/rejected": -19072450.90909091, + "logps/chosen": -445.7146559495192, + "logps/rejected": -469.58487215909093, + "loss": 0.0562, + "rewards/chosen": 5.621434725247896, + "rewards/margins": 15.29516436170031, + "rewards/rejected": -9.673729636452414, + "step": 2058 + }, + { + "epoch": 0.5643415102096752, + "grad_norm": 1.4375, + "kl": 4.89117431640625, + "learning_rate": 5e-06, + "logits/chosen": -19071538.0, + "logits/rejected": -23976462.0, + "logps/chosen": -457.23980712890625, + "logps/rejected": -851.4812622070312, + "loss": 0.0033, + "rewards/chosen": 8.557952880859375, + "rewards/margins": 23.18563175201416, + "rewards/rejected": -14.627678871154785, + "step": 2059 + }, + { + "epoch": 0.564615595450185, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31121969.777777776, + "logits/rejected": -24147517.866666667, + "logps/chosen": -395.0246310763889, + "logps/rejected": -558.0017578125, + "loss": 0.0618, + "rewards/chosen": 6.109155442979601, + "rewards/margins": 18.30270199245877, + "rewards/rejected": -12.193546549479167, + "step": 2060 + }, + { + "epoch": 0.5648896806906948, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34704872.0, + "logits/rejected": -30612748.0, + "logps/chosen": -436.7883605957031, + "logps/rejected": -493.3774108886719, + "loss": 0.0047, + "rewards/chosen": 7.130896091461182, + "rewards/margins": 17.539700031280518, + "rewards/rejected": -10.408803939819336, + "step": 2061 + }, + { + "epoch": 0.5651637659312047, + "grad_norm": 7.46875, + "kl": 0.19741439819335938, + "learning_rate": 5e-06, + "logits/chosen": -3984182.0, + "logits/rejected": -26669512.0, + "logps/chosen": -425.657470703125, + "logps/rejected": -492.5303141276042, + "loss": 0.0271, + "rewards/chosen": 6.086368560791016, + "rewards/margins": 15.385078430175781, + "rewards/rejected": -9.298709869384766, + "step": 2062 + }, + { + "epoch": 0.5654378511717144, + "grad_norm": 6.71875, + "kl": 1.0464839935302734, + "learning_rate": 5e-06, + "logits/chosen": -34603608.0, + "logits/rejected": -4211123.666666667, + "logps/chosen": -509.798583984375, + "logps/rejected": -464.4973551432292, + "loss": 0.0664, + "rewards/chosen": 8.021881103515625, + "rewards/margins": 15.55669657389323, + "rewards/rejected": -7.5348154703776045, + "step": 2063 + }, + { + "epoch": 0.5657119364122242, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8792413.333333334, + "logits/rejected": -35298485.333333336, + "logps/chosen": -361.32752821180554, + "logps/rejected": -443.17356770833334, + "loss": 0.0392, + "rewards/chosen": 6.634291330973308, + "rewards/margins": 15.399355824788412, + "rewards/rejected": -8.765064493815105, + "step": 2064 + }, + { + "epoch": 0.565986021652734, + "grad_norm": 3.40625, + "kl": 1.7908185720443726, + "learning_rate": 5e-06, + "logits/chosen": 11494455.0, + "logits/rejected": -32297800.0, + "logps/chosen": -431.0013122558594, + "logps/rejected": -739.7415161132812, + "loss": 0.0124, + "rewards/chosen": 7.094418525695801, + "rewards/margins": 22.49496841430664, + "rewards/rejected": -15.40054988861084, + "step": 2065 + }, + { + "epoch": 0.5662601068932438, + "grad_norm": 8.0625, + "kl": 4.083518028259277, + "learning_rate": 5e-06, + "logits/chosen": -25801853.09090909, + "logits/rejected": -6457996.307692308, + "logps/chosen": -371.51493696732956, + "logps/rejected": -474.00473257211536, + "loss": 0.0415, + "rewards/chosen": 6.3538970947265625, + "rewards/margins": 16.457999596228966, + "rewards/rejected": -10.104102501502403, + "step": 2066 + }, + { + "epoch": 0.5665341921337536, + "grad_norm": 8.625, + "kl": 2.9704437255859375, + "learning_rate": 5e-06, + "logits/chosen": -19606553.6, + "logits/rejected": -21401757.714285713, + "logps/chosen": -405.70302734375, + "logps/rejected": -549.5874720982143, + "loss": 0.0247, + "rewards/chosen": 7.746845245361328, + "rewards/margins": 17.044160570417134, + "rewards/rejected": -9.297315325055804, + "step": 2067 + }, + { + "epoch": 0.5668082773742634, + "grad_norm": 6.6875, + "kl": 0.6808280944824219, + "learning_rate": 5e-06, + "logits/chosen": -33052680.0, + "logits/rejected": -13268816.0, + "logps/chosen": -448.9460856119792, + "logps/rejected": -516.218994140625, + "loss": 0.0307, + "rewards/chosen": 8.415655771891275, + "rewards/margins": 20.167142232259113, + "rewards/rejected": -11.751486460367838, + "step": 2068 + }, + { + "epoch": 0.5670823626147732, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21448922.666666668, + "logits/rejected": -17023644.444444444, + "logps/chosen": -383.3302815755208, + "logps/rejected": -648.2427300347222, + "loss": 0.0305, + "rewards/chosen": 6.589866638183594, + "rewards/margins": 19.94148678249783, + "rewards/rejected": -13.351620144314236, + "step": 2069 + }, + { + "epoch": 0.5673564478552829, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30491136.0, + "logits/rejected": 11961441.142857144, + "logps/chosen": -423.56982421875, + "logps/rejected": -480.81337193080356, + "loss": 0.0345, + "rewards/chosen": 5.8189697265625, + "rewards/margins": 14.523801531110491, + "rewards/rejected": -8.704831804547991, + "step": 2070 + }, + { + "epoch": 0.5676305330957928, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18974468.923076924, + "logits/rejected": -33562176.0, + "logps/chosen": -348.23106971153845, + "logps/rejected": -618.1946910511364, + "loss": 0.0178, + "rewards/chosen": 6.789068368765024, + "rewards/margins": 22.260890934017155, + "rewards/rejected": -15.47182256525213, + "step": 2071 + }, + { + "epoch": 0.5679046183363026, + "grad_norm": 4.03125, + "kl": 2.0583763122558594, + "learning_rate": 5e-06, + "logits/chosen": 13960800.0, + "logits/rejected": -19700612.923076924, + "logps/chosen": -386.5319158380682, + "logps/rejected": -662.6481370192307, + "loss": 0.0144, + "rewards/chosen": 7.222702719948509, + "rewards/margins": 17.918760553106562, + "rewards/rejected": -10.696057833158052, + "step": 2072 + }, + { + "epoch": 0.5681787035768124, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19122838.666666668, + "logits/rejected": -33310216.0, + "logps/chosen": -517.5096842447916, + "logps/rejected": -631.6138509114584, + "loss": 0.0046, + "rewards/chosen": 6.836645762125651, + "rewards/margins": 19.314071655273438, + "rewards/rejected": -12.477425893147787, + "step": 2073 + }, + { + "epoch": 0.5684527888173222, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23811507.2, + "logits/rejected": -18274912.0, + "logps/chosen": -460.75107421875, + "logps/rejected": -591.0448521205357, + "loss": 0.0044, + "rewards/chosen": 7.216944885253906, + "rewards/margins": 20.965812465122767, + "rewards/rejected": -13.748867579868861, + "step": 2074 + }, + { + "epoch": 0.568726874057832, + "grad_norm": 11.625, + "kl": 4.345172882080078, + "learning_rate": 5e-06, + "logits/chosen": -36191268.92307692, + "logits/rejected": -5027569.454545454, + "logps/chosen": -431.3273737980769, + "logps/rejected": -473.71360085227275, + "loss": 0.0594, + "rewards/chosen": 7.313653212327224, + "rewards/margins": 17.927016998504424, + "rewards/rejected": -10.613363786177201, + "step": 2075 + }, + { + "epoch": 0.5690009592983418, + "grad_norm": 9.6875, + "kl": 9.799956321716309, + "learning_rate": 5e-06, + "logits/chosen": -17844091.733333334, + "logits/rejected": -13699420.444444444, + "logps/chosen": -419.55286458333336, + "logps/rejected": -607.8879665798611, + "loss": 0.0767, + "rewards/chosen": 5.878955078125, + "rewards/margins": 14.941011725531684, + "rewards/rejected": -9.062056647406685, + "step": 2076 + }, + { + "epoch": 0.5692750445388516, + "grad_norm": 1.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39617417.14285714, + "logits/rejected": -21855879.529411763, + "logps/chosen": -562.5190778459821, + "logps/rejected": -596.2018612132352, + "loss": 0.004, + "rewards/chosen": 6.751523154122489, + "rewards/margins": 19.029613687210723, + "rewards/rejected": -12.278090533088236, + "step": 2077 + }, + { + "epoch": 0.5695491297793613, + "grad_norm": 8.5, + "kl": 1.5358861684799194, + "learning_rate": 5e-06, + "logits/chosen": -41172168.53333333, + "logits/rejected": -21286344.888888888, + "logps/chosen": -329.3490234375, + "logps/rejected": -472.326171875, + "loss": 0.0754, + "rewards/chosen": 6.476792907714843, + "rewards/margins": 16.300337727864584, + "rewards/rejected": -9.82354482014974, + "step": 2078 + }, + { + "epoch": 0.5698232150198712, + "grad_norm": 3.96875, + "kl": 3.060559034347534, + "learning_rate": 5e-06, + "logits/chosen": -40130934.15384615, + "logits/rejected": -6968980.363636363, + "logps/chosen": -373.3424729567308, + "logps/rejected": -508.31729403409093, + "loss": 0.0134, + "rewards/chosen": 6.35586665226863, + "rewards/margins": 15.839093081601018, + "rewards/rejected": -9.483226429332387, + "step": 2079 + }, + { + "epoch": 0.570097300260381, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8285601.777777778, + "logits/rejected": -25940654.933333334, + "logps/chosen": -478.4104275173611, + "logps/rejected": -489.6453125, + "loss": 0.0052, + "rewards/chosen": 7.346316867404514, + "rewards/margins": 18.148645358615454, + "rewards/rejected": -10.802328491210938, + "step": 2080 + }, + { + "epoch": 0.5703713855008907, + "grad_norm": 6.46875, + "kl": 6.029782772064209, + "learning_rate": 5e-06, + "logits/chosen": -20214668.8, + "logits/rejected": -27193237.333333332, + "logps/chosen": -373.4889322916667, + "logps/rejected": -357.8538411458333, + "loss": 0.0383, + "rewards/chosen": 6.0312449137369795, + "rewards/margins": 14.60493384467231, + "rewards/rejected": -8.57368893093533, + "step": 2081 + }, + { + "epoch": 0.5706454707414006, + "grad_norm": 5.625, + "kl": 3.610734462738037, + "learning_rate": 5e-06, + "logits/chosen": -30202792.0, + "logits/rejected": -22343610.0, + "logps/chosen": -422.1995849609375, + "logps/rejected": -855.8651123046875, + "loss": 0.0218, + "rewards/chosen": 6.630320072174072, + "rewards/margins": 23.159056186676025, + "rewards/rejected": -16.528736114501953, + "step": 2082 + }, + { + "epoch": 0.5709195559819104, + "grad_norm": 9.625, + "kl": 1.3122981786727905, + "learning_rate": 5e-06, + "logits/chosen": -22760862.0, + "logits/rejected": -7302757.5, + "logps/chosen": -475.8143615722656, + "logps/rejected": -486.5482177734375, + "loss": 0.0352, + "rewards/chosen": 7.386274337768555, + "rewards/margins": 19.83949565887451, + "rewards/rejected": -12.453221321105957, + "step": 2083 + }, + { + "epoch": 0.5711936412224202, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9503720.0, + "logits/rejected": -20302489.6, + "logps/chosen": -400.71620396205356, + "logps/rejected": -496.470361328125, + "loss": 0.0198, + "rewards/chosen": 7.212094988141741, + "rewards/margins": 16.57865687779018, + "rewards/rejected": -9.366561889648438, + "step": 2084 + }, + { + "epoch": 0.57146772646293, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 282051.2727272727, + "logits/rejected": -12096541.538461538, + "logps/chosen": -402.0343572443182, + "logps/rejected": -632.6154597355769, + "loss": 0.024, + "rewards/chosen": 5.618443228981712, + "rewards/margins": 17.300356164678828, + "rewards/rejected": -11.681912935697115, + "step": 2085 + }, + { + "epoch": 0.5717418117034397, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34925299.692307696, + "logits/rejected": -24780741.818181816, + "logps/chosen": -377.8694035456731, + "logps/rejected": -489.4947620738636, + "loss": 0.0164, + "rewards/chosen": 5.288227961613582, + "rewards/margins": 16.254274381624235, + "rewards/rejected": -10.966046420010654, + "step": 2086 + }, + { + "epoch": 0.5720158969439496, + "grad_norm": 10.25, + "kl": 6.493488311767578, + "learning_rate": 5e-06, + "logits/chosen": -13684820.363636363, + "logits/rejected": -18635475.692307692, + "logps/chosen": -363.1796875, + "logps/rejected": -615.6890775240385, + "loss": 0.0576, + "rewards/chosen": 6.13356503573331, + "rewards/margins": 17.485255474810835, + "rewards/rejected": -11.351690439077524, + "step": 2087 + }, + { + "epoch": 0.5722899821844594, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34483491.55555555, + "logits/rejected": -14224716.8, + "logps/chosen": -470.396484375, + "logps/rejected": -439.8736979166667, + "loss": 0.0087, + "rewards/chosen": 6.637686411539714, + "rewards/margins": 16.39895960489909, + "rewards/rejected": -9.761273193359376, + "step": 2088 + }, + { + "epoch": 0.5725640674249691, + "grad_norm": 2.8125, + "kl": 3.3315486907958984, + "learning_rate": 5e-06, + "logits/chosen": -28862205.333333332, + "logits/rejected": -21582384.0, + "logps/chosen": -408.689697265625, + "logps/rejected": -504.9444986979167, + "loss": 0.0362, + "rewards/chosen": 7.031878153483073, + "rewards/margins": 16.449883778889973, + "rewards/rejected": -9.4180056254069, + "step": 2089 + }, + { + "epoch": 0.572838152665479, + "grad_norm": 9.75, + "kl": 1.7754848003387451, + "learning_rate": 5e-06, + "logits/chosen": -10075564.0, + "logits/rejected": -10287381.714285715, + "logps/chosen": -329.882080078125, + "logps/rejected": -383.7634974888393, + "loss": 0.0283, + "rewards/chosen": 6.477745056152344, + "rewards/margins": 14.043464660644531, + "rewards/rejected": -7.5657196044921875, + "step": 2090 + }, + { + "epoch": 0.5731122379059888, + "grad_norm": 5.78125, + "kl": 4.885934829711914, + "learning_rate": 5e-06, + "logits/chosen": -25911572.0, + "logits/rejected": -42740664.0, + "logps/chosen": -470.95831298828125, + "logps/rejected": -531.4826049804688, + "loss": 0.0521, + "rewards/chosen": 6.692975044250488, + "rewards/margins": 19.405585289001465, + "rewards/rejected": -12.712610244750977, + "step": 2091 + }, + { + "epoch": 0.5733863231464985, + "grad_norm": 7.9375, + "kl": 11.036136627197266, + "learning_rate": 5e-06, + "logits/chosen": -25945320.0, + "logits/rejected": 1978273.125, + "logps/chosen": -392.59796142578125, + "logps/rejected": -461.633056640625, + "loss": 0.0564, + "rewards/chosen": 6.913478374481201, + "rewards/margins": 13.973340034484863, + "rewards/rejected": -7.059861660003662, + "step": 2092 + }, + { + "epoch": 0.5736604083870084, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32323488.0, + "logits/rejected": -23017210.666666668, + "logps/chosen": -482.9468994140625, + "logps/rejected": -504.5812174479167, + "loss": 0.025, + "rewards/chosen": 7.964707692464192, + "rewards/margins": 18.303264617919922, + "rewards/rejected": -10.338556925455729, + "step": 2093 + }, + { + "epoch": 0.5739344936275181, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21903950.222222224, + "logits/rejected": -16065969.066666666, + "logps/chosen": -369.20421006944446, + "logps/rejected": -490.06565755208334, + "loss": 0.0065, + "rewards/chosen": 7.2705841064453125, + "rewards/margins": 17.45656941731771, + "rewards/rejected": -10.185985310872395, + "step": 2094 + }, + { + "epoch": 0.574208578868028, + "grad_norm": 6.5625, + "kl": 3.5964412689208984, + "learning_rate": 5e-06, + "logits/chosen": -21120157.866666667, + "logits/rejected": -9543590.222222222, + "logps/chosen": -456.9173828125, + "logps/rejected": -478.9441189236111, + "loss": 0.0206, + "rewards/chosen": 8.04248046875, + "rewards/margins": 16.760399712456596, + "rewards/rejected": -8.717919243706596, + "step": 2095 + }, + { + "epoch": 0.5744826641085378, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 20639243.2, + "logits/rejected": -43350070.85714286, + "logps/chosen": -322.284716796875, + "logps/rejected": -423.91385323660717, + "loss": 0.0215, + "rewards/chosen": 6.927011871337891, + "rewards/margins": 15.674321964808874, + "rewards/rejected": -8.747310093470983, + "step": 2096 + }, + { + "epoch": 0.5747567493490475, + "grad_norm": 15.0625, + "kl": 9.382925987243652, + "learning_rate": 5e-06, + "logits/chosen": -29218978.0, + "logits/rejected": -32289220.0, + "logps/chosen": -440.97540283203125, + "logps/rejected": -452.5029296875, + "loss": 0.0838, + "rewards/chosen": 5.846131324768066, + "rewards/margins": 15.74048137664795, + "rewards/rejected": -9.894350051879883, + "step": 2097 + }, + { + "epoch": 0.5750308345895574, + "grad_norm": 1.9765625, + "kl": 5.98236083984375, + "learning_rate": 5e-06, + "logits/chosen": 176190.93333333332, + "logits/rejected": -41584170.666666664, + "logps/chosen": -477.080078125, + "logps/rejected": -473.56749131944446, + "loss": 0.0454, + "rewards/chosen": 7.5953725179036455, + "rewards/margins": 18.389156765407986, + "rewards/rejected": -10.793784247504341, + "step": 2098 + }, + { + "epoch": 0.5753049198300672, + "grad_norm": 10.375, + "kl": 8.216522216796875, + "learning_rate": 5e-06, + "logits/chosen": -45981928.0, + "logits/rejected": -8197049.0, + "logps/chosen": -468.54986572265625, + "logps/rejected": -440.19366455078125, + "loss": 0.0504, + "rewards/chosen": 6.64915657043457, + "rewards/margins": 17.176493644714355, + "rewards/rejected": -10.527337074279785, + "step": 2099 + }, + { + "epoch": 0.5755790050705769, + "grad_norm": 8.125, + "kl": 3.3587217330932617, + "learning_rate": 5e-06, + "logits/chosen": -4487548.0, + "logits/rejected": 2266728.8, + "logps/chosen": -311.0000697544643, + "logps/rejected": -500.89365234375, + "loss": 0.0559, + "rewards/chosen": 6.243882315499442, + "rewards/margins": 14.572942679268973, + "rewards/rejected": -8.329060363769532, + "step": 2100 + }, + { + "epoch": 0.5758530903110868, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20973284.923076924, + "logits/rejected": -31175217.454545453, + "logps/chosen": -377.38198617788464, + "logps/rejected": -736.5512251420455, + "loss": 0.0173, + "rewards/chosen": 5.325739933894231, + "rewards/margins": 21.75927126157534, + "rewards/rejected": -16.43353132768111, + "step": 2101 + }, + { + "epoch": 0.5761271755515965, + "grad_norm": 5.875, + "kl": 1.7646329402923584, + "learning_rate": 5e-06, + "logits/chosen": -31233878.153846152, + "logits/rejected": -4909754.909090909, + "logps/chosen": -432.52249849759613, + "logps/rejected": -491.00852272727275, + "loss": 0.0389, + "rewards/chosen": 7.995442903958834, + "rewards/margins": 18.519511562960965, + "rewards/rejected": -10.52406865900213, + "step": 2102 + }, + { + "epoch": 0.5764012607921063, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27826224.0, + "logits/rejected": -25210718.0, + "logps/chosen": -444.09649658203125, + "logps/rejected": -473.62554931640625, + "loss": 0.0355, + "rewards/chosen": 6.4911909103393555, + "rewards/margins": 15.71739387512207, + "rewards/rejected": -9.226202964782715, + "step": 2103 + }, + { + "epoch": 0.5766753460326162, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27304388.923076924, + "logits/rejected": 103435077.81818181, + "logps/chosen": -373.89453125, + "logps/rejected": -604.2321555397727, + "loss": 0.0325, + "rewards/chosen": 6.190062889685998, + "rewards/margins": 22.136676548244235, + "rewards/rejected": -15.946613658558238, + "step": 2104 + }, + { + "epoch": 0.5769494312731259, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42747159.27272727, + "logits/rejected": -18942505.846153848, + "logps/chosen": -512.6155450994319, + "logps/rejected": -584.7796349158654, + "loss": 0.0264, + "rewards/chosen": 6.569181268865412, + "rewards/margins": 18.834616120878636, + "rewards/rejected": -12.265434852013222, + "step": 2105 + }, + { + "epoch": 0.5772235165136358, + "grad_norm": 7.4375, + "kl": 5.894499778747559, + "learning_rate": 5e-06, + "logits/chosen": -15655228.8, + "logits/rejected": -22649750.85714286, + "logps/chosen": -556.186279296875, + "logps/rejected": -665.0118582589286, + "loss": 0.0175, + "rewards/chosen": 8.099593353271484, + "rewards/margins": 20.019693974086216, + "rewards/rejected": -11.920100620814733, + "step": 2106 + }, + { + "epoch": 0.5774976017541456, + "grad_norm": 13.375, + "kl": 1.5390205383300781, + "learning_rate": 5e-06, + "logits/chosen": -34530001.45454545, + "logits/rejected": -24891788.307692308, + "logps/chosen": -468.5486505681818, + "logps/rejected": -395.0619365985577, + "loss": 0.019, + "rewards/chosen": 6.842016740278765, + "rewards/margins": 15.35609532069493, + "rewards/rejected": -8.514078580416166, + "step": 2107 + }, + { + "epoch": 0.5777716869946553, + "grad_norm": 2.5625, + "kl": 6.553595542907715, + "learning_rate": 5e-06, + "logits/chosen": -52999778.461538464, + "logits/rejected": 17780629.818181816, + "logps/chosen": -571.5456355168269, + "logps/rejected": -501.38210227272725, + "loss": 0.0083, + "rewards/chosen": 8.351004967322716, + "rewards/margins": 17.444307607370654, + "rewards/rejected": -9.09330264004794, + "step": 2108 + }, + { + "epoch": 0.5780457722351652, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20104435.692307692, + "logits/rejected": -49949789.09090909, + "logps/chosen": -457.77640474759613, + "logps/rejected": -535.1843039772727, + "loss": 0.0451, + "rewards/chosen": 6.852756206805889, + "rewards/margins": 20.835590069110577, + "rewards/rejected": -13.982833862304688, + "step": 2109 + }, + { + "epoch": 0.5783198574756749, + "grad_norm": 10.1875, + "kl": 0.7286924123764038, + "learning_rate": 5e-06, + "logits/chosen": -24990557.09090909, + "logits/rejected": -19241465.846153848, + "logps/chosen": -366.5929066051136, + "logps/rejected": -518.2817007211538, + "loss": 0.0321, + "rewards/chosen": 6.137416492808949, + "rewards/margins": 15.97176473124044, + "rewards/rejected": -9.83434823843149, + "step": 2110 + }, + { + "epoch": 0.5785939427161847, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1937323.6, + "logits/rejected": -38430454.85714286, + "logps/chosen": -330.4925048828125, + "logps/rejected": -556.7364327566964, + "loss": 0.0525, + "rewards/chosen": 6.4247795104980465, + "rewards/margins": 19.305113547188895, + "rewards/rejected": -12.880334036690849, + "step": 2111 + }, + { + "epoch": 0.5788680279566946, + "grad_norm": 0.6796875, + "kl": 0.20784315466880798, + "learning_rate": 5e-06, + "logits/chosen": -35404730.18181818, + "logits/rejected": -754229.5384615385, + "logps/chosen": -416.15571732954544, + "logps/rejected": -551.7974008413462, + "loss": 0.0014, + "rewards/chosen": 8.31671142578125, + "rewards/margins": 19.585764958308292, + "rewards/rejected": -11.269053532527042, + "step": 2112 + }, + { + "epoch": 0.5791421131972043, + "grad_norm": 7.0, + "kl": 11.207412719726562, + "learning_rate": 5e-06, + "logits/chosen": -23323861.333333332, + "logits/rejected": -32302269.333333332, + "logps/chosen": -578.2957356770834, + "logps/rejected": -455.6427815755208, + "loss": 0.0209, + "rewards/chosen": 8.872123718261719, + "rewards/margins": 18.69966379801432, + "rewards/rejected": -9.827540079752604, + "step": 2113 + }, + { + "epoch": 0.5794161984377141, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15404108.8, + "logits/rejected": -23770276.57142857, + "logps/chosen": -360.096826171875, + "logps/rejected": -498.4780970982143, + "loss": 0.0282, + "rewards/chosen": 6.392802810668945, + "rewards/margins": 18.987138094220843, + "rewards/rejected": -12.594335283551898, + "step": 2114 + }, + { + "epoch": 0.579690283678224, + "grad_norm": 6.8125, + "kl": 3.4968650341033936, + "learning_rate": 5e-06, + "logits/chosen": -20651965.866666667, + "logits/rejected": -18057134.222222224, + "logps/chosen": -452.4965494791667, + "logps/rejected": -531.30859375, + "loss": 0.0355, + "rewards/chosen": 8.154464721679688, + "rewards/margins": 18.46558295355903, + "rewards/rejected": -10.311118231879341, + "step": 2115 + }, + { + "epoch": 0.5799643689187337, + "grad_norm": 9.5625, + "kl": 3.404376983642578, + "learning_rate": 5e-06, + "logits/chosen": -35656276.36363637, + "logits/rejected": -50039950.76923077, + "logps/chosen": -425.41841264204544, + "logps/rejected": -462.19095552884613, + "loss": 0.1039, + "rewards/chosen": 6.326819679953835, + "rewards/margins": 18.366365766191816, + "rewards/rejected": -12.03954608623798, + "step": 2116 + }, + { + "epoch": 0.5802384541592436, + "grad_norm": 5.78125, + "kl": 10.574642181396484, + "learning_rate": 5e-06, + "logits/chosen": -13398089.6, + "logits/rejected": -20997525.333333332, + "logps/chosen": -354.26051432291666, + "logps/rejected": -651.7620442708334, + "loss": 0.0189, + "rewards/chosen": 7.993954976399739, + "rewards/margins": 21.975521511501736, + "rewards/rejected": -13.981566535101997, + "step": 2117 + }, + { + "epoch": 0.5805125393997533, + "grad_norm": 13.9375, + "kl": 7.084535121917725, + "learning_rate": 5e-06, + "logits/chosen": -20724531.2, + "logits/rejected": -26287982.222222224, + "logps/chosen": -432.46630859375, + "logps/rejected": -389.88783094618054, + "loss": 0.0494, + "rewards/chosen": 7.517392476399739, + "rewards/margins": 16.051957024468315, + "rewards/rejected": -8.534564548068577, + "step": 2118 + }, + { + "epoch": 0.5807866246402631, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61059481.6, + "logits/rejected": -19761173.89473684, + "logps/chosen": -608.561279296875, + "logps/rejected": -608.2375616776316, + "loss": 0.0062, + "rewards/chosen": 8.45541763305664, + "rewards/margins": 19.675864571019225, + "rewards/rejected": -11.220446937962583, + "step": 2119 + }, + { + "epoch": 0.581060709880773, + "grad_norm": 1.3515625, + "kl": 0.9387969970703125, + "learning_rate": 5e-06, + "logits/chosen": -23910544.0, + "logits/rejected": -21846635.42857143, + "logps/chosen": -526.762646484375, + "logps/rejected": -644.3900669642857, + "loss": 0.0054, + "rewards/chosen": 9.650862121582032, + "rewards/margins": 22.38133087158203, + "rewards/rejected": -12.73046875, + "step": 2120 + }, + { + "epoch": 0.5813347951212827, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40712740.571428575, + "logits/rejected": 15073232.0, + "logps/chosen": -320.27633231026783, + "logps/rejected": -505.7634765625, + "loss": 0.0387, + "rewards/chosen": 6.698146275111607, + "rewards/margins": 14.364833286830358, + "rewards/rejected": -7.66668701171875, + "step": 2121 + }, + { + "epoch": 0.5816088803617925, + "grad_norm": 8.875, + "kl": 0.3316332697868347, + "learning_rate": 5e-06, + "logits/chosen": 1974687.3333333333, + "logits/rejected": -13635832.0, + "logps/chosen": -317.0728759765625, + "logps/rejected": -713.017822265625, + "loss": 0.0475, + "rewards/chosen": 5.425852457682292, + "rewards/margins": 16.941635131835938, + "rewards/rejected": -11.515782674153646, + "step": 2122 + }, + { + "epoch": 0.5818829656023023, + "grad_norm": 8.1875, + "kl": 0.49021148681640625, + "learning_rate": 5e-06, + "logits/chosen": -8407385.6, + "logits/rejected": -32626346.666666668, + "logps/chosen": -456.38977864583336, + "logps/rejected": -486.6891818576389, + "loss": 0.0569, + "rewards/chosen": 6.524491373697916, + "rewards/margins": 17.892460462782118, + "rewards/rejected": -11.367969089084202, + "step": 2123 + }, + { + "epoch": 0.5821570508428121, + "grad_norm": 6.03125, + "kl": 0.06039460748434067, + "learning_rate": 5e-06, + "logits/chosen": -18291204.923076924, + "logits/rejected": -22242619.636363637, + "logps/chosen": -348.0985576923077, + "logps/rejected": -616.7020152698864, + "loss": 0.0436, + "rewards/chosen": 5.3938129131610575, + "rewards/margins": 17.275525393185916, + "rewards/rejected": -11.881712480024857, + "step": 2124 + }, + { + "epoch": 0.5824311360833219, + "grad_norm": 9.0, + "kl": 6.6717071533203125, + "learning_rate": 5e-06, + "logits/chosen": 2827789.846153846, + "logits/rejected": -42614016.0, + "logps/chosen": -376.6828801081731, + "logps/rejected": -490.75106534090907, + "loss": 0.0477, + "rewards/chosen": 5.8784966102013225, + "rewards/margins": 17.292611822381723, + "rewards/rejected": -11.414115212180398, + "step": 2125 + }, + { + "epoch": 0.5827052213238317, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28013516.8, + "logits/rejected": -20109261.714285713, + "logps/chosen": -481.6634765625, + "logps/rejected": -453.98263113839283, + "loss": 0.0197, + "rewards/chosen": 6.073551940917969, + "rewards/margins": 16.13782958984375, + "rewards/rejected": -10.064277648925781, + "step": 2126 + }, + { + "epoch": 0.5829793065643415, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14363436.307692308, + "logits/rejected": -11805894.545454545, + "logps/chosen": -357.7663762019231, + "logps/rejected": -576.7266068892045, + "loss": 0.0059, + "rewards/chosen": 7.161321786733774, + "rewards/margins": 17.08135810265174, + "rewards/rejected": -9.920036315917969, + "step": 2127 + }, + { + "epoch": 0.5832533918048514, + "grad_norm": 12.4375, + "kl": 3.049069881439209, + "learning_rate": 5e-06, + "logits/chosen": -26736617.14285714, + "logits/rejected": -18772052.8, + "logps/chosen": -388.40555245535717, + "logps/rejected": -412.874853515625, + "loss": 0.0717, + "rewards/chosen": 6.58173097882952, + "rewards/margins": 15.460637228829519, + "rewards/rejected": -8.87890625, + "step": 2128 + }, + { + "epoch": 0.5835274770453611, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2852836.0, + "logits/rejected": -13626715.0, + "logps/chosen": -357.8809509277344, + "logps/rejected": -507.8498229980469, + "loss": 0.0256, + "rewards/chosen": 7.232987403869629, + "rewards/margins": 17.812052726745605, + "rewards/rejected": -10.579065322875977, + "step": 2129 + }, + { + "epoch": 0.5838015622858709, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13883869.090909092, + "logits/rejected": 13281833.846153846, + "logps/chosen": -416.65944602272725, + "logps/rejected": -661.1028395432693, + "loss": 0.0154, + "rewards/chosen": 5.860104647549716, + "rewards/margins": 19.695092581368826, + "rewards/rejected": -13.83498793381911, + "step": 2130 + }, + { + "epoch": 0.5840756475263807, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13170779.636363637, + "logits/rejected": 22437666.46153846, + "logps/chosen": -447.8284357244318, + "logps/rejected": -630.0425931490385, + "loss": 0.0415, + "rewards/chosen": 6.424459284002131, + "rewards/margins": 23.09402903310069, + "rewards/rejected": -16.669569749098557, + "step": 2131 + }, + { + "epoch": 0.5843497327668905, + "grad_norm": 5.6875, + "kl": 2.955056667327881, + "learning_rate": 5e-06, + "logits/chosen": -12953050.181818182, + "logits/rejected": -388118.3076923077, + "logps/chosen": -360.4784490411932, + "logps/rejected": -534.7435021033654, + "loss": 0.0404, + "rewards/chosen": 5.165999325838956, + "rewards/margins": 15.280766547143042, + "rewards/rejected": -10.114767221304087, + "step": 2132 + }, + { + "epoch": 0.5846238180074003, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23227192.888888888, + "logits/rejected": -17811018.666666668, + "logps/chosen": -436.65142144097223, + "logps/rejected": -497.03167317708335, + "loss": 0.0089, + "rewards/chosen": 7.439449734157986, + "rewards/margins": 18.960767279730902, + "rewards/rejected": -11.521317545572916, + "step": 2133 + }, + { + "epoch": 0.5848979032479101, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59352081.45454545, + "logits/rejected": -38902749.538461536, + "logps/chosen": -474.32137784090907, + "logps/rejected": -509.9743088942308, + "loss": 0.0383, + "rewards/chosen": 5.766646645285866, + "rewards/margins": 16.63357826713082, + "rewards/rejected": -10.866931621844952, + "step": 2134 + }, + { + "epoch": 0.5851719884884199, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30578150.4, + "logits/rejected": -31764621.714285713, + "logps/chosen": -454.131982421875, + "logps/rejected": -655.3639090401786, + "loss": 0.0261, + "rewards/chosen": 7.082856750488281, + "rewards/margins": 17.900400870186942, + "rewards/rejected": -10.817544119698661, + "step": 2135 + }, + { + "epoch": 0.5854460737289297, + "grad_norm": 9.9375, + "kl": 9.792182922363281, + "learning_rate": 5e-06, + "logits/chosen": -29449686.4, + "logits/rejected": -20426358.85714286, + "logps/chosen": -421.122265625, + "logps/rejected": -512.9408133370536, + "loss": 0.0442, + "rewards/chosen": 6.762929534912109, + "rewards/margins": 16.719585854666573, + "rewards/rejected": -9.956656319754464, + "step": 2136 + }, + { + "epoch": 0.5857201589694395, + "grad_norm": 10.6875, + "kl": 9.099266052246094, + "learning_rate": 5e-06, + "logits/chosen": -39770240.0, + "logits/rejected": -29997649.454545453, + "logps/chosen": -460.45935997596155, + "logps/rejected": -513.3543146306819, + "loss": 0.0567, + "rewards/chosen": 7.289678720327524, + "rewards/margins": 18.36076178917518, + "rewards/rejected": -11.071083068847656, + "step": 2137 + }, + { + "epoch": 0.5859942442099493, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9557194.666666666, + "logits/rejected": -21107687.111111112, + "logps/chosen": -432.68483072916666, + "logps/rejected": -362.8634440104167, + "loss": 0.0993, + "rewards/chosen": 6.263851928710937, + "rewards/margins": 13.382362704806859, + "rewards/rejected": -7.1185107760959205, + "step": 2138 + }, + { + "epoch": 0.5862683294504591, + "grad_norm": 7.25, + "kl": 3.507750988006592, + "learning_rate": 5e-06, + "logits/chosen": -4016063.5, + "logits/rejected": -19777628.0, + "logps/chosen": -562.9644775390625, + "logps/rejected": -577.1492919921875, + "loss": 0.0155, + "rewards/chosen": 7.153438568115234, + "rewards/margins": 16.567160606384277, + "rewards/rejected": -9.413722038269043, + "step": 2139 + }, + { + "epoch": 0.5865424146909689, + "grad_norm": 5.625, + "kl": 0.3746757507324219, + "learning_rate": 5e-06, + "logits/chosen": -36201016.0, + "logits/rejected": -27067386.0, + "logps/chosen": -459.77313232421875, + "logps/rejected": -447.0, + "loss": 0.0324, + "rewards/chosen": 7.493675231933594, + "rewards/margins": 18.159661293029785, + "rewards/rejected": -10.665986061096191, + "step": 2140 + }, + { + "epoch": 0.5868164999314787, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23160781.333333332, + "logits/rejected": -46890698.666666664, + "logps/chosen": -341.0271809895833, + "logps/rejected": -540.1982828776041, + "loss": 0.0086, + "rewards/chosen": 5.989185333251953, + "rewards/margins": 16.2037296295166, + "rewards/rejected": -10.214544296264648, + "step": 2141 + }, + { + "epoch": 0.5870905851719885, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31258821.333333332, + "logits/rejected": -33125826.666666668, + "logps/chosen": -496.2452799479167, + "logps/rejected": -561.580078125, + "loss": 0.0967, + "rewards/chosen": 7.1285400390625, + "rewards/margins": 16.92383130391439, + "rewards/rejected": -9.795291264851889, + "step": 2142 + }, + { + "epoch": 0.5873646704124983, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34571792.0, + "logits/rejected": -17220934.0, + "logps/chosen": -383.933837890625, + "logps/rejected": -520.1976318359375, + "loss": 0.0241, + "rewards/chosen": 7.86857795715332, + "rewards/margins": 17.6112642288208, + "rewards/rejected": -9.74268627166748, + "step": 2143 + }, + { + "epoch": 0.587638755653008, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31935913.6, + "logits/rejected": -10201350.857142856, + "logps/chosen": -380.1107177734375, + "logps/rejected": -541.0489676339286, + "loss": 0.0115, + "rewards/chosen": 6.821420288085937, + "rewards/margins": 17.963831656319755, + "rewards/rejected": -11.142411368233818, + "step": 2144 + }, + { + "epoch": 0.5879128408935179, + "grad_norm": 16.625, + "kl": 15.671846389770508, + "learning_rate": 5e-06, + "logits/chosen": 2235385.0, + "logits/rejected": -29750584.0, + "logps/chosen": -465.69329833984375, + "logps/rejected": -365.5209045410156, + "loss": 0.0885, + "rewards/chosen": 6.900042533874512, + "rewards/margins": 13.135589599609375, + "rewards/rejected": -6.235547065734863, + "step": 2145 + }, + { + "epoch": 0.5881869261340277, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22104452.57142857, + "logits/rejected": -29875206.4, + "logps/chosen": -523.9329310825893, + "logps/rejected": -547.948388671875, + "loss": 0.0491, + "rewards/chosen": 6.992813655308315, + "rewards/margins": 19.851302119663785, + "rewards/rejected": -12.858488464355469, + "step": 2146 + }, + { + "epoch": 0.5884610113745374, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13912362.666666666, + "logits/rejected": -34413752.88888889, + "logps/chosen": -510.5093994140625, + "logps/rejected": -649.1004231770834, + "loss": 0.0165, + "rewards/chosen": 5.4101003011067705, + "rewards/margins": 17.144261678059895, + "rewards/rejected": -11.734161376953125, + "step": 2147 + }, + { + "epoch": 0.5887350966150473, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32375536.0, + "logits/rejected": -24367384.0, + "logps/chosen": -321.416015625, + "logps/rejected": -587.7035522460938, + "loss": 0.0106, + "rewards/chosen": 6.052656173706055, + "rewards/margins": 16.4710693359375, + "rewards/rejected": -10.418413162231445, + "step": 2148 + }, + { + "epoch": 0.5890091818555571, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26518007.111111112, + "logits/rejected": -17126122.666666668, + "logps/chosen": -333.8068576388889, + "logps/rejected": -685.3757161458333, + "loss": 0.0325, + "rewards/chosen": 5.637394799126519, + "rewards/margins": 18.999518415662976, + "rewards/rejected": -13.362123616536458, + "step": 2149 + }, + { + "epoch": 0.5892832670960669, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31661034.666666668, + "logits/rejected": -45120122.666666664, + "logps/chosen": -445.745361328125, + "logps/rejected": -453.5135498046875, + "loss": 0.013, + "rewards/chosen": 7.462151209513347, + "rewards/margins": 18.17829958597819, + "rewards/rejected": -10.716148376464844, + "step": 2150 + }, + { + "epoch": 0.5895573523365767, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25227408.0, + "logits/rejected": -24266072.0, + "logps/chosen": -432.31134033203125, + "logps/rejected": -560.2775268554688, + "loss": 0.0058, + "rewards/chosen": 8.427921295166016, + "rewards/margins": 20.040393829345703, + "rewards/rejected": -11.612472534179688, + "step": 2151 + }, + { + "epoch": 0.5898314375770864, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40920684.8, + "logits/rejected": -24149734.85714286, + "logps/chosen": -388.62744140625, + "logps/rejected": -580.7172502790179, + "loss": 0.0069, + "rewards/chosen": 6.943627166748047, + "rewards/margins": 18.010248129708426, + "rewards/rejected": -11.06662096296038, + "step": 2152 + }, + { + "epoch": 0.5901055228175963, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42653984.0, + "logits/rejected": -24061462.0, + "logps/chosen": -396.92230224609375, + "logps/rejected": -448.247314453125, + "loss": 0.0251, + "rewards/chosen": 5.708291530609131, + "rewards/margins": 14.90488576889038, + "rewards/rejected": -9.19659423828125, + "step": 2153 + }, + { + "epoch": 0.5903796080581061, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36611568.0, + "logits/rejected": -16978782.666666668, + "logps/chosen": -468.1298828125, + "logps/rejected": -559.4476318359375, + "loss": 0.0067, + "rewards/chosen": 6.507033030192058, + "rewards/margins": 17.14499855041504, + "rewards/rejected": -10.637965520222982, + "step": 2154 + }, + { + "epoch": 0.5906536932986158, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38929178.666666664, + "logits/rejected": -34675002.666666664, + "logps/chosen": -401.1503092447917, + "logps/rejected": -515.5962727864584, + "loss": 0.0384, + "rewards/chosen": 6.114525477091472, + "rewards/margins": 18.458396275838215, + "rewards/rejected": -12.343870798746744, + "step": 2155 + }, + { + "epoch": 0.5909277785391257, + "grad_norm": 3.703125, + "kl": 4.912854194641113, + "learning_rate": 5e-06, + "logits/chosen": -3963438.769230769, + "logits/rejected": -15395319.272727273, + "logps/chosen": -416.56460336538464, + "logps/rejected": -636.7313565340909, + "loss": 0.047, + "rewards/chosen": 7.316204951359675, + "rewards/margins": 22.009377059403, + "rewards/rejected": -14.693172108043324, + "step": 2156 + }, + { + "epoch": 0.5912018637796355, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5877941.333333333, + "logits/rejected": -77481500.44444445, + "logps/chosen": -371.11891276041666, + "logps/rejected": -595.7492404513889, + "loss": 0.0507, + "rewards/chosen": 4.714406331380208, + "rewards/margins": 16.93650224473741, + "rewards/rejected": -12.222095913357204, + "step": 2157 + }, + { + "epoch": 0.5914759490201452, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31910208.0, + "logits/rejected": -15788998.857142856, + "logps/chosen": -399.7058349609375, + "logps/rejected": -424.2987583705357, + "loss": 0.0098, + "rewards/chosen": 7.388449096679688, + "rewards/margins": 17.79518519810268, + "rewards/rejected": -10.406736101422991, + "step": 2158 + }, + { + "epoch": 0.5917500342606551, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 20241450.666666668, + "logits/rejected": -21708177.333333332, + "logps/chosen": -600.6414388020834, + "logps/rejected": -480.9698893229167, + "loss": 0.0208, + "rewards/chosen": 6.8651173909505205, + "rewards/margins": 17.84917704264323, + "rewards/rejected": -10.984059651692709, + "step": 2159 + }, + { + "epoch": 0.5920241195011648, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15893121.6, + "logits/rejected": -30321435.42857143, + "logps/chosen": -463.964599609375, + "logps/rejected": -662.9504743303571, + "loss": 0.0026, + "rewards/chosen": 7.244794464111328, + "rewards/margins": 19.78776866367885, + "rewards/rejected": -12.542974199567523, + "step": 2160 + }, + { + "epoch": 0.5922982047416746, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29574117.818181816, + "logits/rejected": -36193341.538461536, + "logps/chosen": -390.03391335227275, + "logps/rejected": -588.4260441706731, + "loss": 0.0266, + "rewards/chosen": 5.063650998202237, + "rewards/margins": 16.232042299283968, + "rewards/rejected": -11.16839130108173, + "step": 2161 + }, + { + "epoch": 0.5925722899821845, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69288535.27272727, + "logits/rejected": -31085550.769230768, + "logps/chosen": -459.8890269886364, + "logps/rejected": -442.49320162259613, + "loss": 0.0074, + "rewards/chosen": 6.653273148970171, + "rewards/margins": 17.302338606827742, + "rewards/rejected": -10.649065457857573, + "step": 2162 + }, + { + "epoch": 0.5928463752226942, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35577841.23076923, + "logits/rejected": -46537026.90909091, + "logps/chosen": -452.7204026442308, + "logps/rejected": -502.05886008522725, + "loss": 0.0214, + "rewards/chosen": 6.3166339580829325, + "rewards/margins": 18.194429090806654, + "rewards/rejected": -11.87779513272372, + "step": 2163 + }, + { + "epoch": 0.5931204604632041, + "grad_norm": 18.5, + "kl": 5.8484697341918945, + "learning_rate": 5e-06, + "logits/chosen": -26128170.666666668, + "logits/rejected": -43174304.0, + "logps/chosen": -406.64153645833335, + "logps/rejected": -555.2554253472222, + "loss": 0.0776, + "rewards/chosen": 5.993400065104167, + "rewards/margins": 18.86180826822917, + "rewards/rejected": -12.868408203125, + "step": 2164 + }, + { + "epoch": 0.5933945457037139, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20227963.2, + "logits/rejected": -41076982.85714286, + "logps/chosen": -482.5244140625, + "logps/rejected": -548.1082589285714, + "loss": 0.0048, + "rewards/chosen": 6.34664306640625, + "rewards/margins": 18.30279039655413, + "rewards/rejected": -11.95614733014788, + "step": 2165 + }, + { + "epoch": 0.5936686309442236, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11015589.818181818, + "logits/rejected": -14553710.76923077, + "logps/chosen": -456.61110617897725, + "logps/rejected": -456.3073542668269, + "loss": 0.0256, + "rewards/chosen": 4.694655678488991, + "rewards/margins": 17.382998433146444, + "rewards/rejected": -12.688342754657452, + "step": 2166 + }, + { + "epoch": 0.5939427161847335, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37773765.333333336, + "logits/rejected": -14648200.0, + "logps/chosen": -390.2578125, + "logps/rejected": -419.5199788411458, + "loss": 0.0207, + "rewards/chosen": 8.50216801961263, + "rewards/margins": 15.816275914510092, + "rewards/rejected": -7.314107894897461, + "step": 2167 + }, + { + "epoch": 0.5942168014252432, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30373757.09090909, + "logits/rejected": -32913331.692307692, + "logps/chosen": -341.3960626775568, + "logps/rejected": -404.111328125, + "loss": 0.008, + "rewards/chosen": 6.762606534090909, + "rewards/margins": 16.919912911795237, + "rewards/rejected": -10.157306377704327, + "step": 2168 + }, + { + "epoch": 0.594490886665753, + "grad_norm": 5.0625, + "kl": 5.508502006530762, + "learning_rate": 5e-06, + "logits/chosen": -26487502.222222224, + "logits/rejected": -33091136.0, + "logps/chosen": -372.16826714409723, + "logps/rejected": -531.287890625, + "loss": 0.0164, + "rewards/chosen": 6.465708838568793, + "rewards/margins": 17.90114991929796, + "rewards/rejected": -11.435441080729166, + "step": 2169 + }, + { + "epoch": 0.5947649719062629, + "grad_norm": 0.37890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49769584.0, + "logits/rejected": -39453385.14285714, + "logps/chosen": -526.72509765625, + "logps/rejected": -636.3349609375, + "loss": 0.0013, + "rewards/chosen": 8.010971832275391, + "rewards/margins": 21.133274296351843, + "rewards/rejected": -13.122302464076451, + "step": 2170 + }, + { + "epoch": 0.5950390571467726, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17944529.6, + "logits/rejected": -28860379.42857143, + "logps/chosen": -371.789208984375, + "logps/rejected": -696.0126953125, + "loss": 0.0473, + "rewards/chosen": 4.213309860229492, + "rewards/margins": 17.22279210771833, + "rewards/rejected": -13.009482247488839, + "step": 2171 + }, + { + "epoch": 0.5953131423872824, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11995141.6, + "logits/rejected": -16251163.42857143, + "logps/chosen": -472.012060546875, + "logps/rejected": -449.34769112723217, + "loss": 0.0054, + "rewards/chosen": 7.734413146972656, + "rewards/margins": 17.524215698242188, + "rewards/rejected": -9.789802551269531, + "step": 2172 + }, + { + "epoch": 0.5955872276277923, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45140362.666666664, + "logits/rejected": -41980154.666666664, + "logps/chosen": -315.3769938151042, + "logps/rejected": -655.1410725911459, + "loss": 0.0233, + "rewards/chosen": 5.762478510538737, + "rewards/margins": 18.820287704467773, + "rewards/rejected": -13.057809193929037, + "step": 2173 + }, + { + "epoch": 0.595861312868302, + "grad_norm": 9.75, + "kl": 6.16293478012085, + "learning_rate": 5e-06, + "logits/chosen": -19614102.0, + "logits/rejected": -61638940.0, + "logps/chosen": -410.6842956542969, + "logps/rejected": -565.7598266601562, + "loss": 0.0785, + "rewards/chosen": 6.03289794921875, + "rewards/margins": 20.493722915649414, + "rewards/rejected": -14.460824966430664, + "step": 2174 + }, + { + "epoch": 0.5961353981088119, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31477715.2, + "logits/rejected": -19473366.85714286, + "logps/chosen": -527.793359375, + "logps/rejected": -514.9218052455357, + "loss": 0.0169, + "rewards/chosen": 5.627285003662109, + "rewards/margins": 16.793113163539342, + "rewards/rejected": -11.165828159877233, + "step": 2175 + }, + { + "epoch": 0.5964094833493216, + "grad_norm": 11.8125, + "kl": 2.049309492111206, + "learning_rate": 5e-06, + "logits/chosen": -25835832.888888888, + "logits/rejected": -28199449.6, + "logps/chosen": -471.5774739583333, + "logps/rejected": -668.8044270833333, + "loss": 0.0285, + "rewards/chosen": 6.244370354546441, + "rewards/margins": 19.147822909884983, + "rewards/rejected": -12.903452555338541, + "step": 2176 + }, + { + "epoch": 0.5966835685898314, + "grad_norm": 11.5, + "kl": 5.008486270904541, + "learning_rate": 5e-06, + "logits/chosen": -8911688.533333333, + "logits/rejected": 50801731.55555555, + "logps/chosen": -427.08570963541666, + "logps/rejected": -487.0569118923611, + "loss": 0.0354, + "rewards/chosen": 6.230491638183594, + "rewards/margins": 19.46101972791884, + "rewards/rejected": -13.230528089735243, + "step": 2177 + }, + { + "epoch": 0.5969576538303413, + "grad_norm": 5.0, + "kl": 8.84598159790039, + "learning_rate": 5e-06, + "logits/chosen": -26518109.866666667, + "logits/rejected": -85343800.8888889, + "logps/chosen": -501.1429036458333, + "logps/rejected": -598.5286458333334, + "loss": 0.056, + "rewards/chosen": 6.596148681640625, + "rewards/margins": 21.661279975043403, + "rewards/rejected": -15.065131293402779, + "step": 2178 + }, + { + "epoch": 0.597231739070851, + "grad_norm": 10.5625, + "kl": 10.65564250946045, + "learning_rate": 5e-06, + "logits/chosen": -43579933.538461536, + "logits/rejected": -21101469.09090909, + "logps/chosen": -463.27013221153845, + "logps/rejected": -582.2319779829545, + "loss": 0.0384, + "rewards/chosen": 7.344271733210637, + "rewards/margins": 18.386445639016745, + "rewards/rejected": -11.042173905806107, + "step": 2179 + }, + { + "epoch": 0.5975058243113608, + "grad_norm": 7.75, + "kl": 9.445077896118164, + "learning_rate": 5e-06, + "logits/chosen": -26130950.4, + "logits/rejected": 22711854.222222224, + "logps/chosen": -447.4447916666667, + "logps/rejected": -640.5590277777778, + "loss": 0.0724, + "rewards/chosen": 7.231640116373698, + "rewards/margins": 18.150772772894964, + "rewards/rejected": -10.919132656521267, + "step": 2180 + }, + { + "epoch": 0.5977799095518707, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25212189.333333332, + "logits/rejected": -35348394.666666664, + "logps/chosen": -431.1717529296875, + "logps/rejected": -600.1024576822916, + "loss": 0.0058, + "rewards/chosen": 7.027581532796224, + "rewards/margins": 20.477624257405598, + "rewards/rejected": -13.450042724609375, + "step": 2181 + }, + { + "epoch": 0.5980539947923804, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25465553.454545453, + "logits/rejected": -29627505.230769232, + "logps/chosen": -422.99702592329544, + "logps/rejected": -721.3228665865385, + "loss": 0.017, + "rewards/chosen": 7.079569036310369, + "rewards/margins": 20.083618484176956, + "rewards/rejected": -13.004049447866587, + "step": 2182 + }, + { + "epoch": 0.5983280800328902, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13331138.666666666, + "logits/rejected": -25850978.666666668, + "logps/chosen": -325.160888671875, + "logps/rejected": -486.3913981119792, + "loss": 0.0501, + "rewards/chosen": 5.32306448618571, + "rewards/margins": 18.60388469696045, + "rewards/rejected": -13.28082021077474, + "step": 2183 + }, + { + "epoch": 0.5986021652734, + "grad_norm": 2.6875, + "kl": 0.8591588735580444, + "learning_rate": 5e-06, + "logits/chosen": -8470990.76923077, + "logits/rejected": -30197597.09090909, + "logps/chosen": -445.4480168269231, + "logps/rejected": -585.4016335227273, + "loss": 0.0043, + "rewards/chosen": 6.751483623798077, + "rewards/margins": 18.63209330952251, + "rewards/rejected": -11.880609685724432, + "step": 2184 + }, + { + "epoch": 0.5988762505139098, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44460435.2, + "logits/rejected": -18514937.14285714, + "logps/chosen": -436.4451171875, + "logps/rejected": -528.2497209821429, + "loss": 0.0793, + "rewards/chosen": 6.180952835083008, + "rewards/margins": 16.994935117449078, + "rewards/rejected": -10.813982282366071, + "step": 2185 + }, + { + "epoch": 0.5991503357544197, + "grad_norm": 9.25, + "kl": 4.789362907409668, + "learning_rate": 5e-06, + "logits/chosen": -25215620.923076924, + "logits/rejected": -10762240.0, + "logps/chosen": -430.8313551682692, + "logps/rejected": -572.1481267755681, + "loss": 0.0237, + "rewards/chosen": 7.460747352013221, + "rewards/margins": 17.490833522556546, + "rewards/rejected": -10.030086170543324, + "step": 2186 + }, + { + "epoch": 0.5994244209949294, + "grad_norm": 2.890625, + "kl": 0.9782218933105469, + "learning_rate": 5e-06, + "logits/chosen": -35283924.571428575, + "logits/rejected": -17479784.0, + "logps/chosen": -423.9850376674107, + "logps/rejected": -543.0978515625, + "loss": 0.007, + "rewards/chosen": 7.39567620413644, + "rewards/margins": 18.90359170096261, + "rewards/rejected": -11.507915496826172, + "step": 2187 + }, + { + "epoch": 0.5996985062354392, + "grad_norm": 5.9375, + "kl": 3.2888388633728027, + "learning_rate": 5e-06, + "logits/chosen": -32931453.09090909, + "logits/rejected": -25735313.230769232, + "logps/chosen": -430.23495205965907, + "logps/rejected": -574.4508713942307, + "loss": 0.0109, + "rewards/chosen": 7.320572592995384, + "rewards/margins": 19.197718213488173, + "rewards/rejected": -11.877145620492788, + "step": 2188 + }, + { + "epoch": 0.5999725914759491, + "grad_norm": 2.734375, + "kl": 1.8177287578582764, + "learning_rate": 5e-06, + "logits/chosen": -28131313.230769232, + "logits/rejected": -23194909.09090909, + "logps/chosen": -393.1641376201923, + "logps/rejected": -594.4780717329545, + "loss": 0.0273, + "rewards/chosen": 6.558665935809795, + "rewards/margins": 19.073691681548432, + "rewards/rejected": -12.515025745738637, + "step": 2189 + }, + { + "epoch": 0.6002466767164588, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10933206.857142856, + "logits/rejected": -29962352.0, + "logps/chosen": -511.3933803013393, + "logps/rejected": -644.8236328125, + "loss": 0.0182, + "rewards/chosen": 6.798444475446429, + "rewards/margins": 21.989621843610493, + "rewards/rejected": -15.191177368164062, + "step": 2190 + }, + { + "epoch": 0.6005207619569686, + "grad_norm": 0.49609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30437856.0, + "logits/rejected": -18674170.666666668, + "logps/chosen": -450.7813313802083, + "logps/rejected": -480.69829644097223, + "loss": 0.021, + "rewards/chosen": 6.3756459554036455, + "rewards/margins": 17.673858642578125, + "rewards/rejected": -11.298212687174479, + "step": 2191 + }, + { + "epoch": 0.6007948471974784, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13283822.933333334, + "logits/rejected": -29446282.666666668, + "logps/chosen": -372.037890625, + "logps/rejected": -457.6717122395833, + "loss": 0.0638, + "rewards/chosen": 5.344114176432291, + "rewards/margins": 14.7434087117513, + "rewards/rejected": -9.39929453531901, + "step": 2192 + }, + { + "epoch": 0.6010689324379882, + "grad_norm": 2.875, + "kl": 5.162571907043457, + "learning_rate": 5e-06, + "logits/chosen": -25485414.85714286, + "logits/rejected": -20776916.8, + "logps/chosen": -425.67494419642856, + "logps/rejected": -455.10771484375, + "loss": 0.0359, + "rewards/chosen": 7.00762939453125, + "rewards/margins": 18.27763214111328, + "rewards/rejected": -11.270002746582032, + "step": 2193 + }, + { + "epoch": 0.601343017678498, + "grad_norm": 5.53125, + "kl": 5.714724540710449, + "learning_rate": 5e-06, + "logits/chosen": -19761680.0, + "logits/rejected": -36270373.333333336, + "logps/chosen": -377.5932888454861, + "logps/rejected": -428.7765299479167, + "loss": 0.0568, + "rewards/chosen": 6.7635345458984375, + "rewards/margins": 18.87397003173828, + "rewards/rejected": -12.110435485839844, + "step": 2194 + }, + { + "epoch": 0.6016171029190078, + "grad_norm": 6.65625, + "kl": 1.1521995067596436, + "learning_rate": 5e-06, + "logits/chosen": -27860149.333333332, + "logits/rejected": -33711648.0, + "logps/chosen": -497.8890380859375, + "logps/rejected": -705.2659505208334, + "loss": 0.0215, + "rewards/chosen": 6.590413411458333, + "rewards/margins": 18.143877665201824, + "rewards/rejected": -11.55346425374349, + "step": 2195 + }, + { + "epoch": 0.6018911881595176, + "grad_norm": 8.375, + "kl": 5.956332206726074, + "learning_rate": 5e-06, + "logits/chosen": -28796224.0, + "logits/rejected": -12498891.2, + "logps/chosen": -402.6757114955357, + "logps/rejected": -654.1390625, + "loss": 0.0381, + "rewards/chosen": 6.262894766671317, + "rewards/margins": 19.256048148018973, + "rewards/rejected": -12.993153381347657, + "step": 2196 + }, + { + "epoch": 0.6021652734000275, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24450624.0, + "logits/rejected": -4004005.0, + "logps/chosen": -505.3919677734375, + "logps/rejected": -537.07373046875, + "loss": 0.0262, + "rewards/chosen": 8.312037467956543, + "rewards/margins": 18.254374504089355, + "rewards/rejected": -9.942337036132812, + "step": 2197 + }, + { + "epoch": 0.6024393586405372, + "grad_norm": 5.125, + "kl": 1.152191162109375, + "learning_rate": 5e-06, + "logits/chosen": -17002477.53846154, + "logits/rejected": -19150842.181818184, + "logps/chosen": -343.7086838942308, + "logps/rejected": -416.40633877840907, + "loss": 0.0206, + "rewards/chosen": 6.872990534855769, + "rewards/margins": 15.219000916380981, + "rewards/rejected": -8.346010381525213, + "step": 2198 + }, + { + "epoch": 0.602713443881047, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6885761.6, + "logits/rejected": -16070052.57142857, + "logps/chosen": -433.0412109375, + "logps/rejected": -547.5890066964286, + "loss": 0.0097, + "rewards/chosen": 7.574167633056641, + "rewards/margins": 19.15115476335798, + "rewards/rejected": -11.576987130301339, + "step": 2199 + }, + { + "epoch": 0.6029875291215568, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12802402.909090908, + "logits/rejected": -41504580.92307692, + "logps/chosen": -491.48970170454544, + "logps/rejected": -556.9084284855769, + "loss": 0.0235, + "rewards/chosen": 7.296288230202415, + "rewards/margins": 19.537225683252295, + "rewards/rejected": -12.24093745304988, + "step": 2200 + }, + { + "epoch": 0.6032616143620666, + "grad_norm": 5.5625, + "kl": 1.661350965499878, + "learning_rate": 5e-06, + "logits/chosen": -23022016.0, + "logits/rejected": -25138682.666666668, + "logps/chosen": -324.7130940755208, + "logps/rejected": -579.317626953125, + "loss": 0.0663, + "rewards/chosen": 7.504355112711589, + "rewards/margins": 17.708365122477215, + "rewards/rejected": -10.204010009765625, + "step": 2201 + }, + { + "epoch": 0.6035356996025764, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31673572.0, + "logits/rejected": -24489804.0, + "logps/chosen": -379.821044921875, + "logps/rejected": -467.80255126953125, + "loss": 0.0245, + "rewards/chosen": 5.106947898864746, + "rewards/margins": 14.952818870544434, + "rewards/rejected": -9.845870971679688, + "step": 2202 + }, + { + "epoch": 0.6038097848430862, + "grad_norm": 10.0625, + "kl": 3.2385292053222656, + "learning_rate": 5e-06, + "logits/chosen": -31444234.666666668, + "logits/rejected": 26736200.0, + "logps/chosen": -404.1475423177083, + "logps/rejected": -533.6051432291666, + "loss": 0.0465, + "rewards/chosen": 5.714305877685547, + "rewards/margins": 16.151487986246742, + "rewards/rejected": -10.437182108561197, + "step": 2203 + }, + { + "epoch": 0.604083870083596, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55170662.4, + "logits/rejected": -35821588.571428575, + "logps/chosen": -376.352294921875, + "logps/rejected": -613.7390485491071, + "loss": 0.0282, + "rewards/chosen": 6.302594757080078, + "rewards/margins": 16.748656027657645, + "rewards/rejected": -10.446061270577568, + "step": 2204 + }, + { + "epoch": 0.6043579553241057, + "grad_norm": 4.4375, + "kl": 8.374228477478027, + "learning_rate": 5e-06, + "logits/chosen": -32320134.4, + "logits/rejected": -28131756.0, + "logps/chosen": -400.551513671875, + "logps/rejected": -568.64111328125, + "loss": 0.0162, + "rewards/chosen": 6.81468505859375, + "rewards/margins": 16.114491653442382, + "rewards/rejected": -9.299806594848633, + "step": 2205 + }, + { + "epoch": 0.6046320405646156, + "grad_norm": 11.3125, + "kl": 16.873455047607422, + "learning_rate": 5e-06, + "logits/chosen": -20078905.6, + "logits/rejected": -41725315.55555555, + "logps/chosen": -364.32483723958336, + "logps/rejected": -527.2006293402778, + "loss": 0.167, + "rewards/chosen": 6.464699300130208, + "rewards/margins": 16.78651394314236, + "rewards/rejected": -10.321814643012154, + "step": 2206 + }, + { + "epoch": 0.6049061258051254, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33610870.15384615, + "logits/rejected": -19031214.545454547, + "logps/chosen": -458.69779146634613, + "logps/rejected": -454.1975763494318, + "loss": 0.0253, + "rewards/chosen": 6.2027411827674275, + "rewards/margins": 16.43494666039527, + "rewards/rejected": -10.232205477627842, + "step": 2207 + }, + { + "epoch": 0.6051802110456352, + "grad_norm": 9.125, + "kl": 2.977275848388672, + "learning_rate": 5e-06, + "logits/chosen": -23771072.0, + "logits/rejected": -28571829.333333332, + "logps/chosen": -479.2791748046875, + "logps/rejected": -494.4981689453125, + "loss": 0.0361, + "rewards/chosen": 7.596284866333008, + "rewards/margins": 17.217138290405273, + "rewards/rejected": -9.620853424072266, + "step": 2208 + }, + { + "epoch": 0.605454296286145, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37170116.0, + "logits/rejected": 7328840.0, + "logps/chosen": -398.0444641113281, + "logps/rejected": -639.2219848632812, + "loss": 0.0611, + "rewards/chosen": 7.310476779937744, + "rewards/margins": 19.898430347442627, + "rewards/rejected": -12.587953567504883, + "step": 2209 + }, + { + "epoch": 0.6057283815266548, + "grad_norm": 11.0, + "kl": 3.0522563457489014, + "learning_rate": 5e-06, + "logits/chosen": -12626013.6, + "logits/rejected": -6196970.285714285, + "logps/chosen": -405.0981689453125, + "logps/rejected": -742.1803850446429, + "loss": 0.0212, + "rewards/chosen": 7.93250732421875, + "rewards/margins": 19.78261043003627, + "rewards/rejected": -11.850103105817523, + "step": 2210 + }, + { + "epoch": 0.6060024667671646, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5699824.888888889, + "logits/rejected": -19937454.933333334, + "logps/chosen": -317.35557725694446, + "logps/rejected": -490.7828125, + "loss": 0.084, + "rewards/chosen": 5.15890375773112, + "rewards/margins": 16.366183217366537, + "rewards/rejected": -11.207279459635417, + "step": 2211 + }, + { + "epoch": 0.6062765520076744, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32469704.727272727, + "logits/rejected": -36530119.384615384, + "logps/chosen": -349.08571555397725, + "logps/rejected": -551.3809720552885, + "loss": 0.053, + "rewards/chosen": 4.857719421386719, + "rewards/margins": 17.705928509051983, + "rewards/rejected": -12.848209087665264, + "step": 2212 + }, + { + "epoch": 0.6065506372481841, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30222850.90909091, + "logits/rejected": -8093676.923076923, + "logps/chosen": -328.38345614346593, + "logps/rejected": -526.7566481370193, + "loss": 0.0173, + "rewards/chosen": 6.276912342418324, + "rewards/margins": 17.7938988959039, + "rewards/rejected": -11.516986553485577, + "step": 2213 + }, + { + "epoch": 0.606824722488694, + "grad_norm": 2.5, + "kl": 1.378225326538086, + "learning_rate": 5e-06, + "logits/chosen": -21569822.0, + "logits/rejected": -40466236.0, + "logps/chosen": -447.0928955078125, + "logps/rejected": -594.4880981445312, + "loss": 0.0068, + "rewards/chosen": 7.886535167694092, + "rewards/margins": 21.771926403045654, + "rewards/rejected": -13.885391235351562, + "step": 2214 + }, + { + "epoch": 0.6070988077292038, + "grad_norm": 4.40625, + "kl": 6.561585903167725, + "learning_rate": 5e-06, + "logits/chosen": -33439818.666666668, + "logits/rejected": -29446546.666666668, + "logps/chosen": -527.9283854166666, + "logps/rejected": -524.9613850911459, + "loss": 0.0126, + "rewards/chosen": 8.343863169352213, + "rewards/margins": 20.65662892659505, + "rewards/rejected": -12.312765757242838, + "step": 2215 + }, + { + "epoch": 0.6073728929697135, + "grad_norm": 9.375, + "kl": 6.46578311920166, + "learning_rate": 5e-06, + "logits/chosen": -26986016.0, + "logits/rejected": -55840976.0, + "logps/chosen": -389.3118591308594, + "logps/rejected": -443.64239501953125, + "loss": 0.037, + "rewards/chosen": 7.304097652435303, + "rewards/margins": 15.58944845199585, + "rewards/rejected": -8.285350799560547, + "step": 2216 + }, + { + "epoch": 0.6076469782102234, + "grad_norm": 9.9375, + "kl": 1.1343930959701538, + "learning_rate": 5e-06, + "logits/chosen": -3047837.8666666667, + "logits/rejected": -32393998.222222224, + "logps/chosen": -444.96751302083334, + "logps/rejected": -474.8603515625, + "loss": 0.0641, + "rewards/chosen": 6.009226989746094, + "rewards/margins": 15.990991889105903, + "rewards/rejected": -9.98176489935981, + "step": 2217 + }, + { + "epoch": 0.6079210634507332, + "grad_norm": 10.5625, + "kl": 10.529559135437012, + "learning_rate": 5e-06, + "logits/chosen": -36874252.8, + "logits/rejected": -7347346.666666667, + "logps/chosen": -421.5343424479167, + "logps/rejected": -597.5794813368055, + "loss": 0.0565, + "rewards/chosen": 7.704297383626302, + "rewards/margins": 20.909359402126736, + "rewards/rejected": -13.205062018500435, + "step": 2218 + }, + { + "epoch": 0.608195148691243, + "grad_norm": 1.8046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22132467.2, + "logits/rejected": 28570875.42857143, + "logps/chosen": -415.97646484375, + "logps/rejected": -556.2882952008929, + "loss": 0.0057, + "rewards/chosen": 6.372903823852539, + "rewards/margins": 15.694548198154994, + "rewards/rejected": -9.321644374302455, + "step": 2219 + }, + { + "epoch": 0.6084692339317528, + "grad_norm": 7.0, + "kl": 2.639923095703125, + "learning_rate": 5e-06, + "logits/chosen": 20789735.384615384, + "logits/rejected": 101690833.45454545, + "logps/chosen": -408.6981670673077, + "logps/rejected": -493.8663884943182, + "loss": 0.0396, + "rewards/chosen": 5.673254159780649, + "rewards/margins": 16.606948479072198, + "rewards/rejected": -10.933694319291549, + "step": 2220 + }, + { + "epoch": 0.6087433191722625, + "grad_norm": 9.0625, + "kl": 12.60730266571045, + "learning_rate": 5e-06, + "logits/chosen": -27130236.444444444, + "logits/rejected": -33253416.533333335, + "logps/chosen": -457.4415690104167, + "logps/rejected": -550.8982421875, + "loss": 0.0284, + "rewards/chosen": 8.146057976616753, + "rewards/margins": 18.544522942437066, + "rewards/rejected": -10.398464965820313, + "step": 2221 + }, + { + "epoch": 0.6090174044127724, + "grad_norm": 4.875, + "kl": 0.00672785472124815, + "learning_rate": 5e-06, + "logits/chosen": -5939803.6, + "logits/rejected": -44286157.71428572, + "logps/chosen": -415.200390625, + "logps/rejected": -508.80726841517856, + "loss": 0.0321, + "rewards/chosen": 5.707733154296875, + "rewards/margins": 15.876656668526786, + "rewards/rejected": -10.168923514229911, + "step": 2222 + }, + { + "epoch": 0.6092914896532822, + "grad_norm": 11.875, + "kl": 0.9351577758789062, + "learning_rate": 5e-06, + "logits/chosen": -22385014.0, + "logits/rejected": 9310759.0, + "logps/chosen": -460.2674255371094, + "logps/rejected": -606.7841186523438, + "loss": 0.0682, + "rewards/chosen": 6.377255439758301, + "rewards/margins": 21.05961322784424, + "rewards/rejected": -14.682357788085938, + "step": 2223 + }, + { + "epoch": 0.6095655748937919, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48796712.72727273, + "logits/rejected": -28113659.076923076, + "logps/chosen": -352.8942205255682, + "logps/rejected": -523.6200796274038, + "loss": 0.0396, + "rewards/chosen": 6.838109796697443, + "rewards/margins": 16.714420878803814, + "rewards/rejected": -9.87631108210637, + "step": 2224 + }, + { + "epoch": 0.6098396601343018, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23051000.0, + "logits/rejected": -35185312.0, + "logps/chosen": -358.2611083984375, + "logps/rejected": -525.485107421875, + "loss": 0.0476, + "rewards/chosen": 5.963507970174153, + "rewards/margins": 16.35377566019694, + "rewards/rejected": -10.390267690022787, + "step": 2225 + }, + { + "epoch": 0.6101137453748116, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32422044.8, + "logits/rejected": -37127314.28571428, + "logps/chosen": -450.56953125, + "logps/rejected": -493.89306640625, + "loss": 0.0954, + "rewards/chosen": 7.637179565429688, + "rewards/margins": 16.811651829310826, + "rewards/rejected": -9.174472263881139, + "step": 2226 + }, + { + "epoch": 0.6103878306153213, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19627138.285714287, + "logits/rejected": -26551916.8, + "logps/chosen": -494.6511928013393, + "logps/rejected": -583.94716796875, + "loss": 0.0051, + "rewards/chosen": 6.930153982979911, + "rewards/margins": 20.396908133370534, + "rewards/rejected": -13.466754150390624, + "step": 2227 + }, + { + "epoch": 0.6106619158558312, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1109473.6363636365, + "logits/rejected": -2094990.1538461538, + "logps/chosen": -425.3722478693182, + "logps/rejected": -763.1319861778846, + "loss": 0.0056, + "rewards/chosen": 7.4795448996803975, + "rewards/margins": 25.27896459619482, + "rewards/rejected": -17.799419696514423, + "step": 2228 + }, + { + "epoch": 0.6109360010963409, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31556036.923076924, + "logits/rejected": -20873771.636363637, + "logps/chosen": -413.36583533653845, + "logps/rejected": -456.50772372159093, + "loss": 0.032, + "rewards/chosen": 6.902976989746094, + "rewards/margins": 17.437883550470524, + "rewards/rejected": -10.534906560724432, + "step": 2229 + }, + { + "epoch": 0.6112100863368508, + "grad_norm": 2.796875, + "kl": 2.9898574352264404, + "learning_rate": 5e-06, + "logits/chosen": -37660104.53333333, + "logits/rejected": -45785749.333333336, + "logps/chosen": -406.4861328125, + "logps/rejected": -639.0295138888889, + "loss": 0.0082, + "rewards/chosen": 7.177137247721354, + "rewards/margins": 21.349217393663196, + "rewards/rejected": -14.172080145941841, + "step": 2230 + }, + { + "epoch": 0.6114841715773606, + "grad_norm": 12.5625, + "kl": 7.213981628417969, + "learning_rate": 5e-06, + "logits/chosen": -9138173.0, + "logits/rejected": -16767059.0, + "logps/chosen": -434.93841552734375, + "logps/rejected": -543.642578125, + "loss": 0.0535, + "rewards/chosen": 7.151169776916504, + "rewards/margins": 17.64907741546631, + "rewards/rejected": -10.497907638549805, + "step": 2231 + }, + { + "epoch": 0.6117582568178703, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10112381.866666667, + "logits/rejected": -35595889.777777776, + "logps/chosen": -415.6391276041667, + "logps/rejected": -662.8716362847222, + "loss": 0.0151, + "rewards/chosen": 6.0669504801432295, + "rewards/margins": 17.233535766601562, + "rewards/rejected": -11.166585286458334, + "step": 2232 + }, + { + "epoch": 0.6120323420583802, + "grad_norm": 6.875, + "kl": 1.7060697078704834, + "learning_rate": 5e-06, + "logits/chosen": -18183333.333333332, + "logits/rejected": 13633440.0, + "logps/chosen": -381.26996527777777, + "logps/rejected": -484.55771484375, + "loss": 0.0436, + "rewards/chosen": 6.63770506117079, + "rewards/margins": 18.808417341444226, + "rewards/rejected": -12.170712280273438, + "step": 2233 + }, + { + "epoch": 0.61230642729889, + "grad_norm": 0.625, + "kl": 0.4904734492301941, + "learning_rate": 5e-06, + "logits/chosen": -52908445.538461536, + "logits/rejected": -30421937.454545453, + "logps/chosen": -457.30615234375, + "logps/rejected": -414.54811789772725, + "loss": 0.0016, + "rewards/chosen": 7.998025160569411, + "rewards/margins": 20.36848081575407, + "rewards/rejected": -12.370455655184658, + "step": 2234 + }, + { + "epoch": 0.6125805125393997, + "grad_norm": 2.953125, + "kl": 2.3960700035095215, + "learning_rate": 5e-06, + "logits/chosen": 14248686.545454545, + "logits/rejected": -25378050.46153846, + "logps/chosen": -441.77565696022725, + "logps/rejected": -514.7870342548077, + "loss": 0.0103, + "rewards/chosen": 5.812588778409091, + "rewards/margins": 18.3911551095389, + "rewards/rejected": -12.578566331129808, + "step": 2235 + }, + { + "epoch": 0.6128545977799096, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15912428.8, + "logits/rejected": -49016845.71428572, + "logps/chosen": -389.178466796875, + "logps/rejected": -563.1731305803571, + "loss": 0.0573, + "rewards/chosen": 5.948181533813477, + "rewards/margins": 16.135121318272184, + "rewards/rejected": -10.186939784458705, + "step": 2236 + }, + { + "epoch": 0.6131286830204193, + "grad_norm": 6.875, + "kl": 0.5161794424057007, + "learning_rate": 5e-06, + "logits/chosen": -33270985.846153848, + "logits/rejected": -27296357.818181816, + "logps/chosen": -407.37992037259613, + "logps/rejected": -490.0179332386364, + "loss": 0.0718, + "rewards/chosen": 6.7048163780799275, + "rewards/margins": 14.38975567584271, + "rewards/rejected": -7.684939297762784, + "step": 2237 + }, + { + "epoch": 0.6134027682609291, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19058714.285714287, + "logits/rejected": -36890764.8, + "logps/chosen": -305.9133998325893, + "logps/rejected": -631.20546875, + "loss": 0.0091, + "rewards/chosen": 6.091927664620536, + "rewards/margins": 17.792318289620535, + "rewards/rejected": -11.700390625, + "step": 2238 + }, + { + "epoch": 0.613676853501439, + "grad_norm": 8.0, + "kl": 2.360995054244995, + "learning_rate": 5e-06, + "logits/chosen": -30128535.272727273, + "logits/rejected": -38909828.92307692, + "logps/chosen": -401.47265625, + "logps/rejected": -485.85096153846155, + "loss": 0.0631, + "rewards/chosen": 6.66457089510831, + "rewards/margins": 15.39655378648451, + "rewards/rejected": -8.731982891376202, + "step": 2239 + }, + { + "epoch": 0.6139509387419487, + "grad_norm": 6.5, + "kl": 10.810160636901855, + "learning_rate": 5e-06, + "logits/chosen": -31649990.85714286, + "logits/rejected": -55283372.8, + "logps/chosen": -601.8902064732143, + "logps/rejected": -470.69775390625, + "loss": 0.0284, + "rewards/chosen": 7.123325892857143, + "rewards/margins": 16.49766137259347, + "rewards/rejected": -9.374335479736327, + "step": 2240 + }, + { + "epoch": 0.6142250239824586, + "grad_norm": 8.6875, + "kl": 6.74765682220459, + "learning_rate": 5e-06, + "logits/chosen": -49009142.15384615, + "logits/rejected": -34234007.27272727, + "logps/chosen": -433.27599158653845, + "logps/rejected": -519.7243430397727, + "loss": 0.0536, + "rewards/chosen": 6.441916245680589, + "rewards/margins": 16.299938068523275, + "rewards/rejected": -9.858021822842685, + "step": 2241 + }, + { + "epoch": 0.6144991092229684, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28022919.384615384, + "logits/rejected": -24360661.818181816, + "logps/chosen": -408.11031400240387, + "logps/rejected": -642.1769353693181, + "loss": 0.0193, + "rewards/chosen": 6.162169236403245, + "rewards/margins": 17.614832311243443, + "rewards/rejected": -11.4526630748402, + "step": 2242 + }, + { + "epoch": 0.6147731944634781, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18307364.0, + "logits/rejected": -36041344.0, + "logps/chosen": -454.84222412109375, + "logps/rejected": -645.6376953125, + "loss": 0.0084, + "rewards/chosen": 6.382572650909424, + "rewards/margins": 18.34472131729126, + "rewards/rejected": -11.962148666381836, + "step": 2243 + }, + { + "epoch": 0.615047279703988, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23437768.888888888, + "logits/rejected": -36871859.2, + "logps/chosen": -511.52338324652777, + "logps/rejected": -480.7215169270833, + "loss": 0.0101, + "rewards/chosen": 8.913897196451822, + "rewards/margins": 16.54248809814453, + "rewards/rejected": -7.628590901692708, + "step": 2244 + }, + { + "epoch": 0.6153213649444977, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20416142.769230768, + "logits/rejected": -2545280.0, + "logps/chosen": -411.72607421875, + "logps/rejected": -608.9821555397727, + "loss": 0.0111, + "rewards/chosen": 6.372613173264724, + "rewards/margins": 18.506321780331486, + "rewards/rejected": -12.133708607066762, + "step": 2245 + }, + { + "epoch": 0.6155954501850075, + "grad_norm": 5.53125, + "kl": 4.147731781005859, + "learning_rate": 5e-06, + "logits/chosen": -51524772.571428575, + "logits/rejected": -27026579.2, + "logps/chosen": -437.31312779017856, + "logps/rejected": -548.12275390625, + "loss": 0.0432, + "rewards/chosen": 7.031842912946429, + "rewards/margins": 17.223289598737445, + "rewards/rejected": -10.191446685791016, + "step": 2246 + }, + { + "epoch": 0.6158695354255174, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19896105.14285714, + "logits/rejected": -23128293.647058822, + "logps/chosen": -318.18722098214283, + "logps/rejected": -503.6773897058824, + "loss": 0.0594, + "rewards/chosen": 4.902975899832589, + "rewards/margins": 14.449802430737922, + "rewards/rejected": -9.546826530905332, + "step": 2247 + }, + { + "epoch": 0.6161436206660271, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15960664.727272727, + "logits/rejected": -31329907.692307692, + "logps/chosen": -560.5245028409091, + "logps/rejected": -517.140625, + "loss": 0.0055, + "rewards/chosen": 7.164037531072443, + "rewards/margins": 16.682721918279476, + "rewards/rejected": -9.518684387207031, + "step": 2248 + }, + { + "epoch": 0.6164177059065369, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13313173.333333334, + "logits/rejected": -31268636.444444444, + "logps/chosen": -416.2941487630208, + "logps/rejected": -374.9311794704861, + "loss": 0.0347, + "rewards/chosen": 9.879608154296875, + "rewards/margins": 17.607304043240017, + "rewards/rejected": -7.727695888943142, + "step": 2249 + }, + { + "epoch": 0.6166917911470468, + "grad_norm": 12.25, + "kl": 5.08280086517334, + "learning_rate": 5e-06, + "logits/chosen": -19477312.0, + "logits/rejected": -18892861.09090909, + "logps/chosen": -391.3108097956731, + "logps/rejected": -699.3249733664773, + "loss": 0.0505, + "rewards/chosen": 6.236360990084135, + "rewards/margins": 20.905282720819223, + "rewards/rejected": -14.668921730735086, + "step": 2250 + }, + { + "epoch": 0.6169658763875565, + "grad_norm": 8.0, + "kl": 0.6101049184799194, + "learning_rate": 5e-06, + "logits/chosen": -30530242.666666668, + "logits/rejected": -15835861.333333334, + "logps/chosen": -413.7733154296875, + "logps/rejected": -377.9165852864583, + "loss": 0.0352, + "rewards/chosen": 6.229886372884114, + "rewards/margins": 14.519634246826172, + "rewards/rejected": -8.289747873942057, + "step": 2251 + }, + { + "epoch": 0.6172399616280664, + "grad_norm": 9.75, + "kl": 0.8595353960990906, + "learning_rate": 5e-06, + "logits/chosen": -35914132.571428575, + "logits/rejected": -43363289.6, + "logps/chosen": -425.61178152901783, + "logps/rejected": -641.447900390625, + "loss": 0.0439, + "rewards/chosen": 7.12231935773577, + "rewards/margins": 20.492755453927177, + "rewards/rejected": -13.370436096191407, + "step": 2252 + }, + { + "epoch": 0.6175140468685761, + "grad_norm": 7.34375, + "kl": 2.339046001434326, + "learning_rate": 5e-06, + "logits/chosen": -30420754.666666668, + "logits/rejected": -31623290.666666668, + "logps/chosen": -444.8187662760417, + "logps/rejected": -636.5015869140625, + "loss": 0.0336, + "rewards/chosen": 7.250525156656901, + "rewards/margins": 17.609773635864258, + "rewards/rejected": -10.359248479207357, + "step": 2253 + }, + { + "epoch": 0.6177881321090859, + "grad_norm": 10.0625, + "kl": 10.378274917602539, + "learning_rate": 5e-06, + "logits/chosen": -25889460.57142857, + "logits/rejected": -8796839.2, + "logps/chosen": -517.6070731026786, + "logps/rejected": -658.84580078125, + "loss": 0.0351, + "rewards/chosen": 7.0812481471470425, + "rewards/margins": 17.963485390799388, + "rewards/rejected": -10.882237243652344, + "step": 2254 + }, + { + "epoch": 0.6180622173495958, + "grad_norm": 14.5625, + "kl": 3.533064603805542, + "learning_rate": 5e-06, + "logits/chosen": -20515974.0, + "logits/rejected": -30520112.0, + "logps/chosen": -347.19403076171875, + "logps/rejected": -715.058837890625, + "loss": 0.0674, + "rewards/chosen": 6.336322784423828, + "rewards/margins": 20.053903579711914, + "rewards/rejected": -13.717580795288086, + "step": 2255 + }, + { + "epoch": 0.6183363025901055, + "grad_norm": 4.0, + "kl": 3.4687678813934326, + "learning_rate": 5e-06, + "logits/chosen": -45791689.84615385, + "logits/rejected": -19243872.0, + "logps/chosen": -432.42296424278845, + "logps/rejected": -650.6376953125, + "loss": 0.0108, + "rewards/chosen": 8.064470731295073, + "rewards/margins": 22.8517326941857, + "rewards/rejected": -14.787261962890625, + "step": 2256 + }, + { + "epoch": 0.6186103878306153, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28727532.8, + "logits/rejected": -23845440.0, + "logps/chosen": -399.797607421875, + "logps/rejected": -488.7725306919643, + "loss": 0.0394, + "rewards/chosen": 5.358429718017578, + "rewards/margins": 14.265192522321428, + "rewards/rejected": -8.90676280430385, + "step": 2257 + }, + { + "epoch": 0.6188844730711252, + "grad_norm": 6.34375, + "kl": 6.4546918869018555, + "learning_rate": 5e-06, + "logits/chosen": -24765212.8, + "logits/rejected": -21167282.285714287, + "logps/chosen": -369.341845703125, + "logps/rejected": -551.2506277901786, + "loss": 0.044, + "rewards/chosen": 6.696479034423828, + "rewards/margins": 17.978526851109095, + "rewards/rejected": -11.282047816685267, + "step": 2258 + }, + { + "epoch": 0.6191585583116349, + "grad_norm": 4.34375, + "kl": 5.331413269042969, + "learning_rate": 5e-06, + "logits/chosen": -21245874.90909091, + "logits/rejected": -21189026.46153846, + "logps/chosen": -553.6702325994319, + "logps/rejected": -514.5766977163462, + "loss": 0.0216, + "rewards/chosen": 7.913204539905895, + "rewards/margins": 17.510519227781494, + "rewards/rejected": -9.5973146878756, + "step": 2259 + }, + { + "epoch": 0.6194326435521447, + "grad_norm": 8.3125, + "kl": 2.3706374168395996, + "learning_rate": 5e-06, + "logits/chosen": -32771608.0, + "logits/rejected": -8251354.666666667, + "logps/chosen": -401.4986165364583, + "logps/rejected": -555.4042561848959, + "loss": 0.0417, + "rewards/chosen": 7.855735778808594, + "rewards/margins": 16.887304306030273, + "rewards/rejected": -9.03156852722168, + "step": 2260 + }, + { + "epoch": 0.6197067287926545, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50648473.6, + "logits/rejected": -36129106.28571428, + "logps/chosen": -442.24443359375, + "logps/rejected": -524.4289202008929, + "loss": 0.0096, + "rewards/chosen": 5.793862152099609, + "rewards/margins": 15.535272543770926, + "rewards/rejected": -9.741410391671318, + "step": 2261 + }, + { + "epoch": 0.6199808140331643, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13992154.181818182, + "logits/rejected": -30809858.46153846, + "logps/chosen": -533.6478604403409, + "logps/rejected": -465.76900540865387, + "loss": 0.0338, + "rewards/chosen": 6.862068176269531, + "rewards/margins": 16.451420123760517, + "rewards/rejected": -9.589351947490986, + "step": 2262 + }, + { + "epoch": 0.6202548992736742, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24092411.076923076, + "logits/rejected": -15696613.818181818, + "logps/chosen": -384.38269981971155, + "logps/rejected": -408.2080078125, + "loss": 0.0519, + "rewards/chosen": 6.16513178898738, + "rewards/margins": 14.78270492020187, + "rewards/rejected": -8.617573131214488, + "step": 2263 + }, + { + "epoch": 0.6205289845141839, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -706937.7777777778, + "logits/rejected": -16197093.333333334, + "logps/chosen": -411.6455349392361, + "logps/rejected": -556.9069010416666, + "loss": 0.0133, + "rewards/chosen": 5.864325629340278, + "rewards/margins": 17.728632269965278, + "rewards/rejected": -11.864306640625, + "step": 2264 + }, + { + "epoch": 0.6208030697546937, + "grad_norm": 8.875, + "kl": 0.21106529235839844, + "learning_rate": 5e-06, + "logits/chosen": -9195838.857142856, + "logits/rejected": -33793667.2, + "logps/chosen": -377.0289829799107, + "logps/rejected": -459.35625, + "loss": 0.0377, + "rewards/chosen": 7.010033743722098, + "rewards/margins": 17.784449332101005, + "rewards/rejected": -10.774415588378906, + "step": 2265 + }, + { + "epoch": 0.6210771549952036, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26176996.923076924, + "logits/rejected": -37068238.54545455, + "logps/chosen": -388.7800480769231, + "logps/rejected": -442.1077325994318, + "loss": 0.0112, + "rewards/chosen": 6.57044924222506, + "rewards/margins": 14.694805598759153, + "rewards/rejected": -8.124356356534092, + "step": 2266 + }, + { + "epoch": 0.6213512402357133, + "grad_norm": 9.1875, + "kl": 5.470054626464844, + "learning_rate": 5e-06, + "logits/chosen": -17624567.272727273, + "logits/rejected": -32765065.846153848, + "logps/chosen": -380.1687677556818, + "logps/rejected": -490.2918043870192, + "loss": 0.0284, + "rewards/chosen": 7.290783968838778, + "rewards/margins": 16.378360614909994, + "rewards/rejected": -9.087576646071215, + "step": 2267 + }, + { + "epoch": 0.6216253254762231, + "grad_norm": 1.96875, + "kl": 8.556510925292969, + "learning_rate": 5e-06, + "logits/chosen": -28251669.333333332, + "logits/rejected": -14326384.0, + "logps/chosen": -490.3694254557292, + "logps/rejected": -701.0380859375, + "loss": 0.007, + "rewards/chosen": 8.061495463053385, + "rewards/margins": 18.824952443440754, + "rewards/rejected": -10.76345698038737, + "step": 2268 + }, + { + "epoch": 0.6218994107167329, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10465737.6, + "logits/rejected": -26639102.222222224, + "logps/chosen": -327.58160807291665, + "logps/rejected": -612.82177734375, + "loss": 0.0747, + "rewards/chosen": 5.00411376953125, + "rewards/margins": 16.85748867458767, + "rewards/rejected": -11.853374905056423, + "step": 2269 + }, + { + "epoch": 0.6221734959572427, + "grad_norm": 10.5625, + "kl": 9.041720390319824, + "learning_rate": 5e-06, + "logits/chosen": -32585597.866666667, + "logits/rejected": -23220049.777777776, + "logps/chosen": -429.07737630208334, + "logps/rejected": -485.07530381944446, + "loss": 0.077, + "rewards/chosen": 6.9366709391276045, + "rewards/margins": 15.97585890028212, + "rewards/rejected": -9.039187961154514, + "step": 2270 + }, + { + "epoch": 0.6224475811977525, + "grad_norm": 4.9375, + "kl": 3.9046497344970703, + "learning_rate": 5e-06, + "logits/chosen": -28914906.666666668, + "logits/rejected": -20650328.0, + "logps/chosen": -381.34912109375, + "logps/rejected": -402.1669921875, + "loss": 0.0386, + "rewards/chosen": 7.138304392496745, + "rewards/margins": 15.103984832763672, + "rewards/rejected": -7.965680440266927, + "step": 2271 + }, + { + "epoch": 0.6227216664382623, + "grad_norm": 8.5, + "kl": 3.326498031616211, + "learning_rate": 5e-06, + "logits/chosen": -34911008.0, + "logits/rejected": -23523399.272727273, + "logps/chosen": -402.85580679086536, + "logps/rejected": -500.2424982244318, + "loss": 0.0413, + "rewards/chosen": 7.170740567720854, + "rewards/margins": 16.258745633638824, + "rewards/rejected": -9.088005065917969, + "step": 2272 + }, + { + "epoch": 0.6229957516787721, + "grad_norm": 9.375, + "kl": 1.8985061645507812, + "learning_rate": 5e-06, + "logits/chosen": -18583352.0, + "logits/rejected": -7194081.0, + "logps/chosen": -464.0875549316406, + "logps/rejected": -621.821533203125, + "loss": 0.0456, + "rewards/chosen": 5.002586364746094, + "rewards/margins": 18.935165405273438, + "rewards/rejected": -13.932579040527344, + "step": 2273 + }, + { + "epoch": 0.623269836919282, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37601540.92307692, + "logits/rejected": -11611325.090909092, + "logps/chosen": -360.8505859375, + "logps/rejected": -693.8053977272727, + "loss": 0.0458, + "rewards/chosen": 6.0396564190204325, + "rewards/margins": 18.793036934379096, + "rewards/rejected": -12.753380515358664, + "step": 2274 + }, + { + "epoch": 0.6235439221597917, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22005771.2, + "logits/rejected": 195073.14285714287, + "logps/chosen": -285.017724609375, + "logps/rejected": -591.7710658482143, + "loss": 0.0041, + "rewards/chosen": 6.733045196533203, + "rewards/margins": 18.823516736711774, + "rewards/rejected": -12.090471540178571, + "step": 2275 + }, + { + "epoch": 0.6238180074003015, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4046875.2, + "logits/rejected": -14534667.42857143, + "logps/chosen": -313.912646484375, + "logps/rejected": -504.3140345982143, + "loss": 0.0466, + "rewards/chosen": 5.134371566772461, + "rewards/margins": 14.909637941632951, + "rewards/rejected": -9.775266374860491, + "step": 2276 + }, + { + "epoch": 0.6240920926408113, + "grad_norm": 0.7265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37502485.333333336, + "logits/rejected": -17180034.666666668, + "logps/chosen": -506.0890299479167, + "logps/rejected": -505.0467122395833, + "loss": 0.003, + "rewards/chosen": 7.593138376871745, + "rewards/margins": 18.119576772054035, + "rewards/rejected": -10.526438395182291, + "step": 2277 + }, + { + "epoch": 0.6243661778813211, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18931196.0, + "logits/rejected": -33858780.0, + "logps/chosen": -343.61328125, + "logps/rejected": -412.05316162109375, + "loss": 0.0242, + "rewards/chosen": 7.2034010887146, + "rewards/margins": 16.633633136749268, + "rewards/rejected": -9.430232048034668, + "step": 2278 + }, + { + "epoch": 0.6246402631218309, + "grad_norm": 3.375, + "kl": 1.0449600219726562, + "learning_rate": 5e-06, + "logits/chosen": -17770948.8, + "logits/rejected": -23363945.14285714, + "logps/chosen": -585.177880859375, + "logps/rejected": -548.9298967633929, + "loss": 0.0116, + "rewards/chosen": 8.766486358642577, + "rewards/margins": 18.333898598807195, + "rewards/rejected": -9.56741224016462, + "step": 2279 + }, + { + "epoch": 0.6249143483623407, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9247786.4, + "logits/rejected": -30178532.57142857, + "logps/chosen": -517.93046875, + "logps/rejected": -482.6301967075893, + "loss": 0.0039, + "rewards/chosen": 7.4687355041503904, + "rewards/margins": 18.806722041538784, + "rewards/rejected": -11.337986537388392, + "step": 2280 + }, + { + "epoch": 0.6251884336028505, + "grad_norm": 3.6875, + "kl": 0.43939846754074097, + "learning_rate": 5e-06, + "logits/chosen": -45393979.07692308, + "logits/rejected": -13707629.090909092, + "logps/chosen": -433.9523737980769, + "logps/rejected": -711.4117542613636, + "loss": 0.0113, + "rewards/chosen": 6.892977201021635, + "rewards/margins": 19.459043596174332, + "rewards/rejected": -12.5660663951527, + "step": 2281 + }, + { + "epoch": 0.6254625188433602, + "grad_norm": 6.96875, + "kl": 17.30350112915039, + "learning_rate": 5e-06, + "logits/chosen": -7529601.230769231, + "logits/rejected": -19361057.454545453, + "logps/chosen": -527.7155573918269, + "logps/rejected": -492.30366654829544, + "loss": 0.043, + "rewards/chosen": 8.7046872652494, + "rewards/margins": 18.072014175094925, + "rewards/rejected": -9.367326909845525, + "step": 2282 + }, + { + "epoch": 0.6257366040838701, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43543896.615384616, + "logits/rejected": -37276416.0, + "logps/chosen": -472.63585486778845, + "logps/rejected": -463.02388139204544, + "loss": 0.055, + "rewards/chosen": 6.948205801156851, + "rewards/margins": 14.88785590325202, + "rewards/rejected": -7.939650102095171, + "step": 2283 + }, + { + "epoch": 0.6260106893243799, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3146891.272727273, + "logits/rejected": -25475953.230769232, + "logps/chosen": -354.2637384588068, + "logps/rejected": -637.0505558894231, + "loss": 0.0191, + "rewards/chosen": 7.122901222922585, + "rewards/margins": 19.3488075949929, + "rewards/rejected": -12.225906372070312, + "step": 2284 + }, + { + "epoch": 0.6262847745648897, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19408733.09090909, + "logits/rejected": -38241070.76923077, + "logps/chosen": -330.7771661931818, + "logps/rejected": -365.75048828125, + "loss": 0.0679, + "rewards/chosen": 5.283225666392934, + "rewards/margins": 12.858606431867694, + "rewards/rejected": -7.57538076547476, + "step": 2285 + }, + { + "epoch": 0.6265588598053995, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5485379.0, + "logits/rejected": -36871957.333333336, + "logps/chosen": -492.8407796223958, + "logps/rejected": -488.9180908203125, + "loss": 0.0499, + "rewards/chosen": 6.5744279225667315, + "rewards/margins": 16.61695671081543, + "rewards/rejected": -10.042528788248697, + "step": 2286 + }, + { + "epoch": 0.6268329450459093, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34342513.23076923, + "logits/rejected": -1856580.3636363635, + "logps/chosen": -410.1330754206731, + "logps/rejected": -674.4056729403409, + "loss": 0.0058, + "rewards/chosen": 6.63081535926232, + "rewards/margins": 20.743795594968994, + "rewards/rejected": -14.112980235706676, + "step": 2287 + }, + { + "epoch": 0.6271070302864191, + "grad_norm": 9.6875, + "kl": 6.125638961791992, + "learning_rate": 5e-06, + "logits/chosen": -32831960.0, + "logits/rejected": -46639928.0, + "logps/chosen": -465.3251037597656, + "logps/rejected": -707.6123046875, + "loss": 0.0614, + "rewards/chosen": 7.355838298797607, + "rewards/margins": 20.11037588119507, + "rewards/rejected": -12.754537582397461, + "step": 2288 + }, + { + "epoch": 0.6273811155269289, + "grad_norm": 13.75, + "kl": 1.5324440002441406, + "learning_rate": 5e-06, + "logits/chosen": -29377655.272727273, + "logits/rejected": -56797366.15384615, + "logps/chosen": -438.94655539772725, + "logps/rejected": -730.4921875, + "loss": 0.0251, + "rewards/chosen": 7.305580139160156, + "rewards/margins": 22.5052003126878, + "rewards/rejected": -15.199620173527645, + "step": 2289 + }, + { + "epoch": 0.6276552007674386, + "grad_norm": 6.1875, + "kl": 2.48762583732605, + "learning_rate": 5e-06, + "logits/chosen": -12623454.933333334, + "logits/rejected": -12154736.0, + "logps/chosen": -492.37975260416664, + "logps/rejected": -427.2916666666667, + "loss": 0.0333, + "rewards/chosen": 6.629606119791666, + "rewards/margins": 15.18689439561632, + "rewards/rejected": -8.557288275824654, + "step": 2290 + }, + { + "epoch": 0.6279292860079485, + "grad_norm": 10.0, + "kl": 2.3145334720611572, + "learning_rate": 5e-06, + "logits/chosen": -23949728.0, + "logits/rejected": -25529539.2, + "logps/chosen": -411.63643973214283, + "logps/rejected": -507.02646484375, + "loss": 0.0739, + "rewards/chosen": 5.701673235212054, + "rewards/margins": 14.91877681187221, + "rewards/rejected": -9.217103576660156, + "step": 2291 + }, + { + "epoch": 0.6282033712484583, + "grad_norm": 2.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12644534.222222222, + "logits/rejected": -13222967.466666667, + "logps/chosen": -386.4619140625, + "logps/rejected": -594.2703125, + "loss": 0.0093, + "rewards/chosen": 6.029425726996528, + "rewards/margins": 16.267874823676216, + "rewards/rejected": -10.238449096679688, + "step": 2292 + }, + { + "epoch": 0.628477456488968, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37427508.0, + "logits/rejected": -1673542.0, + "logps/chosen": -483.31781005859375, + "logps/rejected": -564.7496948242188, + "loss": 0.0151, + "rewards/chosen": 5.641016960144043, + "rewards/margins": 15.399352073669434, + "rewards/rejected": -9.75833511352539, + "step": 2293 + }, + { + "epoch": 0.6287515417294779, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34402316.307692304, + "logits/rejected": -43835624.72727273, + "logps/chosen": -474.27016977163464, + "logps/rejected": -569.9651988636364, + "loss": 0.0405, + "rewards/chosen": 6.367481525127705, + "rewards/margins": 17.13222151536208, + "rewards/rejected": -10.764739990234375, + "step": 2294 + }, + { + "epoch": 0.6290256269699877, + "grad_norm": 2.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13494114.909090908, + "logits/rejected": -14723051.076923076, + "logps/chosen": -453.0692027698864, + "logps/rejected": -712.8766526442307, + "loss": 0.0077, + "rewards/chosen": 6.962916981090199, + "rewards/margins": 21.580088048548134, + "rewards/rejected": -14.617171067457933, + "step": 2295 + }, + { + "epoch": 0.6292997122104975, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25244100.57142857, + "logits/rejected": -40936000.0, + "logps/chosen": -393.15391322544644, + "logps/rejected": -584.1951976102941, + "loss": 0.015, + "rewards/chosen": 5.582432883126395, + "rewards/margins": 16.03374118965213, + "rewards/rejected": -10.451308306525736, + "step": 2296 + }, + { + "epoch": 0.6295737974510073, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53077652.0, + "logits/rejected": -37517112.0, + "logps/chosen": -499.0904541015625, + "logps/rejected": -514.0933837890625, + "loss": 0.0613, + "rewards/chosen": 6.013070106506348, + "rewards/margins": 16.237783432006836, + "rewards/rejected": -10.224713325500488, + "step": 2297 + }, + { + "epoch": 0.629847882691517, + "grad_norm": 10.0, + "kl": 8.113983154296875, + "learning_rate": 5e-06, + "logits/chosen": -8341437.818181818, + "logits/rejected": -17020419.692307692, + "logps/chosen": -349.5145152698864, + "logps/rejected": -507.88773287259613, + "loss": 0.0759, + "rewards/chosen": 6.918300281871449, + "rewards/margins": 16.412371068567666, + "rewards/rejected": -9.494070786696215, + "step": 2298 + }, + { + "epoch": 0.6301219679320269, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20421089.777777776, + "logits/rejected": -18416699.733333334, + "logps/chosen": -357.68462456597223, + "logps/rejected": -532.60068359375, + "loss": 0.024, + "rewards/chosen": 6.925285339355469, + "rewards/margins": 16.77421315511068, + "rewards/rejected": -9.848927815755209, + "step": 2299 + }, + { + "epoch": 0.6303960531725367, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9707054.0, + "logits/rejected": -14030388.0, + "logps/chosen": -566.62255859375, + "logps/rejected": -677.9676513671875, + "loss": 0.0041, + "rewards/chosen": 6.222118377685547, + "rewards/margins": 22.764619827270508, + "rewards/rejected": -16.54250144958496, + "step": 2300 + }, + { + "epoch": 0.6306701384130464, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37980002.90909091, + "logits/rejected": -22337585.230769232, + "logps/chosen": -473.77530184659093, + "logps/rejected": -572.0253155048077, + "loss": 0.0115, + "rewards/chosen": 7.055314497514204, + "rewards/margins": 19.938596498716127, + "rewards/rejected": -12.883282001201923, + "step": 2301 + }, + { + "epoch": 0.6309442236535563, + "grad_norm": 17.125, + "kl": 5.685277462005615, + "learning_rate": 5e-06, + "logits/chosen": -22372866.666666668, + "logits/rejected": -14088564.0, + "logps/chosen": -456.0694580078125, + "logps/rejected": -469.5743001302083, + "loss": 0.041, + "rewards/chosen": 6.926605860392253, + "rewards/margins": 18.088432947794598, + "rewards/rejected": -11.161827087402344, + "step": 2302 + }, + { + "epoch": 0.631218308894066, + "grad_norm": 1.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25157341.09090909, + "logits/rejected": -37053828.92307692, + "logps/chosen": -373.95987215909093, + "logps/rejected": -547.251953125, + "loss": 0.0075, + "rewards/chosen": 5.347729422829368, + "rewards/margins": 14.709054026570353, + "rewards/rejected": -9.361324603740986, + "step": 2303 + }, + { + "epoch": 0.6314923941345758, + "grad_norm": 5.375, + "kl": 1.20572030544281, + "learning_rate": 5e-06, + "logits/chosen": -29836357.818181816, + "logits/rejected": -26018929.230769232, + "logps/chosen": -418.43776633522725, + "logps/rejected": -552.5463115985577, + "loss": 0.0437, + "rewards/chosen": 7.083278309215199, + "rewards/margins": 16.88207415600757, + "rewards/rejected": -9.798795846792368, + "step": 2304 + }, + { + "epoch": 0.6317664793750857, + "grad_norm": 1.0, + "kl": 1.6449006795883179, + "learning_rate": 5e-06, + "logits/chosen": -10193989.818181818, + "logits/rejected": -25633612.307692308, + "logps/chosen": -419.53196022727275, + "logps/rejected": -599.2531550480769, + "loss": 0.003, + "rewards/chosen": 7.188887856223366, + "rewards/margins": 19.139768640478174, + "rewards/rejected": -11.950880784254808, + "step": 2305 + }, + { + "epoch": 0.6320405646155954, + "grad_norm": 14.375, + "kl": 9.879964828491211, + "learning_rate": 5e-06, + "logits/chosen": -7618883.733333333, + "logits/rejected": 7301377.777777778, + "logps/chosen": -356.38701171875, + "logps/rejected": -561.7055121527778, + "loss": 0.0941, + "rewards/chosen": 7.386067199707031, + "rewards/margins": 15.116829257541234, + "rewards/rejected": -7.730762057834202, + "step": 2306 + }, + { + "epoch": 0.6323146498561053, + "grad_norm": 8.0625, + "kl": 3.6283748149871826, + "learning_rate": 5e-06, + "logits/chosen": -26239396.57142857, + "logits/rejected": -30913555.2, + "logps/chosen": -517.2726353236607, + "logps/rejected": -508.691259765625, + "loss": 0.0179, + "rewards/chosen": 7.782998221261161, + "rewards/margins": 19.39622606549944, + "rewards/rejected": -11.613227844238281, + "step": 2307 + }, + { + "epoch": 0.6325887350966151, + "grad_norm": 8.625, + "kl": 6.9409990310668945, + "learning_rate": 5e-06, + "logits/chosen": -26666866.666666668, + "logits/rejected": -24800722.666666668, + "logps/chosen": -475.7073160807292, + "logps/rejected": -515.9648844401041, + "loss": 0.026, + "rewards/chosen": 7.165111541748047, + "rewards/margins": 19.997486114501953, + "rewards/rejected": -12.832374572753906, + "step": 2308 + }, + { + "epoch": 0.6328628203371248, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32202144.0, + "logits/rejected": -12682520.615384616, + "logps/chosen": -424.1964666193182, + "logps/rejected": -480.58067908653845, + "loss": 0.0083, + "rewards/chosen": 6.471744884144176, + "rewards/margins": 15.837552050610523, + "rewards/rejected": -9.365807166466347, + "step": 2309 + }, + { + "epoch": 0.6331369055776347, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21516749.09090909, + "logits/rejected": -33420253.53846154, + "logps/chosen": -421.96102627840907, + "logps/rejected": -614.6456580528846, + "loss": 0.0741, + "rewards/chosen": 5.479234175248579, + "rewards/margins": 19.430617325789445, + "rewards/rejected": -13.951383150540865, + "step": 2310 + }, + { + "epoch": 0.6334109908181444, + "grad_norm": 9.25, + "kl": 5.516953945159912, + "learning_rate": 5e-06, + "logits/chosen": 29320667.42857143, + "logits/rejected": -886903.2, + "logps/chosen": -509.23779296875, + "logps/rejected": -472.78642578125, + "loss": 0.0465, + "rewards/chosen": 8.177165985107422, + "rewards/margins": 14.009296417236328, + "rewards/rejected": -5.832130432128906, + "step": 2311 + }, + { + "epoch": 0.6336850760586542, + "grad_norm": 15.1875, + "kl": 12.079658508300781, + "learning_rate": 5e-06, + "logits/chosen": -9545305.6, + "logits/rejected": -13750808.0, + "logps/chosen": -454.905078125, + "logps/rejected": -503.0817057291667, + "loss": 0.0949, + "rewards/chosen": 6.467893473307291, + "rewards/margins": 16.031575181749133, + "rewards/rejected": -9.563681708441841, + "step": 2312 + }, + { + "epoch": 0.6339591612991641, + "grad_norm": 0.423828125, + "kl": 4.344091415405273, + "learning_rate": 5e-06, + "logits/chosen": -8998414.857142856, + "logits/rejected": -20918832.0, + "logps/chosen": -433.5353306361607, + "logps/rejected": -486.877294921875, + "loss": 0.0011, + "rewards/chosen": 8.680709838867188, + "rewards/margins": 20.301110076904298, + "rewards/rejected": -11.62040023803711, + "step": 2313 + }, + { + "epoch": 0.6342332465396738, + "grad_norm": 4.90625, + "kl": 3.0371971130371094, + "learning_rate": 5e-06, + "logits/chosen": -37654754.90909091, + "logits/rejected": -26303320.615384616, + "logps/chosen": -494.57936789772725, + "logps/rejected": -546.6742037259615, + "loss": 0.0187, + "rewards/chosen": 7.742182644930753, + "rewards/margins": 18.387422841745657, + "rewards/rejected": -10.645240196814903, + "step": 2314 + }, + { + "epoch": 0.6345073317801836, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15077793.0, + "logits/rejected": -23345014.0, + "logps/chosen": -342.3008728027344, + "logps/rejected": -609.821044921875, + "loss": 0.0719, + "rewards/chosen": 5.483484745025635, + "rewards/margins": 17.1940598487854, + "rewards/rejected": -11.710575103759766, + "step": 2315 + }, + { + "epoch": 0.6347814170206935, + "grad_norm": 7.21875, + "kl": 6.7192206382751465, + "learning_rate": 5e-06, + "logits/chosen": -20094996.363636363, + "logits/rejected": -3233352.6153846155, + "logps/chosen": -370.68379350142044, + "logps/rejected": -667.7420372596154, + "loss": 0.032, + "rewards/chosen": 6.343107050115412, + "rewards/margins": 17.87626322499522, + "rewards/rejected": -11.533156174879808, + "step": 2316 + }, + { + "epoch": 0.6350555022612032, + "grad_norm": 2.734375, + "kl": 0.5754903554916382, + "learning_rate": 5e-06, + "logits/chosen": -19414284.0, + "logits/rejected": -22842405.333333332, + "logps/chosen": -523.7522786458334, + "logps/rejected": -642.9419759114584, + "loss": 0.0416, + "rewards/chosen": 7.413881301879883, + "rewards/margins": 18.492673873901367, + "rewards/rejected": -11.078792572021484, + "step": 2317 + }, + { + "epoch": 0.635329587501713, + "grad_norm": 3.90625, + "kl": 0.7668317556381226, + "learning_rate": 5e-06, + "logits/chosen": -9128134.666666666, + "logits/rejected": -17408577.333333332, + "logps/chosen": -369.7271321614583, + "logps/rejected": -542.6063639322916, + "loss": 0.0099, + "rewards/chosen": 7.391726175944011, + "rewards/margins": 18.684546152750652, + "rewards/rejected": -11.29281997680664, + "step": 2318 + }, + { + "epoch": 0.6356036727422228, + "grad_norm": 1.3671875, + "kl": 6.559651851654053, + "learning_rate": 5e-06, + "logits/chosen": 355193.4736842105, + "logits/rejected": -21128665.6, + "logps/chosen": -512.1345600328947, + "logps/rejected": -463.432470703125, + "loss": 0.0049, + "rewards/chosen": 8.336070010536595, + "rewards/margins": 17.838362643593236, + "rewards/rejected": -9.50229263305664, + "step": 2319 + }, + { + "epoch": 0.6358777579827326, + "grad_norm": 0.71484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 145204.0, + "logits/rejected": -31504169.14285714, + "logps/chosen": -449.46748046875, + "logps/rejected": -507.3549107142857, + "loss": 0.002, + "rewards/chosen": 7.40081787109375, + "rewards/margins": 19.58673095703125, + "rewards/rejected": -12.1859130859375, + "step": 2320 + }, + { + "epoch": 0.6361518432232425, + "grad_norm": 5.09375, + "kl": 0.19303512573242188, + "learning_rate": 5e-06, + "logits/chosen": -16429948.444444444, + "logits/rejected": -17306257.066666666, + "logps/chosen": -501.5050998263889, + "logps/rejected": -517.4429036458333, + "loss": 0.013, + "rewards/chosen": 9.921808878580729, + "rewards/margins": 19.121229044596355, + "rewards/rejected": -9.199420166015624, + "step": 2321 + }, + { + "epoch": 0.6364259284637522, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30123704.0, + "logits/rejected": -29932821.333333332, + "logps/chosen": -346.3811848958333, + "logps/rejected": -434.4442545572917, + "loss": 0.0375, + "rewards/chosen": 6.315926869710286, + "rewards/margins": 17.9724858601888, + "rewards/rejected": -11.656558990478516, + "step": 2322 + }, + { + "epoch": 0.636700013704262, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22319869.53846154, + "logits/rejected": -21658976.0, + "logps/chosen": -459.07327974759613, + "logps/rejected": -587.9909446022727, + "loss": 0.0184, + "rewards/chosen": 6.18115234375, + "rewards/margins": 17.56654219193892, + "rewards/rejected": -11.38538984818892, + "step": 2323 + }, + { + "epoch": 0.6369740989447719, + "grad_norm": 6.90625, + "kl": 4.074743270874023, + "learning_rate": 5e-06, + "logits/chosen": -6238227.636363637, + "logits/rejected": -25485080.615384616, + "logps/chosen": -524.9512606534091, + "logps/rejected": -371.5505558894231, + "loss": 0.022, + "rewards/chosen": 8.001977400346236, + "rewards/margins": 17.499626373077607, + "rewards/rejected": -9.49764897273137, + "step": 2324 + }, + { + "epoch": 0.6372481841852816, + "grad_norm": 2.984375, + "kl": 1.0259336233139038, + "learning_rate": 5e-06, + "logits/chosen": -20433590.85714286, + "logits/rejected": -27995561.411764707, + "logps/chosen": -483.96456473214283, + "logps/rejected": -561.9739200367648, + "loss": 0.008, + "rewards/chosen": 7.787757328578404, + "rewards/margins": 18.86858223666664, + "rewards/rejected": -11.080824908088236, + "step": 2325 + }, + { + "epoch": 0.6375222694257914, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3121972.727272727, + "logits/rejected": 67202402.46153846, + "logps/chosen": -390.68661221590907, + "logps/rejected": -484.59652944711536, + "loss": 0.0801, + "rewards/chosen": 5.6017608642578125, + "rewards/margins": 15.38384540264423, + "rewards/rejected": -9.782084538386417, + "step": 2326 + }, + { + "epoch": 0.6377963546663012, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29437728.0, + "logits/rejected": -16977150.769230768, + "logps/chosen": -311.2415660511364, + "logps/rejected": -580.9782902644231, + "loss": 0.0595, + "rewards/chosen": 4.952129017223012, + "rewards/margins": 14.542488044792123, + "rewards/rejected": -9.59035902756911, + "step": 2327 + }, + { + "epoch": 0.638070439906811, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18652440.0, + "logits/rejected": -5001249.142857143, + "logps/chosen": -467.490185546875, + "logps/rejected": -627.8726981026786, + "loss": 0.0137, + "rewards/chosen": 6.315210342407227, + "rewards/margins": 19.141584941319056, + "rewards/rejected": -12.82637459891183, + "step": 2328 + }, + { + "epoch": 0.6383445251473208, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28123346.285714287, + "logits/rejected": -21821596.8, + "logps/chosen": -368.73740931919644, + "logps/rejected": -530.20439453125, + "loss": 0.0166, + "rewards/chosen": 7.064862387520926, + "rewards/margins": 16.483714621407646, + "rewards/rejected": -9.418852233886719, + "step": 2329 + }, + { + "epoch": 0.6386186103878306, + "grad_norm": 6.1875, + "kl": 7.7794342041015625, + "learning_rate": 5e-06, + "logits/chosen": 61729867.63636363, + "logits/rejected": -31922451.692307692, + "logps/chosen": -469.67724609375, + "logps/rejected": -513.8130258413462, + "loss": 0.0501, + "rewards/chosen": 7.168978604403409, + "rewards/margins": 16.779873614544634, + "rewards/rejected": -9.610895010141226, + "step": 2330 + }, + { + "epoch": 0.6388926956283404, + "grad_norm": 6.53125, + "kl": 7.858129024505615, + "learning_rate": 5e-06, + "logits/chosen": 1369768.0, + "logits/rejected": -35586483.2, + "logps/chosen": -357.281005859375, + "logps/rejected": -428.3232421875, + "loss": 0.0542, + "rewards/chosen": 5.778694152832031, + "rewards/margins": 14.647206878662109, + "rewards/rejected": -8.868512725830078, + "step": 2331 + }, + { + "epoch": 0.6391667808688503, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17287218.285714287, + "logits/rejected": -10285784.8, + "logps/chosen": -392.54202706473217, + "logps/rejected": -336.4615234375, + "loss": 0.0393, + "rewards/chosen": 6.529656546456473, + "rewards/margins": 15.074109976632254, + "rewards/rejected": -8.544453430175782, + "step": 2332 + }, + { + "epoch": 0.63944086610936, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41154666.666666664, + "logits/rejected": -19178042.666666668, + "logps/chosen": -536.8623046875, + "logps/rejected": -552.2638346354166, + "loss": 0.0548, + "rewards/chosen": 7.7133839925130205, + "rewards/margins": 19.895955403645832, + "rewards/rejected": -12.182571411132812, + "step": 2333 + }, + { + "epoch": 0.6397149513498698, + "grad_norm": 6.4375, + "kl": 7.528522491455078, + "learning_rate": 5e-06, + "logits/chosen": -2465454.285714286, + "logits/rejected": -45524620.8, + "logps/chosen": -361.009765625, + "logps/rejected": -650.120263671875, + "loss": 0.0248, + "rewards/chosen": 6.015021732875279, + "rewards/margins": 18.587505558558874, + "rewards/rejected": -12.572483825683594, + "step": 2334 + }, + { + "epoch": 0.6399890365903796, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30254729.14285714, + "logits/rejected": -13082000.0, + "logps/chosen": -356.91238839285717, + "logps/rejected": -554.8513327205883, + "loss": 0.0357, + "rewards/chosen": 6.560591561453683, + "rewards/margins": 17.476464471897156, + "rewards/rejected": -10.915872910443474, + "step": 2335 + }, + { + "epoch": 0.6402631218308894, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29771928.888888888, + "logits/rejected": -21605442.133333333, + "logps/chosen": -345.9914822048611, + "logps/rejected": -557.30625, + "loss": 0.0716, + "rewards/chosen": 5.677116394042969, + "rewards/margins": 15.64331512451172, + "rewards/rejected": -9.96619873046875, + "step": 2336 + }, + { + "epoch": 0.6405372070713992, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21565904.0, + "logits/rejected": -19146709.333333332, + "logps/chosen": -339.96202256944446, + "logps/rejected": -467.99853515625, + "loss": 0.0318, + "rewards/chosen": 6.763929578993055, + "rewards/margins": 15.667203097873264, + "rewards/rejected": -8.903273518880209, + "step": 2337 + }, + { + "epoch": 0.640811292311909, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45095341.71428572, + "logits/rejected": -29859275.29411765, + "logps/chosen": -327.95455496651783, + "logps/rejected": -416.1090877757353, + "loss": 0.0279, + "rewards/chosen": 6.724583217075893, + "rewards/margins": 17.959997609883796, + "rewards/rejected": -11.235414392807904, + "step": 2338 + }, + { + "epoch": 0.6410853775524188, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17092460.0, + "logits/rejected": -24352448.0, + "logps/chosen": -421.8753255208333, + "logps/rejected": -449.3505045572917, + "loss": 0.0287, + "rewards/chosen": 7.7597707112630205, + "rewards/margins": 17.45901934305827, + "rewards/rejected": -9.699248631795248, + "step": 2339 + }, + { + "epoch": 0.6413594627929285, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14446902.222222222, + "logits/rejected": -49818965.333333336, + "logps/chosen": -284.30419921875, + "logps/rejected": -558.7053385416667, + "loss": 0.0318, + "rewards/chosen": 5.588070339626736, + "rewards/margins": 17.380123562282986, + "rewards/rejected": -11.79205322265625, + "step": 2340 + }, + { + "epoch": 0.6416335480334384, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32604261.818181816, + "logits/rejected": -7479740.307692308, + "logps/chosen": -429.22811612215907, + "logps/rejected": -346.5366962139423, + "loss": 0.0224, + "rewards/chosen": 6.569667469371449, + "rewards/margins": 15.146788910552338, + "rewards/rejected": -8.57712144118089, + "step": 2341 + }, + { + "epoch": 0.6419076332739482, + "grad_norm": 2.28125, + "kl": 3.3816030025482178, + "learning_rate": 5e-06, + "logits/chosen": -27410640.0, + "logits/rejected": -13437534.666666666, + "logps/chosen": -489.5791015625, + "logps/rejected": -482.6736653645833, + "loss": 0.0073, + "rewards/chosen": 7.363979339599609, + "rewards/margins": 18.330762227376304, + "rewards/rejected": -10.966782887776693, + "step": 2342 + }, + { + "epoch": 0.642181718514458, + "grad_norm": 13.875, + "kl": 10.152392387390137, + "learning_rate": 5e-06, + "logits/chosen": -8672636.0, + "logits/rejected": -37641458.666666664, + "logps/chosen": -504.8428548177083, + "logps/rejected": -487.8741048177083, + "loss": 0.0532, + "rewards/chosen": 7.474520365397136, + "rewards/margins": 20.21362813313802, + "rewards/rejected": -12.739107767740885, + "step": 2343 + }, + { + "epoch": 0.6424558037549678, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40198976.0, + "logits/rejected": -22923808.0, + "logps/chosen": -414.36807528409093, + "logps/rejected": -607.5328275240385, + "loss": 0.0296, + "rewards/chosen": 7.251310868696733, + "rewards/margins": 21.752019815511638, + "rewards/rejected": -14.500708946814903, + "step": 2344 + }, + { + "epoch": 0.6427298889954776, + "grad_norm": 8.25, + "kl": 0.8501071929931641, + "learning_rate": 5e-06, + "logits/chosen": -30434025.14285714, + "logits/rejected": -33630716.8, + "logps/chosen": -353.69492885044644, + "logps/rejected": -477.63974609375, + "loss": 0.1252, + "rewards/chosen": 3.9225575583321706, + "rewards/margins": 16.000522341047013, + "rewards/rejected": -12.077964782714844, + "step": 2345 + }, + { + "epoch": 0.6430039742359874, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44128211.692307696, + "logits/rejected": -16780404.363636363, + "logps/chosen": -479.3534405048077, + "logps/rejected": -483.9073597301136, + "loss": 0.0178, + "rewards/chosen": 7.617608290452224, + "rewards/margins": 17.081545529665647, + "rewards/rejected": -9.463937239213424, + "step": 2346 + }, + { + "epoch": 0.6432780594764972, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3874041.777777778, + "logits/rejected": -30948821.333333332, + "logps/chosen": -465.44048394097223, + "logps/rejected": -476.5234049479167, + "loss": 0.0682, + "rewards/chosen": 4.74562750922309, + "rewards/margins": 15.011470201280382, + "rewards/rejected": -10.265842692057292, + "step": 2347 + }, + { + "epoch": 0.643552144717007, + "grad_norm": 6.8125, + "kl": 5.170435905456543, + "learning_rate": 5e-06, + "logits/chosen": -39074262.85714286, + "logits/rejected": -9117756.0, + "logps/chosen": -397.8086635044643, + "logps/rejected": -531.473193359375, + "loss": 0.021, + "rewards/chosen": 6.424537658691406, + "rewards/margins": 18.928053283691405, + "rewards/rejected": -12.503515625, + "step": 2348 + }, + { + "epoch": 0.6438262299575168, + "grad_norm": 7.0625, + "kl": 4.23336935043335, + "learning_rate": 5e-06, + "logits/chosen": 7816816.0, + "logits/rejected": -32774717.866666667, + "logps/chosen": -443.05211046006946, + "logps/rejected": -536.3595703125, + "loss": 0.0873, + "rewards/chosen": 5.78870349460178, + "rewards/margins": 15.840245649549697, + "rewards/rejected": -10.051542154947917, + "step": 2349 + }, + { + "epoch": 0.6441003151980266, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21924380.8, + "logits/rejected": -16180092.57142857, + "logps/chosen": -488.2400390625, + "logps/rejected": -641.7616489955357, + "loss": 0.0156, + "rewards/chosen": 7.394422912597657, + "rewards/margins": 19.201798139299665, + "rewards/rejected": -11.807375226702009, + "step": 2350 + }, + { + "epoch": 0.6443744004385363, + "grad_norm": 13.4375, + "kl": 4.134966850280762, + "learning_rate": 5e-06, + "logits/chosen": -13911009.6, + "logits/rejected": -30102571.42857143, + "logps/chosen": -463.824462890625, + "logps/rejected": -531.302734375, + "loss": 0.0456, + "rewards/chosen": 7.647559356689453, + "rewards/margins": 18.627834102085657, + "rewards/rejected": -10.980274745396205, + "step": 2351 + }, + { + "epoch": 0.6446484856790462, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9465977.333333334, + "logits/rejected": -13664372.0, + "logps/chosen": -492.649169921875, + "logps/rejected": -701.13232421875, + "loss": 0.025, + "rewards/chosen": 6.622893651326497, + "rewards/margins": 20.400360743204754, + "rewards/rejected": -13.777467091878256, + "step": 2352 + }, + { + "epoch": 0.644922570919556, + "grad_norm": 1.0546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25064622.222222224, + "logits/rejected": -23515569.066666666, + "logps/chosen": -460.23687065972223, + "logps/rejected": -651.0613932291667, + "loss": 0.0033, + "rewards/chosen": 7.374298095703125, + "rewards/margins": 22.99669392903646, + "rewards/rejected": -15.622395833333334, + "step": 2353 + }, + { + "epoch": 0.6451966561600658, + "grad_norm": 10.5, + "kl": 2.155231475830078, + "learning_rate": 5e-06, + "logits/chosen": -13254733.714285715, + "logits/rejected": -25953347.2, + "logps/chosen": -387.84312220982144, + "logps/rejected": -531.51884765625, + "loss": 0.0367, + "rewards/chosen": 6.082830156598773, + "rewards/margins": 19.67585231236049, + "rewards/rejected": -13.59302215576172, + "step": 2354 + }, + { + "epoch": 0.6454707414005756, + "grad_norm": 2.421875, + "kl": 5.181491851806641, + "learning_rate": 5e-06, + "logits/chosen": 105593708.3076923, + "logits/rejected": -31465506.90909091, + "logps/chosen": -528.9339693509615, + "logps/rejected": -591.6859019886364, + "loss": 0.0046, + "rewards/chosen": 9.117188673753004, + "rewards/margins": 21.01971414205911, + "rewards/rejected": -11.902525468306107, + "step": 2355 + }, + { + "epoch": 0.6457448266410853, + "grad_norm": 6.90625, + "kl": 4.034279823303223, + "learning_rate": 5e-06, + "logits/chosen": -23985317.333333332, + "logits/rejected": 26595333.333333332, + "logps/chosen": -418.4737955729167, + "logps/rejected": -409.1017252604167, + "loss": 0.0199, + "rewards/chosen": 6.720266342163086, + "rewards/margins": 17.179765701293945, + "rewards/rejected": -10.45949935913086, + "step": 2356 + }, + { + "epoch": 0.6460189118815952, + "grad_norm": 14.0, + "kl": 2.889268398284912, + "learning_rate": 5e-06, + "logits/chosen": -13208564.57142857, + "logits/rejected": -21171721.6, + "logps/chosen": -421.3032924107143, + "logps/rejected": -573.799658203125, + "loss": 0.0686, + "rewards/chosen": 5.283702850341797, + "rewards/margins": 14.694955444335937, + "rewards/rejected": -9.41125259399414, + "step": 2357 + }, + { + "epoch": 0.646292997122105, + "grad_norm": 3.953125, + "kl": 8.157577514648438, + "learning_rate": 5e-06, + "logits/chosen": -23945902.769230768, + "logits/rejected": 13293589.818181818, + "logps/chosen": -450.8552809495192, + "logps/rejected": -797.7583451704545, + "loss": 0.0079, + "rewards/chosen": 8.58715585561899, + "rewards/margins": 23.505053860324246, + "rewards/rejected": -14.917898004705256, + "step": 2358 + }, + { + "epoch": 0.6465670823626147, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28768258.46153846, + "logits/rejected": -41018801.45454545, + "logps/chosen": -458.1608323317308, + "logps/rejected": -779.330078125, + "loss": 0.0087, + "rewards/chosen": 6.51407975416917, + "rewards/margins": 23.026547571995874, + "rewards/rejected": -16.512467817826703, + "step": 2359 + }, + { + "epoch": 0.6468411676031246, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29731560.0, + "logits/rejected": -15944374.666666666, + "logps/chosen": -396.8806966145833, + "logps/rejected": -730.0804850260416, + "loss": 0.0249, + "rewards/chosen": 5.815690358479817, + "rewards/margins": 21.746487935384113, + "rewards/rejected": -15.930797576904297, + "step": 2360 + }, + { + "epoch": 0.6471152528436344, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32752887.466666665, + "logits/rejected": -15490561.777777778, + "logps/chosen": -397.6343098958333, + "logps/rejected": -488.8595920138889, + "loss": 0.0195, + "rewards/chosen": 6.761081949869792, + "rewards/margins": 17.094765387641058, + "rewards/rejected": -10.333683437771267, + "step": 2361 + }, + { + "epoch": 0.6473893380841441, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39789122.666666664, + "logits/rejected": -32285781.333333332, + "logps/chosen": -553.6888834635416, + "logps/rejected": -458.9791666666667, + "loss": 0.014, + "rewards/chosen": 7.516263961791992, + "rewards/margins": 17.75932947794596, + "rewards/rejected": -10.24306551615397, + "step": 2362 + }, + { + "epoch": 0.647663423324654, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18326115.692307692, + "logits/rejected": -30175403.636363637, + "logps/chosen": -356.01724008413464, + "logps/rejected": -479.7454723011364, + "loss": 0.04, + "rewards/chosen": 5.8964397723858175, + "rewards/margins": 17.24023138726508, + "rewards/rejected": -11.343791614879262, + "step": 2363 + }, + { + "epoch": 0.6479375085651637, + "grad_norm": 0.72265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24024907.42857143, + "logits/rejected": -15302847.05882353, + "logps/chosen": -698.2177734375, + "logps/rejected": -581.2410386029412, + "loss": 0.0021, + "rewards/chosen": 9.396852765764509, + "rewards/margins": 22.64561500869879, + "rewards/rejected": -13.248762242934284, + "step": 2364 + }, + { + "epoch": 0.6482115938056736, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1761070.4615384615, + "logits/rejected": -25238597.818181816, + "logps/chosen": -409.23989633413464, + "logps/rejected": -593.0818536931819, + "loss": 0.0223, + "rewards/chosen": 6.754403334397536, + "rewards/margins": 18.655319694038873, + "rewards/rejected": -11.900916359641336, + "step": 2365 + }, + { + "epoch": 0.6484856790461834, + "grad_norm": 8.625, + "kl": 5.327315330505371, + "learning_rate": 5e-06, + "logits/chosen": -20136120.0, + "logits/rejected": -24128426.666666668, + "logps/chosen": -402.1129964192708, + "logps/rejected": -531.659912109375, + "loss": 0.0299, + "rewards/chosen": 5.934655507405599, + "rewards/margins": 16.89653778076172, + "rewards/rejected": -10.96188227335612, + "step": 2366 + }, + { + "epoch": 0.6487597642866931, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17566180.8, + "logits/rejected": -23193666.285714287, + "logps/chosen": -429.942919921875, + "logps/rejected": -498.27197265625, + "loss": 0.0123, + "rewards/chosen": 6.3926342010498045, + "rewards/margins": 16.23071665082659, + "rewards/rejected": -9.838082449776786, + "step": 2367 + }, + { + "epoch": 0.649033849527203, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27534781.333333332, + "logits/rejected": -33993546.666666664, + "logps/chosen": -432.480712890625, + "logps/rejected": -390.7391764322917, + "loss": 0.0243, + "rewards/chosen": 7.254103342692058, + "rewards/margins": 16.214122772216797, + "rewards/rejected": -8.96001942952474, + "step": 2368 + }, + { + "epoch": 0.6493079347677128, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30589177.6, + "logits/rejected": -12942620.57142857, + "logps/chosen": -341.2389404296875, + "logps/rejected": -569.0523158482143, + "loss": 0.0375, + "rewards/chosen": 5.284690856933594, + "rewards/margins": 16.175351824079243, + "rewards/rejected": -10.890660967145648, + "step": 2369 + }, + { + "epoch": 0.6495820200082225, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33577688.0, + "logits/rejected": -39207992.0, + "logps/chosen": -279.7102355957031, + "logps/rejected": -620.41162109375, + "loss": 0.0242, + "rewards/chosen": 6.151074409484863, + "rewards/margins": 16.85827350616455, + "rewards/rejected": -10.707199096679688, + "step": 2370 + }, + { + "epoch": 0.6498561052487324, + "grad_norm": 5.5, + "kl": 5.667426109313965, + "learning_rate": 5e-06, + "logits/chosen": -29888366.769230768, + "logits/rejected": -9928642.181818182, + "logps/chosen": -438.35437950721155, + "logps/rejected": -565.68408203125, + "loss": 0.0178, + "rewards/chosen": 7.300264211801382, + "rewards/margins": 17.42004154445408, + "rewards/rejected": -10.1197773326527, + "step": 2371 + }, + { + "epoch": 0.6501301904892421, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4937357.818181818, + "logits/rejected": 17582385.230769232, + "logps/chosen": -454.974609375, + "logps/rejected": -511.13882211538464, + "loss": 0.0297, + "rewards/chosen": 7.677060213955966, + "rewards/margins": 17.98216087501366, + "rewards/rejected": -10.305100661057692, + "step": 2372 + }, + { + "epoch": 0.6504042757297519, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28550678.153846152, + "logits/rejected": -7688848.0, + "logps/chosen": -529.1025390625, + "logps/rejected": -731.4957386363636, + "loss": 0.0245, + "rewards/chosen": 6.783702556903545, + "rewards/margins": 18.665741020149284, + "rewards/rejected": -11.882038463245738, + "step": 2373 + }, + { + "epoch": 0.6506783609702618, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12875878.666666666, + "logits/rejected": -14778764.0, + "logps/chosen": -540.1568603515625, + "logps/rejected": -422.6492513020833, + "loss": 0.0131, + "rewards/chosen": 7.062317530314128, + "rewards/margins": 17.698633193969727, + "rewards/rejected": -10.6363156636556, + "step": 2374 + }, + { + "epoch": 0.6509524462107715, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31383603.2, + "logits/rejected": 9424926.285714285, + "logps/chosen": -388.638525390625, + "logps/rejected": -504.67640904017856, + "loss": 0.0114, + "rewards/chosen": 7.081339263916016, + "rewards/margins": 18.945257023402625, + "rewards/rejected": -11.863917759486608, + "step": 2375 + }, + { + "epoch": 0.6512265314512814, + "grad_norm": 5.3125, + "kl": 1.1310060024261475, + "learning_rate": 5e-06, + "logits/chosen": -20371012.0, + "logits/rejected": -30331644.0, + "logps/chosen": -417.0988464355469, + "logps/rejected": -439.4571838378906, + "loss": 0.0157, + "rewards/chosen": 7.391225814819336, + "rewards/margins": 16.17526149749756, + "rewards/rejected": -8.784035682678223, + "step": 2376 + }, + { + "epoch": 0.6515006166917912, + "grad_norm": 6.65625, + "kl": 4.513184547424316, + "learning_rate": 5e-06, + "logits/chosen": -15327075.2, + "logits/rejected": -15445168.0, + "logps/chosen": -356.13154296875, + "logps/rejected": -476.2216099330357, + "loss": 0.0294, + "rewards/chosen": 6.579014587402344, + "rewards/margins": 16.143630109514508, + "rewards/rejected": -9.564615522112165, + "step": 2377 + }, + { + "epoch": 0.6517747019323009, + "grad_norm": 5.875, + "kl": 8.134163856506348, + "learning_rate": 5e-06, + "logits/chosen": -33753467.428571425, + "logits/rejected": -41146387.2, + "logps/chosen": -438.63462611607144, + "logps/rejected": -379.715478515625, + "loss": 0.0179, + "rewards/chosen": 7.405079432896206, + "rewards/margins": 14.913815089634486, + "rewards/rejected": -7.508735656738281, + "step": 2378 + }, + { + "epoch": 0.6520487871728108, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13849171.692307692, + "logits/rejected": 2136717.8181818184, + "logps/chosen": -375.2252854567308, + "logps/rejected": -592.2590997869319, + "loss": 0.0034, + "rewards/chosen": 7.080760075495793, + "rewards/margins": 19.69147646177065, + "rewards/rejected": -12.610716386274857, + "step": 2379 + }, + { + "epoch": 0.6523228724133205, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42510741.333333336, + "logits/rejected": 18425.777777777777, + "logps/chosen": -439.92138671875, + "logps/rejected": -381.84019639756946, + "loss": 0.0485, + "rewards/chosen": 7.237536112467448, + "rewards/margins": 15.51375240749783, + "rewards/rejected": -8.276216295030382, + "step": 2380 + }, + { + "epoch": 0.6525969576538303, + "grad_norm": 9.3125, + "kl": 5.333033561706543, + "learning_rate": 5e-06, + "logits/chosen": -33383278.545454547, + "logits/rejected": -29971401.846153848, + "logps/chosen": -529.0334250710227, + "logps/rejected": -475.2678786057692, + "loss": 0.0136, + "rewards/chosen": 8.305741743607955, + "rewards/margins": 19.79770857804305, + "rewards/rejected": -11.491966834435097, + "step": 2381 + }, + { + "epoch": 0.6528710428943402, + "grad_norm": 9.6875, + "kl": 3.6118011474609375, + "learning_rate": 5e-06, + "logits/chosen": -36270010.666666664, + "logits/rejected": -22816098.666666668, + "logps/chosen": -447.77490234375, + "logps/rejected": -484.1936442057292, + "loss": 0.029, + "rewards/chosen": 6.04641850789388, + "rewards/margins": 16.085221608479817, + "rewards/rejected": -10.038803100585938, + "step": 2382 + }, + { + "epoch": 0.6531451281348499, + "grad_norm": 17.375, + "kl": 7.914766311645508, + "learning_rate": 5e-06, + "logits/chosen": -22342656.0, + "logits/rejected": -34157972.0, + "logps/chosen": -386.4498291015625, + "logps/rejected": -456.197265625, + "loss": 0.0636, + "rewards/chosen": 5.904258728027344, + "rewards/margins": 15.940570831298828, + "rewards/rejected": -10.036312103271484, + "step": 2383 + }, + { + "epoch": 0.6534192133753597, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27401958.4, + "logits/rejected": -51448872.421052635, + "logps/chosen": -400.2741943359375, + "logps/rejected": -496.0094058388158, + "loss": 0.0155, + "rewards/chosen": 6.2424766540527346, + "rewards/margins": 16.13041289480109, + "rewards/rejected": -9.887936240748354, + "step": 2384 + }, + { + "epoch": 0.6536932986158696, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36740945.777777776, + "logits/rejected": -14035246.933333334, + "logps/chosen": -362.35907660590277, + "logps/rejected": -387.929296875, + "loss": 0.0385, + "rewards/chosen": 7.046055263943142, + "rewards/margins": 15.856329515245225, + "rewards/rejected": -8.810274251302083, + "step": 2385 + }, + { + "epoch": 0.6539673838563793, + "grad_norm": 2.203125, + "kl": 5.006761074066162, + "learning_rate": 5e-06, + "logits/chosen": -25786560.0, + "logits/rejected": -33272020.363636363, + "logps/chosen": -543.6484375, + "logps/rejected": -524.2081409801136, + "loss": 0.0064, + "rewards/chosen": 9.660604623647837, + "rewards/margins": 21.894971300671983, + "rewards/rejected": -12.234366677024148, + "step": 2386 + }, + { + "epoch": 0.6542414690968892, + "grad_norm": 11.6875, + "kl": 7.444933891296387, + "learning_rate": 5e-06, + "logits/chosen": -57329527.46666667, + "logits/rejected": -23499516.444444444, + "logps/chosen": -412.46897786458334, + "logps/rejected": -672.9491102430555, + "loss": 0.0736, + "rewards/chosen": 6.31374766031901, + "rewards/margins": 17.425733608669706, + "rewards/rejected": -11.111985948350695, + "step": 2387 + }, + { + "epoch": 0.6545155543373989, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35475936.0, + "logits/rejected": -29407063.466666665, + "logps/chosen": -442.09711371527777, + "logps/rejected": -630.287109375, + "loss": 0.0301, + "rewards/chosen": 4.94252183702257, + "rewards/margins": 18.126990424262154, + "rewards/rejected": -13.184468587239584, + "step": 2388 + }, + { + "epoch": 0.6547896395779087, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7205937.0, + "logits/rejected": -22157312.0, + "logps/chosen": -449.02728271484375, + "logps/rejected": -451.1173095703125, + "loss": 0.0315, + "rewards/chosen": 7.400700569152832, + "rewards/margins": 17.390746116638184, + "rewards/rejected": -9.990045547485352, + "step": 2389 + }, + { + "epoch": 0.6550637248184186, + "grad_norm": 3.59375, + "kl": 8.980210304260254, + "learning_rate": 5e-06, + "logits/chosen": -23252118.85714286, + "logits/rejected": 12459897.6, + "logps/chosen": -494.0917271205357, + "logps/rejected": -490.17294921875, + "loss": 0.0097, + "rewards/chosen": 6.742616925920759, + "rewards/margins": 15.973977552141461, + "rewards/rejected": -9.231360626220702, + "step": 2390 + }, + { + "epoch": 0.6553378100589283, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11900219.076923076, + "logits/rejected": -60312855.27272727, + "logps/chosen": -448.23929537259613, + "logps/rejected": -532.3605291193181, + "loss": 0.0292, + "rewards/chosen": 6.800325833834135, + "rewards/margins": 19.100111101057145, + "rewards/rejected": -12.299785267223012, + "step": 2391 + }, + { + "epoch": 0.6556118952994381, + "grad_norm": 4.25, + "kl": 6.45037841796875, + "learning_rate": 5e-06, + "logits/chosen": -14901505.142857144, + "logits/rejected": -24192339.2, + "logps/chosen": -511.79788643973217, + "logps/rejected": -525.985205078125, + "loss": 0.0095, + "rewards/chosen": 6.607219151088169, + "rewards/margins": 18.422755650111608, + "rewards/rejected": -11.815536499023438, + "step": 2392 + }, + { + "epoch": 0.655885980539948, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17877380.923076924, + "logits/rejected": -34483726.54545455, + "logps/chosen": -484.39261568509613, + "logps/rejected": -502.771484375, + "loss": 0.0287, + "rewards/chosen": 7.803254934457632, + "rewards/margins": 16.704276718459763, + "rewards/rejected": -8.90102178400213, + "step": 2393 + }, + { + "epoch": 0.6561600657804577, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10191058.909090908, + "logits/rejected": -15609942.153846154, + "logps/chosen": -420.00830078125, + "logps/rejected": -484.2668269230769, + "loss": 0.071, + "rewards/chosen": 6.601380781693892, + "rewards/margins": 18.397549182384996, + "rewards/rejected": -11.796168400691105, + "step": 2394 + }, + { + "epoch": 0.6564341510209675, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27188806.4, + "logits/rejected": -22595318.85714286, + "logps/chosen": -444.5001953125, + "logps/rejected": -581.2555454799107, + "loss": 0.0073, + "rewards/chosen": 6.686713409423828, + "rewards/margins": 21.449082837785994, + "rewards/rejected": -14.762369428362165, + "step": 2395 + }, + { + "epoch": 0.6567082362614773, + "grad_norm": 48.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32880802.666666668, + "logits/rejected": -8163137.333333333, + "logps/chosen": -463.6217854817708, + "logps/rejected": -580.4785970052084, + "loss": 0.0303, + "rewards/chosen": 8.827757517496744, + "rewards/margins": 20.060492197672524, + "rewards/rejected": -11.232734680175781, + "step": 2396 + }, + { + "epoch": 0.6569823215019871, + "grad_norm": 4.15625, + "kl": 2.8998496532440186, + "learning_rate": 5e-06, + "logits/chosen": -16017709.333333334, + "logits/rejected": -29467674.666666668, + "logps/chosen": -475.038818359375, + "logps/rejected": -513.74609375, + "loss": 0.0341, + "rewards/chosen": 6.62507438659668, + "rewards/margins": 20.606045405069985, + "rewards/rejected": -13.980971018473307, + "step": 2397 + }, + { + "epoch": 0.657256406742497, + "grad_norm": 4.4375, + "kl": 0.6854079961776733, + "learning_rate": 5e-06, + "logits/chosen": -11332352.0, + "logits/rejected": -31084032.0, + "logps/chosen": -375.257568359375, + "logps/rejected": -445.96181640625, + "loss": 0.0193, + "rewards/chosen": 7.713701520647321, + "rewards/margins": 18.086128888811384, + "rewards/rejected": -10.372427368164063, + "step": 2398 + }, + { + "epoch": 0.6575304919830067, + "grad_norm": 3.6875, + "kl": 15.477231979370117, + "learning_rate": 5e-06, + "logits/chosen": -15458796.307692308, + "logits/rejected": -40096866.90909091, + "logps/chosen": -432.18201622596155, + "logps/rejected": -635.4456676136364, + "loss": 0.0478, + "rewards/chosen": 6.991032527043269, + "rewards/margins": 19.803033148492133, + "rewards/rejected": -12.812000621448863, + "step": 2399 + }, + { + "epoch": 0.6578045772235165, + "grad_norm": 1.0625, + "kl": 1.1016604900360107, + "learning_rate": 5e-06, + "logits/chosen": -30279901.333333332, + "logits/rejected": -26960880.0, + "logps/chosen": -498.2027180989583, + "logps/rejected": -525.9344075520834, + "loss": 0.0027, + "rewards/chosen": 8.305161158243815, + "rewards/margins": 18.97705014546712, + "rewards/rejected": -10.671888987223307, + "step": 2400 + }, + { + "epoch": 0.6580786624640264, + "grad_norm": 7.59375, + "kl": 0.7276992797851562, + "learning_rate": 5e-06, + "logits/chosen": -12555019.076923076, + "logits/rejected": -45627531.63636363, + "logps/chosen": -389.86429537259613, + "logps/rejected": -661.2076083096591, + "loss": 0.0546, + "rewards/chosen": 6.453988295335036, + "rewards/margins": 20.616835640860604, + "rewards/rejected": -14.162847345525568, + "step": 2401 + }, + { + "epoch": 0.6583527477045361, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33483202.46153846, + "logits/rejected": -27063924.363636363, + "logps/chosen": -372.24673227163464, + "logps/rejected": -778.8721590909091, + "loss": 0.0097, + "rewards/chosen": 5.363064105694111, + "rewards/margins": 17.68224190665292, + "rewards/rejected": -12.319177800958807, + "step": 2402 + }, + { + "epoch": 0.6586268329450459, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24763034.666666668, + "logits/rejected": -4209767.333333333, + "logps/chosen": -484.9635823567708, + "logps/rejected": -460.5018717447917, + "loss": 0.0174, + "rewards/chosen": 6.251057942708333, + "rewards/margins": 16.118555704752605, + "rewards/rejected": -9.867497762044271, + "step": 2403 + }, + { + "epoch": 0.6589009181855557, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36848508.8, + "logits/rejected": -41782665.14285714, + "logps/chosen": -351.828564453125, + "logps/rejected": -503.7963169642857, + "loss": 0.017, + "rewards/chosen": 6.394371414184571, + "rewards/margins": 18.9405216217041, + "rewards/rejected": -12.546150207519531, + "step": 2404 + }, + { + "epoch": 0.6591750034260655, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53193314.90909091, + "logits/rejected": -37988913.23076923, + "logps/chosen": -412.06196732954544, + "logps/rejected": -457.06971153846155, + "loss": 0.0082, + "rewards/chosen": 5.686680880459872, + "rewards/margins": 15.292544011469488, + "rewards/rejected": -9.605863131009615, + "step": 2405 + }, + { + "epoch": 0.6594490886665753, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4052252.6666666665, + "logits/rejected": -26612074.666666668, + "logps/chosen": -491.6318766276042, + "logps/rejected": -723.4607747395834, + "loss": 0.0051, + "rewards/chosen": 6.503957748413086, + "rewards/margins": 22.661558151245117, + "rewards/rejected": -16.15760040283203, + "step": 2406 + }, + { + "epoch": 0.6597231739070851, + "grad_norm": 8.4375, + "kl": 9.3551025390625, + "learning_rate": 5e-06, + "logits/chosen": -43835799.27272727, + "logits/rejected": -27036580.923076924, + "logps/chosen": -414.65749289772725, + "logps/rejected": -509.48670372596155, + "loss": 0.0906, + "rewards/chosen": 6.626642400568182, + "rewards/margins": 17.130858521361453, + "rewards/rejected": -10.50421612079327, + "step": 2407 + }, + { + "epoch": 0.6599972591475949, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30879546.666666668, + "logits/rejected": -54589546.666666664, + "logps/chosen": -317.24855550130206, + "logps/rejected": -634.1912841796875, + "loss": 0.0187, + "rewards/chosen": 4.749409993489583, + "rewards/margins": 21.851236979166664, + "rewards/rejected": -17.101826985677082, + "step": 2408 + }, + { + "epoch": 0.6602713443881048, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21216740.266666666, + "logits/rejected": -29947260.444444444, + "logps/chosen": -439.45625, + "logps/rejected": -511.1832682291667, + "loss": 0.0646, + "rewards/chosen": 7.129347229003907, + "rewards/margins": 21.19706743028429, + "rewards/rejected": -14.067720201280382, + "step": 2409 + }, + { + "epoch": 0.6605454296286145, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41000064.0, + "logits/rejected": -46000439.46666667, + "logps/chosen": -433.16924370659723, + "logps/rejected": -567.2109375, + "loss": 0.0131, + "rewards/chosen": 7.1161693996853295, + "rewards/margins": 20.550688849555122, + "rewards/rejected": -13.434519449869791, + "step": 2410 + }, + { + "epoch": 0.6608195148691243, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40523477.333333336, + "logits/rejected": -25257578.666666668, + "logps/chosen": -405.4505859375, + "logps/rejected": -373.28754340277777, + "loss": 0.0466, + "rewards/chosen": 6.730853271484375, + "rewards/margins": 14.458789910210504, + "rewards/rejected": -7.727936638726129, + "step": 2411 + }, + { + "epoch": 0.6610936001096341, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31989713.454545453, + "logits/rejected": -13843849.846153846, + "logps/chosen": -465.60986328125, + "logps/rejected": -536.1025390625, + "loss": 0.0115, + "rewards/chosen": 7.989095514470881, + "rewards/margins": 19.767149385038792, + "rewards/rejected": -11.77805387056791, + "step": 2412 + }, + { + "epoch": 0.6613676853501439, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35921400.0, + "logits/rejected": -23393309.333333332, + "logps/chosen": -504.9838053385417, + "logps/rejected": -468.97607421875, + "loss": 0.011, + "rewards/chosen": 6.9152374267578125, + "rewards/margins": 17.002888361612953, + "rewards/rejected": -10.087650934855143, + "step": 2413 + }, + { + "epoch": 0.6616417705906537, + "grad_norm": 9.5625, + "kl": 1.3121846914291382, + "learning_rate": 5e-06, + "logits/chosen": -28607941.333333332, + "logits/rejected": -22750205.333333332, + "logps/chosen": -453.4391276041667, + "logps/rejected": -512.5880533854166, + "loss": 0.0298, + "rewards/chosen": 7.134044011433919, + "rewards/margins": 18.698517481486004, + "rewards/rejected": -11.564473470052084, + "step": 2414 + }, + { + "epoch": 0.6619158558311635, + "grad_norm": 3.421875, + "kl": 2.3284378051757812, + "learning_rate": 5e-06, + "logits/chosen": -28608093.333333332, + "logits/rejected": -27065194.666666668, + "logps/chosen": -475.6542561848958, + "logps/rejected": -525.0315755208334, + "loss": 0.0124, + "rewards/chosen": 8.211662928263346, + "rewards/margins": 18.007545471191406, + "rewards/rejected": -9.79588254292806, + "step": 2415 + }, + { + "epoch": 0.6621899410716733, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24480729.6, + "logits/rejected": -6326390.285714285, + "logps/chosen": -441.609912109375, + "logps/rejected": -530.7398856026786, + "loss": 0.0091, + "rewards/chosen": 5.628167724609375, + "rewards/margins": 18.07066933768136, + "rewards/rejected": -12.442501613071986, + "step": 2416 + }, + { + "epoch": 0.662464026312183, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27671616.0, + "logits/rejected": -30386965.333333332, + "logps/chosen": -366.8681640625, + "logps/rejected": -617.51953125, + "loss": 0.0509, + "rewards/chosen": 5.819865926106771, + "rewards/margins": 16.955013020833334, + "rewards/rejected": -11.135147094726562, + "step": 2417 + }, + { + "epoch": 0.6627381115526929, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38743923.2, + "logits/rejected": -28263888.0, + "logps/chosen": -526.79248046875, + "logps/rejected": -470.38364955357144, + "loss": 0.039, + "rewards/chosen": 8.05074462890625, + "rewards/margins": 19.817538016183036, + "rewards/rejected": -11.766793387276786, + "step": 2418 + }, + { + "epoch": 0.6630121967932027, + "grad_norm": 0.60546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25941980.444444444, + "logits/rejected": 34755541.333333336, + "logps/chosen": -557.8088650173611, + "logps/rejected": -457.39716796875, + "loss": 0.0016, + "rewards/chosen": 9.447589450412327, + "rewards/margins": 24.955736626519098, + "rewards/rejected": -15.508147176106771, + "step": 2419 + }, + { + "epoch": 0.6632862820337125, + "grad_norm": 6.3125, + "kl": 12.99555492401123, + "learning_rate": 5e-06, + "logits/chosen": -14303483.076923076, + "logits/rejected": 66952768.0, + "logps/chosen": -529.2439903846154, + "logps/rejected": -575.1284623579545, + "loss": 0.0215, + "rewards/chosen": 7.952676626352163, + "rewards/margins": 19.38336565777972, + "rewards/rejected": -11.430689031427557, + "step": 2420 + }, + { + "epoch": 0.6635603672742223, + "grad_norm": 11.125, + "kl": 0.5585874319076538, + "learning_rate": 5e-06, + "logits/chosen": -10150430.933333334, + "logits/rejected": -27074192.0, + "logps/chosen": -455.14537760416664, + "logps/rejected": -623.2037217881945, + "loss": 0.1063, + "rewards/chosen": 6.732235717773437, + "rewards/margins": 15.764168124728734, + "rewards/rejected": -9.031932406955296, + "step": 2421 + }, + { + "epoch": 0.663834452514732, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34213769.14285714, + "logits/rejected": -37607158.4, + "logps/chosen": -418.547607421875, + "logps/rejected": -525.48095703125, + "loss": 0.0185, + "rewards/chosen": 6.464845929827009, + "rewards/margins": 18.168671308244978, + "rewards/rejected": -11.703825378417969, + "step": 2422 + }, + { + "epoch": 0.6641085377552419, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36531002.18181818, + "logits/rejected": -27939062.153846152, + "logps/chosen": -345.28664328835225, + "logps/rejected": -512.1439302884615, + "loss": 0.0282, + "rewards/chosen": 5.8157525496049365, + "rewards/margins": 16.88191583273294, + "rewards/rejected": -11.066163283128004, + "step": 2423 + }, + { + "epoch": 0.6643826229957517, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6713208.0, + "logits/rejected": -27485705.14285714, + "logps/chosen": -402.9030029296875, + "logps/rejected": -506.90980747767856, + "loss": 0.0514, + "rewards/chosen": 6.399425506591797, + "rewards/margins": 19.274695805140905, + "rewards/rejected": -12.875270298549108, + "step": 2424 + }, + { + "epoch": 0.6646567082362614, + "grad_norm": 2.484375, + "kl": 9.67886734008789, + "learning_rate": 5e-06, + "logits/chosen": -24139744.0, + "logits/rejected": -36974150.4, + "logps/chosen": -396.26639229910717, + "logps/rejected": -553.278466796875, + "loss": 0.0116, + "rewards/chosen": 7.726963588169643, + "rewards/margins": 16.75570504324777, + "rewards/rejected": -9.028741455078125, + "step": 2425 + }, + { + "epoch": 0.6649307934767713, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39336352.0, + "logits/rejected": -23593234.0, + "logps/chosen": -320.43487548828125, + "logps/rejected": -461.9276123046875, + "loss": 0.0701, + "rewards/chosen": 5.0041351318359375, + "rewards/margins": 12.810271263122559, + "rewards/rejected": -7.806136131286621, + "step": 2426 + }, + { + "epoch": 0.6652048787172811, + "grad_norm": 8.125, + "kl": 3.3424792289733887, + "learning_rate": 5e-06, + "logits/chosen": -28365740.0, + "logits/rejected": -8470219.0, + "logps/chosen": -307.9923095703125, + "logps/rejected": -649.0552978515625, + "loss": 0.0763, + "rewards/chosen": 5.968907833099365, + "rewards/margins": 18.238824367523193, + "rewards/rejected": -12.269916534423828, + "step": 2427 + }, + { + "epoch": 0.6654789639577908, + "grad_norm": 5.15625, + "kl": 6.906120300292969, + "learning_rate": 5e-06, + "logits/chosen": -20216939.2, + "logits/rejected": -7835445.714285715, + "logps/chosen": -446.1888671875, + "logps/rejected": -691.05126953125, + "loss": 0.0222, + "rewards/chosen": 8.150788116455079, + "rewards/margins": 20.483016640799384, + "rewards/rejected": -12.332228524344307, + "step": 2428 + }, + { + "epoch": 0.6657530491983007, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10966867.555555556, + "logits/rejected": -2719804.2666666666, + "logps/chosen": -353.6735568576389, + "logps/rejected": -548.8755859375, + "loss": 0.024, + "rewards/chosen": 6.201808929443359, + "rewards/margins": 17.404427337646485, + "rewards/rejected": -11.202618408203126, + "step": 2429 + }, + { + "epoch": 0.6660271344388105, + "grad_norm": 1.421875, + "kl": 5.284012317657471, + "learning_rate": 5e-06, + "logits/chosen": -32661897.846153848, + "logits/rejected": -28934941.09090909, + "logps/chosen": -527.1168870192307, + "logps/rejected": -519.2955433238636, + "loss": 0.0034, + "rewards/chosen": 9.362674419696514, + "rewards/margins": 19.103663411173788, + "rewards/rejected": -9.740988991477273, + "step": 2430 + }, + { + "epoch": 0.6663012196793203, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13292042.0, + "logits/rejected": -27315458.0, + "logps/chosen": -498.2758483886719, + "logps/rejected": -437.8033142089844, + "loss": 0.0036, + "rewards/chosen": 8.763221740722656, + "rewards/margins": 18.745359420776367, + "rewards/rejected": -9.982137680053711, + "step": 2431 + }, + { + "epoch": 0.6665753049198301, + "grad_norm": 4.65625, + "kl": 2.552082061767578, + "learning_rate": 5e-06, + "logits/chosen": -29711896.888888888, + "logits/rejected": -38811264.0, + "logps/chosen": -374.81035698784723, + "logps/rejected": -548.357421875, + "loss": 0.0141, + "rewards/chosen": 6.820089128282335, + "rewards/margins": 17.19281455145942, + "rewards/rejected": -10.372725423177084, + "step": 2432 + }, + { + "epoch": 0.6668493901603398, + "grad_norm": 3.671875, + "kl": 5.1908674240112305, + "learning_rate": 5e-06, + "logits/chosen": -45172277.333333336, + "logits/rejected": -39003944.0, + "logps/chosen": -541.5013020833334, + "logps/rejected": -427.4015299479167, + "loss": 0.0132, + "rewards/chosen": 8.41897964477539, + "rewards/margins": 16.02618408203125, + "rewards/rejected": -7.607204437255859, + "step": 2433 + }, + { + "epoch": 0.6671234754008497, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10349610.0, + "logits/rejected": -32807157.333333332, + "logps/chosen": -455.23193359375, + "logps/rejected": -619.2906494140625, + "loss": 0.0169, + "rewards/chosen": 7.6279551188151045, + "rewards/margins": 17.516155242919922, + "rewards/rejected": -9.888200124104818, + "step": 2434 + }, + { + "epoch": 0.6673975606413595, + "grad_norm": 1.828125, + "kl": 2.373732328414917, + "learning_rate": 5e-06, + "logits/chosen": -18888888.0, + "logits/rejected": -13402053.333333334, + "logps/chosen": -451.1107584635417, + "logps/rejected": -445.8369140625, + "loss": 0.0056, + "rewards/chosen": 8.347919464111328, + "rewards/margins": 17.562217712402344, + "rewards/rejected": -9.214298248291016, + "step": 2435 + }, + { + "epoch": 0.6676716458818692, + "grad_norm": 3.5, + "kl": 11.64498519897461, + "learning_rate": 5e-06, + "logits/chosen": -26763374.933333334, + "logits/rejected": -19788156.444444444, + "logps/chosen": -422.1841796875, + "logps/rejected": -553.8721788194445, + "loss": 0.012, + "rewards/chosen": 7.300200398763021, + "rewards/margins": 19.150803290473092, + "rewards/rejected": -11.85060289171007, + "step": 2436 + }, + { + "epoch": 0.6679457311223791, + "grad_norm": 2.8125, + "kl": 4.028592586517334, + "learning_rate": 5e-06, + "logits/chosen": -30991374.222222224, + "logits/rejected": -27913273.6, + "logps/chosen": -450.3276638454861, + "logps/rejected": -568.1482421875, + "loss": 0.0084, + "rewards/chosen": 7.003792656792535, + "rewards/margins": 18.624678887261286, + "rewards/rejected": -11.62088623046875, + "step": 2437 + }, + { + "epoch": 0.6682198163628889, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25030472.0, + "logits/rejected": -7144409.333333333, + "logps/chosen": -391.3216959635417, + "logps/rejected": -581.8849690755209, + "loss": 0.0261, + "rewards/chosen": 6.685868581136067, + "rewards/margins": 19.627613067626953, + "rewards/rejected": -12.941744486490885, + "step": 2438 + }, + { + "epoch": 0.6684939016033986, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30735264.0, + "logits/rejected": -35872182.85714286, + "logps/chosen": -395.3958984375, + "logps/rejected": -537.9934430803571, + "loss": 0.0085, + "rewards/chosen": 6.887783050537109, + "rewards/margins": 18.066051483154297, + "rewards/rejected": -11.178268432617188, + "step": 2439 + }, + { + "epoch": 0.6687679868439085, + "grad_norm": 11.4375, + "kl": 0.3797881007194519, + "learning_rate": 5e-06, + "logits/chosen": -42553677.71428572, + "logits/rejected": -24851838.4, + "logps/chosen": -417.0825892857143, + "logps/rejected": -562.3650390625, + "loss": 0.0405, + "rewards/chosen": 6.313555036272321, + "rewards/margins": 18.391664777483257, + "rewards/rejected": -12.078109741210938, + "step": 2440 + }, + { + "epoch": 0.6690420720844182, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16453192.888888888, + "logits/rejected": -10203362.133333333, + "logps/chosen": -389.6321614583333, + "logps/rejected": -394.8339518229167, + "loss": 0.0157, + "rewards/chosen": 8.127564324273003, + "rewards/margins": 16.736971876356336, + "rewards/rejected": -8.609407552083333, + "step": 2441 + }, + { + "epoch": 0.6693161573249281, + "grad_norm": 15.125, + "kl": 6.482230186462402, + "learning_rate": 5e-06, + "logits/chosen": -29383805.53846154, + "logits/rejected": -14894237.090909092, + "logps/chosen": -360.9056865985577, + "logps/rejected": -534.3692294034091, + "loss": 0.1041, + "rewards/chosen": 5.593744718111479, + "rewards/margins": 16.33415387560438, + "rewards/rejected": -10.740409157492898, + "step": 2442 + }, + { + "epoch": 0.6695902425654379, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18012421.333333332, + "logits/rejected": -23367109.333333332, + "logps/chosen": -442.97705078125, + "logps/rejected": -516.8039143880209, + "loss": 0.0455, + "rewards/chosen": 5.911112467447917, + "rewards/margins": 16.10337766011556, + "rewards/rejected": -10.192265192667643, + "step": 2443 + }, + { + "epoch": 0.6698643278059476, + "grad_norm": 12.875, + "kl": 9.08342456817627, + "learning_rate": 5e-06, + "logits/chosen": -19802940.23529412, + "logits/rejected": -30278546.285714287, + "logps/chosen": -488.04176240808823, + "logps/rejected": -896.6991489955357, + "loss": 0.08, + "rewards/chosen": 7.1038822847254135, + "rewards/margins": 26.30641655160599, + "rewards/rejected": -19.20253426688058, + "step": 2444 + }, + { + "epoch": 0.6701384130464575, + "grad_norm": 9.0, + "kl": 9.147181510925293, + "learning_rate": 5e-06, + "logits/chosen": -27962536.533333335, + "logits/rejected": -22520519.111111112, + "logps/chosen": -370.36858723958335, + "logps/rejected": -500.2429470486111, + "loss": 0.0903, + "rewards/chosen": 6.742433675130209, + "rewards/margins": 17.562698703342015, + "rewards/rejected": -10.820265028211805, + "step": 2445 + }, + { + "epoch": 0.6704124982869673, + "grad_norm": 7.1875, + "kl": 3.324281692504883, + "learning_rate": 5e-06, + "logits/chosen": -20208519.384615384, + "logits/rejected": -26620640.0, + "logps/chosen": -390.6031024639423, + "logps/rejected": -503.6510564630682, + "loss": 0.0336, + "rewards/chosen": 5.936251126802885, + "rewards/margins": 16.499714511257785, + "rewards/rejected": -10.5634633844549, + "step": 2446 + }, + { + "epoch": 0.670686583527477, + "grad_norm": 6.40625, + "kl": 7.816205024719238, + "learning_rate": 5e-06, + "logits/chosen": -11129080.615384616, + "logits/rejected": -16647149.090909092, + "logps/chosen": -328.71176382211536, + "logps/rejected": -634.1387606534091, + "loss": 0.0176, + "rewards/chosen": 6.650658827561599, + "rewards/margins": 19.331700198300236, + "rewards/rejected": -12.681041370738637, + "step": 2447 + }, + { + "epoch": 0.6709606687679869, + "grad_norm": 6.875, + "kl": 5.784738063812256, + "learning_rate": 5e-06, + "logits/chosen": -19481364.0, + "logits/rejected": -50560277.333333336, + "logps/chosen": -474.8658040364583, + "logps/rejected": -679.3194173177084, + "loss": 0.0277, + "rewards/chosen": 7.642716725667317, + "rewards/margins": 20.029149373372395, + "rewards/rejected": -12.386432647705078, + "step": 2448 + }, + { + "epoch": 0.6712347540084966, + "grad_norm": 8.4375, + "kl": 2.1080451011657715, + "learning_rate": 5e-06, + "logits/chosen": -34310058.666666664, + "logits/rejected": -28920833.777777776, + "logps/chosen": -437.22063802083335, + "logps/rejected": -406.52978515625, + "loss": 0.0281, + "rewards/chosen": 6.4876246134440105, + "rewards/margins": 16.332397800021702, + "rewards/rejected": -9.84477318657769, + "step": 2449 + }, + { + "epoch": 0.6715088392490064, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38942919.11111111, + "logits/rejected": -20484309.333333332, + "logps/chosen": -452.68638780381946, + "logps/rejected": -620.1588541666666, + "loss": 0.0087, + "rewards/chosen": 5.733968946668837, + "rewards/margins": 18.068001132541234, + "rewards/rejected": -12.334032185872395, + "step": 2450 + }, + { + "epoch": 0.6717829244895163, + "grad_norm": 2.59375, + "kl": 5.378354549407959, + "learning_rate": 5e-06, + "logits/chosen": -17871867.42857143, + "logits/rejected": -29336947.2, + "logps/chosen": -373.27755301339283, + "logps/rejected": -496.44013671875, + "loss": 0.0475, + "rewards/chosen": 7.431656973702567, + "rewards/margins": 17.649576895577567, + "rewards/rejected": -10.217919921875, + "step": 2451 + }, + { + "epoch": 0.672057009730026, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36734914.90909091, + "logits/rejected": -26823751.384615384, + "logps/chosen": -398.36110617897725, + "logps/rejected": -697.8608774038462, + "loss": 0.028, + "rewards/chosen": 5.32981352372603, + "rewards/margins": 21.401919518317378, + "rewards/rejected": -16.072105994591347, + "step": 2452 + }, + { + "epoch": 0.6723310949705359, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17210340.363636363, + "logits/rejected": -21789302.153846152, + "logps/chosen": -294.05934836647725, + "logps/rejected": -607.7851186899038, + "loss": 0.0714, + "rewards/chosen": 5.056086453524503, + "rewards/margins": 15.535690147559961, + "rewards/rejected": -10.479603694035458, + "step": 2453 + }, + { + "epoch": 0.6726051802110457, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39410083.55555555, + "logits/rejected": -22392507.733333334, + "logps/chosen": -402.25792100694446, + "logps/rejected": -493.43382161458334, + "loss": 0.0229, + "rewards/chosen": 5.670057084825304, + "rewards/margins": 16.716011471218533, + "rewards/rejected": -11.045954386393229, + "step": 2454 + }, + { + "epoch": 0.6728792654515554, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27483476.363636363, + "logits/rejected": -7371003.076923077, + "logps/chosen": -384.60280539772725, + "logps/rejected": -514.1988055889423, + "loss": 0.022, + "rewards/chosen": 6.814059170809659, + "rewards/margins": 19.16994342937336, + "rewards/rejected": -12.355884258563702, + "step": 2455 + }, + { + "epoch": 0.6731533506920653, + "grad_norm": 5.8125, + "kl": 7.227742671966553, + "learning_rate": 5e-06, + "logits/chosen": -24956541.53846154, + "logits/rejected": -28446568.727272727, + "logps/chosen": -518.7442908653846, + "logps/rejected": -442.21071555397725, + "loss": 0.0252, + "rewards/chosen": 8.881983243502104, + "rewards/margins": 16.987677407431434, + "rewards/rejected": -8.105694163929332, + "step": 2456 + }, + { + "epoch": 0.673427435932575, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10165420.0, + "logits/rejected": -18593204.0, + "logps/chosen": -456.1442565917969, + "logps/rejected": -549.6728515625, + "loss": 0.0076, + "rewards/chosen": 8.052903175354004, + "rewards/margins": 20.137064933776855, + "rewards/rejected": -12.084161758422852, + "step": 2457 + }, + { + "epoch": 0.6737015211730848, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51422261.333333336, + "logits/rejected": -30190963.2, + "logps/chosen": -367.61073133680554, + "logps/rejected": -717.11328125, + "loss": 0.018, + "rewards/chosen": 5.210396660698785, + "rewards/margins": 16.625537448459202, + "rewards/rejected": -11.415140787760416, + "step": 2458 + }, + { + "epoch": 0.6739756064135947, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33817291.428571425, + "logits/rejected": -9266230.588235294, + "logps/chosen": -542.5466657366071, + "logps/rejected": -635.5035615808823, + "loss": 0.0165, + "rewards/chosen": 7.278719765799386, + "rewards/margins": 21.50588453917944, + "rewards/rejected": -14.227164773380055, + "step": 2459 + }, + { + "epoch": 0.6742496916541044, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35667786.666666664, + "logits/rejected": -25526617.6, + "logps/chosen": -549.2916666666666, + "logps/rejected": -613.251171875, + "loss": 0.007, + "rewards/chosen": 8.584058973524305, + "rewards/margins": 21.2261467827691, + "rewards/rejected": -12.642087809244792, + "step": 2460 + }, + { + "epoch": 0.6745237768946142, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22235209.846153848, + "logits/rejected": -32117934.545454547, + "logps/chosen": -411.96420522836536, + "logps/rejected": -619.0930841619319, + "loss": 0.0282, + "rewards/chosen": 5.317495492788462, + "rewards/margins": 16.19140032788257, + "rewards/rejected": -10.873904835094105, + "step": 2461 + }, + { + "epoch": 0.674797862135124, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35945085.09090909, + "logits/rejected": -23873176.615384616, + "logps/chosen": -466.52592329545456, + "logps/rejected": -447.92247596153845, + "loss": 0.0148, + "rewards/chosen": 7.2735137939453125, + "rewards/margins": 19.72777029184195, + "rewards/rejected": -12.454256497896635, + "step": 2462 + }, + { + "epoch": 0.6750719473756338, + "grad_norm": 3.65625, + "kl": 3.3184618949890137, + "learning_rate": 5e-06, + "logits/chosen": -834090.8571428572, + "logits/rejected": -8583200.0, + "logps/chosen": -433.14990234375, + "logps/rejected": -595.12685546875, + "loss": 0.0382, + "rewards/chosen": 7.247400556291852, + "rewards/margins": 18.715421186174666, + "rewards/rejected": -11.468020629882812, + "step": 2463 + }, + { + "epoch": 0.6753460326161436, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10677200.0, + "logits/rejected": -20804859.076923076, + "logps/chosen": -371.2626287286932, + "logps/rejected": -519.8669621394231, + "loss": 0.0435, + "rewards/chosen": 5.25619853626598, + "rewards/margins": 15.97183574329723, + "rewards/rejected": -10.71563720703125, + "step": 2464 + }, + { + "epoch": 0.6756201178566534, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11472317.333333334, + "logits/rejected": -29607866.666666668, + "logps/chosen": -360.7281087239583, + "logps/rejected": -679.7373453776041, + "loss": 0.0831, + "rewards/chosen": 4.529615084330241, + "rewards/margins": 18.846517244974773, + "rewards/rejected": -14.316902160644531, + "step": 2465 + }, + { + "epoch": 0.6758942030971632, + "grad_norm": 3.9375, + "kl": 9.044803619384766, + "learning_rate": 5e-06, + "logits/chosen": -45921875.692307696, + "logits/rejected": -38804826.18181818, + "logps/chosen": -451.25668569711536, + "logps/rejected": -397.15114524147725, + "loss": 0.0613, + "rewards/chosen": 6.9332134540264425, + "rewards/margins": 18.222357796622322, + "rewards/rejected": -11.28914434259588, + "step": 2466 + }, + { + "epoch": 0.6761682883376731, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5022655.0, + "logits/rejected": -15707877.0, + "logps/chosen": -488.4986572265625, + "logps/rejected": -576.91064453125, + "loss": 0.0036, + "rewards/chosen": 9.046950340270996, + "rewards/margins": 21.57048511505127, + "rewards/rejected": -12.523534774780273, + "step": 2467 + }, + { + "epoch": 0.6764423735781828, + "grad_norm": 3.859375, + "kl": 2.8339128494262695, + "learning_rate": 5e-06, + "logits/chosen": -32254746.666666668, + "logits/rejected": -29964752.0, + "logps/chosen": -455.1371663411458, + "logps/rejected": -652.0533854166666, + "loss": 0.0265, + "rewards/chosen": 7.496644337972005, + "rewards/margins": 20.528764088948567, + "rewards/rejected": -13.032119750976562, + "step": 2468 + }, + { + "epoch": 0.6767164588186926, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26274360.0, + "logits/rejected": 16454369.333333334, + "logps/chosen": -444.1380208333333, + "logps/rejected": -805.1922200520834, + "loss": 0.035, + "rewards/chosen": 4.983654975891113, + "rewards/margins": 22.299216906229656, + "rewards/rejected": -17.315561930338543, + "step": 2469 + }, + { + "epoch": 0.6769905440592024, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35645046.4, + "logits/rejected": -12345344.0, + "logps/chosen": -394.7698974609375, + "logps/rejected": -542.2950265066964, + "loss": 0.0079, + "rewards/chosen": 7.107428741455078, + "rewards/margins": 17.625367627825057, + "rewards/rejected": -10.517938886369977, + "step": 2470 + }, + { + "epoch": 0.6772646292997122, + "grad_norm": 9.6875, + "kl": 4.847840785980225, + "learning_rate": 5e-06, + "logits/chosen": -32144896.0, + "logits/rejected": 5853186.222222222, + "logps/chosen": -462.6931640625, + "logps/rejected": -602.3211263020834, + "loss": 0.0313, + "rewards/chosen": 7.398907979329427, + "rewards/margins": 15.989189147949219, + "rewards/rejected": -8.590281168619791, + "step": 2471 + }, + { + "epoch": 0.677538714540222, + "grad_norm": 8.0625, + "kl": 3.2624309062957764, + "learning_rate": 5e-06, + "logits/chosen": -17962261.333333332, + "logits/rejected": -39420789.333333336, + "logps/chosen": -354.3209635416667, + "logps/rejected": -514.4615885416666, + "loss": 0.0334, + "rewards/chosen": 6.125372568766276, + "rewards/margins": 16.653663635253906, + "rewards/rejected": -10.52829106648763, + "step": 2472 + }, + { + "epoch": 0.6778127997807318, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12580117.333333334, + "logits/rejected": -41597232.0, + "logps/chosen": -547.4672444661459, + "logps/rejected": -482.82861328125, + "loss": 0.011, + "rewards/chosen": 8.604929606119791, + "rewards/margins": 20.18818918863932, + "rewards/rejected": -11.583259582519531, + "step": 2473 + }, + { + "epoch": 0.6780868850212416, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36799200.0, + "logits/rejected": -40748740.571428575, + "logps/chosen": -351.108837890625, + "logps/rejected": -414.77029854910717, + "loss": 0.0377, + "rewards/chosen": 7.19937744140625, + "rewards/margins": 18.75988071986607, + "rewards/rejected": -11.560503278459821, + "step": 2474 + }, + { + "epoch": 0.6783609702617514, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13853993.333333334, + "logits/rejected": -16096434.666666666, + "logps/chosen": -560.8734130859375, + "logps/rejected": -444.9517415364583, + "loss": 0.0062, + "rewards/chosen": 7.559764862060547, + "rewards/margins": 17.73833338419596, + "rewards/rejected": -10.178568522135416, + "step": 2475 + }, + { + "epoch": 0.6786350555022612, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23963560.0, + "logits/rejected": 13473074.666666666, + "logps/chosen": -457.7668050130208, + "logps/rejected": -584.3710123697916, + "loss": 0.0158, + "rewards/chosen": 8.232007344563803, + "rewards/margins": 21.124104817708336, + "rewards/rejected": -12.892097473144531, + "step": 2476 + }, + { + "epoch": 0.678909140742771, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35286374.4, + "logits/rejected": -33950349.71428572, + "logps/chosen": -423.826171875, + "logps/rejected": -431.23207310267856, + "loss": 0.0709, + "rewards/chosen": 5.163102722167968, + "rewards/margins": 13.891837419782366, + "rewards/rejected": -8.728734697614398, + "step": 2477 + }, + { + "epoch": 0.6791832259832808, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14430392.0, + "logits/rejected": -19045152.0, + "logps/chosen": -490.307177734375, + "logps/rejected": -551.1880580357143, + "loss": 0.0069, + "rewards/chosen": 8.271860504150391, + "rewards/margins": 20.81591044834682, + "rewards/rejected": -12.544049944196429, + "step": 2478 + }, + { + "epoch": 0.6794573112237906, + "grad_norm": 4.875, + "kl": 8.037883758544922, + "learning_rate": 5e-06, + "logits/chosen": -23595176.0, + "logits/rejected": -9860824.0, + "logps/chosen": -407.82623291015625, + "logps/rejected": -424.17822265625, + "loss": 0.0185, + "rewards/chosen": 6.95111608505249, + "rewards/margins": 16.03332281112671, + "rewards/rejected": -9.082206726074219, + "step": 2479 + }, + { + "epoch": 0.6797313964643004, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50930205.09090909, + "logits/rejected": -16309505.23076923, + "logps/chosen": -369.40793678977275, + "logps/rejected": -564.1311974158654, + "loss": 0.0523, + "rewards/chosen": 5.441558144309304, + "rewards/margins": 19.512330755487188, + "rewards/rejected": -14.070772611177885, + "step": 2480 + }, + { + "epoch": 0.6800054817048102, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17220451.2, + "logits/rejected": -24456438.85714286, + "logps/chosen": -427.144287109375, + "logps/rejected": -517.3304268973214, + "loss": 0.012, + "rewards/chosen": 8.291046905517579, + "rewards/margins": 20.022240447998048, + "rewards/rejected": -11.731193542480469, + "step": 2481 + }, + { + "epoch": 0.68027956694532, + "grad_norm": 4.84375, + "kl": 0.39296597242355347, + "learning_rate": 5e-06, + "logits/chosen": -13926734.4, + "logits/rejected": -23753101.714285713, + "logps/chosen": -454.4521484375, + "logps/rejected": -590.7276785714286, + "loss": 0.009, + "rewards/chosen": 7.731578826904297, + "rewards/margins": 20.936432429722377, + "rewards/rejected": -13.20485360281808, + "step": 2482 + }, + { + "epoch": 0.6805536521858297, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40112923.428571425, + "logits/rejected": -20160334.4, + "logps/chosen": -344.54652622767856, + "logps/rejected": -564.509619140625, + "loss": 0.0084, + "rewards/chosen": 6.047070639474051, + "rewards/margins": 20.638038962227956, + "rewards/rejected": -14.590968322753906, + "step": 2483 + }, + { + "epoch": 0.6808277374263396, + "grad_norm": 13.9375, + "kl": 0.09941991418600082, + "learning_rate": 5e-06, + "logits/chosen": -34471719.384615384, + "logits/rejected": -17569745.454545453, + "logps/chosen": -463.3909254807692, + "logps/rejected": -698.4275124289773, + "loss": 0.0619, + "rewards/chosen": 5.8241436298076925, + "rewards/margins": 18.47656089942772, + "rewards/rejected": -12.65241726962003, + "step": 2484 + }, + { + "epoch": 0.6811018226668494, + "grad_norm": 13.375, + "kl": 0.14623260498046875, + "learning_rate": 5e-06, + "logits/chosen": -31177452.307692308, + "logits/rejected": -3040746.5454545454, + "logps/chosen": -385.49947415865387, + "logps/rejected": -740.3477894176136, + "loss": 0.0355, + "rewards/chosen": 5.465830876277043, + "rewards/margins": 24.444703002076047, + "rewards/rejected": -18.978872125799004, + "step": 2485 + }, + { + "epoch": 0.6813759079073591, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36745210.666666664, + "logits/rejected": -20886600.0, + "logps/chosen": -421.7549641927083, + "logps/rejected": -675.4735921223959, + "loss": 0.0598, + "rewards/chosen": 5.400363922119141, + "rewards/margins": 17.97822697957357, + "rewards/rejected": -12.577863057454428, + "step": 2486 + }, + { + "epoch": 0.681649993147869, + "grad_norm": 4.1875, + "kl": 4.125528335571289, + "learning_rate": 5e-06, + "logits/chosen": -30808214.85714286, + "logits/rejected": 1344288.4, + "logps/chosen": -368.78257533482144, + "logps/rejected": -621.300830078125, + "loss": 0.057, + "rewards/chosen": 5.0691359383719305, + "rewards/margins": 17.897938428606306, + "rewards/rejected": -12.828802490234375, + "step": 2487 + }, + { + "epoch": 0.6819240783883788, + "grad_norm": 2.03125, + "kl": 0.24519602954387665, + "learning_rate": 5e-06, + "logits/chosen": -33372544.0, + "logits/rejected": -4160393.4545454546, + "logps/chosen": -438.48159555288464, + "logps/rejected": -514.9480646306819, + "loss": 0.0052, + "rewards/chosen": 7.659139779897837, + "rewards/margins": 19.209113701240167, + "rewards/rejected": -11.54997392134233, + "step": 2488 + }, + { + "epoch": 0.6821981636288886, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6604129.714285715, + "logits/rejected": -29971237.647058822, + "logps/chosen": -548.2587890625, + "logps/rejected": -506.4970128676471, + "loss": 0.0186, + "rewards/chosen": 6.886116027832031, + "rewards/margins": 18.29087246165556, + "rewards/rejected": -11.404756433823529, + "step": 2489 + }, + { + "epoch": 0.6824722488693984, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36814960.0, + "logits/rejected": -28480266.0, + "logps/chosen": -354.0772705078125, + "logps/rejected": -647.1279296875, + "loss": 0.0342, + "rewards/chosen": 4.62580680847168, + "rewards/margins": 16.723648071289062, + "rewards/rejected": -12.097841262817383, + "step": 2490 + }, + { + "epoch": 0.6827463341099081, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52669603.55555555, + "logits/rejected": -38107524.266666666, + "logps/chosen": -558.7917209201389, + "logps/rejected": -512.5982421875, + "loss": 0.0116, + "rewards/chosen": 7.454132080078125, + "rewards/margins": 20.460265096028646, + "rewards/rejected": -13.006133015950521, + "step": 2491 + }, + { + "epoch": 0.683020419350418, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8663520.0, + "logits/rejected": -23556809.846153848, + "logps/chosen": -304.88332297585225, + "logps/rejected": -418.64438100961536, + "loss": 0.0759, + "rewards/chosen": 6.889386263760653, + "rewards/margins": 15.089329806241121, + "rewards/rejected": -8.199943542480469, + "step": 2492 + }, + { + "epoch": 0.6832945045909278, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26180232.727272727, + "logits/rejected": -20105223.384615384, + "logps/chosen": -450.73530717329544, + "logps/rejected": -545.9845252403846, + "loss": 0.0101, + "rewards/chosen": 7.458898370916193, + "rewards/margins": 18.416603355140953, + "rewards/rejected": -10.95770498422476, + "step": 2493 + }, + { + "epoch": 0.6835685898314375, + "grad_norm": 5.71875, + "kl": 1.6687934398651123, + "learning_rate": 5e-06, + "logits/chosen": -33482048.0, + "logits/rejected": -9562004.0, + "logps/chosen": -536.3672572544643, + "logps/rejected": -556.75517578125, + "loss": 0.0294, + "rewards/chosen": 6.811129978724888, + "rewards/margins": 16.904367283412387, + "rewards/rejected": -10.0932373046875, + "step": 2494 + }, + { + "epoch": 0.6838426750719474, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23889952.0, + "logits/rejected": 33479603.2, + "logps/chosen": -466.69430106026783, + "logps/rejected": -643.322314453125, + "loss": 0.0377, + "rewards/chosen": 7.913416181291852, + "rewards/margins": 23.552437482561384, + "rewards/rejected": -15.639021301269532, + "step": 2495 + }, + { + "epoch": 0.6841167603124572, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23475649.454545453, + "logits/rejected": -28990237.53846154, + "logps/chosen": -406.29683061079544, + "logps/rejected": -334.25150240384613, + "loss": 0.0247, + "rewards/chosen": 6.894130359996449, + "rewards/margins": 14.061025712873553, + "rewards/rejected": -7.166895352877104, + "step": 2496 + }, + { + "epoch": 0.6843908455529669, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14107898.666666666, + "logits/rejected": -34578120.53333333, + "logps/chosen": -424.4782986111111, + "logps/rejected": -722.7399739583333, + "loss": 0.0042, + "rewards/chosen": 6.519984351264106, + "rewards/margins": 21.694852108425565, + "rewards/rejected": -15.174867757161458, + "step": 2497 + }, + { + "epoch": 0.6846649307934768, + "grad_norm": 5.3125, + "kl": 2.8025996685028076, + "learning_rate": 5e-06, + "logits/chosen": -8867438.0, + "logits/rejected": -31839844.0, + "logps/chosen": -506.87371826171875, + "logps/rejected": -548.94189453125, + "loss": 0.0151, + "rewards/chosen": 6.6704607009887695, + "rewards/margins": 21.38427448272705, + "rewards/rejected": -14.713813781738281, + "step": 2498 + }, + { + "epoch": 0.6849390160339865, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47362128.0, + "logits/rejected": -16634456.0, + "logps/chosen": -326.8629455566406, + "logps/rejected": -763.2113037109375, + "loss": 0.0432, + "rewards/chosen": 6.255960464477539, + "rewards/margins": 17.56969165802002, + "rewards/rejected": -11.31373119354248, + "step": 2499 + }, + { + "epoch": 0.6852131012744964, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10599468.444444444, + "logits/rejected": -9246064.0, + "logps/chosen": -381.2661946614583, + "logps/rejected": -531.5733723958333, + "loss": 0.0258, + "rewards/chosen": 6.063198513454861, + "rewards/margins": 19.341500515407986, + "rewards/rejected": -13.278302001953126, + "step": 2500 + }, + { + "epoch": 0.6854871865150062, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17892296.0, + "logits/rejected": -26184064.0, + "logps/chosen": -445.427734375, + "logps/rejected": -493.80394071691177, + "loss": 0.0111, + "rewards/chosen": 6.351378304617746, + "rewards/margins": 14.759025830180704, + "rewards/rejected": -8.40764752556296, + "step": 2501 + }, + { + "epoch": 0.6857612717555159, + "grad_norm": 4.78125, + "kl": 0.28839111328125, + "learning_rate": 5e-06, + "logits/chosen": -10739261.714285715, + "logits/rejected": -24790473.6, + "logps/chosen": -374.33677455357144, + "logps/rejected": -598.583349609375, + "loss": 0.0131, + "rewards/chosen": 6.56275394984654, + "rewards/margins": 18.9727774483817, + "rewards/rejected": -12.410023498535157, + "step": 2502 + }, + { + "epoch": 0.6860353569960258, + "grad_norm": 4.9375, + "kl": 1.6072425842285156, + "learning_rate": 5e-06, + "logits/chosen": -25308096.0, + "logits/rejected": -25067342.769230768, + "logps/chosen": -376.6180308948864, + "logps/rejected": -526.5353064903846, + "loss": 0.0356, + "rewards/chosen": 7.1690826416015625, + "rewards/margins": 16.571421109713043, + "rewards/rejected": -9.402338468111479, + "step": 2503 + }, + { + "epoch": 0.6863094422365356, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25223234.666666668, + "logits/rejected": -23494850.666666668, + "logps/chosen": -386.262939453125, + "logps/rejected": -564.6585693359375, + "loss": 0.0163, + "rewards/chosen": 6.639484405517578, + "rewards/margins": 18.28446706136068, + "rewards/rejected": -11.6449826558431, + "step": 2504 + }, + { + "epoch": 0.6865835274770453, + "grad_norm": 3.453125, + "kl": 1.8358548879623413, + "learning_rate": 5e-06, + "logits/chosen": -23100398.222222224, + "logits/rejected": -38384008.53333333, + "logps/chosen": -420.80235460069446, + "logps/rejected": -561.630078125, + "loss": 0.0381, + "rewards/chosen": 7.057308197021484, + "rewards/margins": 20.02913131713867, + "rewards/rejected": -12.971823120117188, + "step": 2505 + }, + { + "epoch": 0.6868576127175552, + "grad_norm": 8.3125, + "kl": 6.665740013122559, + "learning_rate": 5e-06, + "logits/chosen": -17840727.272727273, + "logits/rejected": -25744354.46153846, + "logps/chosen": -472.85009765625, + "logps/rejected": -591.8600135216346, + "loss": 0.0556, + "rewards/chosen": 7.104264692826704, + "rewards/margins": 17.336234672919854, + "rewards/rejected": -10.23196998009315, + "step": 2506 + }, + { + "epoch": 0.687131697958065, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28590515.2, + "logits/rejected": -32414091.42857143, + "logps/chosen": -554.539306640625, + "logps/rejected": -618.4231305803571, + "loss": 0.0019, + "rewards/chosen": 9.12335433959961, + "rewards/margins": 24.50795429774693, + "rewards/rejected": -15.384599958147321, + "step": 2507 + }, + { + "epoch": 0.6874057831985747, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32716317.333333332, + "logits/rejected": -25022240.0, + "logps/chosen": -395.9141438802083, + "logps/rejected": -587.3765462239584, + "loss": 0.0187, + "rewards/chosen": 7.308839162190755, + "rewards/margins": 18.387866973876953, + "rewards/rejected": -11.079027811686197, + "step": 2508 + }, + { + "epoch": 0.6876798684390846, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35469344.0, + "logits/rejected": -22406405.647058822, + "logps/chosen": -389.849365234375, + "logps/rejected": -492.68054917279414, + "loss": 0.0049, + "rewards/chosen": 6.848002842494419, + "rewards/margins": 19.936482277237065, + "rewards/rejected": -13.088479434742647, + "step": 2509 + }, + { + "epoch": 0.6879539536795943, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10381769.6, + "logits/rejected": 7782130.285714285, + "logps/chosen": -482.987548828125, + "logps/rejected": -426.55723353794644, + "loss": 0.0369, + "rewards/chosen": 7.647939300537109, + "rewards/margins": 18.861959075927736, + "rewards/rejected": -11.214019775390625, + "step": 2510 + }, + { + "epoch": 0.6882280389201042, + "grad_norm": 11.25, + "kl": 6.548795700073242, + "learning_rate": 5e-06, + "logits/chosen": -19953992.533333335, + "logits/rejected": -13643460.444444444, + "logps/chosen": -390.27076822916666, + "logps/rejected": -320.6177029079861, + "loss": 0.1013, + "rewards/chosen": 6.360001627604166, + "rewards/margins": 15.101978895399306, + "rewards/rejected": -8.74197726779514, + "step": 2511 + }, + { + "epoch": 0.688502124160614, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13326376.727272727, + "logits/rejected": -34370449.23076923, + "logps/chosen": -472.2008167613636, + "logps/rejected": -448.94249549278845, + "loss": 0.0342, + "rewards/chosen": 6.9110024192116475, + "rewards/margins": 17.40926995977655, + "rewards/rejected": -10.498267540564903, + "step": 2512 + }, + { + "epoch": 0.6887762094011237, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23638408.0, + "logits/rejected": 27114054.85714286, + "logps/chosen": -385.5360595703125, + "logps/rejected": -594.1925223214286, + "loss": 0.0071, + "rewards/chosen": 6.931755065917969, + "rewards/margins": 18.031915937151226, + "rewards/rejected": -11.100160871233259, + "step": 2513 + }, + { + "epoch": 0.6890502946416336, + "grad_norm": 4.5, + "kl": 2.0835227966308594, + "learning_rate": 5e-06, + "logits/chosen": -21409842.666666668, + "logits/rejected": -33288890.666666668, + "logps/chosen": -447.9469807942708, + "logps/rejected": -692.6000162760416, + "loss": 0.0269, + "rewards/chosen": 6.409478505452474, + "rewards/margins": 19.740230560302734, + "rewards/rejected": -13.33075205485026, + "step": 2514 + }, + { + "epoch": 0.6893243798821433, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27524605.714285713, + "logits/rejected": -32460448.0, + "logps/chosen": -437.6532505580357, + "logps/rejected": -582.6294921875, + "loss": 0.0188, + "rewards/chosen": 6.387917654854911, + "rewards/margins": 21.173080008370537, + "rewards/rejected": -14.785162353515625, + "step": 2515 + }, + { + "epoch": 0.6895984651226531, + "grad_norm": 8.375, + "kl": 2.970468044281006, + "learning_rate": 5e-06, + "logits/chosen": -34877197.473684214, + "logits/rejected": -17184126.4, + "logps/chosen": -325.95985814144734, + "logps/rejected": -883.4859375, + "loss": 0.0328, + "rewards/chosen": 5.26274952135588, + "rewards/margins": 29.994692270379318, + "rewards/rejected": -24.73194274902344, + "step": 2516 + }, + { + "epoch": 0.689872550363163, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3053413.2, + "logits/rejected": -25627286.85714286, + "logps/chosen": -449.8345703125, + "logps/rejected": -420.35728236607144, + "loss": 0.0118, + "rewards/chosen": 7.24633560180664, + "rewards/margins": 15.712119402204241, + "rewards/rejected": -8.4657838003976, + "step": 2517 + }, + { + "epoch": 0.6901466356036727, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19997950.4, + "logits/rejected": -28120740.57142857, + "logps/chosen": -329.17802734375, + "logps/rejected": -589.5248325892857, + "loss": 0.0167, + "rewards/chosen": 7.561723327636718, + "rewards/margins": 21.9649172101702, + "rewards/rejected": -14.403193882533483, + "step": 2518 + }, + { + "epoch": 0.6904207208441825, + "grad_norm": 2.1875, + "kl": 0.3436877131462097, + "learning_rate": 5e-06, + "logits/chosen": -4199341.6, + "logits/rejected": -33363803.42857143, + "logps/chosen": -441.74287109375, + "logps/rejected": -750.1511579241071, + "loss": 0.0041, + "rewards/chosen": 8.71245880126953, + "rewards/margins": 26.248016139439173, + "rewards/rejected": -17.535557338169642, + "step": 2519 + }, + { + "epoch": 0.6906948060846924, + "grad_norm": 1.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42367922.666666664, + "logits/rejected": -31842893.333333332, + "logps/chosen": -475.9017333984375, + "logps/rejected": -759.91259765625, + "loss": 0.0038, + "rewards/chosen": 8.815165837605795, + "rewards/margins": 22.79889233907064, + "rewards/rejected": -13.983726501464844, + "step": 2520 + }, + { + "epoch": 0.6909688913252021, + "grad_norm": 11.1875, + "kl": 0.9417744278907776, + "learning_rate": 5e-06, + "logits/chosen": -29828132.923076924, + "logits/rejected": -11452868.363636363, + "logps/chosen": -395.56678185096155, + "logps/rejected": -528.6600674715909, + "loss": 0.0196, + "rewards/chosen": 8.192826491135817, + "rewards/margins": 20.402749975244483, + "rewards/rejected": -12.209923484108664, + "step": 2521 + }, + { + "epoch": 0.691242976565712, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18845188.0, + "logits/rejected": -15986913.333333334, + "logps/chosen": -421.89453125, + "logps/rejected": -496.3765055338542, + "loss": 0.0222, + "rewards/chosen": 7.098734537760417, + "rewards/margins": 18.72145716349284, + "rewards/rejected": -11.622722625732422, + "step": 2522 + }, + { + "epoch": 0.6915170618062217, + "grad_norm": 7.09375, + "kl": 8.168704986572266, + "learning_rate": 5e-06, + "logits/chosen": -16165904.842105264, + "logits/rejected": 6905940.8, + "logps/chosen": -400.96016652960526, + "logps/rejected": -469.96708984375, + "loss": 0.0344, + "rewards/chosen": 6.486950121427837, + "rewards/margins": 14.41396046688682, + "rewards/rejected": -7.927010345458984, + "step": 2523 + }, + { + "epoch": 0.6917911470467315, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19850893.333333332, + "logits/rejected": -13361664.0, + "logps/chosen": -497.033935546875, + "logps/rejected": -558.137939453125, + "loss": 0.0084, + "rewards/chosen": 8.42928695678711, + "rewards/margins": 20.575188954671226, + "rewards/rejected": -12.145901997884115, + "step": 2524 + }, + { + "epoch": 0.6920652322872414, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48724874.666666664, + "logits/rejected": 18008616.0, + "logps/chosen": -372.0679117838542, + "logps/rejected": -430.7972412109375, + "loss": 0.0369, + "rewards/chosen": 5.547722498575847, + "rewards/margins": 19.23978106180827, + "rewards/rejected": -13.692058563232422, + "step": 2525 + }, + { + "epoch": 0.6923393175277511, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26209170.666666668, + "logits/rejected": 1273750.3333333333, + "logps/chosen": -372.5884602864583, + "logps/rejected": -562.7591145833334, + "loss": 0.0173, + "rewards/chosen": 6.138435363769531, + "rewards/margins": 17.84232838948568, + "rewards/rejected": -11.703893025716146, + "step": 2526 + }, + { + "epoch": 0.6926134027682609, + "grad_norm": 6.6875, + "kl": 1.4955190420150757, + "learning_rate": 5e-06, + "logits/chosen": -45520311.27272727, + "logits/rejected": -37660972.307692304, + "logps/chosen": -412.0654296875, + "logps/rejected": -531.2750525841346, + "loss": 0.0208, + "rewards/chosen": 6.545030073686079, + "rewards/margins": 18.555901374016607, + "rewards/rejected": -12.010871300330528, + "step": 2527 + }, + { + "epoch": 0.6928874880087708, + "grad_norm": 1.5, + "kl": 3.026763916015625, + "learning_rate": 5e-06, + "logits/chosen": -47624836.266666666, + "logits/rejected": -33231075.555555556, + "logps/chosen": -501.41106770833335, + "logps/rejected": -470.5208333333333, + "loss": 0.0027, + "rewards/chosen": 8.343433634440105, + "rewards/margins": 20.90027855767144, + "rewards/rejected": -12.556844923231337, + "step": 2528 + }, + { + "epoch": 0.6931615732492805, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38980907.63636363, + "logits/rejected": -34706530.461538464, + "logps/chosen": -369.7692205255682, + "logps/rejected": -449.54439603365387, + "loss": 0.0261, + "rewards/chosen": 5.76108828457919, + "rewards/margins": 17.63270904967835, + "rewards/rejected": -11.87162076509916, + "step": 2529 + }, + { + "epoch": 0.6934356584897903, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19767321.846153848, + "logits/rejected": -21849163.636363637, + "logps/chosen": -444.71469350961536, + "logps/rejected": -481.2141779119318, + "loss": 0.015, + "rewards/chosen": 6.34532224214994, + "rewards/margins": 16.707896039202495, + "rewards/rejected": -10.362573797052557, + "step": 2530 + }, + { + "epoch": 0.6937097437303001, + "grad_norm": 3.015625, + "kl": 1.6602122783660889, + "learning_rate": 5e-06, + "logits/chosen": -38515060.36363637, + "logits/rejected": -19123971.692307692, + "logps/chosen": -352.9549005681818, + "logps/rejected": -493.92176231971155, + "loss": 0.0345, + "rewards/chosen": 7.350385492498225, + "rewards/margins": 16.843840645743416, + "rewards/rejected": -9.493455153245192, + "step": 2531 + }, + { + "epoch": 0.6939838289708099, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31016163.2, + "logits/rejected": -47976297.14285714, + "logps/chosen": -331.87587890625, + "logps/rejected": -526.2371651785714, + "loss": 0.0295, + "rewards/chosen": 5.273063659667969, + "rewards/margins": 17.063983372279576, + "rewards/rejected": -11.790919712611608, + "step": 2532 + }, + { + "epoch": 0.6942579142113198, + "grad_norm": 2.765625, + "kl": 0.026284536346793175, + "learning_rate": 5e-06, + "logits/chosen": -37537205.333333336, + "logits/rejected": -33883018.666666664, + "logps/chosen": -421.6787923177083, + "logps/rejected": -676.2635091145834, + "loss": 0.011, + "rewards/chosen": 7.198394139607747, + "rewards/margins": 20.661943435668945, + "rewards/rejected": -13.463549296061197, + "step": 2533 + }, + { + "epoch": 0.6945319994518295, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25074500.57142857, + "logits/rejected": -20180992.0, + "logps/chosen": -465.10508510044644, + "logps/rejected": -486.14499080882354, + "loss": 0.0085, + "rewards/chosen": 6.369529179164341, + "rewards/margins": 17.836600712367467, + "rewards/rejected": -11.467071533203125, + "step": 2534 + }, + { + "epoch": 0.6948060846923393, + "grad_norm": 0.494140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12427992.0, + "logits/rejected": -41073378.666666664, + "logps/chosen": -360.5379231770833, + "logps/rejected": -661.1294759114584, + "loss": 0.0017, + "rewards/chosen": 7.14276123046875, + "rewards/margins": 19.28319549560547, + "rewards/rejected": -12.140434265136719, + "step": 2535 + }, + { + "epoch": 0.6950801699328492, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45657961.14285714, + "logits/rejected": -28577225.411764707, + "logps/chosen": -407.06727818080356, + "logps/rejected": -500.3169806985294, + "loss": 0.0033, + "rewards/chosen": 7.071560450962612, + "rewards/margins": 19.344597632143678, + "rewards/rejected": -12.273037181181067, + "step": 2536 + }, + { + "epoch": 0.6953542551733589, + "grad_norm": 6.21875, + "kl": 0.5941569209098816, + "learning_rate": 5e-06, + "logits/chosen": 9462163.2, + "logits/rejected": -16738916.57142857, + "logps/chosen": -530.477783203125, + "logps/rejected": -526.4522530691964, + "loss": 0.0294, + "rewards/chosen": 6.74328842163086, + "rewards/margins": 17.201512037004743, + "rewards/rejected": -10.458223615373884, + "step": 2537 + }, + { + "epoch": 0.6956283404138687, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24949506.90909091, + "logits/rejected": -34814503.384615384, + "logps/chosen": -381.58620383522725, + "logps/rejected": -310.31734525240387, + "loss": 0.0226, + "rewards/chosen": 7.167523470791903, + "rewards/margins": 16.311204923616422, + "rewards/rejected": -9.14368145282452, + "step": 2538 + }, + { + "epoch": 0.6959024256543785, + "grad_norm": 9.9375, + "kl": 5.172549247741699, + "learning_rate": 5e-06, + "logits/chosen": 10974947.692307692, + "logits/rejected": -44336215.27272727, + "logps/chosen": -495.84322415865387, + "logps/rejected": -547.0343128551136, + "loss": 0.0298, + "rewards/chosen": 6.941631610576923, + "rewards/margins": 17.258004568673513, + "rewards/rejected": -10.316372958096592, + "step": 2539 + }, + { + "epoch": 0.6961765108948883, + "grad_norm": 3.734375, + "kl": 4.151793956756592, + "learning_rate": 5e-06, + "logits/chosen": -25392418.46153846, + "logits/rejected": -32460328.727272727, + "logps/chosen": -377.4986102764423, + "logps/rejected": -515.8240411931819, + "loss": 0.0133, + "rewards/chosen": 6.916736896221455, + "rewards/margins": 18.713961461207248, + "rewards/rejected": -11.797224564985795, + "step": 2540 + }, + { + "epoch": 0.6964505961353981, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17491803.42857143, + "logits/rejected": -39895955.2, + "logps/chosen": -390.0341099330357, + "logps/rejected": -362.8431884765625, + "loss": 0.0225, + "rewards/chosen": 6.793362753731864, + "rewards/margins": 14.88486589704241, + "rewards/rejected": -8.091503143310547, + "step": 2541 + }, + { + "epoch": 0.6967246813759079, + "grad_norm": 23.0, + "kl": 7.156922817230225, + "learning_rate": 5e-06, + "logits/chosen": -15395671.384615384, + "logits/rejected": -631162.5454545454, + "logps/chosen": -356.38198617788464, + "logps/rejected": -539.9274680397727, + "loss": 0.0911, + "rewards/chosen": 5.3677203838641825, + "rewards/margins": 18.288320581396142, + "rewards/rejected": -12.92060019753196, + "step": 2542 + }, + { + "epoch": 0.6969987666164177, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11972444.0, + "logits/rejected": -21884030.0, + "logps/chosen": -368.98675537109375, + "logps/rejected": -434.82745361328125, + "loss": 0.0417, + "rewards/chosen": 5.281680583953857, + "rewards/margins": 15.735555171966553, + "rewards/rejected": -10.453874588012695, + "step": 2543 + }, + { + "epoch": 0.6972728518569276, + "grad_norm": 5.34375, + "kl": 0.09688949584960938, + "learning_rate": 5e-06, + "logits/chosen": -24267435.2, + "logits/rejected": -32549590.85714286, + "logps/chosen": -453.059375, + "logps/rejected": -507.98514229910717, + "loss": 0.016, + "rewards/chosen": 7.231008911132813, + "rewards/margins": 17.709175763811384, + "rewards/rejected": -10.478166852678571, + "step": 2544 + }, + { + "epoch": 0.6975469370974373, + "grad_norm": 2.3125, + "kl": 0.22769546508789062, + "learning_rate": 5e-06, + "logits/chosen": -21165828.57142857, + "logits/rejected": -14532382.11764706, + "logps/chosen": -445.7377232142857, + "logps/rejected": -442.6117302389706, + "loss": 0.0048, + "rewards/chosen": 8.745485578264509, + "rewards/margins": 17.9635094394203, + "rewards/rejected": -9.21802386115579, + "step": 2545 + }, + { + "epoch": 0.6978210223379471, + "grad_norm": 2.40625, + "kl": 6.704052925109863, + "learning_rate": 5e-06, + "logits/chosen": -38134972.0, + "logits/rejected": -30442152.0, + "logps/chosen": -397.1434631347656, + "logps/rejected": -421.7760009765625, + "loss": 0.0387, + "rewards/chosen": 6.856868743896484, + "rewards/margins": 18.4556941986084, + "rewards/rejected": -11.598825454711914, + "step": 2546 + }, + { + "epoch": 0.6980951075784569, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38317541.81818182, + "logits/rejected": -35316145.23076923, + "logps/chosen": -393.0470525568182, + "logps/rejected": -579.5593073918269, + "loss": 0.0253, + "rewards/chosen": 7.240823225541548, + "rewards/margins": 19.312023082813184, + "rewards/rejected": -12.071199857271635, + "step": 2547 + }, + { + "epoch": 0.6983691928189667, + "grad_norm": 9.4375, + "kl": 7.008827209472656, + "learning_rate": 5e-06, + "logits/chosen": -1620494.0, + "logits/rejected": -40838085.333333336, + "logps/chosen": -542.5815836588541, + "logps/rejected": -595.7293701171875, + "loss": 0.0345, + "rewards/chosen": 7.221179962158203, + "rewards/margins": 18.248140970865883, + "rewards/rejected": -11.026961008707682, + "step": 2548 + }, + { + "epoch": 0.6986432780594765, + "grad_norm": 9.1875, + "kl": 1.1218808889389038, + "learning_rate": 5e-06, + "logits/chosen": -30805290.666666668, + "logits/rejected": -21006576.0, + "logps/chosen": -413.9704182942708, + "logps/rejected": -498.892822265625, + "loss": 0.0441, + "rewards/chosen": 7.485059102376302, + "rewards/margins": 16.693897883097332, + "rewards/rejected": -9.20883878072103, + "step": 2549 + }, + { + "epoch": 0.6989173632999863, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35858981.333333336, + "logits/rejected": 16953861.333333332, + "logps/chosen": -462.0019938151042, + "logps/rejected": -630.7605794270834, + "loss": 0.0518, + "rewards/chosen": 6.827716827392578, + "rewards/margins": 20.80065027872721, + "rewards/rejected": -13.972933451334635, + "step": 2550 + }, + { + "epoch": 0.6991914485404961, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24008962.0, + "logits/rejected": -15209712.0, + "logps/chosen": -487.773193359375, + "logps/rejected": -371.56146240234375, + "loss": 0.0063, + "rewards/chosen": 6.907665252685547, + "rewards/margins": 15.659092903137207, + "rewards/rejected": -8.75142765045166, + "step": 2551 + }, + { + "epoch": 0.6994655337810058, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24923511.272727273, + "logits/rejected": -16537095.384615384, + "logps/chosen": -462.5411931818182, + "logps/rejected": -539.9687875600962, + "loss": 0.008, + "rewards/chosen": 7.763155850497159, + "rewards/margins": 17.940961824430453, + "rewards/rejected": -10.177805973933292, + "step": 2552 + }, + { + "epoch": 0.6997396190215157, + "grad_norm": 9.25, + "kl": 11.20605754852295, + "learning_rate": 5e-06, + "logits/chosen": -34151131.428571425, + "logits/rejected": 7702809.6, + "logps/chosen": -513.1828962053571, + "logps/rejected": -473.02373046875, + "loss": 0.0343, + "rewards/chosen": 7.734226226806641, + "rewards/margins": 18.501465606689454, + "rewards/rejected": -10.767239379882813, + "step": 2553 + }, + { + "epoch": 0.7000137042620255, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7358801.454545454, + "logits/rejected": -12167619.692307692, + "logps/chosen": -437.27641157670456, + "logps/rejected": -556.7617938701923, + "loss": 0.0488, + "rewards/chosen": 5.7527531710538, + "rewards/margins": 17.489473329557406, + "rewards/rejected": -11.736720158503605, + "step": 2554 + }, + { + "epoch": 0.7002877895025353, + "grad_norm": 2.734375, + "kl": 0.4363740384578705, + "learning_rate": 5e-06, + "logits/chosen": -28986042.181818184, + "logits/rejected": -18481757.53846154, + "logps/chosen": -434.99027876420456, + "logps/rejected": -590.6644381009615, + "loss": 0.0088, + "rewards/chosen": 7.684065385298296, + "rewards/margins": 22.074104682548896, + "rewards/rejected": -14.3900392972506, + "step": 2555 + }, + { + "epoch": 0.7005618747430451, + "grad_norm": 11.3125, + "kl": 3.0295512676239014, + "learning_rate": 5e-06, + "logits/chosen": -17009031.384615384, + "logits/rejected": -19147524.363636363, + "logps/chosen": -389.32534555288464, + "logps/rejected": -470.0579279119318, + "loss": 0.0517, + "rewards/chosen": 6.603324890136719, + "rewards/margins": 14.430173006924715, + "rewards/rejected": -7.826848116787997, + "step": 2556 + }, + { + "epoch": 0.7008359599835549, + "grad_norm": 12.5, + "kl": 1.5613536834716797, + "learning_rate": 5e-06, + "logits/chosen": 16023635.2, + "logits/rejected": -35963026.28571428, + "logps/chosen": -397.81376953125, + "logps/rejected": -654.0105329241071, + "loss": 0.0872, + "rewards/chosen": 6.254082107543946, + "rewards/margins": 16.583687210083006, + "rewards/rejected": -10.329605102539062, + "step": 2557 + }, + { + "epoch": 0.7011100452240647, + "grad_norm": 7.59375, + "kl": 3.7924716472625732, + "learning_rate": 5e-06, + "logits/chosen": -16004132.363636363, + "logits/rejected": -26359042.46153846, + "logps/chosen": -465.34641335227275, + "logps/rejected": -457.67052283653845, + "loss": 0.0186, + "rewards/chosen": 6.409982854669744, + "rewards/margins": 17.88978726046902, + "rewards/rejected": -11.479804405799278, + "step": 2558 + }, + { + "epoch": 0.7013841304645745, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19022423.272727273, + "logits/rejected": 3969114.4615384615, + "logps/chosen": -461.1404474431818, + "logps/rejected": -511.1476862980769, + "loss": 0.0298, + "rewards/chosen": 7.2952728271484375, + "rewards/margins": 18.375692514272835, + "rewards/rejected": -11.0804196871244, + "step": 2559 + }, + { + "epoch": 0.7016582157050842, + "grad_norm": 1.953125, + "kl": 3.213219404220581, + "learning_rate": 5e-06, + "logits/chosen": 4355231.333333333, + "logits/rejected": -14337210.666666666, + "logps/chosen": -477.5693359375, + "logps/rejected": -507.6549479166667, + "loss": 0.0041, + "rewards/chosen": 7.499610265096028, + "rewards/margins": 17.543074289957683, + "rewards/rejected": -10.043464024861654, + "step": 2560 + }, + { + "epoch": 0.7019323009455941, + "grad_norm": 1.9453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15750023.384615384, + "logits/rejected": -21456142.545454547, + "logps/chosen": -475.75939002403845, + "logps/rejected": -585.7506214488636, + "loss": 0.0043, + "rewards/chosen": 7.759370657113882, + "rewards/margins": 21.5609245567055, + "rewards/rejected": -13.80155389959162, + "step": 2561 + }, + { + "epoch": 0.7022063861861039, + "grad_norm": 5.46875, + "kl": 2.0250449180603027, + "learning_rate": 5e-06, + "logits/chosen": 425981.71428571426, + "logits/rejected": -12248508.8, + "logps/chosen": -402.50980050223217, + "logps/rejected": -771.20517578125, + "loss": 0.0399, + "rewards/chosen": 5.850729261125837, + "rewards/margins": 16.546983991350444, + "rewards/rejected": -10.69625473022461, + "step": 2562 + }, + { + "epoch": 0.7024804714266136, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11121663.272727273, + "logits/rejected": -24565248.0, + "logps/chosen": -318.09195223721593, + "logps/rejected": -524.9541766826923, + "loss": 0.0842, + "rewards/chosen": 5.332279552112926, + "rewards/margins": 15.58359650298432, + "rewards/rejected": -10.251316950871395, + "step": 2563 + }, + { + "epoch": 0.7027545566671235, + "grad_norm": 13.875, + "kl": 9.700579643249512, + "learning_rate": 5e-06, + "logits/chosen": -18147294.11764706, + "logits/rejected": -15339315.42857143, + "logps/chosen": -389.07068589154414, + "logps/rejected": -615.5822405133929, + "loss": 0.0786, + "rewards/chosen": 6.646266712861903, + "rewards/margins": 14.885929973185565, + "rewards/rejected": -8.239663260323661, + "step": 2564 + }, + { + "epoch": 0.7030286419076333, + "grad_norm": 1.703125, + "kl": 0.49871063232421875, + "learning_rate": 5e-06, + "logits/chosen": -21469790.4, + "logits/rejected": -14758282.285714285, + "logps/chosen": -493.1912109375, + "logps/rejected": -531.1240931919643, + "loss": 0.0048, + "rewards/chosen": 8.223622131347657, + "rewards/margins": 19.020735822405136, + "rewards/rejected": -10.797113691057477, + "step": 2565 + }, + { + "epoch": 0.7033027271481431, + "grad_norm": 6.15625, + "kl": 3.0068259239196777, + "learning_rate": 5e-06, + "logits/chosen": -28395788.0, + "logits/rejected": -27667970.0, + "logps/chosen": -367.69439697265625, + "logps/rejected": -566.0055541992188, + "loss": 0.0298, + "rewards/chosen": 6.568473815917969, + "rewards/margins": 25.061853408813477, + "rewards/rejected": -18.493379592895508, + "step": 2566 + }, + { + "epoch": 0.7035768123886529, + "grad_norm": 9.0, + "kl": 18.605844497680664, + "learning_rate": 5e-06, + "logits/chosen": -21075303.529411763, + "logits/rejected": -17768106.285714287, + "logps/chosen": -403.11497587316177, + "logps/rejected": -806.7307477678571, + "loss": 0.1018, + "rewards/chosen": 8.096989351160387, + "rewards/margins": 21.390773356461725, + "rewards/rejected": -13.293784005301339, + "step": 2567 + }, + { + "epoch": 0.7038508976291626, + "grad_norm": 8.8125, + "kl": 7.4085516929626465, + "learning_rate": 5e-06, + "logits/chosen": -34741805.71428572, + "logits/rejected": -22249833.6, + "logps/chosen": -485.3038853236607, + "logps/rejected": -615.750927734375, + "loss": 0.0282, + "rewards/chosen": 7.925285884312221, + "rewards/margins": 16.963919612339566, + "rewards/rejected": -9.038633728027344, + "step": 2568 + }, + { + "epoch": 0.7041249828696725, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19841426.90909091, + "logits/rejected": -38950872.615384616, + "logps/chosen": -263.4992009943182, + "logps/rejected": -446.44054236778845, + "loss": 0.0283, + "rewards/chosen": 4.991239374334162, + "rewards/margins": 13.880703532612408, + "rewards/rejected": -8.889464158278246, + "step": 2569 + }, + { + "epoch": 0.7043990681101823, + "grad_norm": 6.78125, + "kl": 0.18077406287193298, + "learning_rate": 5e-06, + "logits/chosen": -18507879.111111112, + "logits/rejected": -26135750.4, + "logps/chosen": -485.07318793402777, + "logps/rejected": -545.53623046875, + "loss": 0.0371, + "rewards/chosen": 8.15152316623264, + "rewards/margins": 20.107029554578993, + "rewards/rejected": -11.955506388346354, + "step": 2570 + }, + { + "epoch": 0.704673153350692, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23835596.8, + "logits/rejected": -9088324.0, + "logps/chosen": -431.15009765625, + "logps/rejected": -509.21561104910717, + "loss": 0.0075, + "rewards/chosen": 7.351829528808594, + "rewards/margins": 17.3063714163644, + "rewards/rejected": -9.954541887555804, + "step": 2571 + }, + { + "epoch": 0.7049472385912019, + "grad_norm": 4.53125, + "kl": 4.3454742431640625, + "learning_rate": 5e-06, + "logits/chosen": -20093801.14285714, + "logits/rejected": -9923308.8, + "logps/chosen": -479.45706612723217, + "logps/rejected": -469.652490234375, + "loss": 0.0087, + "rewards/chosen": 7.710748944963727, + "rewards/margins": 17.53282230922154, + "rewards/rejected": -9.822073364257813, + "step": 2572 + }, + { + "epoch": 0.7052213238317117, + "grad_norm": 1.5859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22856123.076923076, + "logits/rejected": 57620497.45454545, + "logps/chosen": -480.4000901442308, + "logps/rejected": -609.5100763494319, + "loss": 0.0046, + "rewards/chosen": 6.9215569129356975, + "rewards/margins": 24.58494109040374, + "rewards/rejected": -17.66338417746804, + "step": 2573 + }, + { + "epoch": 0.7054954090722214, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13169316.923076924, + "logits/rejected": -22255281.454545453, + "logps/chosen": -530.6268404447115, + "logps/rejected": -678.4753639914773, + "loss": 0.0053, + "rewards/chosen": 8.63152606670673, + "rewards/margins": 21.58871129176, + "rewards/rejected": -12.957185225053268, + "step": 2574 + }, + { + "epoch": 0.7057694943127313, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25867068.444444444, + "logits/rejected": -27581873.066666666, + "logps/chosen": -328.9490017361111, + "logps/rejected": -508.50234375, + "loss": 0.009, + "rewards/chosen": 5.364981757269965, + "rewards/margins": 16.75825466579861, + "rewards/rejected": -11.393272908528646, + "step": 2575 + }, + { + "epoch": 0.706043579553241, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16573667.555555556, + "logits/rejected": -181582.93333333332, + "logps/chosen": -354.03187391493054, + "logps/rejected": -638.1151041666667, + "loss": 0.0303, + "rewards/chosen": 4.656467861599392, + "rewards/margins": 15.734569464789496, + "rewards/rejected": -11.078101603190104, + "step": 2576 + }, + { + "epoch": 0.7063176647937509, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19851664.0, + "logits/rejected": 31725368.0, + "logps/chosen": -352.4154459635417, + "logps/rejected": -588.2793782552084, + "loss": 0.019, + "rewards/chosen": 4.683314959208171, + "rewards/margins": 18.599155108133953, + "rewards/rejected": -13.915840148925781, + "step": 2577 + }, + { + "epoch": 0.7065917500342607, + "grad_norm": 3.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27187904.0, + "logits/rejected": -14916937.846153846, + "logps/chosen": -427.53005149147725, + "logps/rejected": -716.4674729567307, + "loss": 0.0142, + "rewards/chosen": 6.967977350408381, + "rewards/margins": 23.17678331495165, + "rewards/rejected": -16.20880596454327, + "step": 2578 + }, + { + "epoch": 0.7068658352747704, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1444647.3846153845, + "logits/rejected": -36232046.54545455, + "logps/chosen": -464.96304086538464, + "logps/rejected": -546.2842240767045, + "loss": 0.0075, + "rewards/chosen": 7.278271014873798, + "rewards/margins": 20.210818844241697, + "rewards/rejected": -12.932547829367898, + "step": 2579 + }, + { + "epoch": 0.7071399205152803, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21383045.333333332, + "logits/rejected": -39777298.666666664, + "logps/chosen": -499.1304931640625, + "logps/rejected": -547.0313313802084, + "loss": 0.0187, + "rewards/chosen": 7.712627410888672, + "rewards/margins": 19.723331451416016, + "rewards/rejected": -12.010704040527344, + "step": 2580 + }, + { + "epoch": 0.70741400575579, + "grad_norm": 6.46875, + "kl": 5.634732246398926, + "learning_rate": 5e-06, + "logits/chosen": -16959122.285714287, + "logits/rejected": -17480844.8, + "logps/chosen": -340.51747349330356, + "logps/rejected": -455.12138671875, + "loss": 0.0724, + "rewards/chosen": 6.283867972237723, + "rewards/margins": 17.699092429024834, + "rewards/rejected": -11.41522445678711, + "step": 2581 + }, + { + "epoch": 0.7076880909962998, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23541194.666666668, + "logits/rejected": -19703219.555555556, + "logps/chosen": -410.1221028645833, + "logps/rejected": -623.6486002604166, + "loss": 0.0247, + "rewards/chosen": 7.047461446126302, + "rewards/margins": 21.87421145968967, + "rewards/rejected": -14.826750013563368, + "step": 2582 + }, + { + "epoch": 0.7079621762368097, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23162521.6, + "logits/rejected": -14001882.285714285, + "logps/chosen": -295.8267333984375, + "logps/rejected": -610.966796875, + "loss": 0.09, + "rewards/chosen": 4.186277389526367, + "rewards/margins": 15.551957539149694, + "rewards/rejected": -11.365680149623326, + "step": 2583 + }, + { + "epoch": 0.7082362614773194, + "grad_norm": 5.53125, + "kl": 3.1148831844329834, + "learning_rate": 5e-06, + "logits/chosen": -4642834.857142857, + "logits/rejected": -26427070.4, + "logps/chosen": -441.4994419642857, + "logps/rejected": -579.576708984375, + "loss": 0.0275, + "rewards/chosen": 8.0073607308524, + "rewards/margins": 20.011008344377792, + "rewards/rejected": -12.003647613525391, + "step": 2584 + }, + { + "epoch": 0.7085103467178292, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35057450.666666664, + "logits/rejected": -8985006.933333334, + "logps/chosen": -335.1253255208333, + "logps/rejected": -501.90208333333334, + "loss": 0.053, + "rewards/chosen": 5.659107208251953, + "rewards/margins": 16.715365346272787, + "rewards/rejected": -11.056258138020834, + "step": 2585 + }, + { + "epoch": 0.7087844319583391, + "grad_norm": 0.2197265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31229501.09090909, + "logits/rejected": -33997154.461538464, + "logps/chosen": -552.5278764204545, + "logps/rejected": -460.48091947115387, + "loss": 0.0005, + "rewards/chosen": 9.16111200506037, + "rewards/margins": 20.25561203323044, + "rewards/rejected": -11.094500028170073, + "step": 2586 + }, + { + "epoch": 0.7090585171988488, + "grad_norm": 8.5, + "kl": 2.3261656761169434, + "learning_rate": 5e-06, + "logits/chosen": -18243700.0, + "logits/rejected": -21082152.0, + "logps/chosen": -363.67626953125, + "logps/rejected": -506.8909505208333, + "loss": 0.0401, + "rewards/chosen": 5.944129943847656, + "rewards/margins": 16.687053680419922, + "rewards/rejected": -10.742923736572266, + "step": 2587 + }, + { + "epoch": 0.7093326024393587, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30821142.4, + "logits/rejected": -23807746.285714287, + "logps/chosen": -396.2338623046875, + "logps/rejected": -631.4633091517857, + "loss": 0.024, + "rewards/chosen": 7.406610107421875, + "rewards/margins": 22.09112047467913, + "rewards/rejected": -14.684510367257255, + "step": 2588 + }, + { + "epoch": 0.7096066876798685, + "grad_norm": 8.125, + "kl": 12.182056427001953, + "learning_rate": 5e-06, + "logits/chosen": -13097200.0, + "logits/rejected": -3678794.285714286, + "logps/chosen": -379.318359375, + "logps/rejected": -665.8915318080357, + "loss": 0.033, + "rewards/chosen": 7.3063803280101105, + "rewards/margins": 17.64942515397272, + "rewards/rejected": -10.343044825962611, + "step": 2589 + }, + { + "epoch": 0.7098807729203782, + "grad_norm": 6.9375, + "kl": 5.118979454040527, + "learning_rate": 5e-06, + "logits/chosen": -17778023.272727273, + "logits/rejected": -16918043.076923076, + "logps/chosen": -303.0595703125, + "logps/rejected": -486.376953125, + "loss": 0.0325, + "rewards/chosen": 6.581602616743608, + "rewards/margins": 17.546300821371013, + "rewards/rejected": -10.964698204627403, + "step": 2590 + }, + { + "epoch": 0.7101548581608881, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23438464.0, + "logits/rejected": -27717589.333333332, + "logps/chosen": -342.207275390625, + "logps/rejected": -711.31298828125, + "loss": 0.0115, + "rewards/chosen": 6.510457992553711, + "rewards/margins": 21.17068417867025, + "rewards/rejected": -14.660226186116537, + "step": 2591 + }, + { + "epoch": 0.7104289434013978, + "grad_norm": 13.5625, + "kl": 17.702728271484375, + "learning_rate": 5e-06, + "logits/chosen": -18733348.57142857, + "logits/rejected": -19188950.4, + "logps/chosen": -506.04286411830356, + "logps/rejected": -471.602197265625, + "loss": 0.0438, + "rewards/chosen": 8.199815477643694, + "rewards/margins": 15.845720781598772, + "rewards/rejected": -7.645905303955078, + "step": 2592 + }, + { + "epoch": 0.7107030286419076, + "grad_norm": 9.5, + "kl": 3.7467055320739746, + "learning_rate": 5e-06, + "logits/chosen": -11485690.666666666, + "logits/rejected": 12054789.333333334, + "logps/chosen": -430.0075276692708, + "logps/rejected": -511.8765869140625, + "loss": 0.0333, + "rewards/chosen": 7.155325571695964, + "rewards/margins": 16.623390197753906, + "rewards/rejected": -9.468064626057943, + "step": 2593 + }, + { + "epoch": 0.7109771138824175, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26552436.363636363, + "logits/rejected": -4679364.923076923, + "logps/chosen": -266.6949351917614, + "logps/rejected": -388.6796875, + "loss": 0.0552, + "rewards/chosen": 6.694380326704546, + "rewards/margins": 13.794872950840663, + "rewards/rejected": -7.100492624136118, + "step": 2594 + }, + { + "epoch": 0.7112511991229272, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18011544.0, + "logits/rejected": -36521590.85714286, + "logps/chosen": -387.8021728515625, + "logps/rejected": -562.537109375, + "loss": 0.0428, + "rewards/chosen": 5.821242904663086, + "rewards/margins": 18.675351442609514, + "rewards/rejected": -12.854108537946429, + "step": 2595 + }, + { + "epoch": 0.711525284363437, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19694851.555555556, + "logits/rejected": -7175283.2, + "logps/chosen": -401.6317545572917, + "logps/rejected": -511.2251302083333, + "loss": 0.039, + "rewards/chosen": 5.835266537136501, + "rewards/margins": 15.471897803412544, + "rewards/rejected": -9.636631266276042, + "step": 2596 + }, + { + "epoch": 0.7117993696039469, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24696548.57142857, + "logits/rejected": -11857106.823529411, + "logps/chosen": -469.31480189732144, + "logps/rejected": -668.8839613970588, + "loss": 0.0038, + "rewards/chosen": 7.046894618443081, + "rewards/margins": 19.803861602013853, + "rewards/rejected": -12.756966983570772, + "step": 2597 + }, + { + "epoch": 0.7120734548444566, + "grad_norm": 5.6875, + "kl": 5.368436336517334, + "learning_rate": 5e-06, + "logits/chosen": -24979872.0, + "logits/rejected": -12482886.666666666, + "logps/chosen": -487.506591796875, + "logps/rejected": -703.9689127604166, + "loss": 0.0127, + "rewards/chosen": 7.370084762573242, + "rewards/margins": 19.19927406311035, + "rewards/rejected": -11.82918930053711, + "step": 2598 + }, + { + "epoch": 0.7123475400849665, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1884881.6, + "logits/rejected": -11666035.42857143, + "logps/chosen": -581.86220703125, + "logps/rejected": -597.8922293526786, + "loss": 0.0225, + "rewards/chosen": 8.544448852539062, + "rewards/margins": 19.377467564174108, + "rewards/rejected": -10.833018711635045, + "step": 2599 + }, + { + "epoch": 0.7126216253254762, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1041317.8181818182, + "logits/rejected": -16772930.461538462, + "logps/chosen": -490.7529296875, + "logps/rejected": -582.9695012019231, + "loss": 0.0099, + "rewards/chosen": 7.139339100230824, + "rewards/margins": 19.16311698860222, + "rewards/rejected": -12.023777888371395, + "step": 2600 + }, + { + "epoch": 0.712895710565986, + "grad_norm": 3.84375, + "kl": 0.24924597144126892, + "learning_rate": 5e-06, + "logits/chosen": -17407413.714285713, + "logits/rejected": -17262435.2, + "logps/chosen": -452.21634347098217, + "logps/rejected": -656.78037109375, + "loss": 0.0149, + "rewards/chosen": 7.5985919407435825, + "rewards/margins": 19.896324484688893, + "rewards/rejected": -12.297732543945312, + "step": 2601 + }, + { + "epoch": 0.7131697958064959, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23668497.066666666, + "logits/rejected": -22868821.333333332, + "logps/chosen": -391.18720703125, + "logps/rejected": -629.4321831597222, + "loss": 0.0689, + "rewards/chosen": 5.778464762369792, + "rewards/margins": 19.676724243164063, + "rewards/rejected": -13.898259480794271, + "step": 2602 + }, + { + "epoch": 0.7134438810470056, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11475128.8, + "logits/rejected": -18912928.0, + "logps/chosen": -427.32880859375, + "logps/rejected": -566.7313058035714, + "loss": 0.0083, + "rewards/chosen": 7.2669921875, + "rewards/margins": 21.902714756556918, + "rewards/rejected": -14.63572256905692, + "step": 2603 + }, + { + "epoch": 0.7137179662875154, + "grad_norm": 7.65625, + "kl": 5.387345790863037, + "learning_rate": 5e-06, + "logits/chosen": -12772664.533333333, + "logits/rejected": -14305945.777777778, + "logps/chosen": -388.43883463541664, + "logps/rejected": -489.2320963541667, + "loss": 0.0316, + "rewards/chosen": 6.957390340169271, + "rewards/margins": 20.93263634575738, + "rewards/rejected": -13.975246005588108, + "step": 2604 + }, + { + "epoch": 0.7139920515280253, + "grad_norm": 9.875, + "kl": 6.356036186218262, + "learning_rate": 5e-06, + "logits/chosen": -24569220.0, + "logits/rejected": -7228568.0, + "logps/chosen": -424.05169677734375, + "logps/rejected": -559.0640869140625, + "loss": 0.0578, + "rewards/chosen": 7.443178176879883, + "rewards/margins": 18.857048988342285, + "rewards/rejected": -11.413870811462402, + "step": 2605 + }, + { + "epoch": 0.714266136768535, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9560300.0, + "logits/rejected": -20515349.333333332, + "logps/chosen": -402.4003499348958, + "logps/rejected": -562.216796875, + "loss": 0.0029, + "rewards/chosen": 7.326997756958008, + "rewards/margins": 21.064994176228844, + "rewards/rejected": -13.737996419270834, + "step": 2606 + }, + { + "epoch": 0.7145402220090448, + "grad_norm": 10.1875, + "kl": 0.17282883822917938, + "learning_rate": 5e-06, + "logits/chosen": -43103344.0, + "logits/rejected": -22243078.0, + "logps/chosen": -438.2596740722656, + "logps/rejected": -445.7430419921875, + "loss": 0.0496, + "rewards/chosen": 6.877728462219238, + "rewards/margins": 14.77550220489502, + "rewards/rejected": -7.897773742675781, + "step": 2607 + }, + { + "epoch": 0.7148143072495546, + "grad_norm": 4.125, + "kl": 1.6081836223602295, + "learning_rate": 5e-06, + "logits/chosen": -33744759.46666667, + "logits/rejected": -82061653.33333333, + "logps/chosen": -384.42565104166664, + "logps/rejected": -594.2899848090278, + "loss": 0.0403, + "rewards/chosen": 6.090129597981771, + "rewards/margins": 20.30707312689887, + "rewards/rejected": -14.216943528917101, + "step": 2608 + }, + { + "epoch": 0.7150883924900644, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17126773.714285713, + "logits/rejected": -21184783.05882353, + "logps/chosen": -435.0693359375, + "logps/rejected": -587.8583984375, + "loss": 0.0228, + "rewards/chosen": 6.591334751674107, + "rewards/margins": 19.939990901145613, + "rewards/rejected": -13.348656149471507, + "step": 2609 + }, + { + "epoch": 0.7153624777305742, + "grad_norm": 5.0625, + "kl": 3.7785301208496094, + "learning_rate": 5e-06, + "logits/chosen": -21725004.0, + "logits/rejected": -41058376.0, + "logps/chosen": -510.3192138671875, + "logps/rejected": -669.19921875, + "loss": 0.0108, + "rewards/chosen": 7.788875102996826, + "rewards/margins": 22.07545804977417, + "rewards/rejected": -14.286582946777344, + "step": 2610 + }, + { + "epoch": 0.715636562971084, + "grad_norm": 8.25, + "kl": 5.578543663024902, + "learning_rate": 5e-06, + "logits/chosen": -22943620.57142857, + "logits/rejected": -29064896.0, + "logps/chosen": -440.3882533482143, + "logps/rejected": -603.17880859375, + "loss": 0.0113, + "rewards/chosen": 7.379636492047991, + "rewards/margins": 19.430806841169087, + "rewards/rejected": -12.051170349121094, + "step": 2611 + }, + { + "epoch": 0.7159106482115938, + "grad_norm": 6.875, + "kl": 4.422034740447998, + "learning_rate": 5e-06, + "logits/chosen": -41410385.45454545, + "logits/rejected": -35858439.384615384, + "logps/chosen": -433.8800159801136, + "logps/rejected": -482.0560772235577, + "loss": 0.0548, + "rewards/chosen": 6.203491904518821, + "rewards/margins": 15.751679160378195, + "rewards/rejected": -9.548187255859375, + "step": 2612 + }, + { + "epoch": 0.7161847334521037, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8903846.153846154, + "logits/rejected": -27268459.636363637, + "logps/chosen": -436.8256084735577, + "logps/rejected": -434.84126420454544, + "loss": 0.0136, + "rewards/chosen": 6.774804335374099, + "rewards/margins": 18.247280947811955, + "rewards/rejected": -11.472476612437855, + "step": 2613 + }, + { + "epoch": 0.7164588186926134, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55081440.0, + "logits/rejected": -16648913.333333334, + "logps/chosen": -540.3990478515625, + "logps/rejected": -537.2507731119791, + "loss": 0.019, + "rewards/chosen": 6.929258982340495, + "rewards/margins": 20.69614028930664, + "rewards/rejected": -13.766881306966146, + "step": 2614 + }, + { + "epoch": 0.7167329039331232, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37252768.0, + "logits/rejected": -31668322.666666668, + "logps/chosen": -445.1585286458333, + "logps/rejected": -664.8103434244791, + "loss": 0.0112, + "rewards/chosen": 7.742584864298503, + "rewards/margins": 21.102001825968426, + "rewards/rejected": -13.359416961669922, + "step": 2615 + }, + { + "epoch": 0.717006989173633, + "grad_norm": 4.90625, + "kl": 0.08795039355754852, + "learning_rate": 5e-06, + "logits/chosen": -16743741.090909092, + "logits/rejected": -17268966.153846152, + "logps/chosen": -415.18625710227275, + "logps/rejected": -589.3641075721154, + "loss": 0.0135, + "rewards/chosen": 7.322934237393466, + "rewards/margins": 20.777966586026277, + "rewards/rejected": -13.455032348632812, + "step": 2616 + }, + { + "epoch": 0.7172810744141428, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36734280.0, + "logits/rejected": -5946881.333333333, + "logps/chosen": -411.3572591145833, + "logps/rejected": -533.7097981770834, + "loss": 0.0241, + "rewards/chosen": 7.283722559611003, + "rewards/margins": 17.73341178894043, + "rewards/rejected": -10.449689229329428, + "step": 2617 + }, + { + "epoch": 0.7175551596546526, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9141596.666666666, + "logits/rejected": -18615553.333333332, + "logps/chosen": -360.5467122395833, + "logps/rejected": -479.8835042317708, + "loss": 0.0096, + "rewards/chosen": 5.957888285319011, + "rewards/margins": 16.21747080485026, + "rewards/rejected": -10.25958251953125, + "step": 2618 + }, + { + "epoch": 0.7178292448951624, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2129838.769230769, + "logits/rejected": -24572136.727272727, + "logps/chosen": -357.47765174278845, + "logps/rejected": -725.2999822443181, + "loss": 0.0056, + "rewards/chosen": 6.53419201190655, + "rewards/margins": 18.262070929253852, + "rewards/rejected": -11.7278789173473, + "step": 2619 + }, + { + "epoch": 0.7181033301356722, + "grad_norm": 8.8125, + "kl": 13.055513381958008, + "learning_rate": 5e-06, + "logits/chosen": -28203749.647058822, + "logits/rejected": -14678465.142857144, + "logps/chosen": -427.32286879595586, + "logps/rejected": -489.75551060267856, + "loss": 0.0208, + "rewards/chosen": 7.491857640883502, + "rewards/margins": 22.24682546663685, + "rewards/rejected": -14.754967825753349, + "step": 2620 + }, + { + "epoch": 0.7183774153761819, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33049552.0, + "logits/rejected": 48069270.4, + "logps/chosen": -430.85996791294644, + "logps/rejected": -710.462451171875, + "loss": 0.0118, + "rewards/chosen": 7.970747811453683, + "rewards/margins": 21.836842782156808, + "rewards/rejected": -13.866094970703125, + "step": 2621 + }, + { + "epoch": 0.7186515006166918, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 40117672.0, + "logits/rejected": -26303004.0, + "logps/chosen": -363.1537170410156, + "logps/rejected": -628.488525390625, + "loss": 0.021, + "rewards/chosen": 5.653667449951172, + "rewards/margins": 17.930513381958008, + "rewards/rejected": -12.276845932006836, + "step": 2622 + }, + { + "epoch": 0.7189255858572016, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42207694.222222224, + "logits/rejected": -21217305.6, + "logps/chosen": -398.2723795572917, + "logps/rejected": -507.60016276041665, + "loss": 0.0295, + "rewards/chosen": 7.790707906087239, + "rewards/margins": 21.107848612467446, + "rewards/rejected": -13.317140706380208, + "step": 2623 + }, + { + "epoch": 0.7191996710977114, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18114131.692307692, + "logits/rejected": -11058281.454545455, + "logps/chosen": -447.2825270432692, + "logps/rejected": -558.1444424715909, + "loss": 0.0213, + "rewards/chosen": 8.043950594388521, + "rewards/margins": 20.287110495400597, + "rewards/rejected": -12.243159901012074, + "step": 2624 + }, + { + "epoch": 0.7194737563382212, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38716721.45454545, + "logits/rejected": -37603763.692307696, + "logps/chosen": -420.10440340909093, + "logps/rejected": -644.1859975961538, + "loss": 0.0024, + "rewards/chosen": 6.889371004971591, + "rewards/margins": 20.190524334674116, + "rewards/rejected": -13.301153329702524, + "step": 2625 + }, + { + "epoch": 0.719747841578731, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23059124.57142857, + "logits/rejected": -8516277.6, + "logps/chosen": -434.07205636160717, + "logps/rejected": -412.666845703125, + "loss": 0.0396, + "rewards/chosen": 5.047840118408203, + "rewards/margins": 15.06540298461914, + "rewards/rejected": -10.017562866210938, + "step": 2626 + }, + { + "epoch": 0.7200219268192408, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15920224.0, + "logits/rejected": -28541712.0, + "logps/chosen": -399.9828776041667, + "logps/rejected": -512.5474175347222, + "loss": 0.0714, + "rewards/chosen": 5.4299875895182295, + "rewards/margins": 19.247387356228298, + "rewards/rejected": -13.81739976671007, + "step": 2627 + }, + { + "epoch": 0.7202960120597506, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15234440.727272727, + "logits/rejected": -24123827.692307692, + "logps/chosen": -409.4060724431818, + "logps/rejected": -737.2739633413462, + "loss": 0.0318, + "rewards/chosen": 6.320637096058238, + "rewards/margins": 25.159839976917613, + "rewards/rejected": -18.839202880859375, + "step": 2628 + }, + { + "epoch": 0.7205700973002603, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11890407.384615384, + "logits/rejected": -26820925.09090909, + "logps/chosen": -473.0417668269231, + "logps/rejected": -461.57705965909093, + "loss": 0.0359, + "rewards/chosen": 6.177974994365986, + "rewards/margins": 17.94707761110959, + "rewards/rejected": -11.769102616743607, + "step": 2629 + }, + { + "epoch": 0.7208441825407702, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13238995.555555556, + "logits/rejected": -29117013.333333332, + "logps/chosen": -502.30620659722223, + "logps/rejected": -565.0411458333333, + "loss": 0.0267, + "rewards/chosen": 8.835350884331596, + "rewards/margins": 22.946319919162327, + "rewards/rejected": -14.11096903483073, + "step": 2630 + }, + { + "epoch": 0.72111826778128, + "grad_norm": 5.75, + "kl": 6.59589147567749, + "learning_rate": 5e-06, + "logits/chosen": -16324375.466666667, + "logits/rejected": -26408284.444444444, + "logps/chosen": -386.98492838541665, + "logps/rejected": -582.5594618055555, + "loss": 0.0123, + "rewards/chosen": 7.666431172688802, + "rewards/margins": 18.87448476155599, + "rewards/rejected": -11.208053588867188, + "step": 2631 + }, + { + "epoch": 0.7213923530217897, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18346444.307692308, + "logits/rejected": -40178769.45454545, + "logps/chosen": -341.68783804086536, + "logps/rejected": -598.6545188210227, + "loss": 0.06, + "rewards/chosen": 4.641294626089243, + "rewards/margins": 18.357324400148194, + "rewards/rejected": -13.71602977405895, + "step": 2632 + }, + { + "epoch": 0.7216664382622996, + "grad_norm": 1.8671875, + "kl": 2.5956740379333496, + "learning_rate": 5e-06, + "logits/chosen": -39821383.384615384, + "logits/rejected": -33585364.36363637, + "logps/chosen": -502.35606971153845, + "logps/rejected": -469.83163174715907, + "loss": 0.005, + "rewards/chosen": 7.126857464130108, + "rewards/margins": 19.90161970445326, + "rewards/rejected": -12.774762240323154, + "step": 2633 + }, + { + "epoch": 0.7219405235028094, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11604795.555555556, + "logits/rejected": -12734685.866666667, + "logps/chosen": -294.1544596354167, + "logps/rejected": -517.0826822916666, + "loss": 0.064, + "rewards/chosen": 3.6587740580240884, + "rewards/margins": 15.644377899169921, + "rewards/rejected": -11.985603841145833, + "step": 2634 + }, + { + "epoch": 0.7222146087433192, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28300726.153846152, + "logits/rejected": -14398190.545454545, + "logps/chosen": -381.1809645432692, + "logps/rejected": -525.0435458096591, + "loss": 0.0404, + "rewards/chosen": 7.053570087139423, + "rewards/margins": 16.655742938701923, + "rewards/rejected": -9.6021728515625, + "step": 2635 + }, + { + "epoch": 0.722488693983829, + "grad_norm": 1.140625, + "kl": 0.38903936743736267, + "learning_rate": 5e-06, + "logits/chosen": -23953445.333333332, + "logits/rejected": -30151645.333333332, + "logps/chosen": -413.8717854817708, + "logps/rejected": -549.096923828125, + "loss": 0.004, + "rewards/chosen": 6.656779607137044, + "rewards/margins": 20.757815678914387, + "rewards/rejected": -14.101036071777344, + "step": 2636 + }, + { + "epoch": 0.7227627792243387, + "grad_norm": 7.25, + "kl": 11.249488830566406, + "learning_rate": 5e-06, + "logits/chosen": -19357534.769230768, + "logits/rejected": -23883226.181818184, + "logps/chosen": -531.1221829927885, + "logps/rejected": -593.3679421164773, + "loss": 0.0274, + "rewards/chosen": 8.647554250863882, + "rewards/margins": 19.95525744244769, + "rewards/rejected": -11.307703191583807, + "step": 2637 + }, + { + "epoch": 0.7230368644648486, + "grad_norm": 2.46875, + "kl": 9.155160903930664, + "learning_rate": 5e-06, + "logits/chosen": -20671814.4, + "logits/rejected": -46603064.88888889, + "logps/chosen": -442.0892578125, + "logps/rejected": -563.8114691840278, + "loss": 0.0079, + "rewards/chosen": 6.885252888997396, + "rewards/margins": 19.45135735405816, + "rewards/rejected": -12.566104465060764, + "step": 2638 + }, + { + "epoch": 0.7233109497053584, + "grad_norm": 1.140625, + "kl": 1.1110255718231201, + "learning_rate": 5e-06, + "logits/chosen": -8066301.714285715, + "logits/rejected": -23563267.2, + "logps/chosen": -526.4709123883929, + "logps/rejected": -490.35283203125, + "loss": 0.0033, + "rewards/chosen": 9.421636308942523, + "rewards/margins": 19.882070268903462, + "rewards/rejected": -10.460433959960938, + "step": 2639 + }, + { + "epoch": 0.7235850349458681, + "grad_norm": 5.5625, + "kl": 0.7222824096679688, + "learning_rate": 5e-06, + "logits/chosen": 10624040.615384616, + "logits/rejected": -17292737.454545453, + "logps/chosen": -440.3251953125, + "logps/rejected": -726.3124112215909, + "loss": 0.0103, + "rewards/chosen": 7.533931438739483, + "rewards/margins": 19.464338876150705, + "rewards/rejected": -11.93040743741122, + "step": 2640 + }, + { + "epoch": 0.723859120186378, + "grad_norm": 6.96875, + "kl": 7.835216522216797, + "learning_rate": 5e-06, + "logits/chosen": -8518411.333333334, + "logits/rejected": -17052945.333333332, + "logps/chosen": -415.7190348307292, + "logps/rejected": -678.9264322916666, + "loss": 0.0669, + "rewards/chosen": 7.65081787109375, + "rewards/margins": 23.718453725179035, + "rewards/rejected": -16.067635854085285, + "step": 2641 + }, + { + "epoch": 0.7241332054268877, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29463648.0, + "logits/rejected": -38654500.571428575, + "logps/chosen": -463.882373046875, + "logps/rejected": -634.5029296875, + "loss": 0.0048, + "rewards/chosen": 8.336727905273438, + "rewards/margins": 23.07566419328962, + "rewards/rejected": -14.738936288016182, + "step": 2642 + }, + { + "epoch": 0.7244072906673975, + "grad_norm": 3.28125, + "kl": 2.9897868633270264, + "learning_rate": 5e-06, + "logits/chosen": -37685826.666666664, + "logits/rejected": -6739244.666666667, + "logps/chosen": -398.4989013671875, + "logps/rejected": -541.1667073567709, + "loss": 0.0105, + "rewards/chosen": 7.515331268310547, + "rewards/margins": 16.12211481730143, + "rewards/rejected": -8.606783548990885, + "step": 2643 + }, + { + "epoch": 0.7246813759079074, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41832029.538461536, + "logits/rejected": -22689472.0, + "logps/chosen": -535.3582857572115, + "logps/rejected": -635.9125532670455, + "loss": 0.0043, + "rewards/chosen": 8.62957763671875, + "rewards/margins": 21.522001786665484, + "rewards/rejected": -12.892424149946732, + "step": 2644 + }, + { + "epoch": 0.7249554611484171, + "grad_norm": 6.21875, + "kl": 8.101367950439453, + "learning_rate": 5e-06, + "logits/chosen": -18781586.666666668, + "logits/rejected": -16085928.0, + "logps/chosen": -390.6508382161458, + "logps/rejected": -574.9107259114584, + "loss": 0.0396, + "rewards/chosen": 7.932188669840495, + "rewards/margins": 19.172074635823567, + "rewards/rejected": -11.239885965983072, + "step": 2645 + }, + { + "epoch": 0.725229546388927, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12266331.2, + "logits/rejected": -38845184.0, + "logps/chosen": -410.74853515625, + "logps/rejected": -681.3715122767857, + "loss": 0.0163, + "rewards/chosen": 6.155493927001953, + "rewards/margins": 20.071784210205077, + "rewards/rejected": -13.916290283203125, + "step": 2646 + }, + { + "epoch": 0.7255036316294368, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11509166.222222222, + "logits/rejected": -9818698.666666666, + "logps/chosen": -432.76337348090277, + "logps/rejected": -486.0810546875, + "loss": 0.0069, + "rewards/chosen": 7.5594329833984375, + "rewards/margins": 18.919894409179687, + "rewards/rejected": -11.36046142578125, + "step": 2647 + }, + { + "epoch": 0.7257777168699465, + "grad_norm": 7.59375, + "kl": 0.8262647390365601, + "learning_rate": 5e-06, + "logits/chosen": -27047241.846153848, + "logits/rejected": -13721460.363636363, + "logps/chosen": -326.0248272235577, + "logps/rejected": -566.8916015625, + "loss": 0.0324, + "rewards/chosen": 6.810566828801082, + "rewards/margins": 17.545033381535458, + "rewards/rejected": -10.734466552734375, + "step": 2648 + }, + { + "epoch": 0.7260518021104564, + "grad_norm": 4.21875, + "kl": 0.4468180537223816, + "learning_rate": 5e-06, + "logits/chosen": -28499811.2, + "logits/rejected": -46959341.71428572, + "logps/chosen": -367.27158203125, + "logps/rejected": -558.4281529017857, + "loss": 0.013, + "rewards/chosen": 6.025337982177734, + "rewards/margins": 16.5455689566476, + "rewards/rejected": -10.520230974469866, + "step": 2649 + }, + { + "epoch": 0.7263258873509661, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14424458.666666666, + "logits/rejected": -23843648.0, + "logps/chosen": -422.3870442708333, + "logps/rejected": -566.1597086588541, + "loss": 0.01, + "rewards/chosen": 6.761892954508464, + "rewards/margins": 20.775811513264973, + "rewards/rejected": -14.01391855875651, + "step": 2650 + }, + { + "epoch": 0.7265999725914759, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21368925.714285713, + "logits/rejected": -22719265.88235294, + "logps/chosen": -408.48880440848217, + "logps/rejected": -480.9989659926471, + "loss": 0.0045, + "rewards/chosen": 6.818302699497768, + "rewards/margins": 16.672876566398045, + "rewards/rejected": -9.854573866900276, + "step": 2651 + }, + { + "epoch": 0.7268740578319858, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1877067.5, + "logits/rejected": -24088514.666666668, + "logps/chosen": -382.2664388020833, + "logps/rejected": -508.0021158854167, + "loss": 0.0354, + "rewards/chosen": 6.698601404825847, + "rewards/margins": 16.32672627766927, + "rewards/rejected": -9.628124872843424, + "step": 2652 + }, + { + "epoch": 0.7271481430724955, + "grad_norm": 9.25, + "kl": 2.2567317485809326, + "learning_rate": 5e-06, + "logits/chosen": -22453642.666666668, + "logits/rejected": -12969736.0, + "logps/chosen": -416.3486735026042, + "logps/rejected": -635.8216552734375, + "loss": 0.0802, + "rewards/chosen": 6.615076065063477, + "rewards/margins": 21.004299799601235, + "rewards/rejected": -14.38922373453776, + "step": 2653 + }, + { + "epoch": 0.7274222283130053, + "grad_norm": 6.21875, + "kl": 4.242058753967285, + "learning_rate": 5e-06, + "logits/chosen": -20473276.0, + "logits/rejected": -5119358.0, + "logps/chosen": -390.36260986328125, + "logps/rejected": -399.891845703125, + "loss": 0.0501, + "rewards/chosen": 6.4406280517578125, + "rewards/margins": 14.36224889755249, + "rewards/rejected": -7.921620845794678, + "step": 2654 + }, + { + "epoch": 0.7276963135535152, + "grad_norm": 11.75, + "kl": 14.508136749267578, + "learning_rate": 5e-06, + "logits/chosen": -4456695.142857143, + "logits/rejected": -21699168.0, + "logps/chosen": -480.56131417410717, + "logps/rejected": -528.299755859375, + "loss": 0.1056, + "rewards/chosen": 5.676660810198102, + "rewards/margins": 15.570133863176618, + "rewards/rejected": -9.893473052978516, + "step": 2655 + }, + { + "epoch": 0.7279703987940249, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -590743.0, + "logits/rejected": -23004092.0, + "logps/chosen": -406.42864990234375, + "logps/rejected": -466.5083923339844, + "loss": 0.0332, + "rewards/chosen": 5.842583656311035, + "rewards/margins": 15.703547477722168, + "rewards/rejected": -9.860963821411133, + "step": 2656 + }, + { + "epoch": 0.7282444840345348, + "grad_norm": 4.59375, + "kl": 4.000467300415039, + "learning_rate": 5e-06, + "logits/chosen": -23862807.272727273, + "logits/rejected": -10308596.307692308, + "logps/chosen": -396.99365234375, + "logps/rejected": -487.23550180288464, + "loss": 0.0709, + "rewards/chosen": 5.815146012739702, + "rewards/margins": 16.69126342560028, + "rewards/rejected": -10.876117412860577, + "step": 2657 + }, + { + "epoch": 0.7285185692750445, + "grad_norm": 7.125, + "kl": 6.423708915710449, + "learning_rate": 5e-06, + "logits/chosen": -16287708.57142857, + "logits/rejected": -14522548.8, + "logps/chosen": -407.4361049107143, + "logps/rejected": -330.4301025390625, + "loss": 0.0378, + "rewards/chosen": 6.792657034737723, + "rewards/margins": 12.731528799874443, + "rewards/rejected": -5.938871765136719, + "step": 2658 + }, + { + "epoch": 0.7287926545155543, + "grad_norm": 3.578125, + "kl": 11.26325511932373, + "learning_rate": 5e-06, + "logits/chosen": -10232261.333333334, + "logits/rejected": -33049280.0, + "logps/chosen": -424.73072916666666, + "logps/rejected": -502.05653211805554, + "loss": 0.0166, + "rewards/chosen": 7.164010111490885, + "rewards/margins": 17.092669338650175, + "rewards/rejected": -9.928659227159288, + "step": 2659 + }, + { + "epoch": 0.7290667397560642, + "grad_norm": 11.125, + "kl": 2.549776792526245, + "learning_rate": 5e-06, + "logits/chosen": -17107891.692307692, + "logits/rejected": -13918266.181818182, + "logps/chosen": -495.7174729567308, + "logps/rejected": -480.87215909090907, + "loss": 0.0404, + "rewards/chosen": 8.247179471529448, + "rewards/margins": 17.78483400144777, + "rewards/rejected": -9.537654529918324, + "step": 2660 + }, + { + "epoch": 0.7293408249965739, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14465568.888888888, + "logits/rejected": -22791622.4, + "logps/chosen": -483.24609375, + "logps/rejected": -595.3781901041667, + "loss": 0.0053, + "rewards/chosen": 7.443887498643663, + "rewards/margins": 20.73209550645616, + "rewards/rejected": -13.2882080078125, + "step": 2661 + }, + { + "epoch": 0.7296149102370837, + "grad_norm": 7.90625, + "kl": 1.6464920043945312, + "learning_rate": 5e-06, + "logits/chosen": 436384.6153846154, + "logits/rejected": -20496110.545454547, + "logps/chosen": -447.39246544471155, + "logps/rejected": -449.4689275568182, + "loss": 0.0333, + "rewards/chosen": 6.689586932842548, + "rewards/margins": 17.04253451474063, + "rewards/rejected": -10.352947581898082, + "step": 2662 + }, + { + "epoch": 0.7298889954775936, + "grad_norm": 4.96875, + "kl": 4.639494895935059, + "learning_rate": 5e-06, + "logits/chosen": -14362264.615384616, + "logits/rejected": -6753346.909090909, + "logps/chosen": -359.55337289663464, + "logps/rejected": -364.4074041193182, + "loss": 0.0267, + "rewards/chosen": 6.950752258300781, + "rewards/margins": 13.3333608453924, + "rewards/rejected": -6.382608587091619, + "step": 2663 + }, + { + "epoch": 0.7301630807181033, + "grad_norm": 1.5546875, + "kl": 2.6637420654296875, + "learning_rate": 5e-06, + "logits/chosen": -9587412.923076924, + "logits/rejected": -43200637.09090909, + "logps/chosen": -337.8242938701923, + "logps/rejected": -519.5831409801136, + "loss": 0.0068, + "rewards/chosen": 7.033336345966045, + "rewards/margins": 19.36573103257826, + "rewards/rejected": -12.332394686612217, + "step": 2664 + }, + { + "epoch": 0.7304371659586131, + "grad_norm": 3.34375, + "kl": 7.239335536956787, + "learning_rate": 5e-06, + "logits/chosen": -5790277.333333333, + "logits/rejected": -29709900.8, + "logps/chosen": -441.68994140625, + "logps/rejected": -591.6975260416667, + "loss": 0.0146, + "rewards/chosen": 8.48467763264974, + "rewards/margins": 19.307146708170574, + "rewards/rejected": -10.822469075520834, + "step": 2665 + }, + { + "epoch": 0.730711251199123, + "grad_norm": 6.09375, + "kl": 3.5676207542419434, + "learning_rate": 5e-06, + "logits/chosen": 12558645.333333334, + "logits/rejected": -26655235.555555556, + "logps/chosen": -437.54401041666665, + "logps/rejected": -390.089599609375, + "loss": 0.0249, + "rewards/chosen": 7.190079752604166, + "rewards/margins": 16.055739678276908, + "rewards/rejected": -8.865659925672743, + "step": 2666 + }, + { + "epoch": 0.7309853364396327, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1859396.0, + "logits/rejected": -3254543.6, + "logps/chosen": -323.53585379464283, + "logps/rejected": -539.581689453125, + "loss": 0.0535, + "rewards/chosen": 6.243833814348493, + "rewards/margins": 16.00331028529576, + "rewards/rejected": -9.759476470947266, + "step": 2667 + }, + { + "epoch": 0.7312594216801426, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13114403.42857143, + "logits/rejected": -18984442.352941178, + "logps/chosen": -406.21435546875, + "logps/rejected": -439.49333639705884, + "loss": 0.0532, + "rewards/chosen": 5.770454406738281, + "rewards/margins": 14.412145726821002, + "rewards/rejected": -8.641691320082721, + "step": 2668 + }, + { + "epoch": 0.7315335069206523, + "grad_norm": 2.390625, + "kl": 9.330307006835938, + "learning_rate": 5e-06, + "logits/chosen": -32738601.846153848, + "logits/rejected": -21724002.90909091, + "logps/chosen": -547.0467998798077, + "logps/rejected": -484.0672052556818, + "loss": 0.0043, + "rewards/chosen": 9.09197998046875, + "rewards/margins": 17.660683371803977, + "rewards/rejected": -8.568703391335227, + "step": 2669 + }, + { + "epoch": 0.7318075921611621, + "grad_norm": 1.5078125, + "kl": 3.192103147506714, + "learning_rate": 5e-06, + "logits/chosen": -24739868.0, + "logits/rejected": -22145558.0, + "logps/chosen": -347.2518310546875, + "logps/rejected": -583.3622436523438, + "loss": 0.0047, + "rewards/chosen": 7.662715911865234, + "rewards/margins": 19.51686954498291, + "rewards/rejected": -11.854153633117676, + "step": 2670 + }, + { + "epoch": 0.732081677401672, + "grad_norm": 0.69921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30079112.727272727, + "logits/rejected": 12061996.307692308, + "logps/chosen": -404.9815784801136, + "logps/rejected": -557.63525390625, + "loss": 0.0017, + "rewards/chosen": 8.522439436479049, + "rewards/margins": 20.939773186103448, + "rewards/rejected": -12.4173337496244, + "step": 2671 + }, + { + "epoch": 0.7323557626421817, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41582450.666666664, + "logits/rejected": -2060945.0, + "logps/chosen": -374.5331217447917, + "logps/rejected": -551.2835693359375, + "loss": 0.0485, + "rewards/chosen": 6.556082407633464, + "rewards/margins": 17.076361338297527, + "rewards/rejected": -10.520278930664062, + "step": 2672 + }, + { + "epoch": 0.7326298478826915, + "grad_norm": 4.875, + "kl": 7.356524467468262, + "learning_rate": 5e-06, + "logits/chosen": 9911488.0, + "logits/rejected": -3925164.4444444445, + "logps/chosen": -578.3005859375, + "logps/rejected": -476.63628472222223, + "loss": 0.0114, + "rewards/chosen": 7.394559733072916, + "rewards/margins": 17.06181131998698, + "rewards/rejected": -9.667251586914062, + "step": 2673 + }, + { + "epoch": 0.7329039331232013, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26565868.307692308, + "logits/rejected": -29944183.272727273, + "logps/chosen": -329.37582632211536, + "logps/rejected": -642.3413529829545, + "loss": 0.0441, + "rewards/chosen": 6.750729487492488, + "rewards/margins": 21.983664612670044, + "rewards/rejected": -15.232935125177557, + "step": 2674 + }, + { + "epoch": 0.7331780183637111, + "grad_norm": 7.125, + "kl": 2.4564433097839355, + "learning_rate": 5e-06, + "logits/chosen": -23621222.4, + "logits/rejected": -28712146.285714287, + "logps/chosen": -380.5232666015625, + "logps/rejected": -580.865234375, + "loss": 0.0307, + "rewards/chosen": 6.953469085693359, + "rewards/margins": 19.727621568952287, + "rewards/rejected": -12.774152483258929, + "step": 2675 + }, + { + "epoch": 0.7334521036042209, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17571066.666666668, + "logits/rejected": -11420470.666666666, + "logps/chosen": -394.8638916015625, + "logps/rejected": -550.5489095052084, + "loss": 0.0243, + "rewards/chosen": 7.3489030202229815, + "rewards/margins": 18.798343022664387, + "rewards/rejected": -11.449440002441406, + "step": 2676 + }, + { + "epoch": 0.7337261888447307, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8887977.454545455, + "logits/rejected": -14861508.923076924, + "logps/chosen": -500.0654296875, + "logps/rejected": -374.0075871394231, + "loss": 0.0569, + "rewards/chosen": 7.048998746004972, + "rewards/margins": 17.236712369051848, + "rewards/rejected": -10.187713623046875, + "step": 2677 + }, + { + "epoch": 0.7340002740852405, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33299214.222222224, + "logits/rejected": -15722738.133333333, + "logps/chosen": -468.1647135416667, + "logps/rejected": -627.5692057291667, + "loss": 0.0052, + "rewards/chosen": 8.221702575683594, + "rewards/margins": 21.97362314860026, + "rewards/rejected": -13.751920572916667, + "step": 2678 + }, + { + "epoch": 0.7342743593257504, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20097604.0, + "logits/rejected": -347573.0, + "logps/chosen": -466.0044250488281, + "logps/rejected": -604.3165893554688, + "loss": 0.036, + "rewards/chosen": 7.389938831329346, + "rewards/margins": 19.85542917251587, + "rewards/rejected": -12.465490341186523, + "step": 2679 + }, + { + "epoch": 0.7345484445662601, + "grad_norm": 2.84375, + "kl": 0.4533348083496094, + "learning_rate": 5e-06, + "logits/chosen": -22709916.444444444, + "logits/rejected": -9480025.6, + "logps/chosen": -565.3901909722222, + "logps/rejected": -538.9626953125, + "loss": 0.0068, + "rewards/chosen": 8.878650241427952, + "rewards/margins": 18.555783420138887, + "rewards/rejected": -9.677133178710937, + "step": 2680 + }, + { + "epoch": 0.7348225298067699, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31114008.888888888, + "logits/rejected": -14157173.333333334, + "logps/chosen": -433.6559244791667, + "logps/rejected": -550.4983723958334, + "loss": 0.0131, + "rewards/chosen": 6.20028813680013, + "rewards/margins": 19.885439300537108, + "rewards/rejected": -13.685151163736979, + "step": 2681 + }, + { + "epoch": 0.7350966150472797, + "grad_norm": 12.8125, + "kl": 3.033669948577881, + "learning_rate": 5e-06, + "logits/chosen": -20267140.363636363, + "logits/rejected": -22273947.076923076, + "logps/chosen": -353.97099165482956, + "logps/rejected": -421.50157752403845, + "loss": 0.0645, + "rewards/chosen": 5.686582391912287, + "rewards/margins": 14.62991829185219, + "rewards/rejected": -8.943335899939903, + "step": 2682 + }, + { + "epoch": 0.7353707002877895, + "grad_norm": 4.09375, + "kl": 0.7302157282829285, + "learning_rate": 5e-06, + "logits/chosen": -22874760.0, + "logits/rejected": -9424528.0, + "logps/chosen": -386.0685221354167, + "logps/rejected": -426.5681966145833, + "loss": 0.0153, + "rewards/chosen": 8.218021392822266, + "rewards/margins": 19.67214330037435, + "rewards/rejected": -11.454121907552084, + "step": 2683 + }, + { + "epoch": 0.7356447855282993, + "grad_norm": 6.84375, + "kl": 0.6203429102897644, + "learning_rate": 5e-06, + "logits/chosen": -19367044.8, + "logits/rejected": -17361378.285714287, + "logps/chosen": -382.373828125, + "logps/rejected": -372.33412388392856, + "loss": 0.0217, + "rewards/chosen": 7.104678344726563, + "rewards/margins": 16.702584184919086, + "rewards/rejected": -9.597905840192523, + "step": 2684 + }, + { + "epoch": 0.7359188707688091, + "grad_norm": 4.78125, + "kl": 0.47612762451171875, + "learning_rate": 5e-06, + "logits/chosen": -33877414.4, + "logits/rejected": -12835200.888888888, + "logps/chosen": -350.3816731770833, + "logps/rejected": -486.6467013888889, + "loss": 0.0167, + "rewards/chosen": 6.806805928548177, + "rewards/margins": 19.7803471883138, + "rewards/rejected": -12.973541259765625, + "step": 2685 + }, + { + "epoch": 0.7361929560093189, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14760059.076923076, + "logits/rejected": -29002356.363636363, + "logps/chosen": -304.20179161658655, + "logps/rejected": -457.8099254261364, + "loss": 0.0604, + "rewards/chosen": 6.053218548114483, + "rewards/margins": 15.882213299091045, + "rewards/rejected": -9.828994750976562, + "step": 2686 + }, + { + "epoch": 0.7364670412498286, + "grad_norm": 6.78125, + "kl": 2.9663150310516357, + "learning_rate": 5e-06, + "logits/chosen": -32558716.8, + "logits/rejected": 9230676.0, + "logps/chosen": -419.816015625, + "logps/rejected": -715.5191127232143, + "loss": 0.0178, + "rewards/chosen": 6.926384735107422, + "rewards/margins": 21.50123563494001, + "rewards/rejected": -14.574850899832589, + "step": 2687 + }, + { + "epoch": 0.7367411264903385, + "grad_norm": 12.875, + "kl": 4.707769393920898, + "learning_rate": 5e-06, + "logits/chosen": -28731330.666666668, + "logits/rejected": -18602218.666666668, + "logps/chosen": -361.1649576822917, + "logps/rejected": -552.7494303385416, + "loss": 0.059, + "rewards/chosen": 5.571547190348308, + "rewards/margins": 17.42926534016927, + "rewards/rejected": -11.857718149820963, + "step": 2688 + }, + { + "epoch": 0.7370152117308483, + "grad_norm": 8.625, + "kl": 1.8038686513900757, + "learning_rate": 5e-06, + "logits/chosen": -7621378.666666667, + "logits/rejected": -33234514.666666668, + "logps/chosen": -410.6732584635417, + "logps/rejected": -705.8971354166666, + "loss": 0.0331, + "rewards/chosen": 8.475722630818685, + "rewards/margins": 19.64297421773275, + "rewards/rejected": -11.167251586914062, + "step": 2689 + }, + { + "epoch": 0.7372892969713581, + "grad_norm": 1.34375, + "kl": 0.9315058588981628, + "learning_rate": 5e-06, + "logits/chosen": -13841827.692307692, + "logits/rejected": -49173288.72727273, + "logps/chosen": -506.80528846153845, + "logps/rejected": -502.6536754261364, + "loss": 0.0029, + "rewards/chosen": 9.53683589054988, + "rewards/margins": 19.33441002052147, + "rewards/rejected": -9.797574129971592, + "step": 2690 + }, + { + "epoch": 0.7375633822118679, + "grad_norm": 12.4375, + "kl": 6.031213283538818, + "learning_rate": 5e-06, + "logits/chosen": -21074141.866666667, + "logits/rejected": -28819873.777777776, + "logps/chosen": -449.9322265625, + "logps/rejected": -563.4959852430555, + "loss": 0.0928, + "rewards/chosen": 6.860644022623698, + "rewards/margins": 18.51048533121745, + "rewards/rejected": -11.64984130859375, + "step": 2691 + }, + { + "epoch": 0.7378374674523777, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42777989.333333336, + "logits/rejected": -25452802.666666668, + "logps/chosen": -252.07084147135416, + "logps/rejected": -614.5467122395834, + "loss": 0.0643, + "rewards/chosen": 4.269676844278972, + "rewards/margins": 16.808427810668945, + "rewards/rejected": -12.538750966389975, + "step": 2692 + }, + { + "epoch": 0.7381115526928875, + "grad_norm": 7.59375, + "kl": 2.275054454803467, + "learning_rate": 5e-06, + "logits/chosen": -19313840.0, + "logits/rejected": -3923576.8, + "logps/chosen": -403.49183872767856, + "logps/rejected": -606.36015625, + "loss": 0.074, + "rewards/chosen": 5.261073521205357, + "rewards/margins": 16.232603672572544, + "rewards/rejected": -10.971530151367187, + "step": 2693 + }, + { + "epoch": 0.7383856379333973, + "grad_norm": 4.59375, + "kl": 2.4275588989257812, + "learning_rate": 5e-06, + "logits/chosen": -26998626.46153846, + "logits/rejected": -20452834.90909091, + "logps/chosen": -502.2555588942308, + "logps/rejected": -563.9434481534091, + "loss": 0.0231, + "rewards/chosen": 6.957728459284856, + "rewards/margins": 20.12239576219679, + "rewards/rejected": -13.164667302911932, + "step": 2694 + }, + { + "epoch": 0.738659723173907, + "grad_norm": 8.5, + "kl": 6.591874599456787, + "learning_rate": 5e-06, + "logits/chosen": -13354048.0, + "logits/rejected": -17558184.0, + "logps/chosen": -327.16033935546875, + "logps/rejected": -491.4618225097656, + "loss": 0.081, + "rewards/chosen": 5.672626495361328, + "rewards/margins": 15.484601974487305, + "rewards/rejected": -9.811975479125977, + "step": 2695 + }, + { + "epoch": 0.7389338084144169, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23326512.0, + "logits/rejected": -13556891.0, + "logps/chosen": -434.98822021484375, + "logps/rejected": -535.5385131835938, + "loss": 0.0301, + "rewards/chosen": 7.904122829437256, + "rewards/margins": 19.703553676605225, + "rewards/rejected": -11.799430847167969, + "step": 2696 + }, + { + "epoch": 0.7392078936549267, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20074132.923076924, + "logits/rejected": -41803296.0, + "logps/chosen": -356.7965745192308, + "logps/rejected": -663.5252574573864, + "loss": 0.0303, + "rewards/chosen": 4.801141885610727, + "rewards/margins": 20.729324314144108, + "rewards/rejected": -15.92818242853338, + "step": 2697 + }, + { + "epoch": 0.7394819788954364, + "grad_norm": 7.375, + "kl": 1.4415035247802734, + "learning_rate": 5e-06, + "logits/chosen": -30114603.42857143, + "logits/rejected": -13243134.4, + "logps/chosen": -477.9063197544643, + "logps/rejected": -501.009423828125, + "loss": 0.015, + "rewards/chosen": 7.140796661376953, + "rewards/margins": 18.808121490478516, + "rewards/rejected": -11.667324829101563, + "step": 2698 + }, + { + "epoch": 0.7397560641359463, + "grad_norm": 1.375, + "kl": 3.1477432250976562, + "learning_rate": 5e-06, + "logits/chosen": -23600864.0, + "logits/rejected": -20144937.333333332, + "logps/chosen": -457.1627604166667, + "logps/rejected": -529.58251953125, + "loss": 0.0038, + "rewards/chosen": 7.380817413330078, + "rewards/margins": 19.186308542887367, + "rewards/rejected": -11.805491129557291, + "step": 2699 + }, + { + "epoch": 0.7400301493764561, + "grad_norm": 12.9375, + "kl": 7.984150409698486, + "learning_rate": 5e-06, + "logits/chosen": -19824832.0, + "logits/rejected": -33133641.6, + "logps/chosen": -344.44789341517856, + "logps/rejected": -502.24052734375, + "loss": 0.0682, + "rewards/chosen": 6.361656188964844, + "rewards/margins": 15.731674194335938, + "rewards/rejected": -9.370018005371094, + "step": 2700 + }, + { + "epoch": 0.7403042346169659, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3246681.1428571427, + "logits/rejected": -20444492.8, + "logps/chosen": -437.06138392857144, + "logps/rejected": -488.16240234375, + "loss": 0.0689, + "rewards/chosen": 6.747227260044643, + "rewards/margins": 15.07874276297433, + "rewards/rejected": -8.331515502929687, + "step": 2701 + }, + { + "epoch": 0.7405783198574757, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17645602.666666668, + "logits/rejected": -13632040.0, + "logps/chosen": -361.2789713541667, + "logps/rejected": -738.81396484375, + "loss": 0.025, + "rewards/chosen": 6.29255739847819, + "rewards/margins": 22.557297388712566, + "rewards/rejected": -16.264739990234375, + "step": 2702 + }, + { + "epoch": 0.7408524050979854, + "grad_norm": 5.6875, + "kl": 8.14381217956543, + "learning_rate": 5e-06, + "logits/chosen": -20145149.714285713, + "logits/rejected": 7958145.882352941, + "logps/chosen": -403.0532924107143, + "logps/rejected": -537.9503676470588, + "loss": 0.0164, + "rewards/chosen": 6.654112134660993, + "rewards/margins": 19.87265479464491, + "rewards/rejected": -13.218542659983916, + "step": 2703 + }, + { + "epoch": 0.7411264903384953, + "grad_norm": 5.5625, + "kl": 10.222280502319336, + "learning_rate": 5e-06, + "logits/chosen": 3853473.230769231, + "logits/rejected": -43322225.45454545, + "logps/chosen": -507.53617037259613, + "logps/rejected": -542.2218128551136, + "loss": 0.0523, + "rewards/chosen": 8.617762052095854, + "rewards/margins": 21.3530359334879, + "rewards/rejected": -12.735273881392045, + "step": 2704 + }, + { + "epoch": 0.7414005755790051, + "grad_norm": 9.0, + "kl": 5.414065837860107, + "learning_rate": 5e-06, + "logits/chosen": -7149730.0, + "logits/rejected": -18844080.0, + "logps/chosen": -369.1717529296875, + "logps/rejected": -374.7088317871094, + "loss": 0.0777, + "rewards/chosen": 6.8678388595581055, + "rewards/margins": 15.225717544555664, + "rewards/rejected": -8.357878684997559, + "step": 2705 + }, + { + "epoch": 0.7416746608195148, + "grad_norm": 3.0625, + "kl": 4.72934627532959, + "learning_rate": 5e-06, + "logits/chosen": -11517253.0, + "logits/rejected": -9744127.0, + "logps/chosen": -340.8044738769531, + "logps/rejected": -492.0197448730469, + "loss": 0.0483, + "rewards/chosen": 5.96434211730957, + "rewards/margins": 14.80648136138916, + "rewards/rejected": -8.84213924407959, + "step": 2706 + }, + { + "epoch": 0.7419487460600247, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23483281.777777776, + "logits/rejected": -36414685.86666667, + "logps/chosen": -403.9538302951389, + "logps/rejected": -600.3591145833333, + "loss": 0.0072, + "rewards/chosen": 7.825984530978733, + "rewards/margins": 19.032791985405815, + "rewards/rejected": -11.206807454427084, + "step": 2707 + }, + { + "epoch": 0.7422228313005345, + "grad_norm": 10.875, + "kl": 9.122020721435547, + "learning_rate": 5e-06, + "logits/chosen": -9592632.0, + "logits/rejected": -21673325.333333332, + "logps/chosen": -499.9569091796875, + "logps/rejected": -605.9954427083334, + "loss": 0.0474, + "rewards/chosen": 7.771870930989583, + "rewards/margins": 18.419994354248047, + "rewards/rejected": -10.648123423258463, + "step": 2708 + }, + { + "epoch": 0.7424969165410442, + "grad_norm": 3.828125, + "kl": 1.7049955129623413, + "learning_rate": 5e-06, + "logits/chosen": -29805609.411764707, + "logits/rejected": -19518891.42857143, + "logps/chosen": -373.91647518382354, + "logps/rejected": -355.66904994419644, + "loss": 0.0134, + "rewards/chosen": 6.804925357594209, + "rewards/margins": 16.155485137170103, + "rewards/rejected": -9.350559779575892, + "step": 2709 + }, + { + "epoch": 0.7427710017815541, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28922334.222222224, + "logits/rejected": 10980142.933333334, + "logps/chosen": -364.59619140625, + "logps/rejected": -733.05, + "loss": 0.0056, + "rewards/chosen": 6.160168117947048, + "rewards/margins": 23.552662828233508, + "rewards/rejected": -17.39249471028646, + "step": 2710 + }, + { + "epoch": 0.7430450870220638, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3961905.777777778, + "logits/rejected": -19583360.0, + "logps/chosen": -378.3442111545139, + "logps/rejected": -583.7723958333333, + "loss": 0.0301, + "rewards/chosen": 7.803306579589844, + "rewards/margins": 19.08994801839193, + "rewards/rejected": -11.286641438802084, + "step": 2711 + }, + { + "epoch": 0.7433191722625737, + "grad_norm": 7.96875, + "kl": 1.345346450805664, + "learning_rate": 5e-06, + "logits/chosen": -25207694.769230768, + "logits/rejected": 20514605.09090909, + "logps/chosen": -384.21548227163464, + "logps/rejected": -692.5055042613636, + "loss": 0.0638, + "rewards/chosen": 6.101988572340745, + "rewards/margins": 15.49294995928144, + "rewards/rejected": -9.390961386940695, + "step": 2712 + }, + { + "epoch": 0.7435932575030835, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7485404.363636363, + "logits/rejected": -6565165.538461538, + "logps/chosen": -549.1693004261364, + "logps/rejected": -453.6696213942308, + "loss": 0.0052, + "rewards/chosen": 8.236290671608664, + "rewards/margins": 18.123953119024527, + "rewards/rejected": -9.887662447415865, + "step": 2713 + }, + { + "epoch": 0.7438673427435932, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3801290.1818181816, + "logits/rejected": -20813449.846153848, + "logps/chosen": -416.0980113636364, + "logps/rejected": -610.6669170673077, + "loss": 0.0147, + "rewards/chosen": 5.925612016157671, + "rewards/margins": 17.842815479198535, + "rewards/rejected": -11.917203463040865, + "step": 2714 + }, + { + "epoch": 0.7441414279841031, + "grad_norm": 7.90625, + "kl": 3.0875232219696045, + "learning_rate": 5e-06, + "logits/chosen": -14519560.727272727, + "logits/rejected": 5525094.769230769, + "logps/chosen": -545.3326526988636, + "logps/rejected": -685.5635516826923, + "loss": 0.0218, + "rewards/chosen": 7.926984613591975, + "rewards/margins": 23.105278868775265, + "rewards/rejected": -15.178294255183292, + "step": 2715 + }, + { + "epoch": 0.7444155132246129, + "grad_norm": 1.21875, + "kl": 5.225282669067383, + "learning_rate": 5e-06, + "logits/chosen": -69410156.3076923, + "logits/rejected": -37153605.81818182, + "logps/chosen": -594.1336763822115, + "logps/rejected": -809.6459517045455, + "loss": 0.0028, + "rewards/chosen": 9.220966045673077, + "rewards/margins": 25.564661946330038, + "rewards/rejected": -16.34369590065696, + "step": 2716 + }, + { + "epoch": 0.7446895984651226, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21696764.444444444, + "logits/rejected": -3079723.7333333334, + "logps/chosen": -392.79248046875, + "logps/rejected": -651.6735026041666, + "loss": 0.0139, + "rewards/chosen": 7.262157864040798, + "rewards/margins": 18.309708319769967, + "rewards/rejected": -11.047550455729167, + "step": 2717 + }, + { + "epoch": 0.7449636837056325, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28339538.666666668, + "logits/rejected": -9436904.0, + "logps/chosen": -441.1502685546875, + "logps/rejected": -560.123779296875, + "loss": 0.0529, + "rewards/chosen": 6.932758967081706, + "rewards/margins": 17.18712552388509, + "rewards/rejected": -10.254366556803385, + "step": 2718 + }, + { + "epoch": 0.7452377689461422, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20570290.666666668, + "logits/rejected": -395989.3333333333, + "logps/chosen": -519.0508626302084, + "logps/rejected": -596.6267496744791, + "loss": 0.0081, + "rewards/chosen": 8.867019653320312, + "rewards/margins": 21.301756540934242, + "rewards/rejected": -12.434736887613932, + "step": 2719 + }, + { + "epoch": 0.745511854186652, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52358848.0, + "logits/rejected": -22695570.82352941, + "logps/chosen": -430.2479771205357, + "logps/rejected": -473.55870863970586, + "loss": 0.0255, + "rewards/chosen": 7.260049547467913, + "rewards/margins": 17.936766071479862, + "rewards/rejected": -10.67671652401195, + "step": 2720 + }, + { + "epoch": 0.7457859394271619, + "grad_norm": 3.125, + "kl": 0.11250432580709457, + "learning_rate": 5e-06, + "logits/chosen": -17661312.0, + "logits/rejected": -31243225.6, + "logps/chosen": -371.06821986607144, + "logps/rejected": -560.464453125, + "loss": 0.0066, + "rewards/chosen": 7.718727111816406, + "rewards/margins": 20.592347717285158, + "rewards/rejected": -12.87362060546875, + "step": 2721 + }, + { + "epoch": 0.7460600246676716, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7465828.0, + "logits/rejected": -19918285.714285713, + "logps/chosen": -364.5410400390625, + "logps/rejected": -424.9153529575893, + "loss": 0.0406, + "rewards/chosen": 6.161744689941406, + "rewards/margins": 15.062951115199498, + "rewards/rejected": -8.901206425258092, + "step": 2722 + }, + { + "epoch": 0.7463341099081815, + "grad_norm": 5.46875, + "kl": 5.474950313568115, + "learning_rate": 5e-06, + "logits/chosen": -8948506.181818182, + "logits/rejected": -10779616.0, + "logps/chosen": -369.56394264914775, + "logps/rejected": -691.7393329326923, + "loss": 0.0227, + "rewards/chosen": 6.408914739435369, + "rewards/margins": 18.63317316228693, + "rewards/rejected": -12.224258422851562, + "step": 2723 + }, + { + "epoch": 0.7466081951486913, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -97616.3, + "logits/rejected": -17057113.14285714, + "logps/chosen": -384.51748046875, + "logps/rejected": -439.1149204799107, + "loss": 0.0341, + "rewards/chosen": 7.227957916259766, + "rewards/margins": 17.108730098179407, + "rewards/rejected": -9.880772181919642, + "step": 2724 + }, + { + "epoch": 0.746882280389201, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10509120.0, + "logits/rejected": -17512349.09090909, + "logps/chosen": -398.5988581730769, + "logps/rejected": -462.60764382102275, + "loss": 0.0298, + "rewards/chosen": 7.187856820913462, + "rewards/margins": 17.57561733005764, + "rewards/rejected": -10.387760509144176, + "step": 2725 + }, + { + "epoch": 0.7471563656297109, + "grad_norm": 14.125, + "kl": 4.8497419357299805, + "learning_rate": 5e-06, + "logits/chosen": -22252985.333333332, + "logits/rejected": -10599402.0, + "logps/chosen": -349.8673909505208, + "logps/rejected": -407.8448079427083, + "loss": 0.0534, + "rewards/chosen": 7.15899658203125, + "rewards/margins": 15.66872787475586, + "rewards/rejected": -8.50973129272461, + "step": 2726 + }, + { + "epoch": 0.7474304508702206, + "grad_norm": 2.34375, + "kl": 1.6240921020507812, + "learning_rate": 5e-06, + "logits/chosen": 9771.09090909091, + "logits/rejected": -24846429.53846154, + "logps/chosen": -528.6538529829545, + "logps/rejected": -853.9887319711538, + "loss": 0.0063, + "rewards/chosen": 7.323668740012429, + "rewards/margins": 25.157546303488992, + "rewards/rejected": -17.833877563476562, + "step": 2727 + }, + { + "epoch": 0.7477045361107304, + "grad_norm": 1.8984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16770327.0, + "logits/rejected": -21987540.0, + "logps/chosen": -402.4157409667969, + "logps/rejected": -533.7528686523438, + "loss": 0.0037, + "rewards/chosen": 6.8133039474487305, + "rewards/margins": 21.22376251220703, + "rewards/rejected": -14.4104585647583, + "step": 2728 + }, + { + "epoch": 0.7479786213512403, + "grad_norm": 2.59375, + "kl": 0.7968101501464844, + "learning_rate": 5e-06, + "logits/chosen": -6740416.0, + "logits/rejected": -15835468.307692308, + "logps/chosen": -379.58345170454544, + "logps/rejected": -460.7274639423077, + "loss": 0.0087, + "rewards/chosen": 7.763413862748579, + "rewards/margins": 18.87491100818127, + "rewards/rejected": -11.111497145432692, + "step": 2729 + }, + { + "epoch": 0.74825270659175, + "grad_norm": 1.3515625, + "kl": 9.00290584564209, + "learning_rate": 5e-06, + "logits/chosen": 1671782.5, + "logits/rejected": 33299524.0, + "logps/chosen": -374.1004943847656, + "logps/rejected": -704.9652099609375, + "loss": 0.0037, + "rewards/chosen": 8.541259765625, + "rewards/margins": 25.27133560180664, + "rewards/rejected": -16.73007583618164, + "step": 2730 + }, + { + "epoch": 0.7485267918322598, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26151072.0, + "logits/rejected": -10408676.0, + "logps/chosen": -439.2056579589844, + "logps/rejected": -412.05889892578125, + "loss": 0.0017, + "rewards/chosen": 7.123013973236084, + "rewards/margins": 17.488196849822998, + "rewards/rejected": -10.365182876586914, + "step": 2731 + }, + { + "epoch": 0.7488008770727697, + "grad_norm": 2.71875, + "kl": 0.2917823791503906, + "learning_rate": 5e-06, + "logits/chosen": -4629557.230769231, + "logits/rejected": -39143831.27272727, + "logps/chosen": -499.44249549278845, + "logps/rejected": -611.1733842329545, + "loss": 0.0063, + "rewards/chosen": 7.607479388897236, + "rewards/margins": 19.917750218531467, + "rewards/rejected": -12.310270829634232, + "step": 2732 + }, + { + "epoch": 0.7490749623132794, + "grad_norm": 1.515625, + "kl": 6.904669761657715, + "learning_rate": 5e-06, + "logits/chosen": -10589012.0, + "logits/rejected": 45044188.0, + "logps/chosen": -434.1133117675781, + "logps/rejected": -550.5843505859375, + "loss": 0.0395, + "rewards/chosen": 9.77753734588623, + "rewards/margins": 26.50007152557373, + "rewards/rejected": -16.7225341796875, + "step": 2733 + }, + { + "epoch": 0.7493490475537893, + "grad_norm": 1.671875, + "kl": 0.37591552734375, + "learning_rate": 5e-06, + "logits/chosen": -49024413.09090909, + "logits/rejected": -18218036.923076924, + "logps/chosen": -479.75803444602275, + "logps/rejected": -786.2782451923077, + "loss": 0.0039, + "rewards/chosen": 8.57069951837713, + "rewards/margins": 22.057331645405377, + "rewards/rejected": -13.486632127028246, + "step": 2734 + }, + { + "epoch": 0.749623132794299, + "grad_norm": 13.3125, + "kl": 0.5666402578353882, + "learning_rate": 5e-06, + "logits/chosen": -14675450.181818182, + "logits/rejected": -44741129.84615385, + "logps/chosen": -324.36032936789775, + "logps/rejected": -650.3517127403846, + "loss": 0.1001, + "rewards/chosen": 5.128230701793324, + "rewards/margins": 18.383547802905102, + "rewards/rejected": -13.255317101111778, + "step": 2735 + }, + { + "epoch": 0.7498972180348088, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5641752.615384615, + "logits/rejected": -43062461.09090909, + "logps/chosen": -429.32312950721155, + "logps/rejected": -492.89457563920456, + "loss": 0.0318, + "rewards/chosen": 7.55483891413762, + "rewards/margins": 22.183104294996994, + "rewards/rejected": -14.628265380859375, + "step": 2736 + }, + { + "epoch": 0.7501713032753187, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15204491.636363637, + "logits/rejected": -30660952.615384616, + "logps/chosen": -344.34736772017044, + "logps/rejected": -603.7743389423077, + "loss": 0.0184, + "rewards/chosen": 5.831935535777699, + "rewards/margins": 17.332247754076977, + "rewards/rejected": -11.500312218299278, + "step": 2737 + }, + { + "epoch": 0.7504453885158284, + "grad_norm": 5.9375, + "kl": 1.8681621551513672, + "learning_rate": 5e-06, + "logits/chosen": -22276523.2, + "logits/rejected": -33226018.285714287, + "logps/chosen": -439.54560546875, + "logps/rejected": -520.0145089285714, + "loss": 0.0106, + "rewards/chosen": 6.629454803466797, + "rewards/margins": 17.98721455165318, + "rewards/rejected": -11.357759748186384, + "step": 2738 + }, + { + "epoch": 0.7507194737563382, + "grad_norm": 5.625, + "kl": 5.744556427001953, + "learning_rate": 5e-06, + "logits/chosen": -33910948.571428575, + "logits/rejected": -14960761.6, + "logps/chosen": -419.879150390625, + "logps/rejected": -612.1427734375, + "loss": 0.0201, + "rewards/chosen": 6.7450441632952005, + "rewards/margins": 17.69531042916434, + "rewards/rejected": -10.950266265869141, + "step": 2739 + }, + { + "epoch": 0.750993558996848, + "grad_norm": 2.0625, + "kl": 10.068503379821777, + "learning_rate": 5e-06, + "logits/chosen": -26118510.545454547, + "logits/rejected": -12309021.538461538, + "logps/chosen": -450.9464222301136, + "logps/rejected": -553.5127704326923, + "loss": 0.0056, + "rewards/chosen": 8.2376840764826, + "rewards/margins": 21.945818654307118, + "rewards/rejected": -13.70813457782452, + "step": 2740 + }, + { + "epoch": 0.7512676442373578, + "grad_norm": 5.25, + "kl": 2.1954150199890137, + "learning_rate": 5e-06, + "logits/chosen": -30929367.272727273, + "logits/rejected": -68295995.07692307, + "logps/chosen": -385.61811967329544, + "logps/rejected": -472.44193209134613, + "loss": 0.0235, + "rewards/chosen": 5.688493902033025, + "rewards/margins": 18.529289565719925, + "rewards/rejected": -12.8407956636869, + "step": 2741 + }, + { + "epoch": 0.7515417294778676, + "grad_norm": 12.4375, + "kl": 5.383383750915527, + "learning_rate": 5e-06, + "logits/chosen": -39008448.0, + "logits/rejected": -27296901.818181816, + "logps/chosen": -427.80382361778845, + "logps/rejected": -701.7187056107955, + "loss": 0.0548, + "rewards/chosen": 6.6383220966045675, + "rewards/margins": 21.48512385441707, + "rewards/rejected": -14.8468017578125, + "step": 2742 + }, + { + "epoch": 0.7518158147183774, + "grad_norm": 5.71875, + "kl": 0.46112823486328125, + "learning_rate": 5e-06, + "logits/chosen": -27835113.6, + "logits/rejected": -25422358.85714286, + "logps/chosen": -469.121630859375, + "logps/rejected": -590.0643136160714, + "loss": 0.0156, + "rewards/chosen": 8.385595703125, + "rewards/margins": 20.444896806989398, + "rewards/rejected": -12.059301103864398, + "step": 2743 + }, + { + "epoch": 0.7520898999588872, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30213900.0, + "logits/rejected": -28964984.0, + "logps/chosen": -377.7645263671875, + "logps/rejected": -563.6650390625, + "loss": 0.02, + "rewards/chosen": 6.788107872009277, + "rewards/margins": 20.746228218078613, + "rewards/rejected": -13.958120346069336, + "step": 2744 + }, + { + "epoch": 0.7523639851993971, + "grad_norm": 0.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27290707.2, + "logits/rejected": -34218598.85714286, + "logps/chosen": -534.023388671875, + "logps/rejected": -397.85714285714283, + "loss": 0.0013, + "rewards/chosen": 10.040146636962891, + "rewards/margins": 21.959920283726284, + "rewards/rejected": -11.919773646763392, + "step": 2745 + }, + { + "epoch": 0.7526380704399068, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6738304.5, + "logits/rejected": -39812080.0, + "logps/chosen": -559.9094848632812, + "logps/rejected": -654.3690795898438, + "loss": 0.0032, + "rewards/chosen": 8.14369010925293, + "rewards/margins": 20.12713050842285, + "rewards/rejected": -11.983440399169922, + "step": 2746 + }, + { + "epoch": 0.7529121556804166, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19883777.333333332, + "logits/rejected": -24596608.0, + "logps/chosen": -514.8895670572916, + "logps/rejected": -610.1061197916666, + "loss": 0.0501, + "rewards/chosen": 6.6147816975911455, + "rewards/margins": 19.1687494913737, + "rewards/rejected": -12.553967793782553, + "step": 2747 + }, + { + "epoch": 0.7531862409209265, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20126242.90909091, + "logits/rejected": -16345344.0, + "logps/chosen": -498.96786221590907, + "logps/rejected": -364.33360877403845, + "loss": 0.0034, + "rewards/chosen": 7.822874589399858, + "rewards/margins": 18.663453108780867, + "rewards/rejected": -10.84057851938101, + "step": 2748 + }, + { + "epoch": 0.7534603261614362, + "grad_norm": 9.3125, + "kl": 2.3401541709899902, + "learning_rate": 5e-06, + "logits/chosen": -17043862.4, + "logits/rejected": -28987584.0, + "logps/chosen": -432.306640625, + "logps/rejected": -652.8836805555555, + "loss": 0.038, + "rewards/chosen": 6.648186747233073, + "rewards/margins": 21.163343811035155, + "rewards/rejected": -14.515157063802084, + "step": 2749 + }, + { + "epoch": 0.753734411401946, + "grad_norm": 7.84375, + "kl": 22.168609619140625, + "learning_rate": 5e-06, + "logits/chosen": -26040752.94117647, + "logits/rejected": -33782582.85714286, + "logps/chosen": -456.5464728860294, + "logps/rejected": -505.61802455357144, + "loss": 0.0483, + "rewards/chosen": 7.978160184972427, + "rewards/margins": 22.75079550863314, + "rewards/rejected": -14.772635323660714, + "step": 2750 + }, + { + "epoch": 0.7540084966424558, + "grad_norm": 10.375, + "kl": 8.380030632019043, + "learning_rate": 5e-06, + "logits/chosen": -14371001.0, + "logits/rejected": -27816052.0, + "logps/chosen": -409.1214294433594, + "logps/rejected": -423.74658203125, + "loss": 0.0453, + "rewards/chosen": 5.918787479400635, + "rewards/margins": 18.271969318389893, + "rewards/rejected": -12.353181838989258, + "step": 2751 + }, + { + "epoch": 0.7542825818829656, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33878440.0, + "logits/rejected": -35669781.333333336, + "logps/chosen": -322.2492268880208, + "logps/rejected": -469.897216796875, + "loss": 0.0074, + "rewards/chosen": 7.585936228434245, + "rewards/margins": 21.22555414835612, + "rewards/rejected": -13.639617919921875, + "step": 2752 + }, + { + "epoch": 0.7545566671234754, + "grad_norm": 12.0625, + "kl": 8.464765548706055, + "learning_rate": 5e-06, + "logits/chosen": -4653962.285714285, + "logits/rejected": -32812800.0, + "logps/chosen": -425.22872488839283, + "logps/rejected": -504.896435546875, + "loss": 0.0791, + "rewards/chosen": 7.126728602818081, + "rewards/margins": 16.585006495884485, + "rewards/rejected": -9.458277893066406, + "step": 2753 + }, + { + "epoch": 0.7548307523639852, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5264890.181818182, + "logits/rejected": -32070449.230769232, + "logps/chosen": -510.7052112926136, + "logps/rejected": -539.6261268028846, + "loss": 0.0155, + "rewards/chosen": 8.375927318226207, + "rewards/margins": 22.690070572432937, + "rewards/rejected": -14.31414325420673, + "step": 2754 + }, + { + "epoch": 0.755104837604495, + "grad_norm": 4.34375, + "kl": 1.6548932790756226, + "learning_rate": 5e-06, + "logits/chosen": -27793870.769230768, + "logits/rejected": -46987066.18181818, + "logps/chosen": -326.2920673076923, + "logps/rejected": -682.5851828835227, + "loss": 0.0266, + "rewards/chosen": 5.363355783315805, + "rewards/margins": 22.850682665418077, + "rewards/rejected": -17.487326882102273, + "step": 2755 + }, + { + "epoch": 0.7553789228450047, + "grad_norm": 13.1875, + "kl": 6.2933349609375, + "learning_rate": 5e-06, + "logits/chosen": 1085426.3529411764, + "logits/rejected": -33965492.571428575, + "logps/chosen": -470.99060776654414, + "logps/rejected": -506.5685337611607, + "loss": 0.0543, + "rewards/chosen": 6.3771523868336395, + "rewards/margins": 18.189860560312994, + "rewards/rejected": -11.812708173479352, + "step": 2756 + }, + { + "epoch": 0.7556530080855146, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45183284.36363637, + "logits/rejected": -30228406.153846152, + "logps/chosen": -414.92555930397725, + "logps/rejected": -497.68734975961536, + "loss": 0.0539, + "rewards/chosen": 6.167322332208807, + "rewards/margins": 17.516559627506282, + "rewards/rejected": -11.349237295297476, + "step": 2757 + }, + { + "epoch": 0.7559270933260244, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39877617.777777776, + "logits/rejected": 36475225.6, + "logps/chosen": -500.0118815104167, + "logps/rejected": -604.329296875, + "loss": 0.0319, + "rewards/chosen": 6.8258717854817705, + "rewards/margins": 20.620510864257813, + "rewards/rejected": -13.794639078776042, + "step": 2758 + }, + { + "epoch": 0.7562011785665342, + "grad_norm": 11.25, + "kl": 1.819867491722107, + "learning_rate": 5e-06, + "logits/chosen": -28009268.363636363, + "logits/rejected": -34833220.92307692, + "logps/chosen": -373.8841441761364, + "logps/rejected": -520.8594876802885, + "loss": 0.0364, + "rewards/chosen": 8.830890308726918, + "rewards/margins": 21.651241542576077, + "rewards/rejected": -12.82035123384916, + "step": 2759 + }, + { + "epoch": 0.756475263807044, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18306666.666666668, + "logits/rejected": -4572035.2, + "logps/chosen": -324.7732747395833, + "logps/rejected": -536.0904296875, + "loss": 0.0032, + "rewards/chosen": 5.601757473415798, + "rewards/margins": 18.917286851671008, + "rewards/rejected": -13.315529378255208, + "step": 2760 + }, + { + "epoch": 0.7567493490475538, + "grad_norm": 7.5625, + "kl": 0.9712969660758972, + "learning_rate": 5e-06, + "logits/chosen": -7352921.0, + "logits/rejected": -21148762.0, + "logps/chosen": -362.6809387207031, + "logps/rejected": -546.5841674804688, + "loss": 0.0358, + "rewards/chosen": 6.126959800720215, + "rewards/margins": 22.56828212738037, + "rewards/rejected": -16.441322326660156, + "step": 2761 + }, + { + "epoch": 0.7570234342880636, + "grad_norm": 3.921875, + "kl": 3.365769863128662, + "learning_rate": 5e-06, + "logits/chosen": -33767571.2, + "logits/rejected": -24094855.111111112, + "logps/chosen": -455.7794921875, + "logps/rejected": -504.2360026041667, + "loss": 0.009, + "rewards/chosen": 8.870934041341146, + "rewards/margins": 22.885657925075954, + "rewards/rejected": -14.01472388373481, + "step": 2762 + }, + { + "epoch": 0.7572975195285734, + "grad_norm": 8.5625, + "kl": 0.948267936706543, + "learning_rate": 5e-06, + "logits/chosen": -15870901.333333334, + "logits/rejected": -10099600.0, + "logps/chosen": -401.43549262152777, + "logps/rejected": -619.2928385416667, + "loss": 0.0306, + "rewards/chosen": 5.308479309082031, + "rewards/margins": 21.777992248535156, + "rewards/rejected": -16.469512939453125, + "step": 2763 + }, + { + "epoch": 0.7575716047690831, + "grad_norm": 9.875, + "kl": 2.8153579235076904, + "learning_rate": 5e-06, + "logits/chosen": -36245689.14285714, + "logits/rejected": -15960742.4, + "logps/chosen": -427.4033203125, + "logps/rejected": -663.59462890625, + "loss": 0.0283, + "rewards/chosen": 7.290718078613281, + "rewards/margins": 19.553060913085936, + "rewards/rejected": -12.262342834472657, + "step": 2764 + }, + { + "epoch": 0.757845690009593, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22745989.333333332, + "logits/rejected": 23967293.333333332, + "logps/chosen": -381.1148274739583, + "logps/rejected": -573.3662923177084, + "loss": 0.0093, + "rewards/chosen": 7.484049479166667, + "rewards/margins": 27.16640853881836, + "rewards/rejected": -19.68235905965169, + "step": 2765 + }, + { + "epoch": 0.7581197752501028, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21825736.727272727, + "logits/rejected": -36042651.07692308, + "logps/chosen": -380.2742808948864, + "logps/rejected": -554.7297926682693, + "loss": 0.0421, + "rewards/chosen": 6.8695595481178975, + "rewards/margins": 20.68705269340035, + "rewards/rejected": -13.817493145282452, + "step": 2766 + }, + { + "epoch": 0.7583938604906125, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9446620.363636363, + "logits/rejected": -42183030.15384615, + "logps/chosen": -285.73486328125, + "logps/rejected": -577.3092322716346, + "loss": 0.0323, + "rewards/chosen": 6.386251969770952, + "rewards/margins": 21.962156229085856, + "rewards/rejected": -15.575904259314903, + "step": 2767 + }, + { + "epoch": 0.7586679457311224, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11040245.333333334, + "logits/rejected": -24943891.2, + "logps/chosen": -364.21503363715277, + "logps/rejected": -518.6682291666667, + "loss": 0.0239, + "rewards/chosen": 6.433459811740452, + "rewards/margins": 19.39150627983941, + "rewards/rejected": -12.958046468098958, + "step": 2768 + }, + { + "epoch": 0.7589420309716322, + "grad_norm": 9.625, + "kl": 3.3568739891052246, + "learning_rate": 5e-06, + "logits/chosen": 2932208.0, + "logits/rejected": -32703424.0, + "logps/chosen": -403.65875244140625, + "logps/rejected": -448.2911071777344, + "loss": 0.0555, + "rewards/chosen": 6.451761245727539, + "rewards/margins": 15.779071807861328, + "rewards/rejected": -9.327310562133789, + "step": 2769 + }, + { + "epoch": 0.759216116212142, + "grad_norm": 6.625, + "kl": 10.46141242980957, + "learning_rate": 5e-06, + "logits/chosen": -4652798.933333334, + "logits/rejected": -21647473.777777776, + "logps/chosen": -415.65341796875, + "logps/rejected": -566.7164713541666, + "loss": 0.0606, + "rewards/chosen": 6.5907038370768225, + "rewards/margins": 18.30269758436415, + "rewards/rejected": -11.711993747287327, + "step": 2770 + }, + { + "epoch": 0.7594902014526518, + "grad_norm": 4.15625, + "kl": 4.638254165649414, + "learning_rate": 5e-06, + "logits/chosen": -16916924.0, + "logits/rejected": -14543397.0, + "logps/chosen": -375.1133117675781, + "logps/rejected": -470.99066162109375, + "loss": 0.0385, + "rewards/chosen": 7.458029747009277, + "rewards/margins": 18.15080738067627, + "rewards/rejected": -10.692777633666992, + "step": 2771 + }, + { + "epoch": 0.7597642866931615, + "grad_norm": 7.6875, + "kl": 0.0429433211684227, + "learning_rate": 5e-06, + "logits/chosen": -23627853.09090909, + "logits/rejected": 19833094.153846152, + "logps/chosen": -357.65793678977275, + "logps/rejected": -709.8342848557693, + "loss": 0.0247, + "rewards/chosen": 6.201148293235085, + "rewards/margins": 24.122542054503114, + "rewards/rejected": -17.92139376126803, + "step": 2772 + }, + { + "epoch": 0.7600383719336714, + "grad_norm": 7.3125, + "kl": 12.601542472839355, + "learning_rate": 5e-06, + "logits/chosen": -32012768.0, + "logits/rejected": -36909309.71428572, + "logps/chosen": -517.175830078125, + "logps/rejected": -567.7024623325893, + "loss": 0.0233, + "rewards/chosen": 9.082037353515625, + "rewards/margins": 20.431782967703683, + "rewards/rejected": -11.349745614188057, + "step": 2773 + }, + { + "epoch": 0.7603124571741812, + "grad_norm": 13.4375, + "kl": 10.489957809448242, + "learning_rate": 5e-06, + "logits/chosen": -11317462.153846154, + "logits/rejected": -12490869.818181818, + "logps/chosen": -378.5908954326923, + "logps/rejected": -560.5603693181819, + "loss": 0.0762, + "rewards/chosen": 6.56368901179387, + "rewards/margins": 16.516286196408572, + "rewards/rejected": -9.952597184614701, + "step": 2774 + }, + { + "epoch": 0.7605865424146909, + "grad_norm": 3.203125, + "kl": 3.2603378295898438, + "learning_rate": 5e-06, + "logits/chosen": -11482518.545454545, + "logits/rejected": -4079618.153846154, + "logps/chosen": -481.03315873579544, + "logps/rejected": -485.361328125, + "loss": 0.0088, + "rewards/chosen": 6.446437488902699, + "rewards/margins": 15.781927202131365, + "rewards/rejected": -9.335489713228666, + "step": 2775 + }, + { + "epoch": 0.7608606276552008, + "grad_norm": 7.25, + "kl": 5.048122406005859, + "learning_rate": 5e-06, + "logits/chosen": -30170673.230769232, + "logits/rejected": -21854478.545454547, + "logps/chosen": -468.8615910456731, + "logps/rejected": -365.1594904119318, + "loss": 0.0265, + "rewards/chosen": 7.086051354041467, + "rewards/margins": 15.6043036400855, + "rewards/rejected": -8.518252286044033, + "step": 2776 + }, + { + "epoch": 0.7611347128957106, + "grad_norm": 12.5625, + "kl": 8.620306015014648, + "learning_rate": 5e-06, + "logits/chosen": -7132297.230769231, + "logits/rejected": -7290752.7272727275, + "logps/chosen": -487.20541616586536, + "logps/rejected": -611.6945578835227, + "loss": 0.072, + "rewards/chosen": 6.250758244441106, + "rewards/margins": 15.176277320701761, + "rewards/rejected": -8.925519076260654, + "step": 2777 + }, + { + "epoch": 0.7614087981362203, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 27794440.0, + "logits/rejected": -27030894.0, + "logps/chosen": -520.4371337890625, + "logps/rejected": -433.82806396484375, + "loss": 0.0055, + "rewards/chosen": 8.080987930297852, + "rewards/margins": 20.127267837524414, + "rewards/rejected": -12.046279907226562, + "step": 2778 + }, + { + "epoch": 0.7616828833767302, + "grad_norm": 7.09375, + "kl": 3.4103317260742188, + "learning_rate": 5e-06, + "logits/chosen": -20120382.666666668, + "logits/rejected": -23402672.0, + "logps/chosen": -278.4970703125, + "logps/rejected": -476.1376546223958, + "loss": 0.0409, + "rewards/chosen": 5.894421895345052, + "rewards/margins": 15.181224822998047, + "rewards/rejected": -9.286802927652994, + "step": 2779 + }, + { + "epoch": 0.7619569686172399, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11418368.727272727, + "logits/rejected": 88285006.76923077, + "logps/chosen": -497.0296519886364, + "logps/rejected": -657.0009014423077, + "loss": 0.0214, + "rewards/chosen": 8.048785816539418, + "rewards/margins": 23.285444939886773, + "rewards/rejected": -15.236659123347355, + "step": 2780 + }, + { + "epoch": 0.7622310538577498, + "grad_norm": 8.0, + "kl": 1.2204806804656982, + "learning_rate": 5e-06, + "logits/chosen": -8877542.0, + "logits/rejected": -38662420.0, + "logps/chosen": -379.4473571777344, + "logps/rejected": -642.2404174804688, + "loss": 0.0234, + "rewards/chosen": 6.5233964920043945, + "rewards/margins": 20.88601016998291, + "rewards/rejected": -14.362613677978516, + "step": 2781 + }, + { + "epoch": 0.7625051390982596, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25631069.09090909, + "logits/rejected": -31366857.846153848, + "logps/chosen": -357.8538263494318, + "logps/rejected": -487.42660757211536, + "loss": 0.0136, + "rewards/chosen": 6.778081026944247, + "rewards/margins": 21.566406410057226, + "rewards/rejected": -14.78832538311298, + "step": 2782 + }, + { + "epoch": 0.7627792243387693, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2971445.6, + "logits/rejected": -21118742.85714286, + "logps/chosen": -396.394140625, + "logps/rejected": -573.5926688058036, + "loss": 0.0641, + "rewards/chosen": 5.307440567016601, + "rewards/margins": 12.822988183157785, + "rewards/rejected": -7.515547616141183, + "step": 2783 + }, + { + "epoch": 0.7630533095792792, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11155323.076923076, + "logits/rejected": -24446801.454545453, + "logps/chosen": -354.4502704326923, + "logps/rejected": -528.2449396306819, + "loss": 0.0382, + "rewards/chosen": 6.098743145282452, + "rewards/margins": 17.129416779204682, + "rewards/rejected": -11.03067363392223, + "step": 2784 + }, + { + "epoch": 0.763327394819789, + "grad_norm": 1.8359375, + "kl": 7.237586975097656, + "learning_rate": 5e-06, + "logits/chosen": -20393757.714285713, + "logits/rejected": -36194105.6, + "logps/chosen": -398.58837890625, + "logps/rejected": -444.62587890625, + "loss": 0.004, + "rewards/chosen": 8.090175083705358, + "rewards/margins": 19.485150364467074, + "rewards/rejected": -11.394975280761718, + "step": 2785 + }, + { + "epoch": 0.7636014800602987, + "grad_norm": 2.609375, + "kl": 1.2128448486328125, + "learning_rate": 5e-06, + "logits/chosen": -24912144.0, + "logits/rejected": -20315877.333333332, + "logps/chosen": -389.2418619791667, + "logps/rejected": -537.0008138020834, + "loss": 0.0103, + "rewards/chosen": 7.044209162394206, + "rewards/margins": 18.151730219523113, + "rewards/rejected": -11.107521057128906, + "step": 2786 + }, + { + "epoch": 0.7638755653008086, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9541568.0, + "logits/rejected": -11481917.714285715, + "logps/chosen": -365.0228515625, + "logps/rejected": -517.5274483816964, + "loss": 0.0113, + "rewards/chosen": 6.764102172851563, + "rewards/margins": 16.086162894112725, + "rewards/rejected": -9.322060721261161, + "step": 2787 + }, + { + "epoch": 0.7641496505413183, + "grad_norm": 5.09375, + "kl": 6.521088600158691, + "learning_rate": 5e-06, + "logits/chosen": -27421030.4, + "logits/rejected": -21560013.714285713, + "logps/chosen": -513.34091796875, + "logps/rejected": -619.1610630580357, + "loss": 0.0524, + "rewards/chosen": 6.687256622314453, + "rewards/margins": 18.37834461757115, + "rewards/rejected": -11.691087995256696, + "step": 2788 + }, + { + "epoch": 0.7644237357818281, + "grad_norm": 6.6875, + "kl": 0.7760137319564819, + "learning_rate": 5e-06, + "logits/chosen": -22276337.066666666, + "logits/rejected": -29321342.222222224, + "logps/chosen": -435.3965169270833, + "logps/rejected": -341.10096571180554, + "loss": 0.041, + "rewards/chosen": 5.757536315917969, + "rewards/margins": 14.152857632107205, + "rewards/rejected": -8.395321316189236, + "step": 2789 + }, + { + "epoch": 0.764697821022338, + "grad_norm": 3.921875, + "kl": 3.9700002670288086, + "learning_rate": 5e-06, + "logits/chosen": -9861978.0, + "logits/rejected": -4631718.5, + "logps/chosen": -405.92230224609375, + "logps/rejected": -506.7681884765625, + "loss": 0.0594, + "rewards/chosen": 6.673892021179199, + "rewards/margins": 18.026334762573242, + "rewards/rejected": -11.352442741394043, + "step": 2790 + }, + { + "epoch": 0.7649719062628477, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23093628.8, + "logits/rejected": -37716566.85714286, + "logps/chosen": -372.560107421875, + "logps/rejected": -705.7066127232143, + "loss": 0.0113, + "rewards/chosen": 6.293927001953125, + "rewards/margins": 20.072068568638393, + "rewards/rejected": -13.778141566685267, + "step": 2791 + }, + { + "epoch": 0.7652459915033576, + "grad_norm": 5.53125, + "kl": 3.795163631439209, + "learning_rate": 5e-06, + "logits/chosen": -13291613.333333334, + "logits/rejected": -32752218.666666668, + "logps/chosen": -440.1899007161458, + "logps/rejected": -513.69189453125, + "loss": 0.0476, + "rewards/chosen": 8.991508483886719, + "rewards/margins": 19.819487253824867, + "rewards/rejected": -10.82797876993815, + "step": 2792 + }, + { + "epoch": 0.7655200767438674, + "grad_norm": 0.291015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15665459.2, + "logits/rejected": -41156448.0, + "logps/chosen": -409.542236328125, + "logps/rejected": -630.5796595982143, + "loss": 0.0009, + "rewards/chosen": 8.324118041992188, + "rewards/margins": 22.33738228934152, + "rewards/rejected": -14.01326424734933, + "step": 2793 + }, + { + "epoch": 0.7657941619843771, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3131858.1818181816, + "logits/rejected": -15882695.384615384, + "logps/chosen": -495.89377663352275, + "logps/rejected": -524.9518479567307, + "loss": 0.0034, + "rewards/chosen": 8.753265380859375, + "rewards/margins": 19.419004000150238, + "rewards/rejected": -10.665738619290865, + "step": 2794 + }, + { + "epoch": 0.766068247224887, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13885409.6, + "logits/rejected": -27715172.57142857, + "logps/chosen": -367.67373046875, + "logps/rejected": -516.8992396763393, + "loss": 0.0303, + "rewards/chosen": 6.462983703613281, + "rewards/margins": 16.4281010219029, + "rewards/rejected": -9.96511731828962, + "step": 2795 + }, + { + "epoch": 0.7663423324653967, + "grad_norm": 4.96875, + "kl": 5.90837287902832, + "learning_rate": 5e-06, + "logits/chosen": -6164307.428571428, + "logits/rejected": -15107537.6, + "logps/chosen": -436.0736607142857, + "logps/rejected": -590.968017578125, + "loss": 0.0298, + "rewards/chosen": 7.567193167550223, + "rewards/margins": 20.280417960030693, + "rewards/rejected": -12.713224792480469, + "step": 2796 + }, + { + "epoch": 0.7666164177059065, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41429021.333333336, + "logits/rejected": -30401749.333333332, + "logps/chosen": -423.6914469401042, + "logps/rejected": -540.2763671875, + "loss": 0.0379, + "rewards/chosen": 6.636752446492513, + "rewards/margins": 17.299569447835285, + "rewards/rejected": -10.662817001342773, + "step": 2797 + }, + { + "epoch": 0.7668905029464164, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18635230.769230768, + "logits/rejected": -23447570.90909091, + "logps/chosen": -383.5866511418269, + "logps/rejected": -538.7832919034091, + "loss": 0.0334, + "rewards/chosen": 6.635807917668269, + "rewards/margins": 16.61599091216401, + "rewards/rejected": -9.980182994495738, + "step": 2798 + }, + { + "epoch": 0.7671645881869261, + "grad_norm": 3.546875, + "kl": 6.321282386779785, + "learning_rate": 5e-06, + "logits/chosen": -34771248.0, + "logits/rejected": -13755553.333333334, + "logps/chosen": -492.0751139322917, + "logps/rejected": -492.2294514973958, + "loss": 0.0125, + "rewards/chosen": 7.508309682210286, + "rewards/margins": 17.384496053059895, + "rewards/rejected": -9.87618637084961, + "step": 2799 + }, + { + "epoch": 0.7674386734274359, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10484704.0, + "logits/rejected": -18974986.666666668, + "logps/chosen": -441.7309163411458, + "logps/rejected": -584.9000244140625, + "loss": 0.0316, + "rewards/chosen": 7.919079462687175, + "rewards/margins": 19.034894943237305, + "rewards/rejected": -11.11581548055013, + "step": 2800 + }, + { + "epoch": 0.7677127586679457, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14610322.133333333, + "logits/rejected": -28312161.777777776, + "logps/chosen": -458.4232421875, + "logps/rejected": -556.4165581597222, + "loss": 0.0164, + "rewards/chosen": 7.640055338541667, + "rewards/margins": 18.28455098470052, + "rewards/rejected": -10.644495646158854, + "step": 2801 + }, + { + "epoch": 0.7679868439084555, + "grad_norm": 2.90625, + "kl": 4.4722490310668945, + "learning_rate": 5e-06, + "logits/chosen": -54667392.0, + "logits/rejected": -25545288.0, + "logps/chosen": -503.2737630208333, + "logps/rejected": -529.4990641276041, + "loss": 0.0062, + "rewards/chosen": 7.746613184611003, + "rewards/margins": 19.436713536580402, + "rewards/rejected": -11.6901003519694, + "step": 2802 + }, + { + "epoch": 0.7682609291489654, + "grad_norm": 4.0, + "kl": 2.389575958251953, + "learning_rate": 5e-06, + "logits/chosen": -16544644.0, + "logits/rejected": -35055706.666666664, + "logps/chosen": -417.4725748697917, + "logps/rejected": -531.7978515625, + "loss": 0.0111, + "rewards/chosen": 7.228902816772461, + "rewards/margins": 19.174506505330406, + "rewards/rejected": -11.945603688557943, + "step": 2803 + }, + { + "epoch": 0.7685350143894751, + "grad_norm": 11.6875, + "kl": 0.8067407608032227, + "learning_rate": 5e-06, + "logits/chosen": -13661553.23076923, + "logits/rejected": -22936864.0, + "logps/chosen": -403.8019831730769, + "logps/rejected": -494.30859375, + "loss": 0.0443, + "rewards/chosen": 5.471716073843149, + "rewards/margins": 19.047695133235905, + "rewards/rejected": -13.575979059392756, + "step": 2804 + }, + { + "epoch": 0.7688090996299849, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 16012308.923076924, + "logits/rejected": -34141457.45454545, + "logps/chosen": -390.3220402644231, + "logps/rejected": -677.2750355113636, + "loss": 0.0379, + "rewards/chosen": 5.717508756197416, + "rewards/margins": 19.783405917507785, + "rewards/rejected": -14.06589716131037, + "step": 2805 + }, + { + "epoch": 0.7690831848704948, + "grad_norm": 8.4375, + "kl": 4.226692199707031, + "learning_rate": 5e-06, + "logits/chosen": -23445642.666666668, + "logits/rejected": -4729795.555555556, + "logps/chosen": -384.3920572916667, + "logps/rejected": -507.30040147569446, + "loss": 0.0426, + "rewards/chosen": 6.511273701985677, + "rewards/margins": 14.940277947319878, + "rewards/rejected": -8.429004245334202, + "step": 2806 + }, + { + "epoch": 0.7693572701110045, + "grad_norm": 5.84375, + "kl": 12.123927116394043, + "learning_rate": 5e-06, + "logits/chosen": -22689018.666666668, + "logits/rejected": -36444232.0, + "logps/chosen": -425.8274739583333, + "logps/rejected": -516.2394205729166, + "loss": 0.0469, + "rewards/chosen": 6.634878158569336, + "rewards/margins": 16.66643842061361, + "rewards/rejected": -10.031560262044271, + "step": 2807 + }, + { + "epoch": 0.7696313553515143, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15727796.0, + "logits/rejected": -37734944.0, + "logps/chosen": -412.4119873046875, + "logps/rejected": -512.010009765625, + "loss": 0.0465, + "rewards/chosen": 5.960187276204427, + "rewards/margins": 18.46774419148763, + "rewards/rejected": -12.507556915283203, + "step": 2808 + }, + { + "epoch": 0.7699054405920241, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24367693.333333332, + "logits/rejected": -29983299.555555556, + "logps/chosen": -388.0152994791667, + "logps/rejected": -384.1111111111111, + "loss": 0.0094, + "rewards/chosen": 5.367031097412109, + "rewards/margins": 16.742972903781467, + "rewards/rejected": -11.375941806369358, + "step": 2809 + }, + { + "epoch": 0.7701795258325339, + "grad_norm": 5.25, + "kl": 1.9305700063705444, + "learning_rate": 5e-06, + "logits/chosen": -16649452.0, + "logits/rejected": -41201797.333333336, + "logps/chosen": -395.3647867838542, + "logps/rejected": -618.1743977864584, + "loss": 0.0162, + "rewards/chosen": 6.8584645589192705, + "rewards/margins": 21.74362055460612, + "rewards/rejected": -14.88515599568685, + "step": 2810 + }, + { + "epoch": 0.7704536110730437, + "grad_norm": 6.34375, + "kl": 7.296306610107422, + "learning_rate": 5e-06, + "logits/chosen": -16285592.727272727, + "logits/rejected": 3758679.3846153845, + "logps/chosen": -431.2667791193182, + "logps/rejected": -490.60633263221155, + "loss": 0.0252, + "rewards/chosen": 8.001141634854404, + "rewards/margins": 17.242596126102903, + "rewards/rejected": -9.241454491248497, + "step": 2811 + }, + { + "epoch": 0.7707276963135535, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29894137.6, + "logits/rejected": -30754189.714285713, + "logps/chosen": -434.13642578125, + "logps/rejected": -619.95166015625, + "loss": 0.0379, + "rewards/chosen": 8.404371643066407, + "rewards/margins": 24.367204502650672, + "rewards/rejected": -15.962832859584264, + "step": 2812 + }, + { + "epoch": 0.7710017815540633, + "grad_norm": 13.625, + "kl": 1.6725845336914062, + "learning_rate": 5e-06, + "logits/chosen": -21456325.818181816, + "logits/rejected": -22783227.076923076, + "logps/chosen": -566.2841352982955, + "logps/rejected": -682.7322716346154, + "loss": 0.0363, + "rewards/chosen": 7.469745982776988, + "rewards/margins": 25.630942177939247, + "rewards/rejected": -18.16119619516226, + "step": 2813 + }, + { + "epoch": 0.7712758667945732, + "grad_norm": 12.375, + "kl": 13.2116060256958, + "learning_rate": 5e-06, + "logits/chosen": -25363914.666666668, + "logits/rejected": -53611790.222222224, + "logps/chosen": -459.03740234375, + "logps/rejected": -581.2180989583334, + "loss": 0.0672, + "rewards/chosen": 8.381198628743489, + "rewards/margins": 19.57114512125651, + "rewards/rejected": -11.189946492513021, + "step": 2814 + }, + { + "epoch": 0.7715499520350829, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6353616.0, + "logits/rejected": 2633504.0, + "logps/chosen": -395.72216796875, + "logps/rejected": -681.0286959134615, + "loss": 0.0072, + "rewards/chosen": 7.6667938232421875, + "rewards/margins": 25.72256587101863, + "rewards/rejected": -18.055772047776443, + "step": 2815 + }, + { + "epoch": 0.7718240372755927, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10336154.909090908, + "logits/rejected": -31536546.46153846, + "logps/chosen": -331.8724476207386, + "logps/rejected": -581.6519681490385, + "loss": 0.0443, + "rewards/chosen": 7.583138899369673, + "rewards/margins": 20.938151059450803, + "rewards/rejected": -13.35501216008113, + "step": 2816 + }, + { + "epoch": 0.7720981225161025, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34743431.384615384, + "logits/rejected": -31628578.90909091, + "logps/chosen": -408.64637169471155, + "logps/rejected": -369.9503284801136, + "loss": 0.0096, + "rewards/chosen": 7.187347998985877, + "rewards/margins": 18.07269207080761, + "rewards/rejected": -10.885344071821732, + "step": 2817 + }, + { + "epoch": 0.7723722077566123, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17819303.272727273, + "logits/rejected": -18659105.230769232, + "logps/chosen": -396.6134144176136, + "logps/rejected": -537.9655949519231, + "loss": 0.0162, + "rewards/chosen": 7.8607177734375, + "rewards/margins": 20.037774892953728, + "rewards/rejected": -12.177057119516226, + "step": 2818 + }, + { + "epoch": 0.7726462929971221, + "grad_norm": 6.15625, + "kl": 8.23203182220459, + "learning_rate": 5e-06, + "logits/chosen": -26212890.666666668, + "logits/rejected": -34453029.333333336, + "logps/chosen": -530.7855631510416, + "logps/rejected": -618.326416015625, + "loss": 0.0535, + "rewards/chosen": 7.660545984903972, + "rewards/margins": 20.66975466410319, + "rewards/rejected": -13.009208679199219, + "step": 2819 + }, + { + "epoch": 0.7729203782376319, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16619962.181818182, + "logits/rejected": -18142032.0, + "logps/chosen": -344.142333984375, + "logps/rejected": -393.17236328125, + "loss": 0.0652, + "rewards/chosen": 5.317474018443715, + "rewards/margins": 16.21760407027665, + "rewards/rejected": -10.900130051832933, + "step": 2820 + }, + { + "epoch": 0.7731944634781417, + "grad_norm": 8.125, + "kl": 13.588447570800781, + "learning_rate": 5e-06, + "logits/chosen": -21738710.153846152, + "logits/rejected": -11335099.636363637, + "logps/chosen": -845.2741887019231, + "logps/rejected": -627.4592507102273, + "loss": 0.1087, + "rewards/chosen": 10.26156264085036, + "rewards/margins": 26.98656458287806, + "rewards/rejected": -16.7250019420277, + "step": 2821 + }, + { + "epoch": 0.7734685487186514, + "grad_norm": 4.0625, + "kl": 0.3636443018913269, + "learning_rate": 5e-06, + "logits/chosen": -27517676.8, + "logits/rejected": 11331805.714285715, + "logps/chosen": -344.9762939453125, + "logps/rejected": -442.00558035714283, + "loss": 0.0163, + "rewards/chosen": 6.104502868652344, + "rewards/margins": 16.801250784737725, + "rewards/rejected": -10.69674791608538, + "step": 2822 + }, + { + "epoch": 0.7737426339591613, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36776460.0, + "logits/rejected": -23501068.0, + "logps/chosen": -476.2791748046875, + "logps/rejected": -720.8363647460938, + "loss": 0.0077, + "rewards/chosen": 7.336752414703369, + "rewards/margins": 21.732472896575928, + "rewards/rejected": -14.395720481872559, + "step": 2823 + }, + { + "epoch": 0.7740167191996711, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21757174.4, + "logits/rejected": -32642096.0, + "logps/chosen": -443.52978515625, + "logps/rejected": -526.3249162946429, + "loss": 0.0683, + "rewards/chosen": 6.818023681640625, + "rewards/margins": 17.906465802873882, + "rewards/rejected": -11.088442121233259, + "step": 2824 + }, + { + "epoch": 0.774290804440181, + "grad_norm": 3.453125, + "kl": 7.509147644042969, + "learning_rate": 5e-06, + "logits/chosen": -6948418.909090909, + "logits/rejected": -36704866.461538464, + "logps/chosen": -452.0035511363636, + "logps/rejected": -631.5872896634615, + "loss": 0.0444, + "rewards/chosen": 6.4690163352272725, + "rewards/margins": 20.880254785497705, + "rewards/rejected": -14.411238450270433, + "step": 2825 + }, + { + "epoch": 0.7745648896806907, + "grad_norm": 6.1875, + "kl": 17.531883239746094, + "learning_rate": 5e-06, + "logits/chosen": -20945472.0, + "logits/rejected": -3295581.714285714, + "logps/chosen": -465.3341854319853, + "logps/rejected": -578.8700823102679, + "loss": 0.0746, + "rewards/chosen": 7.721064848058364, + "rewards/margins": 19.44348785656841, + "rewards/rejected": -11.722423008510045, + "step": 2826 + }, + { + "epoch": 0.7748389749212005, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4953390.4, + "logits/rejected": -57331570.28571428, + "logps/chosen": -505.707177734375, + "logps/rejected": -682.60986328125, + "loss": 0.0047, + "rewards/chosen": 8.36856689453125, + "rewards/margins": 25.16057434082031, + "rewards/rejected": -16.792007446289062, + "step": 2827 + }, + { + "epoch": 0.7751130601617103, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26688128.0, + "logits/rejected": -32031776.0, + "logps/chosen": -343.0675706129808, + "logps/rejected": -521.2676225142045, + "loss": 0.0064, + "rewards/chosen": 6.176868145282452, + "rewards/margins": 17.022586529071514, + "rewards/rejected": -10.845718383789062, + "step": 2828 + }, + { + "epoch": 0.7753871454022201, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25890856.533333335, + "logits/rejected": -27513018.666666668, + "logps/chosen": -480.71936848958336, + "logps/rejected": -560.9308810763889, + "loss": 0.02, + "rewards/chosen": 7.081168619791667, + "rewards/margins": 20.25359293619792, + "rewards/rejected": -13.17242431640625, + "step": 2829 + }, + { + "epoch": 0.7756612306427298, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35953459.692307696, + "logits/rejected": -19667351.272727273, + "logps/chosen": -427.00232872596155, + "logps/rejected": -492.2200816761364, + "loss": 0.0712, + "rewards/chosen": 6.018797067495493, + "rewards/margins": 18.162917717353448, + "rewards/rejected": -12.144120649857955, + "step": 2830 + }, + { + "epoch": 0.7759353158832397, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46668468.36363637, + "logits/rejected": -25658889.846153848, + "logps/chosen": -317.45822975852275, + "logps/rejected": -613.8309795673077, + "loss": 0.0728, + "rewards/chosen": 5.653013749556108, + "rewards/margins": 16.72398824625082, + "rewards/rejected": -11.070974496694712, + "step": 2831 + }, + { + "epoch": 0.7762094011237495, + "grad_norm": 7.34375, + "kl": 1.3504054546356201, + "learning_rate": 5e-06, + "logits/chosen": -21966354.666666668, + "logits/rejected": -15801930.666666666, + "logps/chosen": -360.0289713541667, + "logps/rejected": -581.0619710286459, + "loss": 0.0375, + "rewards/chosen": 5.548119227091472, + "rewards/margins": 17.41499392191569, + "rewards/rejected": -11.866874694824219, + "step": 2832 + }, + { + "epoch": 0.7764834863642592, + "grad_norm": 11.9375, + "kl": 3.5843138694763184, + "learning_rate": 5e-06, + "logits/chosen": -30068140.8, + "logits/rejected": -10186405.333333334, + "logps/chosen": -428.53733723958334, + "logps/rejected": -507.2298177083333, + "loss": 0.0633, + "rewards/chosen": 7.788785807291666, + "rewards/margins": 15.838615078396266, + "rewards/rejected": -8.049829271104601, + "step": 2833 + }, + { + "epoch": 0.7767575716047691, + "grad_norm": 4.90625, + "kl": 0.17704519629478455, + "learning_rate": 5e-06, + "logits/chosen": -11908376.0, + "logits/rejected": -31322616.0, + "logps/chosen": -375.1988118489583, + "logps/rejected": -786.3131510416666, + "loss": 0.0095, + "rewards/chosen": 7.477780659993489, + "rewards/margins": 23.652735392252602, + "rewards/rejected": -16.174954732259113, + "step": 2834 + }, + { + "epoch": 0.7770316568452789, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18293016.888888888, + "logits/rejected": -24618257.066666666, + "logps/chosen": -457.75916883680554, + "logps/rejected": -500.7682291666667, + "loss": 0.0664, + "rewards/chosen": 5.5537914699978295, + "rewards/margins": 18.537989468044707, + "rewards/rejected": -12.984197998046875, + "step": 2835 + }, + { + "epoch": 0.7773057420857887, + "grad_norm": 8.625, + "kl": 2.535532236099243, + "learning_rate": 5e-06, + "logits/chosen": -15465584.0, + "logits/rejected": -15865892.923076924, + "logps/chosen": -487.21977095170456, + "logps/rejected": -451.74305138221155, + "loss": 0.0393, + "rewards/chosen": 6.672699668190696, + "rewards/margins": 17.031675592169062, + "rewards/rejected": -10.358975923978365, + "step": 2836 + }, + { + "epoch": 0.7775798273262985, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8824113.6, + "logits/rejected": -26392724.57142857, + "logps/chosen": -282.6585693359375, + "logps/rejected": -530.49267578125, + "loss": 0.0407, + "rewards/chosen": 5.5301166534423825, + "rewards/margins": 18.85423513139997, + "rewards/rejected": -13.324118477957589, + "step": 2837 + }, + { + "epoch": 0.7778539125668082, + "grad_norm": 3.5, + "kl": 4.491074085235596, + "learning_rate": 5e-06, + "logits/chosen": -1290689.0666666667, + "logits/rejected": -31918784.0, + "logps/chosen": -458.6204427083333, + "logps/rejected": -406.8623318142361, + "loss": 0.0234, + "rewards/chosen": 6.838956705729166, + "rewards/margins": 19.36610633002387, + "rewards/rejected": -12.527149624294704, + "step": 2838 + }, + { + "epoch": 0.7781279978073181, + "grad_norm": 4.71875, + "kl": 0.12153689563274384, + "learning_rate": 5e-06, + "logits/chosen": -11447749.714285715, + "logits/rejected": -41613740.8, + "logps/chosen": -342.35916573660717, + "logps/rejected": -600.81318359375, + "loss": 0.0204, + "rewards/chosen": 7.625658852713449, + "rewards/margins": 20.93053250994001, + "rewards/rejected": -13.304873657226562, + "step": 2839 + }, + { + "epoch": 0.7784020830478279, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19834236.8, + "logits/rejected": -28283990.85714286, + "logps/chosen": -488.04697265625, + "logps/rejected": -583.4552873883929, + "loss": 0.0141, + "rewards/chosen": 6.7544715881347654, + "rewards/margins": 19.231424277169364, + "rewards/rejected": -12.476952689034599, + "step": 2840 + }, + { + "epoch": 0.7786761682883376, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19304696.615384616, + "logits/rejected": -10792196.363636363, + "logps/chosen": -463.5205829326923, + "logps/rejected": -528.7774325284091, + "loss": 0.0092, + "rewards/chosen": 6.716212346003606, + "rewards/margins": 24.01371989216838, + "rewards/rejected": -17.297507546164773, + "step": 2841 + }, + { + "epoch": 0.7789502535288475, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9091724.8, + "logits/rejected": -12825409.142857144, + "logps/chosen": -302.3318115234375, + "logps/rejected": -519.4906180245536, + "loss": 0.0382, + "rewards/chosen": 5.4953960418701175, + "rewards/margins": 16.41942253112793, + "rewards/rejected": -10.924026489257812, + "step": 2842 + }, + { + "epoch": 0.7792243387693573, + "grad_norm": 3.890625, + "kl": 4.063755035400391, + "learning_rate": 5e-06, + "logits/chosen": -20631689.14285714, + "logits/rejected": 26925836.8, + "logps/chosen": -334.15830775669644, + "logps/rejected": -576.071484375, + "loss": 0.055, + "rewards/chosen": 6.603640420096261, + "rewards/margins": 22.401046425955638, + "rewards/rejected": -15.797406005859376, + "step": 2843 + }, + { + "epoch": 0.779498424009867, + "grad_norm": 5.90625, + "kl": 8.215481758117676, + "learning_rate": 5e-06, + "logits/chosen": -21502877.333333332, + "logits/rejected": -8471850.0, + "logps/chosen": -570.9708251953125, + "logps/rejected": -410.0921223958333, + "loss": 0.0293, + "rewards/chosen": 9.11336580912272, + "rewards/margins": 20.35080909729004, + "rewards/rejected": -11.237443288167318, + "step": 2844 + }, + { + "epoch": 0.7797725092503769, + "grad_norm": 5.03125, + "kl": 4.9956488609313965, + "learning_rate": 5e-06, + "logits/chosen": -28586791.384615384, + "logits/rejected": -3449506.1818181816, + "logps/chosen": -430.7175105168269, + "logps/rejected": -662.9293323863636, + "loss": 0.1217, + "rewards/chosen": 6.403877258300781, + "rewards/margins": 18.55200958251953, + "rewards/rejected": -12.14813232421875, + "step": 2845 + }, + { + "epoch": 0.7800465944908866, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14659390.76923077, + "logits/rejected": -27705917.09090909, + "logps/chosen": -424.0090519831731, + "logps/rejected": -532.5851384943181, + "loss": 0.0326, + "rewards/chosen": 5.9791118915264425, + "rewards/margins": 18.005774544669197, + "rewards/rejected": -12.026662653142756, + "step": 2846 + }, + { + "epoch": 0.7803206797313965, + "grad_norm": 0.9609375, + "kl": 5.343292236328125, + "learning_rate": 5e-06, + "logits/chosen": -27801770.666666668, + "logits/rejected": -31206170.666666668, + "logps/chosen": -438.7115071614583, + "logps/rejected": -477.4562174479167, + "loss": 0.0027, + "rewards/chosen": 7.862211227416992, + "rewards/margins": 22.06529426574707, + "rewards/rejected": -14.203083038330078, + "step": 2847 + }, + { + "epoch": 0.7805947649719063, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24674023.111111112, + "logits/rejected": -17360068.266666666, + "logps/chosen": -413.8174641927083, + "logps/rejected": -400.26689453125, + "loss": 0.0524, + "rewards/chosen": 6.364210340711805, + "rewards/margins": 16.068409559461806, + "rewards/rejected": -9.70419921875, + "step": 2848 + }, + { + "epoch": 0.780868850212416, + "grad_norm": 7.71875, + "kl": 5.377932548522949, + "learning_rate": 5e-06, + "logits/chosen": -31052138.666666668, + "logits/rejected": -26898802.666666668, + "logps/chosen": -392.446044921875, + "logps/rejected": -405.1601155598958, + "loss": 0.0296, + "rewards/chosen": 6.793966929117839, + "rewards/margins": 17.054391225179035, + "rewards/rejected": -10.260424296061197, + "step": 2849 + }, + { + "epoch": 0.7811429354529259, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16789190.666666668, + "logits/rejected": -9655892.0, + "logps/chosen": -388.3678385416667, + "logps/rejected": -753.4952799479166, + "loss": 0.0036, + "rewards/chosen": 6.1981252034505205, + "rewards/margins": 20.40496063232422, + "rewards/rejected": -14.206835428873697, + "step": 2850 + }, + { + "epoch": 0.7814170206934357, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35234560.0, + "logits/rejected": -31665527.466666665, + "logps/chosen": -356.4167751736111, + "logps/rejected": -442.56653645833336, + "loss": 0.007, + "rewards/chosen": 6.933572133382161, + "rewards/margins": 19.598063913981118, + "rewards/rejected": -12.664491780598958, + "step": 2851 + }, + { + "epoch": 0.7816911059339454, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19769068.0, + "logits/rejected": -22614728.0, + "logps/chosen": -458.62310791015625, + "logps/rejected": -534.7234497070312, + "loss": 0.0427, + "rewards/chosen": 6.474660873413086, + "rewards/margins": 18.89506721496582, + "rewards/rejected": -12.420406341552734, + "step": 2852 + }, + { + "epoch": 0.7819651911744553, + "grad_norm": 5.28125, + "kl": 5.103212356567383, + "learning_rate": 5e-06, + "logits/chosen": -29839925.333333332, + "logits/rejected": -5319226.133333334, + "logps/chosen": -447.44769965277777, + "logps/rejected": -441.4768880208333, + "loss": 0.0216, + "rewards/chosen": 6.8475290934244795, + "rewards/margins": 15.184462483723959, + "rewards/rejected": -8.33693339029948, + "step": 2853 + }, + { + "epoch": 0.782239276414965, + "grad_norm": 4.78125, + "kl": 4.375783443450928, + "learning_rate": 5e-06, + "logits/chosen": -20335769.14285714, + "logits/rejected": 9536307.2, + "logps/chosen": -465.608642578125, + "logps/rejected": -449.622802734375, + "loss": 0.0203, + "rewards/chosen": 9.867186410086495, + "rewards/margins": 19.759018380301338, + "rewards/rejected": -9.891831970214843, + "step": 2854 + }, + { + "epoch": 0.7825133616554748, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13326866.666666666, + "logits/rejected": -28659671.466666665, + "logps/chosen": -426.2947591145833, + "logps/rejected": -538.48154296875, + "loss": 0.0373, + "rewards/chosen": 6.317175971137153, + "rewards/margins": 17.52715827094184, + "rewards/rejected": -11.209982299804688, + "step": 2855 + }, + { + "epoch": 0.7827874468959847, + "grad_norm": 12.5, + "kl": 1.0265891551971436, + "learning_rate": 5e-06, + "logits/chosen": -28788005.333333332, + "logits/rejected": -30228178.666666668, + "logps/chosen": -335.186767578125, + "logps/rejected": -525.13232421875, + "loss": 0.0658, + "rewards/chosen": 6.126027425130208, + "rewards/margins": 16.358471552530926, + "rewards/rejected": -10.232444127400717, + "step": 2856 + }, + { + "epoch": 0.7830615321364944, + "grad_norm": 5.40625, + "kl": 5.9848432540893555, + "learning_rate": 5e-06, + "logits/chosen": -12971976.615384616, + "logits/rejected": -15433570.909090908, + "logps/chosen": -478.2451171875, + "logps/rejected": -452.41495028409093, + "loss": 0.0299, + "rewards/chosen": 6.192129868727464, + "rewards/margins": 16.201324676300263, + "rewards/rejected": -10.009194807572799, + "step": 2857 + }, + { + "epoch": 0.7833356173770043, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25584290.90909091, + "logits/rejected": -28913831.384615384, + "logps/chosen": -356.57057883522725, + "logps/rejected": -505.5505183293269, + "loss": 0.0452, + "rewards/chosen": 5.617275931618431, + "rewards/margins": 18.199541932219393, + "rewards/rejected": -12.582266000600962, + "step": 2858 + }, + { + "epoch": 0.7836097026175141, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9113705.333333334, + "logits/rejected": -8856267.333333334, + "logps/chosen": -310.4967041015625, + "logps/rejected": -688.5184733072916, + "loss": 0.0194, + "rewards/chosen": 7.7338714599609375, + "rewards/margins": 22.62132136027018, + "rewards/rejected": -14.887449900309244, + "step": 2859 + }, + { + "epoch": 0.7838837878580238, + "grad_norm": 3.359375, + "kl": 2.8115553855895996, + "learning_rate": 5e-06, + "logits/chosen": -8661968.727272727, + "logits/rejected": -15310368.0, + "logps/chosen": -440.7731267755682, + "logps/rejected": -595.4918870192307, + "loss": 0.0177, + "rewards/chosen": 8.668790643865412, + "rewards/margins": 22.42867012290688, + "rewards/rejected": -13.759879479041466, + "step": 2860 + }, + { + "epoch": 0.7841578730985337, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42019232.0, + "logits/rejected": -27922618.666666668, + "logps/chosen": -424.7982584635417, + "logps/rejected": -551.9915771484375, + "loss": 0.0746, + "rewards/chosen": 6.5746199289957685, + "rewards/margins": 19.955677668253582, + "rewards/rejected": -13.381057739257812, + "step": 2861 + }, + { + "epoch": 0.7844319583390434, + "grad_norm": 12.25, + "kl": 1.3162015676498413, + "learning_rate": 5e-06, + "logits/chosen": -11556357.818181818, + "logits/rejected": 90228440.61538461, + "logps/chosen": -349.3308771306818, + "logps/rejected": -592.6906550480769, + "loss": 0.0207, + "rewards/chosen": 6.890751925381747, + "rewards/margins": 17.82707347736492, + "rewards/rejected": -10.936321551983173, + "step": 2862 + }, + { + "epoch": 0.7847060435795532, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10178219.2, + "logits/rejected": -19719058.285714287, + "logps/chosen": -398.419091796875, + "logps/rejected": -380.77242606026783, + "loss": 0.0355, + "rewards/chosen": 7.167796325683594, + "rewards/margins": 15.012430245535715, + "rewards/rejected": -7.844633919852121, + "step": 2863 + }, + { + "epoch": 0.7849801288200631, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22404544.0, + "logits/rejected": -16034029.333333334, + "logps/chosen": -396.91845703125, + "logps/rejected": -530.3962809244791, + "loss": 0.033, + "rewards/chosen": 6.5205332438151045, + "rewards/margins": 16.369771321614582, + "rewards/rejected": -9.849238077799479, + "step": 2864 + }, + { + "epoch": 0.7852542140605728, + "grad_norm": 2.421875, + "kl": 6.2767462730407715, + "learning_rate": 5e-06, + "logits/chosen": -39047312.0, + "logits/rejected": -38903776.0, + "logps/chosen": -509.1624348958333, + "logps/rejected": -531.8264973958334, + "loss": 0.0082, + "rewards/chosen": 7.685483296712239, + "rewards/margins": 19.82374318440755, + "rewards/rejected": -12.138259887695312, + "step": 2865 + }, + { + "epoch": 0.7855282993010826, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7771589.333333333, + "logits/rejected": -21300646.4, + "logps/chosen": -431.11170789930554, + "logps/rejected": -738.2708333333334, + "loss": 0.0043, + "rewards/chosen": 7.121292961968316, + "rewards/margins": 22.778525627983942, + "rewards/rejected": -15.657232666015625, + "step": 2866 + }, + { + "epoch": 0.7858023845415925, + "grad_norm": 3.71875, + "kl": 4.18095588684082, + "learning_rate": 5e-06, + "logits/chosen": 13093624.0, + "logits/rejected": -22027161.6, + "logps/chosen": -466.77015904017856, + "logps/rejected": -638.530419921875, + "loss": 0.0076, + "rewards/chosen": 8.000876290457589, + "rewards/margins": 23.31075940813337, + "rewards/rejected": -15.309883117675781, + "step": 2867 + }, + { + "epoch": 0.7860764697821022, + "grad_norm": 4.34375, + "kl": 2.1111011505126953, + "learning_rate": 5e-06, + "logits/chosen": -14957288.0, + "logits/rejected": -15918430.666666666, + "logps/chosen": -441.4148356119792, + "logps/rejected": -596.9871419270834, + "loss": 0.0145, + "rewards/chosen": 7.300533294677734, + "rewards/margins": 20.965379079182945, + "rewards/rejected": -13.664845784505209, + "step": 2868 + }, + { + "epoch": 0.7863505550226121, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20336262.666666668, + "logits/rejected": -19021948.0, + "logps/chosen": -382.2298583984375, + "logps/rejected": -600.064697265625, + "loss": 0.023, + "rewards/chosen": 6.630672454833984, + "rewards/margins": 22.36634953816732, + "rewards/rejected": -15.735677083333334, + "step": 2869 + }, + { + "epoch": 0.7866246402631218, + "grad_norm": 0.921875, + "kl": 2.074662685394287, + "learning_rate": 5e-06, + "logits/chosen": -12611770.666666666, + "logits/rejected": -20516246.666666668, + "logps/chosen": -466.3364664713542, + "logps/rejected": -471.2091471354167, + "loss": 0.0042, + "rewards/chosen": 7.684284845987956, + "rewards/margins": 18.33131726582845, + "rewards/rejected": -10.647032419840494, + "step": 2870 + }, + { + "epoch": 0.7868987255036316, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34808376.88888889, + "logits/rejected": -17971669.333333332, + "logps/chosen": -328.66620551215277, + "logps/rejected": -481.8052734375, + "loss": 0.0438, + "rewards/chosen": 6.848253038194445, + "rewards/margins": 16.424500189887155, + "rewards/rejected": -9.576247151692709, + "step": 2871 + }, + { + "epoch": 0.7871728107441415, + "grad_norm": 0.53515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21299901.866666667, + "logits/rejected": -23063893.333333332, + "logps/chosen": -470.25462239583334, + "logps/rejected": -415.0628255208333, + "loss": 0.0019, + "rewards/chosen": 6.8460337320963545, + "rewards/margins": 18.366856045193142, + "rewards/rejected": -11.520822313096788, + "step": 2872 + }, + { + "epoch": 0.7874468959846512, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31872128.0, + "logits/rejected": -24697722.181818184, + "logps/chosen": -419.67371544471155, + "logps/rejected": -509.63671875, + "loss": 0.013, + "rewards/chosen": 5.963643587552584, + "rewards/margins": 17.464256713440367, + "rewards/rejected": -11.500613125887783, + "step": 2873 + }, + { + "epoch": 0.787720981225161, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -834722.1538461539, + "logits/rejected": -26349684.363636363, + "logps/chosen": -401.50454477163464, + "logps/rejected": -430.22745028409093, + "loss": 0.0397, + "rewards/chosen": 6.2303936298076925, + "rewards/margins": 16.455942807497678, + "rewards/rejected": -10.225549177689986, + "step": 2874 + }, + { + "epoch": 0.7879950664656709, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31814579.2, + "logits/rejected": -11767389.714285715, + "logps/chosen": -438.11552734375, + "logps/rejected": -544.4513113839286, + "loss": 0.0203, + "rewards/chosen": 8.005134582519531, + "rewards/margins": 19.388978140694753, + "rewards/rejected": -11.383843558175224, + "step": 2875 + }, + { + "epoch": 0.7882691517061806, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4961688.888888889, + "logits/rejected": -9619556.266666668, + "logps/chosen": -322.08672417534723, + "logps/rejected": -622.7278645833334, + "loss": 0.0394, + "rewards/chosen": 5.551285637749566, + "rewards/margins": 17.093802727593317, + "rewards/rejected": -11.54251708984375, + "step": 2876 + }, + { + "epoch": 0.7885432369466904, + "grad_norm": 9.8125, + "kl": 0.6504402160644531, + "learning_rate": 5e-06, + "logits/chosen": -49204499.2, + "logits/rejected": -22251997.714285713, + "logps/chosen": -580.521923828125, + "logps/rejected": -489.08803013392856, + "loss": 0.0508, + "rewards/chosen": 8.442635345458985, + "rewards/margins": 20.4764771597726, + "rewards/rejected": -12.033841814313616, + "step": 2877 + }, + { + "epoch": 0.7888173221872002, + "grad_norm": 17.75, + "kl": 0.9189020991325378, + "learning_rate": 5e-06, + "logits/chosen": -16600676.57142857, + "logits/rejected": -23158888.0, + "logps/chosen": -336.88668387276783, + "logps/rejected": -470.71669921875, + "loss": 0.0275, + "rewards/chosen": 5.709363664899554, + "rewards/margins": 15.122947038922991, + "rewards/rejected": -9.413583374023437, + "step": 2878 + }, + { + "epoch": 0.78909140742771, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9987388.444444444, + "logits/rejected": -29638144.0, + "logps/chosen": -320.92005750868054, + "logps/rejected": -550.6598958333333, + "loss": 0.0265, + "rewards/chosen": 6.531728956434462, + "rewards/margins": 20.47297854953342, + "rewards/rejected": -13.941249593098958, + "step": 2879 + }, + { + "epoch": 0.7893654926682199, + "grad_norm": 1.03125, + "kl": 6.72705078125, + "learning_rate": 5e-06, + "logits/chosen": -54648384.0, + "logits/rejected": -8418682.666666666, + "logps/chosen": -549.7294108072916, + "logps/rejected": -630.442138671875, + "loss": 0.0038, + "rewards/chosen": 8.140851338704428, + "rewards/margins": 27.688891092936196, + "rewards/rejected": -19.54803975423177, + "step": 2880 + }, + { + "epoch": 0.7896395779087296, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36502860.0, + "logits/rejected": -17450833.6, + "logps/chosen": -394.6905212402344, + "logps/rejected": -675.59833984375, + "loss": 0.0037, + "rewards/chosen": 8.988911628723145, + "rewards/margins": 24.835777473449706, + "rewards/rejected": -15.846865844726562, + "step": 2881 + }, + { + "epoch": 0.7899136631492394, + "grad_norm": 6.875, + "kl": 10.597582817077637, + "learning_rate": 5e-06, + "logits/chosen": -13710525.090909092, + "logits/rejected": -32264273.230769232, + "logps/chosen": -405.5367542613636, + "logps/rejected": -588.4933894230769, + "loss": 0.0244, + "rewards/chosen": 7.577973799272017, + "rewards/margins": 17.7246248471987, + "rewards/rejected": -10.146651047926683, + "step": 2882 + }, + { + "epoch": 0.7901877483897493, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21034080.0, + "logits/rejected": -32457837.17647059, + "logps/chosen": -383.83984375, + "logps/rejected": -541.2319623161765, + "loss": 0.0597, + "rewards/chosen": 8.676801409040179, + "rewards/margins": 20.329437768759846, + "rewards/rejected": -11.652636359719668, + "step": 2883 + }, + { + "epoch": 0.790461833630259, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30796656.0, + "logits/rejected": -32119274.666666668, + "logps/chosen": -426.6282145182292, + "logps/rejected": -407.5849609375, + "loss": 0.0417, + "rewards/chosen": 6.6275984446207685, + "rewards/margins": 15.61178970336914, + "rewards/rejected": -8.984191258748373, + "step": 2884 + }, + { + "epoch": 0.7907359188707688, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46714816.0, + "logits/rejected": -18665670.4, + "logps/chosen": -529.1270616319445, + "logps/rejected": -601.8186848958334, + "loss": 0.0025, + "rewards/chosen": 8.958267211914062, + "rewards/margins": 21.06194051106771, + "rewards/rejected": -12.103673299153646, + "step": 2885 + }, + { + "epoch": 0.7910100041112786, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39365980.8, + "logits/rejected": -31230880.0, + "logps/chosen": -437.1517578125, + "logps/rejected": -471.59305245535717, + "loss": 0.0103, + "rewards/chosen": 6.51204833984375, + "rewards/margins": 17.640037972586494, + "rewards/rejected": -11.127989632742745, + "step": 2886 + }, + { + "epoch": 0.7912840893517884, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24787692.0, + "logits/rejected": -21873690.0, + "logps/chosen": -355.5706787109375, + "logps/rejected": -521.5496826171875, + "loss": 0.005, + "rewards/chosen": 7.442408561706543, + "rewards/margins": 20.24868392944336, + "rewards/rejected": -12.806275367736816, + "step": 2887 + }, + { + "epoch": 0.7915581745922982, + "grad_norm": 5.0625, + "kl": 9.244460105895996, + "learning_rate": 5e-06, + "logits/chosen": -15398766.11764706, + "logits/rejected": 14528640.0, + "logps/chosen": -401.5880916819853, + "logps/rejected": -672.4836774553571, + "loss": 0.0132, + "rewards/chosen": 7.580014397116268, + "rewards/margins": 21.034410236262474, + "rewards/rejected": -13.454395839146205, + "step": 2888 + }, + { + "epoch": 0.791832259832808, + "grad_norm": 7.6875, + "kl": 6.053360939025879, + "learning_rate": 5e-06, + "logits/chosen": -16268353.142857144, + "logits/rejected": -15902632.0, + "logps/chosen": -436.6389857700893, + "logps/rejected": -443.05556640625, + "loss": 0.0231, + "rewards/chosen": 8.306722913469587, + "rewards/margins": 20.53006275721959, + "rewards/rejected": -12.22333984375, + "step": 2889 + }, + { + "epoch": 0.7921063450733178, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16703242.666666666, + "logits/rejected": -11735410.666666666, + "logps/chosen": -324.52972412109375, + "logps/rejected": -626.8025309244791, + "loss": 0.098, + "rewards/chosen": 5.208025932312012, + "rewards/margins": 18.463962872823082, + "rewards/rejected": -13.255936940511068, + "step": 2890 + }, + { + "epoch": 0.7923804303138277, + "grad_norm": 8.4375, + "kl": 7.759315490722656, + "learning_rate": 5e-06, + "logits/chosen": -25304028.0, + "logits/rejected": -720036.0, + "logps/chosen": -487.8620300292969, + "logps/rejected": -486.28424072265625, + "loss": 0.0237, + "rewards/chosen": 7.267566204071045, + "rewards/margins": 20.198258876800537, + "rewards/rejected": -12.930692672729492, + "step": 2891 + }, + { + "epoch": 0.7926545155543374, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10964103.384615384, + "logits/rejected": -21816029.09090909, + "logps/chosen": -449.1129807692308, + "logps/rejected": -684.9600941051136, + "loss": 0.0137, + "rewards/chosen": 6.656198354867788, + "rewards/margins": 24.08246233746722, + "rewards/rejected": -17.426263982599433, + "step": 2892 + }, + { + "epoch": 0.7929286007948472, + "grad_norm": 4.75, + "kl": 0.2536824643611908, + "learning_rate": 5e-06, + "logits/chosen": -20625568.0, + "logits/rejected": -15743584.0, + "logps/chosen": -444.02298677884613, + "logps/rejected": -463.20015092329544, + "loss": 0.023, + "rewards/chosen": 6.933024479792668, + "rewards/margins": 18.60682624870247, + "rewards/rejected": -11.6738017689098, + "step": 2893 + }, + { + "epoch": 0.793202686035357, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11678922.4, + "logits/rejected": -8576378.857142856, + "logps/chosen": -428.172509765625, + "logps/rejected": -299.19520786830356, + "loss": 0.0668, + "rewards/chosen": 6.088691711425781, + "rewards/margins": 13.152403695242747, + "rewards/rejected": -7.063711983816964, + "step": 2894 + }, + { + "epoch": 0.7934767712758668, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12626055.272727273, + "logits/rejected": 12925249.23076923, + "logps/chosen": -390.03053977272725, + "logps/rejected": -578.7553335336538, + "loss": 0.0516, + "rewards/chosen": 5.467315673828125, + "rewards/margins": 16.677061814528244, + "rewards/rejected": -11.20974614070012, + "step": 2895 + }, + { + "epoch": 0.7937508565163766, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19494267.42857143, + "logits/rejected": -11602771.2, + "logps/chosen": -361.89760044642856, + "logps/rejected": -491.291796875, + "loss": 0.0486, + "rewards/chosen": 7.579740251813616, + "rewards/margins": 19.408275713239398, + "rewards/rejected": -11.828535461425782, + "step": 2896 + }, + { + "epoch": 0.7940249417568864, + "grad_norm": 8.1875, + "kl": 8.821514129638672, + "learning_rate": 5e-06, + "logits/chosen": 68716881.06666666, + "logits/rejected": -22201664.0, + "logps/chosen": -424.53880208333334, + "logps/rejected": -560.7071940104166, + "loss": 0.0626, + "rewards/chosen": 6.13821055094401, + "rewards/margins": 20.066551038953993, + "rewards/rejected": -13.928340488009983, + "step": 2897 + }, + { + "epoch": 0.7942990269973962, + "grad_norm": 2.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27398947.555555556, + "logits/rejected": -19910999.466666665, + "logps/chosen": -378.99365234375, + "logps/rejected": -671.9454427083333, + "loss": 0.0072, + "rewards/chosen": 6.841688368055555, + "rewards/margins": 22.621451144748264, + "rewards/rejected": -15.779762776692708, + "step": 2898 + }, + { + "epoch": 0.7945731122379059, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30264134.85714286, + "logits/rejected": 21734768.0, + "logps/chosen": -433.31253487723217, + "logps/rejected": -503.4908203125, + "loss": 0.0291, + "rewards/chosen": 7.086239406040737, + "rewards/margins": 18.28629128592355, + "rewards/rejected": -11.200051879882812, + "step": 2899 + }, + { + "epoch": 0.7948471974784158, + "grad_norm": 9.75, + "kl": 8.900300979614258, + "learning_rate": 5e-06, + "logits/chosen": -30532176.0, + "logits/rejected": -26363442.666666668, + "logps/chosen": -469.5016682942708, + "logps/rejected": -629.7933756510416, + "loss": 0.0278, + "rewards/chosen": 7.093247095743815, + "rewards/margins": 22.69905153910319, + "rewards/rejected": -15.605804443359375, + "step": 2900 + }, + { + "epoch": 0.7951212827189256, + "grad_norm": 5.96875, + "kl": 2.1122500896453857, + "learning_rate": 5e-06, + "logits/chosen": -26857133.714285713, + "logits/rejected": -26539388.8, + "logps/chosen": -446.85682896205356, + "logps/rejected": -495.41787109375, + "loss": 0.0194, + "rewards/chosen": 6.864289964948382, + "rewards/margins": 19.29488307407924, + "rewards/rejected": -12.430593109130859, + "step": 2901 + }, + { + "epoch": 0.7953953679594354, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24264482.46153846, + "logits/rejected": -21188992.0, + "logps/chosen": -412.39058743990387, + "logps/rejected": -435.68172940340907, + "loss": 0.0456, + "rewards/chosen": 6.673393836388221, + "rewards/margins": 17.450759780990495, + "rewards/rejected": -10.777365944602273, + "step": 2902 + }, + { + "epoch": 0.7956694531999452, + "grad_norm": 13.8125, + "kl": 2.1490478515625, + "learning_rate": 5e-06, + "logits/chosen": -14424086.857142856, + "logits/rejected": -17462331.2, + "logps/chosen": -482.8662806919643, + "logps/rejected": -679.816552734375, + "loss": 0.0655, + "rewards/chosen": 8.037155151367188, + "rewards/margins": 22.388572692871094, + "rewards/rejected": -14.351417541503906, + "step": 2903 + }, + { + "epoch": 0.795943538440455, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14001992.0, + "logits/rejected": -11817012.57142857, + "logps/chosen": -303.076171875, + "logps/rejected": -467.4990931919643, + "loss": 0.0548, + "rewards/chosen": 5.619272613525391, + "rewards/margins": 15.220277077811105, + "rewards/rejected": -9.601004464285714, + "step": 2904 + }, + { + "epoch": 0.7962176236809648, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18227356.307692308, + "logits/rejected": -13326234.181818182, + "logps/chosen": -435.85141225961536, + "logps/rejected": -752.2523082386364, + "loss": 0.0184, + "rewards/chosen": 7.383201012244592, + "rewards/margins": 24.47797335111178, + "rewards/rejected": -17.094772338867188, + "step": 2905 + }, + { + "epoch": 0.7964917089214746, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30231220.57142857, + "logits/rejected": 114851878.4, + "logps/chosen": -326.76077706473217, + "logps/rejected": -662.0904296875, + "loss": 0.0523, + "rewards/chosen": 5.2315826416015625, + "rewards/margins": 28.596249389648438, + "rewards/rejected": -23.364666748046876, + "step": 2906 + }, + { + "epoch": 0.7967657941619843, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10084274.461538462, + "logits/rejected": 59548869.81818182, + "logps/chosen": -385.9163161057692, + "logps/rejected": -517.1178977272727, + "loss": 0.0372, + "rewards/chosen": 6.7994842529296875, + "rewards/margins": 21.335506092418324, + "rewards/rejected": -14.536021839488637, + "step": 2907 + }, + { + "epoch": 0.7970398794024942, + "grad_norm": 8.1875, + "kl": 0.6095403432846069, + "learning_rate": 5e-06, + "logits/chosen": -33393338.666666668, + "logits/rejected": -32332792.0, + "logps/chosen": -481.4646402994792, + "logps/rejected": -673.7671712239584, + "loss": 0.0186, + "rewards/chosen": 5.76461919148763, + "rewards/margins": 19.372236887613933, + "rewards/rejected": -13.607617696126303, + "step": 2908 + }, + { + "epoch": 0.797313964643004, + "grad_norm": 8.0625, + "kl": 7.593690395355225, + "learning_rate": 5e-06, + "logits/chosen": 11696692.266666668, + "logits/rejected": -26617948.444444444, + "logps/chosen": -422.1266276041667, + "logps/rejected": -474.13275824652777, + "loss": 0.0268, + "rewards/chosen": 7.013691202799479, + "rewards/margins": 19.5864256117079, + "rewards/rejected": -12.57273440890842, + "step": 2909 + }, + { + "epoch": 0.7975880498835137, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23127508.8, + "logits/rejected": -18805614.85714286, + "logps/chosen": -424.8681640625, + "logps/rejected": -437.6982421875, + "loss": 0.0433, + "rewards/chosen": 8.758076477050782, + "rewards/margins": 18.743498883928574, + "rewards/rejected": -9.98542240687779, + "step": 2910 + }, + { + "epoch": 0.7978621351240236, + "grad_norm": 8.3125, + "kl": 2.8216285705566406, + "learning_rate": 5e-06, + "logits/chosen": 7064714.181818182, + "logits/rejected": -49301499.07692308, + "logps/chosen": -402.18217329545456, + "logps/rejected": -473.02110877403845, + "loss": 0.023, + "rewards/chosen": 7.488153631036932, + "rewards/margins": 18.455851948344623, + "rewards/rejected": -10.967698317307692, + "step": 2911 + }, + { + "epoch": 0.7981362203645334, + "grad_norm": 7.65625, + "kl": 3.8770651817321777, + "learning_rate": 5e-06, + "logits/chosen": -14832581.333333334, + "logits/rejected": -11673750.666666666, + "logps/chosen": -392.42431640625, + "logps/rejected": -645.83154296875, + "loss": 0.0396, + "rewards/chosen": 7.017647425333659, + "rewards/margins": 24.852898279825848, + "rewards/rejected": -17.835250854492188, + "step": 2912 + }, + { + "epoch": 0.7984103056050431, + "grad_norm": 3.453125, + "kl": 6.0404486656188965, + "learning_rate": 5e-06, + "logits/chosen": -44098656.0, + "logits/rejected": -14168952.615384616, + "logps/chosen": -430.8587535511364, + "logps/rejected": -592.8323317307693, + "loss": 0.0196, + "rewards/chosen": 7.751980868252841, + "rewards/margins": 18.031734333171713, + "rewards/rejected": -10.27975346491887, + "step": 2913 + }, + { + "epoch": 0.798684390845553, + "grad_norm": 10.8125, + "kl": 0.6271114349365234, + "learning_rate": 5e-06, + "logits/chosen": -20363786.666666668, + "logits/rejected": -7085421.866666666, + "logps/chosen": -441.3694118923611, + "logps/rejected": -447.6322916666667, + "loss": 0.047, + "rewards/chosen": 7.426598442925347, + "rewards/margins": 17.53453606499566, + "rewards/rejected": -10.107937622070313, + "step": 2914 + }, + { + "epoch": 0.7989584760860627, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 369002.4, + "logits/rejected": -8037789.714285715, + "logps/chosen": -425.56220703125, + "logps/rejected": -487.7893763950893, + "loss": 0.0354, + "rewards/chosen": 8.020793151855468, + "rewards/margins": 20.027528817313055, + "rewards/rejected": -12.006735665457589, + "step": 2915 + }, + { + "epoch": 0.7992325613265726, + "grad_norm": 7.78125, + "kl": 0.33002471923828125, + "learning_rate": 5e-06, + "logits/chosen": -19381149.53846154, + "logits/rejected": -24186141.09090909, + "logps/chosen": -357.67518028846155, + "logps/rejected": -601.8814808238636, + "loss": 0.0563, + "rewards/chosen": 6.958979679987981, + "rewards/margins": 19.730858009178323, + "rewards/rejected": -12.771878329190342, + "step": 2916 + }, + { + "epoch": 0.7995066465670824, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20600320.0, + "logits/rejected": -11648048.0, + "logps/chosen": -348.82598876953125, + "logps/rejected": -778.0691528320312, + "loss": 0.0021, + "rewards/chosen": 7.63640022277832, + "rewards/margins": 26.894121170043945, + "rewards/rejected": -19.257720947265625, + "step": 2917 + }, + { + "epoch": 0.7997807318075921, + "grad_norm": 0.83984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19067906.666666668, + "logits/rejected": -24057472.0, + "logps/chosen": -457.9903971354167, + "logps/rejected": -498.9789225260417, + "loss": 0.005, + "rewards/chosen": 8.65463383992513, + "rewards/margins": 21.039928436279297, + "rewards/rejected": -12.385294596354166, + "step": 2918 + }, + { + "epoch": 0.800054817048102, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9981502.545454545, + "logits/rejected": -28236930.46153846, + "logps/chosen": -326.73928000710225, + "logps/rejected": -512.1220703125, + "loss": 0.0222, + "rewards/chosen": 5.557456970214844, + "rewards/margins": 20.742880601149338, + "rewards/rejected": -15.185423630934496, + "step": 2919 + }, + { + "epoch": 0.8003289022886118, + "grad_norm": 3.078125, + "kl": 3.574104070663452, + "learning_rate": 5e-06, + "logits/chosen": -26542029.714285713, + "logits/rejected": -32193590.4, + "logps/chosen": -463.90980747767856, + "logps/rejected": -563.253173828125, + "loss": 0.0442, + "rewards/chosen": 6.649022783551898, + "rewards/margins": 21.921167864118303, + "rewards/rejected": -15.272145080566407, + "step": 2920 + }, + { + "epoch": 0.8006029875291215, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34219527.384615384, + "logits/rejected": -22968778.181818184, + "logps/chosen": -334.7506760817308, + "logps/rejected": -515.4572088068181, + "loss": 0.0099, + "rewards/chosen": 6.385186415452224, + "rewards/margins": 18.125788388552365, + "rewards/rejected": -11.740601973100143, + "step": 2921 + }, + { + "epoch": 0.8008770727696314, + "grad_norm": 3.46875, + "kl": 4.330951690673828, + "learning_rate": 5e-06, + "logits/chosen": -26629418.666666668, + "logits/rejected": -9926856.666666666, + "logps/chosen": -422.2461751302083, + "logps/rejected": -519.916748046875, + "loss": 0.0448, + "rewards/chosen": 6.756628672281901, + "rewards/margins": 15.638542811075848, + "rewards/rejected": -8.881914138793945, + "step": 2922 + }, + { + "epoch": 0.8011511580101411, + "grad_norm": 5.96875, + "kl": 2.7307441234588623, + "learning_rate": 5e-06, + "logits/chosen": -28224670.11764706, + "logits/rejected": -22201968.0, + "logps/chosen": -425.4627470128676, + "logps/rejected": -418.35693359375, + "loss": 0.0287, + "rewards/chosen": 6.145176607019761, + "rewards/margins": 15.300758490041524, + "rewards/rejected": -9.155581883021764, + "step": 2923 + }, + { + "epoch": 0.8014252432506509, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36894061.71428572, + "logits/rejected": -23973260.8, + "logps/chosen": -356.30106026785717, + "logps/rejected": -624.915771484375, + "loss": 0.0487, + "rewards/chosen": 7.2563983372279575, + "rewards/margins": 19.090651266915458, + "rewards/rejected": -11.8342529296875, + "step": 2924 + }, + { + "epoch": 0.8016993284911608, + "grad_norm": 7.96875, + "kl": 8.103200912475586, + "learning_rate": 5e-06, + "logits/chosen": -23452261.333333332, + "logits/rejected": -21548526.666666668, + "logps/chosen": -374.26904296875, + "logps/rejected": -523.2708740234375, + "loss": 0.0255, + "rewards/chosen": 7.594289779663086, + "rewards/margins": 20.57869784037272, + "rewards/rejected": -12.984408060709635, + "step": 2925 + }, + { + "epoch": 0.8019734137316705, + "grad_norm": 4.53125, + "kl": 2.862699508666992, + "learning_rate": 5e-06, + "logits/chosen": -47913767.384615384, + "logits/rejected": 42054231.27272727, + "logps/chosen": -457.60452974759613, + "logps/rejected": -796.3243075284091, + "loss": 0.018, + "rewards/chosen": 7.250506474421575, + "rewards/margins": 24.35631091778095, + "rewards/rejected": -17.105804443359375, + "step": 2926 + }, + { + "epoch": 0.8022474989721804, + "grad_norm": 0.416015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18354730.666666668, + "logits/rejected": -27435514.666666668, + "logps/chosen": -436.6797281901042, + "logps/rejected": -588.5630696614584, + "loss": 0.0013, + "rewards/chosen": 8.129596710205078, + "rewards/margins": 22.51305262247721, + "rewards/rejected": -14.383455912272135, + "step": 2927 + }, + { + "epoch": 0.8025215842126902, + "grad_norm": 8.75, + "kl": 10.480623245239258, + "learning_rate": 5e-06, + "logits/chosen": -36062011.733333334, + "logits/rejected": -25285424.0, + "logps/chosen": -400.3919270833333, + "logps/rejected": -569.7599826388889, + "loss": 0.0443, + "rewards/chosen": 7.3742116292317705, + "rewards/margins": 22.087604098849827, + "rewards/rejected": -14.713392469618055, + "step": 2928 + }, + { + "epoch": 0.8027956694531999, + "grad_norm": 4.15625, + "kl": 1.027836561203003, + "learning_rate": 5e-06, + "logits/chosen": -35880872.0, + "logits/rejected": -26169610.666666668, + "logps/chosen": -443.463134765625, + "logps/rejected": -492.2818196614583, + "loss": 0.018, + "rewards/chosen": 8.01871109008789, + "rewards/margins": 19.676308949788414, + "rewards/rejected": -11.657597859700521, + "step": 2929 + }, + { + "epoch": 0.8030697546937098, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17171837.53846154, + "logits/rejected": -30906885.818181816, + "logps/chosen": -410.69478665865387, + "logps/rejected": -766.5926846590909, + "loss": 0.0231, + "rewards/chosen": 7.331060556265024, + "rewards/margins": 27.818093359887186, + "rewards/rejected": -20.48703280362216, + "step": 2930 + }, + { + "epoch": 0.8033438399342195, + "grad_norm": 7.65625, + "kl": 4.8543524742126465, + "learning_rate": 5e-06, + "logits/chosen": -21033646.933333334, + "logits/rejected": -18897820.444444444, + "logps/chosen": -422.30872395833336, + "logps/rejected": -326.8385416666667, + "loss": 0.0361, + "rewards/chosen": 7.132804870605469, + "rewards/margins": 15.937447441948784, + "rewards/rejected": -8.804642571343315, + "step": 2931 + }, + { + "epoch": 0.8036179251747293, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27674080.0, + "logits/rejected": -26528112.0, + "logps/chosen": -281.83481852213544, + "logps/rejected": -749.6354166666666, + "loss": 0.0572, + "rewards/chosen": 4.551554997762044, + "rewards/margins": 18.24353090922038, + "rewards/rejected": -13.691975911458334, + "step": 2932 + }, + { + "epoch": 0.8038920104152392, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5200148.666666667, + "logits/rejected": -3603324.0, + "logps/chosen": -406.7721354166667, + "logps/rejected": -664.7139078776041, + "loss": 0.0087, + "rewards/chosen": 7.498743057250977, + "rewards/margins": 20.823628107706703, + "rewards/rejected": -13.324885050455729, + "step": 2933 + }, + { + "epoch": 0.8041660956557489, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30086699.42857143, + "logits/rejected": -29628275.2, + "logps/chosen": -369.98304966517856, + "logps/rejected": -560.1966796875, + "loss": 0.042, + "rewards/chosen": 5.6320005144391745, + "rewards/margins": 19.840145656040736, + "rewards/rejected": -14.208145141601562, + "step": 2934 + }, + { + "epoch": 0.8044401808962587, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19608496.0, + "logits/rejected": -29066429.333333332, + "logps/chosen": -418.1909586588542, + "logps/rejected": -608.1735432942709, + "loss": 0.0325, + "rewards/chosen": 6.99560546875, + "rewards/margins": 20.725781758626304, + "rewards/rejected": -13.730176289876303, + "step": 2935 + }, + { + "epoch": 0.8047142661367686, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22481371.636363637, + "logits/rejected": -33360049.230769232, + "logps/chosen": -515.2134232954545, + "logps/rejected": -554.0128455528846, + "loss": 0.0158, + "rewards/chosen": 8.098770141601562, + "rewards/margins": 22.151452871469353, + "rewards/rejected": -14.052682729867788, + "step": 2936 + }, + { + "epoch": 0.8049883513772783, + "grad_norm": 6.09375, + "kl": 7.791644096374512, + "learning_rate": 5e-06, + "logits/chosen": -11964154.285714285, + "logits/rejected": -29385641.6, + "logps/chosen": -430.4310825892857, + "logps/rejected": -488.467529296875, + "loss": 0.0569, + "rewards/chosen": 6.533732822963169, + "rewards/margins": 19.30753348214286, + "rewards/rejected": -12.773800659179688, + "step": 2937 + }, + { + "epoch": 0.8052624366177882, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16395808.0, + "logits/rejected": -28277008.0, + "logps/chosen": -396.94998604910717, + "logps/rejected": -543.46337890625, + "loss": 0.0119, + "rewards/chosen": 5.666681562151227, + "rewards/margins": 19.31331089564732, + "rewards/rejected": -13.646629333496094, + "step": 2938 + }, + { + "epoch": 0.8055365218582979, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17269660.0, + "logits/rejected": -27539412.0, + "logps/chosen": -482.7665710449219, + "logps/rejected": -592.7437744140625, + "loss": 0.0226, + "rewards/chosen": 6.707770347595215, + "rewards/margins": 19.889079093933105, + "rewards/rejected": -13.18130874633789, + "step": 2939 + }, + { + "epoch": 0.8058106070988077, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27833422.222222224, + "logits/rejected": -37424128.0, + "logps/chosen": -494.61756727430554, + "logps/rejected": -776.6604166666667, + "loss": 0.0037, + "rewards/chosen": 6.700508541531033, + "rewards/margins": 25.3893675910102, + "rewards/rejected": -18.688859049479166, + "step": 2940 + }, + { + "epoch": 0.8060846923393176, + "grad_norm": 5.09375, + "kl": 7.036757946014404, + "learning_rate": 5e-06, + "logits/chosen": -20827485.866666667, + "logits/rejected": -22027027.555555556, + "logps/chosen": -420.9982421875, + "logps/rejected": -520.2938368055555, + "loss": 0.031, + "rewards/chosen": 6.356710815429688, + "rewards/margins": 20.84026353624132, + "rewards/rejected": -14.483552720811632, + "step": 2941 + }, + { + "epoch": 0.8063587775798273, + "grad_norm": 9.0625, + "kl": 2.078566312789917, + "learning_rate": 5e-06, + "logits/chosen": -26891378.285714287, + "logits/rejected": -56886233.6, + "logps/chosen": -499.06236049107144, + "logps/rejected": -480.240869140625, + "loss": 0.0458, + "rewards/chosen": 7.3634507315499445, + "rewards/margins": 21.17252665928432, + "rewards/rejected": -13.809075927734375, + "step": 2942 + }, + { + "epoch": 0.8066328628203371, + "grad_norm": 12.4375, + "kl": 4.668548583984375, + "learning_rate": 5e-06, + "logits/chosen": -6060640.615384615, + "logits/rejected": 808270.5454545454, + "logps/chosen": -502.58984375, + "logps/rejected": -407.40602805397725, + "loss": 0.0467, + "rewards/chosen": 7.11350602370042, + "rewards/margins": 16.37787078644012, + "rewards/rejected": -9.264364762739701, + "step": 2943 + }, + { + "epoch": 0.806906948060847, + "grad_norm": 18.125, + "kl": 8.545214653015137, + "learning_rate": 5e-06, + "logits/chosen": -28295619.76470588, + "logits/rejected": -15443017.142857144, + "logps/chosen": -438.84978170955884, + "logps/rejected": -639.5335518973214, + "loss": 0.0951, + "rewards/chosen": 6.649835923138787, + "rewards/margins": 21.07633106648421, + "rewards/rejected": -14.426495143345424, + "step": 2944 + }, + { + "epoch": 0.8071810333013567, + "grad_norm": 5.84375, + "kl": 3.3766937255859375, + "learning_rate": 5e-06, + "logits/chosen": -5277508.923076923, + "logits/rejected": 901636.9090909091, + "logps/chosen": -494.52261117788464, + "logps/rejected": -563.0690252130681, + "loss": 0.029, + "rewards/chosen": 5.573218712439904, + "rewards/margins": 16.930917326386993, + "rewards/rejected": -11.357698613947088, + "step": 2945 + }, + { + "epoch": 0.8074551185418665, + "grad_norm": 6.28125, + "kl": 1.8419723510742188, + "learning_rate": 5e-06, + "logits/chosen": -20941074.90909091, + "logits/rejected": -42443416.615384616, + "logps/chosen": -426.8729137073864, + "logps/rejected": -441.58556189903845, + "loss": 0.0624, + "rewards/chosen": 8.047550548206676, + "rewards/margins": 18.112287721433837, + "rewards/rejected": -10.064737173227163, + "step": 2946 + }, + { + "epoch": 0.8077292037823763, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7289088.0, + "logits/rejected": -42775419.07692308, + "logps/chosen": -302.33320756392044, + "logps/rejected": -544.9800555889423, + "loss": 0.014, + "rewards/chosen": 6.763268904252485, + "rewards/margins": 23.723673520388303, + "rewards/rejected": -16.96040461613582, + "step": 2947 + }, + { + "epoch": 0.8080032890228861, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 32293326.769230768, + "logits/rejected": -884780.3636363636, + "logps/chosen": -395.24906099759613, + "logps/rejected": -566.7301136363636, + "loss": 0.0437, + "rewards/chosen": 6.610713078425481, + "rewards/margins": 21.693059214345226, + "rewards/rejected": -15.082346135919744, + "step": 2948 + }, + { + "epoch": 0.808277374263396, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58663590.4, + "logits/rejected": 1601457.7142857143, + "logps/chosen": -420.77421875, + "logps/rejected": -648.6959402901786, + "loss": 0.0139, + "rewards/chosen": 6.930445861816406, + "rewards/margins": 20.57920597621373, + "rewards/rejected": -13.648760114397321, + "step": 2949 + }, + { + "epoch": 0.8085514595039057, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24479936.0, + "logits/rejected": -40808372.36363637, + "logps/chosen": -474.42578125, + "logps/rejected": -508.34774502840907, + "loss": 0.0064, + "rewards/chosen": 7.334375234750601, + "rewards/margins": 21.449523605666794, + "rewards/rejected": -14.115148370916193, + "step": 2950 + }, + { + "epoch": 0.8088255447444155, + "grad_norm": 5.34375, + "kl": 3.905163049697876, + "learning_rate": 5e-06, + "logits/chosen": -19050910.222222224, + "logits/rejected": -9231565.333333334, + "logps/chosen": -384.64911566840277, + "logps/rejected": -608.7244466145834, + "loss": 0.0502, + "rewards/chosen": 6.142951117621528, + "rewards/margins": 16.662147945827908, + "rewards/rejected": -10.51919682820638, + "step": 2951 + }, + { + "epoch": 0.8090996299849254, + "grad_norm": 0.275390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26381357.333333332, + "logits/rejected": -26193498.666666668, + "logps/chosen": -365.5855305989583, + "logps/rejected": -580.7435980902778, + "loss": 0.0008, + "rewards/chosen": 7.046719233194987, + "rewards/margins": 19.308076858520508, + "rewards/rejected": -12.261357625325521, + "step": 2952 + }, + { + "epoch": 0.8093737152254351, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30873353.6, + "logits/rejected": -24274498.285714287, + "logps/chosen": -375.7708984375, + "logps/rejected": -558.0650111607143, + "loss": 0.0187, + "rewards/chosen": 7.397855377197265, + "rewards/margins": 18.860369982038225, + "rewards/rejected": -11.46251460484096, + "step": 2953 + }, + { + "epoch": 0.8096478004659449, + "grad_norm": 8.9375, + "kl": 3.4640414714813232, + "learning_rate": 5e-06, + "logits/chosen": -38053991.384615384, + "logits/rejected": -26326909.09090909, + "logps/chosen": -314.2931565504808, + "logps/rejected": -478.51167436079544, + "loss": 0.0595, + "rewards/chosen": 5.784198467548077, + "rewards/margins": 16.148962594412424, + "rewards/rejected": -10.364764126864346, + "step": 2954 + }, + { + "epoch": 0.8099218857064547, + "grad_norm": 6.4375, + "kl": 2.7148895263671875, + "learning_rate": 5e-06, + "logits/chosen": -10946038.153846154, + "logits/rejected": -20422052.363636363, + "logps/chosen": -414.9625901442308, + "logps/rejected": -558.9720791903409, + "loss": 0.0307, + "rewards/chosen": 6.870703477125901, + "rewards/margins": 18.9010726288482, + "rewards/rejected": -12.0303691517223, + "step": 2955 + }, + { + "epoch": 0.8101959709469645, + "grad_norm": 4.59375, + "kl": 3.007747173309326, + "learning_rate": 5e-06, + "logits/chosen": -22422232.615384616, + "logits/rejected": -25954251.636363637, + "logps/chosen": -428.65054086538464, + "logps/rejected": -506.9098455255682, + "loss": 0.0162, + "rewards/chosen": 8.473077627328726, + "rewards/margins": 21.928491445688103, + "rewards/rejected": -13.455413818359375, + "step": 2956 + }, + { + "epoch": 0.8104700561874743, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11421923.692307692, + "logits/rejected": 71512250.18181819, + "logps/chosen": -458.71131310096155, + "logps/rejected": -561.29443359375, + "loss": 0.0529, + "rewards/chosen": 6.770052396334135, + "rewards/margins": 22.32816896238527, + "rewards/rejected": -15.558116566051137, + "step": 2957 + }, + { + "epoch": 0.8107441414279841, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16567880.0, + "logits/rejected": -2729865.4, + "logps/chosen": -503.70455496651783, + "logps/rejected": -656.27470703125, + "loss": 0.0078, + "rewards/chosen": 7.906960623604911, + "rewards/margins": 24.267351858956474, + "rewards/rejected": -16.360391235351564, + "step": 2958 + }, + { + "epoch": 0.8110182266684939, + "grad_norm": 6.0625, + "kl": 14.26395320892334, + "learning_rate": 5e-06, + "logits/chosen": -22053882.0, + "logits/rejected": -4269037.5, + "logps/chosen": -369.7419738769531, + "logps/rejected": -526.8917236328125, + "loss": 0.0919, + "rewards/chosen": 6.275437355041504, + "rewards/margins": 19.92252540588379, + "rewards/rejected": -13.647088050842285, + "step": 2959 + }, + { + "epoch": 0.8112923119090037, + "grad_norm": 4.25, + "kl": 2.05066180229187, + "learning_rate": 5e-06, + "logits/chosen": 18513959.384615384, + "logits/rejected": -35290926.54545455, + "logps/chosen": -457.1486253004808, + "logps/rejected": -539.8262606534091, + "loss": 0.0168, + "rewards/chosen": 8.144960256723257, + "rewards/margins": 21.722227990210474, + "rewards/rejected": -13.577267733487217, + "step": 2960 + }, + { + "epoch": 0.8115663971495135, + "grad_norm": 7.4375, + "kl": 5.439858913421631, + "learning_rate": 5e-06, + "logits/chosen": -15087093.333333334, + "logits/rejected": -47844010.666666664, + "logps/chosen": -389.4254150390625, + "logps/rejected": -633.0452880859375, + "loss": 0.0607, + "rewards/chosen": 5.270134290059407, + "rewards/margins": 18.22128454844157, + "rewards/rejected": -12.951150258382162, + "step": 2961 + }, + { + "epoch": 0.8118404823900233, + "grad_norm": 3.203125, + "kl": 1.9766597747802734, + "learning_rate": 5e-06, + "logits/chosen": -26106938.181818184, + "logits/rejected": -33657127.384615384, + "logps/chosen": -376.1188299005682, + "logps/rejected": -605.2761418269231, + "loss": 0.0202, + "rewards/chosen": 6.974944374778054, + "rewards/margins": 19.945694449898244, + "rewards/rejected": -12.970750075120192, + "step": 2962 + }, + { + "epoch": 0.8121145676305331, + "grad_norm": 1.171875, + "kl": 3.767390012741089, + "learning_rate": 5e-06, + "logits/chosen": -28422100.363636363, + "logits/rejected": -16638359.384615384, + "logps/chosen": -451.35031960227275, + "logps/rejected": -491.3792067307692, + "loss": 0.0439, + "rewards/chosen": 7.148022738370028, + "rewards/margins": 17.19126497282015, + "rewards/rejected": -10.04324223445012, + "step": 2963 + }, + { + "epoch": 0.8123886528710429, + "grad_norm": 5.34375, + "kl": 1.576348066329956, + "learning_rate": 5e-06, + "logits/chosen": -14208157.538461538, + "logits/rejected": -20022884.363636363, + "logps/chosen": -378.2539813701923, + "logps/rejected": -628.4304421164773, + "loss": 0.0448, + "rewards/chosen": 6.2001471886268025, + "rewards/margins": 20.101741870800097, + "rewards/rejected": -13.901594682173295, + "step": 2964 + }, + { + "epoch": 0.8126627381115527, + "grad_norm": 6.3125, + "kl": 8.471686363220215, + "learning_rate": 5e-06, + "logits/chosen": -16724442.666666666, + "logits/rejected": -13405389.333333334, + "logps/chosen": -442.5685221354167, + "logps/rejected": -662.3993326822916, + "loss": 0.0728, + "rewards/chosen": 7.8514353434244795, + "rewards/margins": 21.22769546508789, + "rewards/rejected": -13.376260121663412, + "step": 2965 + }, + { + "epoch": 0.8129368233520625, + "grad_norm": 20.0, + "kl": 3.14251708984375, + "learning_rate": 5e-06, + "logits/chosen": -11972701.714285715, + "logits/rejected": -28084662.4, + "logps/chosen": -391.0844029017857, + "logps/rejected": -736.84375, + "loss": 0.0493, + "rewards/chosen": 5.244922637939453, + "rewards/margins": 23.518909454345703, + "rewards/rejected": -18.27398681640625, + "step": 2966 + }, + { + "epoch": 0.8132109085925723, + "grad_norm": 10.375, + "kl": 0.1015116423368454, + "learning_rate": 5e-06, + "logits/chosen": -7633789.6, + "logits/rejected": -18587333.714285713, + "logps/chosen": -513.6568359375, + "logps/rejected": -534.4659598214286, + "loss": 0.0288, + "rewards/chosen": 6.311717224121094, + "rewards/margins": 17.713475690569197, + "rewards/rejected": -11.401758466448102, + "step": 2967 + }, + { + "epoch": 0.813484993833082, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17486032.0, + "logits/rejected": -27331108.0, + "logps/chosen": -468.76312255859375, + "logps/rejected": -573.9779052734375, + "loss": 0.0046, + "rewards/chosen": 7.12348747253418, + "rewards/margins": 19.67691707611084, + "rewards/rejected": -12.55342960357666, + "step": 2968 + }, + { + "epoch": 0.8137590790735919, + "grad_norm": 2.71875, + "kl": 5.020530700683594, + "learning_rate": 5e-06, + "logits/chosen": 23095788.307692308, + "logits/rejected": 780779.2727272727, + "logps/chosen": -489.98839393028845, + "logps/rejected": -718.5211736505681, + "loss": 0.005, + "rewards/chosen": 7.797354478102464, + "rewards/margins": 23.59219867199451, + "rewards/rejected": -15.794844193892045, + "step": 2969 + }, + { + "epoch": 0.8140331643141017, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16318068.0, + "logits/rejected": 38590844.0, + "logps/chosen": -421.98602294921875, + "logps/rejected": -448.3902893066406, + "loss": 0.0262, + "rewards/chosen": 7.053119659423828, + "rewards/margins": 16.789642333984375, + "rewards/rejected": -9.736522674560547, + "step": 2970 + }, + { + "epoch": 0.8143072495546115, + "grad_norm": 5.8125, + "kl": 2.75313138961792, + "learning_rate": 5e-06, + "logits/chosen": -20057803.42857143, + "logits/rejected": -20299928.0, + "logps/chosen": -430.57861328125, + "logps/rejected": -664.78720703125, + "loss": 0.0146, + "rewards/chosen": 8.344140189034599, + "rewards/margins": 22.850966971261162, + "rewards/rejected": -14.506826782226563, + "step": 2971 + }, + { + "epoch": 0.8145813347951213, + "grad_norm": 2.734375, + "kl": 0.10400199890136719, + "learning_rate": 5e-06, + "logits/chosen": -14997332.266666668, + "logits/rejected": -30720679.111111112, + "logps/chosen": -369.26438802083334, + "logps/rejected": -573.2050238715278, + "loss": 0.0233, + "rewards/chosen": 6.856388854980469, + "rewards/margins": 20.97302025689019, + "rewards/rejected": -14.116631401909721, + "step": 2972 + }, + { + "epoch": 0.814855420035631, + "grad_norm": 8.375, + "kl": 19.15093231201172, + "learning_rate": 5e-06, + "logits/chosen": -19712149.818181816, + "logits/rejected": 850596.3076923077, + "logps/chosen": -545.1421342329545, + "logps/rejected": -534.6522686298077, + "loss": 0.0618, + "rewards/chosen": 7.29572226784446, + "rewards/margins": 17.60126660753797, + "rewards/rejected": -10.30554433969351, + "step": 2973 + }, + { + "epoch": 0.8151295052761409, + "grad_norm": 2.78125, + "kl": 1.4147758483886719, + "learning_rate": 5e-06, + "logits/chosen": -17074224.0, + "logits/rejected": -22739499.636363637, + "logps/chosen": -496.0793644831731, + "logps/rejected": -548.9930308948864, + "loss": 0.0074, + "rewards/chosen": 8.744147667518028, + "rewards/margins": 19.73067970542641, + "rewards/rejected": -10.98653203790838, + "step": 2974 + }, + { + "epoch": 0.8154035905166507, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24467303.111111112, + "logits/rejected": -28135136.0, + "logps/chosen": -554.4782443576389, + "logps/rejected": -570.5274739583333, + "loss": 0.0331, + "rewards/chosen": 6.034270392523871, + "rewards/margins": 20.841996426052518, + "rewards/rejected": -14.807726033528645, + "step": 2975 + }, + { + "epoch": 0.8156776757571604, + "grad_norm": 10.4375, + "kl": 7.5776286125183105, + "learning_rate": 5e-06, + "logits/chosen": -6791384.5, + "logits/rejected": -25988052.0, + "logps/chosen": -428.77789306640625, + "logps/rejected": -496.07861328125, + "loss": 0.0662, + "rewards/chosen": 6.112661838531494, + "rewards/margins": 20.003032207489014, + "rewards/rejected": -13.89037036895752, + "step": 2976 + }, + { + "epoch": 0.8159517609976703, + "grad_norm": 4.78125, + "kl": 2.277632474899292, + "learning_rate": 5e-06, + "logits/chosen": 44741225.14285714, + "logits/rejected": -17970971.2, + "logps/chosen": -519.955810546875, + "logps/rejected": -503.06640625, + "loss": 0.0216, + "rewards/chosen": 7.192353929792132, + "rewards/margins": 20.492331041608537, + "rewards/rejected": -13.299977111816407, + "step": 2977 + }, + { + "epoch": 0.8162258462381801, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24008953.14285714, + "logits/rejected": -26808568.470588237, + "logps/chosen": -541.2309919084821, + "logps/rejected": -439.03466796875, + "loss": 0.0242, + "rewards/chosen": 8.752831050327845, + "rewards/margins": 19.973377997133912, + "rewards/rejected": -11.220546946806067, + "step": 2978 + }, + { + "epoch": 0.8164999314786898, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11756656.0, + "logits/rejected": -31497262.769230768, + "logps/chosen": -453.7935901988636, + "logps/rejected": -653.7888371394231, + "loss": 0.0462, + "rewards/chosen": 7.102746443314985, + "rewards/margins": 18.796562408233857, + "rewards/rejected": -11.69381596491887, + "step": 2979 + }, + { + "epoch": 0.8167740167191997, + "grad_norm": 9.0625, + "kl": 10.110336303710938, + "learning_rate": 5e-06, + "logits/chosen": -15654452.0, + "logits/rejected": -5168475.0, + "logps/chosen": -431.0262145996094, + "logps/rejected": -664.0042114257812, + "loss": 0.1107, + "rewards/chosen": 7.65463924407959, + "rewards/margins": 23.007784843444824, + "rewards/rejected": -15.353145599365234, + "step": 2980 + }, + { + "epoch": 0.8170481019597094, + "grad_norm": 10.875, + "kl": 4.859006404876709, + "learning_rate": 5e-06, + "logits/chosen": -7422132.363636363, + "logits/rejected": -20626980.923076924, + "logps/chosen": -387.1651722301136, + "logps/rejected": -608.6954627403846, + "loss": 0.0571, + "rewards/chosen": 5.116176258433949, + "rewards/margins": 17.466799756030102, + "rewards/rejected": -12.350623497596153, + "step": 2981 + }, + { + "epoch": 0.8173221872002193, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15055160.727272727, + "logits/rejected": -13387702.153846154, + "logps/chosen": -377.80411044034093, + "logps/rejected": -437.5157001201923, + "loss": 0.0244, + "rewards/chosen": 7.635220614346591, + "rewards/margins": 16.822064486416902, + "rewards/rejected": -9.186843872070312, + "step": 2982 + }, + { + "epoch": 0.8175962724407291, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27670130.666666668, + "logits/rejected": -38964477.333333336, + "logps/chosen": -429.8306477864583, + "logps/rejected": -542.7388509114584, + "loss": 0.0124, + "rewards/chosen": 6.6639862060546875, + "rewards/margins": 18.833267211914062, + "rewards/rejected": -12.169281005859375, + "step": 2983 + }, + { + "epoch": 0.8178703576812388, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17262540.0, + "logits/rejected": -2897438.0, + "logps/chosen": -434.28564453125, + "logps/rejected": -653.8324381510416, + "loss": 0.006, + "rewards/chosen": 7.121072769165039, + "rewards/margins": 19.877484003702797, + "rewards/rejected": -12.75641123453776, + "step": 2984 + }, + { + "epoch": 0.8181444429217487, + "grad_norm": 4.15625, + "kl": 7.859609603881836, + "learning_rate": 5e-06, + "logits/chosen": -10109862.153846154, + "logits/rejected": -2652080.0, + "logps/chosen": -307.12300931490387, + "logps/rejected": -439.23885830965907, + "loss": 0.0476, + "rewards/chosen": 6.414438687838041, + "rewards/margins": 16.3916926884151, + "rewards/rejected": -9.97725400057706, + "step": 2985 + }, + { + "epoch": 0.8184185281622585, + "grad_norm": 0.91015625, + "kl": 1.9727885723114014, + "learning_rate": 5e-06, + "logits/chosen": -26265410.46153846, + "logits/rejected": -14141396.363636363, + "logps/chosen": -357.83743990384613, + "logps/rejected": -534.8548029119319, + "loss": 0.0026, + "rewards/chosen": 7.879537729116587, + "rewards/margins": 19.417453739192936, + "rewards/rejected": -11.53791601007635, + "step": 2986 + }, + { + "epoch": 0.8186926134027682, + "grad_norm": 4.875, + "kl": 3.3638153076171875, + "learning_rate": 5e-06, + "logits/chosen": -31935040.0, + "logits/rejected": -11557444.0, + "logps/chosen": -514.8031005859375, + "logps/rejected": -506.956787109375, + "loss": 0.0148, + "rewards/chosen": 8.161571502685547, + "rewards/margins": 20.69153722127279, + "rewards/rejected": -12.52996571858724, + "step": 2987 + }, + { + "epoch": 0.8189666986432781, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26517702.4, + "logits/rejected": -31571753.14285714, + "logps/chosen": -354.1538330078125, + "logps/rejected": -660.8715122767857, + "loss": 0.0366, + "rewards/chosen": 5.70169677734375, + "rewards/margins": 20.402923148018974, + "rewards/rejected": -14.701226370675224, + "step": 2988 + }, + { + "epoch": 0.8192407838837878, + "grad_norm": 1.015625, + "kl": 2.6749885082244873, + "learning_rate": 5e-06, + "logits/chosen": -4922218.4, + "logits/rejected": 38612086.85714286, + "logps/chosen": -384.48974609375, + "logps/rejected": -654.6392996651786, + "loss": 0.0019, + "rewards/chosen": 8.687699890136718, + "rewards/margins": 25.615123639787946, + "rewards/rejected": -16.92742374965123, + "step": 2989 + }, + { + "epoch": 0.8195148691242976, + "grad_norm": 10.125, + "kl": 2.5866122245788574, + "learning_rate": 5e-06, + "logits/chosen": -10418299.636363637, + "logits/rejected": -26673147.076923076, + "logps/chosen": -418.6550958806818, + "logps/rejected": -492.8913386418269, + "loss": 0.0596, + "rewards/chosen": 5.944646661931818, + "rewards/margins": 17.797466251399968, + "rewards/rejected": -11.85281958946815, + "step": 2990 + }, + { + "epoch": 0.8197889543648075, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3995639.3333333335, + "logits/rejected": -57402890.666666664, + "logps/chosen": -452.4641520182292, + "logps/rejected": -508.5266927083333, + "loss": 0.0477, + "rewards/chosen": 5.7850290934244795, + "rewards/margins": 17.724191029866535, + "rewards/rejected": -11.939161936442057, + "step": 2991 + }, + { + "epoch": 0.8200630396053172, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42541933.333333336, + "logits/rejected": -32869034.666666668, + "logps/chosen": -343.7786051432292, + "logps/rejected": -452.11317274305554, + "loss": 0.052, + "rewards/chosen": 6.888851801554362, + "rewards/margins": 17.752580642700195, + "rewards/rejected": -10.863728841145834, + "step": 2992 + }, + { + "epoch": 0.8203371248458271, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18313526.0, + "logits/rejected": -23776142.0, + "logps/chosen": -462.21185302734375, + "logps/rejected": -491.2329406738281, + "loss": 0.0081, + "rewards/chosen": 10.152057647705078, + "rewards/margins": 23.600571632385254, + "rewards/rejected": -13.448513984680176, + "step": 2993 + }, + { + "epoch": 0.8206112100863369, + "grad_norm": 5.90625, + "kl": 1.3726876974105835, + "learning_rate": 5e-06, + "logits/chosen": -16806562.90909091, + "logits/rejected": -35680305.23076923, + "logps/chosen": -400.30215731534093, + "logps/rejected": -555.7129657451923, + "loss": 0.0354, + "rewards/chosen": 7.981561834161932, + "rewards/margins": 24.826017259717823, + "rewards/rejected": -16.84445542555589, + "step": 2994 + }, + { + "epoch": 0.8208852953268466, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7523565.090909091, + "logits/rejected": -24992110.769230768, + "logps/chosen": -497.6539417613636, + "logps/rejected": -513.3008563701923, + "loss": 0.058, + "rewards/chosen": 7.097227616743608, + "rewards/margins": 19.831370213648658, + "rewards/rejected": -12.734142596905048, + "step": 2995 + }, + { + "epoch": 0.8211593805673565, + "grad_norm": 8.0625, + "kl": 3.773115873336792, + "learning_rate": 5e-06, + "logits/chosen": -11801846.153846154, + "logits/rejected": -40994144.0, + "logps/chosen": -379.5383864182692, + "logps/rejected": -623.8870738636364, + "loss": 0.0451, + "rewards/chosen": 5.752745995154748, + "rewards/margins": 23.725475764774774, + "rewards/rejected": -17.972729769620027, + "step": 2996 + }, + { + "epoch": 0.8214334658078662, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28500652.307692308, + "logits/rejected": -34337890.90909091, + "logps/chosen": -404.8415715144231, + "logps/rejected": -483.14990234375, + "loss": 0.003, + "rewards/chosen": 7.616997938889724, + "rewards/margins": 21.894153034770405, + "rewards/rejected": -14.277155095880682, + "step": 2997 + }, + { + "epoch": 0.821707551048376, + "grad_norm": 2.5625, + "kl": 2.7491111755371094, + "learning_rate": 5e-06, + "logits/chosen": -33156652.307692308, + "logits/rejected": -17913588.363636363, + "logps/chosen": -595.7551832932693, + "logps/rejected": -584.4755415482955, + "loss": 0.01, + "rewards/chosen": 7.499713604266827, + "rewards/margins": 19.729780077100635, + "rewards/rejected": -12.230066472833807, + "step": 2998 + }, + { + "epoch": 0.8219816362888859, + "grad_norm": 0.80859375, + "kl": 3.7713470458984375, + "learning_rate": 5e-06, + "logits/chosen": -14506625.777777778, + "logits/rejected": -20931946.666666668, + "logps/chosen": -509.29448784722223, + "logps/rejected": -545.9958984375, + "loss": 0.0026, + "rewards/chosen": 8.472345140245226, + "rewards/margins": 21.816449144151477, + "rewards/rejected": -13.34410400390625, + "step": 2999 + }, + { + "epoch": 0.8222557215293956, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13271334.153846154, + "logits/rejected": -21166800.0, + "logps/chosen": -454.89637169471155, + "logps/rejected": -572.3176935369319, + "loss": 0.0228, + "rewards/chosen": 6.069163395808293, + "rewards/margins": 19.80192235133031, + "rewards/rejected": -13.732758955522018, + "step": 3000 + }, + { + "epoch": 0.8225298067699054, + "grad_norm": 11.0625, + "kl": 3.4555444717407227, + "learning_rate": 5e-06, + "logits/chosen": -9246415.333333334, + "logits/rejected": -27238733.333333332, + "logps/chosen": -342.3016764322917, + "logps/rejected": -799.7574055989584, + "loss": 0.0712, + "rewards/chosen": 4.938561121622722, + "rewards/margins": 31.755221684773765, + "rewards/rejected": -26.816660563151043, + "step": 3001 + }, + { + "epoch": 0.8228038920104153, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32676265.14285714, + "logits/rejected": -28839788.8, + "logps/chosen": -377.77218191964283, + "logps/rejected": -451.23876953125, + "loss": 0.0246, + "rewards/chosen": 6.521275111607143, + "rewards/margins": 19.121326991489955, + "rewards/rejected": -12.600051879882812, + "step": 3002 + }, + { + "epoch": 0.823077977250925, + "grad_norm": 6.5625, + "kl": 2.6847548484802246, + "learning_rate": 5e-06, + "logits/chosen": -13933280.0, + "logits/rejected": -19975076.363636363, + "logps/chosen": -417.23715444711536, + "logps/rejected": -714.4341264204545, + "loss": 0.0162, + "rewards/chosen": 5.869915301983173, + "rewards/margins": 20.264780858179904, + "rewards/rejected": -14.394865556196732, + "step": 3003 + }, + { + "epoch": 0.8233520624914349, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36323122.28571428, + "logits/rejected": -23382881.88235294, + "logps/chosen": -444.0166713169643, + "logps/rejected": -601.1120749080883, + "loss": 0.0536, + "rewards/chosen": 5.901943751743862, + "rewards/margins": 20.738505964519597, + "rewards/rejected": -14.836562212775736, + "step": 3004 + }, + { + "epoch": 0.8236261477319446, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19234588.307692308, + "logits/rejected": 13941218.909090908, + "logps/chosen": -371.7841796875, + "logps/rejected": -729.6188299005681, + "loss": 0.0466, + "rewards/chosen": 5.383524968073918, + "rewards/margins": 21.704433948009996, + "rewards/rejected": -16.32090897993608, + "step": 3005 + }, + { + "epoch": 0.8239002329724544, + "grad_norm": 8.9375, + "kl": 22.187721252441406, + "learning_rate": 5e-06, + "logits/chosen": -34458960.0, + "logits/rejected": -93212512.0, + "logps/chosen": -416.24029541015625, + "logps/rejected": -459.9315185546875, + "loss": 0.0741, + "rewards/chosen": 7.69056510925293, + "rewards/margins": 21.524577140808105, + "rewards/rejected": -13.834012031555176, + "step": 3006 + }, + { + "epoch": 0.8241743182129643, + "grad_norm": 6.25, + "kl": 5.665071964263916, + "learning_rate": 5e-06, + "logits/chosen": -39412770.13333333, + "logits/rejected": -52291712.0, + "logps/chosen": -563.7427734375, + "logps/rejected": -495.03559027777777, + "loss": 0.037, + "rewards/chosen": 7.526112874348958, + "rewards/margins": 19.877830505371094, + "rewards/rejected": -12.351717631022135, + "step": 3007 + }, + { + "epoch": 0.824448403453474, + "grad_norm": 13.4375, + "kl": 1.7380092144012451, + "learning_rate": 5e-06, + "logits/chosen": -13707976.888888888, + "logits/rejected": -34175042.666666664, + "logps/chosen": -344.17811414930554, + "logps/rejected": -337.23158772786456, + "loss": 0.0785, + "rewards/chosen": 6.57991706000434, + "rewards/margins": 14.1486267513699, + "rewards/rejected": -7.56870969136556, + "step": 3008 + }, + { + "epoch": 0.8247224886939838, + "grad_norm": 1.828125, + "kl": 3.364922523498535, + "learning_rate": 5e-06, + "logits/chosen": -38685673.84615385, + "logits/rejected": -32130615.272727273, + "logps/chosen": -495.51915564903845, + "logps/rejected": -526.1516779119319, + "loss": 0.0129, + "rewards/chosen": 7.4232342059795675, + "rewards/margins": 20.032634174907123, + "rewards/rejected": -12.609399968927557, + "step": 3009 + }, + { + "epoch": 0.8249965739344937, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41562493.333333336, + "logits/rejected": -23333546.666666668, + "logps/chosen": -474.4164225260417, + "logps/rejected": -473.8439534505208, + "loss": 0.0434, + "rewards/chosen": 6.725715637207031, + "rewards/margins": 18.683497111002602, + "rewards/rejected": -11.957781473795572, + "step": 3010 + }, + { + "epoch": 0.8252706591750034, + "grad_norm": 6.5, + "kl": 10.892541885375977, + "learning_rate": 5e-06, + "logits/chosen": -24624758.153846152, + "logits/rejected": -47294330.18181818, + "logps/chosen": -334.49755859375, + "logps/rejected": -592.3669211647727, + "loss": 0.0211, + "rewards/chosen": 7.3174297626201925, + "rewards/margins": 22.280776764129424, + "rewards/rejected": -14.963347001509232, + "step": 3011 + }, + { + "epoch": 0.8255447444155132, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12417512.0, + "logits/rejected": -12087810.666666666, + "logps/chosen": -456.8170166015625, + "logps/rejected": -549.5262451171875, + "loss": 0.034, + "rewards/chosen": 5.349393844604492, + "rewards/margins": 15.826118469238281, + "rewards/rejected": -10.476724624633789, + "step": 3012 + }, + { + "epoch": 0.825818829656023, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -709019.1428571428, + "logits/rejected": -27893536.0, + "logps/chosen": -385.78787667410717, + "logps/rejected": -687.84482421875, + "loss": 0.041, + "rewards/chosen": 7.548550197056362, + "rewards/margins": 19.257097407749722, + "rewards/rejected": -11.708547210693359, + "step": 3013 + }, + { + "epoch": 0.8260929148965328, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34967947.63636363, + "logits/rejected": -22642870.153846152, + "logps/chosen": -464.92622514204544, + "logps/rejected": -526.8941556490385, + "loss": 0.0187, + "rewards/chosen": 8.208550886674361, + "rewards/margins": 18.916415501307775, + "rewards/rejected": -10.707864614633413, + "step": 3014 + }, + { + "epoch": 0.8263670001370427, + "grad_norm": 9.25, + "kl": 18.249536514282227, + "learning_rate": 5e-06, + "logits/chosen": -21478725.333333332, + "logits/rejected": 3685354.6666666665, + "logps/chosen": -430.40814887152777, + "logps/rejected": -613.0563151041666, + "loss": 0.1902, + "rewards/chosen": 7.325974358452691, + "rewards/margins": 20.398167504204643, + "rewards/rejected": -13.072193145751953, + "step": 3015 + }, + { + "epoch": 0.8266410853775524, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26915093.333333332, + "logits/rejected": -38758842.666666664, + "logps/chosen": -406.01025390625, + "logps/rejected": -503.5823160807292, + "loss": 0.0071, + "rewards/chosen": 7.594118118286133, + "rewards/margins": 17.072376251220703, + "rewards/rejected": -9.47825813293457, + "step": 3016 + }, + { + "epoch": 0.8269151706180622, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37154885.333333336, + "logits/rejected": -29133109.333333332, + "logps/chosen": -400.1703287760417, + "logps/rejected": -558.37060546875, + "loss": 0.026, + "rewards/chosen": 8.294000625610352, + "rewards/margins": 24.016097386678062, + "rewards/rejected": -15.722096761067709, + "step": 3017 + }, + { + "epoch": 0.8271892558585721, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19131637.333333332, + "logits/rejected": -54270749.86666667, + "logps/chosen": -460.15516493055554, + "logps/rejected": -633.2216145833333, + "loss": 0.0185, + "rewards/chosen": 8.27043236626519, + "rewards/margins": 24.44322289360894, + "rewards/rejected": -16.17279052734375, + "step": 3018 + }, + { + "epoch": 0.8274633410990818, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6286634.0, + "logits/rejected": -12865810.0, + "logps/chosen": -367.2996520996094, + "logps/rejected": -427.8921203613281, + "loss": 0.0417, + "rewards/chosen": 6.118628978729248, + "rewards/margins": 15.544469356536865, + "rewards/rejected": -9.425840377807617, + "step": 3019 + }, + { + "epoch": 0.8277374263395916, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17719185.454545453, + "logits/rejected": -34162205.538461536, + "logps/chosen": -416.25661399147725, + "logps/rejected": -486.6545222355769, + "loss": 0.0163, + "rewards/chosen": 8.10207089510831, + "rewards/margins": 19.112615892103502, + "rewards/rejected": -11.010544996995192, + "step": 3020 + }, + { + "epoch": 0.8280115115801014, + "grad_norm": 5.8125, + "kl": 2.1949920654296875, + "learning_rate": 5e-06, + "logits/chosen": -7071946.133333334, + "logits/rejected": -21650741.333333332, + "logps/chosen": -403.3005859375, + "logps/rejected": -589.8742404513889, + "loss": 0.0436, + "rewards/chosen": 6.872549438476563, + "rewards/margins": 19.78074917263455, + "rewards/rejected": -12.908199734157986, + "step": 3021 + }, + { + "epoch": 0.8282855968206112, + "grad_norm": 3.796875, + "kl": 0.720550537109375, + "learning_rate": 5e-06, + "logits/chosen": -22439385.14285714, + "logits/rejected": -5794932.0, + "logps/chosen": -465.42201450892856, + "logps/rejected": -418.179833984375, + "loss": 0.0125, + "rewards/chosen": 8.110331399100167, + "rewards/margins": 18.446474892752512, + "rewards/rejected": -10.336143493652344, + "step": 3022 + }, + { + "epoch": 0.828559682061121, + "grad_norm": 0.56640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7941271.5, + "logits/rejected": -24546040.0, + "logps/chosen": -441.7156982421875, + "logps/rejected": -537.6048583984375, + "loss": 0.0019, + "rewards/chosen": 7.043392181396484, + "rewards/margins": 16.71914291381836, + "rewards/rejected": -9.675750732421875, + "step": 3023 + }, + { + "epoch": 0.8288337673016308, + "grad_norm": 5.09375, + "kl": 4.225397109985352, + "learning_rate": 5e-06, + "logits/chosen": -19710145.333333332, + "logits/rejected": -22405608.0, + "logps/chosen": -436.4909261067708, + "logps/rejected": -432.8754069010417, + "loss": 0.0622, + "rewards/chosen": 6.837445576985677, + "rewards/margins": 18.992212931315105, + "rewards/rejected": -12.154767354329428, + "step": 3024 + }, + { + "epoch": 0.8291078525421406, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15411497.846153846, + "logits/rejected": -28873733.818181816, + "logps/chosen": -352.4958683894231, + "logps/rejected": -622.0582830255681, + "loss": 0.0204, + "rewards/chosen": 6.417608407827524, + "rewards/margins": 22.74407436130764, + "rewards/rejected": -16.326465953480113, + "step": 3025 + }, + { + "epoch": 0.8293819377826505, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31078944.0, + "logits/rejected": -10767376.888888888, + "logps/chosen": -432.7640380859375, + "logps/rejected": -520.0346137152778, + "loss": 0.0308, + "rewards/chosen": 8.68283208211263, + "rewards/margins": 21.25079811943902, + "rewards/rejected": -12.56796603732639, + "step": 3026 + }, + { + "epoch": 0.8296560230231602, + "grad_norm": 0.75390625, + "kl": 0.516302764415741, + "learning_rate": 5e-06, + "logits/chosen": -47501682.28571428, + "logits/rejected": 6507841.6, + "logps/chosen": -497.288330078125, + "logps/rejected": -592.030029296875, + "loss": 0.0026, + "rewards/chosen": 7.8393723624093195, + "rewards/margins": 21.473411669049945, + "rewards/rejected": -13.634039306640625, + "step": 3027 + }, + { + "epoch": 0.82993010826367, + "grad_norm": 2.890625, + "kl": 2.3726038932800293, + "learning_rate": 5e-06, + "logits/chosen": -17046208.0, + "logits/rejected": -29430978.90909091, + "logps/chosen": -481.1198542668269, + "logps/rejected": -459.96803977272725, + "loss": 0.0095, + "rewards/chosen": 8.42447486290565, + "rewards/margins": 19.62772332038079, + "rewards/rejected": -11.203248457475143, + "step": 3028 + }, + { + "epoch": 0.8302041935041798, + "grad_norm": 3.09375, + "kl": 2.9452414512634277, + "learning_rate": 5e-06, + "logits/chosen": -1516517.6666666667, + "logits/rejected": -17427253.333333332, + "logps/chosen": -418.7130533854167, + "logps/rejected": -623.8433430989584, + "loss": 0.0129, + "rewards/chosen": 7.567324956258138, + "rewards/margins": 22.01209831237793, + "rewards/rejected": -14.444773356119791, + "step": 3029 + }, + { + "epoch": 0.8304782787446896, + "grad_norm": 6.0625, + "kl": 15.233776092529297, + "learning_rate": 5e-06, + "logits/chosen": -22980700.0, + "logits/rejected": -31984944.0, + "logps/chosen": -500.8255615234375, + "logps/rejected": -640.25830078125, + "loss": 0.0253, + "rewards/chosen": 7.803154468536377, + "rewards/margins": 23.059139728546143, + "rewards/rejected": -15.255985260009766, + "step": 3030 + }, + { + "epoch": 0.8307523639851994, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31734574.222222224, + "logits/rejected": -38560904.53333333, + "logps/chosen": -364.3068033854167, + "logps/rejected": -520.7668619791667, + "loss": 0.0293, + "rewards/chosen": 6.086385515001085, + "rewards/margins": 18.74494162665473, + "rewards/rejected": -12.658556111653645, + "step": 3031 + }, + { + "epoch": 0.8310264492257092, + "grad_norm": 4.75, + "kl": 0.5097700953483582, + "learning_rate": 5e-06, + "logits/chosen": 21592496.0, + "logits/rejected": -3650364.0, + "logps/chosen": -422.1197916666667, + "logps/rejected": -612.87646484375, + "loss": 0.034, + "rewards/chosen": 6.170874913533528, + "rewards/margins": 19.015696843465168, + "rewards/rejected": -12.84482192993164, + "step": 3032 + }, + { + "epoch": 0.831300534466219, + "grad_norm": 11.0, + "kl": 22.01634979248047, + "learning_rate": 5e-06, + "logits/chosen": -26520746.666666668, + "logits/rejected": -109384533.33333333, + "logps/chosen": -464.4757486979167, + "logps/rejected": -576.4449055989584, + "loss": 0.0449, + "rewards/chosen": 8.26057603624132, + "rewards/margins": 25.27828386094835, + "rewards/rejected": -17.01770782470703, + "step": 3033 + }, + { + "epoch": 0.8315746197067287, + "grad_norm": 5.90625, + "kl": 7.900546073913574, + "learning_rate": 5e-06, + "logits/chosen": -17142154.0, + "logits/rejected": -20218472.0, + "logps/chosen": -453.17047119140625, + "logps/rejected": -449.22503662109375, + "loss": 0.0229, + "rewards/chosen": 7.226002216339111, + "rewards/margins": 19.34696054458618, + "rewards/rejected": -12.12095832824707, + "step": 3034 + }, + { + "epoch": 0.8318487049472386, + "grad_norm": 3.140625, + "kl": 2.0312747955322266, + "learning_rate": 5e-06, + "logits/chosen": -19047562.181818184, + "logits/rejected": -5740502.153846154, + "logps/chosen": -571.6716086647727, + "logps/rejected": -551.9929387019231, + "loss": 0.0082, + "rewards/chosen": 9.561365300958807, + "rewards/margins": 22.819424289089817, + "rewards/rejected": -13.25805898813101, + "step": 3035 + }, + { + "epoch": 0.8321227901877484, + "grad_norm": 11.5, + "kl": 5.546632289886475, + "learning_rate": 5e-06, + "logits/chosen": -18124913.333333332, + "logits/rejected": 12932556.0, + "logps/chosen": -473.492919921875, + "logps/rejected": -596.8677978515625, + "loss": 0.0572, + "rewards/chosen": 7.352972666422526, + "rewards/margins": 19.879491170247395, + "rewards/rejected": -12.52651850382487, + "step": 3036 + }, + { + "epoch": 0.8323968754282582, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18365442.46153846, + "logits/rejected": -20020494.545454547, + "logps/chosen": -265.99173677884613, + "logps/rejected": -560.1779119318181, + "loss": 0.024, + "rewards/chosen": 5.462162898137019, + "rewards/margins": 22.86968770060506, + "rewards/rejected": -17.40752480246804, + "step": 3037 + }, + { + "epoch": 0.832670960668768, + "grad_norm": 5.875, + "kl": 1.8313071727752686, + "learning_rate": 5e-06, + "logits/chosen": -12845533.0, + "logits/rejected": -9584748.0, + "logps/chosen": -301.6040954589844, + "logps/rejected": -399.22509765625, + "loss": 0.0254, + "rewards/chosen": 8.449505805969238, + "rewards/margins": 17.41270923614502, + "rewards/rejected": -8.963203430175781, + "step": 3038 + }, + { + "epoch": 0.8329450459092778, + "grad_norm": 7.1875, + "kl": 6.436845302581787, + "learning_rate": 5e-06, + "logits/chosen": -16256980.8, + "logits/rejected": -29680500.57142857, + "logps/chosen": -439.496044921875, + "logps/rejected": -636.1413225446429, + "loss": 0.0512, + "rewards/chosen": 8.186920928955079, + "rewards/margins": 22.471704428536555, + "rewards/rejected": -14.284783499581474, + "step": 3039 + }, + { + "epoch": 0.8332191311497876, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24463184.0, + "logits/rejected": -22757821.333333332, + "logps/chosen": -244.298583984375, + "logps/rejected": -500.6197102864583, + "loss": 0.0335, + "rewards/chosen": 7.02651850382487, + "rewards/margins": 18.286375681559246, + "rewards/rejected": -11.259857177734375, + "step": 3040 + }, + { + "epoch": 0.8334932163902974, + "grad_norm": 6.59375, + "kl": 1.3487262725830078, + "learning_rate": 5e-06, + "logits/chosen": -2405308.3076923075, + "logits/rejected": -1242392.1818181819, + "logps/chosen": -376.45511568509613, + "logps/rejected": -584.9209872159091, + "loss": 0.0321, + "rewards/chosen": 5.943089998685396, + "rewards/margins": 18.79791787954477, + "rewards/rejected": -12.854827880859375, + "step": 3041 + }, + { + "epoch": 0.8337673016308071, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 42279325.09090909, + "logits/rejected": -22103008.0, + "logps/chosen": -497.8772638494318, + "logps/rejected": -527.8517878605769, + "loss": 0.0327, + "rewards/chosen": 5.84081407026811, + "rewards/margins": 19.94207608949888, + "rewards/rejected": -14.10126201923077, + "step": 3042 + }, + { + "epoch": 0.834041386871317, + "grad_norm": 3.6875, + "kl": 6.508357048034668, + "learning_rate": 5e-06, + "logits/chosen": -11366932.57142857, + "logits/rejected": -15752358.4, + "logps/chosen": -401.26157924107144, + "logps/rejected": -519.65234375, + "loss": 0.0407, + "rewards/chosen": 7.058075496128628, + "rewards/margins": 19.984448787144252, + "rewards/rejected": -12.926373291015626, + "step": 3043 + }, + { + "epoch": 0.8343154721118268, + "grad_norm": 14.0, + "kl": 20.66639518737793, + "learning_rate": 5e-06, + "logits/chosen": -33352711.529411763, + "logits/rejected": -16931529.14285714, + "logps/chosen": -399.59880514705884, + "logps/rejected": -435.5059291294643, + "loss": 0.1235, + "rewards/chosen": 6.529987110811121, + "rewards/margins": 15.805530323701745, + "rewards/rejected": -9.275543212890625, + "step": 3044 + }, + { + "epoch": 0.8345895573523365, + "grad_norm": 0.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16888092.8, + "logits/rejected": -26430404.57142857, + "logps/chosen": -443.18916015625, + "logps/rejected": -714.5517578125, + "loss": 0.0016, + "rewards/chosen": 7.515119934082032, + "rewards/margins": 25.505955941336495, + "rewards/rejected": -17.990836007254465, + "step": 3045 + }, + { + "epoch": 0.8348636425928464, + "grad_norm": 10.625, + "kl": 1.4111175537109375, + "learning_rate": 5e-06, + "logits/chosen": -21507371.42857143, + "logits/rejected": -35483654.4, + "logps/chosen": -390.0736607142857, + "logps/rejected": -466.40791015625, + "loss": 0.0437, + "rewards/chosen": 7.238246372767857, + "rewards/margins": 18.959382084437777, + "rewards/rejected": -11.721135711669922, + "step": 3046 + }, + { + "epoch": 0.8351377278333562, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30678581.333333332, + "logits/rejected": -23669696.0, + "logps/chosen": -494.0610758463542, + "logps/rejected": -434.3761393229167, + "loss": 0.0052, + "rewards/chosen": 8.71744155883789, + "rewards/margins": 17.849876403808594, + "rewards/rejected": -9.132434844970703, + "step": 3047 + }, + { + "epoch": 0.835411813073866, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6896668.8, + "logits/rejected": -16334980.57142857, + "logps/chosen": -520.831640625, + "logps/rejected": -471.01925223214283, + "loss": 0.0227, + "rewards/chosen": 7.882855224609375, + "rewards/margins": 19.141199602399553, + "rewards/rejected": -11.258344377790179, + "step": 3048 + }, + { + "epoch": 0.8356858983143758, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12919424.0, + "logits/rejected": 1396492.923076923, + "logps/chosen": -420.57492897727275, + "logps/rejected": -452.78763521634613, + "loss": 0.0077, + "rewards/chosen": 8.435423417524857, + "rewards/margins": 20.586762434952742, + "rewards/rejected": -12.151339017427885, + "step": 3049 + }, + { + "epoch": 0.8359599835548855, + "grad_norm": 6.84375, + "kl": 3.011685848236084, + "learning_rate": 5e-06, + "logits/chosen": -19498292.0, + "logits/rejected": -13226466.666666666, + "logps/chosen": -496.8500162760417, + "logps/rejected": -461.4683837890625, + "loss": 0.0228, + "rewards/chosen": 6.569044748942058, + "rewards/margins": 16.165627161661785, + "rewards/rejected": -9.596582412719727, + "step": 3050 + }, + { + "epoch": 0.8362340687953954, + "grad_norm": 8.5, + "kl": 18.57172966003418, + "learning_rate": 5e-06, + "logits/chosen": -13390211.0, + "logits/rejected": -27236180.0, + "logps/chosen": -392.1568603515625, + "logps/rejected": -676.2089233398438, + "loss": 0.1019, + "rewards/chosen": 6.96631383895874, + "rewards/margins": 19.327109813690186, + "rewards/rejected": -12.360795974731445, + "step": 3051 + }, + { + "epoch": 0.8365081540359052, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21327148.444444444, + "logits/rejected": -6490836.266666667, + "logps/chosen": -489.9283854166667, + "logps/rejected": -467.34609375, + "loss": 0.0339, + "rewards/chosen": 8.391920301649305, + "rewards/margins": 16.810299004448787, + "rewards/rejected": -8.41837870279948, + "step": 3052 + }, + { + "epoch": 0.8367822392764149, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18816593.6, + "logits/rejected": -19818409.14285714, + "logps/chosen": -531.538232421875, + "logps/rejected": -384.82742745535717, + "loss": 0.0604, + "rewards/chosen": 8.202149963378906, + "rewards/margins": 19.898033578055244, + "rewards/rejected": -11.695883614676339, + "step": 3053 + }, + { + "epoch": 0.8370563245169248, + "grad_norm": 11.6875, + "kl": 8.74416732788086, + "learning_rate": 5e-06, + "logits/chosen": -19952474.352941178, + "logits/rejected": -9695203.42857143, + "logps/chosen": -413.48779296875, + "logps/rejected": -406.3557826450893, + "loss": 0.0173, + "rewards/chosen": 6.6592725865981155, + "rewards/margins": 16.61778175931017, + "rewards/rejected": -9.958509172712054, + "step": 3054 + }, + { + "epoch": 0.8373304097574346, + "grad_norm": 3.703125, + "kl": 2.7974658012390137, + "learning_rate": 5e-06, + "logits/chosen": -7206860.266666667, + "logits/rejected": -7775173.333333333, + "logps/chosen": -429.19567057291664, + "logps/rejected": -840.0022786458334, + "loss": 0.0127, + "rewards/chosen": 7.719891357421875, + "rewards/margins": 24.743105061848958, + "rewards/rejected": -17.023213704427082, + "step": 3055 + }, + { + "epoch": 0.8376044949979443, + "grad_norm": 6.15625, + "kl": 1.8314399719238281, + "learning_rate": 5e-06, + "logits/chosen": -14122026.666666666, + "logits/rejected": -18526740.0, + "logps/chosen": -453.152099609375, + "logps/rejected": -472.3063557942708, + "loss": 0.0297, + "rewards/chosen": 6.6902116139729815, + "rewards/margins": 15.467811584472656, + "rewards/rejected": -8.777599970499674, + "step": 3056 + }, + { + "epoch": 0.8378785802384542, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4429016.0, + "logits/rejected": -32427625.14285714, + "logps/chosen": -435.32255859375, + "logps/rejected": -561.9446149553571, + "loss": 0.0031, + "rewards/chosen": 8.250381469726562, + "rewards/margins": 22.660261971609934, + "rewards/rejected": -14.40988050188337, + "step": 3057 + }, + { + "epoch": 0.8381526654789639, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29103638.4, + "logits/rejected": 6434453.714285715, + "logps/chosen": -243.453662109375, + "logps/rejected": -589.9854910714286, + "loss": 0.0147, + "rewards/chosen": 5.918161010742187, + "rewards/margins": 17.81623774937221, + "rewards/rejected": -11.898076738630023, + "step": 3058 + }, + { + "epoch": 0.8384267507194737, + "grad_norm": 11.9375, + "kl": 4.4419755935668945, + "learning_rate": 5e-06, + "logits/chosen": -26512069.818181816, + "logits/rejected": -33338990.769230768, + "logps/chosen": -348.7415216619318, + "logps/rejected": -654.7435396634615, + "loss": 0.0834, + "rewards/chosen": 6.118644714355469, + "rewards/margins": 17.93725057748648, + "rewards/rejected": -11.81860586313101, + "step": 3059 + }, + { + "epoch": 0.8387008359599836, + "grad_norm": 3.875, + "kl": 4.617099285125732, + "learning_rate": 5e-06, + "logits/chosen": 1795476.3333333333, + "logits/rejected": -29624162.666666668, + "logps/chosen": -505.0414225260417, + "logps/rejected": -549.8516438802084, + "loss": 0.0253, + "rewards/chosen": 7.70277214050293, + "rewards/margins": 18.01624870300293, + "rewards/rejected": -10.3134765625, + "step": 3060 + }, + { + "epoch": 0.8389749212004933, + "grad_norm": 0.71484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9191349.333333334, + "logits/rejected": -9474812.0, + "logps/chosen": -465.8145345052083, + "logps/rejected": -460.6668294270833, + "loss": 0.0019, + "rewards/chosen": 7.733978271484375, + "rewards/margins": 20.298812866210938, + "rewards/rejected": -12.564834594726562, + "step": 3061 + }, + { + "epoch": 0.8392490064410032, + "grad_norm": 7.28125, + "kl": 22.34404182434082, + "learning_rate": 5e-06, + "logits/chosen": -14909174.222222222, + "logits/rejected": 23827794.666666668, + "logps/chosen": -479.63878038194446, + "logps/rejected": -405.5511067708333, + "loss": 0.026, + "rewards/chosen": 9.158312479654947, + "rewards/margins": 17.65397771199544, + "rewards/rejected": -8.495665232340494, + "step": 3062 + }, + { + "epoch": 0.839523091681513, + "grad_norm": 0.9453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 366574.0, + "logits/rejected": -26590064.0, + "logps/chosen": -308.0875244140625, + "logps/rejected": -484.8311767578125, + "loss": 0.0031, + "rewards/chosen": 6.467896938323975, + "rewards/margins": 19.98842477798462, + "rewards/rejected": -13.520527839660645, + "step": 3063 + }, + { + "epoch": 0.8397971769220227, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13692866.0, + "logits/rejected": -28267856.0, + "logps/chosen": -246.69561767578125, + "logps/rejected": -529.53369140625, + "loss": 0.0422, + "rewards/chosen": 4.373522758483887, + "rewards/margins": 15.495678520202636, + "rewards/rejected": -11.12215576171875, + "step": 3064 + }, + { + "epoch": 0.8400712621625326, + "grad_norm": 1.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34292913.45454545, + "logits/rejected": -27002345.846153848, + "logps/chosen": -398.77596768465907, + "logps/rejected": -854.076171875, + "loss": 0.0057, + "rewards/chosen": 6.775479403409091, + "rewards/margins": 28.783381108637457, + "rewards/rejected": -22.007901705228367, + "step": 3065 + }, + { + "epoch": 0.8403453474030423, + "grad_norm": 3.484375, + "kl": 1.7490642070770264, + "learning_rate": 5e-06, + "logits/chosen": -9889444.57142857, + "logits/rejected": -39453494.4, + "logps/chosen": -426.81919642857144, + "logps/rejected": -494.97197265625, + "loss": 0.0106, + "rewards/chosen": 6.740482330322266, + "rewards/margins": 17.697501373291015, + "rewards/rejected": -10.95701904296875, + "step": 3066 + }, + { + "epoch": 0.8406194326435521, + "grad_norm": 8.1875, + "kl": 11.288844108581543, + "learning_rate": 5e-06, + "logits/chosen": -30668856.0, + "logits/rejected": -12298583.0, + "logps/chosen": -368.17974853515625, + "logps/rejected": -406.4898986816406, + "loss": 0.04, + "rewards/chosen": 7.550025939941406, + "rewards/margins": 15.645768165588379, + "rewards/rejected": -8.095742225646973, + "step": 3067 + }, + { + "epoch": 0.840893517884062, + "grad_norm": 5.5625, + "kl": 5.142234802246094, + "learning_rate": 5e-06, + "logits/chosen": -19833974.85714286, + "logits/rejected": -41369296.0, + "logps/chosen": -396.5244838169643, + "logps/rejected": -566.86943359375, + "loss": 0.0153, + "rewards/chosen": 7.580681392124721, + "rewards/margins": 18.8343015398298, + "rewards/rejected": -11.253620147705078, + "step": 3068 + }, + { + "epoch": 0.8411676031245717, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11441974.857142856, + "logits/rejected": -24155367.529411763, + "logps/chosen": -371.08279854910717, + "logps/rejected": -602.0854204963235, + "loss": 0.0221, + "rewards/chosen": 6.125100816999163, + "rewards/margins": 16.54176628689806, + "rewards/rejected": -10.416665469898897, + "step": 3069 + }, + { + "epoch": 0.8414416883650815, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14246555.636363637, + "logits/rejected": -828224.0, + "logps/chosen": -420.15505149147725, + "logps/rejected": -587.7573617788462, + "loss": 0.0216, + "rewards/chosen": 7.307892539284446, + "rewards/margins": 19.32488613528805, + "rewards/rejected": -12.016993596003605, + "step": 3070 + }, + { + "epoch": 0.8417157736055914, + "grad_norm": 13.625, + "kl": 2.390592098236084, + "learning_rate": 5e-06, + "logits/chosen": -12867076.363636363, + "logits/rejected": -16918872.615384616, + "logps/chosen": -386.31942471590907, + "logps/rejected": -512.8753756009615, + "loss": 0.0289, + "rewards/chosen": 7.828771417791193, + "rewards/margins": 17.764977941979893, + "rewards/rejected": -9.936206524188702, + "step": 3071 + }, + { + "epoch": 0.8419898588461011, + "grad_norm": 8.5625, + "kl": 3.0827701091766357, + "learning_rate": 5e-06, + "logits/chosen": -29878467.76470588, + "logits/rejected": -62485869.71428572, + "logps/chosen": -457.69528377757354, + "logps/rejected": -699.1671316964286, + "loss": 0.0224, + "rewards/chosen": 6.525408576516544, + "rewards/margins": 22.30379646365382, + "rewards/rejected": -15.778387887137276, + "step": 3072 + }, + { + "epoch": 0.842263944086611, + "grad_norm": 11.5, + "kl": 0.9078814387321472, + "learning_rate": 5e-06, + "logits/chosen": 28891313.454545453, + "logits/rejected": -23467564.307692308, + "logps/chosen": -579.8463689630681, + "logps/rejected": -542.5525841346154, + "loss": 0.0609, + "rewards/chosen": 7.431067033247515, + "rewards/margins": 18.34307077047708, + "rewards/rejected": -10.912003737229567, + "step": 3073 + }, + { + "epoch": 0.8425380293271207, + "grad_norm": 9.625, + "kl": 0.6521136164665222, + "learning_rate": 5e-06, + "logits/chosen": -21882285.09090909, + "logits/rejected": -26641368.615384616, + "logps/chosen": -419.8014026988636, + "logps/rejected": -489.61512169471155, + "loss": 0.0516, + "rewards/chosen": 6.044830322265625, + "rewards/margins": 13.249610314002403, + "rewards/rejected": -7.204779991736779, + "step": 3074 + }, + { + "epoch": 0.8428121145676305, + "grad_norm": 3.734375, + "kl": 4.637363433837891, + "learning_rate": 5e-06, + "logits/chosen": -15307318.857142856, + "logits/rejected": -21432958.4, + "logps/chosen": -388.19876534598217, + "logps/rejected": -507.987255859375, + "loss": 0.0082, + "rewards/chosen": 8.144717625209264, + "rewards/margins": 20.267010716029574, + "rewards/rejected": -12.122293090820312, + "step": 3075 + }, + { + "epoch": 0.8430861998081404, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10638329.333333334, + "logits/rejected": -25476293.333333332, + "logps/chosen": -363.1772867838542, + "logps/rejected": -733.0882975260416, + "loss": 0.0274, + "rewards/chosen": 6.638240814208984, + "rewards/margins": 21.056870778401695, + "rewards/rejected": -14.418629964192709, + "step": 3076 + }, + { + "epoch": 0.8433602850486501, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19574722.90909091, + "logits/rejected": -27584241.230769232, + "logps/chosen": -335.4595392400568, + "logps/rejected": -635.2331730769231, + "loss": 0.0073, + "rewards/chosen": 7.1116111061789775, + "rewards/margins": 20.460705203609866, + "rewards/rejected": -13.34909409743089, + "step": 3077 + }, + { + "epoch": 0.8436343702891599, + "grad_norm": 5.1875, + "kl": 1.217511534690857, + "learning_rate": 5e-06, + "logits/chosen": -39182759.384615384, + "logits/rejected": -23247138.90909091, + "logps/chosen": -418.0406024639423, + "logps/rejected": -407.8876953125, + "loss": 0.0261, + "rewards/chosen": 6.910495464618389, + "rewards/margins": 19.0824037565218, + "rewards/rejected": -12.171908291903408, + "step": 3078 + }, + { + "epoch": 0.8439084555296698, + "grad_norm": 10.25, + "kl": 0.9199041128158569, + "learning_rate": 5e-06, + "logits/chosen": 11473785.333333334, + "logits/rejected": -14288677.333333334, + "logps/chosen": -354.9694010416667, + "logps/rejected": -450.8922526041667, + "loss": 0.0381, + "rewards/chosen": 8.13098398844401, + "rewards/margins": 17.73423131306966, + "rewards/rejected": -9.60324732462565, + "step": 3079 + }, + { + "epoch": 0.8441825407701795, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30986025.14285714, + "logits/rejected": -19516153.6, + "logps/chosen": -438.04603794642856, + "logps/rejected": -589.60009765625, + "loss": 0.0204, + "rewards/chosen": 7.249068123953683, + "rewards/margins": 22.8684086390904, + "rewards/rejected": -15.619340515136718, + "step": 3080 + }, + { + "epoch": 0.8444566260106893, + "grad_norm": 7.875, + "kl": 2.502382278442383, + "learning_rate": 5e-06, + "logits/chosen": -30657810.285714287, + "logits/rejected": -3767464.8, + "logps/chosen": -434.96641322544644, + "logps/rejected": -495.822705078125, + "loss": 0.0195, + "rewards/chosen": 6.673041752406529, + "rewards/margins": 16.822977665492466, + "rewards/rejected": -10.149935913085937, + "step": 3081 + }, + { + "epoch": 0.8447307112511991, + "grad_norm": 0.1904296875, + "kl": 0.11779403686523438, + "learning_rate": 5e-06, + "logits/chosen": 6673351.428571428, + "logits/rejected": -22640809.411764707, + "logps/chosen": -528.6954868861607, + "logps/rejected": -580.1098920036765, + "loss": 0.0004, + "rewards/chosen": 9.494199480329241, + "rewards/margins": 23.340504878709297, + "rewards/rejected": -13.846305398380055, + "step": 3082 + }, + { + "epoch": 0.8450047964917089, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38363575.27272727, + "logits/rejected": -28535645.53846154, + "logps/chosen": -414.1638849431818, + "logps/rejected": -637.3389423076923, + "loss": 0.0315, + "rewards/chosen": 6.199623107910156, + "rewards/margins": 20.514318026029144, + "rewards/rejected": -14.31469491811899, + "step": 3083 + }, + { + "epoch": 0.8452788817322188, + "grad_norm": 12.3125, + "kl": 6.549201965332031, + "learning_rate": 5e-06, + "logits/chosen": -11166612.0, + "logits/rejected": -24568248.0, + "logps/chosen": -460.3223876953125, + "logps/rejected": -419.9205017089844, + "loss": 0.0561, + "rewards/chosen": 7.37081241607666, + "rewards/margins": 20.564130783081055, + "rewards/rejected": -13.193318367004395, + "step": 3084 + }, + { + "epoch": 0.8455529669727285, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7561457.454545454, + "logits/rejected": -26027694.769230768, + "logps/chosen": -397.4021661931818, + "logps/rejected": -554.9938401442307, + "loss": 0.0325, + "rewards/chosen": 6.769062389026988, + "rewards/margins": 18.391673801662204, + "rewards/rejected": -11.622611412635216, + "step": 3085 + }, + { + "epoch": 0.8458270522132383, + "grad_norm": 12.25, + "kl": 5.2133684158325195, + "learning_rate": 5e-06, + "logits/chosen": -6008286.666666667, + "logits/rejected": -23430442.666666668, + "logps/chosen": -413.4817301432292, + "logps/rejected": -551.6761067708334, + "loss": 0.0538, + "rewards/chosen": 6.985700607299805, + "rewards/margins": 21.098557154337563, + "rewards/rejected": -14.11285654703776, + "step": 3086 + }, + { + "epoch": 0.8461011374537482, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31946699.636363637, + "logits/rejected": -31609016.615384616, + "logps/chosen": -386.931640625, + "logps/rejected": -600.5295973557693, + "loss": 0.0031, + "rewards/chosen": 8.313458529385654, + "rewards/margins": 21.294178462528684, + "rewards/rejected": -12.980719933143028, + "step": 3087 + }, + { + "epoch": 0.8463752226942579, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10515778.666666666, + "logits/rejected": -32725318.4, + "logps/chosen": -534.7169053819445, + "logps/rejected": -499.6363932291667, + "loss": 0.0494, + "rewards/chosen": 6.4605907864040795, + "rewards/margins": 16.03490922715929, + "rewards/rejected": -9.574318440755208, + "step": 3088 + }, + { + "epoch": 0.8466493079347677, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35996930.90909091, + "logits/rejected": -21101435.076923076, + "logps/chosen": -371.55599698153407, + "logps/rejected": -696.1105769230769, + "loss": 0.0087, + "rewards/chosen": 6.59302451393821, + "rewards/margins": 22.514812656215856, + "rewards/rejected": -15.921788142277645, + "step": 3089 + }, + { + "epoch": 0.8469233931752775, + "grad_norm": 8.3125, + "kl": 4.387738227844238, + "learning_rate": 5e-06, + "logits/chosen": -4853394.0, + "logits/rejected": -1191819.25, + "logps/chosen": -389.4585266113281, + "logps/rejected": -627.125244140625, + "loss": 0.0329, + "rewards/chosen": 6.736742973327637, + "rewards/margins": 20.03480625152588, + "rewards/rejected": -13.298063278198242, + "step": 3090 + }, + { + "epoch": 0.8471974784157873, + "grad_norm": 0.99609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9862889.142857144, + "logits/rejected": -33099241.411764707, + "logps/chosen": -317.41312081473217, + "logps/rejected": -531.2087545955883, + "loss": 0.0028, + "rewards/chosen": 6.845756530761719, + "rewards/margins": 21.04601557114545, + "rewards/rejected": -14.200259040383731, + "step": 3091 + }, + { + "epoch": 0.8474715636562971, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32487371.636363637, + "logits/rejected": 4699779.692307692, + "logps/chosen": -394.5779918323864, + "logps/rejected": -506.43160306490387, + "loss": 0.0171, + "rewards/chosen": 7.384977860884233, + "rewards/margins": 19.953520341352984, + "rewards/rejected": -12.56854248046875, + "step": 3092 + }, + { + "epoch": 0.8477456488968069, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20050619.2, + "logits/rejected": -29577273.14285714, + "logps/chosen": -317.384375, + "logps/rejected": -640.1764787946429, + "loss": 0.052, + "rewards/chosen": 5.771800994873047, + "rewards/margins": 19.79521015712193, + "rewards/rejected": -14.023409162248884, + "step": 3093 + }, + { + "epoch": 0.8480197341373167, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23031889.454545453, + "logits/rejected": -12870902.153846154, + "logps/chosen": -403.13285688920456, + "logps/rejected": -669.9945913461538, + "loss": 0.0195, + "rewards/chosen": 7.429039001464844, + "rewards/margins": 23.723463205190804, + "rewards/rejected": -16.29442420372596, + "step": 3094 + }, + { + "epoch": 0.8482938193778266, + "grad_norm": 13.1875, + "kl": 0.3967437744140625, + "learning_rate": 5e-06, + "logits/chosen": -43288812.307692304, + "logits/rejected": -17007645.09090909, + "logps/chosen": -399.0477764423077, + "logps/rejected": -649.7324662642045, + "loss": 0.0449, + "rewards/chosen": 7.99859853891226, + "rewards/margins": 20.930115706437117, + "rewards/rejected": -12.931517167524857, + "step": 3095 + }, + { + "epoch": 0.8485679046183363, + "grad_norm": 2.703125, + "kl": 1.8036375045776367, + "learning_rate": 5e-06, + "logits/chosen": -15689918.76923077, + "logits/rejected": 98055.27272727272, + "logps/chosen": -365.0935246394231, + "logps/rejected": -513.6486150568181, + "loss": 0.0151, + "rewards/chosen": 7.8542327880859375, + "rewards/margins": 20.195589932528407, + "rewards/rejected": -12.34135714444247, + "step": 3096 + }, + { + "epoch": 0.8488419898588461, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10879709.090909092, + "logits/rejected": -13912866.461538462, + "logps/chosen": -426.9705699573864, + "logps/rejected": -659.5117938701923, + "loss": 0.0051, + "rewards/chosen": 8.907901417125355, + "rewards/margins": 21.671243467531006, + "rewards/rejected": -12.76334205040565, + "step": 3097 + }, + { + "epoch": 0.8491160750993559, + "grad_norm": 4.03125, + "kl": 11.636991500854492, + "learning_rate": 5e-06, + "logits/chosen": -14825554.0, + "logits/rejected": -24071158.0, + "logps/chosen": -443.99652099609375, + "logps/rejected": -468.216064453125, + "loss": 0.0163, + "rewards/chosen": 7.367401599884033, + "rewards/margins": 23.78453493118286, + "rewards/rejected": -16.417133331298828, + "step": 3098 + }, + { + "epoch": 0.8493901603398657, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20810731.2, + "logits/rejected": -20403241.14285714, + "logps/chosen": -378.8668701171875, + "logps/rejected": -654.5641043526786, + "loss": 0.0151, + "rewards/chosen": 5.586615371704101, + "rewards/margins": 20.715541240147182, + "rewards/rejected": -15.12892586844308, + "step": 3099 + }, + { + "epoch": 0.8496642455803755, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26648993.6, + "logits/rejected": -43092086.85714286, + "logps/chosen": -471.05146484375, + "logps/rejected": -594.1985909598214, + "loss": 0.0074, + "rewards/chosen": 7.9186454772949215, + "rewards/margins": 23.15072293962751, + "rewards/rejected": -15.232077462332589, + "step": 3100 + }, + { + "epoch": 0.8499383308208853, + "grad_norm": 0.76953125, + "kl": 2.6660983562469482, + "learning_rate": 5e-06, + "logits/chosen": -24183502.545454547, + "logits/rejected": -30941809.230769232, + "logps/chosen": -461.19948508522725, + "logps/rejected": -573.4504206730769, + "loss": 0.0021, + "rewards/chosen": 7.690655795010653, + "rewards/margins": 22.269757357510652, + "rewards/rejected": -14.5791015625, + "step": 3101 + }, + { + "epoch": 0.8502124160613951, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20809545.846153848, + "logits/rejected": -29488189.09090909, + "logps/chosen": -339.45755709134613, + "logps/rejected": -572.6257990056819, + "loss": 0.0503, + "rewards/chosen": 5.027592585637019, + "rewards/margins": 17.117981490555344, + "rewards/rejected": -12.090388904918324, + "step": 3102 + }, + { + "epoch": 0.8504865013019048, + "grad_norm": 2.484375, + "kl": 2.0258681774139404, + "learning_rate": 5e-06, + "logits/chosen": -50655227.07692308, + "logits/rejected": -14660709.818181818, + "logps/chosen": -443.65613731971155, + "logps/rejected": -512.1215376420455, + "loss": 0.0078, + "rewards/chosen": 9.191454373873198, + "rewards/margins": 19.952615484491098, + "rewards/rejected": -10.761161110617898, + "step": 3103 + }, + { + "epoch": 0.8507605865424147, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33246212.923076924, + "logits/rejected": -4936373.818181818, + "logps/chosen": -421.37635216346155, + "logps/rejected": -431.25532670454544, + "loss": 0.0106, + "rewards/chosen": 6.987131558931791, + "rewards/margins": 18.578139138388465, + "rewards/rejected": -11.591007579456676, + "step": 3104 + }, + { + "epoch": 0.8510346717829245, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27775315.555555556, + "logits/rejected": -33531061.333333332, + "logps/chosen": -352.8411458333333, + "logps/rejected": -497.7388671875, + "loss": 0.0105, + "rewards/chosen": 6.689057244194879, + "rewards/margins": 21.21391075981988, + "rewards/rejected": -14.524853515625, + "step": 3105 + }, + { + "epoch": 0.8513087570234343, + "grad_norm": 1.4609375, + "kl": 1.1898086071014404, + "learning_rate": 5e-06, + "logits/chosen": -32236674.666666668, + "logits/rejected": -42364634.666666664, + "logps/chosen": -481.753173828125, + "logps/rejected": -533.3761393229166, + "loss": 0.0039, + "rewards/chosen": 7.307053883870442, + "rewards/margins": 21.185949961344402, + "rewards/rejected": -13.878896077473959, + "step": 3106 + }, + { + "epoch": 0.8515828422639441, + "grad_norm": 6.34375, + "kl": 3.7332167625427246, + "learning_rate": 5e-06, + "logits/chosen": -52207282.28571428, + "logits/rejected": -39070556.8, + "logps/chosen": -455.34165736607144, + "logps/rejected": -525.227099609375, + "loss": 0.0175, + "rewards/chosen": 7.6496462140764505, + "rewards/margins": 20.85549795968192, + "rewards/rejected": -13.205851745605468, + "step": 3107 + }, + { + "epoch": 0.8518569275044539, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10813788.307692308, + "logits/rejected": -29397329.454545453, + "logps/chosen": -424.50390625, + "logps/rejected": -649.2403231534091, + "loss": 0.0116, + "rewards/chosen": 7.314260629507212, + "rewards/margins": 21.91667484736943, + "rewards/rejected": -14.602414217862217, + "step": 3108 + }, + { + "epoch": 0.8521310127449637, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22024404.57142857, + "logits/rejected": 10977829.6, + "logps/chosen": -428.00411551339283, + "logps/rejected": -651.43603515625, + "loss": 0.0023, + "rewards/chosen": 7.9311948503766745, + "rewards/margins": 26.417135402134488, + "rewards/rejected": -18.485940551757814, + "step": 3109 + }, + { + "epoch": 0.8524050979854735, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10810633.333333334, + "logits/rejected": -21717540.0, + "logps/chosen": -420.2162679036458, + "logps/rejected": -614.1562093098959, + "loss": 0.0212, + "rewards/chosen": 6.838919321695964, + "rewards/margins": 19.995733896891277, + "rewards/rejected": -13.156814575195312, + "step": 3110 + }, + { + "epoch": 0.8526791832259832, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27217936.0, + "logits/rejected": -29254381.333333332, + "logps/chosen": -428.1899820963542, + "logps/rejected": -590.825439453125, + "loss": 0.0134, + "rewards/chosen": 7.5669294993082685, + "rewards/margins": 21.324534734090168, + "rewards/rejected": -13.7576052347819, + "step": 3111 + }, + { + "epoch": 0.8529532684664931, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20409262.4, + "logits/rejected": -25153888.0, + "logps/chosen": -483.16396484375, + "logps/rejected": -507.36635044642856, + "loss": 0.0126, + "rewards/chosen": 6.837347412109375, + "rewards/margins": 18.857914079938617, + "rewards/rejected": -12.020566667829241, + "step": 3112 + }, + { + "epoch": 0.8532273537070029, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19209690.666666668, + "logits/rejected": -26658605.333333332, + "logps/chosen": -392.3487548828125, + "logps/rejected": -489.9335123697917, + "loss": 0.0077, + "rewards/chosen": 7.715457916259766, + "rewards/margins": 19.263033548990883, + "rewards/rejected": -11.54757563273112, + "step": 3113 + }, + { + "epoch": 0.8535014389475126, + "grad_norm": 10.9375, + "kl": 11.654260635375977, + "learning_rate": 5e-06, + "logits/chosen": -1582786.3529411764, + "logits/rejected": -34971282.28571428, + "logps/chosen": -356.0228630514706, + "logps/rejected": -766.1545758928571, + "loss": 0.0609, + "rewards/chosen": 6.07913522159352, + "rewards/margins": 25.94732544201763, + "rewards/rejected": -19.868190220424108, + "step": 3114 + }, + { + "epoch": 0.8537755241880225, + "grad_norm": 1.734375, + "kl": 4.872922420501709, + "learning_rate": 5e-06, + "logits/chosen": -28412172.8, + "logits/rejected": -47193112.88888889, + "logps/chosen": -415.07490234375, + "logps/rejected": -636.1629774305555, + "loss": 0.0044, + "rewards/chosen": 8.083002726236979, + "rewards/margins": 24.17129347059462, + "rewards/rejected": -16.08829074435764, + "step": 3115 + }, + { + "epoch": 0.8540496094285323, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22321316.57142857, + "logits/rejected": 60744953.6, + "logps/chosen": -531.13720703125, + "logps/rejected": -517.55654296875, + "loss": 0.0205, + "rewards/chosen": 7.566799708775112, + "rewards/margins": 23.553506251743862, + "rewards/rejected": -15.98670654296875, + "step": 3116 + }, + { + "epoch": 0.8543236946690421, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29575621.333333332, + "logits/rejected": -10930774.666666666, + "logps/chosen": -403.9887288411458, + "logps/rejected": -495.2403157552083, + "loss": 0.0278, + "rewards/chosen": 8.31807772318522, + "rewards/margins": 19.368268966674805, + "rewards/rejected": -11.050191243489584, + "step": 3117 + }, + { + "epoch": 0.8545977799095519, + "grad_norm": 4.8125, + "kl": 2.9580702781677246, + "learning_rate": 5e-06, + "logits/chosen": -25514569.14285714, + "logits/rejected": -29190674.82352941, + "logps/chosen": -288.1796875, + "logps/rejected": -533.0337775735294, + "loss": 0.0333, + "rewards/chosen": 5.717225211007254, + "rewards/margins": 17.148133414132253, + "rewards/rejected": -11.430908203125, + "step": 3118 + }, + { + "epoch": 0.8548718651500616, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20059153.230769232, + "logits/rejected": -26059514.181818184, + "logps/chosen": -442.05419921875, + "logps/rejected": -617.9793590198864, + "loss": 0.0263, + "rewards/chosen": 7.17840810922476, + "rewards/margins": 19.882402860201324, + "rewards/rejected": -12.703994750976562, + "step": 3119 + }, + { + "epoch": 0.8551459503905715, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20584816.0, + "logits/rejected": -37428244.571428575, + "logps/chosen": -313.864892578125, + "logps/rejected": -590.0470145089286, + "loss": 0.0474, + "rewards/chosen": 5.549734497070313, + "rewards/margins": 17.89878147670201, + "rewards/rejected": -12.349046979631696, + "step": 3120 + }, + { + "epoch": 0.8554200356310813, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33279595.636363637, + "logits/rejected": -23139037.53846154, + "logps/chosen": -417.96981534090907, + "logps/rejected": -510.640625, + "loss": 0.037, + "rewards/chosen": 7.879395224831321, + "rewards/margins": 18.773553221375792, + "rewards/rejected": -10.894157996544472, + "step": 3121 + }, + { + "epoch": 0.855694120871591, + "grad_norm": 4.375, + "kl": 1.5423177480697632, + "learning_rate": 5e-06, + "logits/chosen": -13259709.090909092, + "logits/rejected": -11503670.153846154, + "logps/chosen": -337.0555974786932, + "logps/rejected": -472.82861328125, + "loss": 0.0211, + "rewards/chosen": 8.492847095836293, + "rewards/margins": 20.74386473969146, + "rewards/rejected": -12.251017643855167, + "step": 3122 + }, + { + "epoch": 0.8559682061121009, + "grad_norm": 0.283203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18793049.333333332, + "logits/rejected": -23410131.555555556, + "logps/chosen": -410.7428792317708, + "logps/rejected": -563.9857855902778, + "loss": 0.001, + "rewards/chosen": 7.731421788533528, + "rewards/margins": 21.493186314900715, + "rewards/rejected": -13.761764526367188, + "step": 3123 + }, + { + "epoch": 0.8562422913526107, + "grad_norm": 11.3125, + "kl": 4.457822322845459, + "learning_rate": 5e-06, + "logits/chosen": -6833095.0, + "logits/rejected": -16939434.0, + "logps/chosen": -447.315185546875, + "logps/rejected": -516.1588134765625, + "loss": 0.0336, + "rewards/chosen": 6.6086530685424805, + "rewards/margins": 18.456658363342285, + "rewards/rejected": -11.848005294799805, + "step": 3124 + }, + { + "epoch": 0.8565163765931204, + "grad_norm": 1.1796875, + "kl": 1.7146899700164795, + "learning_rate": 5e-06, + "logits/chosen": -3254795.7333333334, + "logits/rejected": -22223992.888888888, + "logps/chosen": -519.4892578125, + "logps/rejected": -684.8978949652778, + "loss": 0.0046, + "rewards/chosen": 8.007538859049479, + "rewards/margins": 20.07860327826606, + "rewards/rejected": -12.07106441921658, + "step": 3125 + }, + { + "epoch": 0.8567904618336303, + "grad_norm": 1.5390625, + "kl": 2.8508338928222656, + "learning_rate": 5e-06, + "logits/chosen": -16884103.384615384, + "logits/rejected": 34106082.90909091, + "logps/chosen": -438.88423978365387, + "logps/rejected": -727.5217507102273, + "loss": 0.0031, + "rewards/chosen": 7.608528724083533, + "rewards/margins": 25.41145175320285, + "rewards/rejected": -17.802923029119317, + "step": 3126 + }, + { + "epoch": 0.85706454707414, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1946219.7142857143, + "logits/rejected": -19757805.17647059, + "logps/chosen": -471.7340611049107, + "logps/rejected": -707.8141659007352, + "loss": 0.0041, + "rewards/chosen": 8.758477347237724, + "rewards/margins": 24.817720942136624, + "rewards/rejected": -16.0592435948989, + "step": 3127 + }, + { + "epoch": 0.8573386323146499, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26570193.066666666, + "logits/rejected": 57251669.333333336, + "logps/chosen": -320.12916666666666, + "logps/rejected": -653.3949110243055, + "loss": 0.0359, + "rewards/chosen": 4.796486409505208, + "rewards/margins": 21.293765258789065, + "rewards/rejected": -16.497278849283855, + "step": 3128 + }, + { + "epoch": 0.8576127175551597, + "grad_norm": 5.6875, + "kl": 2.9260001182556152, + "learning_rate": 5e-06, + "logits/chosen": 4089326.153846154, + "logits/rejected": -58176384.0, + "logps/chosen": -399.4011042668269, + "logps/rejected": -687.2548828125, + "loss": 0.024, + "rewards/chosen": 8.142434927133413, + "rewards/margins": 25.684923718859267, + "rewards/rejected": -17.54248879172585, + "step": 3129 + }, + { + "epoch": 0.8578868027956694, + "grad_norm": 8.375, + "kl": 9.691754341125488, + "learning_rate": 5e-06, + "logits/chosen": -6998535.05882353, + "logits/rejected": -32603332.57142857, + "logps/chosen": -395.7469267003676, + "logps/rejected": -570.6534598214286, + "loss": 0.0577, + "rewards/chosen": 7.351587632123162, + "rewards/margins": 21.267582548766576, + "rewards/rejected": -13.915994916643415, + "step": 3130 + }, + { + "epoch": 0.8581608880361793, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17344876.8, + "logits/rejected": -27046236.444444444, + "logps/chosen": -395.25481770833335, + "logps/rejected": -496.8429361979167, + "loss": 0.0211, + "rewards/chosen": 7.439136759440104, + "rewards/margins": 17.800093587239584, + "rewards/rejected": -10.360956827799479, + "step": 3131 + }, + { + "epoch": 0.858434973276689, + "grad_norm": 11.8125, + "kl": 5.991198539733887, + "learning_rate": 5e-06, + "logits/chosen": -5358748.266666667, + "logits/rejected": -18914471.111111112, + "logps/chosen": -427.2026692708333, + "logps/rejected": -479.37217881944446, + "loss": 0.0607, + "rewards/chosen": 7.498921712239583, + "rewards/margins": 21.06257544623481, + "rewards/rejected": -13.563653733995226, + "step": 3132 + }, + { + "epoch": 0.8587090585171988, + "grad_norm": 2.59375, + "kl": 4.364678859710693, + "learning_rate": 5e-06, + "logits/chosen": -40329600.0, + "logits/rejected": -21991856.0, + "logps/chosen": -365.4130533854167, + "logps/rejected": -441.19737413194446, + "loss": 0.0108, + "rewards/chosen": 7.031657409667969, + "rewards/margins": 16.92845204671224, + "rewards/rejected": -9.896794637044271, + "step": 3133 + }, + { + "epoch": 0.8589831437577087, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20843342.545454547, + "logits/rejected": -23208066.46153846, + "logps/chosen": -380.87198153409093, + "logps/rejected": -650.9710787259615, + "loss": 0.0274, + "rewards/chosen": 5.7303855202414775, + "rewards/margins": 23.047289441515517, + "rewards/rejected": -17.31690392127404, + "step": 3134 + }, + { + "epoch": 0.8592572289982184, + "grad_norm": 4.5, + "kl": 0.40050697326660156, + "learning_rate": 5e-06, + "logits/chosen": -11710947.636363637, + "logits/rejected": -26385179.076923076, + "logps/chosen": -401.8724254261364, + "logps/rejected": -613.0374098557693, + "loss": 0.0178, + "rewards/chosen": 7.047265486283735, + "rewards/margins": 19.373419754988664, + "rewards/rejected": -12.326154268704927, + "step": 3135 + }, + { + "epoch": 0.8595313142387282, + "grad_norm": 6.5625, + "kl": 4.676017761230469, + "learning_rate": 5e-06, + "logits/chosen": -53479909.333333336, + "logits/rejected": -32006666.666666668, + "logps/chosen": -368.783203125, + "logps/rejected": -477.302001953125, + "loss": 0.0476, + "rewards/chosen": 5.708573659261067, + "rewards/margins": 15.540316263834637, + "rewards/rejected": -9.831742604573568, + "step": 3136 + }, + { + "epoch": 0.8598053994792381, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3285187.0, + "logits/rejected": -4861928.0, + "logps/chosen": -310.980224609375, + "logps/rejected": -534.5877685546875, + "loss": 0.0195, + "rewards/chosen": 4.834011554718018, + "rewards/margins": 19.681233882904053, + "rewards/rejected": -14.847222328186035, + "step": 3137 + }, + { + "epoch": 0.8600794847197478, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34349520.0, + "logits/rejected": -9773989.714285715, + "logps/chosen": -432.69306640625, + "logps/rejected": -590.80224609375, + "loss": 0.033, + "rewards/chosen": 7.052513122558594, + "rewards/margins": 19.983027866908483, + "rewards/rejected": -12.930514744349889, + "step": 3138 + }, + { + "epoch": 0.8603535699602577, + "grad_norm": 12.25, + "kl": 3.0050430297851562, + "learning_rate": 5e-06, + "logits/chosen": -3328078.8, + "logits/rejected": -21417389.714285713, + "logps/chosen": -382.9885986328125, + "logps/rejected": -567.7673688616071, + "loss": 0.0672, + "rewards/chosen": 5.710154342651367, + "rewards/margins": 17.920481491088868, + "rewards/rejected": -12.2103271484375, + "step": 3139 + }, + { + "epoch": 0.8606276552007674, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11128416.8, + "logits/rejected": -18663104.0, + "logps/chosen": -393.219921875, + "logps/rejected": -507.1406947544643, + "loss": 0.0125, + "rewards/chosen": 6.861688232421875, + "rewards/margins": 17.339510672433036, + "rewards/rejected": -10.477822440011161, + "step": 3140 + }, + { + "epoch": 0.8609017404412772, + "grad_norm": 2.09375, + "kl": 1.8469715118408203, + "learning_rate": 5e-06, + "logits/chosen": -21285130.666666668, + "logits/rejected": -27386090.666666668, + "logps/chosen": -413.5290120442708, + "logps/rejected": -550.4463297526041, + "loss": 0.0049, + "rewards/chosen": 8.179845174153646, + "rewards/margins": 18.75683911641439, + "rewards/rejected": -10.576993942260742, + "step": 3141 + }, + { + "epoch": 0.8611758256817871, + "grad_norm": 264.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9353497.454545455, + "logits/rejected": 40545563.07692308, + "logps/chosen": -455.34641335227275, + "logps/rejected": -523.5731670673077, + "loss": 0.069, + "rewards/chosen": 5.82231972434304, + "rewards/margins": 15.326999477573207, + "rewards/rejected": -9.504679753230167, + "step": 3142 + }, + { + "epoch": 0.8614499109222968, + "grad_norm": 4.90625, + "kl": 0.9593290090560913, + "learning_rate": 5e-06, + "logits/chosen": -29047720.0, + "logits/rejected": -35620933.333333336, + "logps/chosen": -371.440673828125, + "logps/rejected": -696.8387044270834, + "loss": 0.0378, + "rewards/chosen": 6.4290110270182295, + "rewards/margins": 22.137985229492188, + "rewards/rejected": -15.708974202473959, + "step": 3143 + }, + { + "epoch": 0.8617239961628066, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22543485.09090909, + "logits/rejected": -15338166.153846154, + "logps/chosen": -404.56471946022725, + "logps/rejected": -385.48715444711536, + "loss": 0.0697, + "rewards/chosen": 7.255329478870738, + "rewards/margins": 17.02615009654652, + "rewards/rejected": -9.770820617675781, + "step": 3144 + }, + { + "epoch": 0.8619980814033165, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29479480.0, + "logits/rejected": -48133024.0, + "logps/chosen": -336.5453694661458, + "logps/rejected": -482.4503987630208, + "loss": 0.0364, + "rewards/chosen": 7.0514882405598955, + "rewards/margins": 19.139918009440105, + "rewards/rejected": -12.088429768880209, + "step": 3145 + }, + { + "epoch": 0.8622721666438262, + "grad_norm": 13.8125, + "kl": 13.376441955566406, + "learning_rate": 5e-06, + "logits/chosen": -24181102.222222224, + "logits/rejected": -29495477.333333332, + "logps/chosen": -410.2206759982639, + "logps/rejected": -654.2296549479166, + "loss": 0.0888, + "rewards/chosen": 7.27203369140625, + "rewards/margins": 21.371376037597656, + "rewards/rejected": -14.099342346191406, + "step": 3146 + }, + { + "epoch": 0.862546251884336, + "grad_norm": 1.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27039099.076923076, + "logits/rejected": -27239700.363636363, + "logps/chosen": -354.43502103365387, + "logps/rejected": -500.36221590909093, + "loss": 0.0073, + "rewards/chosen": 8.188859205979567, + "rewards/margins": 20.60155396361451, + "rewards/rejected": -12.412694757634943, + "step": 3147 + }, + { + "epoch": 0.8628203371248458, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11601456.0, + "logits/rejected": -45011276.8, + "logps/chosen": -423.14620535714283, + "logps/rejected": -445.6072265625, + "loss": 0.0037, + "rewards/chosen": 7.1498516627720425, + "rewards/margins": 21.467057473318917, + "rewards/rejected": -14.317205810546875, + "step": 3148 + }, + { + "epoch": 0.8630944223653556, + "grad_norm": 4.03125, + "kl": 8.011747360229492, + "learning_rate": 5e-06, + "logits/chosen": -18842678.666666668, + "logits/rejected": -22327632.0, + "logps/chosen": -486.8097737630208, + "logps/rejected": -612.5039876302084, + "loss": 0.0124, + "rewards/chosen": 7.342199325561523, + "rewards/margins": 20.260427474975586, + "rewards/rejected": -12.918228149414062, + "step": 3149 + }, + { + "epoch": 0.8633685076058655, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2526579.1666666665, + "logits/rejected": -31273565.333333332, + "logps/chosen": -294.6822916666667, + "logps/rejected": -377.3094889322917, + "loss": 0.0311, + "rewards/chosen": 6.320415496826172, + "rewards/margins": 17.749192555745445, + "rewards/rejected": -11.428777058919271, + "step": 3150 + }, + { + "epoch": 0.8636425928463752, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 13724873.6, + "logits/rejected": -26219917.714285713, + "logps/chosen": -534.261181640625, + "logps/rejected": -588.6323939732143, + "loss": 0.0257, + "rewards/chosen": 7.842802429199219, + "rewards/margins": 20.896086120605467, + "rewards/rejected": -13.05328369140625, + "step": 3151 + }, + { + "epoch": 0.863916678086885, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5978164.0, + "logits/rejected": 1035520.8571428572, + "logps/chosen": -451.163232421875, + "logps/rejected": -642.4524274553571, + "loss": 0.0269, + "rewards/chosen": 6.187412261962891, + "rewards/margins": 19.571300506591797, + "rewards/rejected": -13.383888244628906, + "step": 3152 + }, + { + "epoch": 0.8641907633273949, + "grad_norm": 1.234375, + "kl": 2.1598548889160156, + "learning_rate": 5e-06, + "logits/chosen": -19121344.0, + "logits/rejected": -14950080.0, + "logps/chosen": -528.1487630208334, + "logps/rejected": -569.073828125, + "loss": 0.0014, + "rewards/chosen": 9.10966067843967, + "rewards/margins": 20.011901685926652, + "rewards/rejected": -10.90224100748698, + "step": 3153 + }, + { + "epoch": 0.8644648485679046, + "grad_norm": 0.98046875, + "kl": 2.2044882774353027, + "learning_rate": 5e-06, + "logits/chosen": -2017847.6363636365, + "logits/rejected": -9556891.692307692, + "logps/chosen": -588.2023703835227, + "logps/rejected": -497.62338491586536, + "loss": 0.0021, + "rewards/chosen": 9.562713623046875, + "rewards/margins": 20.792724609375, + "rewards/rejected": -11.230010986328125, + "step": 3154 + }, + { + "epoch": 0.8647389338084144, + "grad_norm": 8.0, + "kl": 8.13802433013916, + "learning_rate": 5e-06, + "logits/chosen": -31911470.933333334, + "logits/rejected": -12321317.333333334, + "logps/chosen": -455.1175130208333, + "logps/rejected": -451.46356879340277, + "loss": 0.0882, + "rewards/chosen": 7.306487019856771, + "rewards/margins": 17.146664767795137, + "rewards/rejected": -9.840177747938368, + "step": 3155 + }, + { + "epoch": 0.8650130190489242, + "grad_norm": 4.15625, + "kl": 5.618684768676758, + "learning_rate": 5e-06, + "logits/chosen": -28233737.14285714, + "logits/rejected": -35252032.0, + "logps/chosen": -479.644287109375, + "logps/rejected": -717.5913947610294, + "loss": 0.0046, + "rewards/chosen": 6.794695172991071, + "rewards/margins": 22.61132761209953, + "rewards/rejected": -15.816632439108457, + "step": 3156 + }, + { + "epoch": 0.865287104289434, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32626595.555555556, + "logits/rejected": -31450329.6, + "logps/chosen": -502.06260850694446, + "logps/rejected": -565.3595052083333, + "loss": 0.0156, + "rewards/chosen": 6.787715488009983, + "rewards/margins": 21.041843499077693, + "rewards/rejected": -14.254128011067708, + "step": 3157 + }, + { + "epoch": 0.8655611895299438, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2610914.6666666665, + "logits/rejected": -17239912.0, + "logps/chosen": -490.1195475260417, + "logps/rejected": -411.420166015625, + "loss": 0.0066, + "rewards/chosen": 7.619864781697591, + "rewards/margins": 17.44199816385905, + "rewards/rejected": -9.822133382161459, + "step": 3158 + }, + { + "epoch": 0.8658352747704536, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12350104.0, + "logits/rejected": -21414886.4, + "logps/chosen": -374.94398716517856, + "logps/rejected": -489.4158203125, + "loss": 0.0075, + "rewards/chosen": 6.6721698216029575, + "rewards/margins": 17.02626528058733, + "rewards/rejected": -10.354095458984375, + "step": 3159 + }, + { + "epoch": 0.8661093600109634, + "grad_norm": 0.365234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24800971.636363637, + "logits/rejected": 50321196.307692304, + "logps/chosen": -525.8690962357955, + "logps/rejected": -552.6642503004807, + "loss": 0.0009, + "rewards/chosen": 8.719866665926846, + "rewards/margins": 27.479118187110743, + "rewards/rejected": -18.759251521183895, + "step": 3160 + }, + { + "epoch": 0.8663834452514733, + "grad_norm": 6.59375, + "kl": 2.8736283779144287, + "learning_rate": 5e-06, + "logits/chosen": -20234342.85714286, + "logits/rejected": -42306089.6, + "logps/chosen": -381.1083984375, + "logps/rejected": -459.518359375, + "loss": 0.0447, + "rewards/chosen": 6.819292340959821, + "rewards/margins": 16.97599857875279, + "rewards/rejected": -10.156706237792969, + "step": 3161 + }, + { + "epoch": 0.866657530491983, + "grad_norm": 0.12890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30308956.444444444, + "logits/rejected": -26554692.266666666, + "logps/chosen": -478.20209418402777, + "logps/rejected": -646.739453125, + "loss": 0.0004, + "rewards/chosen": 8.489171346028646, + "rewards/margins": 25.27686258951823, + "rewards/rejected": -16.787691243489583, + "step": 3162 + }, + { + "epoch": 0.8669316157324928, + "grad_norm": 1.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2543782.0, + "logits/rejected": -24856786.666666668, + "logps/chosen": -398.6488850911458, + "logps/rejected": -547.6736246744791, + "loss": 0.0032, + "rewards/chosen": 8.70384152730306, + "rewards/margins": 22.223095575968426, + "rewards/rejected": -13.519254048665365, + "step": 3163 + }, + { + "epoch": 0.8672057009730026, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40067842.666666664, + "logits/rejected": -12917016.0, + "logps/chosen": -463.72119140625, + "logps/rejected": -512.9991048177084, + "loss": 0.0367, + "rewards/chosen": 7.2934926350911455, + "rewards/margins": 18.32274881998698, + "rewards/rejected": -11.029256184895834, + "step": 3164 + }, + { + "epoch": 0.8674797862135124, + "grad_norm": 0.1884765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2742445.4545454546, + "logits/rejected": -21949577.846153848, + "logps/chosen": -512.3514737215909, + "logps/rejected": -537.9208233173077, + "loss": 0.0005, + "rewards/chosen": 9.0091552734375, + "rewards/margins": 22.31741450383113, + "rewards/rejected": -13.30825923039363, + "step": 3165 + }, + { + "epoch": 0.8677538714540222, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40158514.666666664, + "logits/rejected": -22200618.666666668, + "logps/chosen": -466.6464029947917, + "logps/rejected": -568.6923014322916, + "loss": 0.0423, + "rewards/chosen": 6.0076249440511065, + "rewards/margins": 18.656140009562176, + "rewards/rejected": -12.648515065511068, + "step": 3166 + }, + { + "epoch": 0.868027956694532, + "grad_norm": 1.609375, + "kl": 3.186666488647461, + "learning_rate": 5e-06, + "logits/chosen": -12971429.818181818, + "logits/rejected": -30648691.692307692, + "logps/chosen": -467.3536931818182, + "logps/rejected": -561.1949368990385, + "loss": 0.0234, + "rewards/chosen": 7.492822126908735, + "rewards/margins": 21.850535092653928, + "rewards/rejected": -14.357712965745192, + "step": 3167 + }, + { + "epoch": 0.8683020419350418, + "grad_norm": 2.5625, + "kl": 13.889026641845703, + "learning_rate": 5e-06, + "logits/chosen": -25562002.285714287, + "logits/rejected": -34079545.6, + "logps/chosen": -423.90523856026783, + "logps/rejected": -458.02958984375, + "loss": 0.0496, + "rewards/chosen": 8.474485124860491, + "rewards/margins": 17.180428423200333, + "rewards/rejected": -8.705943298339843, + "step": 3168 + }, + { + "epoch": 0.8685761271755515, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41866474.666666664, + "logits/rejected": -21591404.8, + "logps/chosen": -429.4970703125, + "logps/rejected": -664.66328125, + "loss": 0.0028, + "rewards/chosen": 6.967369503445095, + "rewards/margins": 21.711725785997178, + "rewards/rejected": -14.744356282552083, + "step": 3169 + }, + { + "epoch": 0.8688502124160614, + "grad_norm": 16.0, + "kl": 4.597938537597656, + "learning_rate": 5e-06, + "logits/chosen": -19482749.714285713, + "logits/rejected": -2251122.4, + "logps/chosen": -375.26492745535717, + "logps/rejected": -405.34345703125, + "loss": 0.052, + "rewards/chosen": 7.396936144147601, + "rewards/margins": 16.04223872593471, + "rewards/rejected": -8.64530258178711, + "step": 3170 + }, + { + "epoch": 0.8691242976565712, + "grad_norm": 4.46875, + "kl": 2.957514762878418, + "learning_rate": 5e-06, + "logits/chosen": -10901085.090909092, + "logits/rejected": -18062921.846153848, + "logps/chosen": -547.2792080965909, + "logps/rejected": -545.4225135216346, + "loss": 0.0113, + "rewards/chosen": 8.1883413141424, + "rewards/margins": 22.424939402333507, + "rewards/rejected": -14.236598088191105, + "step": 3171 + }, + { + "epoch": 0.869398382897081, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11413944.727272727, + "logits/rejected": -36984196.92307692, + "logps/chosen": -365.8895818536932, + "logps/rejected": -691.6820162259615, + "loss": 0.0363, + "rewards/chosen": 5.012128656560725, + "rewards/margins": 21.562402845262646, + "rewards/rejected": -16.550274188701923, + "step": 3172 + }, + { + "epoch": 0.8696724681375908, + "grad_norm": 7.375, + "kl": 1.524082899093628, + "learning_rate": 5e-06, + "logits/chosen": -21939219.692307692, + "logits/rejected": 74324340.36363636, + "logps/chosen": -453.94643930288464, + "logps/rejected": -539.7320223721591, + "loss": 0.0118, + "rewards/chosen": 6.656046940730168, + "rewards/margins": 22.00577422455474, + "rewards/rejected": -15.349727283824574, + "step": 3173 + }, + { + "epoch": 0.8699465533781006, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28307996.444444444, + "logits/rejected": -15961083.733333332, + "logps/chosen": -320.28314887152777, + "logps/rejected": -350.16432291666666, + "loss": 0.0409, + "rewards/chosen": 6.581891377766927, + "rewards/margins": 15.767271423339842, + "rewards/rejected": -9.185380045572916, + "step": 3174 + }, + { + "epoch": 0.8702206386186104, + "grad_norm": 2.234375, + "kl": 2.7071433067321777, + "learning_rate": 5e-06, + "logits/chosen": -2268608.8, + "logits/rejected": -22806496.0, + "logps/chosen": -500.1794921875, + "logps/rejected": -609.8454938616071, + "loss": 0.006, + "rewards/chosen": 8.26336441040039, + "rewards/margins": 20.2001341683524, + "rewards/rejected": -11.936769757952009, + "step": 3175 + }, + { + "epoch": 0.8704947238591202, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35068666.18181818, + "logits/rejected": 3151965.5384615385, + "logps/chosen": -411.20729758522725, + "logps/rejected": -946.6681189903846, + "loss": 0.0084, + "rewards/chosen": 6.540205522017046, + "rewards/margins": 35.30138669314084, + "rewards/rejected": -28.7611811711238, + "step": 3176 + }, + { + "epoch": 0.87076880909963, + "grad_norm": 6.34375, + "kl": 10.862166404724121, + "learning_rate": 5e-06, + "logits/chosen": -32126172.8, + "logits/rejected": -29852694.85714286, + "logps/chosen": -441.67060546875, + "logps/rejected": -556.6501116071429, + "loss": 0.0224, + "rewards/chosen": 8.136669158935547, + "rewards/margins": 19.313468388148717, + "rewards/rejected": -11.17679922921317, + "step": 3177 + }, + { + "epoch": 0.8710428943401398, + "grad_norm": 5.25, + "kl": 1.9175708293914795, + "learning_rate": 5e-06, + "logits/chosen": -6142199.2727272725, + "logits/rejected": -12446422.153846154, + "logps/chosen": -448.5359552556818, + "logps/rejected": -595.0501802884615, + "loss": 0.0127, + "rewards/chosen": 8.561208551580256, + "rewards/margins": 22.400331617235302, + "rewards/rejected": -13.839123065655048, + "step": 3178 + }, + { + "epoch": 0.8713169795806496, + "grad_norm": 1.390625, + "kl": 3.4038748741149902, + "learning_rate": 5e-06, + "logits/chosen": -22435481.14285714, + "logits/rejected": -26169715.2, + "logps/chosen": -377.26834542410717, + "logps/rejected": -438.667333984375, + "loss": 0.0036, + "rewards/chosen": 7.159152439662388, + "rewards/margins": 18.22049124581473, + "rewards/rejected": -11.061338806152344, + "step": 3179 + }, + { + "epoch": 0.8715910648211593, + "grad_norm": 3.703125, + "kl": 3.242008924484253, + "learning_rate": 5e-06, + "logits/chosen": -12516497.454545455, + "logits/rejected": -61540637.538461536, + "logps/chosen": -434.6834161931818, + "logps/rejected": -462.1008864182692, + "loss": 0.0453, + "rewards/chosen": 7.40624375776811, + "rewards/margins": 18.00394343662929, + "rewards/rejected": -10.597699678861177, + "step": 3180 + }, + { + "epoch": 0.8718651500616692, + "grad_norm": 4.5, + "kl": 8.882064819335938, + "learning_rate": 5e-06, + "logits/chosen": -13067894.4, + "logits/rejected": -25783921.777777776, + "logps/chosen": -455.0823567708333, + "logps/rejected": -250.95046657986111, + "loss": 0.0377, + "rewards/chosen": 7.6158192952473955, + "rewards/margins": 15.685543484157986, + "rewards/rejected": -8.069724188910591, + "step": 3181 + }, + { + "epoch": 0.872139235302179, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41419724.8, + "logits/rejected": 13013285.333333334, + "logps/chosen": -486.154296875, + "logps/rejected": -601.7275390625, + "loss": 0.0071, + "rewards/chosen": 8.19795633951823, + "rewards/margins": 22.7966552734375, + "rewards/rejected": -14.598698933919271, + "step": 3182 + }, + { + "epoch": 0.8724133205426888, + "grad_norm": 5.96875, + "kl": 0.6556282043457031, + "learning_rate": 5e-06, + "logits/chosen": 20031069.714285713, + "logits/rejected": -29130156.8, + "logps/chosen": -560.0069056919643, + "logps/rejected": -577.076708984375, + "loss": 0.0317, + "rewards/chosen": 7.380458286830357, + "rewards/margins": 20.29951651436942, + "rewards/rejected": -12.919058227539063, + "step": 3183 + }, + { + "epoch": 0.8726874057831986, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9670863.333333334, + "logits/rejected": -28612272.0, + "logps/chosen": -528.5943196614584, + "logps/rejected": -542.1112060546875, + "loss": 0.008, + "rewards/chosen": 8.631248474121094, + "rewards/margins": 21.382328033447266, + "rewards/rejected": -12.751079559326172, + "step": 3184 + }, + { + "epoch": 0.8729614910237083, + "grad_norm": 9.9375, + "kl": 6.341155529022217, + "learning_rate": 5e-06, + "logits/chosen": -9361827.692307692, + "logits/rejected": -18944023.272727273, + "logps/chosen": -482.5969426081731, + "logps/rejected": -533.03125, + "loss": 0.0446, + "rewards/chosen": 8.968907282902645, + "rewards/margins": 21.83179180438702, + "rewards/rejected": -12.862884521484375, + "step": 3185 + }, + { + "epoch": 0.8732355762642182, + "grad_norm": 4.625, + "kl": 4.06303596496582, + "learning_rate": 5e-06, + "logits/chosen": -6157614.285714285, + "logits/rejected": -46830313.6, + "logps/chosen": -453.58517020089283, + "logps/rejected": -619.509423828125, + "loss": 0.0135, + "rewards/chosen": 7.035718645368304, + "rewards/margins": 18.25872355869838, + "rewards/rejected": -11.223004913330078, + "step": 3186 + }, + { + "epoch": 0.873509661504728, + "grad_norm": 7.15625, + "kl": 6.628115653991699, + "learning_rate": 5e-06, + "logits/chosen": -23120403.692307692, + "logits/rejected": -44661445.81818182, + "logps/chosen": -400.7258488581731, + "logps/rejected": -638.9381214488636, + "loss": 0.0432, + "rewards/chosen": 7.011467566856971, + "rewards/margins": 20.86899764054305, + "rewards/rejected": -13.85753007368608, + "step": 3187 + }, + { + "epoch": 0.8737837467452377, + "grad_norm": 1.125, + "kl": 8.186946868896484, + "learning_rate": 5e-06, + "logits/chosen": -8122930.666666667, + "logits/rejected": -13871014.222222222, + "logps/chosen": -527.01591796875, + "logps/rejected": -518.6095920138889, + "loss": 0.0023, + "rewards/chosen": 9.176224772135416, + "rewards/margins": 21.53785383436415, + "rewards/rejected": -12.361629062228733, + "step": 3188 + }, + { + "epoch": 0.8740578319857476, + "grad_norm": 4.0, + "kl": 3.9878592491149902, + "learning_rate": 5e-06, + "logits/chosen": -13739684.923076924, + "logits/rejected": 119918801.45454545, + "logps/chosen": -428.24891075721155, + "logps/rejected": -491.4608043323864, + "loss": 0.0091, + "rewards/chosen": 8.010841369628906, + "rewards/margins": 19.44976529208097, + "rewards/rejected": -11.43892392245206, + "step": 3189 + }, + { + "epoch": 0.8743319172262574, + "grad_norm": 5.4375, + "kl": 7.476861476898193, + "learning_rate": 5e-06, + "logits/chosen": -6346473.846153846, + "logits/rejected": -31474807.272727273, + "logps/chosen": -389.1432542067308, + "logps/rejected": -508.3499200994318, + "loss": 0.0213, + "rewards/chosen": 6.219122666579026, + "rewards/margins": 18.622756477836127, + "rewards/rejected": -12.403633811257102, + "step": 3190 + }, + { + "epoch": 0.8746060024667671, + "grad_norm": 0.71484375, + "kl": 1.5904897451400757, + "learning_rate": 5e-06, + "logits/chosen": -23340829.09090909, + "logits/rejected": -22821267.692307692, + "logps/chosen": -484.4563654119318, + "logps/rejected": -495.09900841346155, + "loss": 0.0023, + "rewards/chosen": 7.607752713290128, + "rewards/margins": 19.705249332881476, + "rewards/rejected": -12.097496619591347, + "step": 3191 + }, + { + "epoch": 0.874880087707277, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9243687.2, + "logits/rejected": -17096585.14285714, + "logps/chosen": -362.3077392578125, + "logps/rejected": -518.8822544642857, + "loss": 0.0325, + "rewards/chosen": 5.513520431518555, + "rewards/margins": 16.945949063982283, + "rewards/rejected": -11.432428632463727, + "step": 3192 + }, + { + "epoch": 0.8751541729477867, + "grad_norm": 11.375, + "kl": 7.28342342376709, + "learning_rate": 5e-06, + "logits/chosen": -26351126.85714286, + "logits/rejected": -12848106.4, + "logps/chosen": -390.53700474330356, + "logps/rejected": -574.09052734375, + "loss": 0.0798, + "rewards/chosen": 5.827968052455357, + "rewards/margins": 18.62753622872489, + "rewards/rejected": -12.799568176269531, + "step": 3193 + }, + { + "epoch": 0.8754282581882966, + "grad_norm": 6.0, + "kl": 2.7055869102478027, + "learning_rate": 5e-06, + "logits/chosen": -52271882.666666664, + "logits/rejected": -30079477.333333332, + "logps/chosen": -568.8559163411459, + "logps/rejected": -612.9208984375, + "loss": 0.0092, + "rewards/chosen": 8.5652707417806, + "rewards/margins": 22.476057688395183, + "rewards/rejected": -13.910786946614584, + "step": 3194 + }, + { + "epoch": 0.8757023434288064, + "grad_norm": 3.078125, + "kl": 8.265227317810059, + "learning_rate": 5e-06, + "logits/chosen": -26491910.85714286, + "logits/rejected": -20166566.4, + "logps/chosen": -584.2923409598214, + "logps/rejected": -357.1662353515625, + "loss": 0.0068, + "rewards/chosen": 9.743141174316406, + "rewards/margins": 19.483753967285157, + "rewards/rejected": -9.74061279296875, + "step": 3195 + }, + { + "epoch": 0.8759764286693161, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16087057.777777778, + "logits/rejected": -27972661.333333332, + "logps/chosen": -437.3299153645833, + "logps/rejected": -471.80882161458334, + "loss": 0.0048, + "rewards/chosen": 6.970879448784722, + "rewards/margins": 17.703941175672743, + "rewards/rejected": -10.733061726888021, + "step": 3196 + }, + { + "epoch": 0.876250513909826, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29828538.666666668, + "logits/rejected": -13397709.333333334, + "logps/chosen": -440.6895751953125, + "logps/rejected": -400.9492594401042, + "loss": 0.0036, + "rewards/chosen": 9.086783727010092, + "rewards/margins": 17.91275405883789, + "rewards/rejected": -8.825970331827799, + "step": 3197 + }, + { + "epoch": 0.8765245991503358, + "grad_norm": 4.65625, + "kl": 8.31484603881836, + "learning_rate": 5e-06, + "logits/chosen": -12443510.588235294, + "logits/rejected": -20745645.714285713, + "logps/chosen": -407.15412454044116, + "logps/rejected": -543.5262974330357, + "loss": 0.0208, + "rewards/chosen": 7.1947008020737595, + "rewards/margins": 20.58871979272666, + "rewards/rejected": -13.394018990652901, + "step": 3198 + }, + { + "epoch": 0.8767986843908455, + "grad_norm": 2.34375, + "kl": 4.569572925567627, + "learning_rate": 5e-06, + "logits/chosen": -30346573.714285713, + "logits/rejected": -23451796.8, + "logps/chosen": -439.2001953125, + "logps/rejected": -646.142578125, + "loss": 0.0297, + "rewards/chosen": 6.5768230983189175, + "rewards/margins": 22.789808327811105, + "rewards/rejected": -16.212985229492187, + "step": 3199 + }, + { + "epoch": 0.8770727696313554, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26654263.272727273, + "logits/rejected": -24711170.46153846, + "logps/chosen": -403.2916370738636, + "logps/rejected": -620.2149188701923, + "loss": 0.0142, + "rewards/chosen": 8.153409784490412, + "rewards/margins": 21.578315414748825, + "rewards/rejected": -13.424905630258413, + "step": 3200 + }, + { + "epoch": 0.8773468548718651, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28721797.333333332, + "logits/rejected": -14689474.666666666, + "logps/chosen": -345.1105143229167, + "logps/rejected": -459.375, + "loss": 0.0035, + "rewards/chosen": 7.337057749430339, + "rewards/margins": 18.85095469156901, + "rewards/rejected": -11.513896942138672, + "step": 3201 + }, + { + "epoch": 0.8776209401123749, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50239336.72727273, + "logits/rejected": 36964792.615384616, + "logps/chosen": -460.20747514204544, + "logps/rejected": -899.0356820913462, + "loss": 0.0048, + "rewards/chosen": 6.700413097034801, + "rewards/margins": 48.349512593729514, + "rewards/rejected": -41.64909949669471, + "step": 3202 + }, + { + "epoch": 0.8778950253528848, + "grad_norm": 2.984375, + "kl": 3.262734889984131, + "learning_rate": 5e-06, + "logits/chosen": -15846592.0, + "logits/rejected": -17097664.0, + "logps/chosen": -426.89122817095586, + "logps/rejected": -599.8334263392857, + "loss": 0.0227, + "rewards/chosen": 6.816902609432445, + "rewards/margins": 19.739344364454766, + "rewards/rejected": -12.922441755022321, + "step": 3203 + }, + { + "epoch": 0.8781691105933945, + "grad_norm": 12.0625, + "kl": 0.13463720679283142, + "learning_rate": 5e-06, + "logits/chosen": -28322197.333333332, + "logits/rejected": -23185122.666666668, + "logps/chosen": -378.1623942057292, + "logps/rejected": -587.2187906901041, + "loss": 0.0629, + "rewards/chosen": 5.779300053914388, + "rewards/margins": 18.047971725463867, + "rewards/rejected": -12.268671671549479, + "step": 3204 + }, + { + "epoch": 0.8784431958339043, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -802321.8571428572, + "logits/rejected": -38076953.6, + "logps/chosen": -547.8897530691964, + "logps/rejected": -682.506494140625, + "loss": 0.0129, + "rewards/chosen": 8.273824964250837, + "rewards/margins": 25.25192555018834, + "rewards/rejected": -16.9781005859375, + "step": 3205 + }, + { + "epoch": 0.8787172810744142, + "grad_norm": 1.4609375, + "kl": 4.436384201049805, + "learning_rate": 5e-06, + "logits/chosen": -15672320.0, + "logits/rejected": -30298444.8, + "logps/chosen": -434.1386021205357, + "logps/rejected": -604.421630859375, + "loss": 0.0054, + "rewards/chosen": 7.252002716064453, + "rewards/margins": 22.34420394897461, + "rewards/rejected": -15.092201232910156, + "step": 3206 + }, + { + "epoch": 0.8789913663149239, + "grad_norm": 6.71875, + "kl": 4.989443778991699, + "learning_rate": 5e-06, + "logits/chosen": -18852660.923076924, + "logits/rejected": -15739104.0, + "logps/chosen": -433.7877854567308, + "logps/rejected": -456.8053089488636, + "loss": 0.0308, + "rewards/chosen": 6.97680898813101, + "rewards/margins": 18.98552591817362, + "rewards/rejected": -12.008716930042613, + "step": 3207 + }, + { + "epoch": 0.8792654515554338, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1226917.5555555555, + "logits/rejected": -7297043.2, + "logps/chosen": -488.3860677083333, + "logps/rejected": -448.04388020833335, + "loss": 0.0039, + "rewards/chosen": 8.799327426486546, + "rewards/margins": 21.262978956434463, + "rewards/rejected": -12.463651529947917, + "step": 3208 + }, + { + "epoch": 0.8795395367959435, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30506557.09090909, + "logits/rejected": -15961010.461538462, + "logps/chosen": -366.50297407670456, + "logps/rejected": -551.4542893629807, + "loss": 0.0574, + "rewards/chosen": 5.440392927689985, + "rewards/margins": 18.570702986283735, + "rewards/rejected": -13.13031005859375, + "step": 3209 + }, + { + "epoch": 0.8798136220364533, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14442862.76923077, + "logits/rejected": -48436104.72727273, + "logps/chosen": -360.1254131610577, + "logps/rejected": -495.4343927556818, + "loss": 0.0271, + "rewards/chosen": 7.128231928898738, + "rewards/margins": 16.713983549104704, + "rewards/rejected": -9.585751620205967, + "step": 3210 + }, + { + "epoch": 0.8800877072769632, + "grad_norm": 1.3828125, + "kl": 6.178009033203125, + "learning_rate": 5e-06, + "logits/chosen": -18093730.0, + "logits/rejected": -16623450.0, + "logps/chosen": -447.10833740234375, + "logps/rejected": -620.852783203125, + "loss": 0.0024, + "rewards/chosen": 8.873225212097168, + "rewards/margins": 25.081332206726074, + "rewards/rejected": -16.208106994628906, + "step": 3211 + }, + { + "epoch": 0.8803617925174729, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27699138.46153846, + "logits/rejected": -17430839.272727273, + "logps/chosen": -345.91162109375, + "logps/rejected": -768.4699928977273, + "loss": 0.0078, + "rewards/chosen": 6.175666222205529, + "rewards/margins": 26.15525273676519, + "rewards/rejected": -19.97958651455966, + "step": 3212 + }, + { + "epoch": 0.8806358777579827, + "grad_norm": 1.8125, + "kl": 8.027090072631836, + "learning_rate": 5e-06, + "logits/chosen": -5221369.066666666, + "logits/rejected": -27682414.222222224, + "logps/chosen": -522.93095703125, + "logps/rejected": -467.26453993055554, + "loss": 0.0437, + "rewards/chosen": 8.538765462239583, + "rewards/margins": 21.218963283962673, + "rewards/rejected": -12.680197821723091, + "step": 3213 + }, + { + "epoch": 0.8809099629984926, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21917790.0, + "logits/rejected": -26878952.0, + "logps/chosen": -356.3764343261719, + "logps/rejected": -660.5711669921875, + "loss": 0.0447, + "rewards/chosen": 6.173887729644775, + "rewards/margins": 21.972740650177002, + "rewards/rejected": -15.798852920532227, + "step": 3214 + }, + { + "epoch": 0.8811840482390023, + "grad_norm": 13.1875, + "kl": 16.149646759033203, + "learning_rate": 5e-06, + "logits/chosen": -33576756.705882356, + "logits/rejected": -15480754.285714285, + "logps/chosen": -417.81985294117646, + "logps/rejected": -638.1234654017857, + "loss": 0.1324, + "rewards/chosen": 6.479208553538603, + "rewards/margins": 20.503474387801994, + "rewards/rejected": -14.024265834263392, + "step": 3215 + }, + { + "epoch": 0.8814581334795121, + "grad_norm": 3.609375, + "kl": 0.6089484095573425, + "learning_rate": 5e-06, + "logits/chosen": -27170872.0, + "logits/rejected": -27791882.666666668, + "logps/chosen": -350.38427734375, + "logps/rejected": -671.4813639322916, + "loss": 0.0695, + "rewards/chosen": 5.564538319905599, + "rewards/margins": 18.314829508463543, + "rewards/rejected": -12.750291188557943, + "step": 3216 + }, + { + "epoch": 0.8817322187200219, + "grad_norm": 2.359375, + "kl": 3.077131986618042, + "learning_rate": 5e-06, + "logits/chosen": -18529837.09090909, + "logits/rejected": -8565084.307692308, + "logps/chosen": -408.54585404829544, + "logps/rejected": -503.41000600961536, + "loss": 0.0385, + "rewards/chosen": 10.05973261052912, + "rewards/margins": 21.268681772938976, + "rewards/rejected": -11.208949162409855, + "step": 3217 + }, + { + "epoch": 0.8820063039605317, + "grad_norm": 7.3125, + "kl": 6.260440826416016, + "learning_rate": 5e-06, + "logits/chosen": -16344406.153846154, + "logits/rejected": -28025332.363636363, + "logps/chosen": -400.97513521634613, + "logps/rejected": -714.2665127840909, + "loss": 0.0634, + "rewards/chosen": 7.417241610013521, + "rewards/margins": 23.719856689026308, + "rewards/rejected": -16.302615079012785, + "step": 3218 + }, + { + "epoch": 0.8822803892010416, + "grad_norm": 0.63671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36898973.538461536, + "logits/rejected": -27348221.09090909, + "logps/chosen": -470.0754582331731, + "logps/rejected": -513.9871271306819, + "loss": 0.0015, + "rewards/chosen": 9.114378122182993, + "rewards/margins": 20.84045650242092, + "rewards/rejected": -11.726078380237926, + "step": 3219 + }, + { + "epoch": 0.8825544744415513, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36171960.0, + "logits/rejected": -26659076.0, + "logps/chosen": -399.140625, + "logps/rejected": -500.1515197753906, + "loss": 0.0053, + "rewards/chosen": 7.320361614227295, + "rewards/margins": 18.53947687149048, + "rewards/rejected": -11.219115257263184, + "step": 3220 + }, + { + "epoch": 0.8828285596820611, + "grad_norm": 7.46875, + "kl": 1.345106840133667, + "learning_rate": 5e-06, + "logits/chosen": 1249959.5, + "logits/rejected": -17193858.0, + "logps/chosen": -359.1897888183594, + "logps/rejected": -520.11767578125, + "loss": 0.055, + "rewards/chosen": 5.781913757324219, + "rewards/margins": 12.256040096282959, + "rewards/rejected": -6.47412633895874, + "step": 3221 + }, + { + "epoch": 0.883102644922571, + "grad_norm": 6.4375, + "kl": 2.6050708293914795, + "learning_rate": 5e-06, + "logits/chosen": -33832888.615384616, + "logits/rejected": -17101722.181818184, + "logps/chosen": -347.9285231370192, + "logps/rejected": -550.2399236505681, + "loss": 0.0244, + "rewards/chosen": 6.440057020920974, + "rewards/margins": 18.275156674685178, + "rewards/rejected": -11.835099653764205, + "step": 3222 + }, + { + "epoch": 0.8833767301630807, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20163361.230769232, + "logits/rejected": -42422289.45454545, + "logps/chosen": -352.62391075721155, + "logps/rejected": -617.9169921875, + "loss": 0.0542, + "rewards/chosen": 6.9231438269981975, + "rewards/margins": 21.949707991593367, + "rewards/rejected": -15.02656416459517, + "step": 3223 + }, + { + "epoch": 0.8836508154035905, + "grad_norm": 2.34375, + "kl": 11.116971969604492, + "learning_rate": 5e-06, + "logits/chosen": -36479862.4, + "logits/rejected": -26194596.57142857, + "logps/chosen": -522.82529296875, + "logps/rejected": -624.6535993303571, + "loss": 0.0555, + "rewards/chosen": 8.883546447753906, + "rewards/margins": 23.872751944405692, + "rewards/rejected": -14.989205496651786, + "step": 3224 + }, + { + "epoch": 0.8839249006441003, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69733632.0, + "logits/rejected": -38340016.0, + "logps/chosen": -488.1712890625, + "logps/rejected": -525.1149553571429, + "loss": 0.0709, + "rewards/chosen": 7.881427764892578, + "rewards/margins": 18.273502349853516, + "rewards/rejected": -10.392074584960938, + "step": 3225 + }, + { + "epoch": 0.8841989858846101, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18728761.14285714, + "logits/rejected": -23744718.4, + "logps/chosen": -347.81065150669644, + "logps/rejected": -345.52724609375, + "loss": 0.0579, + "rewards/chosen": 5.644641876220703, + "rewards/margins": 17.03212661743164, + "rewards/rejected": -11.387484741210937, + "step": 3226 + }, + { + "epoch": 0.8844730711251199, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29630821.818181816, + "logits/rejected": -24755950.769230768, + "logps/chosen": -342.84676846590907, + "logps/rejected": -598.2684420072115, + "loss": 0.0096, + "rewards/chosen": 6.870990406383168, + "rewards/margins": 18.125445972789418, + "rewards/rejected": -11.25445556640625, + "step": 3227 + }, + { + "epoch": 0.8847471563656297, + "grad_norm": 10.1875, + "kl": 8.211261749267578, + "learning_rate": 5e-06, + "logits/chosen": -21798974.545454547, + "logits/rejected": -24631296.0, + "logps/chosen": -463.20481178977275, + "logps/rejected": -521.91796875, + "loss": 0.0397, + "rewards/chosen": 8.655975341796875, + "rewards/margins": 20.75291325495793, + "rewards/rejected": -12.096937913161058, + "step": 3228 + }, + { + "epoch": 0.8850212416061395, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41726968.88888889, + "logits/rejected": -18974336.0, + "logps/chosen": -413.1697048611111, + "logps/rejected": -524.1396809895833, + "loss": 0.0296, + "rewards/chosen": 7.8819224039713545, + "rewards/margins": 17.489337158203124, + "rewards/rejected": -9.60741475423177, + "step": 3229 + }, + { + "epoch": 0.8852953268466494, + "grad_norm": 5.875, + "kl": 3.3855419158935547, + "learning_rate": 5e-06, + "logits/chosen": -41981248.0, + "logits/rejected": -9268004.363636363, + "logps/chosen": -401.7417743389423, + "logps/rejected": -364.7477361505682, + "loss": 0.0271, + "rewards/chosen": 8.618561377892128, + "rewards/margins": 18.96642223438183, + "rewards/rejected": -10.347860856489701, + "step": 3230 + }, + { + "epoch": 0.8855694120871591, + "grad_norm": 9.1875, + "kl": 0.9567846059799194, + "learning_rate": 5e-06, + "logits/chosen": 1782868.8235294118, + "logits/rejected": -47970006.85714286, + "logps/chosen": -365.58191636029414, + "logps/rejected": -603.8791155133929, + "loss": 0.0687, + "rewards/chosen": 6.316866257611443, + "rewards/margins": 17.90764188365776, + "rewards/rejected": -11.590775626046318, + "step": 3231 + }, + { + "epoch": 0.8858434973276689, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20957425.454545453, + "logits/rejected": -31004236.307692308, + "logps/chosen": -400.94140625, + "logps/rejected": -550.6796499399038, + "loss": 0.0259, + "rewards/chosen": 7.200594815340909, + "rewards/margins": 20.607779122732737, + "rewards/rejected": -13.407184307391827, + "step": 3232 + }, + { + "epoch": 0.8861175825681787, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11620022.857142856, + "logits/rejected": -18695305.411764707, + "logps/chosen": -408.00394112723217, + "logps/rejected": -482.07892922794116, + "loss": 0.0351, + "rewards/chosen": 7.047061375209263, + "rewards/margins": 16.544236704081047, + "rewards/rejected": -9.497175328871784, + "step": 3233 + }, + { + "epoch": 0.8863916678086885, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35477460.0, + "logits/rejected": -30092682.0, + "logps/chosen": -442.61346435546875, + "logps/rejected": -574.8838500976562, + "loss": 0.0072, + "rewards/chosen": 7.878837585449219, + "rewards/margins": 18.752997398376465, + "rewards/rejected": -10.874159812927246, + "step": 3234 + }, + { + "epoch": 0.8866657530491983, + "grad_norm": 1.9296875, + "kl": 3.6798102855682373, + "learning_rate": 5e-06, + "logits/chosen": -16043466.181818182, + "logits/rejected": -22780214.153846152, + "logps/chosen": -384.73659446022725, + "logps/rejected": -549.3113356370193, + "loss": 0.0036, + "rewards/chosen": 8.59074679287997, + "rewards/margins": 21.5423646940218, + "rewards/rejected": -12.951617901141827, + "step": 3235 + }, + { + "epoch": 0.8869398382897081, + "grad_norm": 10.625, + "kl": 4.579585075378418, + "learning_rate": 5e-06, + "logits/chosen": -7307372.8, + "logits/rejected": -40877660.44444445, + "logps/chosen": -378.65205078125, + "logps/rejected": -405.95350477430554, + "loss": 0.1039, + "rewards/chosen": 6.167076619466146, + "rewards/margins": 15.764069112141927, + "rewards/rejected": -9.596992492675781, + "step": 3236 + }, + { + "epoch": 0.8872139235302179, + "grad_norm": 2.859375, + "kl": 2.0041847229003906, + "learning_rate": 5e-06, + "logits/chosen": -34567738.666666664, + "logits/rejected": -27282781.333333332, + "logps/chosen": -377.6537679036458, + "logps/rejected": -562.3733723958334, + "loss": 0.0152, + "rewards/chosen": 7.13112195332845, + "rewards/margins": 19.26497968037923, + "rewards/rejected": -12.133857727050781, + "step": 3237 + }, + { + "epoch": 0.8874880087707276, + "grad_norm": 7.90625, + "kl": 8.515624046325684, + "learning_rate": 5e-06, + "logits/chosen": -20098509.53846154, + "logits/rejected": -39245637.81818182, + "logps/chosen": -463.47055288461536, + "logps/rejected": -641.7962979403409, + "loss": 0.0603, + "rewards/chosen": 7.36254648061899, + "rewards/margins": 22.704405777937883, + "rewards/rejected": -15.341859297318893, + "step": 3238 + }, + { + "epoch": 0.8877620940112375, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26134370.46153846, + "logits/rejected": -22497213.09090909, + "logps/chosen": -355.5878155048077, + "logps/rejected": -574.5882457386364, + "loss": 0.0108, + "rewards/chosen": 6.702585073617788, + "rewards/margins": 18.84906251280458, + "rewards/rejected": -12.14647743918679, + "step": 3239 + }, + { + "epoch": 0.8880361792517473, + "grad_norm": 2.578125, + "kl": 2.637073278427124, + "learning_rate": 5e-06, + "logits/chosen": -35110082.461538464, + "logits/rejected": -24004939.636363637, + "logps/chosen": -528.8465294471154, + "logps/rejected": -447.39688387784093, + "loss": 0.0063, + "rewards/chosen": 9.799873938927284, + "rewards/margins": 23.168158684577143, + "rewards/rejected": -13.368284745649857, + "step": 3240 + }, + { + "epoch": 0.8883102644922571, + "grad_norm": 15.4375, + "kl": 6.596280574798584, + "learning_rate": 5e-06, + "logits/chosen": -9986984.666666666, + "logits/rejected": 27031077.333333332, + "logps/chosen": -392.3957112630208, + "logps/rejected": -610.51611328125, + "loss": 0.0307, + "rewards/chosen": 7.412587483723958, + "rewards/margins": 19.362889607747395, + "rewards/rejected": -11.950302124023438, + "step": 3241 + }, + { + "epoch": 0.8885843497327669, + "grad_norm": 3.03125, + "kl": 3.1758735179901123, + "learning_rate": 5e-06, + "logits/chosen": -29956166.0, + "logits/rejected": 19353276.0, + "logps/chosen": -415.4676513671875, + "logps/rejected": -559.314697265625, + "loss": 0.0081, + "rewards/chosen": 8.19990348815918, + "rewards/margins": 23.30184555053711, + "rewards/rejected": -15.10194206237793, + "step": 3242 + }, + { + "epoch": 0.8888584349732767, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6949664.0, + "logits/rejected": -21293011.692307692, + "logps/chosen": -564.7020152698864, + "logps/rejected": -666.8571965144231, + "loss": 0.0241, + "rewards/chosen": 6.512542031028054, + "rewards/margins": 16.254537035535265, + "rewards/rejected": -9.741995004507212, + "step": 3243 + }, + { + "epoch": 0.8891325202137865, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31071170.666666668, + "logits/rejected": 11758368.0, + "logps/chosen": -461.9267578125, + "logps/rejected": -570.314697265625, + "loss": 0.0023, + "rewards/chosen": 7.589200337727864, + "rewards/margins": 21.634119669596355, + "rewards/rejected": -14.04491933186849, + "step": 3244 + }, + { + "epoch": 0.8894066054542963, + "grad_norm": 4.71875, + "kl": 0.6440035700798035, + "learning_rate": 5e-06, + "logits/chosen": -13650864.0, + "logits/rejected": -10280398.545454545, + "logps/chosen": -352.9581956129808, + "logps/rejected": -395.66623757102275, + "loss": 0.024, + "rewards/chosen": 4.956295893742488, + "rewards/margins": 15.518719599797176, + "rewards/rejected": -10.562423706054688, + "step": 3245 + }, + { + "epoch": 0.889680690694806, + "grad_norm": 5.09375, + "kl": 7.820162773132324, + "learning_rate": 5e-06, + "logits/chosen": -24887162.666666668, + "logits/rejected": -18934580.0, + "logps/chosen": -412.9095052083333, + "logps/rejected": -670.3108723958334, + "loss": 0.0158, + "rewards/chosen": 7.128684997558594, + "rewards/margins": 19.75416056315104, + "rewards/rejected": -12.625475565592447, + "step": 3246 + }, + { + "epoch": 0.8899547759353159, + "grad_norm": 13.5, + "kl": 7.234199523925781, + "learning_rate": 5e-06, + "logits/chosen": -27825954.285714287, + "logits/rejected": -26593708.8, + "logps/chosen": -419.59737723214283, + "logps/rejected": -523.5751953125, + "loss": 0.0565, + "rewards/chosen": 7.654515947614398, + "rewards/margins": 19.98647700718471, + "rewards/rejected": -12.331961059570313, + "step": 3247 + }, + { + "epoch": 0.8902288611758257, + "grad_norm": 4.78125, + "kl": 4.804170608520508, + "learning_rate": 5e-06, + "logits/chosen": -17914388.0, + "logits/rejected": -38657592.0, + "logps/chosen": -377.8473205566406, + "logps/rejected": -432.611572265625, + "loss": 0.0221, + "rewards/chosen": 7.371123313903809, + "rewards/margins": 18.325772285461426, + "rewards/rejected": -10.954648971557617, + "step": 3248 + }, + { + "epoch": 0.8905029464163354, + "grad_norm": 3.359375, + "kl": 3.8903567790985107, + "learning_rate": 5e-06, + "logits/chosen": 9733331.333333334, + "logits/rejected": -5229635.0, + "logps/chosen": -425.03125, + "logps/rejected": -435.4539388020833, + "loss": 0.0296, + "rewards/chosen": 7.814868291219075, + "rewards/margins": 17.332443873087566, + "rewards/rejected": -9.51757558186849, + "step": 3249 + }, + { + "epoch": 0.8907770316568453, + "grad_norm": 4.75, + "kl": 0.3783671259880066, + "learning_rate": 5e-06, + "logits/chosen": -19628456.533333335, + "logits/rejected": -26085703.111111112, + "logps/chosen": -371.69781901041665, + "logps/rejected": -464.9138454861111, + "loss": 0.0197, + "rewards/chosen": 7.9822230021158855, + "rewards/margins": 20.478937276204427, + "rewards/rejected": -12.496714274088541, + "step": 3250 + }, + { + "epoch": 0.8910511168973551, + "grad_norm": 4.6875, + "kl": 9.29456901550293, + "learning_rate": 5e-06, + "logits/chosen": -14207396.363636363, + "logits/rejected": -15413304.615384616, + "logps/chosen": -267.20015092329544, + "logps/rejected": -502.5461989182692, + "loss": 0.0352, + "rewards/chosen": 7.559844970703125, + "rewards/margins": 18.99899174616887, + "rewards/rejected": -11.439146775465746, + "step": 3251 + }, + { + "epoch": 0.8913252021378649, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8623325.090909092, + "logits/rejected": -30061097.846153848, + "logps/chosen": -455.12895063920456, + "logps/rejected": -630.4149639423077, + "loss": 0.0542, + "rewards/chosen": 6.37502011385831, + "rewards/margins": 22.74652467740999, + "rewards/rejected": -16.37150456355168, + "step": 3252 + }, + { + "epoch": 0.8915992873783747, + "grad_norm": 0.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30682077.09090909, + "logits/rejected": -42229883.07692308, + "logps/chosen": -443.88676313920456, + "logps/rejected": -582.9907977764423, + "loss": 0.0017, + "rewards/chosen": 9.172198208895596, + "rewards/margins": 25.53758517178622, + "rewards/rejected": -16.365386962890625, + "step": 3253 + }, + { + "epoch": 0.8918733726188844, + "grad_norm": 5.25, + "kl": 6.2825727462768555, + "learning_rate": 5e-06, + "logits/chosen": -38280713.14285714, + "logits/rejected": -22864782.4, + "logps/chosen": -358.9469517299107, + "logps/rejected": -428.0935546875, + "loss": 0.0698, + "rewards/chosen": 5.893016270228794, + "rewards/margins": 14.550309208461215, + "rewards/rejected": -8.657292938232422, + "step": 3254 + }, + { + "epoch": 0.8921474578593943, + "grad_norm": 0.302734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20675216.0, + "logits/rejected": -15089314.0, + "logps/chosen": -462.48016357421875, + "logps/rejected": -560.0872802734375, + "loss": 0.0014, + "rewards/chosen": 7.614752769470215, + "rewards/margins": 23.237372398376465, + "rewards/rejected": -15.62261962890625, + "step": 3255 + }, + { + "epoch": 0.8924215430999041, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7496715.2, + "logits/rejected": -553685.7142857143, + "logps/chosen": -477.6796875, + "logps/rejected": -586.4926060267857, + "loss": 0.0397, + "rewards/chosen": 7.13824234008789, + "rewards/margins": 20.09374553135463, + "rewards/rejected": -12.955503191266741, + "step": 3256 + }, + { + "epoch": 0.8926956283404138, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24596992.0, + "logits/rejected": -37172072.0, + "logps/chosen": -372.56756591796875, + "logps/rejected": -535.0221557617188, + "loss": 0.0186, + "rewards/chosen": 8.130743026733398, + "rewards/margins": 21.714662551879883, + "rewards/rejected": -13.583919525146484, + "step": 3257 + }, + { + "epoch": 0.8929697135809237, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10477274.285714285, + "logits/rejected": -21416798.11764706, + "logps/chosen": -498.22422572544644, + "logps/rejected": -618.2403492647059, + "loss": 0.0182, + "rewards/chosen": 8.580824715750557, + "rewards/margins": 23.519140612177488, + "rewards/rejected": -14.93831589642693, + "step": 3258 + }, + { + "epoch": 0.8932437988214335, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8472916.444444444, + "logits/rejected": -13560296.533333333, + "logps/chosen": -380.42298719618054, + "logps/rejected": -617.4619791666667, + "loss": 0.0036, + "rewards/chosen": 7.398857964409722, + "rewards/margins": 21.708610365125868, + "rewards/rejected": -14.309752400716146, + "step": 3259 + }, + { + "epoch": 0.8935178840619432, + "grad_norm": 5.65625, + "kl": 6.186586380004883, + "learning_rate": 5e-06, + "logits/chosen": -34002786.666666664, + "logits/rejected": -24032426.666666668, + "logps/chosen": -398.94873046875, + "logps/rejected": -482.4241536458333, + "loss": 0.0221, + "rewards/chosen": 8.141326904296875, + "rewards/margins": 20.968124389648438, + "rewards/rejected": -12.826797485351562, + "step": 3260 + }, + { + "epoch": 0.8937919693024531, + "grad_norm": 7.84375, + "kl": 4.848623752593994, + "learning_rate": 5e-06, + "logits/chosen": -9765989.714285715, + "logits/rejected": -29061248.0, + "logps/chosen": -406.53250558035717, + "logps/rejected": -408.15927734375, + "loss": 0.0255, + "rewards/chosen": 7.07335444859096, + "rewards/margins": 18.36395961216518, + "rewards/rejected": -11.290605163574218, + "step": 3261 + }, + { + "epoch": 0.8940660545429628, + "grad_norm": 4.0625, + "kl": 1.4335403442382812, + "learning_rate": 5e-06, + "logits/chosen": -28030698.666666668, + "logits/rejected": -17926590.666666668, + "logps/chosen": -517.23681640625, + "logps/rejected": -499.6307373046875, + "loss": 0.0085, + "rewards/chosen": 7.804893493652344, + "rewards/margins": 20.441182454427086, + "rewards/rejected": -12.63628896077474, + "step": 3262 + }, + { + "epoch": 0.8943401397834727, + "grad_norm": 2.3125, + "kl": 2.4388110637664795, + "learning_rate": 5e-06, + "logits/chosen": -25153641.846153848, + "logits/rejected": -8974000.0, + "logps/chosen": -423.4836989182692, + "logps/rejected": -361.3936656605114, + "loss": 0.0085, + "rewards/chosen": 8.547644981971153, + "rewards/margins": 17.26070067932556, + "rewards/rejected": -8.713055697354404, + "step": 3263 + }, + { + "epoch": 0.8946142250239825, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47929358.222222224, + "logits/rejected": -35883669.333333336, + "logps/chosen": -407.88226996527777, + "logps/rejected": -581.3569661458333, + "loss": 0.033, + "rewards/chosen": 7.557284884982639, + "rewards/margins": 20.04415724012587, + "rewards/rejected": -12.486872355143229, + "step": 3264 + }, + { + "epoch": 0.8948883102644922, + "grad_norm": 4.75, + "kl": 6.9351911544799805, + "learning_rate": 5e-06, + "logits/chosen": -25928552.0, + "logits/rejected": -36372784.0, + "logps/chosen": -528.3594563802084, + "logps/rejected": -504.8844807942708, + "loss": 0.0492, + "rewards/chosen": 8.79972775777181, + "rewards/margins": 17.525281270345054, + "rewards/rejected": -8.725553512573242, + "step": 3265 + }, + { + "epoch": 0.8951623955050021, + "grad_norm": 1.8203125, + "kl": 1.251556396484375, + "learning_rate": 5e-06, + "logits/chosen": -25024546.133333333, + "logits/rejected": -57798720.0, + "logps/chosen": -561.7895182291667, + "logps/rejected": -420.4456380208333, + "loss": 0.0042, + "rewards/chosen": 8.66520487467448, + "rewards/margins": 21.299366082085506, + "rewards/rejected": -12.634161207411024, + "step": 3266 + }, + { + "epoch": 0.8954364807455119, + "grad_norm": 5.5, + "kl": 4.426759243011475, + "learning_rate": 5e-06, + "logits/chosen": -2732600.727272727, + "logits/rejected": -9844343.384615384, + "logps/chosen": -399.86820845170456, + "logps/rejected": -543.3680513822115, + "loss": 0.0143, + "rewards/chosen": 9.014307195490057, + "rewards/margins": 22.17362186458561, + "rewards/rejected": -13.159314669095552, + "step": 3267 + }, + { + "epoch": 0.8957105659860216, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1513986.0, + "logits/rejected": -24948548.0, + "logps/chosen": -296.0464782714844, + "logps/rejected": -579.311767578125, + "loss": 0.0166, + "rewards/chosen": 6.027951240539551, + "rewards/margins": 20.820116996765137, + "rewards/rejected": -14.792165756225586, + "step": 3268 + }, + { + "epoch": 0.8959846512265315, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42215744.0, + "logits/rejected": -24506328.615384616, + "logps/chosen": -357.41537198153407, + "logps/rejected": -581.6946364182693, + "loss": 0.0134, + "rewards/chosen": 7.015277515758168, + "rewards/margins": 19.535771243222108, + "rewards/rejected": -12.520493727463942, + "step": 3269 + }, + { + "epoch": 0.8962587364670412, + "grad_norm": 12.875, + "kl": 9.649497985839844, + "learning_rate": 5e-06, + "logits/chosen": 73620999.52941176, + "logits/rejected": -37814742.85714286, + "logps/chosen": -336.42580997242646, + "logps/rejected": -404.19022042410717, + "loss": 0.0571, + "rewards/chosen": 5.41325782327091, + "rewards/margins": 13.470160572468732, + "rewards/rejected": -8.056902749197823, + "step": 3270 + }, + { + "epoch": 0.896532821707551, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20637604.923076924, + "logits/rejected": -32685352.727272727, + "logps/chosen": -506.9035456730769, + "logps/rejected": -557.9881480823864, + "loss": 0.0054, + "rewards/chosen": 7.201697129469651, + "rewards/margins": 18.66140699053144, + "rewards/rejected": -11.45970986106179, + "step": 3271 + }, + { + "epoch": 0.8968069069480609, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 29931418.666666668, + "logits/rejected": -39794386.666666664, + "logps/chosen": -514.921142578125, + "logps/rejected": -506.8558756510417, + "loss": 0.0497, + "rewards/chosen": 5.817975997924805, + "rewards/margins": 19.32566261291504, + "rewards/rejected": -13.507686614990234, + "step": 3272 + }, + { + "epoch": 0.8970809921885706, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5020721.846153846, + "logits/rejected": -55526318.54545455, + "logps/chosen": -544.6286057692307, + "logps/rejected": -501.5062144886364, + "loss": 0.0111, + "rewards/chosen": 8.043867844801683, + "rewards/margins": 19.868229472553814, + "rewards/rejected": -11.82436162775213, + "step": 3273 + }, + { + "epoch": 0.8973550774290805, + "grad_norm": 8.8125, + "kl": 2.472839832305908, + "learning_rate": 5e-06, + "logits/chosen": -9333807.111111112, + "logits/rejected": -21346265.6, + "logps/chosen": -399.6330295138889, + "logps/rejected": -503.27623697916664, + "loss": 0.0215, + "rewards/chosen": 7.2946582370334205, + "rewards/margins": 18.252613152398006, + "rewards/rejected": -10.957954915364583, + "step": 3274 + }, + { + "epoch": 0.8976291626695903, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16452522.666666666, + "logits/rejected": -19393169.777777776, + "logps/chosen": -379.53753255208335, + "logps/rejected": -445.21978081597223, + "loss": 0.0395, + "rewards/chosen": 5.465037027994792, + "rewards/margins": 16.932533264160156, + "rewards/rejected": -11.467496236165365, + "step": 3275 + }, + { + "epoch": 0.8979032479101, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24017180.0, + "logits/rejected": -27939586.0, + "logps/chosen": -454.31378173828125, + "logps/rejected": -641.167236328125, + "loss": 0.0059, + "rewards/chosen": 6.601651668548584, + "rewards/margins": 19.51970624923706, + "rewards/rejected": -12.918054580688477, + "step": 3276 + }, + { + "epoch": 0.8981773331506099, + "grad_norm": 11.25, + "kl": 5.83911657333374, + "learning_rate": 5e-06, + "logits/chosen": -18926806.85714286, + "logits/rejected": -40329222.4, + "logps/chosen": -503.3309849330357, + "logps/rejected": -465.16923828125, + "loss": 0.037, + "rewards/chosen": 8.052865709577288, + "rewards/margins": 19.475077165876115, + "rewards/rejected": -11.422211456298829, + "step": 3277 + }, + { + "epoch": 0.8984514183911196, + "grad_norm": 8.75, + "kl": 5.002000331878662, + "learning_rate": 5e-06, + "logits/chosen": -41034776.88888889, + "logits/rejected": -50876168.53333333, + "logps/chosen": -412.00678168402777, + "logps/rejected": -657.5606770833333, + "loss": 0.0155, + "rewards/chosen": 8.184183756510416, + "rewards/margins": 23.990234375, + "rewards/rejected": -15.806050618489584, + "step": 3278 + }, + { + "epoch": 0.8987255036316294, + "grad_norm": 0.62109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15535637.333333334, + "logits/rejected": -18724253.333333332, + "logps/chosen": -499.831298828125, + "logps/rejected": -517.4775390625, + "loss": 0.0021, + "rewards/chosen": 7.399769465128581, + "rewards/margins": 24.578421910603844, + "rewards/rejected": -17.17865244547526, + "step": 3279 + }, + { + "epoch": 0.8989995888721393, + "grad_norm": 5.9375, + "kl": 2.679523468017578, + "learning_rate": 5e-06, + "logits/chosen": -20396353.333333332, + "logits/rejected": -35951306.666666664, + "logps/chosen": -402.4847005208333, + "logps/rejected": -604.6202392578125, + "loss": 0.0259, + "rewards/chosen": 7.597684224446614, + "rewards/margins": 25.267824808756508, + "rewards/rejected": -17.670140584309895, + "step": 3280 + }, + { + "epoch": 0.899273674112649, + "grad_norm": 12.125, + "kl": 8.645370483398438, + "learning_rate": 5e-06, + "logits/chosen": -24609770.666666668, + "logits/rejected": -29221781.333333332, + "logps/chosen": -388.2260416666667, + "logps/rejected": -514.3255750868055, + "loss": 0.1185, + "rewards/chosen": 6.993094380696615, + "rewards/margins": 19.641464063856336, + "rewards/rejected": -12.648369683159721, + "step": 3281 + }, + { + "epoch": 0.8995477593531588, + "grad_norm": 13.6875, + "kl": 3.4817237854003906, + "learning_rate": 5e-06, + "logits/chosen": -27715874.90909091, + "logits/rejected": -13040940.307692308, + "logps/chosen": -401.4315074573864, + "logps/rejected": -592.7224308894231, + "loss": 0.0731, + "rewards/chosen": 7.350491610440341, + "rewards/margins": 19.860050654911493, + "rewards/rejected": -12.509559044471153, + "step": 3282 + }, + { + "epoch": 0.8998218445936687, + "grad_norm": 1.0546875, + "kl": 1.5985807180404663, + "learning_rate": 5e-06, + "logits/chosen": -12234441.142857144, + "logits/rejected": -32354873.6, + "logps/chosen": -404.85756138392856, + "logps/rejected": -532.83427734375, + "loss": 0.003, + "rewards/chosen": 7.789879935128348, + "rewards/margins": 22.405726187569755, + "rewards/rejected": -14.615846252441406, + "step": 3283 + }, + { + "epoch": 0.9000959298341784, + "grad_norm": 3.484375, + "kl": 7.507717132568359, + "learning_rate": 5e-06, + "logits/chosen": -9595502.666666666, + "logits/rejected": -20425009.333333332, + "logps/chosen": -573.1148274739584, + "logps/rejected": -497.5091145833333, + "loss": 0.0103, + "rewards/chosen": 8.022148768107096, + "rewards/margins": 22.559995651245117, + "rewards/rejected": -14.537846883138021, + "step": 3284 + }, + { + "epoch": 0.9003700150746883, + "grad_norm": 4.0, + "kl": 0.04975064843893051, + "learning_rate": 5e-06, + "logits/chosen": -31902880.0, + "logits/rejected": -32576755.2, + "logps/chosen": -521.7803780691964, + "logps/rejected": -597.17646484375, + "loss": 0.0089, + "rewards/chosen": 8.060741969517299, + "rewards/margins": 23.30248849051339, + "rewards/rejected": -15.241746520996093, + "step": 3285 + }, + { + "epoch": 0.900644100315198, + "grad_norm": 3.375, + "kl": 2.3222084045410156, + "learning_rate": 5e-06, + "logits/chosen": -28077840.0, + "logits/rejected": -45163539.2, + "logps/chosen": -400.31895228794644, + "logps/rejected": -638.448388671875, + "loss": 0.0068, + "rewards/chosen": 7.968287876674107, + "rewards/margins": 25.165590122767856, + "rewards/rejected": -17.19730224609375, + "step": 3286 + }, + { + "epoch": 0.9009181855557078, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17165484.0, + "logits/rejected": -7780477.5, + "logps/chosen": -423.764892578125, + "logps/rejected": -645.2080078125, + "loss": 0.003, + "rewards/chosen": 8.617183685302734, + "rewards/margins": 23.477739334106445, + "rewards/rejected": -14.860555648803711, + "step": 3287 + }, + { + "epoch": 0.9011922707962177, + "grad_norm": 0.65625, + "kl": 1.9979280233383179, + "learning_rate": 5e-06, + "logits/chosen": -41892352.0, + "logits/rejected": -43194363.733333334, + "logps/chosen": -470.9806857638889, + "logps/rejected": -441.7190755208333, + "loss": 0.0016, + "rewards/chosen": 7.864627414279514, + "rewards/margins": 18.13599582248264, + "rewards/rejected": -10.271368408203125, + "step": 3288 + }, + { + "epoch": 0.9014663560367274, + "grad_norm": 15.0, + "kl": 23.70978355407715, + "learning_rate": 5e-06, + "logits/chosen": -17219866.94736842, + "logits/rejected": -36470704.0, + "logps/chosen": -467.84765625, + "logps/rejected": -448.21650390625, + "loss": 0.1184, + "rewards/chosen": 7.271706028988487, + "rewards/margins": 17.964610692074423, + "rewards/rejected": -10.692904663085937, + "step": 3289 + }, + { + "epoch": 0.9017404412772372, + "grad_norm": 7.65625, + "kl": 11.809857368469238, + "learning_rate": 5e-06, + "logits/chosen": -16688362.0, + "logits/rejected": 1417108.5, + "logps/chosen": -383.31585693359375, + "logps/rejected": -605.9210205078125, + "loss": 0.0697, + "rewards/chosen": 6.945493698120117, + "rewards/margins": 15.371232986450195, + "rewards/rejected": -8.425739288330078, + "step": 3290 + }, + { + "epoch": 0.902014526517747, + "grad_norm": 9.5, + "kl": 1.9428876638412476, + "learning_rate": 5e-06, + "logits/chosen": -19896504.0, + "logits/rejected": -38969388.0, + "logps/chosen": -322.2588195800781, + "logps/rejected": -583.6301879882812, + "loss": 0.0605, + "rewards/chosen": 5.710681438446045, + "rewards/margins": 17.808331966400146, + "rewards/rejected": -12.097650527954102, + "step": 3291 + }, + { + "epoch": 0.9022886117582568, + "grad_norm": 2.09375, + "kl": 1.7962188720703125, + "learning_rate": 5e-06, + "logits/chosen": -4399932.923076923, + "logits/rejected": -49989137.45454545, + "logps/chosen": -558.8298527644231, + "logps/rejected": -584.8534268465909, + "loss": 0.0048, + "rewards/chosen": 8.287368774414062, + "rewards/margins": 25.591967496004973, + "rewards/rejected": -17.30459872159091, + "step": 3292 + }, + { + "epoch": 0.9025626969987666, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24190373.818181816, + "logits/rejected": -2121428.4615384615, + "logps/chosen": -359.8349609375, + "logps/rejected": -500.31043419471155, + "loss": 0.0131, + "rewards/chosen": 7.415772871537642, + "rewards/margins": 20.138087559413243, + "rewards/rejected": -12.7223146878756, + "step": 3293 + }, + { + "epoch": 0.9028367822392764, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31601645.714285713, + "logits/rejected": -3818264.9411764704, + "logps/chosen": -480.02284458705356, + "logps/rejected": -606.3738511029412, + "loss": 0.105, + "rewards/chosen": 6.168605804443359, + "rewards/margins": 19.0066660712747, + "rewards/rejected": -12.838060266831341, + "step": 3294 + }, + { + "epoch": 0.9031108674797862, + "grad_norm": 9.125, + "kl": 1.5473588705062866, + "learning_rate": 5e-06, + "logits/chosen": -34153308.8, + "logits/rejected": -34619963.428571425, + "logps/chosen": -493.9662109375, + "logps/rejected": -492.79439871651783, + "loss": 0.0247, + "rewards/chosen": 7.0137184143066404, + "rewards/margins": 21.138778359549384, + "rewards/rejected": -14.125059945242745, + "step": 3295 + }, + { + "epoch": 0.9033849527202961, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3449764.0, + "logits/rejected": -6625284.0, + "logps/chosen": -345.2143249511719, + "logps/rejected": -496.99749755859375, + "loss": 0.0138, + "rewards/chosen": 6.764804840087891, + "rewards/margins": 19.25751495361328, + "rewards/rejected": -12.49271011352539, + "step": 3296 + }, + { + "epoch": 0.9036590379608058, + "grad_norm": 17.0, + "kl": 6.4465227127075195, + "learning_rate": 5e-06, + "logits/chosen": -10127926.857142856, + "logits/rejected": -31868121.6, + "logps/chosen": -406.71337890625, + "logps/rejected": -432.22109375, + "loss": 0.075, + "rewards/chosen": 6.733001708984375, + "rewards/margins": 20.215536499023436, + "rewards/rejected": -13.482534790039063, + "step": 3297 + }, + { + "epoch": 0.9039331232013156, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19434801.777777776, + "logits/rejected": -10889654.4, + "logps/chosen": -341.4130045572917, + "logps/rejected": -749.2998046875, + "loss": 0.0679, + "rewards/chosen": 6.726075490315755, + "rewards/margins": 22.735958099365234, + "rewards/rejected": -16.00988260904948, + "step": 3298 + }, + { + "epoch": 0.9042072084418254, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25710056.888888888, + "logits/rejected": -31325843.2, + "logps/chosen": -385.25884331597223, + "logps/rejected": -589.96875, + "loss": 0.0423, + "rewards/chosen": 5.708502875434028, + "rewards/margins": 18.298617214626734, + "rewards/rejected": -12.590114339192708, + "step": 3299 + }, + { + "epoch": 0.9044812936823352, + "grad_norm": 10.875, + "kl": 1.775200605392456, + "learning_rate": 5e-06, + "logits/chosen": -24216354.46153846, + "logits/rejected": -54533608.72727273, + "logps/chosen": -402.7688176081731, + "logps/rejected": -464.7224786931818, + "loss": 0.0836, + "rewards/chosen": 5.796341529259315, + "rewards/margins": 19.626916258485167, + "rewards/rejected": -13.830574729225852, + "step": 3300 + }, + { + "epoch": 0.904755378922845, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13424325.818181818, + "logits/rejected": -9618467.076923076, + "logps/chosen": -457.52077414772725, + "logps/rejected": -594.8576096754807, + "loss": 0.0047, + "rewards/chosen": 8.088642467151988, + "rewards/margins": 21.246160120397178, + "rewards/rejected": -13.157517653245192, + "step": 3301 + }, + { + "epoch": 0.9050294641633548, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 20555970.90909091, + "logits/rejected": -22158340.923076924, + "logps/chosen": -378.83695845170456, + "logps/rejected": -463.9070012019231, + "loss": 0.0354, + "rewards/chosen": 6.946484652432528, + "rewards/margins": 19.613466276155485, + "rewards/rejected": -12.666981623722958, + "step": 3302 + }, + { + "epoch": 0.9053035494038646, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27979022.545454547, + "logits/rejected": -14154664.615384616, + "logps/chosen": -328.6751154119318, + "logps/rejected": -554.8251953125, + "loss": 0.0184, + "rewards/chosen": 6.936605973677202, + "rewards/margins": 21.32911249974391, + "rewards/rejected": -14.392506526066708, + "step": 3303 + }, + { + "epoch": 0.9055776346443744, + "grad_norm": 8.375, + "kl": 4.2201056480407715, + "learning_rate": 5e-06, + "logits/chosen": -20936901.818181816, + "logits/rejected": -18860345.846153848, + "logps/chosen": -411.34499289772725, + "logps/rejected": -597.7161207932693, + "loss": 0.0122, + "rewards/chosen": 6.73729775168679, + "rewards/margins": 18.493430611136912, + "rewards/rejected": -11.75613285945012, + "step": 3304 + }, + { + "epoch": 0.9058517198848842, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41422557.09090909, + "logits/rejected": -37803426.461538464, + "logps/chosen": -390.5486505681818, + "logps/rejected": -487.20620492788464, + "loss": 0.0101, + "rewards/chosen": 7.379303672096946, + "rewards/margins": 18.997115608695506, + "rewards/rejected": -11.617811936598558, + "step": 3305 + }, + { + "epoch": 0.906125805125394, + "grad_norm": 2.671875, + "kl": 3.5476317405700684, + "learning_rate": 5e-06, + "logits/chosen": -34732945.06666667, + "logits/rejected": -38179360.0, + "logps/chosen": -389.464453125, + "logps/rejected": -506.6282552083333, + "loss": 0.008, + "rewards/chosen": 6.89185791015625, + "rewards/margins": 22.378406439887154, + "rewards/rejected": -15.486548529730904, + "step": 3306 + }, + { + "epoch": 0.9063998903659038, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3932558.8571428573, + "logits/rejected": -30794208.0, + "logps/chosen": -394.23496791294644, + "logps/rejected": -610.3265739889706, + "loss": 0.0137, + "rewards/chosen": 7.777460370744977, + "rewards/margins": 20.734716912277605, + "rewards/rejected": -12.95725654153263, + "step": 3307 + }, + { + "epoch": 0.9066739756064136, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4495482.666666667, + "logits/rejected": -32790775.466666665, + "logps/chosen": -432.0858561197917, + "logps/rejected": -500.7975260416667, + "loss": 0.0032, + "rewards/chosen": 8.912160237630209, + "rewards/margins": 22.178865559895833, + "rewards/rejected": -13.266705322265626, + "step": 3308 + }, + { + "epoch": 0.9069480608469234, + "grad_norm": 2.546875, + "kl": 0.0378367118537426, + "learning_rate": 5e-06, + "logits/chosen": -28232520.727272727, + "logits/rejected": 25076256.0, + "logps/chosen": -438.41015625, + "logps/rejected": -633.1521935096154, + "loss": 0.011, + "rewards/chosen": 5.796332966197621, + "rewards/margins": 19.1668597934963, + "rewards/rejected": -13.370526827298677, + "step": 3309 + }, + { + "epoch": 0.9072221460874332, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4815945.777777778, + "logits/rejected": 51088093.86666667, + "logps/chosen": -350.8306477864583, + "logps/rejected": -584.8909505208334, + "loss": 0.0382, + "rewards/chosen": 6.03243891398112, + "rewards/margins": 25.651264190673828, + "rewards/rejected": -19.618825276692707, + "step": 3310 + }, + { + "epoch": 0.907496231327943, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18692823.272727273, + "logits/rejected": -22085021.53846154, + "logps/chosen": -410.48197798295456, + "logps/rejected": -676.4742337740385, + "loss": 0.0104, + "rewards/chosen": 6.540905345569957, + "rewards/margins": 21.567096470119235, + "rewards/rejected": -15.026191124549278, + "step": 3311 + }, + { + "epoch": 0.9077703165684528, + "grad_norm": 9.125, + "kl": 5.7908477783203125, + "learning_rate": 5e-06, + "logits/chosen": 1945320.7272727273, + "logits/rejected": -48074515.692307696, + "logps/chosen": -469.8546697443182, + "logps/rejected": -462.57335486778845, + "loss": 0.0238, + "rewards/chosen": 9.134721235795455, + "rewards/margins": 18.53656144575639, + "rewards/rejected": -9.401840209960938, + "step": 3312 + }, + { + "epoch": 0.9080444018089626, + "grad_norm": 2.765625, + "kl": 7.032042503356934, + "learning_rate": 5e-06, + "logits/chosen": -9830882.285714285, + "logits/rejected": -13648312.0, + "logps/chosen": -466.3935546875, + "logps/rejected": -474.48330078125, + "loss": 0.0107, + "rewards/chosen": 7.508291516985212, + "rewards/margins": 23.014451490129744, + "rewards/rejected": -15.50615997314453, + "step": 3313 + }, + { + "epoch": 0.9083184870494724, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11875931.076923076, + "logits/rejected": -36352558.54545455, + "logps/chosen": -540.8062650240385, + "logps/rejected": -554.5087002840909, + "loss": 0.0661, + "rewards/chosen": 7.28453357403095, + "rewards/margins": 22.081801141058648, + "rewards/rejected": -14.7972675670277, + "step": 3314 + }, + { + "epoch": 0.9085925722899821, + "grad_norm": 3.796875, + "kl": 7.250267028808594, + "learning_rate": 5e-06, + "logits/chosen": -10800596.0, + "logits/rejected": 763860.6666666666, + "logps/chosen": -437.3125, + "logps/rejected": -757.8673502604166, + "loss": 0.0448, + "rewards/chosen": 8.914283752441406, + "rewards/margins": 21.32230885823568, + "rewards/rejected": -12.408025105794271, + "step": 3315 + }, + { + "epoch": 0.908866657530492, + "grad_norm": 0.68359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18196176.0, + "logits/rejected": -31139337.14285714, + "logps/chosen": -516.197314453125, + "logps/rejected": -455.26046316964283, + "loss": 0.0028, + "rewards/chosen": 6.789205932617188, + "rewards/margins": 20.5324702671596, + "rewards/rejected": -13.743264334542411, + "step": 3316 + }, + { + "epoch": 0.9091407427710018, + "grad_norm": 11.0, + "kl": 3.4795475006103516, + "learning_rate": 5e-06, + "logits/chosen": -13274249.846153846, + "logits/rejected": -23091538.90909091, + "logps/chosen": -378.0248272235577, + "logps/rejected": -443.63991477272725, + "loss": 0.042, + "rewards/chosen": 7.051564730130709, + "rewards/margins": 16.875454029003222, + "rewards/rejected": -9.823889298872514, + "step": 3317 + }, + { + "epoch": 0.9094148280115116, + "grad_norm": 12.1875, + "kl": 6.969302177429199, + "learning_rate": 5e-06, + "logits/chosen": -31188893.53846154, + "logits/rejected": -33680360.72727273, + "logps/chosen": -520.9982346754807, + "logps/rejected": -593.1823064630681, + "loss": 0.0202, + "rewards/chosen": 9.191443223219652, + "rewards/margins": 20.8227430223585, + "rewards/rejected": -11.63129979913885, + "step": 3318 + }, + { + "epoch": 0.9096889132520214, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1085160.3333333333, + "logits/rejected": -19498680.0, + "logps/chosen": -311.38816324869794, + "logps/rejected": -502.2137044270833, + "loss": 0.0324, + "rewards/chosen": 5.814308802286784, + "rewards/margins": 16.301945368448894, + "rewards/rejected": -10.48763656616211, + "step": 3319 + }, + { + "epoch": 0.9099629984925311, + "grad_norm": 9.5625, + "kl": 1.4459731578826904, + "learning_rate": 5e-06, + "logits/chosen": -5980937.714285715, + "logits/rejected": -2280762.4, + "logps/chosen": -427.83558872767856, + "logps/rejected": -282.2814453125, + "loss": 0.0479, + "rewards/chosen": 6.707512991768973, + "rewards/margins": 13.047360937935967, + "rewards/rejected": -6.3398479461669925, + "step": 3320 + }, + { + "epoch": 0.910237083733041, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1831506.1818181819, + "logits/rejected": 13196342.153846154, + "logps/chosen": -446.53520063920456, + "logps/rejected": -612.6285306490385, + "loss": 0.0331, + "rewards/chosen": 6.385941938920454, + "rewards/margins": 23.39297976193728, + "rewards/rejected": -17.007037823016827, + "step": 3321 + }, + { + "epoch": 0.9105111689735508, + "grad_norm": 6.40625, + "kl": 11.881876945495605, + "learning_rate": 5e-06, + "logits/chosen": -17743579.733333334, + "logits/rejected": -1220169.888888889, + "logps/chosen": -514.1330078125, + "logps/rejected": -496.5764973958333, + "loss": 0.0247, + "rewards/chosen": 8.094116719563802, + "rewards/margins": 18.461661953396266, + "rewards/rejected": -10.367545233832466, + "step": 3322 + }, + { + "epoch": 0.9107852542140605, + "grad_norm": 5.90625, + "kl": 0.20232391357421875, + "learning_rate": 5e-06, + "logits/chosen": 24814053.818181816, + "logits/rejected": -45069582.76923077, + "logps/chosen": -438.43039772727275, + "logps/rejected": -564.9410682091346, + "loss": 0.0219, + "rewards/chosen": 6.607441295276988, + "rewards/margins": 21.51169474141581, + "rewards/rejected": -14.904253446138823, + "step": 3323 + }, + { + "epoch": 0.9110593394545704, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14095203.2, + "logits/rejected": -17238633.14285714, + "logps/chosen": -408.8573974609375, + "logps/rejected": -479.0831821986607, + "loss": 0.01, + "rewards/chosen": 8.587049865722657, + "rewards/margins": 19.091767011369978, + "rewards/rejected": -10.504717145647321, + "step": 3324 + }, + { + "epoch": 0.9113334246950802, + "grad_norm": 3.984375, + "kl": 6.286547660827637, + "learning_rate": 5e-06, + "logits/chosen": -4521013.818181818, + "logits/rejected": -28297031.384615384, + "logps/chosen": -334.03080610795456, + "logps/rejected": -435.3109600360577, + "loss": 0.0128, + "rewards/chosen": 6.454290216619318, + "rewards/margins": 18.66423706908326, + "rewards/rejected": -12.209946852463942, + "step": 3325 + }, + { + "epoch": 0.9116075099355899, + "grad_norm": 5.28125, + "kl": 0.21935781836509705, + "learning_rate": 5e-06, + "logits/chosen": -22919533.714285713, + "logits/rejected": -41415939.2, + "logps/chosen": -388.0287388392857, + "logps/rejected": -601.068505859375, + "loss": 0.0346, + "rewards/chosen": 6.055495125906808, + "rewards/margins": 18.69427533830915, + "rewards/rejected": -12.638780212402343, + "step": 3326 + }, + { + "epoch": 0.9118815951760998, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17457587.2, + "logits/rejected": -21540884.57142857, + "logps/chosen": -351.455029296875, + "logps/rejected": -588.5281808035714, + "loss": 0.003, + "rewards/chosen": 8.06529541015625, + "rewards/margins": 21.472942679268975, + "rewards/rejected": -13.407647269112724, + "step": 3327 + }, + { + "epoch": 0.9121556804166095, + "grad_norm": 10.5625, + "kl": 0.8077990412712097, + "learning_rate": 5e-06, + "logits/chosen": -29333545.6, + "logits/rejected": -32522457.14285714, + "logps/chosen": -368.6446533203125, + "logps/rejected": -545.244384765625, + "loss": 0.0227, + "rewards/chosen": 8.349209594726563, + "rewards/margins": 20.90127694266183, + "rewards/rejected": -12.552067347935267, + "step": 3328 + }, + { + "epoch": 0.9124297656571194, + "grad_norm": 3.046875, + "kl": 10.729201316833496, + "learning_rate": 5e-06, + "logits/chosen": -39929856.0, + "logits/rejected": -27090435.2, + "logps/chosen": -523.8369489397321, + "logps/rejected": -642.53251953125, + "loss": 0.0139, + "rewards/chosen": 9.105447496686663, + "rewards/margins": 24.920718492780413, + "rewards/rejected": -15.81527099609375, + "step": 3329 + }, + { + "epoch": 0.9127038508976292, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32867744.0, + "logits/rejected": -17678092.8, + "logps/chosen": -448.0390896267361, + "logps/rejected": -493.96546223958336, + "loss": 0.005, + "rewards/chosen": 8.367798699273003, + "rewards/margins": 21.106566196017795, + "rewards/rejected": -12.738767496744792, + "step": 3330 + }, + { + "epoch": 0.9129779361381389, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20395031.272727273, + "logits/rejected": -31491766.153846152, + "logps/chosen": -442.13423295454544, + "logps/rejected": -601.3223407451923, + "loss": 0.032, + "rewards/chosen": 6.586248224431818, + "rewards/margins": 19.68912180653819, + "rewards/rejected": -13.10287358210637, + "step": 3331 + }, + { + "epoch": 0.9132520213786488, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16453632.0, + "logits/rejected": -7729715.428571428, + "logps/chosen": -467.888916015625, + "logps/rejected": -484.02103097098217, + "loss": 0.0167, + "rewards/chosen": 8.435335540771485, + "rewards/margins": 18.788296399797712, + "rewards/rejected": -10.352960859026227, + "step": 3332 + }, + { + "epoch": 0.9135261066191586, + "grad_norm": 9.4375, + "kl": 12.308806419372559, + "learning_rate": 5e-06, + "logits/chosen": -14411598.11764706, + "logits/rejected": -15040121.142857144, + "logps/chosen": -406.1475183823529, + "logps/rejected": -414.3553989955357, + "loss": 0.0543, + "rewards/chosen": 7.638027415556066, + "rewards/margins": 15.243520688609916, + "rewards/rejected": -7.605493273053851, + "step": 3333 + }, + { + "epoch": 0.9138001918596683, + "grad_norm": 7.78125, + "kl": 2.6674141883850098, + "learning_rate": 5e-06, + "logits/chosen": -10213242.666666666, + "logits/rejected": -16308610.666666666, + "logps/chosen": -326.64406331380206, + "logps/rejected": -433.8812255859375, + "loss": 0.0824, + "rewards/chosen": 4.804329554239909, + "rewards/margins": 17.169084548950195, + "rewards/rejected": -12.364754994710287, + "step": 3334 + }, + { + "epoch": 0.9140742771001782, + "grad_norm": 2.890625, + "kl": 10.128726959228516, + "learning_rate": 5e-06, + "logits/chosen": -30104874.0, + "logits/rejected": -47511092.0, + "logps/chosen": -431.91748046875, + "logps/rejected": -484.55621337890625, + "loss": 0.0448, + "rewards/chosen": 8.155357360839844, + "rewards/margins": 21.755064964294434, + "rewards/rejected": -13.59970760345459, + "step": 3335 + }, + { + "epoch": 0.914348362340688, + "grad_norm": 15.25, + "kl": 7.72272253036499, + "learning_rate": 5e-06, + "logits/chosen": -12058270.857142856, + "logits/rejected": -7061145.6, + "logps/chosen": -440.4955357142857, + "logps/rejected": -636.43876953125, + "loss": 0.06, + "rewards/chosen": 7.683462960379464, + "rewards/margins": 20.607466561453684, + "rewards/rejected": -12.924003601074219, + "step": 3336 + }, + { + "epoch": 0.9146224475811977, + "grad_norm": 10.5625, + "kl": 12.809988021850586, + "learning_rate": 5e-06, + "logits/chosen": -13071043.0, + "logits/rejected": 15791000.0, + "logps/chosen": -411.1241455078125, + "logps/rejected": -388.14642333984375, + "loss": 0.0713, + "rewards/chosen": 8.219348907470703, + "rewards/margins": 18.48321533203125, + "rewards/rejected": -10.263866424560547, + "step": 3337 + }, + { + "epoch": 0.9148965328217076, + "grad_norm": 3.84375, + "kl": 3.723644256591797, + "learning_rate": 5e-06, + "logits/chosen": -11229681.0, + "logits/rejected": -46033828.0, + "logps/chosen": -320.697265625, + "logps/rejected": -603.2988891601562, + "loss": 0.0402, + "rewards/chosen": 6.5072340965271, + "rewards/margins": 20.72993803024292, + "rewards/rejected": -14.22270393371582, + "step": 3338 + }, + { + "epoch": 0.9151706180622173, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36436725.333333336, + "logits/rejected": -26084914.666666668, + "logps/chosen": -405.906982421875, + "logps/rejected": -503.2132568359375, + "loss": 0.0171, + "rewards/chosen": 5.912756601969401, + "rewards/margins": 17.22365951538086, + "rewards/rejected": -11.310902913411459, + "step": 3339 + }, + { + "epoch": 0.9154447033027272, + "grad_norm": 23.0, + "kl": 7.9164533615112305, + "learning_rate": 5e-06, + "logits/chosen": -32049652.57142857, + "logits/rejected": -15002644.8, + "logps/chosen": -441.8412388392857, + "logps/rejected": -619.527978515625, + "loss": 0.0576, + "rewards/chosen": 8.029076167515345, + "rewards/margins": 18.041194697788782, + "rewards/rejected": -10.012118530273437, + "step": 3340 + }, + { + "epoch": 0.915718788543237, + "grad_norm": 10.4375, + "kl": 10.484478950500488, + "learning_rate": 5e-06, + "logits/chosen": 4128715.2, + "logits/rejected": 12848218.666666666, + "logps/chosen": -450.5734375, + "logps/rejected": -711.0851779513889, + "loss": 0.0396, + "rewards/chosen": 7.244621785481771, + "rewards/margins": 22.652460394965278, + "rewards/rejected": -15.407838609483507, + "step": 3341 + }, + { + "epoch": 0.9159928737837467, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27029120.0, + "logits/rejected": -27617276.0, + "logps/chosen": -345.2381591796875, + "logps/rejected": -615.7932739257812, + "loss": 0.0023, + "rewards/chosen": 6.331422805786133, + "rewards/margins": 20.670973777770996, + "rewards/rejected": -14.339550971984863, + "step": 3342 + }, + { + "epoch": 0.9162669590242566, + "grad_norm": 3.203125, + "kl": 0.9199193716049194, + "learning_rate": 5e-06, + "logits/chosen": -19519228.0, + "logits/rejected": -28133781.333333332, + "logps/chosen": -517.896484375, + "logps/rejected": -548.3609212239584, + "loss": 0.011, + "rewards/chosen": 6.795858383178711, + "rewards/margins": 20.40238126118978, + "rewards/rejected": -13.606522878011068, + "step": 3343 + }, + { + "epoch": 0.9165410442647663, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16578236.444444444, + "logits/rejected": -8345756.8, + "logps/chosen": -343.8055013020833, + "logps/rejected": -568.5280598958333, + "loss": 0.0179, + "rewards/chosen": 6.073999616834852, + "rewards/margins": 19.677099185519747, + "rewards/rejected": -13.603099568684895, + "step": 3344 + }, + { + "epoch": 0.9168151295052761, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -955450.2666666667, + "logits/rejected": -25980302.222222224, + "logps/chosen": -334.21640625, + "logps/rejected": -549.1081271701389, + "loss": 0.0056, + "rewards/chosen": 7.664180501302083, + "rewards/margins": 19.21521521674262, + "rewards/rejected": -11.551034715440538, + "step": 3345 + }, + { + "epoch": 0.917089214745786, + "grad_norm": 6.34375, + "kl": 3.6408298015594482, + "learning_rate": 5e-06, + "logits/chosen": -36823800.0, + "logits/rejected": -27353968.0, + "logps/chosen": -489.185302734375, + "logps/rejected": -471.0218912760417, + "loss": 0.014, + "rewards/chosen": 8.043793360392252, + "rewards/margins": 19.722386042277016, + "rewards/rejected": -11.678592681884766, + "step": 3346 + }, + { + "epoch": 0.9173632999862957, + "grad_norm": 5.0625, + "kl": 5.772083759307861, + "learning_rate": 5e-06, + "logits/chosen": -8294191.111111111, + "logits/rejected": -35144085.333333336, + "logps/chosen": -291.99565972222223, + "logps/rejected": -610.5045572916666, + "loss": 0.0155, + "rewards/chosen": 5.559413062201606, + "rewards/margins": 18.120762549506292, + "rewards/rejected": -12.561349487304687, + "step": 3347 + }, + { + "epoch": 0.9176373852268055, + "grad_norm": 3.40625, + "kl": 5.43698787689209, + "learning_rate": 5e-06, + "logits/chosen": -28829243.42857143, + "logits/rejected": -33776768.0, + "logps/chosen": -418.4281529017857, + "logps/rejected": -606.9734375, + "loss": 0.0075, + "rewards/chosen": 7.974837166922433, + "rewards/margins": 19.612604195731027, + "rewards/rejected": -11.637767028808593, + "step": 3348 + }, + { + "epoch": 0.9179114704673154, + "grad_norm": 0.57421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33789760.0, + "logits/rejected": -29408128.0, + "logps/chosen": -448.42703683035717, + "logps/rejected": -756.9375, + "loss": 0.0019, + "rewards/chosen": 6.513699667794364, + "rewards/margins": 24.7712428950462, + "rewards/rejected": -18.257543227251837, + "step": 3349 + }, + { + "epoch": 0.9181855557078251, + "grad_norm": 8.0, + "kl": 8.651805877685547, + "learning_rate": 5e-06, + "logits/chosen": -13385966.0, + "logits/rejected": -23468596.0, + "logps/chosen": -438.2493591308594, + "logps/rejected": -654.1311645507812, + "loss": 0.0971, + "rewards/chosen": 7.524724006652832, + "rewards/margins": 21.26654815673828, + "rewards/rejected": -13.74182415008545, + "step": 3350 + }, + { + "epoch": 0.9184596409483349, + "grad_norm": 1.28125, + "kl": 4.378342628479004, + "learning_rate": 5e-06, + "logits/chosen": -20906614.153846152, + "logits/rejected": -21334728.727272727, + "logps/chosen": -445.24429086538464, + "logps/rejected": -577.6187855113636, + "loss": 0.0034, + "rewards/chosen": 9.146744948167067, + "rewards/margins": 20.396038882382264, + "rewards/rejected": -11.2492939342152, + "step": 3351 + }, + { + "epoch": 0.9187337261888447, + "grad_norm": 4.5625, + "kl": 5.513841152191162, + "learning_rate": 5e-06, + "logits/chosen": -10949084.307692308, + "logits/rejected": -28297911.272727273, + "logps/chosen": -473.40685096153845, + "logps/rejected": -498.48659446022725, + "loss": 0.0087, + "rewards/chosen": 8.235797588641827, + "rewards/margins": 20.340141349739127, + "rewards/rejected": -12.1043437610973, + "step": 3352 + }, + { + "epoch": 0.9190078114293545, + "grad_norm": 10.0, + "kl": 15.014749526977539, + "learning_rate": 5e-06, + "logits/chosen": -29736750.0, + "logits/rejected": -7058952.5, + "logps/chosen": -475.98114013671875, + "logps/rejected": -511.1832275390625, + "loss": 0.0329, + "rewards/chosen": 7.744750499725342, + "rewards/margins": 18.585761547088623, + "rewards/rejected": -10.841011047363281, + "step": 3353 + }, + { + "epoch": 0.9192818966698644, + "grad_norm": 3.6875, + "kl": 1.1266670227050781, + "learning_rate": 5e-06, + "logits/chosen": -12611994.666666666, + "logits/rejected": -23486538.666666668, + "logps/chosen": -373.43994140625, + "logps/rejected": -370.2648111979167, + "loss": 0.0353, + "rewards/chosen": 7.173103332519531, + "rewards/margins": 17.758376439412437, + "rewards/rejected": -10.585273106892904, + "step": 3354 + }, + { + "epoch": 0.9195559819103741, + "grad_norm": 7.3125, + "kl": 5.344974517822266, + "learning_rate": 5e-06, + "logits/chosen": -23385284.57142857, + "logits/rejected": -31110784.0, + "logps/chosen": -398.56005859375, + "logps/rejected": -591.41904296875, + "loss": 0.0451, + "rewards/chosen": 7.308277675083706, + "rewards/margins": 22.130013820103237, + "rewards/rejected": -14.821736145019532, + "step": 3355 + }, + { + "epoch": 0.9198300671508839, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10707987.2, + "logits/rejected": -36296549.05263158, + "logps/chosen": -488.419384765625, + "logps/rejected": -640.4945518092105, + "loss": 0.0384, + "rewards/chosen": 8.317501068115234, + "rewards/margins": 22.023589485570007, + "rewards/rejected": -13.70608841745477, + "step": 3356 + }, + { + "epoch": 0.9201041523913938, + "grad_norm": 2.390625, + "kl": 0.7407932281494141, + "learning_rate": 5e-06, + "logits/chosen": -7770287.384615385, + "logits/rejected": -30951985.454545453, + "logps/chosen": -505.97220552884613, + "logps/rejected": -604.0204634232955, + "loss": 0.008, + "rewards/chosen": 8.06890399639423, + "rewards/margins": 22.32479250180971, + "rewards/rejected": -14.255888505415482, + "step": 3357 + }, + { + "epoch": 0.9203782376319035, + "grad_norm": 0.9140625, + "kl": 5.944798946380615, + "learning_rate": 5e-06, + "logits/chosen": -15717223.384615384, + "logits/rejected": -9284888.727272727, + "logps/chosen": -461.3288010817308, + "logps/rejected": -627.7811168323864, + "loss": 0.0024, + "rewards/chosen": 9.22305415226863, + "rewards/margins": 20.63383393187623, + "rewards/rejected": -11.4107797796076, + "step": 3358 + }, + { + "epoch": 0.9206523228724133, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34517419.63636363, + "logits/rejected": -13559933.538461538, + "logps/chosen": -445.67569247159093, + "logps/rejected": -477.15283203125, + "loss": 0.0092, + "rewards/chosen": 6.999617143110796, + "rewards/margins": 17.121683934351783, + "rewards/rejected": -10.122066791240986, + "step": 3359 + }, + { + "epoch": 0.9209264081129231, + "grad_norm": 1.734375, + "kl": 6.277764320373535, + "learning_rate": 5e-06, + "logits/chosen": -19074528.0, + "logits/rejected": -34452902.4, + "logps/chosen": -386.4130161830357, + "logps/rejected": -540.15810546875, + "loss": 0.0051, + "rewards/chosen": 8.330276489257812, + "rewards/margins": 21.721803283691408, + "rewards/rejected": -13.391526794433593, + "step": 3360 + }, + { + "epoch": 0.9212004933534329, + "grad_norm": 4.375, + "kl": 1.5443992614746094, + "learning_rate": 5e-06, + "logits/chosen": -8690977.333333334, + "logits/rejected": -20239154.666666668, + "logps/chosen": -384.4140625, + "logps/rejected": -559.2417805989584, + "loss": 0.0111, + "rewards/chosen": 6.130195617675781, + "rewards/margins": 19.726567586263023, + "rewards/rejected": -13.59637196858724, + "step": 3361 + }, + { + "epoch": 0.9214745785939427, + "grad_norm": 8.0625, + "kl": 10.062572479248047, + "learning_rate": 5e-06, + "logits/chosen": -14332957.866666667, + "logits/rejected": -30902627.555555556, + "logps/chosen": -515.60283203125, + "logps/rejected": -833.7744140625, + "loss": 0.029, + "rewards/chosen": 8.325358072916666, + "rewards/margins": 28.76229010687934, + "rewards/rejected": -20.436932033962673, + "step": 3362 + }, + { + "epoch": 0.9217486638344525, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20292208.0, + "logits/rejected": -34671781.64705882, + "logps/chosen": -444.50003487723217, + "logps/rejected": -600.6672794117648, + "loss": 0.0108, + "rewards/chosen": 7.2511187962123325, + "rewards/margins": 24.78714928506803, + "rewards/rejected": -17.536030488855697, + "step": 3363 + }, + { + "epoch": 0.9220227490749623, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7888766.545454546, + "logits/rejected": -35803145.84615385, + "logps/chosen": -450.3074396306818, + "logps/rejected": -524.9089543269231, + "loss": 0.0422, + "rewards/chosen": 6.674138849431818, + "rewards/margins": 18.65461464194985, + "rewards/rejected": -11.980475792518028, + "step": 3364 + }, + { + "epoch": 0.9222968343154722, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28384774.4, + "logits/rejected": -15906018.285714285, + "logps/chosen": -436.725, + "logps/rejected": -567.8333565848214, + "loss": 0.0229, + "rewards/chosen": 6.30821533203125, + "rewards/margins": 20.23296116420201, + "rewards/rejected": -13.924745832170759, + "step": 3365 + }, + { + "epoch": 0.9225709195559819, + "grad_norm": 1.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11418992.8, + "logits/rejected": -3850138.285714286, + "logps/chosen": -411.0880859375, + "logps/rejected": -402.52553013392856, + "loss": 0.0079, + "rewards/chosen": 8.303677368164063, + "rewards/margins": 19.4287602015904, + "rewards/rejected": -11.125082833426339, + "step": 3366 + }, + { + "epoch": 0.9228450047964917, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5556144.615384615, + "logits/rejected": 55080145.45454545, + "logps/chosen": -354.8401442307692, + "logps/rejected": -735.0070578835227, + "loss": 0.0697, + "rewards/chosen": 6.082790081317608, + "rewards/margins": 38.23622968980483, + "rewards/rejected": -32.15343960848722, + "step": 3367 + }, + { + "epoch": 0.9231190900370015, + "grad_norm": 1.65625, + "kl": 2.299424648284912, + "learning_rate": 5e-06, + "logits/chosen": -23961895.384615384, + "logits/rejected": -24282909.09090909, + "logps/chosen": -462.81092247596155, + "logps/rejected": -499.3963068181818, + "loss": 0.0031, + "rewards/chosen": 8.403408930851864, + "rewards/margins": 19.80479276430357, + "rewards/rejected": -11.401383833451705, + "step": 3368 + }, + { + "epoch": 0.9233931752775113, + "grad_norm": 0.275390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4879679.466666667, + "logits/rejected": -39442965.333333336, + "logps/chosen": -426.60436197916664, + "logps/rejected": -661.9214409722222, + "loss": 0.001, + "rewards/chosen": 9.388763427734375, + "rewards/margins": 25.323189629448784, + "rewards/rejected": -15.934426201714409, + "step": 3369 + }, + { + "epoch": 0.9236672605180211, + "grad_norm": 13.625, + "kl": 12.514715194702148, + "learning_rate": 5e-06, + "logits/chosen": -46109138.28571428, + "logits/rejected": -61838937.6, + "logps/chosen": -354.2506626674107, + "logps/rejected": -502.616552734375, + "loss": 0.0551, + "rewards/chosen": 9.021724700927734, + "rewards/margins": 21.094026947021483, + "rewards/rejected": -12.07230224609375, + "step": 3370 + }, + { + "epoch": 0.9239413457585309, + "grad_norm": 9.5625, + "kl": 9.11406135559082, + "learning_rate": 5e-06, + "logits/chosen": -14676361.142857144, + "logits/rejected": -26907334.4, + "logps/chosen": -315.12203543526783, + "logps/rejected": -604.768359375, + "loss": 0.0696, + "rewards/chosen": 5.230806623186384, + "rewards/margins": 20.080393110002788, + "rewards/rejected": -14.849586486816406, + "step": 3371 + }, + { + "epoch": 0.9242154309990407, + "grad_norm": 10.25, + "kl": 12.492095947265625, + "learning_rate": 5e-06, + "logits/chosen": -19511568.0, + "logits/rejected": -33651626.666666664, + "logps/chosen": -405.1120198567708, + "logps/rejected": -623.6302897135416, + "loss": 0.0275, + "rewards/chosen": 8.808015823364258, + "rewards/margins": 23.734952926635742, + "rewards/rejected": -14.926937103271484, + "step": 3372 + }, + { + "epoch": 0.9244895162395504, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41232491.63636363, + "logits/rejected": -32417590.153846152, + "logps/chosen": -495.13876065340907, + "logps/rejected": -570.6780724158654, + "loss": 0.0062, + "rewards/chosen": 8.317302357066762, + "rewards/margins": 21.268419065675538, + "rewards/rejected": -12.951116708608774, + "step": 3373 + }, + { + "epoch": 0.9247636014800603, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22849996.0, + "logits/rejected": -19651524.0, + "logps/chosen": -388.0376281738281, + "logps/rejected": -536.0582275390625, + "loss": 0.0047, + "rewards/chosen": 7.743842601776123, + "rewards/margins": 21.751519680023193, + "rewards/rejected": -14.00767707824707, + "step": 3374 + }, + { + "epoch": 0.9250376867205701, + "grad_norm": 0.68359375, + "kl": 4.536065101623535, + "learning_rate": 5e-06, + "logits/chosen": -41152828.8, + "logits/rejected": -33572557.71428572, + "logps/chosen": -566.35458984375, + "logps/rejected": -652.0510602678571, + "loss": 0.0013, + "rewards/chosen": 10.64853744506836, + "rewards/margins": 26.42458964756557, + "rewards/rejected": -15.77605220249721, + "step": 3375 + }, + { + "epoch": 0.9253117719610799, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39555157.333333336, + "logits/rejected": -29198007.466666665, + "logps/chosen": -592.5280490451389, + "logps/rejected": -579.7295572916667, + "loss": 0.0477, + "rewards/chosen": 9.079884847005209, + "rewards/margins": 22.83074951171875, + "rewards/rejected": -13.750864664713541, + "step": 3376 + }, + { + "epoch": 0.9255858572015897, + "grad_norm": 4.8125, + "kl": 4.550167083740234, + "learning_rate": 5e-06, + "logits/chosen": -24229558.153846152, + "logits/rejected": -39547191.27272727, + "logps/chosen": -497.39644681490387, + "logps/rejected": -561.3672318892045, + "loss": 0.0094, + "rewards/chosen": 8.360382080078125, + "rewards/margins": 22.411386663263492, + "rewards/rejected": -14.05100458318537, + "step": 3377 + }, + { + "epoch": 0.9258599424420995, + "grad_norm": 7.4375, + "kl": 3.998944044113159, + "learning_rate": 5e-06, + "logits/chosen": -17644409.333333332, + "logits/rejected": -46704282.666666664, + "logps/chosen": -374.7574462890625, + "logps/rejected": -584.3626708984375, + "loss": 0.0255, + "rewards/chosen": 6.4333241780598955, + "rewards/margins": 19.696797688802082, + "rewards/rejected": -13.263473510742188, + "step": 3378 + }, + { + "epoch": 0.9261340276826093, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55220078.222222224, + "logits/rejected": -10087877.333333334, + "logps/chosen": -409.44493272569446, + "logps/rejected": -773.777734375, + "loss": 0.0087, + "rewards/chosen": 7.137084113226996, + "rewards/margins": 27.018334113227, + "rewards/rejected": -19.88125, + "step": 3379 + }, + { + "epoch": 0.9264081129231191, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20480470.153846152, + "logits/rejected": -12551063.272727273, + "logps/chosen": -432.39652193509613, + "logps/rejected": -517.19140625, + "loss": 0.0246, + "rewards/chosen": 7.009070763221154, + "rewards/margins": 19.219357683942036, + "rewards/rejected": -12.21028692072088, + "step": 3380 + }, + { + "epoch": 0.9266821981636288, + "grad_norm": 22.25, + "kl": 6.884671688079834, + "learning_rate": 5e-06, + "logits/chosen": -16373760.0, + "logits/rejected": -27549061.333333332, + "logps/chosen": -447.00927734375, + "logps/rejected": -536.000732421875, + "loss": 0.0693, + "rewards/chosen": 7.90220324198405, + "rewards/margins": 21.59245491027832, + "rewards/rejected": -13.690251668294271, + "step": 3381 + }, + { + "epoch": 0.9269562834041387, + "grad_norm": 6.9375, + "kl": 1.2552413940429688, + "learning_rate": 5e-06, + "logits/chosen": -26291272.0, + "logits/rejected": -24422390.85714286, + "logps/chosen": -430.88505859375, + "logps/rejected": -582.3169294084821, + "loss": 0.0145, + "rewards/chosen": 6.675546264648437, + "rewards/margins": 21.33895045689174, + "rewards/rejected": -14.663404192243304, + "step": 3382 + }, + { + "epoch": 0.9272303686446485, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22154002.285714287, + "logits/rejected": -14545045.647058824, + "logps/chosen": -554.223388671875, + "logps/rejected": -653.0343520220588, + "loss": 0.0172, + "rewards/chosen": 7.613816397530692, + "rewards/margins": 21.106267784823892, + "rewards/rejected": -13.4924513872932, + "step": 3383 + }, + { + "epoch": 0.9275044538851582, + "grad_norm": 0.7734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12331727.2, + "logits/rejected": -29784681.14285714, + "logps/chosen": -535.16083984375, + "logps/rejected": -534.1060267857143, + "loss": 0.0016, + "rewards/chosen": 9.143683624267577, + "rewards/margins": 24.53513368879046, + "rewards/rejected": -15.39145006452288, + "step": 3384 + }, + { + "epoch": 0.9277785391256681, + "grad_norm": 3.140625, + "kl": 4.454029083251953, + "learning_rate": 5e-06, + "logits/chosen": -44855992.0, + "logits/rejected": -29226278.0, + "logps/chosen": -375.2306213378906, + "logps/rejected": -503.8863525390625, + "loss": 0.0406, + "rewards/chosen": 6.460363388061523, + "rewards/margins": 19.38943862915039, + "rewards/rejected": -12.929075241088867, + "step": 3385 + }, + { + "epoch": 0.9280526243661779, + "grad_norm": 3.59375, + "kl": 3.7664923667907715, + "learning_rate": 5e-06, + "logits/chosen": -33465832.727272727, + "logits/rejected": -29878951.384615384, + "logps/chosen": -327.8447265625, + "logps/rejected": -589.3822866586538, + "loss": 0.0277, + "rewards/chosen": 6.072291981090199, + "rewards/margins": 23.080738307712796, + "rewards/rejected": -17.008446326622597, + "step": 3386 + }, + { + "epoch": 0.9283267096066877, + "grad_norm": 0.84765625, + "kl": 2.8233847618103027, + "learning_rate": 5e-06, + "logits/chosen": -19742025.333333332, + "logits/rejected": -41035061.333333336, + "logps/chosen": -426.9639892578125, + "logps/rejected": -550.7426350911459, + "loss": 0.0016, + "rewards/chosen": 10.505862553914389, + "rewards/margins": 25.228439966837566, + "rewards/rejected": -14.722577412923178, + "step": 3387 + }, + { + "epoch": 0.9286007948471975, + "grad_norm": 7.96875, + "kl": 5.106563568115234, + "learning_rate": 5e-06, + "logits/chosen": -25551273.846153848, + "logits/rejected": -29957536.0, + "logps/chosen": -430.0541240985577, + "logps/rejected": -568.3672762784091, + "loss": 0.0347, + "rewards/chosen": 8.806387094350962, + "rewards/margins": 21.536214975210335, + "rewards/rejected": -12.729827880859375, + "step": 3388 + }, + { + "epoch": 0.9288748800877072, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25863656.727272727, + "logits/rejected": -33376910.769230768, + "logps/chosen": -382.80082563920456, + "logps/rejected": -652.0518329326923, + "loss": 0.013, + "rewards/chosen": 6.578089627352628, + "rewards/margins": 24.305753107671137, + "rewards/rejected": -17.72766348031851, + "step": 3389 + }, + { + "epoch": 0.9291489653282171, + "grad_norm": 8.6875, + "kl": 12.896116256713867, + "learning_rate": 5e-06, + "logits/chosen": -33494820.0, + "logits/rejected": -31246086.0, + "logps/chosen": -382.0301513671875, + "logps/rejected": -591.2089233398438, + "loss": 0.0431, + "rewards/chosen": 7.164003372192383, + "rewards/margins": 19.848447799682617, + "rewards/rejected": -12.684444427490234, + "step": 3390 + }, + { + "epoch": 0.9294230505687269, + "grad_norm": 2.90625, + "kl": 6.915323734283447, + "learning_rate": 5e-06, + "logits/chosen": -26546039.272727273, + "logits/rejected": -21579542.153846152, + "logps/chosen": -409.69442471590907, + "logps/rejected": -616.9637545072115, + "loss": 0.0467, + "rewards/chosen": 7.125471635298296, + "rewards/margins": 20.218381228146853, + "rewards/rejected": -13.092909592848558, + "step": 3391 + }, + { + "epoch": 0.9296971358092366, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23879357.53846154, + "logits/rejected": -31503778.90909091, + "logps/chosen": -338.31287560096155, + "logps/rejected": -614.3560014204545, + "loss": 0.0209, + "rewards/chosen": 7.1596832275390625, + "rewards/margins": 23.95425831187855, + "rewards/rejected": -16.79457508433949, + "step": 3392 + }, + { + "epoch": 0.9299712210497465, + "grad_norm": 12.9375, + "kl": 6.368080139160156, + "learning_rate": 5e-06, + "logits/chosen": -27618414.769230768, + "logits/rejected": -19056430.545454547, + "logps/chosen": -351.9773137019231, + "logps/rejected": -471.79434481534093, + "loss": 0.1006, + "rewards/chosen": 6.149282602163462, + "rewards/margins": 19.14517937506829, + "rewards/rejected": -12.99589677290483, + "step": 3393 + }, + { + "epoch": 0.9302453062902563, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25137235.2, + "logits/rejected": -58476178.28571428, + "logps/chosen": -494.1876953125, + "logps/rejected": -641.2269810267857, + "loss": 0.0111, + "rewards/chosen": 8.253870391845703, + "rewards/margins": 23.80929685320173, + "rewards/rejected": -15.555426461356026, + "step": 3394 + }, + { + "epoch": 0.930519391530766, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32201332.0, + "logits/rejected": -8029650.5, + "logps/chosen": -445.8554992675781, + "logps/rejected": -347.0872802734375, + "loss": 0.013, + "rewards/chosen": 7.633358955383301, + "rewards/margins": 18.426295280456543, + "rewards/rejected": -10.792936325073242, + "step": 3395 + }, + { + "epoch": 0.9307934767712759, + "grad_norm": 6.9375, + "kl": 7.748190879821777, + "learning_rate": 5e-06, + "logits/chosen": -30043665.454545453, + "logits/rejected": -16778427.076923076, + "logps/chosen": -396.1749378551136, + "logps/rejected": -613.2966496394231, + "loss": 0.0777, + "rewards/chosen": 6.388296647505327, + "rewards/margins": 17.970391280167586, + "rewards/rejected": -11.58209463266226, + "step": 3396 + }, + { + "epoch": 0.9310675620117856, + "grad_norm": 7.0, + "kl": 0.7843354940414429, + "learning_rate": 5e-06, + "logits/chosen": -28296568.615384616, + "logits/rejected": -11164696.727272727, + "logps/chosen": -382.6115910456731, + "logps/rejected": -642.9716352982955, + "loss": 0.0865, + "rewards/chosen": 7.251071636493389, + "rewards/margins": 19.118281331095663, + "rewards/rejected": -11.867209694602273, + "step": 3397 + }, + { + "epoch": 0.9313416472522955, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22038394.181818184, + "logits/rejected": -6843496.615384615, + "logps/chosen": -401.4529474431818, + "logps/rejected": -555.4577448918269, + "loss": 0.0074, + "rewards/chosen": 8.32950800115412, + "rewards/margins": 21.07126121254234, + "rewards/rejected": -12.741753211388222, + "step": 3398 + }, + { + "epoch": 0.9316157324928053, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35207674.666666664, + "logits/rejected": -17608216.0, + "logps/chosen": -306.26991780598956, + "logps/rejected": -626.5785725911459, + "loss": 0.0436, + "rewards/chosen": 5.894677480061849, + "rewards/margins": 19.916205088297527, + "rewards/rejected": -14.021527608235678, + "step": 3399 + }, + { + "epoch": 0.931889817733315, + "grad_norm": 4.15625, + "kl": 1.9527359008789062, + "learning_rate": 5e-06, + "logits/chosen": -51903360.0, + "logits/rejected": -29211242.666666668, + "logps/chosen": -376.51953125, + "logps/rejected": -580.7463785807291, + "loss": 0.0105, + "rewards/chosen": 7.436511357625325, + "rewards/margins": 17.612452189127605, + "rewards/rejected": -10.17594083150228, + "step": 3400 + }, + { + "epoch": 0.9321639029738249, + "grad_norm": 7.8125, + "kl": 1.7211673259735107, + "learning_rate": 5e-06, + "logits/chosen": -9733131.076923076, + "logits/rejected": -27034466.90909091, + "logps/chosen": -421.95849609375, + "logps/rejected": -441.73237748579544, + "loss": 0.0206, + "rewards/chosen": 7.815953181340144, + "rewards/margins": 19.368303312288297, + "rewards/rejected": -11.552350130948154, + "step": 3401 + }, + { + "epoch": 0.9324379882143347, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12447077.333333334, + "logits/rejected": -26344445.866666667, + "logps/chosen": -404.4858127170139, + "logps/rejected": -663.2051432291667, + "loss": 0.0101, + "rewards/chosen": 7.2527211507161455, + "rewards/margins": 22.2493896484375, + "rewards/rejected": -14.996668497721354, + "step": 3402 + }, + { + "epoch": 0.9327120734548444, + "grad_norm": 12.9375, + "kl": 2.0380046367645264, + "learning_rate": 5e-06, + "logits/chosen": -6556893.333333333, + "logits/rejected": -24484264.0, + "logps/chosen": -335.3662923177083, + "logps/rejected": -496.7454427083333, + "loss": 0.0302, + "rewards/chosen": 6.803058624267578, + "rewards/margins": 18.088197072347008, + "rewards/rejected": -11.285138448079428, + "step": 3403 + }, + { + "epoch": 0.9329861586953543, + "grad_norm": 3.953125, + "kl": 0.6109498739242554, + "learning_rate": 5e-06, + "logits/chosen": -30195917.333333332, + "logits/rejected": -16371033.333333334, + "logps/chosen": -462.5885823567708, + "logps/rejected": -618.2353515625, + "loss": 0.0184, + "rewards/chosen": 6.177724202473958, + "rewards/margins": 20.800453186035156, + "rewards/rejected": -14.622728983561197, + "step": 3404 + }, + { + "epoch": 0.933260243935864, + "grad_norm": 7.21875, + "kl": 5.7952880859375, + "learning_rate": 5e-06, + "logits/chosen": -26290666.666666668, + "logits/rejected": 83041226.66666667, + "logps/chosen": -448.1223551432292, + "logps/rejected": -513.0697021484375, + "loss": 0.025, + "rewards/chosen": 7.332375844319661, + "rewards/margins": 16.97630246480306, + "rewards/rejected": -9.643926620483398, + "step": 3405 + }, + { + "epoch": 0.9335343291763738, + "grad_norm": 1.5703125, + "kl": 6.417951583862305, + "learning_rate": 5e-06, + "logits/chosen": -23586386.285714287, + "logits/rejected": -32818598.4, + "logps/chosen": -414.41524832589283, + "logps/rejected": -432.6564453125, + "loss": 0.0043, + "rewards/chosen": 8.478536878313337, + "rewards/margins": 18.392880140032087, + "rewards/rejected": -9.91434326171875, + "step": 3406 + }, + { + "epoch": 0.9338084144168837, + "grad_norm": 0.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25187817.846153848, + "logits/rejected": -10567297.454545455, + "logps/chosen": -388.67074819711536, + "logps/rejected": -446.62801846590907, + "loss": 0.003, + "rewards/chosen": 8.978814932016226, + "rewards/margins": 19.28485214126694, + "rewards/rejected": -10.30603720925071, + "step": 3407 + }, + { + "epoch": 0.9340824996573934, + "grad_norm": 3.859375, + "kl": 2.0786032676696777, + "learning_rate": 5e-06, + "logits/chosen": -30406893.714285713, + "logits/rejected": -26210744.0, + "logps/chosen": -355.72701590401783, + "logps/rejected": -390.92255859375, + "loss": 0.0496, + "rewards/chosen": 7.181155613490513, + "rewards/margins": 15.699711826869418, + "rewards/rejected": -8.518556213378906, + "step": 3408 + }, + { + "epoch": 0.9343565848979033, + "grad_norm": 31.75, + "kl": 0.7564811706542969, + "learning_rate": 5e-06, + "logits/chosen": -21325510.85714286, + "logits/rejected": -14687382.4, + "logps/chosen": -452.6568080357143, + "logps/rejected": -566.33095703125, + "loss": 0.0477, + "rewards/chosen": 7.537989480154855, + "rewards/margins": 18.167980630057198, + "rewards/rejected": -10.629991149902343, + "step": 3409 + }, + { + "epoch": 0.9346306701384131, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11131103.111111112, + "logits/rejected": -28209642.666666668, + "logps/chosen": -352.3041178385417, + "logps/rejected": -427.713671875, + "loss": 0.0409, + "rewards/chosen": 7.162240770128038, + "rewards/margins": 17.411534796820746, + "rewards/rejected": -10.249294026692708, + "step": 3410 + }, + { + "epoch": 0.9349047553789228, + "grad_norm": 2.96875, + "kl": 10.877851486206055, + "learning_rate": 5e-06, + "logits/chosen": -21545878.666666668, + "logits/rejected": -50456058.666666664, + "logps/chosen": -393.5066731770833, + "logps/rejected": -522.91845703125, + "loss": 0.0159, + "rewards/chosen": 8.00325075785319, + "rewards/margins": 19.656574885050453, + "rewards/rejected": -11.653324127197266, + "step": 3411 + }, + { + "epoch": 0.9351788406194327, + "grad_norm": 15.375, + "kl": 9.074888229370117, + "learning_rate": 5e-06, + "logits/chosen": -41998678.85714286, + "logits/rejected": -19706931.2, + "logps/chosen": -388.68593052455356, + "logps/rejected": -549.3001953125, + "loss": 0.0724, + "rewards/chosen": 6.77184077671596, + "rewards/margins": 22.166285051618303, + "rewards/rejected": -15.394444274902344, + "step": 3412 + }, + { + "epoch": 0.9354529258599424, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38255788.0, + "logits/rejected": -34627769.6, + "logps/chosen": -391.75726318359375, + "logps/rejected": -602.18896484375, + "loss": 0.0038, + "rewards/chosen": 10.006354331970215, + "rewards/margins": 22.58885250091553, + "rewards/rejected": -12.582498168945312, + "step": 3413 + }, + { + "epoch": 0.9357270111004522, + "grad_norm": 0.283203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37992323.55555555, + "logits/rejected": -35442545.06666667, + "logps/chosen": -547.7611219618055, + "logps/rejected": -673.9326822916667, + "loss": 0.0007, + "rewards/chosen": 8.502001444498697, + "rewards/margins": 25.411095682779944, + "rewards/rejected": -16.90909423828125, + "step": 3414 + }, + { + "epoch": 0.9360010963409621, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24727570.0, + "logits/rejected": -22491330.0, + "logps/chosen": -332.92791748046875, + "logps/rejected": -741.558349609375, + "loss": 0.0083, + "rewards/chosen": 6.6514105796813965, + "rewards/margins": 21.515642642974854, + "rewards/rejected": -14.864232063293457, + "step": 3415 + }, + { + "epoch": 0.9362751815814718, + "grad_norm": 3.15625, + "kl": 1.2437922954559326, + "learning_rate": 5e-06, + "logits/chosen": -33059601.454545453, + "logits/rejected": 11000503.384615384, + "logps/chosen": -508.92489346590907, + "logps/rejected": -694.7184495192307, + "loss": 0.0092, + "rewards/chosen": 7.779735218394887, + "rewards/margins": 24.919624275260873, + "rewards/rejected": -17.139889056865986, + "step": 3416 + }, + { + "epoch": 0.9365492668219816, + "grad_norm": 4.1875, + "kl": 6.605035305023193, + "learning_rate": 5e-06, + "logits/chosen": -22875991.466666665, + "logits/rejected": -39466112.0, + "logps/chosen": -389.67962239583335, + "logps/rejected": -1010.5775824652778, + "loss": 0.0123, + "rewards/chosen": 8.483728535970052, + "rewards/margins": 32.29070315890842, + "rewards/rejected": -23.80697462293837, + "step": 3417 + }, + { + "epoch": 0.9368233520624915, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16403845.333333334, + "logits/rejected": -45628970.666666664, + "logps/chosen": -404.5725911458333, + "logps/rejected": -511.01458333333335, + "loss": 0.047, + "rewards/chosen": 5.974240620930989, + "rewards/margins": 17.78717803955078, + "rewards/rejected": -11.812937418619791, + "step": 3418 + }, + { + "epoch": 0.9370974373030012, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39007543.27272727, + "logits/rejected": -45141612.307692304, + "logps/chosen": -420.3955078125, + "logps/rejected": -714.2254356971154, + "loss": 0.0158, + "rewards/chosen": 7.382972717285156, + "rewards/margins": 25.94177070030799, + "rewards/rejected": -18.558797983022835, + "step": 3419 + }, + { + "epoch": 0.9373715225435111, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8382080.0, + "logits/rejected": -34803259.428571425, + "logps/chosen": -449.36396484375, + "logps/rejected": -758.9401506696429, + "loss": 0.039, + "rewards/chosen": 7.739715576171875, + "rewards/margins": 27.233880179268972, + "rewards/rejected": -19.494164603097097, + "step": 3420 + }, + { + "epoch": 0.9376456077840208, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20670208.0, + "logits/rejected": -14764347.076923076, + "logps/chosen": -420.86692116477275, + "logps/rejected": -782.1479867788462, + "loss": 0.0157, + "rewards/chosen": 6.49432373046875, + "rewards/margins": 23.457321166992188, + "rewards/rejected": -16.962997436523438, + "step": 3421 + }, + { + "epoch": 0.9379196930245306, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25063323.076923076, + "logits/rejected": 4549752.363636363, + "logps/chosen": -372.09314903846155, + "logps/rejected": -677.8528497869319, + "loss": 0.0031, + "rewards/chosen": 8.50589810884916, + "rewards/margins": 24.313085275930128, + "rewards/rejected": -15.807187167080967, + "step": 3422 + }, + { + "epoch": 0.9381937782650405, + "grad_norm": 1.109375, + "kl": 0.92041015625, + "learning_rate": 5e-06, + "logits/chosen": -25153361.777777776, + "logits/rejected": -16838790.4, + "logps/chosen": -383.43096245659723, + "logps/rejected": -488.1752604166667, + "loss": 0.003, + "rewards/chosen": 7.861071268717448, + "rewards/margins": 20.252455139160155, + "rewards/rejected": -12.391383870442708, + "step": 3423 + }, + { + "epoch": 0.9384678635055502, + "grad_norm": 8.375, + "kl": 0.36981043219566345, + "learning_rate": 5e-06, + "logits/chosen": -9536131.42857143, + "logits/rejected": -37938771.2, + "logps/chosen": -367.900390625, + "logps/rejected": -734.348291015625, + "loss": 0.0272, + "rewards/chosen": 6.119538443429129, + "rewards/margins": 18.83657488141741, + "rewards/rejected": -12.71703643798828, + "step": 3424 + }, + { + "epoch": 0.93874194874606, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40636213.333333336, + "logits/rejected": -34734498.13333333, + "logps/chosen": -382.9345703125, + "logps/rejected": -471.59583333333336, + "loss": 0.0082, + "rewards/chosen": 8.556411743164062, + "rewards/margins": 21.766018676757813, + "rewards/rejected": -13.20960693359375, + "step": 3425 + }, + { + "epoch": 0.9390160339865699, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21199052.8, + "logits/rejected": -28963734.85714286, + "logps/chosen": -426.973681640625, + "logps/rejected": -532.5344587053571, + "loss": 0.0107, + "rewards/chosen": 6.214456176757812, + "rewards/margins": 18.56299525669643, + "rewards/rejected": -12.348539079938616, + "step": 3426 + }, + { + "epoch": 0.9392901192270796, + "grad_norm": 2.40625, + "kl": 4.914234161376953, + "learning_rate": 5e-06, + "logits/chosen": -43976196.266666666, + "logits/rejected": -27545690.666666668, + "logps/chosen": -506.83483072916664, + "logps/rejected": -574.876953125, + "loss": 0.0072, + "rewards/chosen": 8.91383768717448, + "rewards/margins": 23.247340562608507, + "rewards/rejected": -14.333502875434029, + "step": 3427 + }, + { + "epoch": 0.9395642044675894, + "grad_norm": 0.98828125, + "kl": 1.8802541494369507, + "learning_rate": 5e-06, + "logits/chosen": -16024062.666666666, + "logits/rejected": -25475082.666666668, + "logps/chosen": -448.5851236979167, + "logps/rejected": -599.4901529947916, + "loss": 0.0034, + "rewards/chosen": 8.490619659423828, + "rewards/margins": 22.654665629069008, + "rewards/rejected": -14.164045969645182, + "step": 3428 + }, + { + "epoch": 0.9398382897080992, + "grad_norm": 7.6875, + "kl": 14.381593704223633, + "learning_rate": 5e-06, + "logits/chosen": -18672338.666666668, + "logits/rejected": -26306680.0, + "logps/chosen": -519.50341796875, + "logps/rejected": -627.8221028645834, + "loss": 0.0238, + "rewards/chosen": 8.85653305053711, + "rewards/margins": 26.795093536376953, + "rewards/rejected": -17.938560485839844, + "step": 3429 + }, + { + "epoch": 0.940112374948609, + "grad_norm": 11.1875, + "kl": 8.933425903320312, + "learning_rate": 5e-06, + "logits/chosen": -17360958.85714286, + "logits/rejected": -16556995.2, + "logps/chosen": -401.58206612723217, + "logps/rejected": -558.95693359375, + "loss": 0.0251, + "rewards/chosen": 7.475626264299665, + "rewards/margins": 20.959378705705916, + "rewards/rejected": -13.48375244140625, + "step": 3430 + }, + { + "epoch": 0.9403864601891189, + "grad_norm": 7.34375, + "kl": 4.501676082611084, + "learning_rate": 5e-06, + "logits/chosen": -8432894.76923077, + "logits/rejected": -457272.7272727273, + "logps/chosen": -414.9242412860577, + "logps/rejected": -405.3203125, + "loss": 0.0366, + "rewards/chosen": 7.280244680551382, + "rewards/margins": 17.40515590214229, + "rewards/rejected": -10.124911221590908, + "step": 3431 + }, + { + "epoch": 0.9406605454296286, + "grad_norm": 18.125, + "kl": 2.327228307723999, + "learning_rate": 5e-06, + "logits/chosen": -3380752.5714285714, + "logits/rejected": -36437504.0, + "logps/chosen": -357.13706752232144, + "logps/rejected": -774.196142578125, + "loss": 0.0571, + "rewards/chosen": 6.471590314592634, + "rewards/margins": 23.51155003138951, + "rewards/rejected": -17.039959716796876, + "step": 3432 + }, + { + "epoch": 0.9409346306701384, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29438094.222222224, + "logits/rejected": -18993152.0, + "logps/chosen": -373.2575954861111, + "logps/rejected": -490.10559895833336, + "loss": 0.0209, + "rewards/chosen": 6.2977417839898004, + "rewards/margins": 18.199458906385633, + "rewards/rejected": -11.901717122395834, + "step": 3433 + }, + { + "epoch": 0.9412087159106483, + "grad_norm": 10.4375, + "kl": 11.453731536865234, + "learning_rate": 5e-06, + "logits/chosen": -6946972.266666667, + "logits/rejected": -30796199.111111112, + "logps/chosen": -397.1162109375, + "logps/rejected": -634.8573133680555, + "loss": 0.0413, + "rewards/chosen": 8.064152018229167, + "rewards/margins": 21.160228135850694, + "rewards/rejected": -13.096076117621529, + "step": 3434 + }, + { + "epoch": 0.941482801151158, + "grad_norm": 9.3125, + "kl": 8.138809204101562, + "learning_rate": 5e-06, + "logits/chosen": -15229150.76923077, + "logits/rejected": -31982362.181818184, + "logps/chosen": -356.16466346153845, + "logps/rejected": -415.64901455965907, + "loss": 0.0178, + "rewards/chosen": 8.952622633713942, + "rewards/margins": 22.914067835240928, + "rewards/rejected": -13.961445201526988, + "step": 3435 + }, + { + "epoch": 0.9417568863916678, + "grad_norm": 1.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24617596.8, + "logits/rejected": -63909577.14285714, + "logps/chosen": -381.643359375, + "logps/rejected": -424.15011160714283, + "loss": 0.0045, + "rewards/chosen": 7.022040557861328, + "rewards/margins": 17.698555319649834, + "rewards/rejected": -10.676514761788505, + "step": 3436 + }, + { + "epoch": 0.9420309716321776, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46875699.2, + "logits/rejected": -39640444.631578945, + "logps/chosen": -547.476171875, + "logps/rejected": -564.4800061677631, + "loss": 0.0084, + "rewards/chosen": 9.35692901611328, + "rewards/margins": 22.90625827186986, + "rewards/rejected": -13.549329255756579, + "step": 3437 + }, + { + "epoch": 0.9423050568726874, + "grad_norm": 11.0625, + "kl": 18.91948699951172, + "learning_rate": 5e-06, + "logits/chosen": -38662212.0, + "logits/rejected": -25670372.0, + "logps/chosen": -375.06134033203125, + "logps/rejected": -448.0779724121094, + "loss": 0.1268, + "rewards/chosen": 7.563299179077148, + "rewards/margins": 16.58810043334961, + "rewards/rejected": -9.024801254272461, + "step": 3438 + }, + { + "epoch": 0.9425791421131972, + "grad_norm": 5.59375, + "kl": 0.6133651733398438, + "learning_rate": 5e-06, + "logits/chosen": -46449157.333333336, + "logits/rejected": -15783918.666666666, + "logps/chosen": -388.2109375, + "logps/rejected": -491.2742513020833, + "loss": 0.0364, + "rewards/chosen": 6.878045399983724, + "rewards/margins": 19.372852325439453, + "rewards/rejected": -12.494806925455729, + "step": 3439 + }, + { + "epoch": 0.942853227353707, + "grad_norm": 4.15625, + "kl": 7.840646266937256, + "learning_rate": 5e-06, + "logits/chosen": -27514848.0, + "logits/rejected": -23225673.6, + "logps/chosen": -439.37935965401783, + "logps/rejected": -538.62744140625, + "loss": 0.0077, + "rewards/chosen": 8.824000222342354, + "rewards/margins": 22.314059121268137, + "rewards/rejected": -13.490058898925781, + "step": 3440 + }, + { + "epoch": 0.9431273125942168, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33358611.692307692, + "logits/rejected": 7304728.7272727275, + "logps/chosen": -431.8722956730769, + "logps/rejected": -626.3705166903409, + "loss": 0.0074, + "rewards/chosen": 7.430299025315505, + "rewards/margins": 21.55717446920755, + "rewards/rejected": -14.126875443892045, + "step": 3441 + }, + { + "epoch": 0.9434013978347267, + "grad_norm": 0.38671875, + "kl": 1.4422569274902344, + "learning_rate": 5e-06, + "logits/chosen": 38597752.0, + "logits/rejected": -10437261.0, + "logps/chosen": -527.1065673828125, + "logps/rejected": -636.275146484375, + "loss": 0.0015, + "rewards/chosen": 9.009142875671387, + "rewards/margins": 25.588765144348145, + "rewards/rejected": -16.579622268676758, + "step": 3442 + }, + { + "epoch": 0.9436754830752364, + "grad_norm": 3.421875, + "kl": 0.09761810302734375, + "learning_rate": 5e-06, + "logits/chosen": -18834360.0, + "logits/rejected": -40831228.0, + "logps/chosen": -429.3080749511719, + "logps/rejected": -527.181640625, + "loss": 0.0518, + "rewards/chosen": 6.292591094970703, + "rewards/margins": 19.132850646972656, + "rewards/rejected": -12.840259552001953, + "step": 3443 + }, + { + "epoch": 0.9439495683157462, + "grad_norm": 1.4921875, + "kl": 3.4750709533691406, + "learning_rate": 5e-06, + "logits/chosen": -43972676.92307692, + "logits/rejected": -39257018.18181818, + "logps/chosen": -537.5822190504807, + "logps/rejected": -615.7786310369319, + "loss": 0.0043, + "rewards/chosen": 8.606405404897837, + "rewards/margins": 25.1239460765065, + "rewards/rejected": -16.517540671608664, + "step": 3444 + }, + { + "epoch": 0.944223653556256, + "grad_norm": 6.0, + "kl": 0.8819955587387085, + "learning_rate": 5e-06, + "logits/chosen": -45304960.0, + "logits/rejected": -19918068.57142857, + "logps/chosen": -458.906982421875, + "logps/rejected": -393.17208426339283, + "loss": 0.0307, + "rewards/chosen": 8.231139373779296, + "rewards/margins": 19.788348933628626, + "rewards/rejected": -11.55720955984933, + "step": 3445 + }, + { + "epoch": 0.9444977387967658, + "grad_norm": 1.1796875, + "kl": 2.5557315349578857, + "learning_rate": 5e-06, + "logits/chosen": -27064951.466666665, + "logits/rejected": -22395555.555555556, + "logps/chosen": -468.63798828125, + "logps/rejected": -459.98621961805554, + "loss": 0.0032, + "rewards/chosen": 8.345637003580729, + "rewards/margins": 20.32199944390191, + "rewards/rejected": -11.97636244032118, + "step": 3446 + }, + { + "epoch": 0.9447718240372756, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23301696.0, + "logits/rejected": -28729000.0, + "logps/chosen": -386.01806640625, + "logps/rejected": -610.7843017578125, + "loss": 0.0492, + "rewards/chosen": 6.003445625305176, + "rewards/margins": 23.86717128753662, + "rewards/rejected": -17.863725662231445, + "step": 3447 + }, + { + "epoch": 0.9450459092777854, + "grad_norm": 9.1875, + "kl": 9.868945121765137, + "learning_rate": 5e-06, + "logits/chosen": -27462400.0, + "logits/rejected": -39071845.81818182, + "logps/chosen": -415.8051006610577, + "logps/rejected": -596.3027787642045, + "loss": 0.0262, + "rewards/chosen": 7.323006850022536, + "rewards/margins": 28.441541284947963, + "rewards/rejected": -21.118534434925426, + "step": 3448 + }, + { + "epoch": 0.9453199945182952, + "grad_norm": 4.9375, + "kl": 2.3947906494140625, + "learning_rate": 5e-06, + "logits/chosen": -31458594.90909091, + "logits/rejected": -30983891.692307692, + "logps/chosen": -367.6719415838068, + "logps/rejected": -471.71078725961536, + "loss": 0.0231, + "rewards/chosen": 8.104101701216264, + "rewards/margins": 21.184678671243304, + "rewards/rejected": -13.080576970027042, + "step": 3449 + }, + { + "epoch": 0.9455940797588049, + "grad_norm": 1.90625, + "kl": 0.28998440504074097, + "learning_rate": 5e-06, + "logits/chosen": -32756302.769230768, + "logits/rejected": -39716887.27272727, + "logps/chosen": -388.06002103365387, + "logps/rejected": -586.2642933238636, + "loss": 0.0053, + "rewards/chosen": 6.956165020282452, + "rewards/margins": 22.65045656857791, + "rewards/rejected": -15.694291548295455, + "step": 3450 + }, + { + "epoch": 0.9458681649993148, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10312564.0, + "logits/rejected": -29113778.285714287, + "logps/chosen": -462.761328125, + "logps/rejected": -502.25589425223217, + "loss": 0.0117, + "rewards/chosen": 7.119194030761719, + "rewards/margins": 20.794194902692524, + "rewards/rejected": -13.675000871930804, + "step": 3451 + }, + { + "epoch": 0.9461422502398246, + "grad_norm": 7.96875, + "kl": 5.165338039398193, + "learning_rate": 5e-06, + "logits/chosen": -36090993.23076923, + "logits/rejected": -31595371.636363637, + "logps/chosen": -324.7233323317308, + "logps/rejected": -752.6908735795455, + "loss": 0.099, + "rewards/chosen": 5.56381342961238, + "rewards/margins": 21.254152951540647, + "rewards/rejected": -15.690339521928268, + "step": 3452 + }, + { + "epoch": 0.9464163354803344, + "grad_norm": 3.734375, + "kl": 4.704298973083496, + "learning_rate": 5e-06, + "logits/chosen": -5243384.0, + "logits/rejected": -56727074.90909091, + "logps/chosen": -416.66687950721155, + "logps/rejected": -530.6193625710227, + "loss": 0.0135, + "rewards/chosen": 7.407478919396033, + "rewards/margins": 21.987304580795183, + "rewards/rejected": -14.579825661399148, + "step": 3453 + }, + { + "epoch": 0.9466904207208442, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24758129.230769232, + "logits/rejected": -37629483.63636363, + "logps/chosen": -480.9168043870192, + "logps/rejected": -502.32839133522725, + "loss": 0.033, + "rewards/chosen": 8.013508723332333, + "rewards/margins": 19.69030537638631, + "rewards/rejected": -11.676796653053977, + "step": 3454 + }, + { + "epoch": 0.946964505961354, + "grad_norm": 9.5625, + "kl": 12.257509231567383, + "learning_rate": 5e-06, + "logits/chosen": -34427720.0, + "logits/rejected": -45474424.0, + "logps/chosen": -367.7168273925781, + "logps/rejected": -626.6493530273438, + "loss": 0.0418, + "rewards/chosen": 7.897889137268066, + "rewards/margins": 25.41500186920166, + "rewards/rejected": -17.517112731933594, + "step": 3455 + }, + { + "epoch": 0.9472385912018638, + "grad_norm": 6.65625, + "kl": 11.490344047546387, + "learning_rate": 5e-06, + "logits/chosen": -22234021.647058822, + "logits/rejected": -191558.85714285713, + "logps/chosen": -477.4399988511029, + "logps/rejected": -546.8710239955357, + "loss": 0.0466, + "rewards/chosen": 8.673949297736673, + "rewards/margins": 23.761183794806986, + "rewards/rejected": -15.087234497070312, + "step": 3456 + }, + { + "epoch": 0.9475126764423736, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54478169.6, + "logits/rejected": -20921600.0, + "logps/chosen": -336.578857421875, + "logps/rejected": -466.0127650669643, + "loss": 0.032, + "rewards/chosen": 5.660713195800781, + "rewards/margins": 18.49222128731864, + "rewards/rejected": -12.831508091517858, + "step": 3457 + }, + { + "epoch": 0.9477867616828833, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3835091.6, + "logits/rejected": -44102870.85714286, + "logps/chosen": -396.1656494140625, + "logps/rejected": -480.27633231026783, + "loss": 0.0274, + "rewards/chosen": 4.689364242553711, + "rewards/margins": 17.612799998692104, + "rewards/rejected": -12.923435756138392, + "step": 3458 + }, + { + "epoch": 0.9480608469233932, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29664697.6, + "logits/rejected": -23642080.0, + "logps/chosen": -482.930419921875, + "logps/rejected": -661.1321847098214, + "loss": 0.0079, + "rewards/chosen": 9.448400115966797, + "rewards/margins": 24.664682551792687, + "rewards/rejected": -15.216282435825892, + "step": 3459 + }, + { + "epoch": 0.948334932163903, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17854686.666666668, + "logits/rejected": -22332408.0, + "logps/chosen": -345.2437337239583, + "logps/rejected": -468.1377766927083, + "loss": 0.0119, + "rewards/chosen": 6.772082010904948, + "rewards/margins": 22.06480662027995, + "rewards/rejected": -15.292724609375, + "step": 3460 + }, + { + "epoch": 0.9486090174044127, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16682672.0, + "logits/rejected": -34806646.15384615, + "logps/chosen": -346.61936257102275, + "logps/rejected": -476.06685697115387, + "loss": 0.0044, + "rewards/chosen": 6.979795976118608, + "rewards/margins": 20.42682215550563, + "rewards/rejected": -13.44702617938702, + "step": 3461 + }, + { + "epoch": 0.9488831026449226, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18801760.0, + "logits/rejected": -39741115.428571425, + "logps/chosen": -296.828857421875, + "logps/rejected": -497.7069614955357, + "loss": 0.0306, + "rewards/chosen": 4.927056884765625, + "rewards/margins": 15.827473667689732, + "rewards/rejected": -10.900416782924108, + "step": 3462 + }, + { + "epoch": 0.9491571878854324, + "grad_norm": 2.8125, + "kl": 12.43424129486084, + "learning_rate": 5e-06, + "logits/chosen": -37935025.45454545, + "logits/rejected": -17652823.384615384, + "logps/chosen": -415.9469549005682, + "logps/rejected": -371.5778996394231, + "loss": 0.0481, + "rewards/chosen": 7.984311884099787, + "rewards/margins": 17.339686733859402, + "rewards/rejected": -9.355374849759615, + "step": 3463 + }, + { + "epoch": 0.9494312731259422, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24169610.666666668, + "logits/rejected": -12840993.333333334, + "logps/chosen": -366.4215494791667, + "logps/rejected": -571.8579508463541, + "loss": 0.0602, + "rewards/chosen": 4.894045511881511, + "rewards/margins": 20.10240427652995, + "rewards/rejected": -15.208358764648438, + "step": 3464 + }, + { + "epoch": 0.949705358366452, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35145669.333333336, + "logits/rejected": -53219786.666666664, + "logps/chosen": -363.1541748046875, + "logps/rejected": -636.3192545572916, + "loss": 0.0064, + "rewards/chosen": 7.2267195383707685, + "rewards/margins": 22.098026911417644, + "rewards/rejected": -14.871307373046875, + "step": 3465 + }, + { + "epoch": 0.9499794436069617, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29548634.0, + "logits/rejected": -8778664.0, + "logps/chosen": -426.51165771484375, + "logps/rejected": -460.453125, + "loss": 0.0244, + "rewards/chosen": 6.680514335632324, + "rewards/margins": 18.326653480529785, + "rewards/rejected": -11.646139144897461, + "step": 3466 + }, + { + "epoch": 0.9502535288474716, + "grad_norm": 11.5625, + "kl": 3.5223708152770996, + "learning_rate": 5e-06, + "logits/chosen": -33779168.0, + "logits/rejected": -36726020.571428575, + "logps/chosen": -509.672509765625, + "logps/rejected": -600.9432198660714, + "loss": 0.0213, + "rewards/chosen": 8.48681182861328, + "rewards/margins": 25.062510681152343, + "rewards/rejected": -16.575698852539062, + "step": 3467 + }, + { + "epoch": 0.9505276140879814, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41725814.15384615, + "logits/rejected": -26794309.818181816, + "logps/chosen": -404.2289287860577, + "logps/rejected": -468.01455965909093, + "loss": 0.0466, + "rewards/chosen": 5.320456871619592, + "rewards/margins": 17.8433173119605, + "rewards/rejected": -12.522860440340908, + "step": 3468 + }, + { + "epoch": 0.9508016993284911, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13212747.42857143, + "logits/rejected": -28581246.11764706, + "logps/chosen": -463.0198451450893, + "logps/rejected": -553.1954848345588, + "loss": 0.0267, + "rewards/chosen": 8.269490923200335, + "rewards/margins": 21.31960258163324, + "rewards/rejected": -13.050111658432904, + "step": 3469 + }, + { + "epoch": 0.951075784569001, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18283498.666666668, + "logits/rejected": -24052981.333333332, + "logps/chosen": -407.9171142578125, + "logps/rejected": -629.7786458333334, + "loss": 0.0177, + "rewards/chosen": 8.657002766927084, + "rewards/margins": 25.186361948649086, + "rewards/rejected": -16.529359181722004, + "step": 3470 + }, + { + "epoch": 0.9513498698095108, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46758109.538461536, + "logits/rejected": -17778350.545454547, + "logps/chosen": -518.5924353966346, + "logps/rejected": -436.8753551136364, + "loss": 0.0055, + "rewards/chosen": 8.809502234825722, + "rewards/margins": 22.964986521047315, + "rewards/rejected": -14.155484286221592, + "step": 3471 + }, + { + "epoch": 0.9516239550500205, + "grad_norm": 8.5625, + "kl": 0.8235740661621094, + "learning_rate": 5e-06, + "logits/chosen": -43716942.76923077, + "logits/rejected": -23894901.818181816, + "logps/chosen": -427.9021559495192, + "logps/rejected": -622.1334339488636, + "loss": 0.0393, + "rewards/chosen": 7.32743424635667, + "rewards/margins": 22.575558802464627, + "rewards/rejected": -15.248124556107955, + "step": 3472 + }, + { + "epoch": 0.9518980402905304, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21954925.09090909, + "logits/rejected": -38413430.15384615, + "logps/chosen": -398.8992365056818, + "logps/rejected": -569.9347205528846, + "loss": 0.012, + "rewards/chosen": 8.761471835049717, + "rewards/margins": 23.22026958332195, + "rewards/rejected": -14.458797748272236, + "step": 3473 + }, + { + "epoch": 0.9521721255310401, + "grad_norm": 14.75, + "kl": 8.449524879455566, + "learning_rate": 5e-06, + "logits/chosen": 6997565.6, + "logits/rejected": -25004521.14285714, + "logps/chosen": -518.8626953125, + "logps/rejected": -346.46083286830356, + "loss": 0.0669, + "rewards/chosen": 7.573148345947265, + "rewards/margins": 17.636869049072267, + "rewards/rejected": -10.063720703125, + "step": 3474 + }, + { + "epoch": 0.95244621077155, + "grad_norm": 13.5625, + "kl": 14.540665626525879, + "learning_rate": 5e-06, + "logits/chosen": -17204813.714285713, + "logits/rejected": -32813264.0, + "logps/chosen": -410.22830636160717, + "logps/rejected": -556.719873046875, + "loss": 0.0801, + "rewards/chosen": 6.325401306152344, + "rewards/margins": 18.923741149902344, + "rewards/rejected": -12.59833984375, + "step": 3475 + }, + { + "epoch": 0.9527202960120598, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26570277.818181816, + "logits/rejected": -33866382.76923077, + "logps/chosen": -500.62508877840907, + "logps/rejected": -559.7603290264423, + "loss": 0.0347, + "rewards/chosen": 8.910521073774857, + "rewards/margins": 20.948008397242404, + "rewards/rejected": -12.037487323467548, + "step": 3476 + }, + { + "epoch": 0.9529943812525695, + "grad_norm": 15.5, + "kl": 18.598604202270508, + "learning_rate": 5e-06, + "logits/chosen": -22894240.0, + "logits/rejected": -28737993.6, + "logps/chosen": -426.96641138980266, + "logps/rejected": -700.35341796875, + "loss": 0.1537, + "rewards/chosen": 6.623533951608758, + "rewards/margins": 21.2856295936986, + "rewards/rejected": -14.662095642089843, + "step": 3477 + }, + { + "epoch": 0.9532684664930794, + "grad_norm": 10.625, + "kl": 0.2079060971736908, + "learning_rate": 5e-06, + "logits/chosen": -12316218.181818182, + "logits/rejected": -35803438.76923077, + "logps/chosen": -394.8917347301136, + "logps/rejected": -563.8008939302885, + "loss": 0.0425, + "rewards/chosen": 6.9788596413352275, + "rewards/margins": 20.867592551491477, + "rewards/rejected": -13.88873291015625, + "step": 3478 + }, + { + "epoch": 0.9535425517335891, + "grad_norm": 2.96875, + "kl": 1.6159350872039795, + "learning_rate": 5e-06, + "logits/chosen": -25492441.6, + "logits/rejected": -54832636.44444445, + "logps/chosen": -398.4798828125, + "logps/rejected": -604.7567274305555, + "loss": 0.0114, + "rewards/chosen": 8.175813802083333, + "rewards/margins": 21.314149136013455, + "rewards/rejected": -13.138335333930122, + "step": 3479 + }, + { + "epoch": 0.9538166369740989, + "grad_norm": 4.21875, + "kl": 3.924204111099243, + "learning_rate": 5e-06, + "logits/chosen": -9679000.0, + "logits/rejected": -27139306.666666668, + "logps/chosen": -432.2177327473958, + "logps/rejected": -447.7799072265625, + "loss": 0.0073, + "rewards/chosen": 7.0713653564453125, + "rewards/margins": 19.100387573242188, + "rewards/rejected": -12.029022216796875, + "step": 3480 + }, + { + "epoch": 0.9540907222146088, + "grad_norm": 10.5, + "kl": 3.368633270263672, + "learning_rate": 5e-06, + "logits/chosen": -26641842.0, + "logits/rejected": -32009124.0, + "logps/chosen": -407.6898193359375, + "logps/rejected": -433.2930603027344, + "loss": 0.0212, + "rewards/chosen": 8.742722511291504, + "rewards/margins": 20.760096549987793, + "rewards/rejected": -12.017374038696289, + "step": 3481 + }, + { + "epoch": 0.9543648074551185, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6034719.333333333, + "logits/rejected": -6357474.0, + "logps/chosen": -432.1560465494792, + "logps/rejected": -680.2853190104166, + "loss": 0.0339, + "rewards/chosen": 7.314579010009766, + "rewards/margins": 22.622859954833984, + "rewards/rejected": -15.308280944824219, + "step": 3482 + }, + { + "epoch": 0.9546388926956283, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35322437.81818182, + "logits/rejected": -9065558.76923077, + "logps/chosen": -452.3836115056818, + "logps/rejected": -608.0279447115385, + "loss": 0.0473, + "rewards/chosen": 7.620414733886719, + "rewards/margins": 24.151925307053787, + "rewards/rejected": -16.53151057316707, + "step": 3483 + }, + { + "epoch": 0.9549129779361382, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35973195.63636363, + "logits/rejected": 19646912.0, + "logps/chosen": -348.34017666903407, + "logps/rejected": -608.9723557692307, + "loss": 0.012, + "rewards/chosen": 6.279554887251421, + "rewards/margins": 21.333413717630027, + "rewards/rejected": -15.053858830378605, + "step": 3484 + }, + { + "epoch": 0.9551870631766479, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48245916.44444445, + "logits/rejected": -22967189.333333332, + "logps/chosen": -683.0985243055555, + "logps/rejected": -546.7223958333333, + "loss": 0.0038, + "rewards/chosen": 11.564806620279947, + "rewards/margins": 22.367437235514323, + "rewards/rejected": -10.802630615234374, + "step": 3485 + }, + { + "epoch": 0.9554611484171578, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25711316.8, + "logits/rejected": -27258736.0, + "logps/chosen": -330.24873046875, + "logps/rejected": -426.2710658482143, + "loss": 0.0102, + "rewards/chosen": 7.407294464111328, + "rewards/margins": 18.68063932146345, + "rewards/rejected": -11.27334485735212, + "step": 3486 + }, + { + "epoch": 0.9557352336576675, + "grad_norm": 1.1953125, + "kl": 3.266040802001953, + "learning_rate": 5e-06, + "logits/chosen": -6324883.2, + "logits/rejected": -8978522.666666666, + "logps/chosen": -447.7475911458333, + "logps/rejected": -680.1243489583334, + "loss": 0.0049, + "rewards/chosen": 8.301929219563801, + "rewards/margins": 22.338709682888457, + "rewards/rejected": -14.036780463324654, + "step": 3487 + }, + { + "epoch": 0.9560093188981773, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16500189.333333334, + "logits/rejected": -26359482.666666668, + "logps/chosen": -359.9419352213542, + "logps/rejected": -479.7579752604167, + "loss": 0.0031, + "rewards/chosen": 8.331645329793295, + "rewards/margins": 20.10059928894043, + "rewards/rejected": -11.768953959147135, + "step": 3488 + }, + { + "epoch": 0.9562834041386872, + "grad_norm": 4.4375, + "kl": 1.9096959829330444, + "learning_rate": 5e-06, + "logits/chosen": -48586250.666666664, + "logits/rejected": -48505525.333333336, + "logps/chosen": -433.1398111979167, + "logps/rejected": -691.03662109375, + "loss": 0.0174, + "rewards/chosen": 8.212013880411783, + "rewards/margins": 24.508743921915688, + "rewards/rejected": -16.296730041503906, + "step": 3489 + }, + { + "epoch": 0.9565574893791969, + "grad_norm": 7.0625, + "kl": 3.7513670921325684, + "learning_rate": 5e-06, + "logits/chosen": -21216567.466666665, + "logits/rejected": -29031678.222222224, + "logps/chosen": -421.03525390625, + "logps/rejected": -498.0491536458333, + "loss": 0.046, + "rewards/chosen": 6.184070841471354, + "rewards/margins": 19.871333482530382, + "rewards/rejected": -13.687262641059029, + "step": 3490 + }, + { + "epoch": 0.9568315746197067, + "grad_norm": 2.96875, + "kl": 4.086931228637695, + "learning_rate": 5e-06, + "logits/chosen": -33011460.923076924, + "logits/rejected": -26140736.0, + "logps/chosen": -389.3934795673077, + "logps/rejected": -527.4821111505681, + "loss": 0.0171, + "rewards/chosen": 8.489391033466045, + "rewards/margins": 20.27322809179346, + "rewards/rejected": -11.783837058327414, + "step": 3491 + }, + { + "epoch": 0.9571056598602166, + "grad_norm": 1.640625, + "kl": 3.1298789978027344, + "learning_rate": 5e-06, + "logits/chosen": -18036441.14285714, + "logits/rejected": -18213920.0, + "logps/chosen": -376.648681640625, + "logps/rejected": -474.631640625, + "loss": 0.0041, + "rewards/chosen": 8.750745500837054, + "rewards/margins": 21.11329585484096, + "rewards/rejected": -12.362550354003906, + "step": 3492 + }, + { + "epoch": 0.9573797451007263, + "grad_norm": 0.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37677465.14285714, + "logits/rejected": -44111820.8, + "logps/chosen": -441.11879185267856, + "logps/rejected": -516.188916015625, + "loss": 0.001, + "rewards/chosen": 8.530114310128349, + "rewards/margins": 23.947776358468193, + "rewards/rejected": -15.417662048339844, + "step": 3493 + }, + { + "epoch": 0.9576538303412361, + "grad_norm": 1.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14849543.111111112, + "logits/rejected": -31809237.333333332, + "logps/chosen": -387.4744466145833, + "logps/rejected": -577.7677083333333, + "loss": 0.0038, + "rewards/chosen": 7.500464545355903, + "rewards/margins": 21.466610378689236, + "rewards/rejected": -13.966145833333334, + "step": 3494 + }, + { + "epoch": 0.957927915581746, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1591399.6363636365, + "logits/rejected": -19471160.615384616, + "logps/chosen": -308.3612171519886, + "logps/rejected": -519.8533653846154, + "loss": 0.0462, + "rewards/chosen": 6.401912342418324, + "rewards/margins": 18.607736974329384, + "rewards/rejected": -12.205824631911058, + "step": 3495 + }, + { + "epoch": 0.9582020008222557, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14359949.090909092, + "logits/rejected": -35400157.538461536, + "logps/chosen": -373.06764914772725, + "logps/rejected": -568.9601862980769, + "loss": 0.0285, + "rewards/chosen": 7.310435208407315, + "rewards/margins": 22.041706805462603, + "rewards/rejected": -14.731271597055288, + "step": 3496 + }, + { + "epoch": 0.9584760860627656, + "grad_norm": 7.21875, + "kl": 0.2313130795955658, + "learning_rate": 5e-06, + "logits/chosen": -36604468.571428575, + "logits/rejected": -45710416.0, + "logps/chosen": -380.09266880580356, + "logps/rejected": -636.10654296875, + "loss": 0.023, + "rewards/chosen": 7.544419424874442, + "rewards/margins": 20.887691824776784, + "rewards/rejected": -13.343272399902343, + "step": 3497 + }, + { + "epoch": 0.9587501713032753, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25055939.555555556, + "logits/rejected": -31960635.733333334, + "logps/chosen": -378.94292534722223, + "logps/rejected": -514.8358072916667, + "loss": 0.0098, + "rewards/chosen": 7.498731825086805, + "rewards/margins": 21.077817111545137, + "rewards/rejected": -13.579085286458334, + "step": 3498 + }, + { + "epoch": 0.9590242565437851, + "grad_norm": 2.53125, + "kl": 6.67417049407959, + "learning_rate": 5e-06, + "logits/chosen": -20282589.53846154, + "logits/rejected": -31810984.727272727, + "logps/chosen": -437.3205754206731, + "logps/rejected": -442.8494318181818, + "loss": 0.0102, + "rewards/chosen": 8.824201143704927, + "rewards/margins": 18.358231017639586, + "rewards/rejected": -9.534029873934658, + "step": 3499 + }, + { + "epoch": 0.959298341784295, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37148026.18181818, + "logits/rejected": -29252160.0, + "logps/chosen": -325.25545987215907, + "logps/rejected": -533.2506009615385, + "loss": 0.0374, + "rewards/chosen": 5.65296103737571, + "rewards/margins": 19.965081915155157, + "rewards/rejected": -14.312120877779448, + "step": 3500 + }, + { + "epoch": 0.9595724270248047, + "grad_norm": 0.97265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36522327.27272727, + "logits/rejected": -32567891.692307692, + "logps/chosen": -385.9554332386364, + "logps/rejected": -595.0011268028846, + "loss": 0.0023, + "rewards/chosen": 7.379728837446733, + "rewards/margins": 22.190464686680507, + "rewards/rejected": -14.810735849233774, + "step": 3501 + }, + { + "epoch": 0.9598465122653145, + "grad_norm": 9.5, + "kl": 5.2213568687438965, + "learning_rate": 5e-06, + "logits/chosen": -29564989.09090909, + "logits/rejected": -21262852.923076924, + "logps/chosen": -388.2203480113636, + "logps/rejected": -628.578125, + "loss": 0.0587, + "rewards/chosen": 6.585444363680753, + "rewards/margins": 21.00986758145419, + "rewards/rejected": -14.424423217773438, + "step": 3502 + }, + { + "epoch": 0.9601205975058243, + "grad_norm": 9.4375, + "kl": 13.004454612731934, + "learning_rate": 5e-06, + "logits/chosen": -34373376.0, + "logits/rejected": -23664796.0, + "logps/chosen": -373.25823974609375, + "logps/rejected": -570.1533203125, + "loss": 0.0482, + "rewards/chosen": 6.747780799865723, + "rewards/margins": 16.90368938446045, + "rewards/rejected": -10.155908584594727, + "step": 3503 + }, + { + "epoch": 0.9603946827463341, + "grad_norm": 1.0078125, + "kl": 4.032529354095459, + "learning_rate": 5e-06, + "logits/chosen": -24450089.14285714, + "logits/rejected": -49769424.0, + "logps/chosen": -380.454345703125, + "logps/rejected": -676.3556640625, + "loss": 0.0037, + "rewards/chosen": 7.5347121102469305, + "rewards/margins": 24.846824537004743, + "rewards/rejected": -17.312112426757814, + "step": 3504 + }, + { + "epoch": 0.9606687679868439, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6096718.857142857, + "logits/rejected": -45373900.8, + "logps/chosen": -393.62845284598217, + "logps/rejected": -608.2669921875, + "loss": 0.0074, + "rewards/chosen": 8.05252456665039, + "rewards/margins": 22.40707321166992, + "rewards/rejected": -14.35454864501953, + "step": 3505 + }, + { + "epoch": 0.9609428532273537, + "grad_norm": 1.7734375, + "kl": 0.9396858215332031, + "learning_rate": 5e-06, + "logits/chosen": -30875516.8, + "logits/rejected": -32746688.0, + "logps/chosen": -340.6291015625, + "logps/rejected": -531.6363002232143, + "loss": 0.0066, + "rewards/chosen": 6.528956604003906, + "rewards/margins": 21.01976318359375, + "rewards/rejected": -14.490806579589844, + "step": 3506 + }, + { + "epoch": 0.9612169384678635, + "grad_norm": 0.345703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6918222.0, + "logits/rejected": -43475141.333333336, + "logps/chosen": -459.3160807291667, + "logps/rejected": -564.9418131510416, + "loss": 0.0011, + "rewards/chosen": 7.3059641520182295, + "rewards/margins": 22.187389373779297, + "rewards/rejected": -14.881425221761068, + "step": 3507 + }, + { + "epoch": 0.9614910237083732, + "grad_norm": 11.6875, + "kl": 5.058400630950928, + "learning_rate": 5e-06, + "logits/chosen": -20357387.076923076, + "logits/rejected": -18352482.90909091, + "logps/chosen": -418.8249323918269, + "logps/rejected": -558.6079989346591, + "loss": 0.0333, + "rewards/chosen": 7.462120056152344, + "rewards/margins": 20.090037259188563, + "rewards/rejected": -12.62791720303622, + "step": 3508 + }, + { + "epoch": 0.9617651089488831, + "grad_norm": 2.953125, + "kl": 12.650940895080566, + "learning_rate": 5e-06, + "logits/chosen": -23218429.866666667, + "logits/rejected": -22417696.0, + "logps/chosen": -468.9122721354167, + "logps/rejected": -480.83935546875, + "loss": 0.0122, + "rewards/chosen": 7.89049072265625, + "rewards/margins": 18.662231106228298, + "rewards/rejected": -10.771740383572048, + "step": 3509 + }, + { + "epoch": 0.9620391941893929, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31416570.181818184, + "logits/rejected": -32852669.53846154, + "logps/chosen": -345.48140092329544, + "logps/rejected": -552.5873647836538, + "loss": 0.0525, + "rewards/chosen": 5.997440684925426, + "rewards/margins": 18.584530810376148, + "rewards/rejected": -12.587090125450722, + "step": 3510 + }, + { + "epoch": 0.9623132794299027, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10009757.090909092, + "logits/rejected": -35553602.461538464, + "logps/chosen": -303.85251686789775, + "logps/rejected": -474.28549429086536, + "loss": 0.0581, + "rewards/chosen": 6.069184736772017, + "rewards/margins": 18.912712897454107, + "rewards/rejected": -12.84352816068209, + "step": 3511 + }, + { + "epoch": 0.9625873646704125, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15394114.666666666, + "logits/rejected": -31959824.0, + "logps/chosen": -496.8338216145833, + "logps/rejected": -602.5423990885416, + "loss": 0.0229, + "rewards/chosen": 5.367678324381511, + "rewards/margins": 20.410293579101562, + "rewards/rejected": -15.042615254720053, + "step": 3512 + }, + { + "epoch": 0.9628614499109223, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34825973.333333336, + "logits/rejected": -20428940.0, + "logps/chosen": -523.9337565104166, + "logps/rejected": -384.2376708984375, + "loss": 0.0055, + "rewards/chosen": 8.139310201009115, + "rewards/margins": 19.56264368693034, + "rewards/rejected": -11.423333485921225, + "step": 3513 + }, + { + "epoch": 0.9631355351514321, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41259036.8, + "logits/rejected": -14191626.285714285, + "logps/chosen": -405.6914306640625, + "logps/rejected": -483.0431431361607, + "loss": 0.0142, + "rewards/chosen": 7.420861053466797, + "rewards/margins": 17.21574914114816, + "rewards/rejected": -9.794888087681361, + "step": 3514 + }, + { + "epoch": 0.9634096203919419, + "grad_norm": 12.625, + "kl": 9.15355110168457, + "learning_rate": 5e-06, + "logits/chosen": -9596122.461538462, + "logits/rejected": -26546411.636363637, + "logps/chosen": -563.6759690504807, + "logps/rejected": -467.3494318181818, + "loss": 0.0382, + "rewards/chosen": 7.991337702824519, + "rewards/margins": 18.58230644172722, + "rewards/rejected": -10.5909687389027, + "step": 3515 + }, + { + "epoch": 0.9636837056324516, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35865989.81818182, + "logits/rejected": -23187798.153846152, + "logps/chosen": -543.0529563210227, + "logps/rejected": -686.3159555288462, + "loss": 0.0013, + "rewards/chosen": 8.76632274280895, + "rewards/margins": 22.70537561803431, + "rewards/rejected": -13.93905287522536, + "step": 3516 + }, + { + "epoch": 0.9639577908729615, + "grad_norm": 4.75, + "kl": 8.702896118164062, + "learning_rate": 5e-06, + "logits/chosen": -24419194.181818184, + "logits/rejected": -32644093.53846154, + "logps/chosen": -446.74311967329544, + "logps/rejected": -507.0386493389423, + "loss": 0.0123, + "rewards/chosen": 7.718525279651988, + "rewards/margins": 22.06513219446569, + "rewards/rejected": -14.346606914813702, + "step": 3517 + }, + { + "epoch": 0.9642318761134713, + "grad_norm": 7.3125, + "kl": 2.5528018474578857, + "learning_rate": 5e-06, + "logits/chosen": -29763121.230769232, + "logits/rejected": -37302702.54545455, + "logps/chosen": -459.0987079326923, + "logps/rejected": -599.9386541193181, + "loss": 0.0338, + "rewards/chosen": 6.612849895770733, + "rewards/margins": 20.085587988366612, + "rewards/rejected": -13.47273809259588, + "step": 3518 + }, + { + "epoch": 0.964505961353981, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5577232.4, + "logits/rejected": -35686768.0, + "logps/chosen": -374.5649169921875, + "logps/rejected": -623.2008928571429, + "loss": 0.004, + "rewards/chosen": 7.658928680419922, + "rewards/margins": 20.18013185773577, + "rewards/rejected": -12.521203177315849, + "step": 3519 + }, + { + "epoch": 0.9647800465944909, + "grad_norm": 12.75, + "kl": 2.9217491149902344, + "learning_rate": 5e-06, + "logits/chosen": -22179133.333333332, + "logits/rejected": -19768901.333333332, + "logps/chosen": -313.74228922526044, + "logps/rejected": -601.8114420572916, + "loss": 0.07, + "rewards/chosen": 6.424011866251628, + "rewards/margins": 19.618195215861004, + "rewards/rejected": -13.194183349609375, + "step": 3520 + }, + { + "epoch": 0.9650541318350007, + "grad_norm": 6.125, + "kl": 8.11544418334961, + "learning_rate": 5e-06, + "logits/chosen": -19469456.0, + "logits/rejected": -35846936.0, + "logps/chosen": -437.1904296875, + "logps/rejected": -532.3946533203125, + "loss": 0.0234, + "rewards/chosen": 8.486283302307129, + "rewards/margins": 20.731740951538086, + "rewards/rejected": -12.245457649230957, + "step": 3521 + }, + { + "epoch": 0.9653282170755105, + "grad_norm": 9.1875, + "kl": 8.450611114501953, + "learning_rate": 5e-06, + "logits/chosen": -18814320.0, + "logits/rejected": -28001541.818181816, + "logps/chosen": -452.2600285456731, + "logps/rejected": -492.3132990056818, + "loss": 0.1393, + "rewards/chosen": 6.734277578500601, + "rewards/margins": 18.33988851267141, + "rewards/rejected": -11.60561093417081, + "step": 3522 + }, + { + "epoch": 0.9656023023160203, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11627623.466666667, + "logits/rejected": -36688814.222222224, + "logps/chosen": -459.7584635416667, + "logps/rejected": -668.8950737847222, + "loss": 0.0056, + "rewards/chosen": 8.472954813639323, + "rewards/margins": 21.57437744140625, + "rewards/rejected": -13.101422627766928, + "step": 3523 + }, + { + "epoch": 0.96587638755653, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4744120.4, + "logits/rejected": -28162850.285714287, + "logps/chosen": -373.006787109375, + "logps/rejected": -475.87339564732144, + "loss": 0.0341, + "rewards/chosen": 7.355895233154297, + "rewards/margins": 20.959552546909876, + "rewards/rejected": -13.60365731375558, + "step": 3524 + }, + { + "epoch": 0.9661504727970399, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13520666.666666666, + "logits/rejected": 8028576.0, + "logps/chosen": -432.2862955729167, + "logps/rejected": -602.3522135416666, + "loss": 0.0035, + "rewards/chosen": 8.28987948099772, + "rewards/margins": 20.38061968485514, + "rewards/rejected": -12.090740203857422, + "step": 3525 + }, + { + "epoch": 0.9664245580375497, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36512384.0, + "logits/rejected": -40497344.0, + "logps/chosen": -285.8279371995192, + "logps/rejected": -864.5326704545455, + "loss": 0.0538, + "rewards/chosen": 5.810982924241286, + "rewards/margins": 23.744543382337877, + "rewards/rejected": -17.93356045809659, + "step": 3526 + }, + { + "epoch": 0.9666986432780594, + "grad_norm": 13.625, + "kl": 4.096163749694824, + "learning_rate": 5e-06, + "logits/chosen": -29435202.285714287, + "logits/rejected": -28109446.4, + "logps/chosen": -522.8622349330357, + "logps/rejected": -484.4392578125, + "loss": 0.0141, + "rewards/chosen": 8.341444287981306, + "rewards/margins": 18.655095563616072, + "rewards/rejected": -10.313651275634765, + "step": 3527 + }, + { + "epoch": 0.9669727285185693, + "grad_norm": 9.25, + "kl": 7.0249786376953125, + "learning_rate": 5e-06, + "logits/chosen": -28365508.923076924, + "logits/rejected": -30817137.454545453, + "logps/chosen": -369.4426832932692, + "logps/rejected": -607.1237571022727, + "loss": 0.0276, + "rewards/chosen": 6.833125187800481, + "rewards/margins": 22.60900964270105, + "rewards/rejected": -15.775884454900568, + "step": 3528 + }, + { + "epoch": 0.9672468137590791, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32500512.0, + "logits/rejected": -22571735.466666665, + "logps/chosen": -334.09092881944446, + "logps/rejected": -622.0912760416667, + "loss": 0.0334, + "rewards/chosen": 5.333130730523004, + "rewards/margins": 19.426300896538628, + "rewards/rejected": -14.093170166015625, + "step": 3529 + }, + { + "epoch": 0.9675208989995888, + "grad_norm": 9.8125, + "kl": 4.386848449707031, + "learning_rate": 5e-06, + "logits/chosen": -18929560.888888888, + "logits/rejected": -22163876.266666666, + "logps/chosen": -423.7014431423611, + "logps/rejected": -598.5565104166667, + "loss": 0.0551, + "rewards/chosen": 9.678970336914062, + "rewards/margins": 20.69458923339844, + "rewards/rejected": -11.015618896484375, + "step": 3530 + }, + { + "epoch": 0.9677949842400987, + "grad_norm": 7.625, + "kl": 1.0646095275878906, + "learning_rate": 5e-06, + "logits/chosen": -31517230.933333334, + "logits/rejected": -45882424.88888889, + "logps/chosen": -417.12688802083335, + "logps/rejected": -745.3572048611111, + "loss": 0.0445, + "rewards/chosen": 6.990958658854167, + "rewards/margins": 20.490803527832032, + "rewards/rejected": -13.499844868977865, + "step": 3531 + }, + { + "epoch": 0.9680690694806084, + "grad_norm": 2.59375, + "kl": 2.0914320945739746, + "learning_rate": 5e-06, + "logits/chosen": -8867417.846153846, + "logits/rejected": -30041888.0, + "logps/chosen": -378.41128305288464, + "logps/rejected": -693.70703125, + "loss": 0.0072, + "rewards/chosen": 8.692338209885817, + "rewards/margins": 23.602223963170616, + "rewards/rejected": -14.9098857532848, + "step": 3532 + }, + { + "epoch": 0.9683431547211183, + "grad_norm": 6.28125, + "kl": 19.230701446533203, + "learning_rate": 5e-06, + "logits/chosen": -34255917.176470585, + "logits/rejected": -31369010.285714287, + "logps/chosen": -505.3411075367647, + "logps/rejected": -382.40513392857144, + "loss": 0.0502, + "rewards/chosen": 8.158661786247702, + "rewards/margins": 17.81826654001444, + "rewards/rejected": -9.659604753766741, + "step": 3533 + }, + { + "epoch": 0.9686172399616281, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20141402.666666668, + "logits/rejected": -55440021.333333336, + "logps/chosen": -435.3724365234375, + "logps/rejected": -482.6632486979167, + "loss": 0.0226, + "rewards/chosen": 7.966393788655599, + "rewards/margins": 21.012100219726562, + "rewards/rejected": -13.045706431070963, + "step": 3534 + }, + { + "epoch": 0.9688913252021378, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20242313.6, + "logits/rejected": -27677824.0, + "logps/chosen": -428.943408203125, + "logps/rejected": -565.0154854910714, + "loss": 0.0033, + "rewards/chosen": 6.148410034179688, + "rewards/margins": 21.792451913016183, + "rewards/rejected": -15.644041878836495, + "step": 3535 + }, + { + "epoch": 0.9691654104426477, + "grad_norm": 6.6875, + "kl": 0.5893707275390625, + "learning_rate": 5e-06, + "logits/chosen": -30163342.769230768, + "logits/rejected": -24106168.727272727, + "logps/chosen": -397.81385216346155, + "logps/rejected": -672.3748224431819, + "loss": 0.0242, + "rewards/chosen": 6.893027672400842, + "rewards/margins": 22.65574101801519, + "rewards/rejected": -15.762713345614346, + "step": 3536 + }, + { + "epoch": 0.9694394956831575, + "grad_norm": 9.625, + "kl": 2.756671905517578, + "learning_rate": 5e-06, + "logits/chosen": -10150798.4, + "logits/rejected": -42725261.71428572, + "logps/chosen": -400.3873779296875, + "logps/rejected": -561.9821428571429, + "loss": 0.038, + "rewards/chosen": 7.778680419921875, + "rewards/margins": 20.845246887207033, + "rewards/rejected": -13.066566467285156, + "step": 3537 + }, + { + "epoch": 0.9697135809236672, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32340596.0, + "logits/rejected": -30701948.0, + "logps/chosen": -440.1888427734375, + "logps/rejected": -691.2135009765625, + "loss": 0.0174, + "rewards/chosen": 7.200658798217773, + "rewards/margins": 21.612574577331543, + "rewards/rejected": -14.41191577911377, + "step": 3538 + }, + { + "epoch": 0.9699876661641771, + "grad_norm": 2.734375, + "kl": 16.632286071777344, + "learning_rate": 5e-06, + "logits/chosen": -14451886.933333334, + "logits/rejected": -57495285.333333336, + "logps/chosen": -433.2023111979167, + "logps/rejected": -698.9497612847222, + "loss": 0.1253, + "rewards/chosen": 7.028330485026042, + "rewards/margins": 25.84561496310764, + "rewards/rejected": -18.817284478081596, + "step": 3539 + }, + { + "epoch": 0.9702617514046868, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18196907.636363637, + "logits/rejected": -31824406.153846152, + "logps/chosen": -467.13334517045456, + "logps/rejected": -596.5655048076923, + "loss": 0.0311, + "rewards/chosen": 7.411164023659446, + "rewards/margins": 21.050835936219542, + "rewards/rejected": -13.639671912560097, + "step": 3540 + }, + { + "epoch": 0.9705358366451966, + "grad_norm": 5.96875, + "kl": 0.0365346297621727, + "learning_rate": 5e-06, + "logits/chosen": -17014333.333333332, + "logits/rejected": -33203058.666666668, + "logps/chosen": -407.9897054036458, + "logps/rejected": -464.337158203125, + "loss": 0.0614, + "rewards/chosen": 8.49478022257487, + "rewards/margins": 21.593100229899086, + "rewards/rejected": -13.098320007324219, + "step": 3541 + }, + { + "epoch": 0.9708099218857065, + "grad_norm": 3.15625, + "kl": 5.360725402832031, + "learning_rate": 5e-06, + "logits/chosen": -46031168.0, + "logits/rejected": -27990180.0, + "logps/chosen": -462.08880615234375, + "logps/rejected": -467.1872863769531, + "loss": 0.0053, + "rewards/chosen": 7.711245536804199, + "rewards/margins": 20.058691024780273, + "rewards/rejected": -12.347445487976074, + "step": 3542 + }, + { + "epoch": 0.9710840071262162, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19706833.6, + "logits/rejected": -39945152.0, + "logps/chosen": -338.6061279296875, + "logps/rejected": -699.7013113839286, + "loss": 0.0128, + "rewards/chosen": 6.2372089385986325, + "rewards/margins": 23.40181857517787, + "rewards/rejected": -17.16460963657924, + "step": 3543 + }, + { + "epoch": 0.9713580923667261, + "grad_norm": 7.03125, + "kl": 3.8296303749084473, + "learning_rate": 5e-06, + "logits/chosen": -6216679.2727272725, + "logits/rejected": -36582119.384615384, + "logps/chosen": -332.7618963068182, + "logps/rejected": -517.0686598557693, + "loss": 0.0624, + "rewards/chosen": 6.093304720791903, + "rewards/margins": 18.37221543105332, + "rewards/rejected": -12.278910710261417, + "step": 3544 + }, + { + "epoch": 0.9716321776072359, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28441760.0, + "logits/rejected": -53562688.0, + "logps/chosen": -401.3419189453125, + "logps/rejected": -547.8071695963541, + "loss": 0.0215, + "rewards/chosen": 7.01066780090332, + "rewards/margins": 18.367910385131836, + "rewards/rejected": -11.357242584228516, + "step": 3545 + }, + { + "epoch": 0.9719062628477456, + "grad_norm": 7.5625, + "kl": 5.924106597900391, + "learning_rate": 5e-06, + "logits/chosen": -16846377.14285714, + "logits/rejected": -35491254.4, + "logps/chosen": -335.62437220982144, + "logps/rejected": -565.35556640625, + "loss": 0.0583, + "rewards/chosen": 8.661420549665179, + "rewards/margins": 20.88053981236049, + "rewards/rejected": -12.219119262695312, + "step": 3546 + }, + { + "epoch": 0.9721803480882555, + "grad_norm": 10.5625, + "kl": 10.228775024414062, + "learning_rate": 5e-06, + "logits/chosen": -25690953.6, + "logits/rejected": -26300905.14285714, + "logps/chosen": -486.68544921875, + "logps/rejected": -706.5576869419643, + "loss": 0.0727, + "rewards/chosen": 7.493565368652344, + "rewards/margins": 21.80130680629185, + "rewards/rejected": -14.307741437639509, + "step": 3547 + }, + { + "epoch": 0.9724544333287652, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31726546.285714287, + "logits/rejected": -33160455.529411763, + "logps/chosen": -373.26639229910717, + "logps/rejected": -571.5479090073529, + "loss": 0.0624, + "rewards/chosen": 6.1331024169921875, + "rewards/margins": 20.737932093003216, + "rewards/rejected": -14.604829676011029, + "step": 3548 + }, + { + "epoch": 0.972728518569275, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20874488.615384616, + "logits/rejected": -22946196.363636363, + "logps/chosen": -373.1749924879808, + "logps/rejected": -712.7975408380681, + "loss": 0.0308, + "rewards/chosen": 7.144061748798077, + "rewards/margins": 25.120912778627623, + "rewards/rejected": -17.976851029829547, + "step": 3549 + }, + { + "epoch": 0.9730026038097849, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21064997.818181816, + "logits/rejected": -48141115.07692308, + "logps/chosen": -454.6990855823864, + "logps/rejected": -630.8203125, + "loss": 0.0404, + "rewards/chosen": 6.148569280450994, + "rewards/margins": 24.0846302592671, + "rewards/rejected": -17.936060978816105, + "step": 3550 + }, + { + "epoch": 0.9732766890502946, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21390976.0, + "logits/rejected": -20120994.90909091, + "logps/chosen": -429.208984375, + "logps/rejected": -609.3746448863636, + "loss": 0.0148, + "rewards/chosen": 7.120702303372896, + "rewards/margins": 22.43823631660088, + "rewards/rejected": -15.317534013227982, + "step": 3551 + }, + { + "epoch": 0.9735507742908044, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37976608.0, + "logits/rejected": -29787545.14285714, + "logps/chosen": -425.34619140625, + "logps/rejected": -554.0260532924107, + "loss": 0.0443, + "rewards/chosen": 6.994672393798828, + "rewards/margins": 19.628318677629743, + "rewards/rejected": -12.633646283830915, + "step": 3552 + }, + { + "epoch": 0.9738248595313143, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42790179.55555555, + "logits/rejected": -50908450.13333333, + "logps/chosen": -426.5231662326389, + "logps/rejected": -655.3393880208333, + "loss": 0.0153, + "rewards/chosen": 7.2991943359375, + "rewards/margins": 22.9747314453125, + "rewards/rejected": -15.675537109375, + "step": 3553 + }, + { + "epoch": 0.974098944771824, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31576640.0, + "logits/rejected": -36929571.55555555, + "logps/chosen": -384.1858723958333, + "logps/rejected": -794.5413953993055, + "loss": 0.0401, + "rewards/chosen": 7.378455607096354, + "rewards/margins": 27.819778781467015, + "rewards/rejected": -20.44132317437066, + "step": 3554 + }, + { + "epoch": 0.9743730300123339, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19929049.6, + "logits/rejected": -41443881.14285714, + "logps/chosen": -311.769775390625, + "logps/rejected": -489.34814453125, + "loss": 0.0166, + "rewards/chosen": 7.801390075683594, + "rewards/margins": 21.91660919189453, + "rewards/rejected": -14.115219116210938, + "step": 3555 + }, + { + "epoch": 0.9746471152528436, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11654760.0, + "logits/rejected": -31952774.4, + "logps/chosen": -281.54056222098217, + "logps/rejected": -558.431494140625, + "loss": 0.0448, + "rewards/chosen": 5.4049546377999445, + "rewards/margins": 20.070965685163223, + "rewards/rejected": -14.66601104736328, + "step": 3556 + }, + { + "epoch": 0.9749212004933534, + "grad_norm": 8.25, + "kl": 7.270308494567871, + "learning_rate": 5e-06, + "logits/chosen": -28575072.0, + "logits/rejected": -38444266.666666664, + "logps/chosen": -461.3680419921875, + "logps/rejected": -584.7613118489584, + "loss": 0.0263, + "rewards/chosen": 9.213942209879557, + "rewards/margins": 26.046078999837242, + "rewards/rejected": -16.832136789957683, + "step": 3557 + }, + { + "epoch": 0.9751952857338633, + "grad_norm": 3.15625, + "kl": 12.323177337646484, + "learning_rate": 5e-06, + "logits/chosen": -12951035.733333332, + "logits/rejected": -63571889.777777776, + "logps/chosen": -446.98346354166665, + "logps/rejected": -560.0725911458334, + "loss": 0.0485, + "rewards/chosen": 8.075748697916667, + "rewards/margins": 20.634434848361543, + "rewards/rejected": -12.558686150444878, + "step": 3558 + }, + { + "epoch": 0.975469370974373, + "grad_norm": 4.25, + "kl": 7.554084777832031, + "learning_rate": 5e-06, + "logits/chosen": -803484.6666666666, + "logits/rejected": -50948224.0, + "logps/chosen": -492.7294514973958, + "logps/rejected": -644.7333170572916, + "loss": 0.0118, + "rewards/chosen": 8.226719538370768, + "rewards/margins": 24.408611933390297, + "rewards/rejected": -16.18189239501953, + "step": 3559 + }, + { + "epoch": 0.9757434562148828, + "grad_norm": 4.21875, + "kl": 4.749965667724609, + "learning_rate": 5e-06, + "logits/chosen": -20147332.8, + "logits/rejected": -45693805.71428572, + "logps/chosen": -372.9798095703125, + "logps/rejected": -468.00558035714283, + "loss": 0.0212, + "rewards/chosen": 7.61644287109375, + "rewards/margins": 23.431234959193638, + "rewards/rejected": -15.814792088099889, + "step": 3560 + }, + { + "epoch": 0.9760175414553927, + "grad_norm": 7.6875, + "kl": 0.17806372046470642, + "learning_rate": 5e-06, + "logits/chosen": -19771641.333333332, + "logits/rejected": -3220664.0, + "logps/chosen": -426.3463541666667, + "logps/rejected": -439.4940999348958, + "loss": 0.022, + "rewards/chosen": 6.625211079915364, + "rewards/margins": 17.53482437133789, + "rewards/rejected": -10.909613291422525, + "step": 3561 + }, + { + "epoch": 0.9762916266959024, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4671830.181818182, + "logits/rejected": -47325902.76923077, + "logps/chosen": -493.09157492897725, + "logps/rejected": -577.9942157451923, + "loss": 0.006, + "rewards/chosen": 7.950892361727628, + "rewards/margins": 24.17608274446501, + "rewards/rejected": -16.22519038273738, + "step": 3562 + }, + { + "epoch": 0.9765657119364122, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29825770.666666668, + "logits/rejected": -42790960.0, + "logps/chosen": -356.5733235677083, + "logps/rejected": -480.21484375, + "loss": 0.0111, + "rewards/chosen": 7.634641011555989, + "rewards/margins": 20.710782368977863, + "rewards/rejected": -13.076141357421875, + "step": 3563 + }, + { + "epoch": 0.976839797176922, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47612401.45454545, + "logits/rejected": -26137538.46153846, + "logps/chosen": -533.5088778409091, + "logps/rejected": -578.6849459134615, + "loss": 0.0042, + "rewards/chosen": 6.839703646573153, + "rewards/margins": 20.588363220641664, + "rewards/rejected": -13.74865957406851, + "step": 3564 + }, + { + "epoch": 0.9771138824174318, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10142395.0, + "logits/rejected": -43458012.0, + "logps/chosen": -349.23052978515625, + "logps/rejected": -623.4749755859375, + "loss": 0.0264, + "rewards/chosen": 6.526202201843262, + "rewards/margins": 24.275324821472168, + "rewards/rejected": -17.749122619628906, + "step": 3565 + }, + { + "epoch": 0.9773879676579417, + "grad_norm": 3.75, + "kl": 1.7272777557373047, + "learning_rate": 5e-06, + "logits/chosen": -29711433.14285714, + "logits/rejected": -33113068.8, + "logps/chosen": -401.8916713169643, + "logps/rejected": -554.208203125, + "loss": 0.0262, + "rewards/chosen": 8.604849679129464, + "rewards/margins": 23.421622140066965, + "rewards/rejected": -14.8167724609375, + "step": 3566 + }, + { + "epoch": 0.9776620528984514, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19315890.46153846, + "logits/rejected": -30928965.818181816, + "logps/chosen": -421.1165114182692, + "logps/rejected": -625.8044211647727, + "loss": 0.007, + "rewards/chosen": 8.601359440730167, + "rewards/margins": 25.62239576219679, + "rewards/rejected": -17.02103632146662, + "step": 3567 + }, + { + "epoch": 0.9779361381389612, + "grad_norm": 8.4375, + "kl": 1.8298118114471436, + "learning_rate": 5e-06, + "logits/chosen": -41215906.13333333, + "logits/rejected": -13121091.555555556, + "logps/chosen": -434.79567057291666, + "logps/rejected": -694.4776475694445, + "loss": 0.0605, + "rewards/chosen": 6.846635945638021, + "rewards/margins": 27.611358981662327, + "rewards/rejected": -20.764723036024307, + "step": 3568 + }, + { + "epoch": 0.9782102233794711, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26681618.285714287, + "logits/rejected": -45477420.8, + "logps/chosen": -380.21505301339283, + "logps/rejected": -504.183447265625, + "loss": 0.0123, + "rewards/chosen": 7.41624995640346, + "rewards/margins": 21.12763148716518, + "rewards/rejected": -13.711381530761718, + "step": 3569 + }, + { + "epoch": 0.9784843086199808, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23363227.636363637, + "logits/rejected": -26477289.846153848, + "logps/chosen": -444.9064275568182, + "logps/rejected": -602.5181790865385, + "loss": 0.003, + "rewards/chosen": 9.86792685768821, + "rewards/margins": 24.906353183559606, + "rewards/rejected": -15.038426325871395, + "step": 3570 + }, + { + "epoch": 0.9787583938604906, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33025300.0, + "logits/rejected": -29366226.0, + "logps/chosen": -545.0746459960938, + "logps/rejected": -425.994873046875, + "loss": 0.0129, + "rewards/chosen": 8.32065200805664, + "rewards/margins": 20.93206024169922, + "rewards/rejected": -12.611408233642578, + "step": 3571 + }, + { + "epoch": 0.9790324791010004, + "grad_norm": 1.0703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2262151.111111111, + "logits/rejected": -42201190.4, + "logps/chosen": -403.5636393229167, + "logps/rejected": -557.7044270833334, + "loss": 0.0025, + "rewards/chosen": 8.02232191297743, + "rewards/margins": 25.982874891493054, + "rewards/rejected": -17.960552978515626, + "step": 3572 + }, + { + "epoch": 0.9793065643415102, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20981019.636363637, + "logits/rejected": -28277979.076923076, + "logps/chosen": -393.35311612215907, + "logps/rejected": -638.8125751201923, + "loss": 0.0051, + "rewards/chosen": 5.747853365811435, + "rewards/margins": 26.626461482548212, + "rewards/rejected": -20.87860811673678, + "step": 3573 + }, + { + "epoch": 0.97958064958202, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13489372.444444444, + "logits/rejected": -45479210.666666664, + "logps/chosen": -457.3591579861111, + "logps/rejected": -511.42164713541666, + "loss": 0.0126, + "rewards/chosen": 6.241010030110677, + "rewards/margins": 21.816020202636718, + "rewards/rejected": -15.575010172526042, + "step": 3574 + }, + { + "epoch": 0.9798547348225298, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28255977.6, + "logits/rejected": -17245497.14285714, + "logps/chosen": -445.735595703125, + "logps/rejected": -749.1872209821429, + "loss": 0.0029, + "rewards/chosen": 7.326282501220703, + "rewards/margins": 29.7900755746024, + "rewards/rejected": -22.463793073381698, + "step": 3575 + }, + { + "epoch": 0.9801288200630396, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33777657.6, + "logits/rejected": -55349397.333333336, + "logps/chosen": -454.29986979166665, + "logps/rejected": -548.2758246527778, + "loss": 0.0074, + "rewards/chosen": 8.045513916015626, + "rewards/margins": 21.519791836208768, + "rewards/rejected": -13.474277920193142, + "step": 3576 + }, + { + "epoch": 0.9804029053035495, + "grad_norm": 4.5, + "kl": 7.562534332275391, + "learning_rate": 5e-06, + "logits/chosen": -21244462.666666668, + "logits/rejected": -16437718.666666666, + "logps/chosen": -403.9708658854167, + "logps/rejected": -403.95263671875, + "loss": 0.0099, + "rewards/chosen": 7.656859079996745, + "rewards/margins": 21.32149378458659, + "rewards/rejected": -13.664634704589844, + "step": 3577 + }, + { + "epoch": 0.9806769905440592, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21529305.333333332, + "logits/rejected": -34902208.0, + "logps/chosen": -325.4227701822917, + "logps/rejected": -512.9514973958334, + "loss": 0.052, + "rewards/chosen": 6.004405975341797, + "rewards/margins": 19.087980906168617, + "rewards/rejected": -13.083574930826822, + "step": 3578 + }, + { + "epoch": 0.980951075784569, + "grad_norm": 6.90625, + "kl": 6.691150665283203, + "learning_rate": 5e-06, + "logits/chosen": -25185536.0, + "logits/rejected": -15297749.0, + "logps/chosen": -338.0955810546875, + "logps/rejected": -696.1741943359375, + "loss": 0.0321, + "rewards/chosen": 6.479888916015625, + "rewards/margins": 24.32341766357422, + "rewards/rejected": -17.843528747558594, + "step": 3579 + }, + { + "epoch": 0.9812251610250788, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33253076.57142857, + "logits/rejected": -41786566.4, + "logps/chosen": -380.68411690848217, + "logps/rejected": -566.6609375, + "loss": 0.0272, + "rewards/chosen": 6.353349958147321, + "rewards/margins": 22.806371198381697, + "rewards/rejected": -16.453021240234374, + "step": 3580 + }, + { + "epoch": 0.9814992462655886, + "grad_norm": 7.78125, + "kl": 2.59900164604187, + "learning_rate": 5e-06, + "logits/chosen": -12699725.333333334, + "logits/rejected": -24377613.333333332, + "logps/chosen": -453.1611328125, + "logps/rejected": -571.9591064453125, + "loss": 0.0191, + "rewards/chosen": 8.562185287475586, + "rewards/margins": 24.027722040812172, + "rewards/rejected": -15.465536753336588, + "step": 3581 + }, + { + "epoch": 0.9817733315060984, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18593574.153846152, + "logits/rejected": -20827402.181818184, + "logps/chosen": -416.86283052884613, + "logps/rejected": -584.1973100142045, + "loss": 0.0214, + "rewards/chosen": 8.310040987454927, + "rewards/margins": 23.119561362099816, + "rewards/rejected": -14.809520374644887, + "step": 3582 + }, + { + "epoch": 0.9820474167466082, + "grad_norm": 3.40625, + "kl": 2.366853713989258, + "learning_rate": 5e-06, + "logits/chosen": -23398462.222222224, + "logits/rejected": -28483466.666666668, + "logps/chosen": -398.54454210069446, + "logps/rejected": -454.4078776041667, + "loss": 0.0164, + "rewards/chosen": 6.78425767686632, + "rewards/margins": 19.078533596462673, + "rewards/rejected": -12.294275919596354, + "step": 3583 + }, + { + "epoch": 0.982321501987118, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10224628.363636363, + "logits/rejected": -26062638.769230768, + "logps/chosen": -372.18033114346593, + "logps/rejected": -544.9340444711538, + "loss": 0.0293, + "rewards/chosen": 4.622932087291371, + "rewards/margins": 18.50672349729738, + "rewards/rejected": -13.88379141000601, + "step": 3584 + }, + { + "epoch": 0.9825955872276277, + "grad_norm": 6.40625, + "kl": 7.0762939453125, + "learning_rate": 5e-06, + "logits/chosen": -4714595.076923077, + "logits/rejected": -18805693.09090909, + "logps/chosen": -517.4765249399038, + "logps/rejected": -461.85107421875, + "loss": 0.0118, + "rewards/chosen": 9.473702650803785, + "rewards/margins": 21.12185716962481, + "rewards/rejected": -11.648154518821023, + "step": 3585 + }, + { + "epoch": 0.9828696724681376, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20401136.0, + "logits/rejected": -32663542.153846152, + "logps/chosen": -407.80033735795456, + "logps/rejected": -438.6862229567308, + "loss": 0.0101, + "rewards/chosen": 7.235531893643466, + "rewards/margins": 18.586984060861013, + "rewards/rejected": -11.351452167217548, + "step": 3586 + }, + { + "epoch": 0.9831437577086474, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19624273.230769232, + "logits/rejected": -35466717.09090909, + "logps/chosen": -317.6305588942308, + "logps/rejected": -594.3020241477273, + "loss": 0.0209, + "rewards/chosen": 6.7949969951923075, + "rewards/margins": 22.98270229526333, + "rewards/rejected": -16.187705300071023, + "step": 3587 + }, + { + "epoch": 0.9834178429491572, + "grad_norm": 0.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26914626.666666668, + "logits/rejected": 8277124.0, + "logps/chosen": -379.9068196614583, + "logps/rejected": -546.18896484375, + "loss": 0.0013, + "rewards/chosen": 8.490914026896158, + "rewards/margins": 23.36557960510254, + "rewards/rejected": -14.87466557820638, + "step": 3588 + }, + { + "epoch": 0.983691928189667, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14091065.6, + "logits/rejected": -18231353.14285714, + "logps/chosen": -348.579296875, + "logps/rejected": -568.8995535714286, + "loss": 0.0093, + "rewards/chosen": 6.235654067993164, + "rewards/margins": 19.99219790867397, + "rewards/rejected": -13.756543840680804, + "step": 3589 + }, + { + "epoch": 0.9839660134301768, + "grad_norm": 5.15625, + "kl": 8.106104850769043, + "learning_rate": 5e-06, + "logits/chosen": -18533200.0, + "logits/rejected": -12710817.0, + "logps/chosen": -375.63812255859375, + "logps/rejected": -560.1165771484375, + "loss": 0.0449, + "rewards/chosen": 7.7163310050964355, + "rewards/margins": 18.883193492889404, + "rewards/rejected": -11.166862487792969, + "step": 3590 + }, + { + "epoch": 0.9842400986706866, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19933764.0, + "logits/rejected": -19412858.666666668, + "logps/chosen": -442.373046875, + "logps/rejected": -741.8025716145834, + "loss": 0.0051, + "rewards/chosen": 6.737379709879558, + "rewards/margins": 22.204922993977863, + "rewards/rejected": -15.467543284098307, + "step": 3591 + }, + { + "epoch": 0.9845141839111964, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13954090.666666666, + "logits/rejected": -39073066.666666664, + "logps/chosen": -449.1911892361111, + "logps/rejected": -500.4018880208333, + "loss": 0.0214, + "rewards/chosen": 6.823911878797743, + "rewards/margins": 19.76974826388889, + "rewards/rejected": -12.945836385091146, + "step": 3592 + }, + { + "epoch": 0.9847882691517061, + "grad_norm": 11.3125, + "kl": 0.5545228123664856, + "learning_rate": 5e-06, + "logits/chosen": -15167001.142857144, + "logits/rejected": 7124444.8, + "logps/chosen": -391.43233816964283, + "logps/rejected": -331.9753173828125, + "loss": 0.0967, + "rewards/chosen": 5.097693307059152, + "rewards/margins": 11.083023507254463, + "rewards/rejected": -5.985330200195312, + "step": 3593 + }, + { + "epoch": 0.985062354392216, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25569808.0, + "logits/rejected": -29410994.666666668, + "logps/chosen": -548.8646647135416, + "logps/rejected": -539.273681640625, + "loss": 0.0042, + "rewards/chosen": 7.89846928914388, + "rewards/margins": 21.153775533040363, + "rewards/rejected": -13.255306243896484, + "step": 3594 + }, + { + "epoch": 0.9853364396327258, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3088798.8571428573, + "logits/rejected": -26242192.94117647, + "logps/chosen": -373.85232979910717, + "logps/rejected": -576.7179457720588, + "loss": 0.0037, + "rewards/chosen": 8.309620448521205, + "rewards/margins": 23.055111027565324, + "rewards/rejected": -14.745490579044118, + "step": 3595 + }, + { + "epoch": 0.9856105248732355, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11563294.4, + "logits/rejected": -16052094.857142856, + "logps/chosen": -431.00927734375, + "logps/rejected": -695.2404436383929, + "loss": 0.0502, + "rewards/chosen": 6.115864562988281, + "rewards/margins": 18.441027178083147, + "rewards/rejected": -12.325162615094866, + "step": 3596 + }, + { + "epoch": 0.9858846101137454, + "grad_norm": 10.375, + "kl": 1.4532884359359741, + "learning_rate": 5e-06, + "logits/chosen": -7343268.0, + "logits/rejected": -15752978.666666666, + "logps/chosen": -431.1674397786458, + "logps/rejected": -450.476318359375, + "loss": 0.0301, + "rewards/chosen": 6.135073343912761, + "rewards/margins": 17.577181498209637, + "rewards/rejected": -11.442108154296875, + "step": 3597 + }, + { + "epoch": 0.9861586953542552, + "grad_norm": 17.375, + "kl": 2.8201255798339844, + "learning_rate": 5e-06, + "logits/chosen": -31849397.333333332, + "logits/rejected": -31720765.333333332, + "logps/chosen": -375.086669921875, + "logps/rejected": -410.3670247395833, + "loss": 0.0516, + "rewards/chosen": 5.944102605183919, + "rewards/margins": 17.56100018819173, + "rewards/rejected": -11.616897583007812, + "step": 3598 + }, + { + "epoch": 0.986432780594765, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15939437.538461538, + "logits/rejected": -6289474.181818182, + "logps/chosen": -389.18637319711536, + "logps/rejected": -616.1795987215909, + "loss": 0.0495, + "rewards/chosen": 7.133665818434495, + "rewards/margins": 24.90414311335637, + "rewards/rejected": -17.770477294921875, + "step": 3599 + }, + { + "epoch": 0.9867068658352748, + "grad_norm": 13.625, + "kl": 7.709907054901123, + "learning_rate": 5e-06, + "logits/chosen": -8534684.57142857, + "logits/rejected": -34700944.0, + "logps/chosen": -521.1034109933036, + "logps/rejected": -471.765478515625, + "loss": 0.0424, + "rewards/chosen": 6.6756777082170755, + "rewards/margins": 19.19303152901786, + "rewards/rejected": -12.517353820800782, + "step": 3600 + }, + { + "epoch": 0.9869809510757845, + "grad_norm": 12.1875, + "kl": 11.694598197937012, + "learning_rate": 5e-06, + "logits/chosen": -34516238.76923077, + "logits/rejected": -38942039.27272727, + "logps/chosen": -444.71927584134613, + "logps/rejected": -684.5463423295455, + "loss": 0.0935, + "rewards/chosen": 8.046837439903847, + "rewards/margins": 21.08607194807146, + "rewards/rejected": -13.039234508167613, + "step": 3601 + }, + { + "epoch": 0.9872550363162944, + "grad_norm": 8.5, + "kl": 11.121437072753906, + "learning_rate": 5e-06, + "logits/chosen": -13000433.066666666, + "logits/rejected": 22790062.222222224, + "logps/chosen": -431.57024739583335, + "logps/rejected": -662.0048285590278, + "loss": 0.0972, + "rewards/chosen": 6.9398351033528645, + "rewards/margins": 20.622517564561633, + "rewards/rejected": -13.682682461208767, + "step": 3602 + }, + { + "epoch": 0.9875291215568042, + "grad_norm": 13.875, + "kl": 10.904380798339844, + "learning_rate": 5e-06, + "logits/chosen": -15855664.94117647, + "logits/rejected": 1508933.7142857143, + "logps/chosen": -549.8374310661765, + "logps/rejected": -601.9651227678571, + "loss": 0.059, + "rewards/chosen": 7.43915916891659, + "rewards/margins": 15.88980570560744, + "rewards/rejected": -8.450646536690849, + "step": 3603 + }, + { + "epoch": 0.9878032067973139, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6374894.0, + "logits/rejected": -25099017.14285714, + "logps/chosen": -417.210693359375, + "logps/rejected": -712.4098772321429, + "loss": 0.0088, + "rewards/chosen": 8.587100982666016, + "rewards/margins": 26.74705014910017, + "rewards/rejected": -18.159949166434153, + "step": 3604 + }, + { + "epoch": 0.9880772920378238, + "grad_norm": 14.625, + "kl": 6.155295372009277, + "learning_rate": 5e-06, + "logits/chosen": -1449421.0, + "logits/rejected": -25564464.0, + "logps/chosen": -359.56549072265625, + "logps/rejected": -588.3779296875, + "loss": 0.0486, + "rewards/chosen": 6.328761577606201, + "rewards/margins": 21.56860113143921, + "rewards/rejected": -15.239839553833008, + "step": 3605 + }, + { + "epoch": 0.9883513772783336, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36522240.0, + "logits/rejected": -18185310.0, + "logps/chosen": -400.3812561035156, + "logps/rejected": -499.67608642578125, + "loss": 0.014, + "rewards/chosen": 8.53310775756836, + "rewards/margins": 19.019734382629395, + "rewards/rejected": -10.486626625061035, + "step": 3606 + }, + { + "epoch": 0.9886254625188433, + "grad_norm": 3.796875, + "kl": 0.69879150390625, + "learning_rate": 5e-06, + "logits/chosen": 8454894.4, + "logits/rejected": -16460821.714285715, + "logps/chosen": -505.635791015625, + "logps/rejected": -506.14589146205356, + "loss": 0.0065, + "rewards/chosen": 9.190132141113281, + "rewards/margins": 20.70035879952567, + "rewards/rejected": -11.510226658412389, + "step": 3607 + }, + { + "epoch": 0.9888995477593532, + "grad_norm": 5.625, + "kl": 4.656252861022949, + "learning_rate": 5e-06, + "logits/chosen": -17040304.0, + "logits/rejected": -25927200.0, + "logps/chosen": -397.1709716796875, + "logps/rejected": -659.7024274553571, + "loss": 0.0186, + "rewards/chosen": 8.258100128173828, + "rewards/margins": 25.204297637939455, + "rewards/rejected": -16.946197509765625, + "step": 3608 + }, + { + "epoch": 0.9891736329998629, + "grad_norm": 19.0, + "kl": 9.428572654724121, + "learning_rate": 5e-06, + "logits/chosen": -16596646.4, + "logits/rejected": -43390101.333333336, + "logps/chosen": -368.1940104166667, + "logps/rejected": -715.2808159722222, + "loss": 0.1227, + "rewards/chosen": 6.807832336425781, + "rewards/margins": 19.697230699327257, + "rewards/rejected": -12.889398362901476, + "step": 3609 + }, + { + "epoch": 0.9894477182403728, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32886820.923076924, + "logits/rejected": -37858231.27272727, + "logps/chosen": -437.9258563701923, + "logps/rejected": -367.72927024147725, + "loss": 0.0416, + "rewards/chosen": 8.289321899414062, + "rewards/margins": 19.85065390846946, + "rewards/rejected": -11.561332009055398, + "step": 3610 + }, + { + "epoch": 0.9897218034808826, + "grad_norm": 12.6875, + "kl": 6.3727006912231445, + "learning_rate": 5e-06, + "logits/chosen": -26979074.0, + "logits/rejected": -13591914.0, + "logps/chosen": -385.44793701171875, + "logps/rejected": -500.2922668457031, + "loss": 0.0236, + "rewards/chosen": 6.456875801086426, + "rewards/margins": 17.942729949951172, + "rewards/rejected": -11.485854148864746, + "step": 3611 + }, + { + "epoch": 0.9899958887213923, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5431220.8, + "logits/rejected": -32390281.14285714, + "logps/chosen": -411.50869140625, + "logps/rejected": -464.0733119419643, + "loss": 0.0611, + "rewards/chosen": 6.037025451660156, + "rewards/margins": 16.83579363141741, + "rewards/rejected": -10.798768179757255, + "step": 3612 + }, + { + "epoch": 0.9902699739619022, + "grad_norm": 4.84375, + "kl": 7.307845592498779, + "learning_rate": 5e-06, + "logits/chosen": 35700514.13333333, + "logits/rejected": 2933070.222222222, + "logps/chosen": -467.45872395833334, + "logps/rejected": -423.4021267361111, + "loss": 0.0347, + "rewards/chosen": 7.462611389160156, + "rewards/margins": 21.520067511664497, + "rewards/rejected": -14.057456122504341, + "step": 3613 + }, + { + "epoch": 0.990544059202412, + "grad_norm": 11.5, + "kl": 9.125476837158203, + "learning_rate": 5e-06, + "logits/chosen": -21045892.57142857, + "logits/rejected": -23383641.6, + "logps/chosen": -359.2505580357143, + "logps/rejected": -551.03427734375, + "loss": 0.1011, + "rewards/chosen": 7.769123077392578, + "rewards/margins": 21.24990463256836, + "rewards/rejected": -13.480781555175781, + "step": 3614 + }, + { + "epoch": 0.9908181444429217, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28798394.666666668, + "logits/rejected": -19365922.133333333, + "logps/chosen": -417.18785264756946, + "logps/rejected": -559.1694010416667, + "loss": 0.0165, + "rewards/chosen": 8.172709147135416, + "rewards/margins": 19.19697062174479, + "rewards/rejected": -11.024261474609375, + "step": 3615 + }, + { + "epoch": 0.9910922296834316, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3446301.5384615385, + "logits/rejected": -22773121.454545453, + "logps/chosen": -486.57677283653845, + "logps/rejected": -531.6511896306819, + "loss": 0.033, + "rewards/chosen": 7.933398907001202, + "rewards/margins": 22.013239827189413, + "rewards/rejected": -14.07984092018821, + "step": 3616 + }, + { + "epoch": 0.9913663149239413, + "grad_norm": 0.451171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32462218.666666668, + "logits/rejected": -17440388.266666666, + "logps/chosen": -406.71454535590277, + "logps/rejected": -558.8741536458333, + "loss": 0.0021, + "rewards/chosen": 8.085383945041233, + "rewards/margins": 22.801381259494356, + "rewards/rejected": -14.715997314453125, + "step": 3617 + }, + { + "epoch": 0.9916404001644511, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 16288180.8, + "logits/rejected": -30895462.85714286, + "logps/chosen": -466.193359375, + "logps/rejected": -539.976806640625, + "loss": 0.0274, + "rewards/chosen": 9.273147583007812, + "rewards/margins": 24.06569344656808, + "rewards/rejected": -14.792545863560267, + "step": 3618 + }, + { + "epoch": 0.991914485404961, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15885328.0, + "logits/rejected": -5100546.181818182, + "logps/chosen": -378.44429837740387, + "logps/rejected": -616.9588512073864, + "loss": 0.0488, + "rewards/chosen": 7.141073960524339, + "rewards/margins": 24.926357829487404, + "rewards/rejected": -17.785283868963067, + "step": 3619 + }, + { + "epoch": 0.9921885706454707, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14042841.777777778, + "logits/rejected": -2734603.7333333334, + "logps/chosen": -452.85313585069446, + "logps/rejected": -512.26474609375, + "loss": 0.0111, + "rewards/chosen": 9.645287407769096, + "rewards/margins": 20.595812310112848, + "rewards/rejected": -10.95052490234375, + "step": 3620 + }, + { + "epoch": 0.9924626558859806, + "grad_norm": 10.625, + "kl": 1.7067184448242188, + "learning_rate": 5e-06, + "logits/chosen": -11007397.333333334, + "logits/rejected": 12827486.666666666, + "logps/chosen": -406.185791015625, + "logps/rejected": -594.7978515625, + "loss": 0.0385, + "rewards/chosen": 7.256401062011719, + "rewards/margins": 18.767031351725258, + "rewards/rejected": -11.510630289713541, + "step": 3621 + }, + { + "epoch": 0.9927367411264904, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37492544.0, + "logits/rejected": -40786870.85714286, + "logps/chosen": -409.158203125, + "logps/rejected": -658.7349330357143, + "loss": 0.0188, + "rewards/chosen": 7.309268188476563, + "rewards/margins": 24.163422502790176, + "rewards/rejected": -16.854154314313615, + "step": 3622 + }, + { + "epoch": 0.9930108263670001, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32187392.0, + "logits/rejected": -18585292.444444444, + "logps/chosen": -442.66809895833336, + "logps/rejected": -556.2862955729166, + "loss": 0.0418, + "rewards/chosen": 6.8881383260091145, + "rewards/margins": 21.98871070014106, + "rewards/rejected": -15.100572374131945, + "step": 3623 + }, + { + "epoch": 0.99328491160751, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22836817.230769232, + "logits/rejected": -32644366.545454547, + "logps/chosen": -429.66811899038464, + "logps/rejected": -618.388671875, + "loss": 0.042, + "rewards/chosen": 6.196348337026743, + "rewards/margins": 22.082323567850608, + "rewards/rejected": -15.885975230823863, + "step": 3624 + }, + { + "epoch": 0.9935589968480197, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13634228.8, + "logits/rejected": -28877737.14285714, + "logps/chosen": -432.554296875, + "logps/rejected": -509.4014369419643, + "loss": 0.014, + "rewards/chosen": 7.775106048583984, + "rewards/margins": 17.25827669416155, + "rewards/rejected": -9.483170645577568, + "step": 3625 + }, + { + "epoch": 0.9938330820885295, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23605104.0, + "logits/rejected": -30948925.333333332, + "logps/chosen": -364.2631022135417, + "logps/rejected": -419.494873046875, + "loss": 0.012, + "rewards/chosen": 7.01107915242513, + "rewards/margins": 19.400211334228516, + "rewards/rejected": -12.389132181803385, + "step": 3626 + }, + { + "epoch": 0.9941071673290394, + "grad_norm": 11.0625, + "kl": 9.597711563110352, + "learning_rate": 5e-06, + "logits/chosen": -27834006.0, + "logits/rejected": -26910984.0, + "logps/chosen": -509.1912841796875, + "logps/rejected": -573.7071533203125, + "loss": 0.0559, + "rewards/chosen": 6.209501266479492, + "rewards/margins": 19.819896697998047, + "rewards/rejected": -13.610395431518555, + "step": 3627 + }, + { + "epoch": 0.9943812525695491, + "grad_norm": 11.0625, + "kl": 11.921829223632812, + "learning_rate": 5e-06, + "logits/chosen": -23882514.82352941, + "logits/rejected": -27715995.42857143, + "logps/chosen": -490.9742647058824, + "logps/rejected": -466.463134765625, + "loss": 0.0387, + "rewards/chosen": 9.344805549172793, + "rewards/margins": 18.454544035326535, + "rewards/rejected": -9.10973848615374, + "step": 3628 + }, + { + "epoch": 0.9946553378100589, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29154506.666666668, + "logits/rejected": -60413542.4, + "logps/chosen": -367.59483506944446, + "logps/rejected": -616.7327473958334, + "loss": 0.0042, + "rewards/chosen": 6.670477125379774, + "rewards/margins": 21.515720452202693, + "rewards/rejected": -14.845243326822917, + "step": 3629 + }, + { + "epoch": 0.9949294230505688, + "grad_norm": 12.0, + "kl": 6.268505096435547, + "learning_rate": 5e-06, + "logits/chosen": -27911061.333333332, + "logits/rejected": -39923522.666666664, + "logps/chosen": -366.7002360026042, + "logps/rejected": -635.3284098307291, + "loss": 0.0527, + "rewards/chosen": 6.300426483154297, + "rewards/margins": 20.565809885660805, + "rewards/rejected": -14.26538340250651, + "step": 3630 + }, + { + "epoch": 0.9952035082910785, + "grad_norm": 4.90625, + "kl": 1.5191377401351929, + "learning_rate": 5e-06, + "logits/chosen": -27478421.333333332, + "logits/rejected": -20490924.0, + "logps/chosen": -340.99570719401044, + "logps/rejected": -358.2278645833333, + "loss": 0.0247, + "rewards/chosen": 7.67038091023763, + "rewards/margins": 17.053862889607746, + "rewards/rejected": -9.383481979370117, + "step": 3631 + }, + { + "epoch": 0.9954775935315884, + "grad_norm": 7.09375, + "kl": 16.061477661132812, + "learning_rate": 5e-06, + "logits/chosen": -35261196.8, + "logits/rejected": -31102858.666666668, + "logps/chosen": -421.70908203125, + "logps/rejected": -440.92955186631946, + "loss": 0.0271, + "rewards/chosen": 8.463796997070313, + "rewards/margins": 20.695625474717882, + "rewards/rejected": -12.23182847764757, + "step": 3632 + }, + { + "epoch": 0.9957516787720981, + "grad_norm": 9.9375, + "kl": 11.213827133178711, + "learning_rate": 5e-06, + "logits/chosen": -21529618.285714287, + "logits/rejected": -20626009.6, + "logps/chosen": -456.8916015625, + "logps/rejected": -443.22490234375, + "loss": 0.0662, + "rewards/chosen": 8.337461744035993, + "rewards/margins": 18.707200513567244, + "rewards/rejected": -10.36973876953125, + "step": 3633 + }, + { + "epoch": 0.9960257640126079, + "grad_norm": 4.09375, + "kl": 0.4137744903564453, + "learning_rate": 5e-06, + "logits/chosen": -25475540.363636363, + "logits/rejected": -32027628.307692308, + "logps/chosen": -371.6344549005682, + "logps/rejected": -575.2791466346154, + "loss": 0.0079, + "rewards/chosen": 7.322811473499645, + "rewards/margins": 23.175566506552528, + "rewards/rejected": -15.852755033052885, + "step": 3634 + }, + { + "epoch": 0.9962998492531178, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12339288.0, + "logits/rejected": -22482441.14285714, + "logps/chosen": -470.05888671875, + "logps/rejected": -566.4724469866071, + "loss": 0.0558, + "rewards/chosen": 8.121621704101562, + "rewards/margins": 24.69756905691964, + "rewards/rejected": -16.57594735281808, + "step": 3635 + }, + { + "epoch": 0.9965739344936275, + "grad_norm": 2.203125, + "kl": 3.384866237640381, + "learning_rate": 5e-06, + "logits/chosen": -27234216.0, + "logits/rejected": -7600042.666666667, + "logps/chosen": -504.1823323567708, + "logps/rejected": -546.7577311197916, + "loss": 0.0066, + "rewards/chosen": 8.560478210449219, + "rewards/margins": 21.382692972819008, + "rewards/rejected": -12.822214762369791, + "step": 3636 + }, + { + "epoch": 0.9968480197341373, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32782828.307692308, + "logits/rejected": -21986786.90909091, + "logps/chosen": -359.2460186298077, + "logps/rejected": -427.43741122159093, + "loss": 0.0315, + "rewards/chosen": 7.304047217735877, + "rewards/margins": 18.96967166287082, + "rewards/rejected": -11.665624445134943, + "step": 3637 + }, + { + "epoch": 0.9971221049746471, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31171770.181818184, + "logits/rejected": -22201282.46153846, + "logps/chosen": -365.6346324573864, + "logps/rejected": -447.81539212740387, + "loss": 0.0611, + "rewards/chosen": 6.478625904430043, + "rewards/margins": 19.78366435657848, + "rewards/rejected": -13.305038452148438, + "step": 3638 + }, + { + "epoch": 0.9973961902151569, + "grad_norm": 2.953125, + "kl": 0.04098256677389145, + "learning_rate": 5e-06, + "logits/chosen": -18492234.181818184, + "logits/rejected": -54803500.307692304, + "logps/chosen": -465.29616477272725, + "logps/rejected": -642.310546875, + "loss": 0.0052, + "rewards/chosen": 8.580132917924361, + "rewards/margins": 25.611121171004292, + "rewards/rejected": -17.03098825307993, + "step": 3639 + }, + { + "epoch": 0.9976702754556667, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15378408.0, + "logits/rejected": -10622384.0, + "logps/chosen": -376.8633117675781, + "logps/rejected": -628.2380981445312, + "loss": 0.0401, + "rewards/chosen": 7.506053447723389, + "rewards/margins": 23.489363193511963, + "rewards/rejected": -15.983309745788574, + "step": 3640 + }, + { + "epoch": 0.9979443606961765, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32604757.333333332, + "logits/rejected": -48542097.06666667, + "logps/chosen": -420.656982421875, + "logps/rejected": -480.6986979166667, + "loss": 0.0133, + "rewards/chosen": 7.186045328776042, + "rewards/margins": 19.25531921386719, + "rewards/rejected": -12.069273885091146, + "step": 3641 + }, + { + "epoch": 0.9982184459366863, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33602056.0, + "logits/rejected": -14034064.0, + "logps/chosen": -392.259033203125, + "logps/rejected": -701.5033569335938, + "loss": 0.025, + "rewards/chosen": 6.765414237976074, + "rewards/margins": 24.37526035308838, + "rewards/rejected": -17.609846115112305, + "step": 3642 + }, + { + "epoch": 0.9984925311771962, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36125901.71428572, + "logits/rejected": -44250041.6, + "logps/chosen": -324.65164620535717, + "logps/rejected": -741.03671875, + "loss": 0.0302, + "rewards/chosen": 5.386114937918527, + "rewards/margins": 22.446805245535714, + "rewards/rejected": -17.060690307617186, + "step": 3643 + }, + { + "epoch": 0.9987666164177059, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28469436.8, + "logits/rejected": -34244338.28571428, + "logps/chosen": -424.0158203125, + "logps/rejected": -689.2735770089286, + "loss": 0.0129, + "rewards/chosen": 6.912315368652344, + "rewards/margins": 24.429286411830358, + "rewards/rejected": -17.516971043178014, + "step": 3644 + }, + { + "epoch": 0.9990407016582157, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10331165.333333334, + "logits/rejected": -21615513.6, + "logps/chosen": -384.4914822048611, + "logps/rejected": -629.4067708333333, + "loss": 0.0102, + "rewards/chosen": 6.4847971598307295, + "rewards/margins": 22.71676737467448, + "rewards/rejected": -16.23197021484375, + "step": 3645 + }, + { + "epoch": 0.9993147868987255, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23935066.181818184, + "logits/rejected": -19116199.384615384, + "logps/chosen": -405.27463600852275, + "logps/rejected": -673.8553185096154, + "loss": 0.009, + "rewards/chosen": 6.945154363458807, + "rewards/margins": 25.63474012254835, + "rewards/rejected": -18.689585759089542, + "step": 3646 + }, + { + "epoch": 0.9995888721392353, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19072386.666666668, + "logits/rejected": -11728288.0, + "logps/chosen": -464.8741861979167, + "logps/rejected": -448.9371337890625, + "loss": 0.0282, + "rewards/chosen": 8.191619237263998, + "rewards/margins": 21.319589614868164, + "rewards/rejected": -13.127970377604166, + "step": 3647 + }, + { + "epoch": 0.9998629573797451, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19389143.111111112, + "logits/rejected": -21315622.4, + "logps/chosen": -538.0897894965278, + "logps/rejected": -820.299609375, + "loss": 0.0249, + "rewards/chosen": 7.493621826171875, + "rewards/margins": 30.48381144205729, + "rewards/rejected": -22.990189615885416, + "step": 3648 + }, + { + "epoch": 1.0, + "grad_norm": 0.85546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58475392.0, + "logits/rejected": -39694160.0, + "logps/chosen": -330.7967529296875, + "logps/rejected": -606.4405110677084, + "loss": 0.0019, + "rewards/chosen": 7.148828506469727, + "rewards/margins": 22.359071731567383, + "rewards/rejected": -15.210243225097656, + "step": 3649 + } + ], + "logging_steps": 1, + "max_steps": 3649, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1825, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}