diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,15034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1827318410232983, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001827318410232983, + "grad_norm": 112.5, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": 15782774.0, + "logits/rejected": 77970768.0, + "logps/chosen": -147.10406494140625, + "logps/rejected": -466.843017578125, + "loss": 0.3, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0003654636820465966, + "grad_norm": 133.0, + "kl": 0.0, + "learning_rate": 1.0000000000000001e-07, + "logits/chosen": 112644832.0, + "logits/rejected": 68230944.0, + "logps/chosen": -267.1830749511719, + "logps/rejected": -379.0516357421875, + "loss": 0.4, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0005481955230698949, + "grad_norm": 173.0, + "kl": 0.29517078399658203, + "learning_rate": 2.0000000000000002e-07, + "logits/chosen": 4135752.0, + "logits/rejected": 75808315.07692307, + "logps/chosen": -318.4762369791667, + "logps/rejected": -561.4328049879807, + "loss": 0.4211, + "rewards/chosen": -0.17075804869333902, + "rewards/margins": -0.2161886264116336, + "rewards/rejected": 0.04543057771829458, + "step": 3 + }, + { + "epoch": 0.0007309273640931932, + "grad_norm": 113.0, + "kl": 0.39323997497558594, + "learning_rate": 3.0000000000000004e-07, + "logits/chosen": 41318584.0, + "logits/rejected": 87561952.0, + "logps/chosen": -226.8242950439453, + "logps/rejected": -449.3288269042969, + "loss": 0.2983, + "rewards/chosen": 0.052193447947502136, + "rewards/margins": 0.02682456001639366, + "rewards/rejected": 0.025368887931108475, + "step": 4 + }, + { + "epoch": 0.0009136592051164915, + "grad_norm": 146.0, + "kl": 0.30970096588134766, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": 1815924.142857143, + "logits/rejected": 92250723.55555555, + "logps/chosen": -253.85836356026786, + "logps/rejected": -543.4709201388889, + "loss": 0.3237, + "rewards/chosen": -0.025245449372700283, + "rewards/margins": -0.00683453158726768, + "rewards/rejected": -0.018410917785432603, + "step": 5 + }, + { + "epoch": 0.0010963910461397899, + "grad_norm": 126.0, + "kl": 0.243438720703125, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 115388672.0, + "logits/rejected": 84891246.54545455, + "logps/chosen": -346.663916015625, + "logps/rejected": -465.8801935369318, + "loss": 0.3657, + "rewards/chosen": -0.020405884087085723, + "rewards/margins": 0.006103693490678616, + "rewards/rejected": -0.02650957757776434, + "step": 6 + }, + { + "epoch": 0.0012791228871630882, + "grad_norm": 98.5, + "kl": 0.07676887512207031, + "learning_rate": 6.000000000000001e-07, + "logits/chosen": 75291360.0, + "logits/rejected": 23800980.0, + "logps/chosen": -291.051025390625, + "logps/rejected": -360.1891174316406, + "loss": 0.2894, + "rewards/chosen": -0.015892984345555305, + "rewards/margins": 0.04175739176571369, + "rewards/rejected": -0.057650376111269, + "step": 7 + }, + { + "epoch": 0.0014618547281863865, + "grad_norm": 67.5, + "kl": 0.29837608337402344, + "learning_rate": 7.000000000000001e-07, + "logits/chosen": 16596210.909090908, + "logits/rejected": 78200524.8, + "logps/chosen": -217.0584383877841, + "logps/rejected": -379.9115234375, + "loss": 0.2303, + "rewards/chosen": -0.003717873584140431, + "rewards/margins": -0.09803947020660747, + "rewards/rejected": 0.09432159662246704, + "step": 8 + }, + { + "epoch": 0.0016445865692096848, + "grad_norm": 89.0, + "kl": 0.15167903900146484, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": 126918599.1111111, + "logits/rejected": -19810706.285714287, + "logps/chosen": -220.13628472222223, + "logps/rejected": -356.80430385044644, + "loss": 0.265, + "rewards/chosen": 0.025842456354035273, + "rewards/margins": 0.09664345899271587, + "rewards/rejected": -0.0708010026386806, + "step": 9 + }, + { + "epoch": 0.001827318410232983, + "grad_norm": 123.5, + "kl": 0.02130126953125, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 194668068.57142857, + "logits/rejected": 187161244.44444445, + "logps/chosen": -192.11202566964286, + "logps/rejected": -380.90581597222223, + "loss": 0.3132, + "rewards/chosen": -0.09041247197559901, + "rewards/margins": 0.004647744080377006, + "rewards/rejected": -0.09506021605597602, + "step": 10 + }, + { + "epoch": 0.0020100502512562816, + "grad_norm": 48.75, + "kl": 0.1805105209350586, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 55980376.615384616, + "logits/rejected": 29491424.0, + "logps/chosen": -260.2264873798077, + "logps/rejected": -256.3162841796875, + "loss": 0.1601, + "rewards/chosen": 0.03314971923828125, + "rewards/margins": 0.3691655397415161, + "rewards/rejected": -0.33601582050323486, + "step": 11 + }, + { + "epoch": 0.0021927820922795797, + "grad_norm": 75.0, + "kl": 0.14109134674072266, + "learning_rate": 1.1e-06, + "logits/chosen": 43570980.0, + "logits/rejected": 50326464.0, + "logps/chosen": -229.23086547851562, + "logps/rejected": -257.52008056640625, + "loss": 0.2674, + "rewards/chosen": -0.007202532142400742, + "rewards/margins": 0.25986937060952187, + "rewards/rejected": -0.2670719027519226, + "step": 12 + }, + { + "epoch": 0.0023755139333028782, + "grad_norm": 108.5, + "kl": 0.016056060791015625, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": 77368296.0, + "logits/rejected": 86046856.0, + "logps/chosen": -202.3299102783203, + "logps/rejected": -472.40478515625, + "loss": 0.2478, + "rewards/chosen": 0.023720938712358475, + "rewards/margins": 0.44317035749554634, + "rewards/rejected": -0.41944941878318787, + "step": 13 + }, + { + "epoch": 0.0025582457743261763, + "grad_norm": 103.0, + "kl": 0.04313850402832031, + "learning_rate": 1.3e-06, + "logits/chosen": 37632768.0, + "logits/rejected": 91231552.0, + "logps/chosen": -213.20657348632812, + "logps/rejected": -392.5678405761719, + "loss": 0.2413, + "rewards/chosen": -0.03766365349292755, + "rewards/margins": 0.454007163643837, + "rewards/rejected": -0.4916708171367645, + "step": 14 + }, + { + "epoch": 0.002740977615349475, + "grad_norm": 78.5, + "kl": 0.0, + "learning_rate": 1.4000000000000001e-06, + "logits/chosen": 173963861.33333334, + "logits/rejected": 7950694.857142857, + "logps/chosen": -209.51722547743054, + "logps/rejected": -426.8723842075893, + "loss": 0.2043, + "rewards/chosen": 0.029138187567392986, + "rewards/margins": 0.7200297940345037, + "rewards/rejected": -0.6908916064671108, + "step": 15 + }, + { + "epoch": 0.002923709456372773, + "grad_norm": 105.0, + "kl": 0.0, + "learning_rate": 1.5e-06, + "logits/chosen": 69257536.0, + "logits/rejected": 75928166.4, + "logps/chosen": -347.2032063802083, + "logps/rejected": -457.8453125, + "loss": 0.217, + "rewards/chosen": -0.04452260831991831, + "rewards/margins": 0.9133701910575232, + "rewards/rejected": -0.9578927993774414, + "step": 16 + }, + { + "epoch": 0.0031064412973960715, + "grad_norm": 125.5, + "kl": 0.0, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": 85379632.0, + "logits/rejected": 63594077.538461536, + "logps/chosen": -232.26826985677084, + "logps/rejected": -436.44136868990387, + "loss": 0.2411, + "rewards/chosen": -0.03266093134880066, + "rewards/margins": 0.9665751755237579, + "rewards/rejected": -0.9992361068725586, + "step": 17 + }, + { + "epoch": 0.0032891731384193696, + "grad_norm": 54.25, + "kl": 0.0, + "learning_rate": 1.7000000000000002e-06, + "logits/chosen": 63583456.0, + "logits/rejected": 18620008.0, + "logps/chosen": -246.9355224609375, + "logps/rejected": -320.8817545572917, + "loss": 0.1449, + "rewards/chosen": 0.07600563168525695, + "rewards/margins": 1.379675402243932, + "rewards/rejected": -1.303669770558675, + "step": 18 + }, + { + "epoch": 0.003471904979442668, + "grad_norm": 66.0, + "kl": 0.0, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 97445575.1111111, + "logits/rejected": -127729755.42857143, + "logps/chosen": -225.09092881944446, + "logps/rejected": -275.8845738002232, + "loss": 0.1815, + "rewards/chosen": -0.04381239083078173, + "rewards/margins": 0.9048231192051418, + "rewards/rejected": -0.9486355100359235, + "step": 19 + }, + { + "epoch": 0.003654636820465966, + "grad_norm": 46.0, + "kl": 0.09639358520507812, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": -12096544.0, + "logits/rejected": -21570788.57142857, + "logps/chosen": -181.8446044921875, + "logps/rejected": -506.69991629464283, + "loss": 0.1316, + "rewards/chosen": -0.02704162398974101, + "rewards/margins": 1.7980299833275022, + "rewards/rejected": -1.8250716073172433, + "step": 20 + }, + { + "epoch": 0.0038373686614892647, + "grad_norm": 52.5, + "kl": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 41582026.666666664, + "logits/rejected": 46615392.0, + "logps/chosen": -226.09501139322916, + "logps/rejected": -406.594287109375, + "loss": 0.1359, + "rewards/chosen": -0.20737546682357788, + "rewards/margins": 1.7020315766334533, + "rewards/rejected": -1.9094070434570312, + "step": 21 + }, + { + "epoch": 0.004020100502512563, + "grad_norm": 50.5, + "kl": 0.0, + "learning_rate": 2.1000000000000002e-06, + "logits/chosen": 64843532.8, + "logits/rejected": 60914414.54545455, + "logps/chosen": -144.94071044921876, + "logps/rejected": -369.3201793323864, + "loss": 0.1161, + "rewards/chosen": -0.017349091172218323, + "rewards/margins": 2.176709760590033, + "rewards/rejected": -2.1940588517622515, + "step": 22 + }, + { + "epoch": 0.004202832343535861, + "grad_norm": 32.0, + "kl": 0.0, + "learning_rate": 2.2e-06, + "logits/chosen": 50831226.666666664, + "logits/rejected": 68062681.6, + "logps/chosen": -205.03849283854166, + "logps/rejected": -391.21748046875, + "loss": 0.1175, + "rewards/chosen": -0.19548797607421875, + "rewards/margins": 2.334707832336426, + "rewards/rejected": -2.5301958084106446, + "step": 23 + }, + { + "epoch": 0.004385564184559159, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 2.3000000000000004e-06, + "logits/chosen": 31897308.444444444, + "logits/rejected": -1302202.2857142857, + "logps/chosen": -253.10533311631946, + "logps/rejected": -416.9341517857143, + "loss": 0.1145, + "rewards/chosen": -0.07121564944585164, + "rewards/margins": 2.835222360633668, + "rewards/rejected": -2.90643801007952, + "step": 24 + }, + { + "epoch": 0.0045682960255824575, + "grad_norm": 25.5, + "kl": 0.0, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": 8113576.7272727275, + "logits/rejected": -153935244.8, + "logps/chosen": -203.86570046164772, + "logps/rejected": -408.8076904296875, + "loss": 0.1189, + "rewards/chosen": -0.16130523248152298, + "rewards/margins": 2.394134525819258, + "rewards/rejected": -2.555439758300781, + "step": 25 + }, + { + "epoch": 0.0047510278666057565, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 2.5e-06, + "logits/chosen": 23182069.333333332, + "logits/rejected": 47020496.0, + "logps/chosen": -228.5902099609375, + "logps/rejected": -462.409716796875, + "loss": 0.0627, + "rewards/chosen": -0.14248353242874146, + "rewards/margins": 3.5373187899589538, + "rewards/rejected": -3.6798023223876952, + "step": 26 + }, + { + "epoch": 0.0049337597076290545, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 2.6e-06, + "logits/chosen": 77652339.2, + "logits/rejected": 34975092.36363637, + "logps/chosen": -285.44013671875, + "logps/rejected": -501.67067649147725, + "loss": 0.0557, + "rewards/chosen": -0.16572738885879518, + "rewards/margins": 4.193471788276326, + "rewards/rejected": -4.359199177135121, + "step": 27 + }, + { + "epoch": 0.005116491548652353, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 2.7000000000000004e-06, + "logits/chosen": 36826176.0, + "logits/rejected": 29100368.0, + "logps/chosen": -205.88822428385416, + "logps/rejected": -294.87318638392856, + "loss": 0.0932, + "rewards/chosen": -0.046600424581103854, + "rewards/margins": 3.277927588375788, + "rewards/rejected": -3.324528012956892, + "step": 28 + }, + { + "epoch": 0.005299223389675651, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": 60643840.0, + "logits/rejected": -16670002.285714285, + "logps/chosen": -195.23323567708334, + "logps/rejected": -341.07059151785717, + "loss": 0.0824, + "rewards/chosen": -0.29032529724968803, + "rewards/margins": 3.954109059439765, + "rewards/rejected": -4.244434356689453, + "step": 29 + }, + { + "epoch": 0.00548195523069895, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 2.9e-06, + "logits/chosen": 43063024.0, + "logits/rejected": 401290.0, + "logps/chosen": -292.3250427246094, + "logps/rejected": -436.4898681640625, + "loss": 0.0718, + "rewards/chosen": -0.05854110047221184, + "rewards/margins": 5.147633749991655, + "rewards/rejected": -5.206174850463867, + "step": 30 + }, + { + "epoch": 0.005664687071722248, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3e-06, + "logits/chosen": 74867370.66666667, + "logits/rejected": 55217321.14285714, + "logps/chosen": -228.15885416666666, + "logps/rejected": -295.65028599330356, + "loss": 0.0722, + "rewards/chosen": -0.274595922893948, + "rewards/margins": 3.9277606880854044, + "rewards/rejected": -4.202356610979352, + "step": 31 + }, + { + "epoch": 0.005847418912745546, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 3.1000000000000004e-06, + "logits/chosen": 85035491.55555555, + "logits/rejected": -3890721.1428571427, + "logps/chosen": -207.86821831597223, + "logps/rejected": -368.3097446986607, + "loss": 0.0838, + "rewards/chosen": -0.2733353508843316, + "rewards/margins": 5.309553661043681, + "rewards/rejected": -5.582889011928013, + "step": 32 + }, + { + "epoch": 0.006030150753768844, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": 29737115.42857143, + "logits/rejected": 3406788.0, + "logps/chosen": -175.48606654575892, + "logps/rejected": -529.9302300347222, + "loss": 0.0537, + "rewards/chosen": -0.18001859528677805, + "rewards/margins": 7.44891139060732, + "rewards/rejected": -7.628929985894097, + "step": 33 + }, + { + "epoch": 0.006212882594792143, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 3.3000000000000006e-06, + "logits/chosen": 28436018.285714287, + "logits/rejected": 47201812.0, + "logps/chosen": -258.4918736049107, + "logps/rejected": -354.69683837890625, + "loss": 0.116, + "rewards/chosen": -0.7010038239615304, + "rewards/margins": 3.9710418837411066, + "rewards/rejected": -4.672045707702637, + "step": 34 + }, + { + "epoch": 0.006395614435815441, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 3.4000000000000005e-06, + "logits/chosen": -8542162.0, + "logits/rejected": 81162888.0, + "logps/chosen": -189.0626220703125, + "logps/rejected": -463.56103515625, + "loss": 0.059, + "rewards/chosen": -0.09594126045703888, + "rewards/margins": 6.939442440867424, + "rewards/rejected": -7.035383701324463, + "step": 35 + }, + { + "epoch": 0.006578346276838739, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.5e-06, + "logits/chosen": -24281029.333333332, + "logits/rejected": 22850813.714285713, + "logps/chosen": -192.791015625, + "logps/rejected": -475.39505440848217, + "loss": 0.0718, + "rewards/chosen": -0.28499992688496906, + "rewards/margins": 6.6287345091501875, + "rewards/rejected": -6.913734436035156, + "step": 36 + }, + { + "epoch": 0.006761078117862037, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -56954577.777777776, + "logits/rejected": 73736694.85714285, + "logps/chosen": -135.03915744357639, + "logps/rejected": -596.0968889508929, + "loss": 0.0784, + "rewards/chosen": -0.43950560357835555, + "rewards/margins": 9.579519926555573, + "rewards/rejected": -10.019025530133929, + "step": 37 + }, + { + "epoch": 0.006943809958885336, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 3.7e-06, + "logits/chosen": 59501681.777777776, + "logits/rejected": 33996027.428571425, + "logps/chosen": -313.30951605902777, + "logps/rejected": -418.14491489955356, + "loss": 0.0739, + "rewards/chosen": -0.5262299113803439, + "rewards/margins": 7.232405473315527, + "rewards/rejected": -7.758635384695871, + "step": 38 + }, + { + "epoch": 0.007126541799908634, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 66246130.28571428, + "logits/rejected": 9070653.333333334, + "logps/chosen": -271.7684849330357, + "logps/rejected": -510.77734375, + "loss": 0.0792, + "rewards/chosen": -0.7464541707720075, + "rewards/margins": 8.406610133155944, + "rewards/rejected": -9.153064303927952, + "step": 39 + }, + { + "epoch": 0.007309273640931932, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": -20626122.666666668, + "logits/rejected": 41747337.14285714, + "logps/chosen": -200.73832194010416, + "logps/rejected": -318.177978515625, + "loss": 0.0724, + "rewards/chosen": -0.47773440678914386, + "rewards/margins": 6.432780424753825, + "rewards/rejected": -6.910514831542969, + "step": 40 + }, + { + "epoch": 0.00749200548195523, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 63046552.0, + "logits/rejected": 103259312.0, + "logps/chosen": -398.5447082519531, + "logps/rejected": -494.28485107421875, + "loss": 0.0751, + "rewards/chosen": -0.58284991979599, + "rewards/margins": 8.01844173669815, + "rewards/rejected": -8.60129165649414, + "step": 41 + }, + { + "epoch": 0.007674737322978529, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.1e-06, + "logits/chosen": 56043893.333333336, + "logits/rejected": 54874116.571428575, + "logps/chosen": -184.25996907552084, + "logps/rejected": -617.7219587053571, + "loss": 0.0583, + "rewards/chosen": -0.027622381846110027, + "rewards/margins": 11.301165285564604, + "rewards/rejected": -11.328787667410714, + "step": 42 + }, + { + "epoch": 0.007857469164001827, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -10233938.285714285, + "logits/rejected": 38559537.777777776, + "logps/chosen": -275.71620396205356, + "logps/rejected": -486.02039930555554, + "loss": 0.0563, + "rewards/chosen": -0.11437116350446429, + "rewards/margins": 7.266728961278521, + "rewards/rejected": -7.381100124782986, + "step": 43 + }, + { + "epoch": 0.008040201005025126, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 4.3e-06, + "logits/chosen": 22200675.555555556, + "logits/rejected": 8111230.857142857, + "logps/chosen": -192.610107421875, + "logps/rejected": -354.30562918526783, + "loss": 0.0561, + "rewards/chosen": 0.0005070103539360894, + "rewards/margins": 8.11497136146303, + "rewards/rejected": -8.114464351109095, + "step": 44 + }, + { + "epoch": 0.008222932846048425, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 4.4e-06, + "logits/chosen": 1762648.7272727273, + "logits/rejected": 33646537.6, + "logps/chosen": -257.01979758522725, + "logps/rejected": -442.761376953125, + "loss": 0.1017, + "rewards/chosen": -0.8375052538785067, + "rewards/margins": 7.214427271756258, + "rewards/rejected": -8.051932525634765, + "step": 45 + }, + { + "epoch": 0.008405664687071723, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 4.5e-06, + "logits/chosen": 22469854.4, + "logits/rejected": 15658550.666666666, + "logps/chosen": -216.023974609375, + "logps/rejected": -386.345947265625, + "loss": 0.0669, + "rewards/chosen": -0.09425171613693237, + "rewards/margins": 9.985612785816192, + "rewards/rejected": -10.079864501953125, + "step": 46 + }, + { + "epoch": 0.00858839652809502, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.600000000000001e-06, + "logits/chosen": -2256425.0, + "logits/rejected": 4655106.0, + "logps/chosen": -228.53692626953125, + "logps/rejected": -455.4634094238281, + "loss": 0.0499, + "rewards/chosen": 0.09965035319328308, + "rewards/margins": 9.579014748334885, + "rewards/rejected": -9.479364395141602, + "step": 47 + }, + { + "epoch": 0.008771128369118319, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 4.7e-06, + "logits/chosen": -7138146.4, + "logits/rejected": 61952139.63636363, + "logps/chosen": -293.0078369140625, + "logps/rejected": -439.15065696022725, + "loss": 0.0596, + "rewards/chosen": -0.4712371826171875, + "rewards/margins": 7.668499339710582, + "rewards/rejected": -8.13973652232777, + "step": 48 + }, + { + "epoch": 0.008953860210141617, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": 72472928.0, + "logits/rejected": -906780.3636363636, + "logps/chosen": -241.548388671875, + "logps/rejected": -531.849609375, + "loss": 0.0379, + "rewards/chosen": 0.06477571725845337, + "rewards/margins": 10.650154884295029, + "rewards/rejected": -10.585379167036576, + "step": 49 + }, + { + "epoch": 0.009136592051164915, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 35762898.28571428, + "logits/rejected": 25521418.666666668, + "logps/chosen": -257.4357212611607, + "logps/rejected": -473.20350477430554, + "loss": 0.0412, + "rewards/chosen": 0.33241094861711773, + "rewards/margins": 9.57662278129941, + "rewards/rejected": -9.244211832682291, + "step": 50 + }, + { + "epoch": 0.009319323892188213, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 28264566.4, + "logits/rejected": 69546480.0, + "logps/chosen": -273.243798828125, + "logps/rejected": -590.0560709635416, + "loss": 0.0702, + "rewards/chosen": -0.1696515202522278, + "rewards/margins": 11.288042215506236, + "rewards/rejected": -11.457693735758463, + "step": 51 + }, + { + "epoch": 0.009502055733211513, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.1e-06, + "logits/chosen": 20336057.14285714, + "logits/rejected": -2744094.222222222, + "logps/chosen": -266.1215297154018, + "logps/rejected": -427.6263020833333, + "loss": 0.0545, + "rewards/chosen": -0.5023267950330462, + "rewards/margins": 9.582922364038135, + "rewards/rejected": -10.08524915907118, + "step": 52 + }, + { + "epoch": 0.009684787574234811, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5.2e-06, + "logits/chosen": -70720518.4, + "logits/rejected": 31213399.272727273, + "logps/chosen": -158.567041015625, + "logps/rejected": -458.29141512784093, + "loss": 0.0294, + "rewards/chosen": 0.23568222522735596, + "rewards/margins": 10.3202256267721, + "rewards/rejected": -10.084543401544744, + "step": 53 + }, + { + "epoch": 0.009867519415258109, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 5.300000000000001e-06, + "logits/chosen": 32120306.285714287, + "logits/rejected": 25868476.444444444, + "logps/chosen": -283.95506068638394, + "logps/rejected": -433.96039496527777, + "loss": 0.06, + "rewards/chosen": -0.14473027842385427, + "rewards/margins": 8.760665229388646, + "rewards/rejected": -8.9053955078125, + "step": 54 + }, + { + "epoch": 0.010050251256281407, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.400000000000001e-06, + "logits/chosen": 19304349.714285713, + "logits/rejected": 1914103.111111111, + "logps/chosen": -190.06251743861608, + "logps/rejected": -403.5530598958333, + "loss": 0.04, + "rewards/chosen": 0.27278382437569754, + "rewards/margins": 9.029814735291495, + "rewards/rejected": -8.757030910915798, + "step": 55 + }, + { + "epoch": 0.010232983097304705, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": 30015363.2, + "logits/rejected": 91299285.33333333, + "logps/chosen": -189.96708984375, + "logps/rejected": -557.3203125, + "loss": 0.0625, + "rewards/chosen": 0.2307582378387451, + "rewards/margins": 8.882023064295451, + "rewards/rejected": -8.651264826456705, + "step": 56 + }, + { + "epoch": 0.010415714938328003, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.600000000000001e-06, + "logits/chosen": 17728394.666666668, + "logits/rejected": 53752859.428571425, + "logps/chosen": -300.7154134114583, + "logps/rejected": -555.5929129464286, + "loss": 0.0466, + "rewards/chosen": 0.4413594934675429, + "rewards/margins": 9.82733056280348, + "rewards/rejected": -9.385971069335938, + "step": 57 + }, + { + "epoch": 0.010598446779351301, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.7e-06, + "logits/chosen": -17727002.666666668, + "logits/rejected": 14331746.285714285, + "logps/chosen": -180.419189453125, + "logps/rejected": -417.90921456473217, + "loss": 0.0569, + "rewards/chosen": 0.1094371411535475, + "rewards/margins": 10.715325507852766, + "rewards/rejected": -10.605888366699219, + "step": 58 + }, + { + "epoch": 0.0107811786203746, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5.8e-06, + "logits/chosen": 34763593.6, + "logits/rejected": 70459008.0, + "logps/chosen": -267.03173828125, + "logps/rejected": -452.3960774739583, + "loss": 0.0739, + "rewards/chosen": -0.22983903884887696, + "rewards/margins": 7.231416352589925, + "rewards/rejected": -7.461255391438802, + "step": 59 + }, + { + "epoch": 0.0109639104613979, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5.9e-06, + "logits/chosen": -90278610.28571428, + "logits/rejected": 11082392.888888888, + "logps/chosen": -188.48008510044642, + "logps/rejected": -335.1402994791667, + "loss": 0.0414, + "rewards/chosen": 0.2385509865624564, + "rewards/margins": 9.79675733286237, + "rewards/rejected": -9.558206346299913, + "step": 60 + }, + { + "epoch": 0.011146642302421197, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 6e-06, + "logits/chosen": 60350124.8, + "logits/rejected": 58364064.0, + "logps/chosen": -317.6250244140625, + "logps/rejected": -417.03759765625, + "loss": 0.0699, + "rewards/chosen": -0.16060456037521362, + "rewards/margins": 7.431312477588653, + "rewards/rejected": -7.591917037963867, + "step": 61 + }, + { + "epoch": 0.011329374143444496, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.1e-06, + "logits/chosen": 121301546.66666667, + "logits/rejected": 5892297.6, + "logps/chosen": -330.53342692057294, + "logps/rejected": -502.404248046875, + "loss": 0.059, + "rewards/chosen": -0.5798990726470947, + "rewards/margins": 9.959031438827514, + "rewards/rejected": -10.538930511474609, + "step": 62 + }, + { + "epoch": 0.011512105984467794, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.200000000000001e-06, + "logits/chosen": 18676822.0, + "logits/rejected": 45093320.0, + "logps/chosen": -247.1837615966797, + "logps/rejected": -390.10235595703125, + "loss": 0.0396, + "rewards/chosen": 0.754508376121521, + "rewards/margins": 8.84023892879486, + "rewards/rejected": -8.08573055267334, + "step": 63 + }, + { + "epoch": 0.011694837825491092, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 6.300000000000001e-06, + "logits/chosen": -52607008.0, + "logits/rejected": 7864679.5, + "logps/chosen": -189.46128845214844, + "logps/rejected": -482.97430419921875, + "loss": 0.0641, + "rewards/chosen": 0.36624985933303833, + "rewards/margins": 8.779046833515167, + "rewards/rejected": -8.412796974182129, + "step": 64 + }, + { + "epoch": 0.01187756966651439, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 6.4000000000000006e-06, + "logits/chosen": 10255226.0, + "logits/rejected": -18813816.0, + "logps/chosen": -193.3909912109375, + "logps/rejected": -531.240673828125, + "loss": 0.0375, + "rewards/chosen": 0.09934608141581218, + "rewards/margins": 9.311900250116983, + "rewards/rejected": -9.212554168701171, + "step": 65 + }, + { + "epoch": 0.012060301507537688, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": 46292804.571428575, + "logits/rejected": 72681841.77777778, + "logps/chosen": -220.4049072265625, + "logps/rejected": -547.1560872395834, + "loss": 0.0469, + "rewards/chosen": 0.5976224626813617, + "rewards/margins": 9.060947963169642, + "rewards/rejected": -8.463325500488281, + "step": 66 + }, + { + "epoch": 0.012243033348560986, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 6.600000000000001e-06, + "logits/chosen": 29266077.333333332, + "logits/rejected": 7623460.8, + "logps/chosen": -237.135009765625, + "logps/rejected": -433.35390625, + "loss": 0.0319, + "rewards/chosen": 0.45730340480804443, + "rewards/margins": 8.87908685207367, + "rewards/rejected": -8.421783447265625, + "step": 67 + }, + { + "epoch": 0.012425765189584286, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 6.700000000000001e-06, + "logits/chosen": 32389640.0, + "logits/rejected": -53314780.0, + "logps/chosen": -201.88800048828125, + "logps/rejected": -407.155029296875, + "loss": 0.0642, + "rewards/chosen": 0.5022709369659424, + "rewards/margins": 9.315228700637817, + "rewards/rejected": -8.812957763671875, + "step": 68 + }, + { + "epoch": 0.012608497030607584, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 6.800000000000001e-06, + "logits/chosen": 31282860.8, + "logits/rejected": -2861626.6666666665, + "logps/chosen": -288.9423828125, + "logps/rejected": -481.4112141927083, + "loss": 0.0513, + "rewards/chosen": 0.47036194801330566, + "rewards/margins": 9.167997598648071, + "rewards/rejected": -8.697635650634766, + "step": 69 + }, + { + "epoch": 0.012791228871630882, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 6.9e-06, + "logits/chosen": 44984905.14285714, + "logits/rejected": 14817690.666666666, + "logps/chosen": -219.78883579799108, + "logps/rejected": -361.6699490017361, + "loss": 0.048, + "rewards/chosen": 0.8878723553248814, + "rewards/margins": 8.269784586770195, + "rewards/rejected": -7.3819122314453125, + "step": 70 + }, + { + "epoch": 0.01297396071265418, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 7e-06, + "logits/chosen": 9132338.4, + "logits/rejected": 7451229.333333333, + "logps/chosen": -235.571044921875, + "logps/rejected": -348.5428873697917, + "loss": 0.044, + "rewards/chosen": 1.0057493209838868, + "rewards/margins": 8.156757926940918, + "rewards/rejected": -7.151008605957031, + "step": 71 + }, + { + "epoch": 0.013156692553677478, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.100000000000001e-06, + "logits/chosen": 34971003.428571425, + "logits/rejected": 65250168.88888889, + "logps/chosen": -324.94590541294644, + "logps/rejected": -456.6183268229167, + "loss": 0.046, + "rewards/chosen": -0.06857038395745414, + "rewards/margins": 8.9134742598685, + "rewards/rejected": -8.982044643825954, + "step": 72 + }, + { + "epoch": 0.013339424394700776, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 7.2000000000000005e-06, + "logits/chosen": -3398965.0, + "logits/rejected": -21731906.0, + "logps/chosen": -204.3073272705078, + "logps/rejected": -403.509521484375, + "loss": 0.0486, + "rewards/chosen": 0.5571205615997314, + "rewards/margins": 6.593412637710571, + "rewards/rejected": -6.03629207611084, + "step": 73 + }, + { + "epoch": 0.013522156235724074, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 7.3e-06, + "logits/chosen": 83119552.0, + "logits/rejected": 68693467.42857143, + "logps/chosen": -261.94715711805554, + "logps/rejected": -550.209228515625, + "loss": 0.0423, + "rewards/chosen": 0.6942998568216959, + "rewards/margins": 12.523947465987431, + "rewards/rejected": -11.829647609165736, + "step": 74 + }, + { + "epoch": 0.013704888076747372, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 7.4e-06, + "logits/chosen": 55943360.0, + "logits/rejected": 114304256.0, + "logps/chosen": -208.98626708984375, + "logps/rejected": -380.2847412109375, + "loss": 0.0541, + "rewards/chosen": 0.21400348345438638, + "rewards/margins": 5.249341313044231, + "rewards/rejected": -5.035337829589844, + "step": 75 + }, + { + "epoch": 0.013887619917770672, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": 67582536.0, + "logits/rejected": 83633648.0, + "logps/chosen": -196.2489471435547, + "logps/rejected": -383.7486572265625, + "loss": 0.054, + "rewards/chosen": 0.7636520862579346, + "rewards/margins": 6.962081670761108, + "rewards/rejected": -6.198429584503174, + "step": 76 + }, + { + "epoch": 0.01407035175879397, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.600000000000001e-06, + "logits/chosen": 49473152.0, + "logits/rejected": -15477678.222222222, + "logps/chosen": -232.21332659040178, + "logps/rejected": -471.06635199652777, + "loss": 0.0324, + "rewards/chosen": 0.6099836485726493, + "rewards/margins": 9.860383767930289, + "rewards/rejected": -9.25040011935764, + "step": 77 + }, + { + "epoch": 0.014253083599817268, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 7.7e-06, + "logits/chosen": 37876912.0, + "logits/rejected": 52468768.0, + "logps/chosen": -229.62823486328125, + "logps/rejected": -617.780517578125, + "loss": 0.0377, + "rewards/chosen": 0.06940485040346782, + "rewards/margins": 11.81669565240542, + "rewards/rejected": -11.747290802001952, + "step": 78 + }, + { + "epoch": 0.014435815440840567, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.800000000000002e-06, + "logits/chosen": 5237898.545454546, + "logits/rejected": -19879961.6, + "logps/chosen": -289.82914595170456, + "logps/rejected": -436.2798828125, + "loss": 0.0589, + "rewards/chosen": 0.6126820824363015, + "rewards/margins": 6.71890232779763, + "rewards/rejected": -6.106220245361328, + "step": 79 + }, + { + "epoch": 0.014618547281863865, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.9e-06, + "logits/chosen": 38552462.222222224, + "logits/rejected": 20666004.57142857, + "logps/chosen": -269.6545681423611, + "logps/rejected": -575.5065569196429, + "loss": 0.0466, + "rewards/chosen": 0.5649046368069119, + "rewards/margins": 9.35118622250027, + "rewards/rejected": -8.78628158569336, + "step": 80 + }, + { + "epoch": 0.014801279122887163, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 13564669.333333334, + "logits/rejected": 33868960.0, + "logps/chosen": -141.48514811197916, + "logps/rejected": -431.35966796875, + "loss": 0.0678, + "rewards/chosen": 0.6909376780192057, + "rewards/margins": 8.418017450968424, + "rewards/rejected": -7.727079772949219, + "step": 81 + }, + { + "epoch": 0.01498401096391046, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.1e-06, + "logits/chosen": 5083985.142857143, + "logits/rejected": 31115694.222222224, + "logps/chosen": -160.9451904296875, + "logps/rejected": -391.66509331597223, + "loss": 0.066, + "rewards/chosen": 0.386770418712071, + "rewards/margins": 7.958814791270664, + "rewards/rejected": -7.572044372558594, + "step": 82 + }, + { + "epoch": 0.015166742804933759, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 8.2e-06, + "logits/chosen": 37503110.4, + "logits/rejected": -2930108.727272727, + "logps/chosen": -232.54189453125, + "logps/rejected": -335.2604314630682, + "loss": 0.0479, + "rewards/chosen": -0.39811415672302247, + "rewards/margins": 6.911530368978327, + "rewards/rejected": -7.30964452570135, + "step": 83 + }, + { + "epoch": 0.015349474645957059, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.3e-06, + "logits/chosen": -102777052.44444445, + "logits/rejected": 217575515.42857143, + "logps/chosen": -204.67154947916666, + "logps/rejected": -661.4695172991071, + "loss": 0.0423, + "rewards/chosen": 0.6618110868665907, + "rewards/margins": 10.519423696729872, + "rewards/rejected": -9.857612609863281, + "step": 84 + }, + { + "epoch": 0.015532206486980357, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 8.400000000000001e-06, + "logits/chosen": -25981040.0, + "logits/rejected": 85087816.0, + "logps/chosen": -200.37319946289062, + "logps/rejected": -467.484130859375, + "loss": 0.0486, + "rewards/chosen": 0.09723003208637238, + "rewards/margins": 9.425074651837349, + "rewards/rejected": -9.327844619750977, + "step": 85 + }, + { + "epoch": 0.015714938328003653, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.5e-06, + "logits/chosen": 23528900.0, + "logits/rejected": 44369733.333333336, + "logps/chosen": -211.3964080810547, + "logps/rejected": -450.85693359375, + "loss": 0.0125, + "rewards/chosen": 1.4311485290527344, + "rewards/margins": 10.69006093343099, + "rewards/rejected": -9.258912404378256, + "step": 86 + }, + { + "epoch": 0.01589767016902695, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.6e-06, + "logits/chosen": -9556536.0, + "logits/rejected": -17620220.0, + "logps/chosen": -230.34571838378906, + "logps/rejected": -303.6910400390625, + "loss": 0.0392, + "rewards/chosen": 0.6572790145874023, + "rewards/margins": 9.418675422668457, + "rewards/rejected": -8.761396408081055, + "step": 87 + }, + { + "epoch": 0.016080402010050253, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.700000000000001e-06, + "logits/chosen": -95842267.42857143, + "logits/rejected": -5690455.111111111, + "logps/chosen": -232.15860421316964, + "logps/rejected": -464.14203559027777, + "loss": 0.0423, + "rewards/chosen": 0.1093566928591047, + "rewards/margins": 9.189569603829158, + "rewards/rejected": -9.080212910970053, + "step": 88 + }, + { + "epoch": 0.01626313385107355, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 8.8e-06, + "logits/chosen": -17562989.714285713, + "logits/rejected": 48067477.333333336, + "logps/chosen": -207.70406668526786, + "logps/rejected": -307.71636284722223, + "loss": 0.0441, + "rewards/chosen": 0.42997656549726215, + "rewards/margins": 9.164028587795439, + "rewards/rejected": -8.734052022298178, + "step": 89 + }, + { + "epoch": 0.01644586569209685, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 8.900000000000001e-06, + "logits/chosen": 20565392.0, + "logits/rejected": 14730448.0, + "logps/chosen": -171.25608607700892, + "logps/rejected": -358.96465386284723, + "loss": 0.0593, + "rewards/chosen": 0.2978787422180176, + "rewards/margins": 8.180709891849094, + "rewards/rejected": -7.882831149631077, + "step": 90 + }, + { + "epoch": 0.016628597533120147, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9e-06, + "logits/chosen": -68254509.71428572, + "logits/rejected": 94339392.0, + "logps/chosen": -197.62510463169642, + "logps/rejected": -506.0188259548611, + "loss": 0.0371, + "rewards/chosen": 0.4769213540213449, + "rewards/margins": 11.284633485097734, + "rewards/rejected": -10.80771213107639, + "step": 91 + }, + { + "epoch": 0.016811329374143445, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.100000000000001e-06, + "logits/chosen": 16959508.0, + "logits/rejected": 61137896.0, + "logps/chosen": -281.1986389160156, + "logps/rejected": -506.87066650390625, + "loss": 0.0353, + "rewards/chosen": 1.028641939163208, + "rewards/margins": 9.867717981338501, + "rewards/rejected": -8.839076042175293, + "step": 92 + }, + { + "epoch": 0.016994061215166743, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.200000000000002e-06, + "logits/chosen": 34639218.666666664, + "logits/rejected": 31226460.8, + "logps/chosen": -241.87862141927084, + "logps/rejected": -447.731982421875, + "loss": 0.0385, + "rewards/chosen": 0.0005201896031697592, + "rewards/margins": 9.271690233548483, + "rewards/rejected": -9.271170043945313, + "step": 93 + }, + { + "epoch": 0.01717679305619004, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.3e-06, + "logits/chosen": 26745672.727272727, + "logits/rejected": -33017491.2, + "logps/chosen": -277.4129083806818, + "logps/rejected": -281.562744140625, + "loss": 0.0572, + "rewards/chosen": 0.6284575462341309, + "rewards/margins": 10.899500942230224, + "rewards/rejected": -10.271043395996093, + "step": 94 + }, + { + "epoch": 0.01735952489721334, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 9.4e-06, + "logits/chosen": -152069004.8, + "logits/rejected": 111827269.81818181, + "logps/chosen": -327.553076171875, + "logps/rejected": -348.8797052556818, + "loss": 0.0427, + "rewards/chosen": 0.6367965698242187, + "rewards/margins": 10.010933061079545, + "rewards/rejected": -9.374136491255326, + "step": 95 + }, + { + "epoch": 0.017542256738236638, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.5e-06, + "logits/chosen": -77862933.33333333, + "logits/rejected": -54159995.428571425, + "logps/chosen": -181.27479383680554, + "logps/rejected": -288.72256905691967, + "loss": 0.0363, + "rewards/chosen": 1.4606913460625544, + "rewards/margins": 8.36744815584213, + "rewards/rejected": -6.9067568097795755, + "step": 96 + }, + { + "epoch": 0.017724988579259936, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.600000000000001e-06, + "logits/chosen": 43054811.428571425, + "logits/rejected": 14771057.777777778, + "logps/chosen": -162.51302664620536, + "logps/rejected": -536.1613498263889, + "loss": 0.0351, + "rewards/chosen": 0.8046503748212542, + "rewards/margins": 10.9816633481828, + "rewards/rejected": -10.177012973361546, + "step": 97 + }, + { + "epoch": 0.017907720420283234, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.7e-06, + "logits/chosen": -13295594.0, + "logits/rejected": 10694756.0, + "logps/chosen": -278.02105712890625, + "logps/rejected": -514.3211669921875, + "loss": 0.0422, + "rewards/chosen": 0.46801531314849854, + "rewards/margins": 10.005221009254456, + "rewards/rejected": -9.537205696105957, + "step": 98 + }, + { + "epoch": 0.018090452261306532, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.800000000000001e-06, + "logits/chosen": 8828074.666666666, + "logits/rejected": 53489688.615384616, + "logps/chosen": -263.14764404296875, + "logps/rejected": -460.5266676682692, + "loss": 0.0277, + "rewards/chosen": 0.9861307938893636, + "rewards/margins": 9.716855422044413, + "rewards/rejected": -8.730724628155048, + "step": 99 + }, + { + "epoch": 0.01827318410232983, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.9e-06, + "logits/chosen": -17284919.111111112, + "logits/rejected": 23346674.285714287, + "logps/chosen": -183.0745849609375, + "logps/rejected": -321.62465122767856, + "loss": 0.0358, + "rewards/chosen": 1.214132308959961, + "rewards/margins": 8.857803617204938, + "rewards/rejected": -7.643671308244977, + "step": 100 + }, + { + "epoch": 0.018455915943353128, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": -37464534.85714286, + "logits/rejected": 53406990.222222224, + "logps/chosen": -209.09202357700892, + "logps/rejected": -329.81561957465277, + "loss": 0.0575, + "rewards/chosen": 0.7790302549089704, + "rewards/margins": 9.527226326957582, + "rewards/rejected": -8.74819607204861, + "step": 101 + }, + { + "epoch": 0.018638647784376426, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.999969538288953e-06, + "logits/chosen": -21487243.2, + "logits/rejected": -3651317.3333333335, + "logps/chosen": -346.1546630859375, + "logps/rejected": -460.31591796875, + "loss": 0.0736, + "rewards/chosen": 0.7104415893554688, + "rewards/margins": 12.46780014038086, + "rewards/rejected": -11.75735855102539, + "step": 102 + }, + { + "epoch": 0.018821379625399724, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.999878153526974e-06, + "logits/chosen": 50904558.222222224, + "logits/rejected": 10953250.285714285, + "logps/chosen": -268.544677734375, + "logps/rejected": -570.2978864397321, + "loss": 0.0504, + "rewards/chosen": 0.40636663966708714, + "rewards/margins": 11.244983147061062, + "rewards/rejected": -10.838616507393974, + "step": 103 + }, + { + "epoch": 0.019004111466423026, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 9.999725846827562e-06, + "logits/chosen": -9445872.0, + "logits/rejected": 25251950.545454547, + "logps/chosen": -180.41806640625, + "logps/rejected": -437.0711115056818, + "loss": 0.0168, + "rewards/chosen": 1.2930338859558106, + "rewards/margins": 10.512652249769731, + "rewards/rejected": -9.21961836381392, + "step": 104 + }, + { + "epoch": 0.019186843307446324, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.999512620046523e-06, + "logits/chosen": -49996564.0, + "logits/rejected": -1641734.75, + "logps/chosen": -164.00706481933594, + "logps/rejected": -521.3729248046875, + "loss": 0.0544, + "rewards/chosen": -0.0724596157670021, + "rewards/margins": 9.39596327394247, + "rewards/rejected": -9.468422889709473, + "step": 105 + }, + { + "epoch": 0.019369575148469622, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.999238475781957e-06, + "logits/chosen": 27255545.14285714, + "logits/rejected": 47709176.88888889, + "logps/chosen": -312.25380161830356, + "logps/rejected": -381.14946831597223, + "loss": 0.0351, + "rewards/chosen": 1.0566960743495397, + "rewards/margins": 10.678171294076103, + "rewards/rejected": -9.621475219726562, + "step": 106 + }, + { + "epoch": 0.01955230698949292, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.998903417374228e-06, + "logits/chosen": -19765351.111111112, + "logits/rejected": 41099058.28571428, + "logps/chosen": -304.3002658420139, + "logps/rejected": -595.3833705357143, + "loss": 0.0695, + "rewards/chosen": 1.169258753458659, + "rewards/margins": 12.206085840861002, + "rewards/rejected": -11.036827087402344, + "step": 107 + }, + { + "epoch": 0.019735038830516218, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.998507448905917e-06, + "logits/chosen": -22659568.0, + "logits/rejected": -40998188.8, + "logps/chosen": -251.06953568892047, + "logps/rejected": -442.116650390625, + "loss": 0.0731, + "rewards/chosen": -0.15276223962957208, + "rewards/margins": 11.638527280634099, + "rewards/rejected": -11.791289520263671, + "step": 108 + }, + { + "epoch": 0.019917770671539516, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.998050575201772e-06, + "logits/chosen": -2681048.75, + "logits/rejected": 35689492.0, + "logps/chosen": -266.5792236328125, + "logps/rejected": -451.05401611328125, + "loss": 0.0516, + "rewards/chosen": 0.20415659248828888, + "rewards/margins": 9.250590041279793, + "rewards/rejected": -9.046433448791504, + "step": 109 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.997532801828659e-06, + "logits/chosen": -7905390.222222222, + "logits/rejected": 53790656.0, + "logps/chosen": -156.85923936631946, + "logps/rejected": -489.79554966517856, + "loss": 0.0249, + "rewards/chosen": 1.6843304104275174, + "rewards/margins": 12.438601781451512, + "rewards/rejected": -10.754271371023995, + "step": 110 + }, + { + "epoch": 0.020283234353586112, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.99695413509548e-06, + "logits/chosen": 61630336.0, + "logits/rejected": 27755360.0, + "logps/chosen": -350.1818115234375, + "logps/rejected": -519.4198330965909, + "loss": 0.0239, + "rewards/chosen": 0.7862878322601319, + "rewards/margins": 12.593311314149338, + "rewards/rejected": -11.807023481889205, + "step": 111 + }, + { + "epoch": 0.02046596619460941, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.996314582053106e-06, + "logits/chosen": 43933837.71428572, + "logits/rejected": 42701859.55555555, + "logps/chosen": -212.70884486607142, + "logps/rejected": -600.5417751736111, + "loss": 0.045, + "rewards/chosen": 0.1969508613858904, + "rewards/margins": 11.625579570967053, + "rewards/rejected": -11.428628709581163, + "step": 112 + }, + { + "epoch": 0.02064869803563271, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.995614150494293e-06, + "logits/chosen": -2217176.888888889, + "logits/rejected": 37728114.28571428, + "logps/chosen": -184.76308865017361, + "logps/rejected": -501.2035435267857, + "loss": 0.0691, + "rewards/chosen": -0.38805879486931694, + "rewards/margins": 8.826879777605573, + "rewards/rejected": -9.214938572474889, + "step": 113 + }, + { + "epoch": 0.020831429876656007, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.994852848953574e-06, + "logits/chosen": -9187934.4, + "logits/rejected": 57687674.666666664, + "logps/chosen": -205.9017822265625, + "logps/rejected": -310.98573811848956, + "loss": 0.0275, + "rewards/chosen": 1.8750038146972656, + "rewards/margins": 10.921371459960938, + "rewards/rejected": -9.046367645263672, + "step": 114 + }, + { + "epoch": 0.021014161717679305, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.994030686707171e-06, + "logits/chosen": -2629993.6666666665, + "logits/rejected": 37173392.0, + "logps/chosen": -227.42569986979166, + "logps/rejected": -512.703466796875, + "loss": 0.0225, + "rewards/chosen": 1.132891337076823, + "rewards/margins": 12.55215326944987, + "rewards/rejected": -11.419261932373047, + "step": 115 + }, + { + "epoch": 0.021196893558702603, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.993147673772869e-06, + "logits/chosen": 13853780.0, + "logits/rejected": -55690044.0, + "logps/chosen": -203.25863647460938, + "logps/rejected": -449.16131591796875, + "loss": 0.0296, + "rewards/chosen": 1.07320237159729, + "rewards/margins": 10.275444269180298, + "rewards/rejected": -9.202241897583008, + "step": 116 + }, + { + "epoch": 0.0213796253997259, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.992203820909906e-06, + "logits/chosen": 375226.0, + "logits/rejected": 30554112.0, + "logps/chosen": -198.4293670654297, + "logps/rejected": -608.4130859375, + "loss": 0.0569, + "rewards/chosen": -0.2875816524028778, + "rewards/margins": 11.851035863161087, + "rewards/rejected": -12.138617515563965, + "step": 117 + }, + { + "epoch": 0.0215623572407492, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.991199139618828e-06, + "logits/chosen": -82919848.0, + "logits/rejected": 8129475.0, + "logps/chosen": -203.22494506835938, + "logps/rejected": -492.85552978515625, + "loss": 0.0314, + "rewards/chosen": 1.3656760454177856, + "rewards/margins": 12.448919415473938, + "rewards/rejected": -11.083243370056152, + "step": 118 + }, + { + "epoch": 0.021745089081772497, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.990133642141359e-06, + "logits/chosen": -1748370.6666666667, + "logits/rejected": -34097257.14285714, + "logps/chosen": -303.6058756510417, + "logps/rejected": -229.27068219866072, + "loss": 0.0587, + "rewards/chosen": 0.3329376114739312, + "rewards/margins": 5.927888151199099, + "rewards/rejected": -5.5949505397251675, + "step": 119 + }, + { + "epoch": 0.0219278209227958, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.989007341460251e-06, + "logits/chosen": -20957076.0, + "logits/rejected": -9733178.0, + "logps/chosen": -292.73272705078125, + "logps/rejected": -420.2886962890625, + "loss": 0.0474, + "rewards/chosen": 0.17691576480865479, + "rewards/margins": 11.150305390357971, + "rewards/rejected": -10.973389625549316, + "step": 120 + }, + { + "epoch": 0.022110552763819097, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.987820251299121e-06, + "logits/chosen": -45196476.0, + "logits/rejected": 24584888.0, + "logps/chosen": -202.7886962890625, + "logps/rejected": -468.3090515136719, + "loss": 0.0389, + "rewards/chosen": 0.667470395565033, + "rewards/margins": 9.689128339290619, + "rewards/rejected": -9.021657943725586, + "step": 121 + }, + { + "epoch": 0.022293284604842395, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.98657238612229e-06, + "logits/chosen": 9374024.888888888, + "logits/rejected": -61742048.0, + "logps/chosen": -195.44489203559027, + "logps/rejected": -324.91932896205356, + "loss": 0.0441, + "rewards/chosen": 0.6865261395772299, + "rewards/margins": 7.3625787780398415, + "rewards/rejected": -6.676052638462612, + "step": 122 + }, + { + "epoch": 0.022476016445865693, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.985263761134602e-06, + "logits/chosen": -9946294.222222222, + "logits/rejected": 52699771.428571425, + "logps/chosen": -242.14111328125, + "logps/rejected": -473.83290318080356, + "loss": 0.0424, + "rewards/chosen": 0.8854984707302518, + "rewards/margins": 12.165775177970765, + "rewards/rejected": -11.280276707240514, + "step": 123 + }, + { + "epoch": 0.02265874828688899, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.983894392281237e-06, + "logits/chosen": 45211456.0, + "logits/rejected": 7341648.888888889, + "logps/chosen": -144.06535993303572, + "logps/rejected": -579.1583658854166, + "loss": 0.0186, + "rewards/chosen": 2.469797134399414, + "rewards/margins": 12.812082290649414, + "rewards/rejected": -10.34228515625, + "step": 124 + }, + { + "epoch": 0.02284148012791229, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.982464296247523e-06, + "logits/chosen": -31675536.0, + "logits/rejected": 4811034.4, + "logps/chosen": -179.7110392252604, + "logps/rejected": -432.57939453125, + "loss": 0.0272, + "rewards/chosen": 1.7898502349853516, + "rewards/margins": 10.451859664916991, + "rewards/rejected": -8.66200942993164, + "step": 125 + }, + { + "epoch": 0.023024211968935587, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.980973490458728e-06, + "logits/chosen": -46816536.615384616, + "logits/rejected": 142720224.0, + "logps/chosen": -255.6627478966346, + "logps/rejected": -909.1883951822916, + "loss": 0.0739, + "rewards/chosen": 0.43802239344670224, + "rewards/margins": 16.219834425510506, + "rewards/rejected": -15.781812032063803, + "step": 126 + }, + { + "epoch": 0.023206943809958885, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.979421993079853e-06, + "logits/chosen": -16367797.714285715, + "logits/rejected": -95168305.77777778, + "logps/chosen": -210.73611886160714, + "logps/rejected": -532.7796766493055, + "loss": 0.0358, + "rewards/chosen": 0.9125944546290806, + "rewards/margins": 9.725546284327432, + "rewards/rejected": -8.812951829698351, + "step": 127 + }, + { + "epoch": 0.023389675650982183, + "grad_norm": 10.1875, + "kl": 0.07001352310180664, + "learning_rate": 9.9778098230154e-06, + "logits/chosen": -15346250.666666666, + "logits/rejected": -34208768.0, + "logps/chosen": -178.3120320638021, + "logps/rejected": -753.46533203125, + "loss": 0.0449, + "rewards/chosen": 1.468371868133545, + "rewards/margins": 10.970693111419678, + "rewards/rejected": -9.502321243286133, + "step": 128 + }, + { + "epoch": 0.02357240749200548, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.976136999909156e-06, + "logits/chosen": -12844988.0, + "logits/rejected": 20317320.0, + "logps/chosen": -362.83270263671875, + "logps/rejected": -425.5692545572917, + "loss": 0.0228, + "rewards/chosen": 0.9538429379463196, + "rewards/margins": 9.774348080158234, + "rewards/rejected": -8.820505142211914, + "step": 129 + }, + { + "epoch": 0.02375513933302878, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.974403544143942e-06, + "logits/chosen": -65566408.0, + "logits/rejected": 62547320.0, + "logps/chosen": -204.61776733398438, + "logps/rejected": -538.25439453125, + "loss": 0.0325, + "rewards/chosen": 2.2856264114379883, + "rewards/margins": 14.243976593017578, + "rewards/rejected": -11.95835018157959, + "step": 130 + }, + { + "epoch": 0.023937871174052078, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.972609476841368e-06, + "logits/chosen": -45865781.333333336, + "logits/rejected": 63436313.6, + "logps/chosen": -168.7638142903646, + "logps/rejected": -403.56806640625, + "loss": 0.0176, + "rewards/chosen": 1.53853972752889, + "rewards/margins": 10.975240929921469, + "rewards/rejected": -9.436701202392578, + "step": 131 + }, + { + "epoch": 0.024120603015075376, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.970754819861577e-06, + "logits/chosen": -99302440.0, + "logits/rejected": 10557010.0, + "logps/chosen": -311.8251953125, + "logps/rejected": -477.821533203125, + "loss": 0.0525, + "rewards/chosen": 0.03691141679883003, + "rewards/margins": 8.711327958852053, + "rewards/rejected": -8.674416542053223, + "step": 132 + }, + { + "epoch": 0.024303334856098674, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.968839595802982e-06, + "logits/chosen": -17955239.111111112, + "logits/rejected": 40892964.571428575, + "logps/chosen": -265.50591362847223, + "logps/rejected": -425.21498325892856, + "loss": 0.0423, + "rewards/chosen": 0.8885628382364908, + "rewards/margins": 7.299412273225331, + "rewards/rejected": -6.410849434988839, + "step": 133 + }, + { + "epoch": 0.024486066697121972, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.966863828001982e-06, + "logits/chosen": -20493070.0, + "logits/rejected": -28061472.0, + "logps/chosen": -242.3866424560547, + "logps/rejected": -333.1738586425781, + "loss": 0.0683, + "rewards/chosen": 0.5944722294807434, + "rewards/margins": 7.844681560993195, + "rewards/rejected": -7.250209331512451, + "step": 134 + }, + { + "epoch": 0.02466879853814527, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.964827540532685e-06, + "logits/chosen": 18096723.2, + "logits/rejected": -64990807.27272727, + "logps/chosen": -122.62847900390625, + "logps/rejected": -443.97580788352275, + "loss": 0.0626, + "rewards/chosen": -0.05418791770935059, + "rewards/margins": 7.402644968032837, + "rewards/rejected": -7.4568328857421875, + "step": 135 + }, + { + "epoch": 0.02485153037916857, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.962730758206612e-06, + "logits/chosen": 16501221.714285715, + "logits/rejected": 2570867.111111111, + "logps/chosen": -213.52392578125, + "logps/rejected": -413.6935763888889, + "loss": 0.0574, + "rewards/chosen": 0.31593670163835796, + "rewards/margins": 7.523489293597994, + "rewards/rejected": -7.207552591959636, + "step": 136 + }, + { + "epoch": 0.02503426222019187, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.960573506572391e-06, + "logits/chosen": -16532584.0, + "logits/rejected": -24597337.6, + "logps/chosen": -141.91285196940103, + "logps/rejected": -538.758544921875, + "loss": 0.039, + "rewards/chosen": -0.12036831180254619, + "rewards/margins": 9.787748601039251, + "rewards/rejected": -9.908116912841797, + "step": 137 + }, + { + "epoch": 0.025216994061215168, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.958355811915452e-06, + "logits/chosen": 17729986.285714287, + "logits/rejected": -8879232.0, + "logps/chosen": -223.06019810267858, + "logps/rejected": -524.7058376736111, + "loss": 0.0302, + "rewards/chosen": 1.1198686872209822, + "rewards/margins": 10.349999200730098, + "rewards/rejected": -9.230130513509115, + "step": 138 + }, + { + "epoch": 0.025399725902238466, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.95607770125771e-06, + "logits/chosen": -49725911.27272727, + "logits/rejected": 77200985.6, + "logps/chosen": -187.3395108309659, + "logps/rejected": -462.8265625, + "loss": 0.0538, + "rewards/chosen": 1.112017891623757, + "rewards/margins": 8.068951485373757, + "rewards/rejected": -6.95693359375, + "step": 139 + }, + { + "epoch": 0.025582457743261764, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.953739202357219e-06, + "logits/chosen": 40394208.0, + "logits/rejected": -12492381.333333334, + "logps/chosen": -328.3868103027344, + "logps/rejected": -395.240234375, + "loss": 0.0268, + "rewards/chosen": 0.6951263546943665, + "rewards/margins": 9.076240042845408, + "rewards/rejected": -8.381113688151041, + "step": 140 + }, + { + "epoch": 0.025765189584285062, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.951340343707852e-06, + "logits/chosen": -18110290.666666668, + "logits/rejected": -59622470.4, + "logps/chosen": -213.19305419921875, + "logps/rejected": -289.939453125, + "loss": 0.0405, + "rewards/chosen": 0.12134170532226562, + "rewards/margins": 7.7242431640625, + "rewards/rejected": -7.602901458740234, + "step": 141 + }, + { + "epoch": 0.02594792142530836, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.948881154538946e-06, + "logits/chosen": -40935928.88888889, + "logits/rejected": 14432745.142857144, + "logps/chosen": -254.48125542534723, + "logps/rejected": -628.7952706473214, + "loss": 0.0569, + "rewards/chosen": 0.21923213534884983, + "rewards/margins": 12.136303886534677, + "rewards/rejected": -11.917071751185826, + "step": 142 + }, + { + "epoch": 0.02613065326633166, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.946361664814942e-06, + "logits/chosen": -19903022.0, + "logits/rejected": -21471688.0, + "logps/chosen": -250.59359741210938, + "logps/rejected": -362.408935546875, + "loss": 0.0472, + "rewards/chosen": 0.4167897701263428, + "rewards/margins": 9.261382818222046, + "rewards/rejected": -8.844593048095703, + "step": 143 + }, + { + "epoch": 0.026313385107354956, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.94378190523503e-06, + "logits/chosen": -33258147.555555556, + "logits/rejected": 32312660.57142857, + "logps/chosen": -159.10727267795139, + "logps/rejected": -532.9353376116071, + "loss": 0.0364, + "rewards/chosen": 1.1320035722520616, + "rewards/margins": 13.126503868708536, + "rewards/rejected": -11.994500296456474, + "step": 144 + }, + { + "epoch": 0.026496116948378255, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.941141907232766e-06, + "logits/chosen": -46186656.0, + "logits/rejected": -43647700.0, + "logps/chosen": -351.12054443359375, + "logps/rejected": -486.73455810546875, + "loss": 0.0419, + "rewards/chosen": 0.7325439453125, + "rewards/margins": 10.345346450805664, + "rewards/rejected": -9.612802505493164, + "step": 145 + }, + { + "epoch": 0.026678848789401553, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.938441702975689e-06, + "logits/chosen": -79972480.0, + "logits/rejected": -31790862.222222224, + "logps/chosen": -348.82603236607144, + "logps/rejected": -585.2079535590278, + "loss": 0.0458, + "rewards/chosen": 0.0698665337903159, + "rewards/margins": 11.601745534983893, + "rewards/rejected": -11.531879001193577, + "step": 146 + }, + { + "epoch": 0.02686158063042485, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.93568132536494e-06, + "logits/chosen": -30893907.2, + "logits/rejected": -56223162.666666664, + "logps/chosen": -287.192822265625, + "logps/rejected": -263.88462320963544, + "loss": 0.0686, + "rewards/chosen": -0.14345015287399293, + "rewards/margins": 8.206547685464225, + "rewards/rejected": -8.349997838338217, + "step": 147 + }, + { + "epoch": 0.02704431247144815, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.932860808034847e-06, + "logits/chosen": -116337088.0, + "logits/rejected": 25376048.0, + "logps/chosen": -193.46401977539062, + "logps/rejected": -535.2298990885416, + "loss": 0.0183, + "rewards/chosen": 0.9462637305259705, + "rewards/margins": 13.768901924292246, + "rewards/rejected": -12.822638193766275, + "step": 148 + }, + { + "epoch": 0.027227044312471447, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 9.929980185352525e-06, + "logits/chosen": 28538796.0, + "logits/rejected": -40653112.0, + "logps/chosen": -234.37571716308594, + "logps/rejected": -276.44415283203125, + "loss": 0.0336, + "rewards/chosen": 1.3265533447265625, + "rewards/margins": 10.687811851501465, + "rewards/rejected": -9.361258506774902, + "step": 149 + }, + { + "epoch": 0.027409776153494745, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.927039492417452e-06, + "logits/chosen": -19731898.666666668, + "logits/rejected": -40777906.28571428, + "logps/chosen": -303.3791775173611, + "logps/rejected": -387.57686941964283, + "loss": 0.0433, + "rewards/chosen": 0.7278247939215766, + "rewards/margins": 8.002785448044065, + "rewards/rejected": -7.274960654122489, + "step": 150 + }, + { + "epoch": 0.027592507994518043, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 9.924038765061042e-06, + "logits/chosen": -57043101.09090909, + "logits/rejected": 59331129.6, + "logps/chosen": -224.5096102627841, + "logps/rejected": -517.466259765625, + "loss": 0.042, + "rewards/chosen": 1.1913417469371448, + "rewards/margins": 10.57372623790394, + "rewards/rejected": -9.382384490966796, + "step": 151 + }, + { + "epoch": 0.027775239835541345, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.92097803984621e-06, + "logits/chosen": -29139027.2, + "logits/rejected": 2807276.0, + "logps/chosen": -268.7079345703125, + "logps/rejected": -569.3520063920455, + "loss": 0.0255, + "rewards/chosen": 0.44924259185791016, + "rewards/margins": 11.564605972983621, + "rewards/rejected": -11.11536338112571, + "step": 152 + }, + { + "epoch": 0.027957971676564643, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.91785735406693e-06, + "logits/chosen": -44307004.44444445, + "logits/rejected": -26356009.14285714, + "logps/chosen": -231.24962022569446, + "logps/rejected": -541.3379255022321, + "loss": 0.0499, + "rewards/chosen": 0.40531084272596574, + "rewards/margins": 12.06020717015342, + "rewards/rejected": -11.654896327427455, + "step": 153 + }, + { + "epoch": 0.02814070351758794, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.914676745747772e-06, + "logits/chosen": 8037914.181818182, + "logits/rejected": 18653041.6, + "logps/chosen": -235.87193714488637, + "logps/rejected": -197.0929443359375, + "loss": 0.0608, + "rewards/chosen": 0.5260301069779829, + "rewards/margins": 7.098288102583451, + "rewards/rejected": -6.572257995605469, + "step": 154 + }, + { + "epoch": 0.02832343535861124, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.911436253643445e-06, + "logits/chosen": 40451220.0, + "logits/rejected": -37326276.0, + "logps/chosen": -263.40869140625, + "logps/rejected": -755.1126098632812, + "loss": 0.0533, + "rewards/chosen": 0.12844203412532806, + "rewards/margins": 11.692680582404137, + "rewards/rejected": -11.564238548278809, + "step": 155 + }, + { + "epoch": 0.028506167199634537, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.908135917238321e-06, + "logits/chosen": -23567688.0, + "logits/rejected": -9131751.384615384, + "logps/chosen": -187.83158365885416, + "logps/rejected": -413.91702974759613, + "loss": 0.0304, + "rewards/chosen": -0.033045957485834755, + "rewards/margins": 8.295887171457975, + "rewards/rejected": -8.32893312894381, + "step": 156 + }, + { + "epoch": 0.028688899040657835, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.904775776745959e-06, + "logits/chosen": -58022776.0, + "logits/rejected": 4303653.5, + "logps/chosen": -176.41183471679688, + "logps/rejected": -441.13043212890625, + "loss": 0.025, + "rewards/chosen": 1.453553318977356, + "rewards/margins": 9.714136242866516, + "rewards/rejected": -8.26058292388916, + "step": 157 + }, + { + "epoch": 0.028871630881681133, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.901355873108611e-06, + "logits/chosen": -41874574.222222224, + "logits/rejected": -33911968.0, + "logps/chosen": -230.99560546875, + "logps/rejected": -469.05067661830356, + "loss": 0.0331, + "rewards/chosen": 1.6582782533433702, + "rewards/margins": 11.945867190285334, + "rewards/rejected": -10.287588936941964, + "step": 158 + }, + { + "epoch": 0.02905436272270443, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.89787624799672e-06, + "logits/chosen": 15638672.0, + "logits/rejected": -25559271.111111112, + "logps/chosen": -250.52936662946428, + "logps/rejected": -399.26714409722223, + "loss": 0.0396, + "rewards/chosen": 0.26315505164010183, + "rewards/margins": 8.076995281946092, + "rewards/rejected": -7.813840230305989, + "step": 159 + }, + { + "epoch": 0.02923709456372773, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.894336943808426e-06, + "logits/chosen": 4406545.333333333, + "logits/rejected": -13619300.8, + "logps/chosen": -269.4041341145833, + "logps/rejected": -483.96474609375, + "loss": 0.0604, + "rewards/chosen": -0.3536723057428996, + "rewards/margins": 7.6330891688664755, + "rewards/rejected": -7.986761474609375, + "step": 160 + }, + { + "epoch": 0.029419826404751027, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.890738003669029e-06, + "logits/chosen": -52975224.88888889, + "logits/rejected": -25453291.42857143, + "logps/chosen": -198.29935709635416, + "logps/rejected": -427.6552734375, + "loss": 0.0564, + "rewards/chosen": 1.5935664706759982, + "rewards/margins": 10.187437996031747, + "rewards/rejected": -8.593871525355748, + "step": 161 + }, + { + "epoch": 0.029602558245774326, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.887079471430481e-06, + "logits/chosen": -43349641.14285714, + "logits/rejected": 6427946.666666667, + "logps/chosen": -323.26475306919644, + "logps/rejected": -462.98621961805554, + "loss": 0.0381, + "rewards/chosen": 0.38401079177856445, + "rewards/margins": 10.87151214811537, + "rewards/rejected": -10.487501356336805, + "step": 162 + }, + { + "epoch": 0.029785290086797624, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.883361391670841e-06, + "logits/chosen": 33870834.28571428, + "logits/rejected": 18836433.777777776, + "logps/chosen": -262.24094935825894, + "logps/rejected": -415.39019097222223, + "loss": 0.0269, + "rewards/chosen": 1.5279685429164342, + "rewards/margins": 10.559711910429456, + "rewards/rejected": -9.031743367513021, + "step": 163 + }, + { + "epoch": 0.02996802192782092, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.879583809693737e-06, + "logits/chosen": -15065575.111111112, + "logits/rejected": 49177124.571428575, + "logps/chosen": -140.16392686631946, + "logps/rejected": -447.26021902901783, + "loss": 0.0536, + "rewards/chosen": 0.27817802959018284, + "rewards/margins": 13.295284222042751, + "rewards/rejected": -13.017106192452568, + "step": 164 + }, + { + "epoch": 0.03015075376884422, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.875746771527817e-06, + "logits/chosen": -21322064.0, + "logits/rejected": -13678916.0, + "logps/chosen": -218.57144165039062, + "logps/rejected": -307.1365661621094, + "loss": 0.0403, + "rewards/chosen": 0.8006727695465088, + "rewards/margins": 10.560196161270142, + "rewards/rejected": -9.759523391723633, + "step": 165 + }, + { + "epoch": 0.030333485609867518, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.871850323926178e-06, + "logits/chosen": 17016081.777777776, + "logits/rejected": 36596064.0, + "logps/chosen": -238.42217339409723, + "logps/rejected": -476.06954520089283, + "loss": 0.0382, + "rewards/chosen": 0.7989939583672417, + "rewards/margins": 12.360450911143468, + "rewards/rejected": -11.561456952776227, + "step": 166 + }, + { + "epoch": 0.030516217450890816, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 9.867894514365802e-06, + "logits/chosen": -67007539.2, + "logits/rejected": 21586718.545454547, + "logps/chosen": -207.189501953125, + "logps/rejected": -452.0479847301136, + "loss": 0.0043, + "rewards/chosen": 3.3779891967773437, + "rewards/margins": 13.73011211048473, + "rewards/rejected": -10.352122913707387, + "step": 167 + }, + { + "epoch": 0.030698949291914118, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.863879391046985e-06, + "logits/chosen": -18489427.555555556, + "logits/rejected": -17043596.57142857, + "logps/chosen": -296.30577256944446, + "logps/rejected": -237.63996233258928, + "loss": 0.0664, + "rewards/chosen": 0.7855969534979926, + "rewards/margins": 6.671243228609599, + "rewards/rejected": -5.885646275111607, + "step": 168 + }, + { + "epoch": 0.030881681132937416, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.859805002892733e-06, + "logits/chosen": -14445072.0, + "logits/rejected": 2436061.3333333335, + "logps/chosen": -380.5338657924107, + "logps/rejected": -487.66346571180554, + "loss": 0.0396, + "rewards/chosen": 0.5325230189732143, + "rewards/margins": 11.969546605670262, + "rewards/rejected": -11.437023586697048, + "step": 169 + }, + { + "epoch": 0.031064412973960714, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.85567139954818e-06, + "logits/chosen": -63404448.0, + "logits/rejected": 9063674.666666666, + "logps/chosen": -265.45458984375, + "logps/rejected": -513.2650282118055, + "loss": 0.0345, + "rewards/chosen": 0.567058903830392, + "rewards/margins": 13.263805730002266, + "rewards/rejected": -12.696746826171875, + "step": 170 + }, + { + "epoch": 0.031247144814984012, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.851478631379982e-06, + "logits/chosen": 43092930.666666664, + "logits/rejected": 18036876.0, + "logps/chosen": -210.1650390625, + "logps/rejected": -598.5806274414062, + "loss": 0.0753, + "rewards/chosen": 0.042099197705586754, + "rewards/margins": 12.836153229077658, + "rewards/rejected": -12.79405403137207, + "step": 171 + }, + { + "epoch": 0.031429876656007306, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.847226749475696e-06, + "logits/chosen": 13212916.363636363, + "logits/rejected": -8226036.8, + "logps/chosen": -210.6290838068182, + "logps/rejected": -393.9312255859375, + "loss": 0.055, + "rewards/chosen": 1.384822585365989, + "rewards/margins": 11.515863539955832, + "rewards/rejected": -10.131040954589844, + "step": 172 + }, + { + "epoch": 0.031612608497030605, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.842915805643156e-06, + "logits/chosen": -31852564.57142857, + "logits/rejected": -29565582.222222224, + "logps/chosen": -391.40781947544644, + "logps/rejected": -451.2497829861111, + "loss": 0.0307, + "rewards/chosen": 0.8976527622767857, + "rewards/margins": 12.211492144872272, + "rewards/rejected": -11.313839382595486, + "step": 173 + }, + { + "epoch": 0.0317953403380539, + "grad_norm": 11.3125, + "kl": 1.0289344787597656, + "learning_rate": 9.838545852409857e-06, + "logits/chosen": 1668840.0, + "logits/rejected": -4132759.6666666665, + "logps/chosen": -221.2080810546875, + "logps/rejected": -506.0093180338542, + "loss": 0.0663, + "rewards/chosen": 0.23057959079742432, + "rewards/margins": 16.29253408908844, + "rewards/rejected": -16.061954498291016, + "step": 174 + }, + { + "epoch": 0.03197807217907721, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.834116943022299e-06, + "logits/chosen": -80290887.1111111, + "logits/rejected": 35682646.85714286, + "logps/chosen": -325.93006727430554, + "logps/rejected": -393.14439174107144, + "loss": 0.037, + "rewards/chosen": 1.2385315365261502, + "rewards/margins": 13.70615347604903, + "rewards/rejected": -12.46762193952288, + "step": 175 + }, + { + "epoch": 0.032160804020100506, + "grad_norm": 10.375, + "kl": 0.06754875183105469, + "learning_rate": 9.829629131445342e-06, + "logits/chosen": -65272328.0, + "logits/rejected": -45008160.0, + "logps/chosen": -246.46725463867188, + "logps/rejected": -581.7747802734375, + "loss": 0.0428, + "rewards/chosen": 0.7156017422676086, + "rewards/margins": 12.126186192035675, + "rewards/rejected": -11.410584449768066, + "step": 176 + }, + { + "epoch": 0.032343535861123804, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.825082472361558e-06, + "logits/chosen": -52046614.85714286, + "logits/rejected": -27206197.333333332, + "logps/chosen": -335.16420200892856, + "logps/rejected": -472.43739149305554, + "loss": 0.0288, + "rewards/chosen": 1.2027976172310966, + "rewards/margins": 13.836552771310958, + "rewards/rejected": -12.63375515407986, + "step": 177 + }, + { + "epoch": 0.0325262677021471, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 9.82047702117055e-06, + "logits/chosen": -19352465.6, + "logits/rejected": -8666183.333333334, + "logps/chosen": -224.7473388671875, + "logps/rejected": -354.0017496744792, + "loss": 0.0843, + "rewards/chosen": 0.054957568645477295, + "rewards/margins": 10.34482624133428, + "rewards/rejected": -10.289868672688803, + "step": 178 + }, + { + "epoch": 0.0327089995431704, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 9.815812833988292e-06, + "logits/chosen": -19430905.14285714, + "logits/rejected": -5889530.666666667, + "logps/chosen": -112.91384451729911, + "logps/rejected": -500.7010091145833, + "loss": 0.0227, + "rewards/chosen": 2.000849723815918, + "rewards/margins": 17.110204378763832, + "rewards/rejected": -15.109354654947916, + "step": 179 + }, + { + "epoch": 0.0328917313841937, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.811089967646427e-06, + "logits/chosen": -48971741.09090909, + "logits/rejected": -46747500.8, + "logps/chosen": -305.60378196022725, + "logps/rejected": -524.29130859375, + "loss": 0.063, + "rewards/chosen": 0.662750244140625, + "rewards/margins": 10.367561340332031, + "rewards/rejected": -9.704811096191406, + "step": 180 + }, + { + "epoch": 0.033074463225216996, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.806308479691595e-06, + "logits/chosen": -5257199.2, + "logits/rejected": -15931578.666666666, + "logps/chosen": -232.782470703125, + "logps/rejected": -383.7196451822917, + "loss": 0.0626, + "rewards/chosen": 0.03214998841285706, + "rewards/margins": 11.146966018279395, + "rewards/rejected": -11.114816029866537, + "step": 181 + }, + { + "epoch": 0.033257195066240294, + "grad_norm": 12.5625, + "kl": 0.18530845642089844, + "learning_rate": 9.801468428384716e-06, + "logits/chosen": -57171481.6, + "logits/rejected": -78634005.33333333, + "logps/chosen": -259.4974365234375, + "logps/rejected": -381.4061686197917, + "loss": 0.0512, + "rewards/chosen": 0.8064211845397949, + "rewards/margins": 11.525627676645914, + "rewards/rejected": -10.71920649210612, + "step": 182 + }, + { + "epoch": 0.03343992690726359, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.796569872700287e-06, + "logits/chosen": -40673688.88888889, + "logits/rejected": -73377709.71428572, + "logps/chosen": -214.70838758680554, + "logps/rejected": -450.4144810267857, + "loss": 0.0381, + "rewards/chosen": 1.1711487240261502, + "rewards/margins": 13.835245011344789, + "rewards/rejected": -12.664096287318639, + "step": 183 + }, + { + "epoch": 0.03362265874828689, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 9.791612872325667e-06, + "logits/chosen": -26362646.4, + "logits/rejected": 8563294.545454545, + "logps/chosen": -191.083984375, + "logps/rejected": -404.68661221590907, + "loss": 0.031, + "rewards/chosen": 0.9333098411560059, + "rewards/margins": 8.754247951507569, + "rewards/rejected": -7.8209381103515625, + "step": 184 + }, + { + "epoch": 0.03380539058931019, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 9.786597487660336e-06, + "logits/chosen": -62268529.777777776, + "logits/rejected": -16005750.857142856, + "logps/chosen": -231.40679253472223, + "logps/rejected": -567.0012555803571, + "loss": 0.0299, + "rewards/chosen": 1.662965562608507, + "rewards/margins": 13.948931618342325, + "rewards/rejected": -12.285966055733818, + "step": 185 + }, + { + "epoch": 0.03398812243033349, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.781523779815178e-06, + "logits/chosen": -5133692.0, + "logits/rejected": 62337152.0, + "logps/chosen": -253.9652587890625, + "logps/rejected": -463.3851725260417, + "loss": 0.054, + "rewards/chosen": 0.7464311599731446, + "rewards/margins": 12.105336952209473, + "rewards/rejected": -11.358905792236328, + "step": 186 + }, + { + "epoch": 0.034170854271356785, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.776391810611719e-06, + "logits/chosen": -32841405.333333332, + "logits/rejected": 4731927.2, + "logps/chosen": -417.6439615885417, + "logps/rejected": -454.07548828125, + "loss": 0.0252, + "rewards/chosen": 0.7369430859883627, + "rewards/margins": 13.015973695119223, + "rewards/rejected": -12.27903060913086, + "step": 187 + }, + { + "epoch": 0.03435358611238008, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.771201642581384e-06, + "logits/chosen": -54806880.0, + "logits/rejected": -66316820.0, + "logps/chosen": -258.4720153808594, + "logps/rejected": -570.376953125, + "loss": 0.0445, + "rewards/chosen": 0.37782976031303406, + "rewards/margins": 13.280707567930222, + "rewards/rejected": -12.902877807617188, + "step": 188 + }, + { + "epoch": 0.03453631795340338, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.765953338964736e-06, + "logits/chosen": -24818112.0, + "logits/rejected": 3460893.2, + "logps/chosen": -218.0700480143229, + "logps/rejected": -427.183447265625, + "loss": 0.0319, + "rewards/chosen": 0.8549261093139648, + "rewards/margins": 14.461820030212403, + "rewards/rejected": -13.606893920898438, + "step": 189 + }, + { + "epoch": 0.03471904979442668, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.760646963710694e-06, + "logits/chosen": -13395668.444444444, + "logits/rejected": -93222564.57142857, + "logps/chosen": -251.12144639756946, + "logps/rejected": -615.9485212053571, + "loss": 0.0397, + "rewards/chosen": 0.8633543650309244, + "rewards/margins": 14.796524138677688, + "rewards/rejected": -13.933169773646764, + "step": 190 + }, + { + "epoch": 0.03490178163544998, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.755282581475769e-06, + "logits/chosen": -27684288.0, + "logits/rejected": -19029204.8, + "logps/chosen": -194.96113725142047, + "logps/rejected": -340.7255126953125, + "loss": 0.0469, + "rewards/chosen": 1.2604696967385032, + "rewards/margins": 8.162573883750222, + "rewards/rejected": -6.902104187011719, + "step": 191 + }, + { + "epoch": 0.035084513476473275, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.749860257623262e-06, + "logits/chosen": -64808584.0, + "logits/rejected": 7233011.0, + "logps/chosen": -346.5712585449219, + "logps/rejected": -601.621826171875, + "loss": 0.0477, + "rewards/chosen": -0.011472329497337341, + "rewards/margins": 15.098029509186745, + "rewards/rejected": -15.109501838684082, + "step": 192 + }, + { + "epoch": 0.03526724531749657, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.744380058222483e-06, + "logits/chosen": -60967267.55555555, + "logits/rejected": 7443500.0, + "logps/chosen": -219.72913953993054, + "logps/rejected": -427.1424037388393, + "loss": 0.0438, + "rewards/chosen": 1.002476692199707, + "rewards/margins": 14.223971094403948, + "rewards/rejected": -13.221494402204241, + "step": 193 + }, + { + "epoch": 0.03544997715851987, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.73884205004793e-06, + "logits/chosen": 20569312.0, + "logits/rejected": -1274779.5, + "logps/chosen": -289.79229736328125, + "logps/rejected": -567.1852416992188, + "loss": 0.0337, + "rewards/chosen": 1.1757196187973022, + "rewards/margins": 15.353518843650818, + "rewards/rejected": -14.177799224853516, + "step": 194 + }, + { + "epoch": 0.03563270899954317, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.733246300578482e-06, + "logits/chosen": -841107.4285714285, + "logits/rejected": 11518600.888888888, + "logps/chosen": -228.26988002232142, + "logps/rejected": -347.89971245659723, + "loss": 0.0273, + "rewards/chosen": 1.1577982221330916, + "rewards/margins": 9.554428403339688, + "rewards/rejected": -8.396630181206596, + "step": 195 + }, + { + "epoch": 0.03581544084056647, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.727592877996585e-06, + "logits/chosen": -55041700.0, + "logits/rejected": 15745901.0, + "logps/chosen": -261.9208068847656, + "logps/rejected": -517.54833984375, + "loss": 0.0383, + "rewards/chosen": 1.2090716361999512, + "rewards/margins": 11.772570133209229, + "rewards/rejected": -10.563498497009277, + "step": 196 + }, + { + "epoch": 0.035998172681589766, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.721881851187406e-06, + "logits/chosen": -11305482.666666666, + "logits/rejected": -10742916.57142857, + "logps/chosen": -187.72634548611111, + "logps/rejected": -309.48758370535717, + "loss": 0.0807, + "rewards/chosen": 0.6141553454928927, + "rewards/margins": 7.774340478200761, + "rewards/rejected": -7.160185132707868, + "step": 197 + }, + { + "epoch": 0.036180904522613064, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.716113289738005e-06, + "logits/chosen": -14488449.777777778, + "logits/rejected": 30956779.42857143, + "logps/chosen": -156.42909071180554, + "logps/rejected": -590.9328264508929, + "loss": 0.0461, + "rewards/chosen": 0.6735216776529948, + "rewards/margins": 11.88156981695266, + "rewards/rejected": -11.208048139299665, + "step": 198 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.710287263936485e-06, + "logits/chosen": -45437156.571428575, + "logits/rejected": -25214794.666666668, + "logps/chosen": -291.0449916294643, + "logps/rejected": -277.23320855034723, + "loss": 0.0424, + "rewards/chosen": 0.1747100693838937, + "rewards/margins": 9.204261257534935, + "rewards/rejected": -9.029551188151041, + "step": 199 + }, + { + "epoch": 0.03654636820465966, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.704403844771128e-06, + "logits/chosen": -53797952.0, + "logits/rejected": 13878934.222222222, + "logps/chosen": -214.87843540736608, + "logps/rejected": -515.4629991319445, + "loss": 0.0243, + "rewards/chosen": 1.6179282324654716, + "rewards/margins": 12.649615651085263, + "rewards/rejected": -11.031687418619791, + "step": 200 + }, + { + "epoch": 0.03672910004568296, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.698463103929542e-06, + "logits/chosen": -22271917.714285713, + "logits/rejected": -33660088.88888889, + "logps/chosen": -268.0447998046875, + "logps/rejected": -436.80677625868054, + "loss": 0.0326, + "rewards/chosen": 1.0749896594456263, + "rewards/margins": 10.880796985020712, + "rewards/rejected": -9.805807325575087, + "step": 201 + }, + { + "epoch": 0.036911831886706256, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.69246511379778e-06, + "logits/chosen": -56770704.0, + "logits/rejected": -43380194.461538464, + "logps/chosen": -327.81345621744794, + "logps/rejected": -471.6627854567308, + "loss": 0.0181, + "rewards/chosen": 0.11525549491246541, + "rewards/margins": 10.303548951943716, + "rewards/rejected": -10.18829345703125, + "step": 202 + }, + { + "epoch": 0.037094563727729554, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.68640994745946e-06, + "logits/chosen": -27835845.818181816, + "logits/rejected": 26238947.2, + "logps/chosen": -278.74214311079544, + "logps/rejected": -532.2619140625, + "loss": 0.0581, + "rewards/chosen": 0.9110428203235973, + "rewards/margins": 11.649235569347034, + "rewards/rejected": -10.738192749023437, + "step": 203 + }, + { + "epoch": 0.03727729556875285, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.680297678694867e-06, + "logits/chosen": -16546858.666666666, + "logits/rejected": 4772559.428571428, + "logps/chosen": -259.1062282986111, + "logps/rejected": -484.52256556919644, + "loss": 0.0328, + "rewards/chosen": 1.6422932942708333, + "rewards/margins": 14.314368838355655, + "rewards/rejected": -12.672075544084821, + "step": 204 + }, + { + "epoch": 0.03746002740977615, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 9.674128381980073e-06, + "logits/chosen": -14462457.6, + "logits/rejected": -70404069.33333333, + "logps/chosen": -238.1029541015625, + "logps/rejected": -578.4507242838541, + "loss": 0.0756, + "rewards/chosen": 0.6216578006744384, + "rewards/margins": 12.460792016983032, + "rewards/rejected": -11.839134216308594, + "step": 205 + }, + { + "epoch": 0.03764275925079945, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.667902132486009e-06, + "logits/chosen": -6270232.4, + "logits/rejected": 40026048.0, + "logps/chosen": -268.4323974609375, + "logps/rejected": -500.5160725911458, + "loss": 0.0569, + "rewards/chosen": 0.6220425605773926, + "rewards/margins": 8.599027856190999, + "rewards/rejected": -7.9769852956136065, + "step": 206 + }, + { + "epoch": 0.037825491091822754, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.661619006077562e-06, + "logits/chosen": -63066762.666666664, + "logits/rejected": -3959808.3076923075, + "logps/chosen": -97.690185546875, + "logps/rejected": -390.73583984375, + "loss": 0.0142, + "rewards/chosen": 1.2894210815429688, + "rewards/margins": 10.110675518329327, + "rewards/rejected": -8.821254436786358, + "step": 207 + }, + { + "epoch": 0.03800822293284605, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.655279079312643e-06, + "logits/chosen": -7003652.363636363, + "logits/rejected": -481204.8, + "logps/chosen": -224.11381392045453, + "logps/rejected": -288.6005859375, + "loss": 0.0908, + "rewards/chosen": 0.4471063180403276, + "rewards/margins": 7.1814569039778275, + "rewards/rejected": -6.7343505859375, + "step": 208 + }, + { + "epoch": 0.03819095477386935, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.648882429441258e-06, + "logits/chosen": -42037464.0, + "logits/rejected": -53379744.0, + "logps/chosen": -229.91390991210938, + "logps/rejected": -498.0215759277344, + "loss": 0.0349, + "rewards/chosen": 0.9505839943885803, + "rewards/margins": 9.821915209293365, + "rewards/rejected": -8.871331214904785, + "step": 209 + }, + { + "epoch": 0.03837368661489265, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.642429134404568e-06, + "logits/chosen": -51373207.27272727, + "logits/rejected": -19388353.6, + "logps/chosen": -285.21171431107956, + "logps/rejected": -305.486376953125, + "loss": 0.0567, + "rewards/chosen": 0.7329272356900302, + "rewards/margins": 10.53462172421542, + "rewards/rejected": -9.80169448852539, + "step": 210 + }, + { + "epoch": 0.038556418455915946, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.635919272833938e-06, + "logits/chosen": -325739.1111111111, + "logits/rejected": 6160974.857142857, + "logps/chosen": -269.47547743055554, + "logps/rejected": -512.673828125, + "loss": 0.031, + "rewards/chosen": 1.5362750159369574, + "rewards/margins": 12.220361406840976, + "rewards/rejected": -10.684086390904017, + "step": 211 + }, + { + "epoch": 0.038739150296939244, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.629352924049975e-06, + "logits/chosen": -43413412.571428575, + "logits/rejected": -5077061.333333333, + "logps/chosen": -226.01569475446428, + "logps/rejected": -445.833984375, + "loss": 0.0509, + "rewards/chosen": 0.21899213109697616, + "rewards/margins": 10.457770987162515, + "rewards/rejected": -10.238778856065538, + "step": 212 + }, + { + "epoch": 0.03892188213796254, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.622730168061568e-06, + "logits/chosen": -68047506.28571428, + "logits/rejected": -11456871.111111112, + "logps/chosen": -256.99996512276783, + "logps/rejected": -359.4184299045139, + "loss": 0.0368, + "rewards/chosen": 0.9010866710117885, + "rewards/margins": 8.47964908963158, + "rewards/rejected": -7.578562418619792, + "step": 213 + }, + { + "epoch": 0.03910461397898584, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.616051085564905e-06, + "logits/chosen": -57718921.14285714, + "logits/rejected": -12646880.888888888, + "logps/chosen": -214.113525390625, + "logps/rejected": -489.9958767361111, + "loss": 0.0396, + "rewards/chosen": 0.8481114251273019, + "rewards/margins": 11.660875063093881, + "rewards/rejected": -10.81276363796658, + "step": 214 + }, + { + "epoch": 0.03928734582000914, + "grad_norm": 8.375, + "kl": 0.037273406982421875, + "learning_rate": 9.609315757942504e-06, + "logits/chosen": -64493432.88888889, + "logits/rejected": -62974162.28571428, + "logps/chosen": -190.33230251736111, + "logps/rejected": -725.5845424107143, + "loss": 0.0374, + "rewards/chosen": 1.3229291703965929, + "rewards/margins": 17.621359476967463, + "rewards/rejected": -16.29843030657087, + "step": 215 + }, + { + "epoch": 0.039470077661032436, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.602524267262202e-06, + "logits/chosen": -54700548.0, + "logits/rejected": -15991440.0, + "logps/chosen": -252.95327758789062, + "logps/rejected": -498.7222493489583, + "loss": 0.0296, + "rewards/chosen": -0.3476036489009857, + "rewards/margins": 11.87343262632688, + "rewards/rejected": -12.221036275227865, + "step": 216 + }, + { + "epoch": 0.039652809502055734, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.595676696276173e-06, + "logits/chosen": -34130378.666666664, + "logits/rejected": -33014019.2, + "logps/chosen": -194.64339192708334, + "logps/rejected": -366.475830078125, + "loss": 0.0366, + "rewards/chosen": 0.8720700740814209, + "rewards/margins": 11.22472128868103, + "rewards/rejected": -10.352651214599609, + "step": 217 + }, + { + "epoch": 0.03983554134307903, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.588773128419907e-06, + "logits/chosen": 20739986.0, + "logits/rejected": 16098666.666666666, + "logps/chosen": -134.20277404785156, + "logps/rejected": -515.0783284505209, + "loss": 0.0273, + "rewards/chosen": -0.18654873967170715, + "rewards/margins": 14.881379574537277, + "rewards/rejected": -15.067928314208984, + "step": 218 + }, + { + "epoch": 0.04001827318410233, + "grad_norm": 9.8125, + "kl": 0.1625080108642578, + "learning_rate": 9.581813647811199e-06, + "logits/chosen": -55154772.0, + "logits/rejected": -78888984.0, + "logps/chosen": -232.89215087890625, + "logps/rejected": -606.115966796875, + "loss": 0.035, + "rewards/chosen": 0.8910728096961975, + "rewards/margins": 14.003282129764557, + "rewards/rejected": -13.11220932006836, + "step": 219 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.574798339249124e-06, + "logits/chosen": -110732160.0, + "logits/rejected": -62834396.44444445, + "logps/chosen": -89.84816196986607, + "logps/rejected": -358.51185438368054, + "loss": 0.0566, + "rewards/chosen": -0.5127802576337542, + "rewards/margins": 9.754494387006003, + "rewards/rejected": -10.267274644639757, + "step": 220 + }, + { + "epoch": 0.04038373686614893, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.567727288213005e-06, + "logits/chosen": -1249479.5, + "logits/rejected": -20232302.4, + "logps/chosen": -170.7862548828125, + "logps/rejected": -410.2662109375, + "loss": 0.0295, + "rewards/chosen": 0.8840684096018473, + "rewards/margins": 12.196719471613566, + "rewards/rejected": -11.312651062011719, + "step": 221 + }, + { + "epoch": 0.040566468707172225, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 9.560600580861366e-06, + "logits/chosen": -49246262.85714286, + "logits/rejected": -12597054.222222222, + "logps/chosen": -166.73929268973214, + "logps/rejected": -472.2864040798611, + "loss": 0.0387, + "rewards/chosen": 1.253927503313337, + "rewards/margins": 12.735355861603267, + "rewards/rejected": -11.48142835828993, + "step": 222 + }, + { + "epoch": 0.04074920054819552, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.553418304030886e-06, + "logits/chosen": -63836330.666666664, + "logits/rejected": -56702034.28571428, + "logps/chosen": -241.63226996527777, + "logps/rejected": -499.1747349330357, + "loss": 0.0451, + "rewards/chosen": 0.8244615660773383, + "rewards/margins": 12.35140967747522, + "rewards/rejected": -11.52694811139788, + "step": 223 + }, + { + "epoch": 0.04093193238921882, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.546180545235344e-06, + "logits/chosen": -99955008.0, + "logits/rejected": -51037656.88888889, + "logps/chosen": -258.20164271763394, + "logps/rejected": -447.481201171875, + "loss": 0.0348, + "rewards/chosen": 0.8723297119140625, + "rewards/margins": 12.270192464192709, + "rewards/rejected": -11.397862752278646, + "step": 224 + }, + { + "epoch": 0.04111466423024212, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.538887392664544e-06, + "logits/chosen": 30615904.0, + "logits/rejected": -54309382.4, + "logps/chosen": -206.69805908203125, + "logps/rejected": -389.226806640625, + "loss": 0.0227, + "rewards/chosen": 2.5369491577148438, + "rewards/margins": 13.171206665039062, + "rewards/rejected": -10.634257507324218, + "step": 225 + }, + { + "epoch": 0.04129739607126542, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.531538935183252e-06, + "logits/chosen": -33295437.333333332, + "logits/rejected": -35727814.4, + "logps/chosen": -368.9524332682292, + "logps/rejected": -280.7441650390625, + "loss": 0.0391, + "rewards/chosen": 0.3141283591588338, + "rewards/margins": 6.608532961209614, + "rewards/rejected": -6.294404602050781, + "step": 226 + }, + { + "epoch": 0.041480127912288715, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.524135262330098e-06, + "logits/chosen": -8133672.0, + "logits/rejected": -14632537.142857144, + "logps/chosen": -250.03488498263889, + "logps/rejected": -550.3585379464286, + "loss": 0.0488, + "rewards/chosen": 0.8397454685635037, + "rewards/margins": 11.633865167224217, + "rewards/rejected": -10.794119698660714, + "step": 227 + }, + { + "epoch": 0.04166285975331201, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.516676464316505e-06, + "logits/chosen": -73205840.0, + "logits/rejected": -59354660.0, + "logps/chosen": -221.68788146972656, + "logps/rejected": -583.10888671875, + "loss": 0.0335, + "rewards/chosen": 1.0504353046417236, + "rewards/margins": 13.79934048652649, + "rewards/rejected": -12.748905181884766, + "step": 228 + }, + { + "epoch": 0.04184559159433531, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 9.50916263202557e-06, + "logits/chosen": -100406208.0, + "logits/rejected": -51799112.0, + "logps/chosen": -149.28274536132812, + "logps/rejected": -435.96673583984375, + "loss": 0.0251, + "rewards/chosen": 1.9121941328048706, + "rewards/margins": 12.449264407157898, + "rewards/rejected": -10.537070274353027, + "step": 229 + }, + { + "epoch": 0.04202832343535861, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.501593857010968e-06, + "logits/chosen": -13244712.0, + "logits/rejected": -62088394.666666664, + "logps/chosen": -193.961328125, + "logps/rejected": -318.4802652994792, + "loss": 0.0645, + "rewards/chosen": 0.21635305881500244, + "rewards/margins": 9.714069962501526, + "rewards/rejected": -9.497716903686523, + "step": 230 + }, + { + "epoch": 0.04221105527638191, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.493970231495836e-06, + "logits/chosen": 575112.0, + "logits/rejected": -19311212.8, + "logps/chosen": -160.38243611653647, + "logps/rejected": -491.66953125, + "loss": 0.027, + "rewards/chosen": 0.6531117757161459, + "rewards/margins": 12.26592534383138, + "rewards/rejected": -11.612813568115234, + "step": 231 + }, + { + "epoch": 0.042393787117405206, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.486291848371642e-06, + "logits/chosen": -97434520.0, + "logits/rejected": -35583968.0, + "logps/chosen": -223.29342651367188, + "logps/rejected": -491.12664794921875, + "loss": 0.0367, + "rewards/chosen": 1.1173399686813354, + "rewards/margins": 10.948604464530945, + "rewards/rejected": -9.83126449584961, + "step": 232 + }, + { + "epoch": 0.042576518958428504, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.478558801197065e-06, + "logits/chosen": -45408782.222222224, + "logits/rejected": -88185600.0, + "logps/chosen": -193.05357530381946, + "logps/rejected": -281.0241001674107, + "loss": 0.0482, + "rewards/chosen": 0.43237413300408256, + "rewards/margins": 9.65038626156156, + "rewards/rejected": -9.218012128557477, + "step": 233 + }, + { + "epoch": 0.0427592507994518, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.470771184196842e-06, + "logits/chosen": -64412280.0, + "logits/rejected": -62802064.0, + "logps/chosen": -171.61495971679688, + "logps/rejected": -459.073974609375, + "loss": 0.0276, + "rewards/chosen": 1.6815565824508667, + "rewards/margins": 12.99788749217987, + "rewards/rejected": -11.316330909729004, + "step": 234 + }, + { + "epoch": 0.0429419826404751, + "grad_norm": 12.3125, + "kl": 0.27286720275878906, + "learning_rate": 9.46292909226063e-06, + "logits/chosen": -95658163.2, + "logits/rejected": -56660240.0, + "logps/chosen": -226.9624267578125, + "logps/rejected": -470.8998209635417, + "loss": 0.0506, + "rewards/chosen": 0.7987327575683594, + "rewards/margins": 10.631653467814127, + "rewards/rejected": -9.832920710245768, + "step": 235 + }, + { + "epoch": 0.0431247144814984, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.45503262094184e-06, + "logits/chosen": -43690221.71428572, + "logits/rejected": -34889344.0, + "logps/chosen": -233.23287527901786, + "logps/rejected": -497.76410590277777, + "loss": 0.0404, + "rewards/chosen": 0.38493422099522184, + "rewards/margins": 13.129036699022565, + "rewards/rejected": -12.744102478027344, + "step": 236 + }, + { + "epoch": 0.043307446322521696, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.44708186645649e-06, + "logits/chosen": -72140531.2, + "logits/rejected": -86302574.54545455, + "logps/chosen": -212.1725830078125, + "logps/rejected": -417.7867542613636, + "loss": 0.0285, + "rewards/chosen": 0.6316576957702636, + "rewards/margins": 12.011921284415505, + "rewards/rejected": -11.380263588645242, + "step": 237 + }, + { + "epoch": 0.043490178163544994, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.439076925682006e-06, + "logits/chosen": -30598602.666666668, + "logits/rejected": 10074849.142857144, + "logps/chosen": -246.40966796875, + "logps/rejected": -344.77598353794644, + "loss": 0.052, + "rewards/chosen": 0.5911376741197374, + "rewards/margins": 8.204570422096857, + "rewards/rejected": -7.613432747977121, + "step": 238 + }, + { + "epoch": 0.0436729100045683, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.431017896156074e-06, + "logits/chosen": -95778183.1111111, + "logits/rejected": -42548242.28571428, + "logps/chosen": -115.84956868489583, + "logps/rejected": -365.47628348214283, + "loss": 0.0207, + "rewards/chosen": 2.3524417877197266, + "rewards/margins": 12.117151260375977, + "rewards/rejected": -9.76470947265625, + "step": 239 + }, + { + "epoch": 0.0438556418455916, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.42290487607542e-06, + "logits/chosen": -15371468.8, + "logits/rejected": -21857269.333333332, + "logps/chosen": -289.8595703125, + "logps/rejected": -270.95920817057294, + "loss": 0.0506, + "rewards/chosen": 0.773958396911621, + "rewards/margins": 9.245296669006347, + "rewards/rejected": -8.471338272094727, + "step": 240 + }, + { + "epoch": 0.044038373686614896, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.414737964294636e-06, + "logits/chosen": -55685492.0, + "logits/rejected": -51153992.0, + "logps/chosen": -166.81607055664062, + "logps/rejected": -590.3405151367188, + "loss": 0.0472, + "rewards/chosen": 0.35993489623069763, + "rewards/margins": 14.509568303823471, + "rewards/rejected": -14.149633407592773, + "step": 241 + }, + { + "epoch": 0.044221105527638194, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.406517260324962e-06, + "logits/chosen": -34328164.0, + "logits/rejected": -19445506.0, + "logps/chosen": -249.71066284179688, + "logps/rejected": -479.7159423828125, + "loss": 0.0393, + "rewards/chosen": 0.9154759049415588, + "rewards/margins": 14.208673536777496, + "rewards/rejected": -13.293197631835938, + "step": 242 + }, + { + "epoch": 0.04440383736866149, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 9.398242864333084e-06, + "logits/chosen": -30861786.666666668, + "logits/rejected": 17510126.4, + "logps/chosen": -169.71258544921875, + "logps/rejected": -550.997900390625, + "loss": 0.0292, + "rewards/chosen": 1.0700219472249348, + "rewards/margins": 12.896270116170248, + "rewards/rejected": -11.826248168945312, + "step": 243 + }, + { + "epoch": 0.04458656920968479, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.389914877139903e-06, + "logits/chosen": -39468725.333333336, + "logits/rejected": 20729490.285714287, + "logps/chosen": -183.59814453125, + "logps/rejected": -410.5366908482143, + "loss": 0.0493, + "rewards/chosen": 0.4850413004557292, + "rewards/margins": 12.959932963053385, + "rewards/rejected": -12.474891662597656, + "step": 244 + }, + { + "epoch": 0.04476930105070809, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.381533400219319e-06, + "logits/chosen": -55624564.0, + "logits/rejected": 12272176.0, + "logps/chosen": -261.379638671875, + "logps/rejected": -534.6090087890625, + "loss": 0.0673, + "rewards/chosen": 1.1555676460266113, + "rewards/margins": 14.41470193862915, + "rewards/rejected": -13.259134292602539, + "step": 245 + }, + { + "epoch": 0.044952032891731386, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.37309853569698e-06, + "logits/chosen": -38962609.777777776, + "logits/rejected": -17648882.285714287, + "logps/chosen": -189.5947265625, + "logps/rejected": -629.703125, + "loss": 0.0506, + "rewards/chosen": 0.4685042169358995, + "rewards/margins": 15.035492480747283, + "rewards/rejected": -14.566988263811384, + "step": 246 + }, + { + "epoch": 0.045134764732754684, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.364610386349048e-06, + "logits/chosen": -33097792.0, + "logits/rejected": -18206659.42857143, + "logps/chosen": -209.92182074652777, + "logps/rejected": -462.09633091517856, + "loss": 0.0557, + "rewards/chosen": 0.06342164675394694, + "rewards/margins": 8.15783431416466, + "rewards/rejected": -8.094412667410714, + "step": 247 + }, + { + "epoch": 0.04531749657377798, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.356069055600949e-06, + "logits/chosen": -33971420.44444445, + "logits/rejected": -21051888.0, + "logps/chosen": -246.76540798611111, + "logps/rejected": -524.2669154575893, + "loss": 0.0528, + "rewards/chosen": 0.46560515297783744, + "rewards/margins": 14.462323423415894, + "rewards/rejected": -13.996718270438057, + "step": 248 + }, + { + "epoch": 0.04550022841480128, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.347474647526095e-06, + "logits/chosen": -25686818.90909091, + "logits/rejected": -33045043.2, + "logps/chosen": -269.91006747159093, + "logps/rejected": -404.8924072265625, + "loss": 0.0314, + "rewards/chosen": 1.739968559958718, + "rewards/margins": 15.645612786032938, + "rewards/rejected": -13.90564422607422, + "step": 249 + }, + { + "epoch": 0.04568296025582458, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 9.338827266844643e-06, + "logits/chosen": -72167462.4, + "logits/rejected": -36086632.72727273, + "logps/chosen": -242.8840576171875, + "logps/rejected": -302.3810369318182, + "loss": 0.0141, + "rewards/chosen": 1.8645423889160155, + "rewards/margins": 12.55197788585316, + "rewards/rejected": -10.687435496937145, + "step": 250 + }, + { + "epoch": 0.045865692096847877, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.330127018922195e-06, + "logits/chosen": -87457797.33333333, + "logits/rejected": -71549552.0, + "logps/chosen": -318.6561686197917, + "logps/rejected": -373.8993835449219, + "loss": 0.0355, + "rewards/chosen": 2.0680929819742837, + "rewards/margins": 13.903259913126627, + "rewards/rejected": -11.835166931152344, + "step": 251 + }, + { + "epoch": 0.046048423937871175, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.321374009768525e-06, + "logits/chosen": -13815109.818181818, + "logits/rejected": -35003084.8, + "logps/chosen": -161.4406183416193, + "logps/rejected": -452.29599609375, + "loss": 0.0655, + "rewards/chosen": 0.19164698774164374, + "rewards/margins": 10.767844802683049, + "rewards/rejected": -10.576197814941406, + "step": 252 + }, + { + "epoch": 0.04623115577889447, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.312568346036288e-06, + "logits/chosen": -135282060.8, + "logits/rejected": -22113800.727272727, + "logps/chosen": -177.53070068359375, + "logps/rejected": -505.8170276988636, + "loss": 0.0195, + "rewards/chosen": 1.3682668685913086, + "rewards/margins": 13.91635008725253, + "rewards/rejected": -12.54808321866122, + "step": 253 + }, + { + "epoch": 0.04641388761991777, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.30371013501972e-06, + "logits/chosen": -66763079.11111111, + "logits/rejected": -78443922.28571428, + "logps/chosen": -330.29009331597223, + "logps/rejected": -635.3244977678571, + "loss": 0.0397, + "rewards/chosen": 1.1333126491970487, + "rewards/margins": 14.613140530056423, + "rewards/rejected": -13.479827880859375, + "step": 254 + }, + { + "epoch": 0.04659661946094107, + "grad_norm": 6.78125, + "kl": 0.008310317993164062, + "learning_rate": 9.294799484653323e-06, + "logits/chosen": -8046936.0, + "logits/rejected": 17123134.545454547, + "logps/chosen": -181.633056640625, + "logps/rejected": -459.9918323863636, + "loss": 0.0264, + "rewards/chosen": 1.0707881927490235, + "rewards/margins": 11.053752448342063, + "rewards/rejected": -9.98296425559304, + "step": 255 + }, + { + "epoch": 0.04677935130196437, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.285836503510562e-06, + "logits/chosen": -41651252.0, + "logits/rejected": -18730182.0, + "logps/chosen": -158.7765350341797, + "logps/rejected": -332.37908935546875, + "loss": 0.0521, + "rewards/chosen": 1.7330043315887451, + "rewards/margins": 9.266571760177612, + "rewards/rejected": -7.533567428588867, + "step": 256 + }, + { + "epoch": 0.046962083142987665, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.276821300802535e-06, + "logits/chosen": -34959524.571428575, + "logits/rejected": -23566359.111111112, + "logps/chosen": -329.67679268973217, + "logps/rejected": -442.52851019965277, + "loss": 0.038, + "rewards/chosen": 0.48118724141802105, + "rewards/margins": 11.219871153907171, + "rewards/rejected": -10.738683912489149, + "step": 257 + }, + { + "epoch": 0.04714481498401096, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 9.267753986376638e-06, + "logits/chosen": 1847980.0, + "logits/rejected": -11509600.0, + "logps/chosen": -235.19375610351562, + "logps/rejected": -557.2823893229166, + "loss": 0.0264, + "rewards/chosen": -0.11499347537755966, + "rewards/margins": 14.536441105107466, + "rewards/rejected": -14.651434580485025, + "step": 258 + }, + { + "epoch": 0.04732754682503426, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.25863467071524e-06, + "logits/chosen": -59681651.2, + "logits/rejected": -78906517.33333333, + "logps/chosen": -207.1171875, + "logps/rejected": -467.5957845052083, + "loss": 0.0622, + "rewards/chosen": 0.09678634405136108, + "rewards/margins": 9.09658480087916, + "rewards/rejected": -8.999798456827799, + "step": 259 + }, + { + "epoch": 0.04751027866605756, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.24946346493432e-06, + "logits/chosen": -45215238.4, + "logits/rejected": 19957458.666666668, + "logps/chosen": -290.9359619140625, + "logps/rejected": -519.7036539713541, + "loss": 0.0669, + "rewards/chosen": 0.0662047266960144, + "rewards/margins": 13.932714482148489, + "rewards/rejected": -13.866509755452475, + "step": 260 + }, + { + "epoch": 0.04769301050708086, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.24024048078213e-06, + "logits/chosen": -61921594.666666664, + "logits/rejected": -21861350.4, + "logps/chosen": -258.41001383463544, + "logps/rejected": -368.0484130859375, + "loss": 0.0271, + "rewards/chosen": 1.052226225535075, + "rewards/margins": 10.091234556833902, + "rewards/rejected": -9.039008331298827, + "step": 261 + }, + { + "epoch": 0.047875742348104156, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.230965830637821e-06, + "logits/chosen": -31380992.0, + "logits/rejected": -8512748.8, + "logps/chosen": -264.41656494140625, + "logps/rejected": -413.2677734375, + "loss": 0.042, + "rewards/chosen": 0.3877418438593547, + "rewards/margins": 12.581877891222636, + "rewards/rejected": -12.194136047363282, + "step": 262 + }, + { + "epoch": 0.048058474189127454, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 9.221639627510076e-06, + "logits/chosen": -44926412.8, + "logits/rejected": 66498816.0, + "logps/chosen": -223.52431640625, + "logps/rejected": -542.204833984375, + "loss": 0.0477, + "rewards/chosen": 0.9879254341125489, + "rewards/margins": 13.778777027130127, + "rewards/rejected": -12.790851593017578, + "step": 263 + }, + { + "epoch": 0.04824120603015075, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.21226198503574e-06, + "logits/chosen": -101966892.8, + "logits/rejected": -41991541.333333336, + "logps/chosen": -171.7102294921875, + "logps/rejected": -393.7198486328125, + "loss": 0.0361, + "rewards/chosen": 1.5020322799682617, + "rewards/margins": 13.278727531433105, + "rewards/rejected": -11.776695251464844, + "step": 264 + }, + { + "epoch": 0.04842393787117405, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.202833017478421e-06, + "logits/chosen": -32807068.8, + "logits/rejected": -104223648.0, + "logps/chosen": -240.383642578125, + "logps/rejected": -577.1609700520834, + "loss": 0.0535, + "rewards/chosen": 0.4856086730957031, + "rewards/margins": 18.132830556233724, + "rewards/rejected": -17.64722188313802, + "step": 265 + }, + { + "epoch": 0.04860666971219735, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.193352839727122e-06, + "logits/chosen": -33099426.666666668, + "logits/rejected": -9484900.0, + "logps/chosen": -191.3671671549479, + "logps/rejected": -428.262646484375, + "loss": 0.0228, + "rewards/chosen": 1.5376367568969727, + "rewards/margins": 11.511328315734863, + "rewards/rejected": -9.97369155883789, + "step": 266 + }, + { + "epoch": 0.048789401553220646, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.18382156729481e-06, + "logits/chosen": -33410723.555555556, + "logits/rejected": -87788169.14285715, + "logps/chosen": -195.39592827690973, + "logps/rejected": -454.5845424107143, + "loss": 0.0396, + "rewards/chosen": 1.3471894794040256, + "rewards/margins": 14.899320042322552, + "rewards/rejected": -13.552130562918526, + "step": 267 + }, + { + "epoch": 0.048972133394243944, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.174239316317034e-06, + "logits/chosen": -85777536.0, + "logits/rejected": -36194100.36363637, + "logps/chosen": -165.731591796875, + "logps/rejected": -329.87923916903407, + "loss": 0.0251, + "rewards/chosen": 1.2746511459350587, + "rewards/margins": 11.554072865572842, + "rewards/rejected": -10.279421719637783, + "step": 268 + }, + { + "epoch": 0.04915486523526724, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 9.164606203550498e-06, + "logits/chosen": -108586024.0, + "logits/rejected": -40059968.0, + "logps/chosen": -203.40135192871094, + "logps/rejected": -503.53857421875, + "loss": 0.0291, + "rewards/chosen": 1.654930591583252, + "rewards/margins": 12.635033130645752, + "rewards/rejected": -10.9801025390625, + "step": 269 + }, + { + "epoch": 0.04933759707629054, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.154922346371641e-06, + "logits/chosen": -43176188.44444445, + "logits/rejected": -24232562.285714287, + "logps/chosen": -242.08702256944446, + "logps/rejected": -488.64358956473217, + "loss": 0.0286, + "rewards/chosen": 1.4901344511244032, + "rewards/margins": 12.092197221422952, + "rewards/rejected": -10.602062770298549, + "step": 270 + }, + { + "epoch": 0.049520328917313845, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.145187862775208e-06, + "logits/chosen": -95836992.0, + "logits/rejected": -32110576.0, + "logps/chosen": -149.314208984375, + "logps/rejected": -560.8796875, + "loss": 0.0474, + "rewards/chosen": 0.4821698268254598, + "rewards/margins": 12.411980160077414, + "rewards/rejected": -11.929810333251954, + "step": 271 + }, + { + "epoch": 0.04970306075833714, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.13540287137281e-06, + "logits/chosen": -108438464.0, + "logits/rejected": -76861165.71428572, + "logps/chosen": -286.5646158854167, + "logps/rejected": -455.24679129464283, + "loss": 0.0684, + "rewards/chosen": -0.48123158348931205, + "rewards/margins": 11.592035672021291, + "rewards/rejected": -12.073267255510602, + "step": 272 + }, + { + "epoch": 0.04988579259936044, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.125567491391476e-06, + "logits/chosen": -64451840.0, + "logits/rejected": -73663504.0, + "logps/chosen": -198.548193359375, + "logps/rejected": -473.2303059895833, + "loss": 0.0547, + "rewards/chosen": 0.9359296798706055, + "rewards/margins": 14.96054719289144, + "rewards/rejected": -14.024617513020834, + "step": 273 + }, + { + "epoch": 0.05006852444038374, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.115681842672211e-06, + "logits/chosen": -141437152.0, + "logits/rejected": -63325120.0, + "logps/chosen": -193.5151824951172, + "logps/rejected": -475.40985107421875, + "loss": 0.0327, + "rewards/chosen": 1.14827561378479, + "rewards/margins": 11.76999020576477, + "rewards/rejected": -10.62171459197998, + "step": 274 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.10574604566852e-06, + "logits/chosen": -84046300.44444445, + "logits/rejected": -51073664.0, + "logps/chosen": -355.29871961805554, + "logps/rejected": -487.46177455357144, + "loss": 0.0464, + "rewards/chosen": 0.4208479192521837, + "rewards/margins": 10.726501082617139, + "rewards/rejected": -10.305653163364955, + "step": 275 + }, + { + "epoch": 0.050433988122430336, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.09576022144496e-06, + "logits/chosen": -91668736.0, + "logits/rejected": -25994825.14285714, + "logps/chosen": -221.63762749565973, + "logps/rejected": -427.05880301339283, + "loss": 0.0445, + "rewards/chosen": 0.6869364844428169, + "rewards/margins": 11.409695186312238, + "rewards/rejected": -10.72275870186942, + "step": 276 + }, + { + "epoch": 0.050616719963453634, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.085724491675642e-06, + "logits/chosen": -72953568.0, + "logits/rejected": -24891954.666666668, + "logps/chosen": -211.3012939453125, + "logps/rejected": -457.2064208984375, + "loss": 0.0549, + "rewards/chosen": 0.8022445678710938, + "rewards/margins": 12.13119099934896, + "rewards/rejected": -11.328946431477865, + "step": 277 + }, + { + "epoch": 0.05079945180447693, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.07563897864277e-06, + "logits/chosen": -94823846.4, + "logits/rejected": -133356640.0, + "logps/chosen": -175.16771240234374, + "logps/rejected": -441.2943522135417, + "loss": 0.0433, + "rewards/chosen": 1.243113136291504, + "rewards/margins": 12.020437812805175, + "rewards/rejected": -10.777324676513672, + "step": 278 + }, + { + "epoch": 0.05098218364550023, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.065503805235139e-06, + "logits/chosen": -37303822.222222224, + "logits/rejected": -80925211.42857143, + "logps/chosen": -282.2945963541667, + "logps/rejected": -441.446044921875, + "loss": 0.0409, + "rewards/chosen": 0.8471256362067329, + "rewards/margins": 12.812931779831176, + "rewards/rejected": -11.965806143624443, + "step": 279 + }, + { + "epoch": 0.05116491548652353, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.055319094946633e-06, + "logits/chosen": -39839355.428571425, + "logits/rejected": -49335192.88888889, + "logps/chosen": -278.7440883091518, + "logps/rejected": -486.60546875, + "loss": 0.0362, + "rewards/chosen": 1.037489346095494, + "rewards/margins": 12.63489552149697, + "rewards/rejected": -11.597406175401476, + "step": 280 + }, + { + "epoch": 0.051347647327546826, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.045084971874738e-06, + "logits/chosen": -75352181.33333333, + "logits/rejected": -24549658.0, + "logps/chosen": -298.0647786458333, + "logps/rejected": -493.8330078125, + "loss": 0.0622, + "rewards/chosen": 0.8149190743764242, + "rewards/margins": 14.873632987340292, + "rewards/rejected": -14.058713912963867, + "step": 281 + }, + { + "epoch": 0.051530379168570124, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.03480156071901e-06, + "logits/chosen": -70502320.0, + "logits/rejected": -74473448.0, + "logps/chosen": -243.04368591308594, + "logps/rejected": -456.94549560546875, + "loss": 0.0605, + "rewards/chosen": -0.11115199327468872, + "rewards/margins": 9.958249747753143, + "rewards/rejected": -10.069401741027832, + "step": 282 + }, + { + "epoch": 0.05171311100959342, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.02446898677957e-06, + "logits/chosen": -76385408.0, + "logits/rejected": -22729654.85714286, + "logps/chosen": -220.80874294704861, + "logps/rejected": -498.16322544642856, + "loss": 0.0431, + "rewards/chosen": 1.019836637708876, + "rewards/margins": 12.715156767103407, + "rewards/rejected": -11.695320129394531, + "step": 283 + }, + { + "epoch": 0.05189584285061672, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.014087375955574e-06, + "logits/chosen": -65849016.0, + "logits/rejected": -54681488.0, + "logps/chosen": -247.00071716308594, + "logps/rejected": -561.5609741210938, + "loss": 0.0464, + "rewards/chosen": 0.45120927691459656, + "rewards/margins": 15.98583909869194, + "rewards/rejected": -15.534629821777344, + "step": 284 + }, + { + "epoch": 0.05207857469164002, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.003656854743667e-06, + "logits/chosen": -75515100.44444445, + "logits/rejected": -14074464.0, + "logps/chosen": -188.88377549913196, + "logps/rejected": -504.78250558035717, + "loss": 0.0608, + "rewards/chosen": -0.07769611146714953, + "rewards/margins": 10.346174520159524, + "rewards/rejected": -10.423870631626674, + "step": 285 + }, + { + "epoch": 0.05226130653266332, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.993177550236464e-06, + "logits/chosen": -58923472.0, + "logits/rejected": -93311961.6, + "logps/chosen": -274.48268636067706, + "logps/rejected": -353.7775146484375, + "loss": 0.0339, + "rewards/chosen": 0.6497639815012614, + "rewards/margins": 9.73081866900126, + "rewards/rejected": -9.0810546875, + "step": 286 + }, + { + "epoch": 0.052444038373686615, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.982649590120982e-06, + "logits/chosen": -62643700.0, + "logits/rejected": -42052928.0, + "logps/chosen": -291.2804870605469, + "logps/rejected": -562.2537231445312, + "loss": 0.0365, + "rewards/chosen": 0.6617844700813293, + "rewards/margins": 17.4278843998909, + "rewards/rejected": -16.76609992980957, + "step": 287 + }, + { + "epoch": 0.05262677021470991, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.972073102677091e-06, + "logits/chosen": -97085440.0, + "logits/rejected": -39990834.28571428, + "logps/chosen": -230.85186089409723, + "logps/rejected": -560.0640694754464, + "loss": 0.0547, + "rewards/chosen": 0.133675217628479, + "rewards/margins": 9.9682263476508, + "rewards/rejected": -9.834551130022321, + "step": 288 + }, + { + "epoch": 0.05280950205573321, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.961448216775955e-06, + "logits/chosen": -58301040.0, + "logits/rejected": -51721120.0, + "logps/chosen": -159.25094604492188, + "logps/rejected": -488.6269124348958, + "loss": 0.0319, + "rewards/chosen": -0.5586967468261719, + "rewards/margins": 10.719547271728516, + "rewards/rejected": -11.278244018554688, + "step": 289 + }, + { + "epoch": 0.05299223389675651, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.950775061878453e-06, + "logits/chosen": -60595857.45454545, + "logits/rejected": -26521548.8, + "logps/chosen": -241.5842950994318, + "logps/rejected": -335.58193359375, + "loss": 0.0513, + "rewards/chosen": 0.7216912616382946, + "rewards/margins": 13.228669105876577, + "rewards/rejected": -12.506977844238282, + "step": 290 + }, + { + "epoch": 0.05317496573777981, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 8.94005376803361e-06, + "logits/chosen": -149605333.33333334, + "logits/rejected": -46898640.0, + "logps/chosen": -240.43436686197916, + "logps/rejected": -492.808349609375, + "loss": 0.0389, + "rewards/chosen": -0.06120783090591431, + "rewards/margins": 12.07123693227768, + "rewards/rejected": -12.132444763183594, + "step": 291 + }, + { + "epoch": 0.053357697578803105, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 8.92928446587701e-06, + "logits/chosen": -185296877.7142857, + "logits/rejected": -26230574.222222224, + "logps/chosen": -267.0884312220982, + "logps/rejected": -441.9440104166667, + "loss": 0.0254, + "rewards/chosen": 2.013406072344099, + "rewards/margins": 15.830034422496007, + "rewards/rejected": -13.816628350151909, + "step": 292 + }, + { + "epoch": 0.0535404294198264, + "grad_norm": 14.0625, + "kl": 0.16009235382080078, + "learning_rate": 8.9184672866292e-06, + "logits/chosen": -75532245.33333333, + "logits/rejected": -29164267.42857143, + "logps/chosen": -273.171142578125, + "logps/rejected": -395.17979213169644, + "loss": 0.0529, + "rewards/chosen": 0.871602906121148, + "rewards/margins": 10.540735683743915, + "rewards/rejected": -9.669132777622767, + "step": 293 + }, + { + "epoch": 0.0537231612608497, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 8.907602362094094e-06, + "logits/chosen": 5214373.6, + "logits/rejected": -74901224.72727273, + "logps/chosen": -233.884912109375, + "logps/rejected": -488.24076704545456, + "loss": 0.0263, + "rewards/chosen": 0.6758286476135253, + "rewards/margins": 12.471370584314519, + "rewards/rejected": -11.795541936700994, + "step": 294 + }, + { + "epoch": 0.053905893101873, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 8.896689824657371e-06, + "logits/chosen": -83371696.0, + "logits/rejected": -79270056.0, + "logps/chosen": -212.80685424804688, + "logps/rejected": -512.1527099609375, + "loss": 0.0379, + "rewards/chosen": 0.9271903038024902, + "rewards/margins": 16.922475337982178, + "rewards/rejected": -15.995285034179688, + "step": 295 + }, + { + "epoch": 0.0540886249428963, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 8.885729807284855e-06, + "logits/chosen": -59189066.666666664, + "logits/rejected": -18524656.0, + "logps/chosen": -205.114990234375, + "logps/rejected": -244.82255859375, + "loss": 0.0265, + "rewards/chosen": 0.7625180880228678, + "rewards/margins": 9.70884453455607, + "rewards/rejected": -8.946326446533202, + "step": 296 + }, + { + "epoch": 0.054271356783919596, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 8.874722443520898e-06, + "logits/chosen": -34450025.6, + "logits/rejected": -70915861.33333333, + "logps/chosen": -156.37850341796874, + "logps/rejected": -476.9134928385417, + "loss": 0.0405, + "rewards/chosen": 1.3139680862426757, + "rewards/margins": 17.138554318745932, + "rewards/rejected": -15.824586232503256, + "step": 297 + }, + { + "epoch": 0.054454088624942894, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.863667867486756e-06, + "logits/chosen": -135841408.0, + "logits/rejected": -71360757.33333333, + "logps/chosen": -241.47880859375, + "logps/rejected": -444.2178955078125, + "loss": 0.0601, + "rewards/chosen": 0.39721558094024656, + "rewards/margins": 13.951630965868631, + "rewards/rejected": -13.554415384928385, + "step": 298 + }, + { + "epoch": 0.05463682046596619, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 8.852566213878947e-06, + "logits/chosen": -37784594.666666664, + "logits/rejected": -31726416.0, + "logps/chosen": -265.88037109375, + "logps/rejected": -488.977099609375, + "loss": 0.032, + "rewards/chosen": 1.0780049959818523, + "rewards/margins": 13.89683739344279, + "rewards/rejected": -12.818832397460938, + "step": 299 + }, + { + "epoch": 0.05481955230698949, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 8.841417617967618e-06, + "logits/chosen": -87346377.14285715, + "logits/rejected": 5977928.888888889, + "logps/chosen": -211.27378627232142, + "logps/rejected": -608.5362955729166, + "loss": 0.0342, + "rewards/chosen": 0.7927378245762416, + "rewards/margins": 17.63120150187659, + "rewards/rejected": -16.838463677300346, + "step": 300 + }, + { + "epoch": 0.05500228414801279, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 8.83022221559489e-06, + "logits/chosen": -84309772.8, + "logits/rejected": -93834199.27272727, + "logps/chosen": -231.1645263671875, + "logps/rejected": -479.19113991477275, + "loss": 0.0231, + "rewards/chosen": 0.7876821517944336, + "rewards/margins": 10.67401180267334, + "rewards/rejected": -9.886329650878906, + "step": 301 + }, + { + "epoch": 0.055185015989036086, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 8.818980143173212e-06, + "logits/chosen": -81166324.36363636, + "logits/rejected": -17775907.2, + "logps/chosen": -292.39095791903407, + "logps/rejected": -405.137646484375, + "loss": 0.0534, + "rewards/chosen": 0.8455012061379172, + "rewards/margins": 16.525188706137918, + "rewards/rejected": -15.6796875, + "step": 302 + }, + { + "epoch": 0.05536774783005939, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 8.807691537683685e-06, + "logits/chosen": 7148678.857142857, + "logits/rejected": -27120574.222222224, + "logps/chosen": -106.15602329799107, + "logps/rejected": -298.9296061197917, + "loss": 0.0264, + "rewards/chosen": 1.2329233714512415, + "rewards/margins": 9.668272729903931, + "rewards/rejected": -8.43534935845269, + "step": 303 + }, + { + "epoch": 0.05555047967108269, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 8.796356536674404e-06, + "logits/chosen": -68487620.92307693, + "logits/rejected": -29905392.0, + "logps/chosen": -196.92146183894232, + "logps/rejected": -552.0353190104166, + "loss": 0.042, + "rewards/chosen": 1.961772478543795, + "rewards/margins": 10.237645662747896, + "rewards/rejected": -8.275873184204102, + "step": 304 + }, + { + "epoch": 0.05573321151210599, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 8.784975278258783e-06, + "logits/chosen": -72896736.0, + "logits/rejected": -38189052.0, + "logps/chosen": -216.5999755859375, + "logps/rejected": -365.5684509277344, + "loss": 0.0319, + "rewards/chosen": 1.341198205947876, + "rewards/margins": 13.320491075515747, + "rewards/rejected": -11.979292869567871, + "step": 305 + }, + { + "epoch": 0.055915943353129285, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 8.773547901113862e-06, + "logits/chosen": -100505608.0, + "logits/rejected": -10850280.0, + "logps/chosen": -234.46432495117188, + "logps/rejected": -481.6189270019531, + "loss": 0.0258, + "rewards/chosen": 1.94510817527771, + "rewards/margins": 12.75366473197937, + "rewards/rejected": -10.80855655670166, + "step": 306 + }, + { + "epoch": 0.056098675194152584, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.762074544478622e-06, + "logits/chosen": -52693792.0, + "logits/rejected": -93752533.33333333, + "logps/chosen": -274.3159423828125, + "logps/rejected": -630.453125, + "loss": 0.0639, + "rewards/chosen": -0.009110140800476074, + "rewards/margins": 10.340042742093404, + "rewards/rejected": -10.34915288289388, + "step": 307 + }, + { + "epoch": 0.05628140703517588, + "grad_norm": 10.875, + "kl": 1.1161880493164062, + "learning_rate": 8.750555348152299e-06, + "logits/chosen": -105465408.0, + "logits/rejected": -128747936.0, + "logps/chosen": -155.77908761160714, + "logps/rejected": -402.7994384765625, + "loss": 0.0604, + "rewards/chosen": 1.4253620420183455, + "rewards/margins": 16.508823803492955, + "rewards/rejected": -15.08346176147461, + "step": 308 + }, + { + "epoch": 0.05646413887619918, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.73899045249266e-06, + "logits/chosen": -89622707.2, + "logits/rejected": -35800002.666666664, + "logps/chosen": -235.6194580078125, + "logps/rejected": -401.4231770833333, + "loss": 0.0524, + "rewards/chosen": 0.883517074584961, + "rewards/margins": 11.631567764282227, + "rewards/rejected": -10.748050689697266, + "step": 309 + }, + { + "epoch": 0.05664687071722248, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 8.727379998414311e-06, + "logits/chosen": -59645060.571428575, + "logits/rejected": -36601934.222222224, + "logps/chosen": -319.59423828125, + "logps/rejected": -512.2197265625, + "loss": 0.031, + "rewards/chosen": 1.1313120978219169, + "rewards/margins": 13.35498729584709, + "rewards/rejected": -12.223675198025173, + "step": 310 + }, + { + "epoch": 0.056829602558245776, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 8.715724127386971e-06, + "logits/chosen": -66989063.11111111, + "logits/rejected": -11339264.0, + "logps/chosen": -212.38127983940973, + "logps/rejected": -480.621826171875, + "loss": 0.0318, + "rewards/chosen": 1.0986220041910808, + "rewards/margins": 13.878076916649228, + "rewards/rejected": -12.779454912458148, + "step": 311 + }, + { + "epoch": 0.057012334399269074, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 8.70402298143375e-06, + "logits/chosen": -48377738.666666664, + "logits/rejected": -62159968.0, + "logps/chosen": -208.06429036458334, + "logps/rejected": -419.61005859375, + "loss": 0.0151, + "rewards/chosen": 2.37600310643514, + "rewards/margins": 13.739127190907796, + "rewards/rejected": -11.363124084472656, + "step": 312 + }, + { + "epoch": 0.05719506624029237, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 8.692276703129421e-06, + "logits/chosen": -25445124.57142857, + "logits/rejected": -42014616.88888889, + "logps/chosen": -320.17306082589283, + "logps/rejected": -423.9033203125, + "loss": 0.0417, + "rewards/chosen": 0.6788173403058734, + "rewards/margins": 11.237932432265508, + "rewards/rejected": -10.559115091959635, + "step": 313 + }, + { + "epoch": 0.05737779808131567, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.680485435598674e-06, + "logits/chosen": -78710748.44444445, + "logits/rejected": -44051254.85714286, + "logps/chosen": -211.71645779079861, + "logps/rejected": -405.18136160714283, + "loss": 0.0718, + "rewards/chosen": 0.03840569323963589, + "rewards/margins": 9.018298968909278, + "rewards/rejected": -8.979893275669642, + "step": 314 + }, + { + "epoch": 0.05756052992233897, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 8.668649322514382e-06, + "logits/chosen": -26583544.888888888, + "logits/rejected": -42784173.71428572, + "logps/chosen": -269.48906792534723, + "logps/rejected": -565.5624651227679, + "loss": 0.0328, + "rewards/chosen": 1.1067051357693143, + "rewards/margins": 13.217298659067305, + "rewards/rejected": -12.110593523297991, + "step": 315 + }, + { + "epoch": 0.057743261763362266, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 8.656768508095853e-06, + "logits/chosen": -68413816.0, + "logits/rejected": 11000334.0, + "logps/chosen": -311.09783935546875, + "logps/rejected": -425.072265625, + "loss": 0.0377, + "rewards/chosen": 0.9442082047462463, + "rewards/margins": 13.49067884683609, + "rewards/rejected": -12.546470642089844, + "step": 316 + }, + { + "epoch": 0.057925993604385564, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 8.644843137107058e-06, + "logits/chosen": -27032793.14285714, + "logits/rejected": -43454023.11111111, + "logps/chosen": -278.83108956473217, + "logps/rejected": -457.37120225694446, + "loss": 0.0367, + "rewards/chosen": 0.7470141819545201, + "rewards/margins": 12.263126615494018, + "rewards/rejected": -11.516112433539497, + "step": 317 + }, + { + "epoch": 0.05810872544540886, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 8.632873354854881e-06, + "logits/chosen": -102227541.33333333, + "logits/rejected": -56479104.0, + "logps/chosen": -220.34855143229166, + "logps/rejected": -418.252099609375, + "loss": 0.0261, + "rewards/chosen": 1.2814818223317463, + "rewards/margins": 12.740675242741903, + "rewards/rejected": -11.459193420410156, + "step": 318 + }, + { + "epoch": 0.05829145728643216, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.620859307187339e-06, + "logits/chosen": -138499696.0, + "logits/rejected": -116921192.0, + "logps/chosen": -113.35636138916016, + "logps/rejected": -367.8093566894531, + "loss": 0.0362, + "rewards/chosen": 1.086212396621704, + "rewards/margins": 11.888753175735474, + "rewards/rejected": -10.80254077911377, + "step": 319 + }, + { + "epoch": 0.05847418912745546, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 8.608801140491811e-06, + "logits/chosen": -36158777.6, + "logits/rejected": 47965610.666666664, + "logps/chosen": -219.530908203125, + "logps/rejected": -447.3733723958333, + "loss": 0.0424, + "rewards/chosen": 0.5236353397369384, + "rewards/margins": 11.095134210586547, + "rewards/rejected": -10.57149887084961, + "step": 320 + }, + { + "epoch": 0.05865692096847876, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.596699001693257e-06, + "logits/chosen": -44287144.0, + "logits/rejected": -42246620.8, + "logps/chosen": -213.42610677083334, + "logps/rejected": -435.51728515625, + "loss": 0.042, + "rewards/chosen": 1.6573669115702312, + "rewards/margins": 13.773547331492106, + "rewards/rejected": -12.116180419921875, + "step": 321 + }, + { + "epoch": 0.058839652809502055, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 8.584553038252415e-06, + "logits/chosen": -73989653.33333333, + "logits/rejected": -44242947.2, + "logps/chosen": -174.76607259114584, + "logps/rejected": -397.1581787109375, + "loss": 0.0264, + "rewards/chosen": 0.7595343589782715, + "rewards/margins": 12.905760860443115, + "rewards/rejected": -12.146226501464843, + "step": 322 + }, + { + "epoch": 0.05902238465052535, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 8.572363398164017e-06, + "logits/chosen": -125968028.44444445, + "logits/rejected": -122703268.57142857, + "logps/chosen": -254.62044270833334, + "logps/rejected": -360.49410574776783, + "loss": 0.0392, + "rewards/chosen": 1.2739378611246746, + "rewards/margins": 11.835951759701683, + "rewards/rejected": -10.562013898577009, + "step": 323 + }, + { + "epoch": 0.05920511649154865, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.560130229954985e-06, + "logits/chosen": -18013116.0, + "logits/rejected": -30374224.0, + "logps/chosen": -218.6776580810547, + "logps/rejected": -416.9656982421875, + "loss": 0.0264, + "rewards/chosen": 1.7907061576843262, + "rewards/margins": 13.25577974319458, + "rewards/rejected": -11.465073585510254, + "step": 324 + }, + { + "epoch": 0.05938784833257195, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.547853682682605e-06, + "logits/chosen": -133148918.85714285, + "logits/rejected": -60798791.11111111, + "logps/chosen": -216.43931361607142, + "logps/rejected": -338.73711480034723, + "loss": 0.0269, + "rewards/chosen": 1.2511934552873885, + "rewards/margins": 12.167040385897199, + "rewards/rejected": -10.91584693060981, + "step": 325 + }, + { + "epoch": 0.05957058017359525, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.535533905932739e-06, + "logits/chosen": 14983939.555555556, + "logits/rejected": 32685929.14285714, + "logps/chosen": -198.26848687065973, + "logps/rejected": -438.92295619419644, + "loss": 0.032, + "rewards/chosen": 1.695103645324707, + "rewards/margins": 11.822269303458077, + "rewards/rejected": -10.12716565813337, + "step": 326 + }, + { + "epoch": 0.059753312014618545, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 8.523171049817974e-06, + "logits/chosen": -74358624.0, + "logits/rejected": -18337762.666666668, + "logps/chosen": -232.6323974609375, + "logps/rejected": -385.8327229817708, + "loss": 0.0553, + "rewards/chosen": 1.3013063430786134, + "rewards/margins": 12.65818780263265, + "rewards/rejected": -11.356881459554037, + "step": 327 + }, + { + "epoch": 0.05993604385564184, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.510765264975813e-06, + "logits/chosen": -138502456.8888889, + "logits/rejected": -126780598.85714285, + "logps/chosen": -279.53862847222223, + "logps/rejected": -487.79073660714283, + "loss": 0.0516, + "rewards/chosen": 0.2787634531656901, + "rewards/margins": 12.718394234066917, + "rewards/rejected": -12.439630780901227, + "step": 328 + }, + { + "epoch": 0.06011877569666514, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 8.498316702566828e-06, + "logits/chosen": -19058664.0, + "logits/rejected": -42012457.6, + "logps/chosen": -262.98350016276044, + "logps/rejected": -508.05322265625, + "loss": 0.0482, + "rewards/chosen": -0.11862233281135559, + "rewards/margins": 9.407020062208176, + "rewards/rejected": -9.525642395019531, + "step": 329 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 8.485825514272824e-06, + "logits/chosen": -64936187.428571425, + "logits/rejected": -207278876.44444445, + "logps/chosen": -207.05552455357142, + "logps/rejected": -527.134765625, + "loss": 0.0225, + "rewards/chosen": 1.627145222255162, + "rewards/margins": 17.613122395106725, + "rewards/rejected": -15.985977172851562, + "step": 330 + }, + { + "epoch": 0.06048423937871174, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.473291852294986e-06, + "logits/chosen": -101705179.42857143, + "logits/rejected": -55496846.222222224, + "logps/chosen": -128.1177978515625, + "logps/rejected": -483.25732421875, + "loss": 0.0282, + "rewards/chosen": 1.3494607380458288, + "rewards/margins": 13.600187225947305, + "rewards/rejected": -12.250726487901476, + "step": 331 + }, + { + "epoch": 0.060666971219735036, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.460715869352035e-06, + "logits/chosen": -66992256.0, + "logits/rejected": -21118346.181818184, + "logps/chosen": -269.745751953125, + "logps/rejected": -341.76713423295456, + "loss": 0.0188, + "rewards/chosen": 1.23125, + "rewards/margins": 13.62864990234375, + "rewards/rejected": -12.39739990234375, + "step": 332 + }, + { + "epoch": 0.060849703060758334, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 8.44809771867835e-06, + "logits/chosen": -47668526.222222224, + "logits/rejected": -9038338.285714285, + "logps/chosen": -197.35389539930554, + "logps/rejected": -385.01273018973217, + "loss": 0.0478, + "rewards/chosen": 1.049852795071072, + "rewards/margins": 11.299027730548191, + "rewards/rejected": -10.24917493547712, + "step": 333 + }, + { + "epoch": 0.06103243490178163, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 8.435437554022116e-06, + "logits/chosen": -83153720.8888889, + "logits/rejected": 803866.2857142857, + "logps/chosen": -249.07286241319446, + "logps/rejected": -594.7562081473214, + "loss": 0.065, + "rewards/chosen": 0.12942708863152397, + "rewards/margins": 11.326624049080742, + "rewards/rejected": -11.197196960449219, + "step": 334 + }, + { + "epoch": 0.06121516674280494, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 8.422735529643445e-06, + "logits/chosen": -46000789.333333336, + "logits/rejected": 16323635.2, + "logps/chosen": -235.01456705729166, + "logps/rejected": -598.388720703125, + "loss": 0.0398, + "rewards/chosen": 0.035542512933413185, + "rewards/margins": 14.70176260570685, + "rewards/rejected": -14.666220092773438, + "step": 335 + }, + { + "epoch": 0.061397898583828235, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.409991800312493e-06, + "logits/chosen": 66735532.8, + "logits/rejected": 40077882.666666664, + "logps/chosen": -273.4470947265625, + "logps/rejected": -429.0470784505208, + "loss": 0.0419, + "rewards/chosen": 1.177035140991211, + "rewards/margins": 14.788421758015952, + "rewards/rejected": -13.61138661702474, + "step": 336 + }, + { + "epoch": 0.06158063042485153, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 8.397206521307584e-06, + "logits/chosen": -43564088.0, + "logits/rejected": -26861510.0, + "logps/chosen": -357.5474853515625, + "logps/rejected": -474.4355773925781, + "loss": 0.0394, + "rewards/chosen": 0.7491832971572876, + "rewards/margins": 12.66252863407135, + "rewards/rejected": -11.913345336914062, + "step": 337 + }, + { + "epoch": 0.06176336226587483, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 8.384379848413304e-06, + "logits/chosen": -45108672.0, + "logits/rejected": -31249019.42857143, + "logps/chosen": -226.63629828559027, + "logps/rejected": -421.697021484375, + "loss": 0.04, + "rewards/chosen": 1.4946532779269748, + "rewards/margins": 12.545347334846618, + "rewards/rejected": -11.050694056919642, + "step": 338 + }, + { + "epoch": 0.06194609410689813, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 8.371511937918616e-06, + "logits/chosen": -46862710.85714286, + "logits/rejected": -30406058.666666668, + "logps/chosen": -248.88194056919642, + "logps/rejected": -653.6600477430555, + "loss": 0.0443, + "rewards/chosen": 0.021342907633100237, + "rewards/margins": 12.535415749701242, + "rewards/rejected": -12.514072842068142, + "step": 339 + }, + { + "epoch": 0.06212882594792143, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 8.358602946614952e-06, + "logits/chosen": -17690390.0, + "logits/rejected": 6158956.0, + "logps/chosen": -216.46292114257812, + "logps/rejected": -511.8522644042969, + "loss": 0.0423, + "rewards/chosen": 0.5541330575942993, + "rewards/margins": 13.280667901039124, + "rewards/rejected": -12.726534843444824, + "step": 340 + }, + { + "epoch": 0.062311557788944726, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.345653031794292e-06, + "logits/chosen": 16472808.0, + "logits/rejected": 5891974.0, + "logps/chosen": -158.94493408203124, + "logps/rejected": -368.4425455729167, + "loss": 0.0297, + "rewards/chosen": 1.693167495727539, + "rewards/margins": 11.475193150838217, + "rewards/rejected": -9.782025655110678, + "step": 341 + }, + { + "epoch": 0.062494289629968024, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.332662351247262e-06, + "logits/chosen": 12871673.142857144, + "logits/rejected": -33597134.222222224, + "logps/chosen": -289.99058314732144, + "logps/rejected": -610.376953125, + "loss": 0.0299, + "rewards/chosen": 1.2388297489711217, + "rewards/margins": 12.812209265572685, + "rewards/rejected": -11.573379516601562, + "step": 342 + }, + { + "epoch": 0.06267702147099131, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 8.319631063261209e-06, + "logits/chosen": -21303202.666666668, + "logits/rejected": 2513922.4, + "logps/chosen": -257.3687744140625, + "logps/rejected": -508.0580078125, + "loss": 0.0186, + "rewards/chosen": 1.651381492614746, + "rewards/margins": 12.806236839294433, + "rewards/rejected": -11.154855346679687, + "step": 343 + }, + { + "epoch": 0.06285975331201461, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 8.30655932661826e-06, + "logits/chosen": -74039506.28571428, + "logits/rejected": -48710720.0, + "logps/chosen": -165.63560267857142, + "logps/rejected": -538.8254123263889, + "loss": 0.022, + "rewards/chosen": 1.7619686126708984, + "rewards/margins": 12.278905868530273, + "rewards/rejected": -10.516937255859375, + "step": 344 + }, + { + "epoch": 0.06304248515303791, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 8.293447300593402e-06, + "logits/chosen": -133070008.8888889, + "logits/rejected": -155108388.57142857, + "logps/chosen": -285.33409288194446, + "logps/rejected": -436.99550083705356, + "loss": 0.0462, + "rewards/chosen": 0.7844548755221896, + "rewards/margins": 13.179999745081341, + "rewards/rejected": -12.395544869559151, + "step": 345 + }, + { + "epoch": 0.06322521699406121, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 8.280295144952537e-06, + "logits/chosen": -41004404.36363637, + "logits/rejected": -15196275.2, + "logps/chosen": -220.23268821022728, + "logps/rejected": -366.8669677734375, + "loss": 0.0338, + "rewards/chosen": 2.144561594182795, + "rewards/margins": 11.085423868352716, + "rewards/rejected": -8.940862274169922, + "step": 346 + }, + { + "epoch": 0.06340794883508451, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 8.267103019950529e-06, + "logits/chosen": -77524448.0, + "logits/rejected": -58436652.0, + "logps/chosen": -183.64279174804688, + "logps/rejected": -455.103515625, + "loss": 0.0375, + "rewards/chosen": 2.177523136138916, + "rewards/margins": 10.126269817352295, + "rewards/rejected": -7.948746681213379, + "step": 347 + }, + { + "epoch": 0.0635906806761078, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.253871086329255e-06, + "logits/chosen": -44199406.222222224, + "logits/rejected": 8554430.857142856, + "logps/chosen": -317.05088975694446, + "logps/rejected": -506.7123325892857, + "loss": 0.0375, + "rewards/chosen": 0.8599002096388075, + "rewards/margins": 15.298263421134344, + "rewards/rejected": -14.438363211495536, + "step": 348 + }, + { + "epoch": 0.0637734125171311, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 8.240599505315656e-06, + "logits/chosen": -48497043.2, + "logits/rejected": -56785821.09090909, + "logps/chosen": -285.7568359375, + "logps/rejected": -571.4072265625, + "loss": 0.0263, + "rewards/chosen": 1.4963226318359375, + "rewards/margins": 12.921066977761008, + "rewards/rejected": -11.42474434592507, + "step": 349 + }, + { + "epoch": 0.06395614435815442, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 8.227288438619754e-06, + "logits/chosen": -58797692.0, + "logits/rejected": -35194364.0, + "logps/chosen": -296.17431640625, + "logps/rejected": -509.6337585449219, + "loss": 0.0398, + "rewards/chosen": 1.1789729595184326, + "rewards/margins": 13.882867574691772, + "rewards/rejected": -12.70389461517334, + "step": 350 + }, + { + "epoch": 0.06413887619917771, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.213938048432697e-06, + "logits/chosen": -70033415.1111111, + "logits/rejected": -50845682.28571428, + "logps/chosen": -215.29462348090277, + "logps/rejected": -443.85581752232144, + "loss": 0.043, + "rewards/chosen": 0.6030304696824815, + "rewards/margins": 12.436227927132258, + "rewards/rejected": -11.833197457449776, + "step": 351 + }, + { + "epoch": 0.06432160804020101, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 8.200548497424779e-06, + "logits/chosen": -35725252.571428575, + "logits/rejected": -27842556.444444444, + "logps/chosen": -199.85069056919642, + "logps/rejected": -412.5703125, + "loss": 0.0323, + "rewards/chosen": 1.837517329624721, + "rewards/margins": 11.918416038392081, + "rewards/rejected": -10.08089870876736, + "step": 352 + }, + { + "epoch": 0.06450433988122431, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 8.18711994874345e-06, + "logits/chosen": -165317540.57142857, + "logits/rejected": -96954261.33333333, + "logps/chosen": -201.77158900669642, + "logps/rejected": -543.52734375, + "loss": 0.035, + "rewards/chosen": 0.6494541849408831, + "rewards/margins": 12.155392397017705, + "rewards/rejected": -11.505938212076822, + "step": 353 + }, + { + "epoch": 0.06468707172224761, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 8.173652566011339e-06, + "logits/chosen": -77931683.55555555, + "logits/rejected": -5185266.285714285, + "logps/chosen": -198.51547580295139, + "logps/rejected": -304.72537667410717, + "loss": 0.0682, + "rewards/chosen": 0.3676435682508681, + "rewards/margins": 7.163562714107453, + "rewards/rejected": -6.795919145856585, + "step": 354 + }, + { + "epoch": 0.0648698035632709, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 8.160146513324256e-06, + "logits/chosen": -59384179.2, + "logits/rejected": -10293804.0, + "logps/chosen": -284.9318359375, + "logps/rejected": -178.7647501627604, + "loss": 0.073, + "rewards/chosen": 1.3017024040222167, + "rewards/margins": 8.864098771413166, + "rewards/rejected": -7.56239636739095, + "step": 355 + }, + { + "epoch": 0.0650525354042942, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.146601955249187e-06, + "logits/chosen": -89063964.44444445, + "logits/rejected": -29960187.42857143, + "logps/chosen": -216.71790907118054, + "logps/rejected": -369.10609654017856, + "loss": 0.0514, + "rewards/chosen": 0.8418865733676486, + "rewards/margins": 9.431157573821054, + "rewards/rejected": -8.589271000453405, + "step": 356 + }, + { + "epoch": 0.0652352672453175, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 8.133019056822303e-06, + "logits/chosen": -97224454.4, + "logits/rejected": -62510810.666666664, + "logps/chosen": -256.4796875, + "logps/rejected": -496.2582194010417, + "loss": 0.0486, + "rewards/chosen": 0.7638035774230957, + "rewards/margins": 14.353733793894449, + "rewards/rejected": -13.589930216471354, + "step": 357 + }, + { + "epoch": 0.0654179990863408, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 8.119397983546932e-06, + "logits/chosen": -64169152.0, + "logits/rejected": -183588352.0, + "logps/chosen": -292.707763671875, + "logps/rejected": -502.04319069602275, + "loss": 0.0255, + "rewards/chosen": 0.3894253492355347, + "rewards/margins": 11.015288857980208, + "rewards/rejected": -10.625863508744674, + "step": 358 + }, + { + "epoch": 0.0656007309273641, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 8.105738901391553e-06, + "logits/chosen": -67362318.22222222, + "logits/rejected": -60729965.71428572, + "logps/chosen": -187.96652560763889, + "logps/rejected": -468.2021484375, + "loss": 0.028, + "rewards/chosen": 2.5253929562038846, + "rewards/margins": 15.233681966388035, + "rewards/rejected": -12.708289010184151, + "step": 359 + }, + { + "epoch": 0.0657834627683874, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 8.092041976787772e-06, + "logits/chosen": -50474544.0, + "logits/rejected": -63906770.28571428, + "logps/chosen": -278.58709716796875, + "logps/rejected": -455.01077706473217, + "loss": 0.013, + "rewards/chosen": 0.050177767872810364, + "rewards/margins": 12.15738710973944, + "rewards/rejected": -12.10720934186663, + "step": 360 + }, + { + "epoch": 0.0659661946094107, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 8.078307376628292e-06, + "logits/chosen": -56716601.6, + "logits/rejected": -30380052.363636363, + "logps/chosen": -241.0656005859375, + "logps/rejected": -486.0894886363636, + "loss": 0.0188, + "rewards/chosen": 1.7265125274658204, + "rewards/margins": 14.0214468869296, + "rewards/rejected": -12.29493435946378, + "step": 361 + }, + { + "epoch": 0.06614892645043399, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 8.064535268264883e-06, + "logits/chosen": -21322376.0, + "logits/rejected": -74807797.33333333, + "logps/chosen": -244.708349609375, + "logps/rejected": -394.6968587239583, + "loss": 0.0521, + "rewards/chosen": 0.89572114944458, + "rewards/margins": 12.37246675491333, + "rewards/rejected": -11.47674560546875, + "step": 362 + }, + { + "epoch": 0.06633165829145729, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.05072581950634e-06, + "logits/chosen": -72504384.0, + "logits/rejected": 74914480.0, + "logps/chosen": -311.6741943359375, + "logps/rejected": -640.3841959635416, + "loss": 0.0566, + "rewards/chosen": 0.3402609348297119, + "rewards/margins": 17.138299417495727, + "rewards/rejected": -16.798038482666016, + "step": 363 + }, + { + "epoch": 0.06651439013248059, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 8.036879198616434e-06, + "logits/chosen": -46542610.28571428, + "logits/rejected": -77058951.1111111, + "logps/chosen": -211.95305524553572, + "logps/rejected": -568.7100694444445, + "loss": 0.0267, + "rewards/chosen": 1.661919321332659, + "rewards/margins": 18.31118757762606, + "rewards/rejected": -16.649268256293404, + "step": 364 + }, + { + "epoch": 0.06669712197350389, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.022995574311876e-06, + "logits/chosen": -51083324.44444445, + "logits/rejected": -13117715.42857143, + "logps/chosen": -217.79253472222223, + "logps/rejected": -481.7855747767857, + "loss": 0.0437, + "rewards/chosen": 1.6735289891560872, + "rewards/margins": 15.021817388988676, + "rewards/rejected": -13.348288399832589, + "step": 365 + }, + { + "epoch": 0.06687985381452718, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.009075115760243e-06, + "logits/chosen": -39776113.777777776, + "logits/rejected": 9467402.285714285, + "logps/chosen": -212.68570963541666, + "logps/rejected": -425.75816127232144, + "loss": 0.0488, + "rewards/chosen": 0.7361158794826932, + "rewards/margins": 11.044884015643406, + "rewards/rejected": -10.308768136160714, + "step": 366 + }, + { + "epoch": 0.06706258565555048, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 7.99511799257793e-06, + "logits/chosen": -76842752.0, + "logits/rejected": -31131331.555555556, + "logps/chosen": -292.47767857142856, + "logps/rejected": -652.7386067708334, + "loss": 0.0264, + "rewards/chosen": 1.2586565017700195, + "rewards/margins": 15.424479696485731, + "rewards/rejected": -14.165823194715712, + "step": 367 + }, + { + "epoch": 0.06724531749657378, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 7.981124374828079e-06, + "logits/chosen": -61144090.666666664, + "logits/rejected": -35262809.6, + "logps/chosen": -290.7404378255208, + "logps/rejected": -536.72802734375, + "loss": 0.0223, + "rewards/chosen": 1.5594415664672852, + "rewards/margins": 15.719475746154785, + "rewards/rejected": -14.1600341796875, + "step": 368 + }, + { + "epoch": 0.06742804933759708, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 7.967094433018508e-06, + "logits/chosen": 14784403.2, + "logits/rejected": -43726920.72727273, + "logps/chosen": -98.09923706054687, + "logps/rejected": -418.6541193181818, + "loss": 0.0288, + "rewards/chosen": 0.20774035453796386, + "rewards/margins": 11.665710713646629, + "rewards/rejected": -11.457970359108664, + "step": 369 + }, + { + "epoch": 0.06761078117862038, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.953028338099628e-06, + "logits/chosen": -35109209.6, + "logits/rejected": -11476520.0, + "logps/chosen": -240.82978515625, + "logps/rejected": -406.9553629557292, + "loss": 0.0484, + "rewards/chosen": 0.8630002021789551, + "rewards/margins": 11.32741543451945, + "rewards/rejected": -10.464415232340494, + "step": 370 + }, + { + "epoch": 0.06779351301964368, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 7.938926261462366e-06, + "logits/chosen": -149587712.0, + "logits/rejected": -55076787.2, + "logps/chosen": -217.29041637073863, + "logps/rejected": -423.289404296875, + "loss": 0.0748, + "rewards/chosen": -0.12218298695304176, + "rewards/margins": 14.678622677109457, + "rewards/rejected": -14.8008056640625, + "step": 371 + }, + { + "epoch": 0.06797624486066697, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 7.92478837493608e-06, + "logits/chosen": -40078340.0, + "logits/rejected": -19943968.0, + "logps/chosen": -225.93357849121094, + "logps/rejected": -429.5360412597656, + "loss": 0.0331, + "rewards/chosen": 0.9881956577301025, + "rewards/margins": 12.472563982009888, + "rewards/rejected": -11.484368324279785, + "step": 372 + }, + { + "epoch": 0.06815897670169027, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 7.910614850786448e-06, + "logits/chosen": -18450670.0, + "logits/rejected": -15264499.0, + "logps/chosen": -214.0384979248047, + "logps/rejected": -262.750732421875, + "loss": 0.0358, + "rewards/chosen": 1.1497454643249512, + "rewards/margins": 11.191657543182373, + "rewards/rejected": -10.041912078857422, + "step": 373 + }, + { + "epoch": 0.06834170854271357, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.896405861713393e-06, + "logits/chosen": -93122032.0, + "logits/rejected": -3936520.0, + "logps/chosen": -247.00672912597656, + "logps/rejected": -514.626953125, + "loss": 0.0411, + "rewards/chosen": 0.6083171963691711, + "rewards/margins": 11.572052776813507, + "rewards/rejected": -10.963735580444336, + "step": 374 + }, + { + "epoch": 0.06852444038373687, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.882161580848966e-06, + "logits/chosen": -70919720.72727273, + "logits/rejected": -155929280.0, + "logps/chosen": -237.71728515625, + "logps/rejected": -426.0515625, + "loss": 0.0483, + "rewards/chosen": 1.2175935398448596, + "rewards/margins": 10.169329990040172, + "rewards/rejected": -8.951736450195312, + "step": 375 + }, + { + "epoch": 0.06870717222476017, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 7.86788218175523e-06, + "logits/chosen": -43773061.81818182, + "logits/rejected": 37554368.0, + "logps/chosen": -233.4556551846591, + "logps/rejected": -467.760986328125, + "loss": 0.0487, + "rewards/chosen": 1.1607334830544211, + "rewards/margins": 10.138670799948953, + "rewards/rejected": -8.977937316894531, + "step": 376 + }, + { + "epoch": 0.06888990406578346, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 7.85356783842216e-06, + "logits/chosen": -123722645.33333333, + "logits/rejected": -42281142.4, + "logps/chosen": -272.8017985026042, + "logps/rejected": -523.53056640625, + "loss": 0.0259, + "rewards/chosen": 1.2028482755025227, + "rewards/margins": 14.041996224721274, + "rewards/rejected": -12.83914794921875, + "step": 377 + }, + { + "epoch": 0.06907263590680676, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 7.839218725265507e-06, + "logits/chosen": 60974149.333333336, + "logits/rejected": -1208985.6, + "logps/chosen": -146.9958699544271, + "logps/rejected": -434.390234375, + "loss": 0.0352, + "rewards/chosen": 0.8516283830006918, + "rewards/margins": 11.170337184270224, + "rewards/rejected": -10.318708801269532, + "step": 378 + }, + { + "epoch": 0.06925536774783006, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 7.82483501712469e-06, + "logits/chosen": -71067578.66666667, + "logits/rejected": -63976753.23076923, + "logps/chosen": -194.02445475260416, + "logps/rejected": -432.6696589543269, + "loss": 0.0177, + "rewards/chosen": 0.7702019214630127, + "rewards/margins": 10.289133383677555, + "rewards/rejected": -9.518931462214542, + "step": 379 + }, + { + "epoch": 0.06943809958885336, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.810416889260653e-06, + "logits/chosen": -61075918.222222224, + "logits/rejected": -40885970.28571428, + "logps/chosen": -234.44981553819446, + "logps/rejected": -309.49072265625, + "loss": 0.0565, + "rewards/chosen": 0.16602683067321777, + "rewards/margins": 9.165472064699445, + "rewards/rejected": -8.999445234026227, + "step": 380 + }, + { + "epoch": 0.06962083142987666, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 7.795964517353734e-06, + "logits/chosen": -70987690.66666667, + "logits/rejected": -41001568.0, + "logps/chosen": -327.2459716796875, + "logps/rejected": -506.3935546875, + "loss": 0.0423, + "rewards/chosen": -0.7299525737762451, + "rewards/margins": 11.60236325263977, + "rewards/rejected": -12.332315826416016, + "step": 381 + }, + { + "epoch": 0.06980356327089995, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 7.781478077501526e-06, + "logits/chosen": -18367764.0, + "logits/rejected": 42325644.0, + "logps/chosen": -234.14260864257812, + "logps/rejected": -543.7745361328125, + "loss": 0.0486, + "rewards/chosen": 0.7438967823982239, + "rewards/margins": 14.057949364185333, + "rewards/rejected": -13.31405258178711, + "step": 382 + }, + { + "epoch": 0.06998629511192325, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.76695774621672e-06, + "logits/chosen": -119156814.22222222, + "logits/rejected": -121587821.71428572, + "logps/chosen": -219.66049533420139, + "logps/rejected": -478.26353236607144, + "loss": 0.0296, + "rewards/chosen": 1.392868783738878, + "rewards/margins": 14.081193848261758, + "rewards/rejected": -12.68832506452288, + "step": 383 + }, + { + "epoch": 0.07016902695294655, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 7.752403700424978e-06, + "logits/chosen": -44450828.0, + "logits/rejected": -8296796.5, + "logps/chosen": -251.9716796875, + "logps/rejected": -603.3408203125, + "loss": 0.0338, + "rewards/chosen": 1.1179184913635254, + "rewards/margins": 15.55370569229126, + "rewards/rejected": -14.435787200927734, + "step": 384 + }, + { + "epoch": 0.07035175879396985, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 7.737816117462752e-06, + "logits/chosen": 9450889.714285715, + "logits/rejected": -27759955.555555556, + "logps/chosen": -147.85642787388392, + "logps/rejected": -515.4059244791666, + "loss": 0.0235, + "rewards/chosen": 1.1369143213544572, + "rewards/margins": 14.555136536794995, + "rewards/rejected": -13.418222215440538, + "step": 385 + }, + { + "epoch": 0.07053449063499315, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 7.723195175075136e-06, + "logits/chosen": -60220643.55555555, + "logits/rejected": 10933366.857142856, + "logps/chosen": -162.33629014756946, + "logps/rejected": -558.8653390066964, + "loss": 0.0535, + "rewards/chosen": 0.170175658331977, + "rewards/margins": 11.135260279216464, + "rewards/rejected": -10.965084620884486, + "step": 386 + }, + { + "epoch": 0.07071722247601644, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 7.7085410514137e-06, + "logits/chosen": -72859912.0, + "logits/rejected": -35640576.0, + "logps/chosen": -345.8438720703125, + "logps/rejected": -435.84124755859375, + "loss": 0.0442, + "rewards/chosen": 0.24658547341823578, + "rewards/margins": 13.59377346932888, + "rewards/rejected": -13.347187995910645, + "step": 387 + }, + { + "epoch": 0.07089995431703974, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 7.693853925034316e-06, + "logits/chosen": -62340416.0, + "logits/rejected": -20992480.0, + "logps/chosen": -217.3953857421875, + "logps/rejected": -509.2806396484375, + "loss": 0.0337, + "rewards/chosen": 1.243261456489563, + "rewards/margins": 13.978176236152649, + "rewards/rejected": -12.734914779663086, + "step": 388 + }, + { + "epoch": 0.07108268615806304, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 7.679133974894984e-06, + "logits/chosen": -50497051.428571425, + "logits/rejected": 6602514.666666667, + "logps/chosen": -255.13581194196428, + "logps/rejected": -576.6335720486111, + "loss": 0.0206, + "rewards/chosen": 2.6136986868722096, + "rewards/margins": 19.653802175370473, + "rewards/rejected": -17.040103488498264, + "step": 389 + }, + { + "epoch": 0.07126541799908634, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.66438138035365e-06, + "logits/chosen": -190684864.0, + "logits/rejected": -242908032.0, + "logps/chosen": -298.20855712890625, + "logps/rejected": -627.957275390625, + "loss": 0.0498, + "rewards/chosen": 0.06791170686483383, + "rewards/margins": 13.434807382524014, + "rewards/rejected": -13.36689567565918, + "step": 390 + }, + { + "epoch": 0.07144814984010964, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 7.649596321166024e-06, + "logits/chosen": -55817013.333333336, + "logits/rejected": -22611586.0, + "logps/chosen": -190.51517740885416, + "logps/rejected": -375.03887939453125, + "loss": 0.039, + "rewards/chosen": 1.8578041394551594, + "rewards/margins": 15.79716189702352, + "rewards/rejected": -13.93935775756836, + "step": 391 + }, + { + "epoch": 0.07163088168113294, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.634778977483389e-06, + "logits/chosen": -90986581.33333333, + "logits/rejected": -31259430.4, + "logps/chosen": -275.0701497395833, + "logps/rejected": -368.128759765625, + "loss": 0.0411, + "rewards/chosen": 0.7030970255533854, + "rewards/margins": 10.086847178141275, + "rewards/rejected": -9.38375015258789, + "step": 392 + }, + { + "epoch": 0.07181361352215623, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 7.619929529850397e-06, + "logits/chosen": 13157349.714285715, + "logits/rejected": -1835771.0, + "logps/chosen": -198.23005022321428, + "logps/rejected": -520.1720920138889, + "loss": 0.0396, + "rewards/chosen": 0.9068944113595145, + "rewards/margins": 12.72599722847106, + "rewards/rejected": -11.819102817111546, + "step": 393 + }, + { + "epoch": 0.07199634536317953, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 7.605048159202884e-06, + "logits/chosen": -31296925.333333332, + "logits/rejected": -78454617.6, + "logps/chosen": -375.2389322916667, + "logps/rejected": -485.16123046875, + "loss": 0.0521, + "rewards/chosen": -0.8901504675547282, + "rewards/margins": 11.830606063206991, + "rewards/rejected": -12.720756530761719, + "step": 394 + }, + { + "epoch": 0.07217907720420283, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.590135046865652e-06, + "logits/chosen": -108563477.33333333, + "logits/rejected": -68586427.42857143, + "logps/chosen": -278.0344509548611, + "logps/rejected": -553.8449358258929, + "loss": 0.0357, + "rewards/chosen": 1.3646124733818903, + "rewards/margins": 14.146255932157, + "rewards/rejected": -12.781643458775111, + "step": 395 + }, + { + "epoch": 0.07236180904522613, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 7.575190374550272e-06, + "logits/chosen": -162502599.1111111, + "logits/rejected": -37334352.0, + "logps/chosen": -204.28352864583334, + "logps/rejected": -481.64017159598217, + "loss": 0.0271, + "rewards/chosen": 1.8589161766899958, + "rewards/margins": 15.367171181572807, + "rewards/rejected": -13.508255004882812, + "step": 396 + }, + { + "epoch": 0.07254454088624943, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 7.560214324352858e-06, + "logits/chosen": -111806720.0, + "logits/rejected": -52754420.36363637, + "logps/chosen": -285.5540283203125, + "logps/rejected": -441.89999112215907, + "loss": 0.0298, + "rewards/chosen": 0.26764068603515623, + "rewards/margins": 11.123491599343039, + "rewards/rejected": -10.855850913307883, + "step": 397 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 7.545207078751858e-06, + "logits/chosen": -94498264.0, + "logits/rejected": -57021904.0, + "logps/chosen": -364.521484375, + "logps/rejected": -525.9183959960938, + "loss": 0.049, + "rewards/chosen": 0.028162017464637756, + "rewards/margins": 11.592174544930458, + "rewards/rejected": -11.56401252746582, + "step": 398 + }, + { + "epoch": 0.07291000456829602, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.530168820605819e-06, + "logits/chosen": -77821368.8888889, + "logits/rejected": -39767113.14285714, + "logps/chosen": -241.04237196180554, + "logps/rejected": -497.4868861607143, + "loss": 0.0396, + "rewards/chosen": 0.6675925254821777, + "rewards/margins": 14.284975119999476, + "rewards/rejected": -13.617382594517299, + "step": 399 + }, + { + "epoch": 0.07309273640931932, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 7.515099733151177e-06, + "logits/chosen": -94320640.0, + "logits/rejected": -42024888.0, + "logps/chosen": -264.4732971191406, + "logps/rejected": -388.82257080078125, + "loss": 0.0471, + "rewards/chosen": 0.22554072737693787, + "rewards/margins": 13.091660112142563, + "rewards/rejected": -12.866119384765625, + "step": 400 + }, + { + "epoch": 0.07327546825034262, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": -36457171.2, + "logits/rejected": -68266309.33333333, + "logps/chosen": -245.6039794921875, + "logps/rejected": -526.6138102213541, + "loss": 0.0377, + "rewards/chosen": 1.299946403503418, + "rewards/margins": 12.343027051289877, + "rewards/rejected": -11.043080647786459, + "step": 401 + }, + { + "epoch": 0.07345820009136592, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.484869805137778e-06, + "logits/chosen": -43540258.666666664, + "logits/rejected": -9836552.0, + "logps/chosen": -300.51641845703125, + "logps/rejected": -686.84296875, + "loss": 0.0324, + "rewards/chosen": 0.2911674578984578, + "rewards/margins": 13.605121620496115, + "rewards/rejected": -13.313954162597657, + "step": 402 + }, + { + "epoch": 0.07364093193238921, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.469709332921155e-06, + "logits/chosen": -91083914.66666667, + "logits/rejected": -65566592.0, + "logps/chosen": -264.19488525390625, + "logps/rejected": -436.5255859375, + "loss": 0.0226, + "rewards/chosen": 1.0279507637023926, + "rewards/margins": 9.780662250518798, + "rewards/rejected": -8.752711486816406, + "step": 403 + }, + { + "epoch": 0.07382366377341251, + "grad_norm": 10.625, + "kl": 0.26771068572998047, + "learning_rate": 7.454518768075705e-06, + "logits/chosen": -97050986.66666667, + "logits/rejected": -33177654.4, + "logps/chosen": -288.5988362630208, + "logps/rejected": -412.823388671875, + "loss": 0.0411, + "rewards/chosen": 0.015069074928760529, + "rewards/margins": 10.984572859108448, + "rewards/rejected": -10.969503784179688, + "step": 404 + }, + { + "epoch": 0.07400639561443581, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.4392982956936644e-06, + "logits/chosen": -78914968.0, + "logits/rejected": -54663304.0, + "logps/chosen": -187.2249755859375, + "logps/rejected": -579.9502563476562, + "loss": 0.0403, + "rewards/chosen": 1.049789309501648, + "rewards/margins": 13.766927599906921, + "rewards/rejected": -12.717138290405273, + "step": 405 + }, + { + "epoch": 0.07418912745545911, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 7.424048101231687e-06, + "logits/chosen": -23678616.0, + "logits/rejected": 24980104.0, + "logps/chosen": -245.5043487548828, + "logps/rejected": -337.1605224609375, + "loss": 0.0367, + "rewards/chosen": 0.7058590650558472, + "rewards/margins": 11.538375735282898, + "rewards/rejected": -10.83251667022705, + "step": 406 + }, + { + "epoch": 0.0743718592964824, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.408768370508577e-06, + "logits/chosen": -60922944.0, + "logits/rejected": -22467013.333333332, + "logps/chosen": -227.4669189453125, + "logps/rejected": -582.2315266927084, + "loss": 0.0351, + "rewards/chosen": 1.9419073104858398, + "rewards/margins": 12.538298352559408, + "rewards/rejected": -10.596391042073568, + "step": 407 + }, + { + "epoch": 0.0745545911375057, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 7.393459289703035e-06, + "logits/chosen": 10674114.285714285, + "logits/rejected": -84974400.0, + "logps/chosen": -230.41643415178572, + "logps/rejected": -521.3875325520834, + "loss": 0.0311, + "rewards/chosen": 0.8646859441484723, + "rewards/margins": 14.480078841012622, + "rewards/rejected": -13.615392896864149, + "step": 408 + }, + { + "epoch": 0.074737322978529, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.378121045351378e-06, + "logits/chosen": 2895806.6, + "logits/rejected": 54952298.666666664, + "logps/chosen": -222.2055419921875, + "logps/rejected": -568.8627522786459, + "loss": 0.0403, + "rewards/chosen": 1.5950239181518555, + "rewards/margins": 11.457647768656413, + "rewards/rejected": -9.862623850504557, + "step": 409 + }, + { + "epoch": 0.0749200548195523, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 7.362753824345271e-06, + "logits/chosen": -39853458.666666664, + "logits/rejected": -94641676.8, + "logps/chosen": -153.25489298502603, + "logps/rejected": -378.10888671875, + "loss": 0.0266, + "rewards/chosen": 1.2830846309661865, + "rewards/margins": 10.809566259384155, + "rewards/rejected": -9.526481628417969, + "step": 410 + }, + { + "epoch": 0.0751027866605756, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 7.347357813929455e-06, + "logits/chosen": -72276208.0, + "logits/rejected": -96021836.8, + "logps/chosen": -129.77182006835938, + "logps/rejected": -444.010302734375, + "loss": 0.02, + "rewards/chosen": 2.564131259918213, + "rewards/margins": 14.537728977203368, + "rewards/rejected": -11.973597717285156, + "step": 411 + }, + { + "epoch": 0.0752855185015989, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 7.3319332016994575e-06, + "logits/chosen": -160134156.8, + "logits/rejected": -97629998.54545455, + "logps/chosen": -236.916455078125, + "logps/rejected": -552.8049982244319, + "loss": 0.021, + "rewards/chosen": 0.812930965423584, + "rewards/margins": 15.234584019400858, + "rewards/rejected": -14.421653053977273, + "step": 412 + }, + { + "epoch": 0.0754682503426222, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 7.31648017559931e-06, + "logits/chosen": -82810097.77777778, + "logits/rejected": 34241620.571428575, + "logps/chosen": -233.56111653645834, + "logps/rejected": -361.92599051339283, + "loss": 0.0338, + "rewards/chosen": 1.4417814678615994, + "rewards/margins": 12.383415509784031, + "rewards/rejected": -10.941634041922432, + "step": 413 + }, + { + "epoch": 0.07565098218364551, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 7.300998923919259e-06, + "logits/chosen": -34861556.0, + "logits/rejected": -64108640.0, + "logps/chosen": -228.0576171875, + "logps/rejected": -379.6484680175781, + "loss": 0.0432, + "rewards/chosen": 0.6660580635070801, + "rewards/margins": 10.616549968719482, + "rewards/rejected": -9.950491905212402, + "step": 414 + }, + { + "epoch": 0.0758337140246688, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.285489635293472e-06, + "logits/chosen": -82214054.4, + "logits/rejected": -15085814.666666666, + "logps/chosen": -287.433349609375, + "logps/rejected": -275.35032145182294, + "loss": 0.0597, + "rewards/chosen": 0.28805832862854003, + "rewards/margins": 8.6616570631663, + "rewards/rejected": -8.37359873453776, + "step": 415 + }, + { + "epoch": 0.0760164458656921, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 7.269952498697734e-06, + "logits/chosen": -9881969.6, + "logits/rejected": -20143349.333333332, + "logps/chosen": -139.11033935546874, + "logps/rejected": -465.960693359375, + "loss": 0.0308, + "rewards/chosen": 1.9502758026123046, + "rewards/margins": 12.47593027750651, + "rewards/rejected": -10.525654474894205, + "step": 416 + }, + { + "epoch": 0.0761991777067154, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 7.254387703447154e-06, + "logits/chosen": -126836852.36363636, + "logits/rejected": -93205926.4, + "logps/chosen": -270.8219105113636, + "logps/rejected": -338.2210205078125, + "loss": 0.0618, + "rewards/chosen": 0.3188995664769953, + "rewards/margins": 9.963188201730901, + "rewards/rejected": -9.644288635253906, + "step": 417 + }, + { + "epoch": 0.0763819095477387, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 7.238795439193849e-06, + "logits/chosen": -3550291.5555555555, + "logits/rejected": 6185977.142857143, + "logps/chosen": -245.61900499131946, + "logps/rejected": -394.608154296875, + "loss": 0.0509, + "rewards/chosen": 0.580921490987142, + "rewards/margins": 8.878810655503047, + "rewards/rejected": -8.297889164515905, + "step": 418 + }, + { + "epoch": 0.076564641388762, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.223175895924638e-06, + "logits/chosen": -56438857.14285714, + "logits/rejected": -33450481.777777776, + "logps/chosen": -308.87451171875, + "logps/rejected": -445.68853081597223, + "loss": 0.0433, + "rewards/chosen": 0.19282269477844238, + "rewards/margins": 14.204622824986776, + "rewards/rejected": -14.011800130208334, + "step": 419 + }, + { + "epoch": 0.0767473732297853, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 7.207529263958727e-06, + "logits/chosen": -154728832.0, + "logits/rejected": -142586060.8, + "logps/chosen": -278.9648844401042, + "logps/rejected": -372.85126953125, + "loss": 0.0284, + "rewards/chosen": 0.7464548746744791, + "rewards/margins": 11.833902994791666, + "rewards/rejected": -11.087448120117188, + "step": 420 + }, + { + "epoch": 0.0769301050708086, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 7.191855733945388e-06, + "logits/chosen": -14440312.0, + "logits/rejected": -39770304.0, + "logps/chosen": -231.03213500976562, + "logps/rejected": -578.7193603515625, + "loss": 0.0483, + "rewards/chosen": 0.08285938203334808, + "rewards/margins": 14.094640120863914, + "rewards/rejected": -14.011780738830566, + "step": 421 + }, + { + "epoch": 0.07711283691183189, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.176155496861639e-06, + "logits/chosen": -62813125.81818182, + "logits/rejected": 2805134.4, + "logps/chosen": -223.60329367897728, + "logps/rejected": -485.805810546875, + "loss": 0.0472, + "rewards/chosen": 1.3051502054387873, + "rewards/margins": 11.95701635534113, + "rewards/rejected": -10.651866149902343, + "step": 422 + }, + { + "epoch": 0.07729556875285519, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.160428744009913e-06, + "logits/chosen": -88734089.14285715, + "logits/rejected": -140670563.55555555, + "logps/chosen": -222.92782156808036, + "logps/rejected": -443.15391710069446, + "loss": 0.0385, + "rewards/chosen": 1.0860251017979212, + "rewards/margins": 11.468242509024483, + "rewards/rejected": -10.382217407226562, + "step": 423 + }, + { + "epoch": 0.07747830059387849, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 7.1446756670157306e-06, + "logits/chosen": -27727833.6, + "logits/rejected": -74100288.0, + "logps/chosen": -198.3269287109375, + "logps/rejected": -458.1758626302083, + "loss": 0.0386, + "rewards/chosen": 1.291013813018799, + "rewards/margins": 14.575029468536377, + "rewards/rejected": -13.284015655517578, + "step": 424 + }, + { + "epoch": 0.07766103243490179, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 7.128896457825364e-06, + "logits/chosen": -154468242.2857143, + "logits/rejected": -98447168.0, + "logps/chosen": -120.53294154575893, + "logps/rejected": -474.3444552951389, + "loss": 0.0109, + "rewards/chosen": 2.952259063720703, + "rewards/margins": 14.476668039957682, + "rewards/rejected": -11.524408976236979, + "step": 425 + }, + { + "epoch": 0.07784376427592508, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 7.113091308703498e-06, + "logits/chosen": -83701210.66666667, + "logits/rejected": 31950787.2, + "logps/chosen": -291.33697509765625, + "logps/rejected": -306.934716796875, + "loss": 0.0405, + "rewards/chosen": 1.0825060208638508, + "rewards/margins": 10.026192696889241, + "rewards/rejected": -8.94368667602539, + "step": 426 + }, + { + "epoch": 0.07802649611694838, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.0972604122308865e-06, + "logits/chosen": -79372120.0, + "logits/rejected": -55383160.0, + "logps/chosen": -270.4396057128906, + "logps/rejected": -252.4619140625, + "loss": 0.0422, + "rewards/chosen": 0.8213726282119751, + "rewards/margins": 9.742342591285706, + "rewards/rejected": -8.92096996307373, + "step": 427 + }, + { + "epoch": 0.07820922795797168, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.081403961302007e-06, + "logits/chosen": -122043520.0, + "logits/rejected": -92989595.42857143, + "logps/chosen": -282.6222330729167, + "logps/rejected": -576.1865234375, + "loss": 0.0449, + "rewards/chosen": 1.2027452256944444, + "rewards/margins": 12.832296704489087, + "rewards/rejected": -11.629551478794642, + "step": 428 + }, + { + "epoch": 0.07839195979899498, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 7.06552214912271e-06, + "logits/chosen": -73727126.85714285, + "logits/rejected": -96668138.66666667, + "logps/chosen": -153.84184919084822, + "logps/rejected": -611.1101888020834, + "loss": 0.0297, + "rewards/chosen": 1.1989699772426061, + "rewards/margins": 13.252985242813354, + "rewards/rejected": -12.054015265570747, + "step": 429 + }, + { + "epoch": 0.07857469164001828, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 7.049615169207864e-06, + "logits/chosen": -91149184.0, + "logits/rejected": -68566458.66666667, + "logps/chosen": -265.83465576171875, + "logps/rejected": -439.7295328776042, + "loss": 0.0191, + "rewards/chosen": 1.1795868873596191, + "rewards/margins": 12.765318393707275, + "rewards/rejected": -11.585731506347656, + "step": 430 + }, + { + "epoch": 0.07875742348104157, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.033683215379002e-06, + "logits/chosen": -29693676.8, + "logits/rejected": -56539509.333333336, + "logps/chosen": -224.9722412109375, + "logps/rejected": -424.9996744791667, + "loss": 0.0267, + "rewards/chosen": 2.0156707763671875, + "rewards/margins": 15.844799041748047, + "rewards/rejected": -13.82912826538086, + "step": 431 + }, + { + "epoch": 0.07894015532206487, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.0177264817619514e-06, + "logits/chosen": -91233402.18181819, + "logits/rejected": -187793113.6, + "logps/chosen": -292.1910511363636, + "logps/rejected": -526.27744140625, + "loss": 0.0578, + "rewards/chosen": 0.5772052244706587, + "rewards/margins": 10.651784081892533, + "rewards/rejected": -10.074578857421875, + "step": 432 + }, + { + "epoch": 0.07912288716308817, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 7.0017451627844765e-06, + "logits/chosen": -66948396.8, + "logits/rejected": -67506800.0, + "logps/chosen": -351.5654052734375, + "logps/rejected": -370.5886637369792, + "loss": 0.0559, + "rewards/chosen": 0.29762959480285645, + "rewards/margins": 12.127262671788534, + "rewards/rejected": -11.829633076985678, + "step": 433 + }, + { + "epoch": 0.07930561900411147, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 6.985739453173903e-06, + "logits/chosen": -85667456.0, + "logits/rejected": -136070994.2857143, + "logps/chosen": -240.82790798611111, + "logps/rejected": -573.4099818638393, + "loss": 0.043, + "rewards/chosen": 0.7544806798299154, + "rewards/margins": 14.676615079243978, + "rewards/rejected": -13.922134399414062, + "step": 434 + }, + { + "epoch": 0.07948835084513477, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 6.9697095479547564e-06, + "logits/chosen": -76288800.0, + "logits/rejected": -160972344.8888889, + "logps/chosen": -149.95840890066964, + "logps/rejected": -658.783203125, + "loss": 0.0346, + "rewards/chosen": 0.8151374544416156, + "rewards/margins": 17.431333133152553, + "rewards/rejected": -16.616195678710938, + "step": 435 + }, + { + "epoch": 0.07967108268615807, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 6.953655642446368e-06, + "logits/chosen": -60273670.4, + "logits/rejected": -11578461.333333334, + "logps/chosen": -226.8931640625, + "logps/rejected": -486.2156575520833, + "loss": 0.0541, + "rewards/chosen": 0.918449878692627, + "rewards/margins": 14.827259540557861, + "rewards/rejected": -13.908809661865234, + "step": 436 + }, + { + "epoch": 0.07985381452718136, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 6.9375779322605154e-06, + "logits/chosen": -89261894.4, + "logits/rejected": -149209920.0, + "logps/chosen": -233.659912109375, + "logps/rejected": -445.3378092447917, + "loss": 0.0475, + "rewards/chosen": 0.7226373195648194, + "rewards/margins": 16.503304942448935, + "rewards/rejected": -15.780667622884115, + "step": 437 + }, + { + "epoch": 0.08003654636820466, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 6.921476613299018e-06, + "logits/chosen": -70054480.0, + "logits/rejected": -59867292.0, + "logps/chosen": -170.07028198242188, + "logps/rejected": -408.07147216796875, + "loss": 0.04, + "rewards/chosen": 0.8295535445213318, + "rewards/margins": 11.954587876796722, + "rewards/rejected": -11.12503433227539, + "step": 438 + }, + { + "epoch": 0.08021927820922796, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.905351881751372e-06, + "logits/chosen": -124830778.18181819, + "logits/rejected": -32075123.2, + "logps/chosen": -285.3295232599432, + "logps/rejected": -515.65244140625, + "loss": 0.0619, + "rewards/chosen": 0.2560230385173451, + "rewards/margins": 14.374959195743907, + "rewards/rejected": -14.118936157226562, + "step": 439 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 6.889203934092337e-06, + "logits/chosen": -167339168.0, + "logits/rejected": -124758224.0, + "logps/chosen": -355.1719055175781, + "logps/rejected": -454.91143798828125, + "loss": 0.0215, + "rewards/chosen": 1.6101433038711548, + "rewards/margins": 14.16809618473053, + "rewards/rejected": -12.557952880859375, + "step": 440 + }, + { + "epoch": 0.08058474189127456, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 6.873032967079562e-06, + "logits/chosen": -111029414.4, + "logits/rejected": -77542621.0909091, + "logps/chosen": -145.3523193359375, + "logps/rejected": -441.10830965909093, + "loss": 0.022, + "rewards/chosen": 1.0517406463623047, + "rewards/margins": 12.178538409146396, + "rewards/rejected": -11.126797762784092, + "step": 441 + }, + { + "epoch": 0.08076747373229785, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 6.856839177751175e-06, + "logits/chosen": -92446694.4, + "logits/rejected": -115283562.66666667, + "logps/chosen": -206.7903564453125, + "logps/rejected": -371.1798095703125, + "loss": 0.0385, + "rewards/chosen": 1.1917333602905273, + "rewards/margins": 10.818057696024576, + "rewards/rejected": -9.626324335734049, + "step": 442 + }, + { + "epoch": 0.08095020557332115, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.840622763423391e-06, + "logits/chosen": -123484231.1111111, + "logits/rejected": -48438884.571428575, + "logps/chosen": -268.0890842013889, + "logps/rejected": -426.15419224330356, + "loss": 0.0568, + "rewards/chosen": 0.07636719942092896, + "rewards/margins": 11.485784488064903, + "rewards/rejected": -11.409417288643974, + "step": 443 + }, + { + "epoch": 0.08113293741434445, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 6.824383921688098e-06, + "logits/chosen": -97661805.71428572, + "logits/rejected": 17553920.0, + "logps/chosen": -209.64156668526786, + "logps/rejected": -399.1370442708333, + "loss": 0.0462, + "rewards/chosen": -0.0034617696489606586, + "rewards/margins": 11.618603456588017, + "rewards/rejected": -11.622065226236979, + "step": 444 + }, + { + "epoch": 0.08131566925536775, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 6.808122850410461e-06, + "logits/chosen": -116383590.4, + "logits/rejected": -51200430.54545455, + "logps/chosen": -300.901416015625, + "logps/rejected": -399.45028409090907, + "loss": 0.0304, + "rewards/chosen": 0.09810729622840882, + "rewards/margins": 12.879274066469886, + "rewards/rejected": -12.781166770241477, + "step": 445 + }, + { + "epoch": 0.08149840109639105, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 6.7918397477265e-06, + "logits/chosen": -65449585.777777776, + "logits/rejected": -49058537.14285714, + "logps/chosen": -224.50633409288196, + "logps/rejected": -335.9564732142857, + "loss": 0.0268, + "rewards/chosen": 2.1313273111979165, + "rewards/margins": 11.920497712634859, + "rewards/rejected": -9.789170401436943, + "step": 446 + }, + { + "epoch": 0.08168113293741434, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 6.775534812040686e-06, + "logits/chosen": -42536992.0, + "logits/rejected": -33890560.0, + "logps/chosen": -239.09285481770834, + "logps/rejected": -402.8487025669643, + "loss": 0.0289, + "rewards/chosen": 1.4890576468573675, + "rewards/margins": 10.519198114909823, + "rewards/rejected": -9.030140468052455, + "step": 447 + }, + { + "epoch": 0.08186386477843764, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 6.759208242023509e-06, + "logits/chosen": -46706618.18181818, + "logits/rejected": -60723635.2, + "logps/chosen": -165.12448952414772, + "logps/rejected": -389.942041015625, + "loss": 0.0523, + "rewards/chosen": 0.8235917524857954, + "rewards/margins": 12.51184248490767, + "rewards/rejected": -11.688250732421874, + "step": 448 + }, + { + "epoch": 0.08204659661946094, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 6.7428602366090764e-06, + "logits/chosen": -111052501.33333333, + "logits/rejected": -69881636.57142857, + "logps/chosen": -216.83097330729166, + "logps/rejected": -390.2393275669643, + "loss": 0.0473, + "rewards/chosen": 0.9194290373060439, + "rewards/margins": 12.420836115640308, + "rewards/rejected": -11.501407078334264, + "step": 449 + }, + { + "epoch": 0.08222932846048424, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 6.7264909949926735e-06, + "logits/chosen": -70971187.2, + "logits/rejected": -53845602.90909091, + "logps/chosen": -175.9670654296875, + "logps/rejected": -458.18039772727275, + "loss": 0.0268, + "rewards/chosen": 0.6739228248596192, + "rewards/margins": 12.633018172870983, + "rewards/rejected": -11.959095348011363, + "step": 450 + }, + { + "epoch": 0.08241206030150754, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 6.710100716628345e-06, + "logits/chosen": -58721024.0, + "logits/rejected": -64982464.0, + "logps/chosen": -253.79386393229166, + "logps/rejected": -472.623388671875, + "loss": 0.0372, + "rewards/chosen": 1.5976163546244304, + "rewards/margins": 13.930955282847085, + "rewards/rejected": -12.333338928222656, + "step": 451 + }, + { + "epoch": 0.08259479214253083, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 6.693689601226458e-06, + "logits/chosen": -121780169.14285715, + "logits/rejected": -65310613.333333336, + "logps/chosen": -294.37332589285717, + "logps/rejected": -482.57948133680554, + "loss": 0.0458, + "rewards/chosen": -0.06915044784545898, + "rewards/margins": 9.19898377524482, + "rewards/rejected": -9.268134223090279, + "step": 452 + }, + { + "epoch": 0.08277752398355413, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 6.677257848751276e-06, + "logits/chosen": -91999826.28571428, + "logits/rejected": -37803267.55555555, + "logps/chosen": -117.14520263671875, + "logps/rejected": -464.9296875, + "loss": 0.029, + "rewards/chosen": 1.4489547184535436, + "rewards/margins": 14.250686251927936, + "rewards/rejected": -12.801731533474392, + "step": 453 + }, + { + "epoch": 0.08296025582457743, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 6.6608056594185166e-06, + "logits/chosen": -90888170.66666667, + "logits/rejected": -115491788.8, + "logps/chosen": -219.12394205729166, + "logps/rejected": -390.09345703125, + "loss": 0.0339, + "rewards/chosen": 0.22475838661193848, + "rewards/margins": 11.501380395889282, + "rewards/rejected": -11.276622009277343, + "step": 454 + }, + { + "epoch": 0.08314298766560073, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 6.644333233692917e-06, + "logits/chosen": -72508136.0, + "logits/rejected": -41456400.0, + "logps/chosen": -237.67758178710938, + "logps/rejected": -433.1468811035156, + "loss": 0.0316, + "rewards/chosen": 0.9090102910995483, + "rewards/margins": 12.171210646629333, + "rewards/rejected": -11.262200355529785, + "step": 455 + }, + { + "epoch": 0.08332571950662403, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 6.627840772285784e-06, + "logits/chosen": -136466144.0, + "logits/rejected": -112766480.0, + "logps/chosen": -252.1562042236328, + "logps/rejected": -400.4176940917969, + "loss": 0.0334, + "rewards/chosen": 1.8854119777679443, + "rewards/margins": 10.384665250778198, + "rewards/rejected": -8.499253273010254, + "step": 456 + }, + { + "epoch": 0.08350845134764733, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 6.611328476152557e-06, + "logits/chosen": -73926080.0, + "logits/rejected": -109235446.85714285, + "logps/chosen": -536.882568359375, + "logps/rejected": -574.2274344308036, + "loss": 0.0103, + "rewards/chosen": 0.41116636991500854, + "rewards/margins": 14.163848647049495, + "rewards/rejected": -13.752682277134486, + "step": 457 + }, + { + "epoch": 0.08369118318867062, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.594796546490351e-06, + "logits/chosen": -97215411.2, + "logits/rejected": -48058213.81818182, + "logps/chosen": -366.945703125, + "logps/rejected": -700.8164506392045, + "loss": 0.0267, + "rewards/chosen": 1.8663864135742188, + "rewards/margins": 18.72479872270064, + "rewards/rejected": -16.85841230912642, + "step": 458 + }, + { + "epoch": 0.08387391502969392, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 6.578245184735513e-06, + "logits/chosen": -64556180.0, + "logits/rejected": -57889392.0, + "logps/chosen": -243.52239990234375, + "logps/rejected": -495.9393310546875, + "loss": 0.0444, + "rewards/chosen": 0.32900506258010864, + "rewards/margins": 13.353229343891144, + "rewards/rejected": -13.024224281311035, + "step": 459 + }, + { + "epoch": 0.08405664687071722, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.561674592561164e-06, + "logits/chosen": -63764107.63636363, + "logits/rejected": -9768249.6, + "logps/chosen": -217.5941716974432, + "logps/rejected": -440.10830078125, + "loss": 0.0442, + "rewards/chosen": 1.4734771034934304, + "rewards/margins": 14.706970145485617, + "rewards/rejected": -13.233493041992187, + "step": 460 + }, + { + "epoch": 0.08423937871174052, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 6.545084971874738e-06, + "logits/chosen": -66493624.88888889, + "logits/rejected": -27884786.285714287, + "logps/chosen": -152.44681803385416, + "logps/rejected": -376.5940638950893, + "loss": 0.0535, + "rewards/chosen": 0.17081386513180202, + "rewards/margins": 9.545273268033588, + "rewards/rejected": -9.374459402901786, + "step": 461 + }, + { + "epoch": 0.08442211055276382, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 6.5284765248155295e-06, + "logits/chosen": -109239890.28571428, + "logits/rejected": -80678456.8888889, + "logps/chosen": -220.74082728794642, + "logps/rejected": -533.6919487847222, + "loss": 0.0267, + "rewards/chosen": 1.1662725721086775, + "rewards/margins": 12.567150630648175, + "rewards/rejected": -11.400878058539497, + "step": 462 + }, + { + "epoch": 0.08460484239378711, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.5118494537522235e-06, + "logits/chosen": -65470088.0, + "logits/rejected": -8489342.0, + "logps/chosen": -201.96316528320312, + "logps/rejected": -365.789794921875, + "loss": 0.0342, + "rewards/chosen": 1.1026279926300049, + "rewards/margins": 10.773208856582642, + "rewards/rejected": -9.670580863952637, + "step": 463 + }, + { + "epoch": 0.08478757423481041, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 6.495203961280434e-06, + "logits/chosen": -155965966.2222222, + "logits/rejected": 12655750.857142856, + "logps/chosen": -181.94908311631946, + "logps/rejected": -531.3296595982143, + "loss": 0.0378, + "rewards/chosen": 0.9090642929077148, + "rewards/margins": 14.510645321437291, + "rewards/rejected": -13.601581028529576, + "step": 464 + }, + { + "epoch": 0.08497030607583371, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 6.4785402502202345e-06, + "logits/chosen": -78366101.33333333, + "logits/rejected": -61514968.615384616, + "logps/chosen": -138.53300984700522, + "logps/rejected": -389.3163311298077, + "loss": 0.0125, + "rewards/chosen": 3.2460269927978516, + "rewards/margins": 14.835120714627779, + "rewards/rejected": -11.589093721829927, + "step": 465 + }, + { + "epoch": 0.08515303791685701, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 6.461858523613684e-06, + "logits/chosen": -49641596.8, + "logits/rejected": -52575697.45454545, + "logps/chosen": -204.39515380859376, + "logps/rejected": -444.3943536931818, + "loss": 0.0143, + "rewards/chosen": 1.571202850341797, + "rewards/margins": 12.994618988037109, + "rewards/rejected": -11.423416137695312, + "step": 466 + }, + { + "epoch": 0.0853357697578803, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 6.445158984722358e-06, + "logits/chosen": -116695754.66666667, + "logits/rejected": -29483372.0, + "logps/chosen": -201.71036783854166, + "logps/rejected": -604.970703125, + "loss": 0.0319, + "rewards/chosen": 2.1157089869181314, + "rewards/margins": 19.503745714823406, + "rewards/rejected": -17.388036727905273, + "step": 467 + }, + { + "epoch": 0.0855185015989036, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.428441837024868e-06, + "logits/chosen": -97586973.0909091, + "logits/rejected": -219337472.0, + "logps/chosen": -240.6651278409091, + "logps/rejected": -414.73505859375, + "loss": 0.0403, + "rewards/chosen": 1.347853573885831, + "rewards/margins": 11.015709408846767, + "rewards/rejected": -9.667855834960937, + "step": 468 + }, + { + "epoch": 0.0857012334399269, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 6.411707284214384e-06, + "logits/chosen": -117540464.0, + "logits/rejected": -145165072.0, + "logps/chosen": -245.08030700683594, + "logps/rejected": -544.5328979492188, + "loss": 0.0543, + "rewards/chosen": -0.2870769500732422, + "rewards/margins": 11.975292205810547, + "rewards/rejected": -12.262369155883789, + "step": 469 + }, + { + "epoch": 0.0858839652809502, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 6.3949555301961474e-06, + "logits/chosen": -131269990.4, + "logits/rejected": -95735648.0, + "logps/chosen": -181.191748046875, + "logps/rejected": -412.1879475911458, + "loss": 0.0246, + "rewards/chosen": 2.213155746459961, + "rewards/margins": 13.257170995076498, + "rewards/rejected": -11.044015248616537, + "step": 470 + }, + { + "epoch": 0.0860666971219735, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 6.378186779084996e-06, + "logits/chosen": -117784112.0, + "logits/rejected": -167869936.0, + "logps/chosen": -258.26287841796875, + "logps/rejected": -320.37078857421875, + "loss": 0.0433, + "rewards/chosen": 0.5608336925506592, + "rewards/margins": 8.703881025314331, + "rewards/rejected": -8.143047332763672, + "step": 471 + }, + { + "epoch": 0.0862494289629968, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 6.361401235202872e-06, + "logits/chosen": -129984000.0, + "logits/rejected": -94317619.2, + "logps/chosen": -231.0477627840909, + "logps/rejected": -698.38701171875, + "loss": 0.0619, + "rewards/chosen": 0.3048063841733066, + "rewards/margins": 17.450620898333465, + "rewards/rejected": -17.145814514160158, + "step": 472 + }, + { + "epoch": 0.0864321608040201, + "grad_norm": 262.0, + "kl": 0.0, + "learning_rate": 6.344599103076329e-06, + "logits/chosen": -137572714.66666666, + "logits/rejected": -22591606.4, + "logps/chosen": -233.6431681315104, + "logps/rejected": -577.17412109375, + "loss": 0.0441, + "rewards/chosen": 1.8375817934672039, + "rewards/margins": 14.738485113779703, + "rewards/rejected": -12.9009033203125, + "step": 473 + }, + { + "epoch": 0.08661489264504339, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 6.327780587434045e-06, + "logits/chosen": -167606353.45454547, + "logits/rejected": -124684224.0, + "logps/chosen": -180.2293146306818, + "logps/rejected": -378.1886474609375, + "loss": 0.0214, + "rewards/chosen": 2.849421587857333, + "rewards/margins": 11.667555132779206, + "rewards/rejected": -8.818133544921874, + "step": 474 + }, + { + "epoch": 0.08679762448606669, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.310945893204324e-06, + "logits/chosen": -107780503.27272727, + "logits/rejected": -64517420.8, + "logps/chosen": -218.8089932528409, + "logps/rejected": -519.77119140625, + "loss": 0.0463, + "rewards/chosen": 1.1985158053311435, + "rewards/margins": 10.606473263827237, + "rewards/rejected": -9.407957458496094, + "step": 475 + }, + { + "epoch": 0.08698035632708999, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 6.294095225512604e-06, + "logits/chosen": -144405445.8181818, + "logits/rejected": -100824019.2, + "logps/chosen": -234.33802379261363, + "logps/rejected": -465.03603515625, + "loss": 0.0449, + "rewards/chosen": 1.4153223904696377, + "rewards/margins": 12.639979067715732, + "rewards/rejected": -11.224656677246093, + "step": 476 + }, + { + "epoch": 0.08716308816811329, + "grad_norm": 8.4375, + "kl": 0.030710220336914062, + "learning_rate": 6.277228789678953e-06, + "logits/chosen": -94396242.28571428, + "logits/rejected": -62854684.44444445, + "logps/chosen": -234.03102329799108, + "logps/rejected": -436.6261393229167, + "loss": 0.035, + "rewards/chosen": 1.4480861936296736, + "rewards/margins": 11.242131535969083, + "rewards/rejected": -9.794045342339409, + "step": 477 + }, + { + "epoch": 0.0873458200091366, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 6.26034679121557e-06, + "logits/chosen": -111234432.0, + "logits/rejected": -127663246.22222222, + "logps/chosen": -209.97914341517858, + "logps/rejected": -416.71728515625, + "loss": 0.0369, + "rewards/chosen": 1.3267830439976283, + "rewards/margins": 12.71919077918643, + "rewards/rejected": -11.392407735188803, + "step": 478 + }, + { + "epoch": 0.0875285518501599, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 6.243449435824276e-06, + "logits/chosen": -135810744.8888889, + "logits/rejected": -79283602.28571428, + "logps/chosen": -236.2294921875, + "logps/rejected": -336.3936244419643, + "loss": 0.0293, + "rewards/chosen": 1.5298450258043077, + "rewards/margins": 12.499631533547053, + "rewards/rejected": -10.969786507742745, + "step": 479 + }, + { + "epoch": 0.0877112836911832, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 6.2265369293940135e-06, + "logits/chosen": -74847075.55555555, + "logits/rejected": -47666756.571428575, + "logps/chosen": -254.71728515625, + "logps/rejected": -362.182373046875, + "loss": 0.0386, + "rewards/chosen": 1.336128870646159, + "rewards/margins": 11.526128042311894, + "rewards/rejected": -10.189999171665736, + "step": 480 + }, + { + "epoch": 0.0878940155322065, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 6.209609477998339e-06, + "logits/chosen": -103906856.0, + "logits/rejected": -58691397.333333336, + "logps/chosen": -244.86416625976562, + "logps/rejected": -436.6634521484375, + "loss": 0.022, + "rewards/chosen": 0.4625961184501648, + "rewards/margins": 11.63639118274053, + "rewards/rejected": -11.173795064290365, + "step": 481 + }, + { + "epoch": 0.08807674737322979, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 6.192667287892905e-06, + "logits/chosen": -59247904.0, + "logits/rejected": -156630816.0, + "logps/chosen": -157.88043212890625, + "logps/rejected": -458.3288167317708, + "loss": 0.0393, + "rewards/chosen": 2.064430809020996, + "rewards/margins": 12.484199460347494, + "rewards/rejected": -10.419768651326498, + "step": 482 + }, + { + "epoch": 0.08825947921425309, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 6.17571056551295e-06, + "logits/chosen": -111339236.57142857, + "logits/rejected": -197118250.66666666, + "logps/chosen": -166.48311941964286, + "logps/rejected": -305.7463650173611, + "loss": 0.0239, + "rewards/chosen": 1.2968766348702567, + "rewards/margins": 12.23040511116149, + "rewards/rejected": -10.933528476291233, + "step": 483 + }, + { + "epoch": 0.08844221105527639, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 6.158739517470786e-06, + "logits/chosen": -114743445.33333333, + "logits/rejected": -88934668.8, + "logps/chosen": -187.0727335611979, + "logps/rejected": -523.64609375, + "loss": 0.0164, + "rewards/chosen": 2.3474629720052085, + "rewards/margins": 15.966930135091147, + "rewards/rejected": -13.619467163085938, + "step": 484 + }, + { + "epoch": 0.08862494289629969, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 6.141754350553279e-06, + "logits/chosen": -135875792.0, + "logits/rejected": -79350948.57142857, + "logps/chosen": -230.87998962402344, + "logps/rejected": -333.33119419642856, + "loss": 0.0176, + "rewards/chosen": 0.7837433218955994, + "rewards/margins": 9.731966980866023, + "rewards/rejected": -8.948223658970424, + "step": 485 + }, + { + "epoch": 0.08880767473732298, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 6.124755271719326e-06, + "logits/chosen": -84945068.8, + "logits/rejected": -86438016.0, + "logps/chosen": -184.5691162109375, + "logps/rejected": -437.345458984375, + "loss": 0.0413, + "rewards/chosen": 0.8101354598999023, + "rewards/margins": 14.873117383321127, + "rewards/rejected": -14.062981923421225, + "step": 486 + }, + { + "epoch": 0.08899040657834628, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 6.107742488097338e-06, + "logits/chosen": -130933432.8888889, + "logits/rejected": -66848100.571428575, + "logps/chosen": -227.26277669270834, + "logps/rejected": -658.072265625, + "loss": 0.0438, + "rewards/chosen": 0.9566428926255968, + "rewards/margins": 21.119260167318675, + "rewards/rejected": -20.16261727469308, + "step": 487 + }, + { + "epoch": 0.08917313841936958, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.090716206982714e-06, + "logits/chosen": -67758153.14285715, + "logits/rejected": -179237717.33333334, + "logps/chosen": -240.92328752790178, + "logps/rejected": -479.00526258680554, + "loss": 0.0392, + "rewards/chosen": 0.8936920166015625, + "rewards/margins": 15.211766560872396, + "rewards/rejected": -14.318074544270834, + "step": 488 + }, + { + "epoch": 0.08935587026039288, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 6.073676635835317e-06, + "logits/chosen": -83322873.6, + "logits/rejected": -102732096.0, + "logps/chosen": -176.73974609375, + "logps/rejected": -497.773681640625, + "loss": 0.019, + "rewards/chosen": 2.8394906997680662, + "rewards/margins": 17.238691139221192, + "rewards/rejected": -14.399200439453125, + "step": 489 + }, + { + "epoch": 0.08953860210141618, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 6.056623982276945e-06, + "logits/chosen": -87973576.0, + "logits/rejected": -59173360.0, + "logps/chosen": -180.7642822265625, + "logps/rejected": -522.8771565755209, + "loss": 0.0196, + "rewards/chosen": 1.4392677545547485, + "rewards/margins": 12.120256066322327, + "rewards/rejected": -10.680988311767578, + "step": 490 + }, + { + "epoch": 0.08972133394243947, + "grad_norm": 11.5, + "kl": 0.6619329452514648, + "learning_rate": 6.039558454088796e-06, + "logits/chosen": -135683374.54545453, + "logits/rejected": -73185964.8, + "logps/chosen": -238.29878373579547, + "logps/rejected": -573.8919921875, + "loss": 0.0505, + "rewards/chosen": 1.251308267766779, + "rewards/margins": 16.375426309758964, + "rewards/rejected": -15.124118041992187, + "step": 491 + }, + { + "epoch": 0.08990406578346277, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 6.022480259208951e-06, + "logits/chosen": -123009123.55555555, + "logits/rejected": -113352484.57142857, + "logps/chosen": -253.86496310763889, + "logps/rejected": -310.7079380580357, + "loss": 0.0408, + "rewards/chosen": 1.300013330247667, + "rewards/margins": 10.694529321458605, + "rewards/rejected": -9.394515991210938, + "step": 492 + }, + { + "epoch": 0.09008679762448607, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 6.005389605729824e-06, + "logits/chosen": -113006336.0, + "logits/rejected": -29933588.57142857, + "logps/chosen": -194.8389892578125, + "logps/rejected": -520.6839425223214, + "loss": 0.0322, + "rewards/chosen": 1.3806390762329102, + "rewards/margins": 14.273163795471191, + "rewards/rejected": -12.892524719238281, + "step": 493 + }, + { + "epoch": 0.09026952946550937, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5.988286701895631e-06, + "logits/chosen": -124094882.9090909, + "logits/rejected": -43866732.8, + "logps/chosen": -256.41581587357956, + "logps/rejected": -411.510546875, + "loss": 0.0682, + "rewards/chosen": 0.16456367752768777, + "rewards/margins": 9.693237993933938, + "rewards/rejected": -9.52867431640625, + "step": 494 + }, + { + "epoch": 0.09045226130653267, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5.97117175609986e-06, + "logits/chosen": -53656240.0, + "logits/rejected": -55950707.2, + "logps/chosen": -225.2103068033854, + "logps/rejected": -541.93115234375, + "loss": 0.025, + "rewards/chosen": 0.7654825846354166, + "rewards/margins": 12.89832026163737, + "rewards/rejected": -12.132837677001953, + "step": 495 + }, + { + "epoch": 0.09063499314755596, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5.954044976882725e-06, + "logits/chosen": -56931496.0, + "logits/rejected": -46079840.0, + "logps/chosen": -144.89122009277344, + "logps/rejected": -413.2593688964844, + "loss": 0.0198, + "rewards/chosen": 2.079800605773926, + "rewards/margins": 16.14587688446045, + "rewards/rejected": -14.066076278686523, + "step": 496 + }, + { + "epoch": 0.09081772498857926, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5.936906572928625e-06, + "logits/chosen": -47101330.28571428, + "logits/rejected": -77126385.77777778, + "logps/chosen": -275.6321498325893, + "logps/rejected": -513.5877278645834, + "loss": 0.0294, + "rewards/chosen": 1.1921062469482422, + "rewards/margins": 13.138174904717339, + "rewards/rejected": -11.946068657769096, + "step": 497 + }, + { + "epoch": 0.09100045682960256, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.919756753063601e-06, + "logits/chosen": -65495208.0, + "logits/rejected": -86441136.0, + "logps/chosen": -258.7713623046875, + "logps/rejected": -479.7276611328125, + "loss": 0.0343, + "rewards/chosen": 0.8633486032485962, + "rewards/margins": 13.263424515724182, + "rewards/rejected": -12.400075912475586, + "step": 498 + }, + { + "epoch": 0.09118318867062586, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.902595726252801e-06, + "logits/chosen": -95500416.0, + "logits/rejected": -69342160.0, + "logps/chosen": -200.61007690429688, + "logps/rejected": -393.5708923339844, + "loss": 0.0445, + "rewards/chosen": 0.31101322174072266, + "rewards/margins": 11.23719596862793, + "rewards/rejected": -10.926182746887207, + "step": 499 + }, + { + "epoch": 0.09136592051164916, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5.885423701597918e-06, + "logits/chosen": -74354738.28571428, + "logits/rejected": -72405596.44444445, + "logps/chosen": -176.61251395089286, + "logps/rejected": -536.3819444444445, + "loss": 0.0163, + "rewards/chosen": 2.1806996209280833, + "rewards/margins": 16.946524680606903, + "rewards/rejected": -14.76582505967882, + "step": 500 + }, + { + "epoch": 0.09154865235267245, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.8682408883346535e-06, + "logits/chosen": -68254341.33333333, + "logits/rejected": 12131160.0, + "logps/chosen": -275.6013590494792, + "logps/rejected": -527.4082641601562, + "loss": 0.07, + "rewards/chosen": 0.34793253739674884, + "rewards/margins": 13.941206653912863, + "rewards/rejected": -13.593274116516113, + "step": 501 + }, + { + "epoch": 0.09173138419369575, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.851047495830163e-06, + "logits/chosen": -108363016.0, + "logits/rejected": -49274676.0, + "logps/chosen": -261.11126708984375, + "logps/rejected": -382.6251220703125, + "loss": 0.034, + "rewards/chosen": 1.6630139350891113, + "rewards/margins": 14.280150890350342, + "rewards/rejected": -12.61713695526123, + "step": 502 + }, + { + "epoch": 0.09191411603471905, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5.8338437335805124e-06, + "logits/chosen": -92348900.57142857, + "logits/rejected": -82003747.55555555, + "logps/chosen": -237.20057896205358, + "logps/rejected": -487.4143337673611, + "loss": 0.0252, + "rewards/chosen": 1.013235092163086, + "rewards/margins": 12.888157102796766, + "rewards/rejected": -11.87492201063368, + "step": 503 + }, + { + "epoch": 0.09209684787574235, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.816629811208112e-06, + "logits/chosen": -79072682.66666667, + "logits/rejected": -97919890.28571428, + "logps/chosen": -240.68755425347223, + "logps/rejected": -383.32920619419644, + "loss": 0.04, + "rewards/chosen": 1.2835330963134766, + "rewards/margins": 13.19430296761649, + "rewards/rejected": -11.910769871303014, + "step": 504 + }, + { + "epoch": 0.09227957971676565, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5.799405938459175e-06, + "logits/chosen": -63852202.666666664, + "logits/rejected": -74910899.2, + "logps/chosen": -225.2589314778646, + "logps/rejected": -518.809375, + "loss": 0.0331, + "rewards/chosen": 0.9845474561055502, + "rewards/margins": 14.647081025441489, + "rewards/rejected": -13.662533569335938, + "step": 505 + }, + { + "epoch": 0.09246231155778895, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5.782172325201155e-06, + "logits/chosen": -55439512.0, + "logits/rejected": -56279224.0, + "logps/chosen": -194.9553680419922, + "logps/rejected": -435.8100891113281, + "loss": 0.0451, + "rewards/chosen": 0.24342840909957886, + "rewards/margins": 14.375364482402802, + "rewards/rejected": -14.131936073303223, + "step": 506 + }, + { + "epoch": 0.09264504339881224, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 5.764929181420191e-06, + "logits/chosen": -112011026.28571428, + "logits/rejected": -81655836.44444445, + "logps/chosen": -194.35771833147322, + "logps/rejected": -335.09393988715277, + "loss": 0.056, + "rewards/chosen": 2.0045623779296875, + "rewards/margins": 12.262612236870659, + "rewards/rejected": -10.258049858940971, + "step": 507 + }, + { + "epoch": 0.09282777523983554, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.747676717218549e-06, + "logits/chosen": -83177969.77777778, + "logits/rejected": -84892379.42857143, + "logps/chosen": -179.91897243923611, + "logps/rejected": -402.72520228794644, + "loss": 0.0471, + "rewards/chosen": 0.9441454145643446, + "rewards/margins": 12.394151082114568, + "rewards/rejected": -11.450005667550224, + "step": 508 + }, + { + "epoch": 0.09301050708085884, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5.730415142812059e-06, + "logits/chosen": -36543339.428571425, + "logits/rejected": -36700487.11111111, + "logps/chosen": -304.4925013950893, + "logps/rejected": -535.6584201388889, + "loss": 0.0305, + "rewards/chosen": 1.087625367300851, + "rewards/margins": 16.34367293403262, + "rewards/rejected": -15.256047566731771, + "step": 509 + }, + { + "epoch": 0.09319323892188214, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5.7131446685275595e-06, + "logits/chosen": -141815011.55555555, + "logits/rejected": -110428891.42857143, + "logps/chosen": -169.74549696180554, + "logps/rejected": -515.0768694196429, + "loss": 0.0314, + "rewards/chosen": 1.5897374682956271, + "rewards/margins": 14.06522640349373, + "rewards/rejected": -12.475488935198102, + "step": 510 + }, + { + "epoch": 0.09337597076290544, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5.695865504800328e-06, + "logits/chosen": -160806826.66666666, + "logits/rejected": -140749680.0, + "logps/chosen": -255.79376220703125, + "logps/rejected": -429.4295959472656, + "loss": 0.065, + "rewards/chosen": 0.5334449609120687, + "rewards/margins": 13.523637374242147, + "rewards/rejected": -12.990192413330078, + "step": 511 + }, + { + "epoch": 0.09355870260392873, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.678577862171523e-06, + "logits/chosen": -74945294.22222222, + "logits/rejected": -306604489.14285713, + "logps/chosen": -299.2386881510417, + "logps/rejected": -342.9413364955357, + "loss": 0.0582, + "rewards/chosen": 0.11556602848900689, + "rewards/margins": 9.909336954828293, + "rewards/rejected": -9.793770926339286, + "step": 512 + }, + { + "epoch": 0.09374143444495203, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5.661281951285613e-06, + "logits/chosen": -104433510.4, + "logits/rejected": -98532842.66666667, + "logps/chosen": -170.28170166015624, + "logps/rejected": -365.7349446614583, + "loss": 0.0321, + "rewards/chosen": 1.753192138671875, + "rewards/margins": 12.451614888509114, + "rewards/rejected": -10.69842274983724, + "step": 513 + }, + { + "epoch": 0.09392416628597533, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5.643977982887815e-06, + "logits/chosen": -89742765.71428572, + "logits/rejected": -124222663.1111111, + "logps/chosen": -203.23460170200892, + "logps/rejected": -451.3940700954861, + "loss": 0.039, + "rewards/chosen": 0.2863396235874721, + "rewards/margins": 13.432634111434693, + "rewards/rejected": -13.146294487847221, + "step": 514 + }, + { + "epoch": 0.09410689812699863, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5.626666167821522e-06, + "logits/chosen": -159062330.1818182, + "logits/rejected": -89081376.0, + "logps/chosen": -213.1622869318182, + "logps/rejected": -530.221630859375, + "loss": 0.0267, + "rewards/chosen": 2.4969988736239346, + "rewards/margins": 14.82419003573331, + "rewards/rejected": -12.327191162109376, + "step": 515 + }, + { + "epoch": 0.09428962996802193, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5.609346717025738e-06, + "logits/chosen": -143493301.33333334, + "logits/rejected": -52756208.0, + "logps/chosen": -250.68986002604166, + "logps/rejected": -448.8392578125, + "loss": 0.0234, + "rewards/chosen": 1.2984978357950847, + "rewards/margins": 13.601881472269694, + "rewards/rejected": -12.30338363647461, + "step": 516 + }, + { + "epoch": 0.09447236180904522, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5.592019841532507e-06, + "logits/chosen": -193003296.0, + "logits/rejected": -28340312.0, + "logps/chosen": -241.61256408691406, + "logps/rejected": -706.8320922851562, + "loss": 0.0626, + "rewards/chosen": 0.9965553879737854, + "rewards/margins": 17.30060964822769, + "rewards/rejected": -16.304054260253906, + "step": 517 + }, + { + "epoch": 0.09465509365006852, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.5746857524643335e-06, + "logits/chosen": -76018403.55555555, + "logits/rejected": -134534217.14285713, + "logps/chosen": -197.67955186631946, + "logps/rejected": -463.12130301339283, + "loss": 0.0475, + "rewards/chosen": 0.36953216128879124, + "rewards/margins": 13.20777373843723, + "rewards/rejected": -12.838241577148438, + "step": 518 + }, + { + "epoch": 0.09483782549109182, + "grad_norm": 9.6875, + "kl": 0.6233978271484375, + "learning_rate": 5.557344661031628e-06, + "logits/chosen": -66896384.0, + "logits/rejected": -54089196.8, + "logps/chosen": -249.16459517045453, + "logps/rejected": -591.01455078125, + "loss": 0.0472, + "rewards/chosen": 1.4162032387473367, + "rewards/margins": 13.784152152321555, + "rewards/rejected": -12.367948913574219, + "step": 519 + }, + { + "epoch": 0.09502055733211512, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5.539996778530114e-06, + "logits/chosen": -89528808.72727273, + "logits/rejected": -160771660.8, + "logps/chosen": -277.30442116477275, + "logps/rejected": -325.3177490234375, + "loss": 0.0419, + "rewards/chosen": 1.009166804226962, + "rewards/margins": 12.123741236600008, + "rewards/rejected": -11.114574432373047, + "step": 520 + }, + { + "epoch": 0.09520328917313842, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.522642316338268e-06, + "logits/chosen": -105620178.28571428, + "logits/rejected": -102857187.55555555, + "logps/chosen": -176.75045340401786, + "logps/rejected": -464.3083224826389, + "loss": 0.0339, + "rewards/chosen": 1.0218662534441267, + "rewards/margins": 11.256201411050464, + "rewards/rejected": -10.234335157606337, + "step": 521 + }, + { + "epoch": 0.09538602101416171, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5.505281485914732e-06, + "logits/chosen": -96576184.0, + "logits/rejected": -123922192.0, + "logps/chosen": -254.70523071289062, + "logps/rejected": -436.0421142578125, + "loss": 0.0345, + "rewards/chosen": 0.8324825763702393, + "rewards/margins": 12.302973985671997, + "rewards/rejected": -11.470491409301758, + "step": 522 + }, + { + "epoch": 0.09556875285518501, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.487914498795748e-06, + "logits/chosen": -117593062.4, + "logits/rejected": -102936725.33333333, + "logps/chosen": -199.78427734375, + "logps/rejected": -289.94114176432294, + "loss": 0.0412, + "rewards/chosen": 1.1636048316955567, + "rewards/margins": 12.994354343414306, + "rewards/rejected": -11.83074951171875, + "step": 523 + }, + { + "epoch": 0.09575148469620831, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.470541566592573e-06, + "logits/chosen": -63985628.44444445, + "logits/rejected": -107145645.71428572, + "logps/chosen": -276.11314561631946, + "logps/rejected": -478.8643275669643, + "loss": 0.0471, + "rewards/chosen": 0.8025285402933756, + "rewards/margins": 9.49316476640247, + "rewards/rejected": -8.690636226109095, + "step": 524 + }, + { + "epoch": 0.09593421653723161, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5.453162900988902e-06, + "logits/chosen": -101452974.54545455, + "logits/rejected": -34976876.8, + "logps/chosen": -197.6070223721591, + "logps/rejected": -513.8708984375, + "loss": 0.0245, + "rewards/chosen": 2.667246385054155, + "rewards/margins": 14.771863694624466, + "rewards/rejected": -12.104617309570312, + "step": 525 + }, + { + "epoch": 0.09611694837825491, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5.435778713738292e-06, + "logits/chosen": -100512780.8, + "logits/rejected": -67729210.18181819, + "logps/chosen": -268.8600830078125, + "logps/rejected": -470.89106889204544, + "loss": 0.0377, + "rewards/chosen": 0.19073853492736817, + "rewards/margins": 9.806319007006557, + "rewards/rejected": -9.61558047207919, + "step": 526 + }, + { + "epoch": 0.0962996802192782, + "grad_norm": 11.5, + "kl": 0.2920207977294922, + "learning_rate": 5.41838921666158e-06, + "logits/chosen": -109994060.8, + "logits/rejected": -89187610.66666667, + "logps/chosen": -238.813818359375, + "logps/rejected": -587.4364827473959, + "loss": 0.0455, + "rewards/chosen": 1.297788715362549, + "rewards/margins": 15.09824317296346, + "rewards/rejected": -13.800454457600912, + "step": 527 + }, + { + "epoch": 0.0964824120603015, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.400994621644294e-06, + "logits/chosen": -104910512.0, + "logits/rejected": -97180520.0, + "logps/chosen": -229.7603759765625, + "logps/rejected": -445.06939697265625, + "loss": 0.0345, + "rewards/chosen": 0.934306263923645, + "rewards/margins": 11.799454808235168, + "rewards/rejected": -10.865148544311523, + "step": 528 + }, + { + "epoch": 0.0966651439013248, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5.383595140634093e-06, + "logits/chosen": -127038630.4, + "logits/rejected": -70982306.9090909, + "logps/chosen": -314.7491455078125, + "logps/rejected": -593.3982599431819, + "loss": 0.0227, + "rewards/chosen": 1.1556243896484375, + "rewards/margins": 14.903500643643467, + "rewards/rejected": -13.74787625399503, + "step": 529 + }, + { + "epoch": 0.0968478757423481, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5.366190985638159e-06, + "logits/chosen": -158588608.0, + "logits/rejected": -43009811.2, + "logps/chosen": -217.24418131510416, + "logps/rejected": -451.05537109375, + "loss": 0.0302, + "rewards/chosen": 0.7887593905131022, + "rewards/margins": 11.214816824595133, + "rewards/rejected": -10.426057434082031, + "step": 530 + }, + { + "epoch": 0.0970306075833714, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.348782368720627e-06, + "logits/chosen": -81560102.4, + "logits/rejected": -44296469.333333336, + "logps/chosen": -264.1865478515625, + "logps/rejected": -536.5785725911459, + "loss": 0.0592, + "rewards/chosen": 0.3309658050537109, + "rewards/margins": 13.321015803019206, + "rewards/rejected": -12.990049997965494, + "step": 531 + }, + { + "epoch": 0.0972133394243947, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.3313695020000026e-06, + "logits/chosen": -132858022.4, + "logits/rejected": -128926944.0, + "logps/chosen": -254.8996337890625, + "logps/rejected": -329.8273111979167, + "loss": 0.0438, + "rewards/chosen": 1.1996644973754882, + "rewards/margins": 10.196302477518717, + "rewards/rejected": -8.996637980143229, + "step": 532 + }, + { + "epoch": 0.097396071265418, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5.3139525976465675e-06, + "logits/chosen": -119382491.42857143, + "logits/rejected": -97839815.1111111, + "logps/chosen": -255.99267578125, + "logps/rejected": -389.373779296875, + "loss": 0.022, + "rewards/chosen": 1.6967718941824776, + "rewards/margins": 11.109086415124318, + "rewards/rejected": -9.412314520941841, + "step": 533 + }, + { + "epoch": 0.09757880310644129, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.296531867879809e-06, + "logits/chosen": -95196010.66666667, + "logits/rejected": -51264438.85714286, + "logps/chosen": -282.1120334201389, + "logps/rejected": -484.20186941964283, + "loss": 0.0455, + "rewards/chosen": 0.8506585227118598, + "rewards/margins": 12.153227230859182, + "rewards/rejected": -11.302568708147321, + "step": 534 + }, + { + "epoch": 0.09776153494746459, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5.27910752496582e-06, + "logits/chosen": -111307699.2, + "logits/rejected": -56621067.63636363, + "logps/chosen": -233.1904296875, + "logps/rejected": -445.90087890625, + "loss": 0.0362, + "rewards/chosen": -0.2954352617263794, + "rewards/margins": 11.550229022719645, + "rewards/rejected": -11.845664284446023, + "step": 535 + }, + { + "epoch": 0.09794426678848789, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5.2616797812147205e-06, + "logits/chosen": -92279392.0, + "logits/rejected": -123401952.0, + "logps/chosen": -184.3284149169922, + "logps/rejected": -532.2845458984375, + "loss": 0.0341, + "rewards/chosen": 0.9495430588722229, + "rewards/margins": 12.284286558628082, + "rewards/rejected": -11.33474349975586, + "step": 536 + }, + { + "epoch": 0.09812699862951119, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.244248848978067e-06, + "logits/chosen": -135034656.0, + "logits/rejected": -29293064.0, + "logps/chosen": -215.97042846679688, + "logps/rejected": -549.4447021484375, + "loss": 0.036, + "rewards/chosen": 1.5778833627700806, + "rewards/margins": 13.566362977027893, + "rewards/rejected": -11.988479614257812, + "step": 537 + }, + { + "epoch": 0.09830973047053448, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5.226814940646268e-06, + "logits/chosen": -74781160.0, + "logits/rejected": -82000416.0, + "logps/chosen": -256.22869873046875, + "logps/rejected": -490.1705017089844, + "loss": 0.0198, + "rewards/chosen": 2.142057418823242, + "rewards/margins": 15.351713180541992, + "rewards/rejected": -13.20965576171875, + "step": 538 + }, + { + "epoch": 0.09849246231155778, + "grad_norm": 4.78125, + "kl": 0.29483985900878906, + "learning_rate": 5.209378268645998e-06, + "logits/chosen": -82554208.0, + "logits/rejected": -72284857.6, + "logps/chosen": -232.73091634114584, + "logps/rejected": -414.9681640625, + "loss": 0.0192, + "rewards/chosen": 1.5294515291849773, + "rewards/margins": 12.351662985483804, + "rewards/rejected": -10.822211456298827, + "step": 539 + }, + { + "epoch": 0.09867519415258108, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5.1919390454376e-06, + "logits/chosen": -78327856.0, + "logits/rejected": -74344088.0, + "logps/chosen": -233.28701782226562, + "logps/rejected": -581.3878173828125, + "loss": 0.018, + "rewards/chosen": 2.1544675827026367, + "rewards/margins": 16.260605812072754, + "rewards/rejected": -14.106138229370117, + "step": 540 + }, + { + "epoch": 0.09885792599360439, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5.174497483512506e-06, + "logits/chosen": -135106922.66666666, + "logits/rejected": -76833821.53846154, + "logps/chosen": -125.69283040364583, + "logps/rejected": -524.7745267427885, + "loss": 0.0084, + "rewards/chosen": 3.283914883931478, + "rewards/margins": 13.261064260433882, + "rewards/rejected": -9.977149376502403, + "step": 541 + }, + { + "epoch": 0.09904065783462769, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.157053795390642e-06, + "logits/chosen": -131867376.0, + "logits/rejected": -95743104.0, + "logps/chosen": -250.06874084472656, + "logps/rejected": -323.6097717285156, + "loss": 0.0612, + "rewards/chosen": 1.4483827352523804, + "rewards/margins": 9.804996848106384, + "rewards/rejected": -8.356614112854004, + "step": 542 + }, + { + "epoch": 0.09922338967565099, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5.139608193617846e-06, + "logits/chosen": -25781033.14285714, + "logits/rejected": -121946211.55555555, + "logps/chosen": -214.1656494140625, + "logps/rejected": -440.4914279513889, + "loss": 0.0294, + "rewards/chosen": 0.9232138225010463, + "rewards/margins": 14.096392601255387, + "rewards/rejected": -13.173178778754341, + "step": 543 + }, + { + "epoch": 0.09940612151667429, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5.1221608907632665e-06, + "logits/chosen": -92017664.0, + "logits/rejected": -59722040.88888889, + "logps/chosen": -145.94838169642858, + "logps/rejected": -463.14171006944446, + "loss": 0.0179, + "rewards/chosen": 2.3313353402273997, + "rewards/margins": 15.218180399092416, + "rewards/rejected": -12.886845058865017, + "step": 544 + }, + { + "epoch": 0.09958885335769758, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.1047120994167855e-06, + "logits/chosen": -168475117.7142857, + "logits/rejected": -133119943.1111111, + "logps/chosen": -257.80594308035717, + "logps/rejected": -461.9488932291667, + "loss": 0.0261, + "rewards/chosen": 1.1143796784537179, + "rewards/margins": 13.053158873603458, + "rewards/rejected": -11.93877919514974, + "step": 545 + }, + { + "epoch": 0.09977158519872088, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.087262032186418e-06, + "logits/chosen": -100942568.0, + "logits/rejected": -92323632.0, + "logps/chosen": -282.0610046386719, + "logps/rejected": -480.5455627441406, + "loss": 0.0331, + "rewards/chosen": 0.953506350517273, + "rewards/margins": 13.654258608818054, + "rewards/rejected": -12.700752258300781, + "step": 546 + }, + { + "epoch": 0.09995431703974418, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5.069810901695727e-06, + "logits/chosen": -145240629.33333334, + "logits/rejected": -114257267.2, + "logps/chosen": -317.2076009114583, + "logps/rejected": -504.740234375, + "loss": 0.0211, + "rewards/chosen": 1.4776398340861003, + "rewards/margins": 17.06297156016032, + "rewards/rejected": -15.58533172607422, + "step": 547 + }, + { + "epoch": 0.10013704888076748, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.05235892058123e-06, + "logits/chosen": -136971757.7142857, + "logits/rejected": -28448801.777777776, + "logps/chosen": -255.22938755580358, + "logps/rejected": -439.7749294704861, + "loss": 0.0345, + "rewards/chosen": 1.316413334437779, + "rewards/margins": 11.873955923413474, + "rewards/rejected": -10.557542588975695, + "step": 548 + }, + { + "epoch": 0.10031978072179078, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5.034906301489808e-06, + "logits/chosen": -84003008.0, + "logits/rejected": -92656992.0, + "logps/chosen": -265.70204671223956, + "logps/rejected": -500.35357666015625, + "loss": 0.0563, + "rewards/chosen": 0.7977945804595947, + "rewards/margins": 14.240299463272095, + "rewards/rejected": -13.4425048828125, + "step": 549 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 5.0174532570761194e-06, + "logits/chosen": -109420904.0, + "logits/rejected": -91224688.0, + "logps/chosen": -264.4207763671875, + "logps/rejected": -466.9914245605469, + "loss": 0.0322, + "rewards/chosen": 1.3249187469482422, + "rewards/margins": 13.795262336730957, + "rewards/rejected": -12.470343589782715, + "step": 550 + }, + { + "epoch": 0.10068524440383737, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -233165312.0, + "logits/rejected": -128568614.4, + "logps/chosen": -249.78466796875, + "logps/rejected": -393.9983154296875, + "loss": 0.0328, + "rewards/chosen": 0.845457394917806, + "rewards/margins": 12.383010800679525, + "rewards/rejected": -11.537553405761718, + "step": 551 + }, + { + "epoch": 0.10086797624486067, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 4.982546742923883e-06, + "logits/chosen": -72576908.8, + "logits/rejected": -60719162.666666664, + "logps/chosen": -187.75023193359374, + "logps/rejected": -336.755615234375, + "loss": 0.0438, + "rewards/chosen": 0.7176236629486084, + "rewards/margins": 11.108431768417358, + "rewards/rejected": -10.39080810546875, + "step": 552 + }, + { + "epoch": 0.10105070808588397, + "grad_norm": 9.25, + "kl": 0.8759679794311523, + "learning_rate": 4.965093698510192e-06, + "logits/chosen": -151811008.0, + "logits/rejected": -66613952.0, + "logps/chosen": -204.85870361328125, + "logps/rejected": -580.8685302734375, + "loss": 0.0291, + "rewards/chosen": 2.162749767303467, + "rewards/margins": 15.530417919158936, + "rewards/rejected": -13.367668151855469, + "step": 553 + }, + { + "epoch": 0.10123343992690727, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 4.9476410794187726e-06, + "logits/chosen": -39812504.0, + "logits/rejected": -29907168.0, + "logps/chosen": -230.15713500976562, + "logps/rejected": -549.0218505859375, + "loss": 0.0276, + "rewards/chosen": 1.5418076515197754, + "rewards/margins": 14.379297733306885, + "rewards/rejected": -12.83749008178711, + "step": 554 + }, + { + "epoch": 0.10141617176793057, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.9301890983042744e-06, + "logits/chosen": -107612492.8, + "logits/rejected": -199065088.0, + "logps/chosen": -199.1546630859375, + "logps/rejected": -318.7710774739583, + "loss": 0.0475, + "rewards/chosen": 0.5078530788421631, + "rewards/margins": 11.748259115219117, + "rewards/rejected": -11.240406036376953, + "step": 555 + }, + { + "epoch": 0.10159890360895386, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.9127379678135825e-06, + "logits/chosen": -130689664.0, + "logits/rejected": -63150716.0, + "logps/chosen": -222.0645751953125, + "logps/rejected": -612.7381591796875, + "loss": 0.029, + "rewards/chosen": 1.3263980150222778, + "rewards/margins": 12.935878872871399, + "rewards/rejected": -11.609480857849121, + "step": 556 + }, + { + "epoch": 0.10178163544997716, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.895287900583216e-06, + "logits/chosen": -118817704.0, + "logits/rejected": -63324400.0, + "logps/chosen": -176.75741577148438, + "logps/rejected": -421.8849182128906, + "loss": 0.0347, + "rewards/chosen": 1.6024408340454102, + "rewards/margins": 13.35823917388916, + "rewards/rejected": -11.75579833984375, + "step": 557 + }, + { + "epoch": 0.10196436729100046, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 4.877839109236735e-06, + "logits/chosen": -199544704.0, + "logits/rejected": -90215744.0, + "logps/chosen": -225.48643493652344, + "logps/rejected": -354.87786865234375, + "loss": 0.0383, + "rewards/chosen": 1.0476409196853638, + "rewards/margins": 10.591646313667297, + "rewards/rejected": -9.544005393981934, + "step": 558 + }, + { + "epoch": 0.10214709913202376, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 4.860391806382157e-06, + "logits/chosen": -88492506.66666667, + "logits/rejected": -43800632.0, + "logps/chosen": -219.256103515625, + "logps/rejected": -513.2205810546875, + "loss": 0.0537, + "rewards/chosen": 1.2678675651550293, + "rewards/margins": 15.120533466339111, + "rewards/rejected": -13.852665901184082, + "step": 559 + }, + { + "epoch": 0.10232983097304706, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 4.842946204609359e-06, + "logits/chosen": -85735296.0, + "logits/rejected": -121269045.33333333, + "logps/chosen": -248.852294921875, + "logps/rejected": -302.471435546875, + "loss": 0.0503, + "rewards/chosen": 0.8953987121582031, + "rewards/margins": 11.481696065266927, + "rewards/rejected": -10.586297353108725, + "step": 560 + }, + { + "epoch": 0.10251256281407035, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 4.825502516487497e-06, + "logits/chosen": -58414796.0, + "logits/rejected": -73176296.0, + "logps/chosen": -288.4113464355469, + "logps/rejected": -264.97735595703125, + "loss": 0.0495, + "rewards/chosen": 0.6223594546318054, + "rewards/margins": 9.36371248960495, + "rewards/rejected": -8.741353034973145, + "step": 561 + }, + { + "epoch": 0.10269529465509365, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 4.8080609545624004e-06, + "logits/chosen": -166534643.2, + "logits/rejected": -50946926.54545455, + "logps/chosen": -213.68984375, + "logps/rejected": -472.5320490056818, + "loss": 0.0221, + "rewards/chosen": 1.140099334716797, + "rewards/margins": 12.375001456520774, + "rewards/rejected": -11.234902121803977, + "step": 562 + }, + { + "epoch": 0.10287802649611695, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 4.7906217313540035e-06, + "logits/chosen": -115116202.66666667, + "logits/rejected": -51563369.6, + "logps/chosen": -246.7069295247396, + "logps/rejected": -386.7706787109375, + "loss": 0.0196, + "rewards/chosen": 1.496345043182373, + "rewards/margins": 11.028777599334717, + "rewards/rejected": -9.532432556152344, + "step": 563 + }, + { + "epoch": 0.10306075833714025, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 4.7731850593537316e-06, + "logits/chosen": -58490540.0, + "logits/rejected": -93615424.0, + "logps/chosen": -243.59776306152344, + "logps/rejected": -286.3746032714844, + "loss": 0.049, + "rewards/chosen": 0.055083077400922775, + "rewards/margins": 8.176897805184126, + "rewards/rejected": -8.121814727783203, + "step": 564 + }, + { + "epoch": 0.10324349017816355, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 4.755751151021934e-06, + "logits/chosen": -68335360.0, + "logits/rejected": -165574741.33333334, + "logps/chosen": -171.61155482700892, + "logps/rejected": -403.1628689236111, + "loss": 0.05, + "rewards/chosen": -0.32779410907200407, + "rewards/margins": 9.784205402646746, + "rewards/rejected": -10.11199951171875, + "step": 565 + }, + { + "epoch": 0.10342622201918684, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 4.738320218785281e-06, + "logits/chosen": -90687040.0, + "logits/rejected": -65002204.0, + "logps/chosen": -230.6260986328125, + "logps/rejected": -297.477783203125, + "loss": 0.0308, + "rewards/chosen": 1.369433879852295, + "rewards/margins": 8.616368770599365, + "rewards/rejected": -7.24693489074707, + "step": 566 + }, + { + "epoch": 0.10360895386021014, + "grad_norm": 35.25, + "kl": 0.0, + "learning_rate": 4.720892475034181e-06, + "logits/chosen": -99112808.0, + "logits/rejected": -96551824.0, + "logps/chosen": -236.05361938476562, + "logps/rejected": -443.584228515625, + "loss": 0.0534, + "rewards/chosen": 0.7906189560890198, + "rewards/margins": 11.398931562900543, + "rewards/rejected": -10.608312606811523, + "step": 567 + }, + { + "epoch": 0.10379168570123344, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 4.703468132120193e-06, + "logits/chosen": -96767641.6, + "logits/rejected": -18071898.666666668, + "logps/chosen": -194.8552490234375, + "logps/rejected": -422.7384847005208, + "loss": 0.0203, + "rewards/chosen": 2.0608949661254883, + "rewards/margins": 12.996829668680826, + "rewards/rejected": -10.935934702555338, + "step": 568 + }, + { + "epoch": 0.10397441754225674, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 4.686047402353433e-06, + "logits/chosen": -97747117.71428572, + "logits/rejected": -95797809.77777778, + "logps/chosen": -242.60246930803572, + "logps/rejected": -474.9734700520833, + "loss": 0.0297, + "rewards/chosen": 1.742699078151158, + "rewards/margins": 12.86925717005654, + "rewards/rejected": -11.126558091905382, + "step": 569 + }, + { + "epoch": 0.10415714938328004, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 4.668630498000001e-06, + "logits/chosen": -92568732.44444445, + "logits/rejected": -177317686.85714287, + "logps/chosen": -141.145263671875, + "logps/rejected": -598.8157087053571, + "loss": 0.0219, + "rewards/chosen": 1.8243579864501953, + "rewards/margins": 15.881159918648857, + "rewards/rejected": -14.056801932198661, + "step": 570 + }, + { + "epoch": 0.10433988122430334, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.651217631279374e-06, + "logits/chosen": -56339346.28571428, + "logits/rejected": -78410666.66666667, + "logps/chosen": -188.82939801897322, + "logps/rejected": -338.1260579427083, + "loss": 0.032, + "rewards/chosen": 0.6969100407191685, + "rewards/margins": 11.560812662518215, + "rewards/rejected": -10.863902621799046, + "step": 571 + }, + { + "epoch": 0.10452261306532663, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 4.6338090143618435e-06, + "logits/chosen": -85315863.27272727, + "logits/rejected": -100204704.0, + "logps/chosen": -235.75392844460228, + "logps/rejected": -392.27548828125, + "loss": 0.0599, + "rewards/chosen": 0.47313703190196643, + "rewards/margins": 12.29850629459728, + "rewards/rejected": -11.825369262695313, + "step": 572 + }, + { + "epoch": 0.10470534490634993, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 4.6164048593659076e-06, + "logits/chosen": -71538224.0, + "logits/rejected": -123918233.6, + "logps/chosen": -125.42738850911458, + "logps/rejected": -342.490185546875, + "loss": 0.0347, + "rewards/chosen": 0.19289310773213705, + "rewards/margins": 8.51545475323995, + "rewards/rejected": -8.322561645507813, + "step": 573 + }, + { + "epoch": 0.10488807674737323, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 4.5990053783557066e-06, + "logits/chosen": -113362978.9090909, + "logits/rejected": -12311895.2, + "logps/chosen": -171.7252752130682, + "logps/rejected": -809.25537109375, + "loss": 0.0239, + "rewards/chosen": 2.4437132748690518, + "rewards/margins": 25.45665272799405, + "rewards/rejected": -23.012939453125, + "step": 574 + }, + { + "epoch": 0.10507080858839653, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 4.581610783338424e-06, + "logits/chosen": -92524192.0, + "logits/rejected": -71813632.0, + "logps/chosen": -241.3214111328125, + "logps/rejected": -476.4684753417969, + "loss": 0.0374, + "rewards/chosen": 1.6442216237386067, + "rewards/margins": 16.982489903767902, + "rewards/rejected": -15.338268280029297, + "step": 575 + }, + { + "epoch": 0.10525354042941983, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 4.564221286261709e-06, + "logits/chosen": -97268873.14285715, + "logits/rejected": -103329905.77777778, + "logps/chosen": -165.47806222098214, + "logps/rejected": -535.0455186631945, + "loss": 0.0222, + "rewards/chosen": 2.2007180622645786, + "rewards/margins": 11.381516063024126, + "rewards/rejected": -9.180798000759548, + "step": 576 + }, + { + "epoch": 0.10543627227044312, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 4.546837099011101e-06, + "logits/chosen": -49423272.0, + "logits/rejected": -72986048.0, + "logps/chosen": -173.82273864746094, + "logps/rejected": -442.2939758300781, + "loss": 0.0274, + "rewards/chosen": 1.678079605102539, + "rewards/margins": 13.594940185546875, + "rewards/rejected": -11.916860580444336, + "step": 577 + }, + { + "epoch": 0.10561900411146642, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 4.529458433407429e-06, + "logits/chosen": -84290240.0, + "logits/rejected": -178027558.4, + "logps/chosen": -228.34459339488637, + "logps/rejected": -594.07490234375, + "loss": 0.0406, + "rewards/chosen": 1.3942446275190874, + "rewards/margins": 14.9886568589644, + "rewards/rejected": -13.594412231445313, + "step": 578 + }, + { + "epoch": 0.10580173595248972, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 4.512085501204254e-06, + "logits/chosen": -68350232.0, + "logits/rejected": -100604904.0, + "logps/chosen": -152.19198608398438, + "logps/rejected": -463.07781982421875, + "loss": 0.0228, + "rewards/chosen": 1.8399549722671509, + "rewards/margins": 14.421412110328674, + "rewards/rejected": -12.581457138061523, + "step": 579 + }, + { + "epoch": 0.10598446779351302, + "grad_norm": 10.25, + "kl": 0.009983062744140625, + "learning_rate": 4.494718514085269e-06, + "logits/chosen": -79343476.36363636, + "logits/rejected": -98499347.2, + "logps/chosen": -241.46946022727272, + "logps/rejected": -358.307177734375, + "loss": 0.0393, + "rewards/chosen": 1.3233467448841443, + "rewards/margins": 12.008571659434926, + "rewards/rejected": -10.685224914550782, + "step": 580 + }, + { + "epoch": 0.10616719963453632, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 4.477357683661734e-06, + "logits/chosen": -105147690.66666667, + "logits/rejected": -87413165.71428572, + "logps/chosen": -203.89752875434027, + "logps/rejected": -424.28463309151783, + "loss": 0.0305, + "rewards/chosen": 2.4570208655463324, + "rewards/margins": 12.827990365406823, + "rewards/rejected": -10.370969499860491, + "step": 581 + }, + { + "epoch": 0.10634993147555961, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.460003221469886e-06, + "logits/chosen": -85980512.0, + "logits/rejected": -103273176.0, + "logps/chosen": -365.91778564453125, + "logps/rejected": -383.60321044921875, + "loss": 0.05, + "rewards/chosen": 0.026955515146255493, + "rewards/margins": 12.595412164926529, + "rewards/rejected": -12.568456649780273, + "step": 582 + }, + { + "epoch": 0.10653266331658291, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 4.442655338968373e-06, + "logits/chosen": -206676512.0, + "logits/rejected": -84617728.0, + "logps/chosen": -273.9619445800781, + "logps/rejected": -497.8356018066406, + "loss": 0.0349, + "rewards/chosen": 0.8560299277305603, + "rewards/margins": 12.980144917964935, + "rewards/rejected": -12.124114990234375, + "step": 583 + }, + { + "epoch": 0.10671539515760621, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 4.425314247535668e-06, + "logits/chosen": -61555014.4, + "logits/rejected": -89702666.66666667, + "logps/chosen": -244.594970703125, + "logps/rejected": -433.9906005859375, + "loss": 0.0284, + "rewards/chosen": 1.6110076904296875, + "rewards/margins": 15.739252726236979, + "rewards/rejected": -14.128245035807291, + "step": 584 + }, + { + "epoch": 0.10689812699862951, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 4.4079801584674955e-06, + "logits/chosen": -144243488.0, + "logits/rejected": -69942664.0, + "logps/chosen": -231.38363647460938, + "logps/rejected": -453.15093994140625, + "loss": 0.0452, + "rewards/chosen": 0.6123269200325012, + "rewards/margins": 13.688275635242462, + "rewards/rejected": -13.075948715209961, + "step": 585 + }, + { + "epoch": 0.1070808588396528, + "grad_norm": 5.34375, + "kl": 0.5568675994873047, + "learning_rate": 4.390653282974264e-06, + "logits/chosen": -59138688.0, + "logits/rejected": -67540597.33333333, + "logps/chosen": -210.20248413085938, + "logps/rejected": -461.3831380208333, + "loss": 0.0237, + "rewards/chosen": 1.369225263595581, + "rewards/margins": 13.34727676709493, + "rewards/rejected": -11.97805150349935, + "step": 586 + }, + { + "epoch": 0.1072635906806761, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.373333832178478e-06, + "logits/chosen": -83141352.72727273, + "logits/rejected": -64664179.2, + "logps/chosen": -228.29514382102272, + "logps/rejected": -535.72353515625, + "loss": 0.0394, + "rewards/chosen": 1.635188709605824, + "rewards/margins": 18.996410023082387, + "rewards/rejected": -17.361221313476562, + "step": 587 + }, + { + "epoch": 0.1074463225216994, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 4.356022017112187e-06, + "logits/chosen": -88053174.85714285, + "logits/rejected": -137558087.1111111, + "logps/chosen": -176.72516741071428, + "logps/rejected": -444.14800347222223, + "loss": 0.025, + "rewards/chosen": 1.7227223260062081, + "rewards/margins": 14.377888104272268, + "rewards/rejected": -12.65516577826606, + "step": 588 + }, + { + "epoch": 0.1076290543627227, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 4.3387180487143875e-06, + "logits/chosen": -107628472.0, + "logits/rejected": -62912396.0, + "logps/chosen": -218.43600463867188, + "logps/rejected": -522.9843139648438, + "loss": 0.0233, + "rewards/chosen": 2.175706624984741, + "rewards/margins": 14.90075945854187, + "rewards/rejected": -12.725052833557129, + "step": 589 + }, + { + "epoch": 0.107811786203746, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 4.321422137828479e-06, + "logits/chosen": -74855624.0, + "logits/rejected": -55179956.0, + "logps/chosen": -250.20777893066406, + "logps/rejected": -492.3138427734375, + "loss": 0.0264, + "rewards/chosen": 2.2896337509155273, + "rewards/margins": 13.56324577331543, + "rewards/rejected": -11.273612022399902, + "step": 590 + }, + { + "epoch": 0.1079945180447693, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 4.304134495199675e-06, + "logits/chosen": -82987968.0, + "logits/rejected": -73680844.8, + "logps/chosen": -290.85102982954544, + "logps/rejected": -356.920849609375, + "loss": 0.0392, + "rewards/chosen": 1.256424990567294, + "rewards/margins": 13.193424502286042, + "rewards/rejected": -11.93699951171875, + "step": 591 + }, + { + "epoch": 0.1081772498857926, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 4.286855331472442e-06, + "logits/chosen": -85466579.2, + "logits/rejected": -83458272.0, + "logps/chosen": -264.129345703125, + "logps/rejected": -596.73095703125, + "loss": 0.049, + "rewards/chosen": 0.6364431381225586, + "rewards/margins": 10.109417915344238, + "rewards/rejected": -9.47297477722168, + "step": 592 + }, + { + "epoch": 0.1083599817268159, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 4.269584857187942e-06, + "logits/chosen": -76212379.42857143, + "logits/rejected": -82237326.22222222, + "logps/chosen": -230.60505022321428, + "logps/rejected": -431.4138454861111, + "loss": 0.0339, + "rewards/chosen": 0.8234494754246303, + "rewards/margins": 11.108671846843901, + "rewards/rejected": -10.285222371419271, + "step": 593 + }, + { + "epoch": 0.10854271356783919, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 4.2523232827814534e-06, + "logits/chosen": -81936676.57142857, + "logits/rejected": -36241959.11111111, + "logps/chosen": -190.42013113839286, + "logps/rejected": -572.7173936631945, + "loss": 0.0236, + "rewards/chosen": 1.3077222279139928, + "rewards/margins": 12.26788492051382, + "rewards/rejected": -10.960162692599827, + "step": 594 + }, + { + "epoch": 0.10872544540886249, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 4.23507081857981e-06, + "logits/chosen": -67128667.42857143, + "logits/rejected": -90075832.8888889, + "logps/chosen": -229.40984235491072, + "logps/rejected": -322.4080403645833, + "loss": 0.012, + "rewards/chosen": 2.6920618329729353, + "rewards/margins": 11.940283336336652, + "rewards/rejected": -9.248221503363716, + "step": 595 + }, + { + "epoch": 0.10890817724988579, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 4.217827674798845e-06, + "logits/chosen": -85823600.0, + "logits/rejected": -77487424.0, + "logps/chosen": -181.3414764404297, + "logps/rejected": -328.3720397949219, + "loss": 0.0399, + "rewards/chosen": 0.7368984222412109, + "rewards/margins": 13.591279029846191, + "rewards/rejected": -12.85438060760498, + "step": 596 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 4.200594061540827e-06, + "logits/chosen": -20138275.2, + "logits/rejected": -32283616.0, + "logps/chosen": -218.967236328125, + "logps/rejected": -556.8770419034091, + "loss": 0.028, + "rewards/chosen": 0.6598290920257568, + "rewards/margins": 14.200957077199762, + "rewards/rejected": -13.541127985174006, + "step": 597 + }, + { + "epoch": 0.10927364093193238, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 4.183370188791891e-06, + "logits/chosen": -176687065.6, + "logits/rejected": -123074059.63636364, + "logps/chosen": -328.8500244140625, + "logps/rejected": -413.07106711647725, + "loss": 0.0069, + "rewards/chosen": 2.802720069885254, + "rewards/margins": 15.430982676419346, + "rewards/rejected": -12.628262606534092, + "step": 598 + }, + { + "epoch": 0.10945637277295568, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 4.166156266419489e-06, + "logits/chosen": -147944576.0, + "logits/rejected": -78711383.27272727, + "logps/chosen": -278.31416015625, + "logps/rejected": -409.30104758522725, + "loss": 0.0271, + "rewards/chosen": 0.5568928241729736, + "rewards/margins": 10.897011232376098, + "rewards/rejected": -10.340118408203125, + "step": 599 + }, + { + "epoch": 0.10963910461397898, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 4.148952504169839e-06, + "logits/chosen": -155135388.44444445, + "logits/rejected": -67195899.42857143, + "logps/chosen": -321.15741644965277, + "logps/rejected": -273.4991978236607, + "loss": 0.0527, + "rewards/chosen": 0.2643130620320638, + "rewards/margins": 8.54129441579183, + "rewards/rejected": -8.276981353759766, + "step": 600 + }, + { + "epoch": 0.10982183645500228, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 4.131759111665349e-06, + "logits/chosen": -101655814.4, + "logits/rejected": -124277312.0, + "logps/chosen": -219.5938232421875, + "logps/rejected": -380.97705078125, + "loss": 0.0308, + "rewards/chosen": 1.9781539916992188, + "rewards/margins": 10.555392074584962, + "rewards/rejected": -8.577238082885742, + "step": 601 + }, + { + "epoch": 0.11000456829602558, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 4.114576298402085e-06, + "logits/chosen": -70856199.1111111, + "logits/rejected": -4505763.428571428, + "logps/chosen": -340.44639756944446, + "logps/rejected": -336.8223353794643, + "loss": 0.0409, + "rewards/chosen": 0.8283873134189181, + "rewards/margins": 9.57964507360307, + "rewards/rejected": -8.751257760184151, + "step": 602 + }, + { + "epoch": 0.11018730013704887, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.0974042737472005e-06, + "logits/chosen": -105649292.8, + "logits/rejected": -131470858.66666667, + "logps/chosen": -245.4068115234375, + "logps/rejected": -343.7392578125, + "loss": 0.0284, + "rewards/chosen": 1.6552656173706055, + "rewards/margins": 12.437569745381674, + "rewards/rejected": -10.782304128011068, + "step": 603 + }, + { + "epoch": 0.11037003197807217, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 4.0802432469364e-06, + "logits/chosen": -38579869.333333336, + "logits/rejected": -71415500.8, + "logps/chosen": -266.7564697265625, + "logps/rejected": -354.6275146484375, + "loss": 0.0315, + "rewards/chosen": 0.37617921829223633, + "rewards/margins": 9.58320665359497, + "rewards/rejected": -9.207027435302734, + "step": 604 + }, + { + "epoch": 0.11055276381909548, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 4.063093427071376e-06, + "logits/chosen": -37758592.0, + "logits/rejected": -71159168.0, + "logps/chosen": -204.02818298339844, + "logps/rejected": -432.7371826171875, + "loss": 0.031, + "rewards/chosen": 1.1544345617294312, + "rewards/margins": 14.298617720603943, + "rewards/rejected": -13.144183158874512, + "step": 605 + }, + { + "epoch": 0.11073549566011878, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 4.045955023117276e-06, + "logits/chosen": -109479146.66666667, + "logits/rejected": -71692192.0, + "logps/chosen": -200.07830810546875, + "logps/rejected": -464.44462890625, + "loss": 0.0172, + "rewards/chosen": 1.7025294303894043, + "rewards/margins": 15.686785411834716, + "rewards/rejected": -13.984255981445312, + "step": 606 + }, + { + "epoch": 0.11091822750114208, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 4.028828243900141e-06, + "logits/chosen": -96759404.8, + "logits/rejected": -95084386.9090909, + "logps/chosen": -236.7611572265625, + "logps/rejected": -418.9972478693182, + "loss": 0.0343, + "rewards/chosen": -0.03525831699371338, + "rewards/margins": 10.759513050859624, + "rewards/rejected": -10.794771367853338, + "step": 607 + }, + { + "epoch": 0.11110095934216538, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 4.0117132981043695e-06, + "logits/chosen": -158144256.0, + "logits/rejected": -52449888.0, + "logps/chosen": -215.9217325846354, + "logps/rejected": -440.658740234375, + "loss": 0.0222, + "rewards/chosen": 1.2703217665354412, + "rewards/margins": 11.78005077044169, + "rewards/rejected": -10.50972900390625, + "step": 608 + }, + { + "epoch": 0.11128369118318868, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 3.994610394270178e-06, + "logits/chosen": -29839880.0, + "logits/rejected": -34282680.0, + "logps/chosen": -140.78123474121094, + "logps/rejected": -547.2689208984375, + "loss": 0.0388, + "rewards/chosen": 0.8935467600822449, + "rewards/margins": 14.850187003612518, + "rewards/rejected": -13.956640243530273, + "step": 609 + }, + { + "epoch": 0.11146642302421197, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 3.977519740791049e-06, + "logits/chosen": -133756853.33333333, + "logits/rejected": -110832000.0, + "logps/chosen": -326.18646240234375, + "logps/rejected": -481.386669921875, + "loss": 0.0226, + "rewards/chosen": 1.0009659131368, + "rewards/margins": 13.524693330128988, + "rewards/rejected": -12.523727416992188, + "step": 610 + }, + { + "epoch": 0.11164915486523527, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 3.960441545911205e-06, + "logits/chosen": -85709992.0, + "logits/rejected": 8897553.0, + "logps/chosen": -227.45773315429688, + "logps/rejected": -375.70648193359375, + "loss": 0.0287, + "rewards/chosen": 1.5257797241210938, + "rewards/margins": 11.202047348022461, + "rewards/rejected": -9.676267623901367, + "step": 611 + }, + { + "epoch": 0.11183188670625857, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.943376017723058e-06, + "logits/chosen": -119999104.0, + "logits/rejected": -113004430.22222222, + "logps/chosen": -126.04107666015625, + "logps/rejected": -593.1810980902778, + "loss": 0.0108, + "rewards/chosen": 3.0613327026367188, + "rewards/margins": 16.052487691243492, + "rewards/rejected": -12.991154988606771, + "step": 612 + }, + { + "epoch": 0.11201461854728187, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 3.926323364164684e-06, + "logits/chosen": -135772928.0, + "logits/rejected": -89185173.33333333, + "logps/chosen": -179.4945556640625, + "logps/rejected": -446.4346923828125, + "loss": 0.0607, + "rewards/chosen": 0.3227109909057617, + "rewards/margins": 14.046323458353678, + "rewards/rejected": -13.723612467447916, + "step": 613 + }, + { + "epoch": 0.11219735038830517, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 3.909283793017289e-06, + "logits/chosen": -94099044.57142857, + "logits/rejected": -86616106.66666667, + "logps/chosen": -256.58115931919644, + "logps/rejected": -560.1159939236111, + "loss": 0.0442, + "rewards/chosen": 1.5338457652500697, + "rewards/margins": 11.19193699246361, + "rewards/rejected": -9.658091227213541, + "step": 614 + }, + { + "epoch": 0.11238008222932847, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 3.892257511902664e-06, + "logits/chosen": -77434404.57142857, + "logits/rejected": -71947719.1111111, + "logps/chosen": -262.5162876674107, + "logps/rejected": -601.6742621527778, + "loss": 0.0278, + "rewards/chosen": 1.6595630645751953, + "rewards/margins": 15.482840855916342, + "rewards/rejected": -13.823277791341146, + "step": 615 + }, + { + "epoch": 0.11256281407035176, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 3.875244728280676e-06, + "logits/chosen": -132673200.0, + "logits/rejected": -116446912.0, + "logps/chosen": -177.49057006835938, + "logps/rejected": -435.3547058105469, + "loss": 0.038, + "rewards/chosen": 1.30196213722229, + "rewards/margins": 11.54032826423645, + "rewards/rejected": -10.23836612701416, + "step": 616 + }, + { + "epoch": 0.11274554591137506, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 3.8582456494467214e-06, + "logits/chosen": -131157888.0, + "logits/rejected": -9618464.0, + "logps/chosen": -171.65439453125, + "logps/rejected": -484.5679524739583, + "loss": 0.0456, + "rewards/chosen": 1.1588326454162599, + "rewards/margins": 9.75233596165975, + "rewards/rejected": -8.59350331624349, + "step": 617 + }, + { + "epoch": 0.11292827775239836, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 3.841260482529215e-06, + "logits/chosen": -82977740.8, + "logits/rejected": -54707979.63636363, + "logps/chosen": -296.424853515625, + "logps/rejected": -434.7078746448864, + "loss": 0.0251, + "rewards/chosen": 0.7747674942016601, + "rewards/margins": 11.326626569574529, + "rewards/rejected": -10.55185907537287, + "step": 618 + }, + { + "epoch": 0.11311100959342166, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 3.82428943448705e-06, + "logits/chosen": -74435680.0, + "logits/rejected": -44756213.333333336, + "logps/chosen": -281.0823486328125, + "logps/rejected": -322.0531412760417, + "loss": 0.0477, + "rewards/chosen": 2.6280460357666016, + "rewards/margins": 8.891389846801758, + "rewards/rejected": -6.263343811035156, + "step": 619 + }, + { + "epoch": 0.11329374143444496, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 3.8073327121070968e-06, + "logits/chosen": -105258311.1111111, + "logits/rejected": -66639149.71428572, + "logps/chosen": -198.99948459201389, + "logps/rejected": -430.30308314732144, + "loss": 0.033, + "rewards/chosen": 1.7720780902438693, + "rewards/margins": 11.95045224447099, + "rewards/rejected": -10.17837415422712, + "step": 620 + }, + { + "epoch": 0.11347647327546825, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 3.790390522001662e-06, + "logits/chosen": -124494176.0, + "logits/rejected": -117272870.4, + "logps/chosen": -247.6156005859375, + "logps/rejected": -354.250244140625, + "loss": 0.0212, + "rewards/chosen": 1.2120788892110188, + "rewards/margins": 10.992363770802816, + "rewards/rejected": -9.780284881591797, + "step": 621 + }, + { + "epoch": 0.11365920511649155, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 3.7734630706059873e-06, + "logits/chosen": -126363552.0, + "logits/rejected": -59222360.0, + "logps/chosen": -242.32809448242188, + "logps/rejected": -420.2986755371094, + "loss": 0.0413, + "rewards/chosen": 0.44547224044799805, + "rewards/margins": 12.561496257781982, + "rewards/rejected": -12.116024017333984, + "step": 622 + }, + { + "epoch": 0.11384193695751485, + "grad_norm": 6.84375, + "kl": 0.5507698059082031, + "learning_rate": 3.756550564175727e-06, + "logits/chosen": -137679052.8, + "logits/rejected": -91461888.0, + "logps/chosen": -146.215869140625, + "logps/rejected": -507.0299072265625, + "loss": 0.0191, + "rewards/chosen": 2.806045913696289, + "rewards/margins": 14.616001510620118, + "rewards/rejected": -11.809955596923828, + "step": 623 + }, + { + "epoch": 0.11402466879853815, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 3.7396532087844318e-06, + "logits/chosen": -57257557.333333336, + "logits/rejected": -90027756.8, + "logps/chosen": -319.32191975911456, + "logps/rejected": -419.9021484375, + "loss": 0.0203, + "rewards/chosen": 1.0927114486694336, + "rewards/margins": 13.416908836364746, + "rewards/rejected": -12.324197387695312, + "step": 624 + }, + { + "epoch": 0.11420740063956145, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 3.7227712103210485e-06, + "logits/chosen": -42576973.71428572, + "logits/rejected": -71181994.66666667, + "logps/chosen": -226.03526088169642, + "logps/rejected": -376.94232855902777, + "loss": 0.0254, + "rewards/chosen": 2.0022490365164622, + "rewards/margins": 13.133750976078094, + "rewards/rejected": -11.131501939561632, + "step": 625 + }, + { + "epoch": 0.11439013248058474, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 3.705904774487396e-06, + "logits/chosen": -88177587.2, + "logits/rejected": -111480234.66666667, + "logps/chosen": -168.18475341796875, + "logps/rejected": -455.3088785807292, + "loss": 0.0294, + "rewards/chosen": 2.041543960571289, + "rewards/margins": 14.29700787862142, + "rewards/rejected": -12.25546391805013, + "step": 626 + }, + { + "epoch": 0.11457286432160804, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 3.6890541067956775e-06, + "logits/chosen": -53162986.666666664, + "logits/rejected": -60815014.4, + "logps/chosen": -199.8506876627604, + "logps/rejected": -491.970361328125, + "loss": 0.0367, + "rewards/chosen": 0.1279990871747335, + "rewards/margins": 12.560731478532157, + "rewards/rejected": -12.432732391357423, + "step": 627 + }, + { + "epoch": 0.11475559616263134, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 3.672219412565956e-06, + "logits/chosen": -121890659.55555555, + "logits/rejected": -18882089.14285714, + "logps/chosen": -274.62754991319446, + "logps/rejected": -530.7484654017857, + "loss": 0.0424, + "rewards/chosen": 0.7701224750942655, + "rewards/margins": 13.289028114742703, + "rewards/rejected": -12.518905639648438, + "step": 628 + }, + { + "epoch": 0.11493832800365464, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 3.655400896923672e-06, + "logits/chosen": -130720720.0, + "logits/rejected": -64043096.0, + "logps/chosen": -239.04135131835938, + "logps/rejected": -459.6484069824219, + "loss": 0.0332, + "rewards/chosen": 1.2850780487060547, + "rewards/margins": 12.508049011230469, + "rewards/rejected": -11.222970962524414, + "step": 629 + }, + { + "epoch": 0.11512105984467794, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 3.6385987647971287e-06, + "logits/chosen": -137385792.0, + "logits/rejected": -86265192.72727273, + "logps/chosen": -194.8170654296875, + "logps/rejected": -345.0782581676136, + "loss": 0.0232, + "rewards/chosen": 1.373218059539795, + "rewards/margins": 10.760995214635676, + "rewards/rejected": -9.38777715509588, + "step": 630 + }, + { + "epoch": 0.11530379168570123, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.6218132209150047e-06, + "logits/chosen": -62412776.0, + "logits/rejected": -95775584.0, + "logps/chosen": -208.185546875, + "logps/rejected": -570.5597534179688, + "loss": 0.0318, + "rewards/chosen": 1.4165503978729248, + "rewards/margins": 16.273138761520386, + "rewards/rejected": -14.856588363647461, + "step": 631 + }, + { + "epoch": 0.11548652352672453, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 3.6050444698038547e-06, + "logits/chosen": -141956807.1111111, + "logits/rejected": -91478774.85714285, + "logps/chosen": -203.621826171875, + "logps/rejected": -511.56556919642856, + "loss": 0.0214, + "rewards/chosen": 2.859153535630968, + "rewards/margins": 15.14696917458186, + "rewards/rejected": -12.287815638950892, + "step": 632 + }, + { + "epoch": 0.11566925536774783, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 3.5882927157856175e-06, + "logits/chosen": -70625656.8888889, + "logits/rejected": -64359076.571428575, + "logps/chosen": -166.93983289930554, + "logps/rejected": -412.2510463169643, + "loss": 0.0368, + "rewards/chosen": 1.1331762737698026, + "rewards/margins": 12.0220672213842, + "rewards/rejected": -10.888890947614398, + "step": 633 + }, + { + "epoch": 0.11585198720877113, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 3.571558162975133e-06, + "logits/chosen": -93873568.0, + "logits/rejected": -121225024.0, + "logps/chosen": -220.2122802734375, + "logps/rejected": -363.9589029947917, + "loss": 0.0386, + "rewards/chosen": 1.3247323989868165, + "rewards/margins": 11.270587221781412, + "rewards/rejected": -9.945854822794596, + "step": 634 + }, + { + "epoch": 0.11603471904979443, + "grad_norm": 9.5625, + "kl": 0.245574951171875, + "learning_rate": 3.5548410152776414e-06, + "logits/chosen": -150955638.85714287, + "logits/rejected": -67237347.55555555, + "logps/chosen": -247.38895089285714, + "logps/rejected": -282.30650499131946, + "loss": 0.0252, + "rewards/chosen": 1.6686375481741769, + "rewards/margins": 12.031078717065236, + "rewards/rejected": -10.36244116889106, + "step": 635 + }, + { + "epoch": 0.11621745089081773, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 3.538141476386317e-06, + "logits/chosen": -64139880.72727273, + "logits/rejected": -78994329.6, + "logps/chosen": -166.4248712713068, + "logps/rejected": -204.7114990234375, + "loss": 0.0452, + "rewards/chosen": 1.9935493469238281, + "rewards/margins": 9.792948150634766, + "rewards/rejected": -7.799398803710938, + "step": 636 + }, + { + "epoch": 0.11640018273184102, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 3.521459749779769e-06, + "logits/chosen": -54623420.0, + "logits/rejected": -93371736.0, + "logps/chosen": -179.79649353027344, + "logps/rejected": -522.5574340820312, + "loss": 0.0261, + "rewards/chosen": 2.109544038772583, + "rewards/margins": 15.424026727676392, + "rewards/rejected": -13.314482688903809, + "step": 637 + }, + { + "epoch": 0.11658291457286432, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 3.5047960387195673e-06, + "logits/chosen": -159568028.44444445, + "logits/rejected": -98570441.14285715, + "logps/chosen": -201.41871473524304, + "logps/rejected": -536.634033203125, + "loss": 0.0447, + "rewards/chosen": 0.8677163124084473, + "rewards/margins": 10.938590117863246, + "rewards/rejected": -10.070873805454799, + "step": 638 + }, + { + "epoch": 0.11676564641388762, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 3.488150546247778e-06, + "logits/chosen": -66798772.36363637, + "logits/rejected": -53404729.6, + "logps/chosen": -213.44655539772728, + "logps/rejected": -579.68779296875, + "loss": 0.0352, + "rewards/chosen": 1.7007633556019177, + "rewards/margins": 15.401408802379262, + "rewards/rejected": -13.700645446777344, + "step": 639 + }, + { + "epoch": 0.11694837825491092, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 3.471523475184472e-06, + "logits/chosen": -138124657.7777778, + "logits/rejected": -153257289.14285713, + "logps/chosen": -323.5697428385417, + "logps/rejected": -556.67919921875, + "loss": 0.0354, + "rewards/chosen": 1.0444081624348958, + "rewards/margins": 15.560387384323848, + "rewards/rejected": -14.515979221888951, + "step": 640 + }, + { + "epoch": 0.11713111009593422, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 3.4549150281252635e-06, + "logits/chosen": -80144051.2, + "logits/rejected": -54501136.0, + "logps/chosen": -175.368115234375, + "logps/rejected": -365.7041015625, + "loss": 0.038, + "rewards/chosen": 1.3539721488952636, + "rewards/margins": 12.050074291229247, + "rewards/rejected": -10.696102142333984, + "step": 641 + }, + { + "epoch": 0.11731384193695751, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.4383254074388373e-06, + "logits/chosen": -114714840.0, + "logits/rejected": -70015896.0, + "logps/chosen": -324.07122802734375, + "logps/rejected": -505.2621765136719, + "loss": 0.0337, + "rewards/chosen": 0.9440308213233948, + "rewards/margins": 12.022766172885895, + "rewards/rejected": -11.0787353515625, + "step": 642 + }, + { + "epoch": 0.11749657377798081, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 3.4217548152644887e-06, + "logits/chosen": -105198812.44444445, + "logits/rejected": -71265851.42857143, + "logps/chosen": -229.54459635416666, + "logps/rejected": -367.7374790736607, + "loss": 0.0314, + "rewards/chosen": 1.4880302217271593, + "rewards/margins": 12.394751064361088, + "rewards/rejected": -10.906720842633929, + "step": 643 + }, + { + "epoch": 0.11767930561900411, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 3.40520345350965e-06, + "logits/chosen": -104784464.0, + "logits/rejected": -69350176.0, + "logps/chosen": -342.1327209472656, + "logps/rejected": -591.6220092773438, + "loss": 0.0168, + "rewards/chosen": 2.423448085784912, + "rewards/margins": 16.33216428756714, + "rewards/rejected": -13.908716201782227, + "step": 644 + }, + { + "epoch": 0.11786203746002741, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 3.3886715238474454e-06, + "logits/chosen": -78235189.33333333, + "logits/rejected": -130714739.2, + "logps/chosen": -419.6353759765625, + "logps/rejected": -370.5298583984375, + "loss": 0.0394, + "rewards/chosen": 0.013947556416193644, + "rewards/margins": 12.670238755146661, + "rewards/rejected": -12.656291198730468, + "step": 645 + }, + { + "epoch": 0.1180447693010507, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.372159227714218e-06, + "logits/chosen": -95664204.8, + "logits/rejected": -99727328.0, + "logps/chosen": -186.41278076171875, + "logps/rejected": -436.77685546875, + "loss": 0.0428, + "rewards/chosen": 1.0311882972717286, + "rewards/margins": 13.026352532704673, + "rewards/rejected": -11.995164235432943, + "step": 646 + }, + { + "epoch": 0.118227501142074, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 3.355666766307084e-06, + "logits/chosen": -115999449.6, + "logits/rejected": -135336682.66666666, + "logps/chosen": -298.919873046875, + "logps/rejected": -712.3668619791666, + "loss": 0.0369, + "rewards/chosen": 1.7309720993041993, + "rewards/margins": 21.76955394744873, + "rewards/rejected": -20.03858184814453, + "step": 647 + }, + { + "epoch": 0.1184102329830973, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.339194340581485e-06, + "logits/chosen": -90271963.42857143, + "logits/rejected": -82696298.66666667, + "logps/chosen": -149.10157993861608, + "logps/rejected": -572.7481011284722, + "loss": 0.0461, + "rewards/chosen": 0.38024027006966726, + "rewards/margins": 12.215319887040152, + "rewards/rejected": -11.835079616970486, + "step": 648 + }, + { + "epoch": 0.1185929648241206, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 3.322742151248726e-06, + "logits/chosen": -74346604.8, + "logits/rejected": -80160906.66666667, + "logps/chosen": -239.6134765625, + "logps/rejected": -307.4821370442708, + "loss": 0.0454, + "rewards/chosen": 1.2306669235229493, + "rewards/margins": 10.21347599029541, + "rewards/rejected": -8.982809066772461, + "step": 649 + }, + { + "epoch": 0.1187756966651439, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 3.3063103987735433e-06, + "logits/chosen": -152383360.0, + "logits/rejected": -77252561.45454545, + "logps/chosen": -239.04521484375, + "logps/rejected": -467.9567205255682, + "loss": 0.0281, + "rewards/chosen": 0.422749662399292, + "rewards/margins": 11.70445257100192, + "rewards/rejected": -11.281702908602627, + "step": 650 + }, + { + "epoch": 0.1189584285061672, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 3.289899283371657e-06, + "logits/chosen": -41591011.2, + "logits/rejected": -92100061.0909091, + "logps/chosen": -252.261474609375, + "logps/rejected": -371.45359108664775, + "loss": 0.0197, + "rewards/chosen": 1.6276590347290039, + "rewards/margins": 10.681050231240011, + "rewards/rejected": -9.053391196511008, + "step": 651 + }, + { + "epoch": 0.1191411603471905, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 3.273509005007327e-06, + "logits/chosen": -60191680.0, + "logits/rejected": -90196465.77777778, + "logps/chosen": -256.64168875558033, + "logps/rejected": -435.97026909722223, + "loss": 0.0303, + "rewards/chosen": 1.5706800733293806, + "rewards/margins": 12.778529363965232, + "rewards/rejected": -11.207849290635851, + "step": 652 + }, + { + "epoch": 0.11932389218821379, + "grad_norm": 10.125, + "kl": 0.45502281188964844, + "learning_rate": 3.2571397633909252e-06, + "logits/chosen": -104469620.36363636, + "logits/rejected": -66647737.6, + "logps/chosen": -209.0520685369318, + "logps/rejected": -570.14775390625, + "loss": 0.0415, + "rewards/chosen": 1.8859594518488103, + "rewards/margins": 13.588142984563655, + "rewards/rejected": -11.702183532714844, + "step": 653 + }, + { + "epoch": 0.11950662402923709, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 3.2407917579764914e-06, + "logits/chosen": -92018343.38461539, + "logits/rejected": -122548234.66666667, + "logps/chosen": -283.29676231971155, + "logps/rejected": -496.7533365885417, + "loss": 0.0396, + "rewards/chosen": 1.7605169736422026, + "rewards/margins": 17.72933182349572, + "rewards/rejected": -15.968814849853516, + "step": 654 + }, + { + "epoch": 0.11968935587026039, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 3.224465187959316e-06, + "logits/chosen": -70684058.66666667, + "logits/rejected": -62723616.0, + "logps/chosen": -322.4065348307292, + "logps/rejected": -316.727685546875, + "loss": 0.0212, + "rewards/chosen": 1.4753586451212566, + "rewards/margins": 12.119414583841959, + "rewards/rejected": -10.644055938720703, + "step": 655 + }, + { + "epoch": 0.11987208771128369, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 3.2081602522734987e-06, + "logits/chosen": -146265301.33333334, + "logits/rejected": -52143972.0, + "logps/chosen": -256.4151204427083, + "logps/rejected": -410.81048583984375, + "loss": 0.0397, + "rewards/chosen": 1.7188830375671387, + "rewards/margins": 12.639281749725342, + "rewards/rejected": -10.920398712158203, + "step": 656 + }, + { + "epoch": 0.12005481955230698, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 3.1918771495895395e-06, + "logits/chosen": -122820650.66666667, + "logits/rejected": -82599588.57142857, + "logps/chosen": -182.6466064453125, + "logps/rejected": -438.90279715401783, + "loss": 0.0327, + "rewards/chosen": 1.3199058108859592, + "rewards/margins": 12.159624220832946, + "rewards/rejected": -10.839718409946986, + "step": 657 + }, + { + "epoch": 0.12023755139333028, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 3.1756160783119015e-06, + "logits/chosen": -102025638.4, + "logits/rejected": -84162826.66666667, + "logps/chosen": -259.278125, + "logps/rejected": -452.2644449869792, + "loss": 0.03, + "rewards/chosen": 2.364429473876953, + "rewards/margins": 14.321159362792969, + "rewards/rejected": -11.956729888916016, + "step": 658 + }, + { + "epoch": 0.12042028323435358, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 3.1593772365766107e-06, + "logits/chosen": -96821368.0, + "logits/rejected": -35767744.0, + "logps/chosen": -277.1356201171875, + "logps/rejected": -463.5028483072917, + "loss": 0.0098, + "rewards/chosen": 2.54237699508667, + "rewards/margins": 16.231452465057373, + "rewards/rejected": -13.689075469970703, + "step": 659 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 8.375, + "kl": 0.07431221008300781, + "learning_rate": 3.1431608222488276e-06, + "logits/chosen": -76809939.2, + "logits/rejected": -78011968.0, + "logps/chosen": -214.9060546875, + "logps/rejected": -468.3467610677083, + "loss": 0.0394, + "rewards/chosen": 0.889615535736084, + "rewards/margins": 12.343685309092203, + "rewards/rejected": -11.45406977335612, + "step": 660 + }, + { + "epoch": 0.12078574691640018, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 3.12696703292044e-06, + "logits/chosen": -72206444.8, + "logits/rejected": -69148810.66666667, + "logps/chosen": -139.70595703125, + "logps/rejected": -533.9281819661459, + "loss": 0.0292, + "rewards/chosen": 2.0919464111328123, + "rewards/margins": 15.848154195149739, + "rewards/rejected": -13.756207784016928, + "step": 661 + }, + { + "epoch": 0.12096847875742348, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 3.110796065907665e-06, + "logits/chosen": -42529152.0, + "logits/rejected": -134055994.18181819, + "logps/chosen": -164.03966064453124, + "logps/rejected": -497.2824041193182, + "loss": 0.0207, + "rewards/chosen": 1.2002087593078614, + "rewards/margins": 13.39854527386752, + "rewards/rejected": -12.198336514559658, + "step": 662 + }, + { + "epoch": 0.12115121059844677, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 3.09464811824863e-06, + "logits/chosen": -85486272.0, + "logits/rejected": -39536526.222222224, + "logps/chosen": -162.17013113839286, + "logps/rejected": -523.0129665798611, + "loss": 0.0137, + "rewards/chosen": 3.378497804914202, + "rewards/margins": 14.373783686804392, + "rewards/rejected": -10.99528588189019, + "step": 663 + }, + { + "epoch": 0.12133394243947007, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 3.078523386700982e-06, + "logits/chosen": -57382156.0, + "logits/rejected": -56998328.0, + "logps/chosen": -199.49757385253906, + "logps/rejected": -419.0350341796875, + "loss": 0.0334, + "rewards/chosen": 1.1910182237625122, + "rewards/margins": 15.503950238227844, + "rewards/rejected": -14.312932014465332, + "step": 664 + }, + { + "epoch": 0.12151667428049337, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 3.0624220677394854e-06, + "logits/chosen": -77457179.42857143, + "logits/rejected": -68279196.44444445, + "logps/chosen": -218.09266880580358, + "logps/rejected": -632.1705729166666, + "loss": 0.0338, + "rewards/chosen": 1.2468000139508928, + "rewards/margins": 16.283146449497767, + "rewards/rejected": -15.036346435546875, + "step": 665 + }, + { + "epoch": 0.12169940612151667, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 3.0463443575536324e-06, + "logits/chosen": -122247795.2, + "logits/rejected": -93479466.66666667, + "logps/chosen": -208.180224609375, + "logps/rejected": -476.2403157552083, + "loss": 0.039, + "rewards/chosen": 1.1066829681396484, + "rewards/margins": 13.637596003214517, + "rewards/rejected": -12.53091303507487, + "step": 666 + }, + { + "epoch": 0.12188213796253997, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 3.030290452045245e-06, + "logits/chosen": -108138240.0, + "logits/rejected": -74423537.77777778, + "logps/chosen": -286.89285714285717, + "logps/rejected": -364.4562174479167, + "loss": 0.0258, + "rewards/chosen": 1.1681134360177177, + "rewards/margins": 11.244126789153569, + "rewards/rejected": -10.076013353135851, + "step": 667 + }, + { + "epoch": 0.12206486980356326, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 3.0142605468260976e-06, + "logits/chosen": -203297237.33333334, + "logits/rejected": -107138057.84615384, + "logps/chosen": -356.2391764322917, + "logps/rejected": -482.63953575721155, + "loss": 0.0077, + "rewards/chosen": 1.7497151692708333, + "rewards/margins": 14.13136682754908, + "rewards/rejected": -12.381651658278246, + "step": 668 + }, + { + "epoch": 0.12224760164458658, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.9982548372155264e-06, + "logits/chosen": -79378361.6, + "logits/rejected": -98453610.66666667, + "logps/chosen": -215.6226318359375, + "logps/rejected": -439.7357584635417, + "loss": 0.0405, + "rewards/chosen": 0.9136153221130371, + "rewards/margins": 11.309388891855875, + "rewards/rejected": -10.395773569742838, + "step": 669 + }, + { + "epoch": 0.12243033348560987, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 2.98227351823805e-06, + "logits/chosen": -105059093.33333333, + "logits/rejected": -66347657.14285714, + "logps/chosen": -200.13774956597223, + "logps/rejected": -459.21616908482144, + "loss": 0.0475, + "rewards/chosen": 1.9918414221869574, + "rewards/margins": 12.486590218922448, + "rewards/rejected": -10.494748796735491, + "step": 670 + }, + { + "epoch": 0.12261306532663317, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 2.966316784621e-06, + "logits/chosen": -37889005.71428572, + "logits/rejected": -28149624.888888888, + "logps/chosen": -210.32741001674108, + "logps/rejected": -502.03868272569446, + "loss": 0.023, + "rewards/chosen": 2.413312094552176, + "rewards/margins": 19.451453980945406, + "rewards/rejected": -17.03814188639323, + "step": 671 + }, + { + "epoch": 0.12279579716765647, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 2.9503848307921363e-06, + "logits/chosen": -105657735.1111111, + "logits/rejected": -83608393.14285715, + "logps/chosen": -252.251708984375, + "logps/rejected": -416.9437779017857, + "loss": 0.0233, + "rewards/chosen": 2.7017633650037975, + "rewards/margins": 13.908589711264959, + "rewards/rejected": -11.206826346261161, + "step": 672 + }, + { + "epoch": 0.12297852900867977, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 2.934477850877292e-06, + "logits/chosen": -122254310.4, + "logits/rejected": -224341397.33333334, + "logps/chosen": -259.40302734375, + "logps/rejected": -418.1013590494792, + "loss": 0.0399, + "rewards/chosen": 0.7958401203155517, + "rewards/margins": 10.90877423286438, + "rewards/rejected": -10.112934112548828, + "step": 673 + }, + { + "epoch": 0.12316126084970307, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.918596038697995e-06, + "logits/chosen": -91196586.66666667, + "logits/rejected": -99688265.14285715, + "logps/chosen": -246.24864366319446, + "logps/rejected": -320.1798618861607, + "loss": 0.0411, + "rewards/chosen": 0.9774103164672852, + "rewards/margins": 10.402562686375209, + "rewards/rejected": -9.425152369907924, + "step": 674 + }, + { + "epoch": 0.12334399269072636, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 2.9027395877691143e-06, + "logits/chosen": -84270557.0909091, + "logits/rejected": -88747264.0, + "logps/chosen": -234.0729092684659, + "logps/rejected": -382.391455078125, + "loss": 0.0323, + "rewards/chosen": 2.4894249655983667, + "rewards/margins": 14.542874977805399, + "rewards/rejected": -12.053450012207032, + "step": 675 + }, + { + "epoch": 0.12352672453174966, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 2.886908691296504e-06, + "logits/chosen": -142349888.0, + "logits/rejected": -76725689.6, + "logps/chosen": -264.54412841796875, + "logps/rejected": -449.40537109375, + "loss": 0.0218, + "rewards/chosen": 1.1156916618347168, + "rewards/margins": 13.271435832977295, + "rewards/rejected": -12.155744171142578, + "step": 676 + }, + { + "epoch": 0.12370945637277296, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 2.871103542174637e-06, + "logits/chosen": -154512586.66666666, + "logits/rejected": -111435507.2, + "logps/chosen": -170.36848958333334, + "logps/rejected": -580.4322265625, + "loss": 0.0112, + "rewards/chosen": 2.8801342646280923, + "rewards/margins": 18.665891710917155, + "rewards/rejected": -15.785757446289063, + "step": 677 + }, + { + "epoch": 0.12389218821379626, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 2.8553243329842715e-06, + "logits/chosen": -77028416.0, + "logits/rejected": -57101324.0, + "logps/chosen": -170.09010314941406, + "logps/rejected": -438.77996826171875, + "loss": 0.0249, + "rewards/chosen": 2.154907703399658, + "rewards/margins": 14.866350650787354, + "rewards/rejected": -12.711442947387695, + "step": 678 + }, + { + "epoch": 0.12407492005481956, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 2.839571255990088e-06, + "logits/chosen": -149882848.0, + "logits/rejected": -7495977.0, + "logps/chosen": -276.99462890625, + "logps/rejected": -376.6250305175781, + "loss": 0.0207, + "rewards/chosen": 2.722191333770752, + "rewards/margins": 12.897154331207275, + "rewards/rejected": -10.174962997436523, + "step": 679 + }, + { + "epoch": 0.12425765189584285, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 2.8238445031383634e-06, + "logits/chosen": -47792400.0, + "logits/rejected": -42578978.90909091, + "logps/chosen": -321.4134765625, + "logps/rejected": -364.1934259588068, + "loss": 0.0172, + "rewards/chosen": 1.1938196182250977, + "rewards/margins": 15.187983824990011, + "rewards/rejected": -13.994164206764914, + "step": 680 + }, + { + "epoch": 0.12444038373686615, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 2.8081442660546126e-06, + "logits/chosen": -137940096.0, + "logits/rejected": -146012864.0, + "logps/chosen": -261.017333984375, + "logps/rejected": -505.15496826171875, + "loss": 0.0412, + "rewards/chosen": 0.46475180983543396, + "rewards/margins": 13.92556819319725, + "rewards/rejected": -13.460816383361816, + "step": 681 + }, + { + "epoch": 0.12462311557788945, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 2.7924707360412743e-06, + "logits/chosen": -144627956.36363637, + "logits/rejected": -80511430.4, + "logps/chosen": -198.2787198153409, + "logps/rejected": -474.334375, + "loss": 0.0419, + "rewards/chosen": 1.3122313239357688, + "rewards/margins": 13.008124420859598, + "rewards/rejected": -11.695893096923829, + "step": 682 + }, + { + "epoch": 0.12480584741891275, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 2.776824104075364e-06, + "logits/chosen": -130646471.1111111, + "logits/rejected": -14812310.857142856, + "logps/chosen": -300.7469075520833, + "logps/rejected": -567.4651227678571, + "loss": 0.0343, + "rewards/chosen": 2.310657501220703, + "rewards/margins": 13.849564143589564, + "rewards/rejected": -11.538906642368861, + "step": 683 + }, + { + "epoch": 0.12498857925993605, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 2.761204560806152e-06, + "logits/chosen": -137833792.0, + "logits/rejected": -109302026.66666667, + "logps/chosen": -263.2238037109375, + "logps/rejected": -362.4595540364583, + "loss": 0.0485, + "rewards/chosen": 0.6454781532287598, + "rewards/margins": 8.678002897898356, + "rewards/rejected": -8.032524744669596, + "step": 684 + }, + { + "epoch": 0.12517131110095933, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 2.7456122965528475e-06, + "logits/chosen": -62754976.0, + "logits/rejected": -80752453.81818181, + "logps/chosen": -197.81884765625, + "logps/rejected": -379.7318004261364, + "loss": 0.0153, + "rewards/chosen": 1.7729095458984374, + "rewards/margins": 11.66182001287287, + "rewards/rejected": -9.888910466974432, + "step": 685 + }, + { + "epoch": 0.12535404294198263, + "grad_norm": 6.96875, + "kl": 0.19458389282226562, + "learning_rate": 2.7300475013022666e-06, + "logits/chosen": -133106916.57142857, + "logits/rejected": -66925454.222222224, + "logps/chosen": -270.294189453125, + "logps/rejected": -473.9279513888889, + "loss": 0.0145, + "rewards/chosen": 2.067577634538923, + "rewards/margins": 12.992400123959495, + "rewards/rejected": -10.924822489420572, + "step": 686 + }, + { + "epoch": 0.12553677478300593, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 2.714510364706531e-06, + "logits/chosen": -30482695.111111112, + "logits/rejected": -41190162.28571428, + "logps/chosen": -249.044189453125, + "logps/rejected": -400.93380301339283, + "loss": 0.0244, + "rewards/chosen": 1.8663279215494792, + "rewards/margins": 13.030192057291666, + "rewards/rejected": -11.163864135742188, + "step": 687 + }, + { + "epoch": 0.12571950662402923, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 2.699001076080742e-06, + "logits/chosen": -152358899.2, + "logits/rejected": -73804299.63636364, + "logps/chosen": -142.60670166015626, + "logps/rejected": -432.4446910511364, + "loss": 0.0191, + "rewards/chosen": 2.0243717193603517, + "rewards/margins": 12.90427325855602, + "rewards/rejected": -10.879901539195668, + "step": 688 + }, + { + "epoch": 0.12590223846505252, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 2.683519824400693e-06, + "logits/chosen": -90386400.0, + "logits/rejected": -68670624.0, + "logps/chosen": -215.0010223388672, + "logps/rejected": -304.2410888671875, + "loss": 0.0411, + "rewards/chosen": 1.5481953620910645, + "rewards/margins": 9.434277057647705, + "rewards/rejected": -7.886081695556641, + "step": 689 + }, + { + "epoch": 0.12608497030607582, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 2.6680667983005446e-06, + "logits/chosen": -112127680.0, + "logits/rejected": 40281508.0, + "logps/chosen": -333.7459309895833, + "logps/rejected": -896.422119140625, + "loss": 0.0509, + "rewards/chosen": 1.2184429168701172, + "rewards/margins": 22.33766746520996, + "rewards/rejected": -21.119224548339844, + "step": 690 + }, + { + "epoch": 0.12626770214709912, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 2.6526421860705474e-06, + "logits/chosen": -111764580.57142857, + "logits/rejected": -50496128.0, + "logps/chosen": -179.17991420200892, + "logps/rejected": -440.37470160590277, + "loss": 0.0291, + "rewards/chosen": 1.1412087849208288, + "rewards/margins": 14.498798506600517, + "rewards/rejected": -13.357589721679688, + "step": 691 + }, + { + "epoch": 0.12645043398812242, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.637246175654731e-06, + "logits/chosen": -121768533.33333333, + "logits/rejected": -75922752.0, + "logps/chosen": -258.54375542534723, + "logps/rejected": -359.66249302455356, + "loss": 0.0476, + "rewards/chosen": 2.257830089992947, + "rewards/margins": 10.007211564079164, + "rewards/rejected": -7.749381474086216, + "step": 692 + }, + { + "epoch": 0.12663316582914572, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 2.6218789546486235e-06, + "logits/chosen": -143036710.4, + "logits/rejected": -110972800.0, + "logps/chosen": -388.1966552734375, + "logps/rejected": -538.4753639914773, + "loss": 0.0191, + "rewards/chosen": 1.0626492500305176, + "rewards/margins": 13.985628431493586, + "rewards/rejected": -12.922979181463068, + "step": 693 + }, + { + "epoch": 0.12681589767016901, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.6065407102969664e-06, + "logits/chosen": -173686186.66666666, + "logits/rejected": -150013001.14285713, + "logps/chosen": -226.59027777777777, + "logps/rejected": -412.37374441964283, + "loss": 0.0307, + "rewards/chosen": 1.7476267284817166, + "rewards/margins": 17.309471690465536, + "rewards/rejected": -15.561844961983818, + "step": 694 + }, + { + "epoch": 0.1269986295111923, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 2.5912316294914232e-06, + "logits/chosen": -145580379.42857143, + "logits/rejected": -61055274.666666664, + "logps/chosen": -291.0072719029018, + "logps/rejected": -398.2375217013889, + "loss": 0.0509, + "rewards/chosen": 1.1757352011544364, + "rewards/margins": 10.620889179290288, + "rewards/rejected": -9.445153978135851, + "step": 695 + }, + { + "epoch": 0.1271813613522156, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 2.5759518987683154e-06, + "logits/chosen": -160519808.0, + "logits/rejected": -61511532.0, + "logps/chosen": -282.0469970703125, + "logps/rejected": -609.6806640625, + "loss": 0.0196, + "rewards/chosen": 2.0994021892547607, + "rewards/margins": 13.108491659164429, + "rewards/rejected": -11.009089469909668, + "step": 696 + }, + { + "epoch": 0.1273640931932389, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 2.560701704306336e-06, + "logits/chosen": -63082131.2, + "logits/rejected": -29119506.666666668, + "logps/chosen": -206.5444091796875, + "logps/rejected": -542.0686442057291, + "loss": 0.0471, + "rewards/chosen": 0.9205838203430176, + "rewards/margins": 15.584553496042886, + "rewards/rejected": -14.66396967569987, + "step": 697 + }, + { + "epoch": 0.1275468250342622, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 2.545481231924296e-06, + "logits/chosen": -127576109.71428572, + "logits/rejected": -110494748.44444445, + "logps/chosen": -200.69660295758928, + "logps/rejected": -390.78716362847223, + "loss": 0.0387, + "rewards/chosen": 0.44404803003583637, + "rewards/margins": 12.267497058898682, + "rewards/rejected": -11.823449028862846, + "step": 698 + }, + { + "epoch": 0.1277295568752855, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 2.5302906670788463e-06, + "logits/chosen": -46245801.14285714, + "logits/rejected": -47369116.44444445, + "logps/chosen": -193.06183733258928, + "logps/rejected": -487.0212131076389, + "loss": 0.022, + "rewards/chosen": 1.3325984137398856, + "rewards/margins": 14.716946117461674, + "rewards/rejected": -13.384347703721788, + "step": 699 + }, + { + "epoch": 0.12791228871630883, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 2.5151301948622235e-06, + "logits/chosen": -149335232.0, + "logits/rejected": -79732424.0, + "logps/chosen": -207.64093017578125, + "logps/rejected": -531.2234497070312, + "loss": 0.0261, + "rewards/chosen": 2.168161153793335, + "rewards/margins": 14.412302732467651, + "rewards/rejected": -12.244141578674316, + "step": 700 + }, + { + "epoch": 0.12809502055733213, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 2.5000000000000015e-06, + "logits/chosen": -308382873.6, + "logits/rejected": -107289181.0909091, + "logps/chosen": -284.17919921875, + "logps/rejected": -523.2857776988636, + "loss": 0.0218, + "rewards/chosen": 1.111329936981201, + "rewards/margins": 14.896771942485463, + "rewards/rejected": -13.785442005504262, + "step": 701 + }, + { + "epoch": 0.12827775239835543, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 2.484900266848825e-06, + "logits/chosen": -59907648.0, + "logits/rejected": -71914720.0, + "logps/chosen": -226.99942016601562, + "logps/rejected": -731.404296875, + "loss": 0.018, + "rewards/chosen": 2.611422061920166, + "rewards/margins": 20.74359941482544, + "rewards/rejected": -18.132177352905273, + "step": 702 + }, + { + "epoch": 0.12846048423937872, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 2.469831179394182e-06, + "logits/chosen": -141194121.14285713, + "logits/rejected": -65216334.222222224, + "logps/chosen": -235.06361607142858, + "logps/rejected": -483.12255859375, + "loss": 0.0186, + "rewards/chosen": 1.8074588775634766, + "rewards/margins": 14.698070314195421, + "rewards/rejected": -12.890611436631945, + "step": 703 + }, + { + "epoch": 0.12864321608040202, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 2.4547929212481436e-06, + "logits/chosen": -63937284.571428575, + "logits/rejected": -72135900.44444445, + "logps/chosen": -221.4688720703125, + "logps/rejected": -379.44048394097223, + "loss": 0.0337, + "rewards/chosen": 0.5862047331673759, + "rewards/margins": 11.224835070352706, + "rewards/rejected": -10.63863033718533, + "step": 704 + }, + { + "epoch": 0.12882594792142532, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 2.4397856756471435e-06, + "logits/chosen": -210765283.55555555, + "logits/rejected": -41844534.85714286, + "logps/chosen": -180.80246310763889, + "logps/rejected": -500.73036411830356, + "loss": 0.0342, + "rewards/chosen": 1.5093820359971788, + "rewards/margins": 15.504913390628875, + "rewards/rejected": -13.995531354631696, + "step": 705 + }, + { + "epoch": 0.12900867976244862, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.424809625449729e-06, + "logits/chosen": -87204832.0, + "logits/rejected": -115445328.0, + "logps/chosen": -295.8175048828125, + "logps/rejected": -465.39630126953125, + "loss": 0.0399, + "rewards/chosen": 1.835432529449463, + "rewards/margins": 15.063571453094482, + "rewards/rejected": -13.22813892364502, + "step": 706 + }, + { + "epoch": 0.12919141160347192, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 2.40986495313435e-06, + "logits/chosen": -77528441.6, + "logits/rejected": -38532056.0, + "logps/chosen": -314.627294921875, + "logps/rejected": -287.54661051432294, + "loss": 0.0248, + "rewards/chosen": 2.3703378677368163, + "rewards/margins": 13.86009667714437, + "rewards/rejected": -11.489758809407553, + "step": 707 + }, + { + "epoch": 0.12937414344449522, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 2.39495184079712e-06, + "logits/chosen": -192219562.66666666, + "logits/rejected": -116607405.71428572, + "logps/chosen": -194.22139485677084, + "logps/rejected": -519.2626255580357, + "loss": 0.0414, + "rewards/chosen": 0.6485424041748047, + "rewards/margins": 16.841967173985072, + "rewards/rejected": -16.193424769810267, + "step": 708 + }, + { + "epoch": 0.1295568752855185, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 2.380070470149605e-06, + "logits/chosen": -144154410.66666666, + "logits/rejected": -90592140.8, + "logps/chosen": -230.5208943684896, + "logps/rejected": -509.59560546875, + "loss": 0.018, + "rewards/chosen": 1.6695763270060222, + "rewards/margins": 13.054193941752116, + "rewards/rejected": -11.384617614746094, + "step": 709 + }, + { + "epoch": 0.1297396071265418, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 2.3652210225166122e-06, + "logits/chosen": -86591728.0, + "logits/rejected": -106363923.2, + "logps/chosen": -157.97127278645834, + "logps/rejected": -351.6996337890625, + "loss": 0.0311, + "rewards/chosen": 1.0248863697052002, + "rewards/margins": 12.186399888992309, + "rewards/rejected": -11.161513519287109, + "step": 710 + }, + { + "epoch": 0.1299223389675651, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 2.3504036788339763e-06, + "logits/chosen": -30401496.0, + "logits/rejected": -126361484.8, + "logps/chosen": -140.255615234375, + "logps/rejected": -436.664794921875, + "loss": 0.0255, + "rewards/chosen": 1.0534992218017578, + "rewards/margins": 12.363264083862305, + "rewards/rejected": -11.309764862060547, + "step": 711 + }, + { + "epoch": 0.1301050708085884, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 2.3356186196463497e-06, + "logits/chosen": -105403099.42857143, + "logits/rejected": -78198926.22222222, + "logps/chosen": -296.30514090401783, + "logps/rejected": -479.4059244791667, + "loss": 0.0242, + "rewards/chosen": 1.3993627003261022, + "rewards/margins": 13.415220820714557, + "rewards/rejected": -12.015858120388454, + "step": 712 + }, + { + "epoch": 0.1302878026496117, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.320866025105016e-06, + "logits/chosen": -52999798.4, + "logits/rejected": -86450338.9090909, + "logps/chosen": -154.404345703125, + "logps/rejected": -456.32768110795456, + "loss": 0.0201, + "rewards/chosen": 1.832107925415039, + "rewards/margins": 14.076611987027256, + "rewards/rejected": -12.244504061612217, + "step": 713 + }, + { + "epoch": 0.130470534490635, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 2.3061460749656844e-06, + "logits/chosen": -238156800.0, + "logits/rejected": -93812117.33333333, + "logps/chosen": -155.51374162946428, + "logps/rejected": -500.130859375, + "loss": 0.0088, + "rewards/chosen": 2.6641148158482144, + "rewards/margins": 14.196058243040056, + "rewards/rejected": -11.531943427191841, + "step": 714 + }, + { + "epoch": 0.1306532663316583, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 2.2914589485863015e-06, + "logits/chosen": -54319405.71428572, + "logits/rejected": -47631566.222222224, + "logps/chosen": -322.6678989955357, + "logps/rejected": -570.1627604166666, + "loss": 0.0212, + "rewards/chosen": 1.66611875806536, + "rewards/margins": 16.244813419523695, + "rewards/rejected": -14.578694661458334, + "step": 715 + }, + { + "epoch": 0.1308359981726816, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.2768048249248648e-06, + "logits/chosen": -63099219.692307696, + "logits/rejected": -139589248.0, + "logps/chosen": -199.7818321814904, + "logps/rejected": -661.921875, + "loss": 0.0603, + "rewards/chosen": 0.6951920435978816, + "rewards/margins": 13.007014299050356, + "rewards/rejected": -12.311822255452475, + "step": 716 + }, + { + "epoch": 0.1310187300137049, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 2.2621838825372496e-06, + "logits/chosen": -19126956.8, + "logits/rejected": -62876567.27272727, + "logps/chosen": -157.6501708984375, + "logps/rejected": -484.2067205255682, + "loss": 0.0293, + "rewards/chosen": 1.4766508102416993, + "rewards/margins": 15.470666590603916, + "rewards/rejected": -13.994015780362217, + "step": 717 + }, + { + "epoch": 0.1312014618547282, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.2475962995750224e-06, + "logits/chosen": -89187909.33333333, + "logits/rejected": -124116080.0, + "logps/chosen": -248.94868977864584, + "logps/rejected": -430.8358154296875, + "loss": 0.0597, + "rewards/chosen": 0.8866754372914633, + "rewards/margins": 11.757124503453573, + "rewards/rejected": -10.87044906616211, + "step": 718 + }, + { + "epoch": 0.1313841936957515, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 2.23304225378328e-06, + "logits/chosen": -52806693.333333336, + "logits/rejected": -67179609.6, + "logps/chosen": -240.08306884765625, + "logps/rejected": -505.114111328125, + "loss": 0.0187, + "rewards/chosen": 1.9079821904500325, + "rewards/margins": 14.386291058858236, + "rewards/rejected": -12.478308868408202, + "step": 719 + }, + { + "epoch": 0.1315669255367748, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.218521922498476e-06, + "logits/chosen": -186914384.0, + "logits/rejected": -74846048.0, + "logps/chosen": -339.9781188964844, + "logps/rejected": -531.666015625, + "loss": 0.0387, + "rewards/chosen": 1.0340685844421387, + "rewards/margins": 15.855690479278564, + "rewards/rejected": -14.821621894836426, + "step": 720 + }, + { + "epoch": 0.1317496573777981, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 2.204035482646267e-06, + "logits/chosen": -88298992.0, + "logits/rejected": -86740632.0, + "logps/chosen": -215.3624267578125, + "logps/rejected": -341.3442687988281, + "loss": 0.0127, + "rewards/chosen": 2.523061752319336, + "rewards/margins": 13.77957820892334, + "rewards/rejected": -11.256516456604004, + "step": 721 + }, + { + "epoch": 0.1319323892188214, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 2.1895831107393485e-06, + "logits/chosen": -76709824.0, + "logits/rejected": -49905512.0, + "logps/chosen": -216.62203979492188, + "logps/rejected": -354.4139404296875, + "loss": 0.0444, + "rewards/chosen": 1.0450005531311035, + "rewards/margins": 14.25702428817749, + "rewards/rejected": -13.212023735046387, + "step": 722 + }, + { + "epoch": 0.1321151210598447, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 2.175164982875311e-06, + "logits/chosen": -80579392.0, + "logits/rejected": -73201216.0, + "logps/chosen": -144.041748046875, + "logps/rejected": -515.4444173177084, + "loss": 0.0262, + "rewards/chosen": 2.332357978820801, + "rewards/margins": 16.88359826405843, + "rewards/rejected": -14.55124028523763, + "step": 723 + }, + { + "epoch": 0.13229785290086798, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 2.1607812747344955e-06, + "logits/chosen": -77812288.0, + "logits/rejected": -104375381.33333333, + "logps/chosen": -212.81119210379464, + "logps/rejected": -483.98687065972223, + "loss": 0.0393, + "rewards/chosen": 0.6726623262677874, + "rewards/margins": 14.344717040894523, + "rewards/rejected": -13.672054714626736, + "step": 724 + }, + { + "epoch": 0.13248058474189128, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 2.146432161577842e-06, + "logits/chosen": -65989448.0, + "logits/rejected": -84221768.0, + "logps/chosen": -181.05682373046875, + "logps/rejected": -443.2082824707031, + "loss": 0.026, + "rewards/chosen": 2.100694179534912, + "rewards/margins": 14.791536808013916, + "rewards/rejected": -12.690842628479004, + "step": 725 + }, + { + "epoch": 0.13266331658291458, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 2.132117818244771e-06, + "logits/chosen": -61974644.0, + "logits/rejected": -101496064.0, + "logps/chosen": -232.66119384765625, + "logps/rejected": -452.652587890625, + "loss": 0.0275, + "rewards/chosen": 1.470191478729248, + "rewards/margins": 14.155333995819092, + "rewards/rejected": -12.685142517089844, + "step": 726 + }, + { + "epoch": 0.13284604842393788, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 2.1178384191510344e-06, + "logits/chosen": -105787592.0, + "logits/rejected": -47244277.333333336, + "logps/chosen": -139.09332275390625, + "logps/rejected": -544.7545979817709, + "loss": 0.0122, + "rewards/chosen": 1.2920489311218262, + "rewards/margins": 17.80109103520711, + "rewards/rejected": -16.509042104085285, + "step": 727 + }, + { + "epoch": 0.13302878026496118, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 2.103594138286607e-06, + "logits/chosen": -96852302.22222222, + "logits/rejected": -73251328.0, + "logps/chosen": -223.70159233940973, + "logps/rejected": -667.8293805803571, + "loss": 0.0353, + "rewards/chosen": 1.141476313273112, + "rewards/margins": 13.757189160301571, + "rewards/rejected": -12.61571284702846, + "step": 728 + }, + { + "epoch": 0.13321151210598448, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 2.0893851492135536e-06, + "logits/chosen": -37620807.11111111, + "logits/rejected": -82435373.71428572, + "logps/chosen": -184.01951768663196, + "logps/rejected": -577.1834193638393, + "loss": 0.0289, + "rewards/chosen": 1.3228652742173936, + "rewards/margins": 15.828951351226323, + "rewards/rejected": -14.506086077008929, + "step": 729 + }, + { + "epoch": 0.13339424394700777, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 2.075211625063923e-06, + "logits/chosen": -75170336.0, + "logits/rejected": -100262666.66666667, + "logps/chosen": -236.32476806640625, + "logps/rejected": -402.4522298177083, + "loss": 0.0045, + "rewards/chosen": 2.9848594665527344, + "rewards/margins": 13.22769546508789, + "rewards/rejected": -10.242835998535156, + "step": 730 + }, + { + "epoch": 0.13357697578803107, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.061073738537635e-06, + "logits/chosen": -91672356.57142857, + "logits/rejected": -71217813.33333333, + "logps/chosen": -263.43875558035717, + "logps/rejected": -516.62451171875, + "loss": 0.0273, + "rewards/chosen": 1.3010046822684151, + "rewards/margins": 14.107943701365636, + "rewards/rejected": -12.806939019097221, + "step": 731 + }, + { + "epoch": 0.13375970762905437, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 2.046971661900373e-06, + "logits/chosen": -137771406.2222222, + "logits/rejected": -67879085.71428572, + "logps/chosen": -219.5914306640625, + "logps/rejected": -502.843994140625, + "loss": 0.0314, + "rewards/chosen": 1.4181072447035048, + "rewards/margins": 13.071052626957968, + "rewards/rejected": -11.652945382254464, + "step": 732 + }, + { + "epoch": 0.13394243947007767, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 2.0329055669814936e-06, + "logits/chosen": -72944709.81818181, + "logits/rejected": -48506540.8, + "logps/chosen": -203.39692826704547, + "logps/rejected": -377.89658203125, + "loss": 0.0248, + "rewards/chosen": 2.6547891443425957, + "rewards/margins": 12.299688131159002, + "rewards/rejected": -9.644898986816406, + "step": 733 + }, + { + "epoch": 0.13412517131110097, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 2.0188756251719204e-06, + "logits/chosen": -138655385.6, + "logits/rejected": -84258698.66666667, + "logps/chosen": -259.3381103515625, + "logps/rejected": -494.64697265625, + "loss": 0.0353, + "rewards/chosen": 1.254824161529541, + "rewards/margins": 13.394904931386312, + "rewards/rejected": -12.140080769856771, + "step": 734 + }, + { + "epoch": 0.13430790315212426, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 2.0048820074220716e-06, + "logits/chosen": -193895500.8, + "logits/rejected": -106755525.81818181, + "logps/chosen": -213.75556640625, + "logps/rejected": -376.60134055397725, + "loss": 0.0124, + "rewards/chosen": 1.8230194091796874, + "rewards/margins": 14.827631724964489, + "rewards/rejected": -13.0046123157848, + "step": 735 + }, + { + "epoch": 0.13449063499314756, + "grad_norm": 8.375, + "kl": 0.024005889892578125, + "learning_rate": 1.990924884239758e-06, + "logits/chosen": -67839349.33333333, + "logits/rejected": -154120486.4, + "logps/chosen": -249.4363810221354, + "logps/rejected": -524.33681640625, + "loss": 0.0287, + "rewards/chosen": 0.5367480119069418, + "rewards/margins": 12.752241786321004, + "rewards/rejected": -12.215493774414062, + "step": 736 + }, + { + "epoch": 0.13467336683417086, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 1.977004425688126e-06, + "logits/chosen": -100059129.6, + "logits/rejected": -111270762.66666667, + "logps/chosen": -345.2010986328125, + "logps/rejected": -540.3433430989584, + "loss": 0.0432, + "rewards/chosen": 0.8550333023071289, + "rewards/margins": 12.333307329813639, + "rewards/rejected": -11.47827402750651, + "step": 737 + }, + { + "epoch": 0.13485609867519416, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.9631208013835677e-06, + "logits/chosen": -89324313.6, + "logits/rejected": -91172526.54545455, + "logps/chosen": -247.6336181640625, + "logps/rejected": -427.8458806818182, + "loss": 0.0389, + "rewards/chosen": 0.7238203525543213, + "rewards/margins": 10.46044533035972, + "rewards/rejected": -9.736624977805398, + "step": 738 + }, + { + "epoch": 0.13503883051621746, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 1.9492741804936623e-06, + "logits/chosen": -145393197.7142857, + "logits/rejected": -110471879.1111111, + "logps/chosen": -229.39641462053572, + "logps/rejected": -461.8250325520833, + "loss": 0.0258, + "rewards/chosen": 2.33603640965053, + "rewards/margins": 12.363287759205653, + "rewards/rejected": -10.027251349555122, + "step": 739 + }, + { + "epoch": 0.13522156235724075, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.9354647317351187e-06, + "logits/chosen": -230968917.33333334, + "logits/rejected": -144042989.7142857, + "logps/chosen": -177.46685112847223, + "logps/rejected": -374.4133998325893, + "loss": 0.0488, + "rewards/chosen": 0.37000205781724715, + "rewards/margins": 12.119394975995261, + "rewards/rejected": -11.749392918178014, + "step": 740 + }, + { + "epoch": 0.13540429419826405, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 1.9216926233717087e-06, + "logits/chosen": -76662613.33333333, + "logits/rejected": -92352146.28571428, + "logps/chosen": -193.50276692708334, + "logps/rejected": -538.1104213169643, + "loss": 0.0348, + "rewards/chosen": 1.5303805669148762, + "rewards/margins": 14.073853946867445, + "rewards/rejected": -12.543473379952568, + "step": 741 + }, + { + "epoch": 0.13558702603928735, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.90795802321223e-06, + "logits/chosen": -65012608.0, + "logits/rejected": -78203112.0, + "logps/chosen": -163.23209635416666, + "logps/rejected": -344.78948974609375, + "loss": 0.0783, + "rewards/chosen": 0.3967420260111491, + "rewards/margins": 8.298219362894693, + "rewards/rejected": -7.901477336883545, + "step": 742 + }, + { + "epoch": 0.13576975788031065, + "grad_norm": 9.25, + "kl": 0.43659400939941406, + "learning_rate": 1.8942610986084487e-06, + "logits/chosen": -83448917.33333333, + "logits/rejected": -166619172.57142857, + "logps/chosen": -302.5954861111111, + "logps/rejected": -438.8185337611607, + "loss": 0.0367, + "rewards/chosen": 1.1077468660142686, + "rewards/margins": 14.115367541237482, + "rewards/rejected": -13.007620675223214, + "step": 743 + }, + { + "epoch": 0.13595248972133395, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 1.8806020164530702e-06, + "logits/chosen": -97960409.6, + "logits/rejected": -221891136.0, + "logps/chosen": -230.348779296875, + "logps/rejected": -456.3099772135417, + "loss": 0.0604, + "rewards/chosen": 0.19941787719726561, + "rewards/margins": 12.869689432779948, + "rewards/rejected": -12.670271555582682, + "step": 744 + }, + { + "epoch": 0.13613522156235724, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.8669809431776991e-06, + "logits/chosen": -93905546.66666667, + "logits/rejected": -110722521.6, + "logps/chosen": -151.6180623372396, + "logps/rejected": -484.83310546875, + "loss": 0.0141, + "rewards/chosen": 1.7859487533569336, + "rewards/margins": 15.408989524841308, + "rewards/rejected": -13.623040771484375, + "step": 745 + }, + { + "epoch": 0.13631795340338054, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.8533980447508138e-06, + "logits/chosen": -101890700.8, + "logits/rejected": -90839296.0, + "logps/chosen": -201.90185546875, + "logps/rejected": -478.9419759114583, + "loss": 0.0385, + "rewards/chosen": 1.1921525955200196, + "rewards/margins": 13.631951586405435, + "rewards/rejected": -12.439798990885416, + "step": 746 + }, + { + "epoch": 0.13650068524440384, + "grad_norm": 10.0625, + "kl": 0.023517608642578125, + "learning_rate": 1.8398534866757455e-06, + "logits/chosen": -199524328.72727272, + "logits/rejected": -1048374.4, + "logps/chosen": -247.3264825994318, + "logps/rejected": -500.780126953125, + "loss": 0.0339, + "rewards/chosen": 1.6612146550958806, + "rewards/margins": 11.452491968328303, + "rewards/rejected": -9.791277313232422, + "step": 747 + }, + { + "epoch": 0.13668341708542714, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 1.8263474339886628e-06, + "logits/chosen": -105416872.0, + "logits/rejected": -106846632.0, + "logps/chosen": -236.9156494140625, + "logps/rejected": -412.7711486816406, + "loss": 0.0382, + "rewards/chosen": 1.9055794477462769, + "rewards/margins": 15.087803721427917, + "rewards/rejected": -13.18222427368164, + "step": 748 + }, + { + "epoch": 0.13686614892645044, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 1.8128800512565514e-06, + "logits/chosen": -89462912.0, + "logits/rejected": -97201815.27272727, + "logps/chosen": -176.6627197265625, + "logps/rejected": -558.5279208096591, + "loss": 0.0127, + "rewards/chosen": 2.3023908615112303, + "rewards/margins": 16.09236056587913, + "rewards/rejected": -13.789969704367898, + "step": 749 + }, + { + "epoch": 0.13704888076747374, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 1.799451502575222e-06, + "logits/chosen": -87323276.8, + "logits/rejected": -91001514.66666667, + "logps/chosen": -218.6360107421875, + "logps/rejected": -411.7042643229167, + "loss": 0.0613, + "rewards/chosen": 0.44103550910949707, + "rewards/margins": 12.594947099685669, + "rewards/rejected": -12.153911590576172, + "step": 750 + }, + { + "epoch": 0.13723161260849703, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 1.7860619515673034e-06, + "logits/chosen": -89844458.66666667, + "logits/rejected": -62018569.14285714, + "logps/chosen": -164.76825629340277, + "logps/rejected": -308.4173060825893, + "loss": 0.0435, + "rewards/chosen": 1.4071918063693576, + "rewards/margins": 9.884856208922372, + "rewards/rejected": -8.477664402553014, + "step": 751 + }, + { + "epoch": 0.13741434444952033, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 1.7727115613802465e-06, + "logits/chosen": -127626824.0, + "logits/rejected": -156298480.0, + "logps/chosen": -284.375244140625, + "logps/rejected": -404.6254577636719, + "loss": 0.0331, + "rewards/chosen": 0.9153963327407837, + "rewards/margins": 12.373408913612366, + "rewards/rejected": -11.458012580871582, + "step": 752 + }, + { + "epoch": 0.13759707629054363, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 1.7594004946843458e-06, + "logits/chosen": -141884714.66666666, + "logits/rejected": -76354956.8, + "logps/chosen": -194.7940877278646, + "logps/rejected": -506.615234375, + "loss": 0.0083, + "rewards/chosen": 2.642937660217285, + "rewards/margins": 14.223613166809082, + "rewards/rejected": -11.580675506591797, + "step": 753 + }, + { + "epoch": 0.13777980813156693, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 1.746128913670746e-06, + "logits/chosen": -122146496.0, + "logits/rejected": -139972512.0, + "logps/chosen": -245.34261067708334, + "logps/rejected": -591.1385498046875, + "loss": 0.035, + "rewards/chosen": 1.8640995025634766, + "rewards/margins": 14.259696006774902, + "rewards/rejected": -12.395596504211426, + "step": 754 + }, + { + "epoch": 0.13796253997259023, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 1.7328969800494727e-06, + "logits/chosen": -166610889.14285713, + "logits/rejected": -112795776.0, + "logps/chosen": -420.05772181919644, + "logps/rejected": -540.0336371527778, + "loss": 0.0281, + "rewards/chosen": 1.0982988221304757, + "rewards/margins": 17.36070930390131, + "rewards/rejected": -16.262410481770832, + "step": 755 + }, + { + "epoch": 0.13814527181361352, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.7197048550474643e-06, + "logits/chosen": -201822555.42857143, + "logits/rejected": -89381688.8888889, + "logps/chosen": -254.00666155133928, + "logps/rejected": -435.74736870659723, + "loss": 0.021, + "rewards/chosen": 1.5618085861206055, + "rewards/margins": 13.204176478915745, + "rewards/rejected": -11.64236789279514, + "step": 756 + }, + { + "epoch": 0.13832800365463682, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 1.7065526994065973e-06, + "logits/chosen": -95182329.6, + "logits/rejected": -59935392.0, + "logps/chosen": -219.6243896484375, + "logps/rejected": -642.33837890625, + "loss": 0.0364, + "rewards/chosen": 1.3446226119995117, + "rewards/margins": 10.9933074315389, + "rewards/rejected": -9.648684819539389, + "step": 757 + }, + { + "epoch": 0.13851073549566012, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.6934406733817417e-06, + "logits/chosen": -116782299.42857143, + "logits/rejected": -120742300.44444445, + "logps/chosen": -218.31511579241072, + "logps/rejected": -450.3257649739583, + "loss": 0.0133, + "rewards/chosen": 2.3191144125802174, + "rewards/margins": 12.13001563057067, + "rewards/rejected": -9.810901217990452, + "step": 758 + }, + { + "epoch": 0.13869346733668342, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 1.680368936738792e-06, + "logits/chosen": -160044240.0, + "logits/rejected": -113134768.0, + "logps/chosen": -180.87823486328125, + "logps/rejected": -576.189453125, + "loss": 0.025, + "rewards/chosen": 1.2501355409622192, + "rewards/margins": 12.947031140327454, + "rewards/rejected": -11.696895599365234, + "step": 759 + }, + { + "epoch": 0.13887619917770672, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 1.6673376487527382e-06, + "logits/chosen": -179514865.7777778, + "logits/rejected": -148558217.14285713, + "logps/chosen": -234.59898546006946, + "logps/rejected": -465.9669712611607, + "loss": 0.0317, + "rewards/chosen": 1.6772859361436632, + "rewards/margins": 13.063294062538752, + "rewards/rejected": -11.386008126395089, + "step": 760 + }, + { + "epoch": 0.13905893101873001, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 1.6543469682057105e-06, + "logits/chosen": -52532246.85714286, + "logits/rejected": -76149297.77777778, + "logps/chosen": -158.64829799107142, + "logps/rejected": -381.0262044270833, + "loss": 0.0196, + "rewards/chosen": 2.483391898018973, + "rewards/margins": 14.051499502999441, + "rewards/rejected": -11.568107604980469, + "step": 761 + }, + { + "epoch": 0.1392416628597533, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.6413970533850498e-06, + "logits/chosen": -239013248.0, + "logits/rejected": -8126976.8, + "logps/chosen": -323.81785074869794, + "logps/rejected": -547.473291015625, + "loss": 0.0286, + "rewards/chosen": 0.9286028544108073, + "rewards/margins": 13.906935373942057, + "rewards/rejected": -12.97833251953125, + "step": 762 + }, + { + "epoch": 0.1394243947007766, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 1.6284880620813847e-06, + "logits/chosen": -139124608.0, + "logits/rejected": -119946737.77777778, + "logps/chosen": -210.89144461495536, + "logps/rejected": -603.64306640625, + "loss": 0.0144, + "rewards/chosen": 2.304782049996512, + "rewards/margins": 14.012384656875852, + "rewards/rejected": -11.707602606879341, + "step": 763 + }, + { + "epoch": 0.1396071265417999, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 1.6156201515866971e-06, + "logits/chosen": -109943142.4, + "logits/rejected": -93533386.66666667, + "logps/chosen": -215.7317138671875, + "logps/rejected": -526.1923828125, + "loss": 0.0307, + "rewards/chosen": 1.5676753997802735, + "rewards/margins": 13.398327000935874, + "rewards/rejected": -11.8306516011556, + "step": 764 + }, + { + "epoch": 0.1397898583828232, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 1.6027934786924187e-06, + "logits/chosen": -102733542.4, + "logits/rejected": -129587797.33333333, + "logps/chosen": -177.17950439453125, + "logps/rejected": -399.3946533203125, + "loss": 0.0417, + "rewards/chosen": 1.2367450714111328, + "rewards/margins": 13.657995732625325, + "rewards/rejected": -12.421250661214193, + "step": 765 + }, + { + "epoch": 0.1399725902238465, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.5900081996875083e-06, + "logits/chosen": -121172664.8888889, + "logits/rejected": -84888000.0, + "logps/chosen": -220.03327094184027, + "logps/rejected": -274.88344029017856, + "loss": 0.0322, + "rewards/chosen": 1.3560880025227864, + "rewards/margins": 10.907956986200242, + "rewards/rejected": -9.551868983677455, + "step": 766 + }, + { + "epoch": 0.1401553220648698, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 1.5772644703565564e-06, + "logits/chosen": -69780832.0, + "logits/rejected": -95577056.0, + "logps/chosen": -114.56210327148438, + "logps/rejected": -529.074462890625, + "loss": 0.0294, + "rewards/chosen": 1.1605968475341797, + "rewards/margins": 13.567533493041992, + "rewards/rejected": -12.406936645507812, + "step": 767 + }, + { + "epoch": 0.1403380539058931, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 1.5645624459778858e-06, + "logits/chosen": -108812629.33333333, + "logits/rejected": -104968524.8, + "logps/chosen": -202.348388671875, + "logps/rejected": -452.045947265625, + "loss": 0.0203, + "rewards/chosen": 1.6190279324849446, + "rewards/margins": 14.502086098988851, + "rewards/rejected": -12.883058166503906, + "step": 768 + }, + { + "epoch": 0.1405207857469164, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 1.551902281321651e-06, + "logits/chosen": -143984173.7142857, + "logits/rejected": -119489920.0, + "logps/chosen": -233.89997209821428, + "logps/rejected": -312.5600857204861, + "loss": 0.0333, + "rewards/chosen": 1.608870233808245, + "rewards/margins": 11.513402772328211, + "rewards/rejected": -9.904532538519966, + "step": 769 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.5392841306479667e-06, + "logits/chosen": -98130922.66666667, + "logits/rejected": -81619108.57142857, + "logps/chosen": -194.67522515190973, + "logps/rejected": -490.8998325892857, + "loss": 0.0369, + "rewards/chosen": 1.1283063888549805, + "rewards/margins": 12.20434638432094, + "rewards/rejected": -11.07603999546596, + "step": 770 + }, + { + "epoch": 0.140886249428963, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 1.5267081477050132e-06, + "logits/chosen": -55392832.0, + "logits/rejected": -123566533.81818181, + "logps/chosen": -356.146875, + "logps/rejected": -337.5880015980114, + "loss": 0.013, + "rewards/chosen": 1.637253189086914, + "rewards/margins": 13.972637211192739, + "rewards/rejected": -12.335384022105824, + "step": 771 + }, + { + "epoch": 0.1410689812699863, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.514174485727178e-06, + "logits/chosen": -165349024.0, + "logits/rejected": -53703104.0, + "logps/chosen": -133.49838256835938, + "logps/rejected": -378.49383544921875, + "loss": 0.0179, + "rewards/chosen": 2.4805424213409424, + "rewards/margins": 12.87861180305481, + "rewards/rejected": -10.398069381713867, + "step": 772 + }, + { + "epoch": 0.1412517131110096, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 1.5016832974331725e-06, + "logits/chosen": -79294560.0, + "logits/rejected": -95467864.0, + "logps/chosen": -365.02862548828125, + "logps/rejected": -482.8752136230469, + "loss": 0.0322, + "rewards/chosen": 1.0463073253631592, + "rewards/margins": 11.512975454330444, + "rewards/rejected": -10.466668128967285, + "step": 773 + }, + { + "epoch": 0.1414344449520329, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 1.489234735024188e-06, + "logits/chosen": -72291048.0, + "logits/rejected": -104070736.0, + "logps/chosen": -259.31280517578125, + "logps/rejected": -669.2650146484375, + "loss": 0.0295, + "rewards/chosen": 1.5372154712677002, + "rewards/margins": 15.643590211868286, + "rewards/rejected": -14.106374740600586, + "step": 774 + }, + { + "epoch": 0.1416171767930562, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.4768289501820265e-06, + "logits/chosen": -109833557.33333333, + "logits/rejected": -92274464.0, + "logps/chosen": -185.20292154947916, + "logps/rejected": -486.580859375, + "loss": 0.0288, + "rewards/chosen": 1.8268548647562664, + "rewards/margins": 14.217080847422281, + "rewards/rejected": -12.390225982666015, + "step": 775 + }, + { + "epoch": 0.14179990863407949, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 1.4644660940672628e-06, + "logits/chosen": -87327802.18181819, + "logits/rejected": -54134489.6, + "logps/chosen": -219.31538529829547, + "logps/rejected": -392.587548828125, + "loss": 0.0445, + "rewards/chosen": 1.1433317011052913, + "rewards/margins": 14.218032628839666, + "rewards/rejected": -13.074700927734375, + "step": 776 + }, + { + "epoch": 0.14198264047510278, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 1.4521463173173966e-06, + "logits/chosen": -79220160.0, + "logits/rejected": -127053772.8, + "logps/chosen": -294.9560953776042, + "logps/rejected": -477.655859375, + "loss": 0.0183, + "rewards/chosen": 1.2233360608418782, + "rewards/margins": 13.16456225713094, + "rewards/rejected": -11.941226196289062, + "step": 777 + }, + { + "epoch": 0.14216537231612608, + "grad_norm": 1.8359375, + "kl": 0.0, + "learning_rate": 1.4398697700450181e-06, + "logits/chosen": -171126752.0, + "logits/rejected": -80911686.4, + "logps/chosen": -176.03519694010416, + "logps/rejected": -457.675244140625, + "loss": 0.0138, + "rewards/chosen": 2.790683110555013, + "rewards/margins": 14.969989903767905, + "rewards/rejected": -12.179306793212891, + "step": 778 + }, + { + "epoch": 0.14234810415714938, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 1.4276366018359845e-06, + "logits/chosen": -95195050.66666667, + "logits/rejected": -143620059.42857143, + "logps/chosen": -214.13953993055554, + "logps/rejected": -521.2821916852679, + "loss": 0.0322, + "rewards/chosen": 1.371847152709961, + "rewards/margins": 15.057422365461077, + "rewards/rejected": -13.685575212751116, + "step": 779 + }, + { + "epoch": 0.14253083599817268, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 1.4154469617475864e-06, + "logits/chosen": -90400145.45454545, + "logits/rejected": -68665574.4, + "logps/chosen": -215.2927911931818, + "logps/rejected": -344.03505859375, + "loss": 0.055, + "rewards/chosen": 0.5841385234485973, + "rewards/margins": 12.087956272472034, + "rewards/rejected": -11.503817749023437, + "step": 780 + }, + { + "epoch": 0.14271356783919598, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 1.4033009983067454e-06, + "logits/chosen": -110913208.8888889, + "logits/rejected": -76233142.85714285, + "logps/chosen": -239.34559461805554, + "logps/rejected": -349.38504464285717, + "loss": 0.0276, + "rewards/chosen": 2.066023932562934, + "rewards/margins": 12.609058077373202, + "rewards/rejected": -10.543034144810267, + "step": 781 + }, + { + "epoch": 0.14289629968021927, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.3911988595081894e-06, + "logits/chosen": -98154960.0, + "logits/rejected": -69887240.0, + "logps/chosen": -182.51564025878906, + "logps/rejected": -524.1570434570312, + "loss": 0.0283, + "rewards/chosen": 2.6560564041137695, + "rewards/margins": 17.19243621826172, + "rewards/rejected": -14.53637981414795, + "step": 782 + }, + { + "epoch": 0.14307903152124257, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 1.3791406928126638e-06, + "logits/chosen": -130801612.8, + "logits/rejected": -106849792.0, + "logps/chosen": -301.9564208984375, + "logps/rejected": -465.1984049479167, + "loss": 0.0487, + "rewards/chosen": 0.5914848327636719, + "rewards/margins": 13.244540659586589, + "rewards/rejected": -12.653055826822916, + "step": 783 + }, + { + "epoch": 0.14326176336226587, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 1.3671266451451209e-06, + "logits/chosen": -65657425.45454545, + "logits/rejected": -70468620.8, + "logps/chosen": -190.2835360440341, + "logps/rejected": -343.6380859375, + "loss": 0.0571, + "rewards/chosen": 0.8761677308516069, + "rewards/margins": 9.266609625382857, + "rewards/rejected": -8.39044189453125, + "step": 784 + }, + { + "epoch": 0.14344449520328917, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 1.3551568628929434e-06, + "logits/chosen": -182215594.66666666, + "logits/rejected": -151777709.7142857, + "logps/chosen": -223.51873101128473, + "logps/rejected": -465.4600306919643, + "loss": 0.0227, + "rewards/chosen": 1.6973128848605685, + "rewards/margins": 13.06106279766749, + "rewards/rejected": -11.36374991280692, + "step": 785 + }, + { + "epoch": 0.14362722704431247, + "grad_norm": 8.8125, + "kl": 0.12612342834472656, + "learning_rate": 1.3432314919041478e-06, + "logits/chosen": -85898377.14285715, + "logits/rejected": -116153877.33333333, + "logps/chosen": -326.647216796875, + "logps/rejected": -384.42949761284723, + "loss": 0.0243, + "rewards/chosen": 1.9883693967546736, + "rewards/margins": 14.15502334776379, + "rewards/rejected": -12.166653951009115, + "step": 786 + }, + { + "epoch": 0.14380995888533576, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 1.3313506774856177e-06, + "logits/chosen": -124742704.0, + "logits/rejected": -95901226.66666667, + "logps/chosen": -248.44992065429688, + "logps/rejected": -466.2888590494792, + "loss": 0.0061, + "rewards/chosen": 2.7783493995666504, + "rewards/margins": 12.833232084910074, + "rewards/rejected": -10.054882685343424, + "step": 787 + }, + { + "epoch": 0.14399269072635906, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 1.3195145644013286e-06, + "logits/chosen": -132582880.0, + "logits/rejected": -97449550.76923077, + "logps/chosen": -375.740966796875, + "logps/rejected": -466.03891225961536, + "loss": 0.0121, + "rewards/chosen": 0.9092824459075928, + "rewards/margins": 13.478970509309034, + "rewards/rejected": -12.569688063401442, + "step": 788 + }, + { + "epoch": 0.14417542256738236, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.3077232968705805e-06, + "logits/chosen": -102956939.63636364, + "logits/rejected": -98828652.8, + "logps/chosen": -245.38975941051137, + "logps/rejected": -806.301318359375, + "loss": 0.0466, + "rewards/chosen": 0.9318028363314542, + "rewards/margins": 14.601257792386143, + "rewards/rejected": -13.669454956054688, + "step": 789 + }, + { + "epoch": 0.14435815440840566, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 1.2959770185662502e-06, + "logits/chosen": -74689664.0, + "logits/rejected": -163918064.0, + "logps/chosen": -392.11639404296875, + "logps/rejected": -311.1434326171875, + "loss": 0.0717, + "rewards/chosen": 0.8956207036972046, + "rewards/margins": 10.372503638267517, + "rewards/rejected": -9.476882934570312, + "step": 790 + }, + { + "epoch": 0.14454088624942896, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.2842758726130283e-06, + "logits/chosen": -88224018.28571428, + "logits/rejected": -141228487.1111111, + "logps/chosen": -171.75057547433036, + "logps/rejected": -444.5600314670139, + "loss": 0.0225, + "rewards/chosen": 1.6171870912824358, + "rewards/margins": 12.669105968778094, + "rewards/rejected": -11.051918877495659, + "step": 791 + }, + { + "epoch": 0.14472361809045226, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.2726200015856893e-06, + "logits/chosen": -137683225.6, + "logits/rejected": -184939584.0, + "logps/chosen": -256.99697265625, + "logps/rejected": -559.069091796875, + "loss": 0.0483, + "rewards/chosen": 0.8465682983398437, + "rewards/margins": 13.955915323893228, + "rewards/rejected": -13.109347025553385, + "step": 792 + }, + { + "epoch": 0.14490634993147555, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 1.2610095475073415e-06, + "logits/chosen": -5419283.2, + "logits/rejected": -167404788.36363637, + "logps/chosen": -416.770068359375, + "logps/rejected": -464.7815607244318, + "loss": 0.0263, + "rewards/chosen": 0.5220520496368408, + "rewards/margins": 10.937279072674839, + "rewards/rejected": -10.415227023037998, + "step": 793 + }, + { + "epoch": 0.14508908177249885, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 1.2494446518477022e-06, + "logits/chosen": -140616192.0, + "logits/rejected": -123320972.8, + "logps/chosen": -276.7524820963542, + "logps/rejected": -533.971728515625, + "loss": 0.0195, + "rewards/chosen": 1.7421471277872722, + "rewards/margins": 13.949621645609538, + "rewards/rejected": -12.207474517822266, + "step": 794 + }, + { + "epoch": 0.14527181361352215, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 1.2379254555213788e-06, + "logits/chosen": -156039826.2857143, + "logits/rejected": -159637475.55555555, + "logps/chosen": -255.73758370535714, + "logps/rejected": -516.7318250868055, + "loss": 0.0279, + "rewards/chosen": 1.1801880427769251, + "rewards/margins": 13.352282599797324, + "rewards/rejected": -12.172094557020399, + "step": 795 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 1.22645209888614e-06, + "logits/chosen": -174029056.0, + "logits/rejected": -33556323.55555555, + "logps/chosen": -239.05976213727678, + "logps/rejected": -564.5182834201389, + "loss": 0.0221, + "rewards/chosen": 2.215492384774344, + "rewards/margins": 17.88052297016931, + "rewards/rejected": -15.665030585394966, + "step": 796 + }, + { + "epoch": 0.14563727729556875, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.2150247217412186e-06, + "logits/chosen": -71841797.33333333, + "logits/rejected": -53384416.0, + "logps/chosen": -137.2267049153646, + "logps/rejected": -420.031494140625, + "loss": 0.0213, + "rewards/chosen": 1.4134829839070637, + "rewards/margins": 15.23009827931722, + "rewards/rejected": -13.816615295410156, + "step": 797 + }, + { + "epoch": 0.14582000913659204, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 1.203643463325596e-06, + "logits/chosen": -129929640.0, + "logits/rejected": -51137936.0, + "logps/chosen": -225.40560913085938, + "logps/rejected": -666.5877685546875, + "loss": 0.0295, + "rewards/chosen": 1.7505916357040405, + "rewards/margins": 15.43504273891449, + "rewards/rejected": -13.68445110321045, + "step": 798 + }, + { + "epoch": 0.14600274097761534, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 1.1923084623163172e-06, + "logits/chosen": -88165482.66666667, + "logits/rejected": -87261011.2, + "logps/chosen": -352.3704833984375, + "logps/rejected": -625.168017578125, + "loss": 0.0225, + "rewards/chosen": 1.2081899642944336, + "rewards/margins": 15.83867244720459, + "rewards/rejected": -14.630482482910157, + "step": 799 + }, + { + "epoch": 0.14618547281863864, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 1.1810198568267906e-06, + "logits/chosen": -123590376.72727273, + "logits/rejected": -42821376.0, + "logps/chosen": -198.80213512073863, + "logps/rejected": -542.34638671875, + "loss": 0.0474, + "rewards/chosen": 0.9177025014703925, + "rewards/margins": 13.726711290532892, + "rewards/rejected": -12.8090087890625, + "step": 800 + }, + { + "epoch": 0.14636820465966194, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 1.1697777844051105e-06, + "logits/chosen": -91013600.0, + "logits/rejected": -128225056.0, + "logps/chosen": -235.5248046875, + "logps/rejected": -502.8588460286458, + "loss": 0.0379, + "rewards/chosen": 1.1972951889038086, + "rewards/margins": 10.268176078796387, + "rewards/rejected": -9.070880889892578, + "step": 801 + }, + { + "epoch": 0.14655093650068524, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 1.1585823820323845e-06, + "logits/chosen": -94999928.8888889, + "logits/rejected": -121079643.42857143, + "logps/chosen": -226.75246853298611, + "logps/rejected": -534.0290178571429, + "loss": 0.0188, + "rewards/chosen": 2.387177997165256, + "rewards/margins": 18.705009854029097, + "rewards/rejected": -16.31783185686384, + "step": 802 + }, + { + "epoch": 0.14673366834170853, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 1.1474337861210543e-06, + "logits/chosen": -99172793.6, + "logits/rejected": -100530144.0, + "logps/chosen": -235.3405517578125, + "logps/rejected": -295.41748046875, + "loss": 0.0445, + "rewards/chosen": 0.9917353630065918, + "rewards/margins": 9.62179937362671, + "rewards/rejected": -8.630064010620117, + "step": 803 + }, + { + "epoch": 0.14691640018273183, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 1.136332132513245e-06, + "logits/chosen": -76980352.0, + "logits/rejected": -84976134.4, + "logps/chosen": -238.20328776041666, + "logps/rejected": -419.010302734375, + "loss": 0.0189, + "rewards/chosen": 1.5825592676798503, + "rewards/margins": 11.061737124125163, + "rewards/rejected": -9.479177856445313, + "step": 804 + }, + { + "epoch": 0.14709913202375513, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 1.1252775564791023e-06, + "logits/chosen": -214505819.42857143, + "logits/rejected": -92983964.44444445, + "logps/chosen": -186.37053571428572, + "logps/rejected": -571.9895290798611, + "loss": 0.0206, + "rewards/chosen": 1.455951145717076, + "rewards/margins": 14.941533285474021, + "rewards/rejected": -13.485582139756945, + "step": 805 + }, + { + "epoch": 0.14728186386477843, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.1142701927151456e-06, + "logits/chosen": -165048832.0, + "logits/rejected": -119619640.8888889, + "logps/chosen": -181.94447544642858, + "logps/rejected": -403.41729058159723, + "loss": 0.0197, + "rewards/chosen": 1.6717729568481445, + "rewards/margins": 12.27253606584337, + "rewards/rejected": -10.600763108995226, + "step": 806 + }, + { + "epoch": 0.14746459570580173, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 1.1033101753426285e-06, + "logits/chosen": -89255562.66666667, + "logits/rejected": -107483896.0, + "logps/chosen": -232.9383748372396, + "logps/rejected": -273.8987731933594, + "loss": 0.0367, + "rewards/chosen": 1.5431561470031738, + "rewards/margins": 12.946342945098877, + "rewards/rejected": -11.403186798095703, + "step": 807 + }, + { + "epoch": 0.14764732754682502, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.0923976379059059e-06, + "logits/chosen": -129997027.55555555, + "logits/rejected": -114185225.14285715, + "logps/chosen": -280.4490017361111, + "logps/rejected": -417.7265625, + "loss": 0.0195, + "rewards/chosen": 2.3202300601535373, + "rewards/margins": 15.122630921621171, + "rewards/rejected": -12.802400861467634, + "step": 808 + }, + { + "epoch": 0.14783005938784832, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.0815327133708015e-06, + "logits/chosen": -167170688.0, + "logits/rejected": -177276684.8, + "logps/chosen": -258.02021928267044, + "logps/rejected": -412.63984375, + "loss": 0.0478, + "rewards/chosen": 0.7572222622958097, + "rewards/margins": 14.53688534823331, + "rewards/rejected": -13.7796630859375, + "step": 809 + }, + { + "epoch": 0.14801279122887162, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.0707155341229902e-06, + "logits/chosen": -102290483.2, + "logits/rejected": -154365333.33333334, + "logps/chosen": -274.014697265625, + "logps/rejected": -450.0650227864583, + "loss": 0.0405, + "rewards/chosen": 1.1363767623901366, + "rewards/margins": 14.385199292500815, + "rewards/rejected": -13.248822530110678, + "step": 810 + }, + { + "epoch": 0.14819552306989492, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 1.0599462319663906e-06, + "logits/chosen": -53917365.333333336, + "logits/rejected": -125703104.0, + "logps/chosen": -308.6781819661458, + "logps/rejected": -491.23056640625, + "loss": 0.0116, + "rewards/chosen": 2.6247968673706055, + "rewards/margins": 13.590602684020997, + "rewards/rejected": -10.965805816650391, + "step": 811 + }, + { + "epoch": 0.14837825491091822, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.049224938121548e-06, + "logits/chosen": -189138316.8, + "logits/rejected": -83281034.66666667, + "logps/chosen": -207.5142822265625, + "logps/rejected": -666.2886149088541, + "loss": 0.0376, + "rewards/chosen": 1.274832534790039, + "rewards/margins": 20.674900182088216, + "rewards/rejected": -19.400067647298176, + "step": 812 + }, + { + "epoch": 0.14856098675194152, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 1.0385517832240472e-06, + "logits/chosen": -267747600.0, + "logits/rejected": -79497082.66666667, + "logps/chosen": -206.56008911132812, + "logps/rejected": -402.8004964192708, + "loss": 0.0158, + "rewards/chosen": 0.9949302673339844, + "rewards/margins": 10.961262385050455, + "rewards/rejected": -9.96633211771647, + "step": 813 + }, + { + "epoch": 0.1487437185929648, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.0279268973229089e-06, + "logits/chosen": -138404582.4, + "logits/rejected": -143272010.66666666, + "logps/chosen": -157.769775390625, + "logps/rejected": -367.1588541666667, + "loss": 0.0594, + "rewards/chosen": 0.3307535171508789, + "rewards/margins": 9.039770952860513, + "rewards/rejected": -8.709017435709635, + "step": 814 + }, + { + "epoch": 0.1489264504339881, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 1.0173504098790188e-06, + "logits/chosen": -106106843.42857143, + "logits/rejected": -133124181.33333333, + "logps/chosen": -260.16817801339283, + "logps/rejected": -349.96009657118054, + "loss": 0.0169, + "rewards/chosen": 2.984673636300223, + "rewards/margins": 14.998276846749441, + "rewards/rejected": -12.013603210449219, + "step": 815 + }, + { + "epoch": 0.1491091822750114, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 1.006822449763537e-06, + "logits/chosen": -128585184.0, + "logits/rejected": -88729808.0, + "logps/chosen": -239.14657592773438, + "logps/rejected": -353.29632568359375, + "loss": 0.0487, + "rewards/chosen": 0.07526593655347824, + "rewards/margins": 11.007058195769787, + "rewards/rejected": -10.931792259216309, + "step": 816 + }, + { + "epoch": 0.1492919141160347, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.963431452563331e-07, + "logits/chosen": -148542336.0, + "logits/rejected": -98936576.0, + "logps/chosen": -315.65283203125, + "logps/rejected": -367.85711669921875, + "loss": 0.0325, + "rewards/chosen": 0.9136552810668945, + "rewards/margins": 12.276965141296387, + "rewards/rejected": -11.363309860229492, + "step": 817 + }, + { + "epoch": 0.149474645957058, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.859126240444284e-07, + "logits/chosen": -73890944.0, + "logits/rejected": -94938728.0, + "logps/chosen": -179.9475555419922, + "logps/rejected": -360.9874267578125, + "loss": 0.0275, + "rewards/chosen": 1.6206978559494019, + "rewards/margins": 11.905928492546082, + "rewards/rejected": -10.28523063659668, + "step": 818 + }, + { + "epoch": 0.1496573777980813, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.7553101322043e-07, + "logits/chosen": -91322453.33333333, + "logits/rejected": -126481171.6923077, + "logps/chosen": -155.93172200520834, + "logps/rejected": -475.47228064903845, + "loss": 0.0154, + "rewards/chosen": 0.6350951989491781, + "rewards/margins": 11.35155051182478, + "rewards/rejected": -10.7164553128756, + "step": 819 + }, + { + "epoch": 0.1498401096391046, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.651984392809916e-07, + "logits/chosen": -134387520.0, + "logits/rejected": -69134389.33333333, + "logps/chosen": -147.0417938232422, + "logps/rejected": -481.6470540364583, + "loss": 0.013, + "rewards/chosen": 2.5950047969818115, + "rewards/margins": 13.584112564722696, + "rewards/rejected": -10.989107767740885, + "step": 820 + }, + { + "epoch": 0.1500228414801279, + "grad_norm": 9.375, + "kl": 0.5568618774414062, + "learning_rate": 9.549150281252633e-07, + "logits/chosen": -222890572.8, + "logits/rejected": -107105930.66666667, + "logps/chosen": -263.45791015625, + "logps/rejected": -456.3541666666667, + "loss": 0.0287, + "rewards/chosen": 1.7678586959838867, + "rewards/margins": 13.900357119242349, + "rewards/rejected": -12.132498423258463, + "step": 821 + }, + { + "epoch": 0.1502055733211512, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.446809050533679e-07, + "logits/chosen": -116137700.57142857, + "logits/rejected": -120416753.77777778, + "logps/chosen": -291.68130929129467, + "logps/rejected": -354.10834418402777, + "loss": 0.0182, + "rewards/chosen": 2.435044969831194, + "rewards/margins": 13.278679681202723, + "rewards/rejected": -10.843634711371529, + "step": 822 + }, + { + "epoch": 0.1503883051621745, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 9.344961947648624e-07, + "logits/chosen": -94863088.0, + "logits/rejected": -108633640.0, + "logps/chosen": -251.44692993164062, + "logps/rejected": -395.10498046875, + "loss": 0.0385, + "rewards/chosen": 1.362130045890808, + "rewards/margins": 10.381954073905945, + "rewards/rejected": -9.019824028015137, + "step": 823 + }, + { + "epoch": 0.1505710370031978, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.243610213572285e-07, + "logits/chosen": -129764376.0, + "logits/rejected": -108117000.0, + "logps/chosen": -171.32411193847656, + "logps/rejected": -515.189697265625, + "loss": 0.0276, + "rewards/chosen": 1.5833295583724976, + "rewards/margins": 13.446354269981384, + "rewards/rejected": -11.863024711608887, + "step": 824 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.142755083243577e-07, + "logits/chosen": -89685147.42857143, + "logits/rejected": -107622279.1111111, + "logps/chosen": -208.23170689174108, + "logps/rejected": -620.1728515625, + "loss": 0.025, + "rewards/chosen": 1.374993188040597, + "rewards/margins": 14.800118310110909, + "rewards/rejected": -13.425125122070312, + "step": 825 + }, + { + "epoch": 0.1509365006852444, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 9.042397785550405e-07, + "logits/chosen": -101122019.55555555, + "logits/rejected": -97434496.0, + "logps/chosen": -237.15676540798611, + "logps/rejected": -373.1394740513393, + "loss": 0.0262, + "rewards/chosen": 2.5217823452419705, + "rewards/margins": 12.669099414159382, + "rewards/rejected": -10.147317068917411, + "step": 826 + }, + { + "epoch": 0.1511192325262677, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 8.942539543314799e-07, + "logits/chosen": 8082392.0, + "logits/rejected": -118389408.0, + "logps/chosen": -206.24368286132812, + "logps/rejected": -519.5342203776041, + "loss": 0.0141, + "rewards/chosen": 1.5884411334991455, + "rewards/margins": 15.910226106643677, + "rewards/rejected": -14.321784973144531, + "step": 827 + }, + { + "epoch": 0.15130196436729101, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 8.843181573277904e-07, + "logits/chosen": -123615195.42857143, + "logits/rejected": -87342279.1111111, + "logps/chosen": -228.77253069196428, + "logps/rejected": -550.1585286458334, + "loss": 0.026, + "rewards/chosen": 1.1522050585065569, + "rewards/margins": 15.47154947311159, + "rewards/rejected": -14.319344414605034, + "step": 828 + }, + { + "epoch": 0.1514846962083143, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 8.744325086085248e-07, + "logits/chosen": -134539192.8888889, + "logits/rejected": -188118089.14285713, + "logps/chosen": -212.14657931857639, + "logps/rejected": -503.28013392857144, + "loss": 0.0287, + "rewards/chosen": 1.5234848658243816, + "rewards/margins": 16.29027389344715, + "rewards/rejected": -14.766789027622767, + "step": 829 + }, + { + "epoch": 0.1516674280493376, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 8.645971286271903e-07, + "logits/chosen": -79899448.8888889, + "logits/rejected": -115204205.71428572, + "logps/chosen": -297.5750325520833, + "logps/rejected": -285.1471644810268, + "loss": 0.0294, + "rewards/chosen": 1.9270568423800998, + "rewards/margins": 11.043522819640145, + "rewards/rejected": -9.116465977260045, + "step": 830 + }, + { + "epoch": 0.1518501598903609, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 8.54812137224792e-07, + "logits/chosen": -125629019.42857143, + "logits/rejected": -68564202.66666667, + "logps/chosen": -187.31820242745536, + "logps/rejected": -503.37358940972223, + "loss": 0.022, + "rewards/chosen": 1.3658839634486608, + "rewards/margins": 14.516377161419582, + "rewards/rejected": -13.15049319797092, + "step": 831 + }, + { + "epoch": 0.1520328917313842, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.450776536283594e-07, + "logits/chosen": -120603593.14285715, + "logits/rejected": -147538816.0, + "logps/chosen": -177.45521763392858, + "logps/rejected": -544.5496419270834, + "loss": 0.0315, + "rewards/chosen": 0.6492986679077148, + "rewards/margins": 16.170944107903374, + "rewards/rejected": -15.521645439995659, + "step": 832 + }, + { + "epoch": 0.1522156235724075, + "grad_norm": 7.875, + "kl": 3.093008041381836, + "learning_rate": 8.353937964495029e-07, + "logits/chosen": -96298392.0, + "logits/rejected": 13823917.0, + "logps/chosen": -158.4691162109375, + "logps/rejected": -395.2026062011719, + "loss": 0.0339, + "rewards/chosen": 2.284682273864746, + "rewards/margins": 12.512186050415039, + "rewards/rejected": -10.227503776550293, + "step": 833 + }, + { + "epoch": 0.1523983554134308, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.25760683682968e-07, + "logits/chosen": -91507015.1111111, + "logits/rejected": -84749769.14285715, + "logps/chosen": -195.52029079861111, + "logps/rejected": -550.0073939732143, + "loss": 0.0212, + "rewards/chosen": 1.83942625257704, + "rewards/margins": 16.59544475494869, + "rewards/rejected": -14.756018502371651, + "step": 834 + }, + { + "epoch": 0.1525810872544541, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 8.161784327051919e-07, + "logits/chosen": -129825996.8, + "logits/rejected": -101508584.72727273, + "logps/chosen": -121.38939208984375, + "logps/rejected": -458.83935546875, + "loss": 0.0201, + "rewards/chosen": 3.2048583984375, + "rewards/margins": 14.661616377397017, + "rewards/rejected": -11.456757978959518, + "step": 835 + }, + { + "epoch": 0.1527638190954774, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 8.066471602728804e-07, + "logits/chosen": -190917347.55555555, + "logits/rejected": -182049097.14285713, + "logps/chosen": -176.15272352430554, + "logps/rejected": -528.3292759486607, + "loss": 0.036, + "rewards/chosen": 1.5262605879041884, + "rewards/margins": 10.805896426004077, + "rewards/rejected": -9.279635838099889, + "step": 836 + }, + { + "epoch": 0.1529465509365007, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 7.971669825215789e-07, + "logits/chosen": -83154792.0, + "logits/rejected": -67921456.0, + "logps/chosen": -167.51583862304688, + "logps/rejected": -330.1724853515625, + "loss": 0.0236, + "rewards/chosen": 2.199831962585449, + "rewards/margins": 13.788846015930176, + "rewards/rejected": -11.589014053344727, + "step": 837 + }, + { + "epoch": 0.153129282777524, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 7.877380149642628e-07, + "logits/chosen": -103847477.33333333, + "logits/rejected": -94886540.8, + "logps/chosen": -127.74991861979167, + "logps/rejected": -472.02177734375, + "loss": 0.0264, + "rewards/chosen": 1.0358514785766602, + "rewards/margins": 14.61311740875244, + "rewards/rejected": -13.57726593017578, + "step": 838 + }, + { + "epoch": 0.1533120146185473, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 7.783603724899258e-07, + "logits/chosen": -190563754.66666666, + "logits/rejected": -61340252.0, + "logps/chosen": -285.03334554036456, + "logps/rejected": -405.8205871582031, + "loss": 0.0296, + "rewards/chosen": 1.8780317306518555, + "rewards/margins": 17.26395320892334, + "rewards/rejected": -15.385921478271484, + "step": 839 + }, + { + "epoch": 0.1534947464595706, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 7.690341693621805e-07, + "logits/chosen": -71300465.77777778, + "logits/rejected": -122669092.57142857, + "logps/chosen": -169.28742133246527, + "logps/rejected": -556.8124302455357, + "loss": 0.04, + "rewards/chosen": 0.8411012225680881, + "rewards/margins": 15.566401602729918, + "rewards/rejected": -14.72530038016183, + "step": 840 + }, + { + "epoch": 0.1536774783005939, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 7.597595192178702e-07, + "logits/chosen": -73553621.33333333, + "logits/rejected": -149591500.8, + "logps/chosen": -239.80963134765625, + "logps/rejected": -650.20341796875, + "loss": 0.0213, + "rewards/chosen": 1.9484111467997234, + "rewards/margins": 15.428687636057536, + "rewards/rejected": -13.480276489257813, + "step": 841 + }, + { + "epoch": 0.1538602101416172, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.505365350656813e-07, + "logits/chosen": -146432682.66666666, + "logits/rejected": -69876283.42857143, + "logps/chosen": -244.06610785590277, + "logps/rejected": -486.5947265625, + "loss": 0.0371, + "rewards/chosen": 0.8560700416564941, + "rewards/margins": 15.430538381849017, + "rewards/rejected": -14.574468340192523, + "step": 842 + }, + { + "epoch": 0.15404294198264049, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 7.413653292847617e-07, + "logits/chosen": -109801098.66666667, + "logits/rejected": -145293414.4, + "logps/chosen": -173.44099934895834, + "logps/rejected": -439.0982421875, + "loss": 0.0227, + "rewards/chosen": 1.4226908683776855, + "rewards/margins": 13.600130748748779, + "rewards/rejected": -12.177439880371093, + "step": 843 + }, + { + "epoch": 0.15422567382366378, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 7.322460136233622e-07, + "logits/chosen": -91269840.0, + "logits/rejected": -34144688.0, + "logps/chosen": -258.4578552246094, + "logps/rejected": -454.8225402832031, + "loss": 0.0397, + "rewards/chosen": 0.6028401851654053, + "rewards/margins": 11.522296667098999, + "rewards/rejected": -10.919456481933594, + "step": 844 + }, + { + "epoch": 0.15440840566468708, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.23178699197467e-07, + "logits/chosen": -142619946.66666666, + "logits/rejected": -130290547.2, + "logps/chosen": -311.77919514973956, + "logps/rejected": -468.969384765625, + "loss": 0.0267, + "rewards/chosen": 0.6349604924519857, + "rewards/margins": 15.776446088155112, + "rewards/rejected": -15.141485595703125, + "step": 845 + }, + { + "epoch": 0.15459113750571038, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 7.141634964894389e-07, + "logits/chosen": -112409301.33333333, + "logits/rejected": -104417828.57142857, + "logps/chosen": -233.52718098958334, + "logps/rejected": -339.79042271205356, + "loss": 0.0278, + "rewards/chosen": 1.6180848015679254, + "rewards/margins": 11.551459478953529, + "rewards/rejected": -9.933374677385602, + "step": 846 + }, + { + "epoch": 0.15477386934673368, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 7.052005153466779e-07, + "logits/chosen": -174554903.27272728, + "logits/rejected": -175790694.4, + "logps/chosen": -168.9518710049716, + "logps/rejected": -623.17763671875, + "loss": 0.0388, + "rewards/chosen": 1.4759630723433061, + "rewards/margins": 15.898309568925338, + "rewards/rejected": -14.422346496582032, + "step": 847 + }, + { + "epoch": 0.15495660118775698, + "grad_norm": 8.875, + "kl": 0.6509876251220703, + "learning_rate": 6.962898649802824e-07, + "logits/chosen": -145640170.66666666, + "logits/rejected": -95781496.0, + "logps/chosen": -180.00703938802084, + "logps/rejected": -467.03948974609375, + "loss": 0.0373, + "rewards/chosen": 1.7723811467488606, + "rewards/margins": 15.868378003438314, + "rewards/rejected": -14.095996856689453, + "step": 848 + }, + { + "epoch": 0.15513933302878027, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 6.874316539637127e-07, + "logits/chosen": -351985590.85714287, + "logits/rejected": -80979733.33333333, + "logps/chosen": -212.55986676897322, + "logps/rejected": -410.0514865451389, + "loss": 0.0367, + "rewards/chosen": 0.3362260546003069, + "rewards/margins": 11.593654541742234, + "rewards/rejected": -11.257428487141928, + "step": 849 + }, + { + "epoch": 0.15532206486980357, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 6.786259902314768e-07, + "logits/chosen": -115285496.8888889, + "logits/rejected": -48959771.428571425, + "logps/chosen": -207.13589138454861, + "logps/rejected": -306.52584402901783, + "loss": 0.0423, + "rewards/chosen": 1.5974742041693792, + "rewards/margins": 8.552589931185283, + "rewards/rejected": -6.955115727015904, + "step": 850 + }, + { + "epoch": 0.15550479671082687, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 6.698729810778065e-07, + "logits/chosen": -116800394.66666667, + "logits/rejected": -116832972.8, + "logps/chosen": -239.61287434895834, + "logps/rejected": -484.4455078125, + "loss": 0.027, + "rewards/chosen": 0.7022639115651449, + "rewards/margins": 14.111529048283895, + "rewards/rejected": -13.40926513671875, + "step": 851 + }, + { + "epoch": 0.15568752855185017, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.611727331553585e-07, + "logits/chosen": -89204888.0, + "logits/rejected": -82254240.0, + "logps/chosen": -308.7947082519531, + "logps/rejected": -527.1114501953125, + "loss": 0.0337, + "rewards/chosen": 1.1754717826843262, + "rewards/margins": 16.83082628250122, + "rewards/rejected": -15.655354499816895, + "step": 852 + }, + { + "epoch": 0.15587026039287347, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.52525352473905e-07, + "logits/chosen": -137533333.33333334, + "logits/rejected": -108817920.0, + "logps/chosen": -224.7368367513021, + "logps/rejected": -752.294921875, + "loss": 0.0442, + "rewards/chosen": 1.2999324798583984, + "rewards/margins": 14.302218437194824, + "rewards/rejected": -13.002285957336426, + "step": 853 + }, + { + "epoch": 0.15605299223389676, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 6.439309443990532e-07, + "logits/chosen": -108684322.9090909, + "logits/rejected": -142168192.0, + "logps/chosen": -206.1073330965909, + "logps/rejected": -357.6049072265625, + "loss": 0.0325, + "rewards/chosen": 1.7599490772594104, + "rewards/margins": 12.011768687855113, + "rewards/rejected": -10.251819610595703, + "step": 854 + }, + { + "epoch": 0.15623572407492006, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.353896136509524e-07, + "logits/chosen": -127930470.4, + "logits/rejected": -150044896.0, + "logps/chosen": -302.6762939453125, + "logps/rejected": -340.0555826822917, + "loss": 0.0397, + "rewards/chosen": 1.3399818420410157, + "rewards/margins": 10.053870646158854, + "rewards/rejected": -8.713888804117838, + "step": 855 + }, + { + "epoch": 0.15641845591594336, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 6.269014643030214e-07, + "logits/chosen": -87873088.0, + "logits/rejected": -123375640.0, + "logps/chosen": -281.5794982910156, + "logps/rejected": -449.1827392578125, + "loss": 0.0313, + "rewards/chosen": 1.6035327911376953, + "rewards/margins": 13.308332443237305, + "rewards/rejected": -11.70479965209961, + "step": 856 + }, + { + "epoch": 0.15660118775696666, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.184665997806832e-07, + "logits/chosen": -233271850.66666666, + "logits/rejected": -77740544.0, + "logps/chosen": -303.6317952473958, + "logps/rejected": -348.131103515625, + "loss": 0.0426, + "rewards/chosen": 1.4003372192382812, + "rewards/margins": 9.265161991119385, + "rewards/rejected": -7.8648247718811035, + "step": 857 + }, + { + "epoch": 0.15678391959798996, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 6.100851228600974e-07, + "logits/chosen": -150991981.7142857, + "logits/rejected": -59575548.44444445, + "logps/chosen": -264.6292201450893, + "logps/rejected": -528.6767035590278, + "loss": 0.0276, + "rewards/chosen": 1.071073395865304, + "rewards/margins": 14.816930634634835, + "rewards/rejected": -13.745857238769531, + "step": 858 + }, + { + "epoch": 0.15696665143901326, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 6.017571356669183e-07, + "logits/chosen": -79142243.55555555, + "logits/rejected": -51773120.0, + "logps/chosen": -209.51502821180554, + "logps/rejected": -367.68111746651783, + "loss": 0.0225, + "rewards/chosen": 1.7325831519232855, + "rewards/margins": 12.356637107001411, + "rewards/rejected": -10.624053955078125, + "step": 859 + }, + { + "epoch": 0.15714938328003655, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5.934827396750392e-07, + "logits/chosen": -103214624.0, + "logits/rejected": -56463080.0, + "logps/chosen": -139.35275268554688, + "logps/rejected": -365.7665710449219, + "loss": 0.036, + "rewards/chosen": 0.9959429502487183, + "rewards/margins": 11.501348376274109, + "rewards/rejected": -10.50540542602539, + "step": 860 + }, + { + "epoch": 0.15733211512105985, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5.852620357053651e-07, + "logits/chosen": -110615961.6, + "logits/rejected": -112971306.66666667, + "logps/chosen": -156.7009765625, + "logps/rejected": -570.8209635416666, + "loss": 0.0412, + "rewards/chosen": 1.5157905578613282, + "rewards/margins": 13.56173350016276, + "rewards/rejected": -12.045942942301432, + "step": 861 + }, + { + "epoch": 0.15751484696208315, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5.770951239245803e-07, + "logits/chosen": -133738931.2, + "logits/rejected": -122629024.0, + "logps/chosen": -253.9830078125, + "logps/rejected": -454.2735595703125, + "loss": 0.0256, + "rewards/chosen": 1.7233379364013672, + "rewards/margins": 13.258819707234702, + "rewards/rejected": -11.535481770833334, + "step": 862 + }, + { + "epoch": 0.15769757880310645, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.689821038439264e-07, + "logits/chosen": -113876666.18181819, + "logits/rejected": -60478240.0, + "logps/chosen": -258.72804953835225, + "logps/rejected": -396.695068359375, + "loss": 0.0339, + "rewards/chosen": 1.8196416334672407, + "rewards/margins": 12.312587495283646, + "rewards/rejected": -10.492945861816406, + "step": 863 + }, + { + "epoch": 0.15788031064412975, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5.609230743179939e-07, + "logits/chosen": -171991482.1818182, + "logits/rejected": -68739264.0, + "logps/chosen": -197.5176447088068, + "logps/rejected": -285.9361083984375, + "loss": 0.0329, + "rewards/chosen": 1.75279391895641, + "rewards/margins": 12.51041339527477, + "rewards/rejected": -10.757619476318359, + "step": 864 + }, + { + "epoch": 0.15806304248515304, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5.529181335435124e-07, + "logits/chosen": -109744146.28571428, + "logits/rejected": -118734250.66666667, + "logps/chosen": -144.46651785714286, + "logps/rejected": -795.1806098090278, + "loss": 0.0223, + "rewards/chosen": 2.2493982315063477, + "rewards/margins": 17.28032440609402, + "rewards/rejected": -15.030926174587673, + "step": 865 + }, + { + "epoch": 0.15824577432617634, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5.449673790581611e-07, + "logits/chosen": -255481237.33333334, + "logits/rejected": -153203648.0, + "logps/chosen": -1006.8837890625, + "logps/rejected": -541.90888671875, + "loss": 0.0403, + "rewards/chosen": -1.461687723795573, + "rewards/margins": 11.579002888997396, + "rewards/rejected": -13.040690612792968, + "step": 866 + }, + { + "epoch": 0.15842850616719964, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5.370709077393721e-07, + "logits/chosen": -60064242.28571428, + "logits/rejected": -153591324.44444445, + "logps/chosen": -387.4183872767857, + "logps/rejected": -368.2563747829861, + "loss": 0.0355, + "rewards/chosen": 0.7915980475289481, + "rewards/margins": 12.39318258800204, + "rewards/rejected": -11.601584540473091, + "step": 867 + }, + { + "epoch": 0.15861123800822294, + "grad_norm": 24.125, + "kl": 0.0, + "learning_rate": 5.292288158031595e-07, + "logits/chosen": -167839040.0, + "logits/rejected": -28386604.0, + "logps/chosen": -278.3690185546875, + "logps/rejected": -352.2608947753906, + "loss": 0.042, + "rewards/chosen": 2.281287908554077, + "rewards/margins": 13.167053937911987, + "rewards/rejected": -10.88576602935791, + "step": 868 + }, + { + "epoch": 0.15879396984924624, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5.214411988029355e-07, + "logits/chosen": -91372256.0, + "logits/rejected": -64707964.0, + "logps/chosen": -185.42372131347656, + "logps/rejected": -513.0044555664062, + "loss": 0.0189, + "rewards/chosen": 2.2327427864074707, + "rewards/margins": 16.41218328475952, + "rewards/rejected": -14.17944049835205, + "step": 869 + }, + { + "epoch": 0.15897670169026953, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5.137081516283582e-07, + "logits/chosen": -154711088.0, + "logits/rejected": -88496592.0, + "logps/chosen": -188.620361328125, + "logps/rejected": -445.767578125, + "loss": 0.0391, + "rewards/chosen": 0.7815443277359009, + "rewards/margins": 11.371756196022034, + "rewards/rejected": -10.590211868286133, + "step": 870 + }, + { + "epoch": 0.15915943353129283, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5.06029768504166e-07, + "logits/chosen": -132637366.85714285, + "logits/rejected": -54988928.0, + "logps/chosen": -142.54584612165178, + "logps/rejected": -362.9306640625, + "loss": 0.0277, + "rewards/chosen": 1.798269271850586, + "rewards/margins": 13.247429529825846, + "rewards/rejected": -11.44916025797526, + "step": 871 + }, + { + "epoch": 0.15934216537231613, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 4.984061429890324e-07, + "logits/chosen": -87249557.33333333, + "logits/rejected": -73836105.14285715, + "logps/chosen": -259.6889377170139, + "logps/rejected": -308.62447684151783, + "loss": 0.047, + "rewards/chosen": 1.0647057427300348, + "rewards/margins": 10.019593677823504, + "rewards/rejected": -8.95488793509347, + "step": 872 + }, + { + "epoch": 0.15952489721333943, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 4.908373679744316e-07, + "logits/chosen": -125323464.0, + "logits/rejected": -118448416.0, + "logps/chosen": -168.81777954101562, + "logps/rejected": -639.84423828125, + "loss": 0.0263, + "rewards/chosen": 1.316214680671692, + "rewards/margins": 16.179187893867493, + "rewards/rejected": -14.8629732131958, + "step": 873 + }, + { + "epoch": 0.15970762905436273, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 4.833235356834959e-07, + "logits/chosen": -124860441.6, + "logits/rejected": -78223306.66666667, + "logps/chosen": -273.9218505859375, + "logps/rejected": -362.3570149739583, + "loss": 0.0717, + "rewards/chosen": 1.076344871520996, + "rewards/margins": 10.769991620381674, + "rewards/rejected": -9.693646748860678, + "step": 874 + }, + { + "epoch": 0.15989036089538602, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.758647376699033e-07, + "logits/chosen": -183140336.0, + "logits/rejected": -105224064.0, + "logps/chosen": -134.0400390625, + "logps/rejected": -495.8023274739583, + "loss": 0.008, + "rewards/chosen": 2.6442461013793945, + "rewards/margins": 14.35466225941976, + "rewards/rejected": -11.710416158040365, + "step": 875 + }, + { + "epoch": 0.16007309273640932, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 4.6846106481675035e-07, + "logits/chosen": -56234230.85714286, + "logits/rejected": -139981724.44444445, + "logps/chosen": -271.67332240513394, + "logps/rejected": -632.4506293402778, + "loss": 0.0341, + "rewards/chosen": 0.5868918555123466, + "rewards/margins": 14.5960598446074, + "rewards/rejected": -14.009167989095053, + "step": 876 + }, + { + "epoch": 0.16025582457743262, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 4.6111260733545714e-07, + "logits/chosen": -103568768.0, + "logits/rejected": -99149834.66666667, + "logps/chosen": -215.480224609375, + "logps/rejected": -430.959228515625, + "loss": 0.036, + "rewards/chosen": 1.2281005859375, + "rewards/margins": 13.599760182698569, + "rewards/rejected": -12.371659596761068, + "step": 877 + }, + { + "epoch": 0.16043855641845592, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 4.538194547646574e-07, + "logits/chosen": -149902608.0, + "logits/rejected": -174717984.0, + "logps/chosen": -145.75364685058594, + "logps/rejected": -598.4421997070312, + "loss": 0.0284, + "rewards/chosen": 2.3009567260742188, + "rewards/margins": 15.531708717346191, + "rewards/rejected": -13.230751991271973, + "step": 878 + }, + { + "epoch": 0.16062128825947922, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.4658169596911493e-07, + "logits/chosen": -231393740.8, + "logits/rejected": -123686186.66666667, + "logps/chosen": -225.8618408203125, + "logps/rejected": -560.2189127604166, + "loss": 0.0219, + "rewards/chosen": 3.0178709030151367, + "rewards/margins": 14.571211814880371, + "rewards/rejected": -11.553340911865234, + "step": 879 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 4.3939941913863525e-07, + "logits/chosen": -151465395.2, + "logits/rejected": -116265322.66666667, + "logps/chosen": -205.3053466796875, + "logps/rejected": -505.2188313802083, + "loss": 0.0483, + "rewards/chosen": 0.946253490447998, + "rewards/margins": 12.795228354136148, + "rewards/rejected": -11.84897486368815, + "step": 880 + }, + { + "epoch": 0.1609867519415258, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 4.322727117869951e-07, + "logits/chosen": -142481493.33333334, + "logits/rejected": -115637670.4, + "logps/chosen": -193.4556681315104, + "logps/rejected": -436.033642578125, + "loss": 0.0114, + "rewards/chosen": 2.1137792269388833, + "rewards/margins": 12.445406119028727, + "rewards/rejected": -10.331626892089844, + "step": 881 + }, + { + "epoch": 0.1611694837825491, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 4.2520166075087635e-07, + "logits/chosen": -86684016.0, + "logits/rejected": -91685749.33333333, + "logps/chosen": -221.1931915283203, + "logps/rejected": -498.2650553385417, + "loss": 0.0041, + "rewards/chosen": 2.5845367908477783, + "rewards/margins": 14.078372240066528, + "rewards/rejected": -11.49383544921875, + "step": 882 + }, + { + "epoch": 0.1613522156235724, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.1818635218880186e-07, + "logits/chosen": -77158416.0, + "logits/rejected": -96632736.0, + "logps/chosen": -204.8175252278646, + "logps/rejected": -572.536865234375, + "loss": 0.0094, + "rewards/chosen": 3.030393600463867, + "rewards/margins": 19.8160701751709, + "rewards/rejected": -16.785676574707033, + "step": 883 + }, + { + "epoch": 0.1615349474645957, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 4.112268715800943e-07, + "logits/chosen": -80752789.33333333, + "logits/rejected": -116774163.6923077, + "logps/chosen": -245.03350830078125, + "logps/rejected": -448.60167518028845, + "loss": 0.0186, + "rewards/chosen": 0.0720168948173523, + "rewards/margins": 11.125454347867231, + "rewards/rejected": -11.05343745304988, + "step": 884 + }, + { + "epoch": 0.161717679305619, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.043233037238281e-07, + "logits/chosen": -215571401.14285713, + "logits/rejected": -95874019.55555555, + "logps/chosen": -360.23252650669644, + "logps/rejected": -396.6089138454861, + "loss": 0.0273, + "rewards/chosen": 0.9914407048906598, + "rewards/margins": 13.106669993627639, + "rewards/rejected": -12.115229288736979, + "step": 885 + }, + { + "epoch": 0.1619004111466423, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.9747573273779816e-07, + "logits/chosen": -102577139.2, + "logits/rejected": -103882442.66666667, + "logps/chosen": -185.25208740234376, + "logps/rejected": -352.11083984375, + "loss": 0.0315, + "rewards/chosen": 1.917262649536133, + "rewards/margins": 13.15564816792806, + "rewards/rejected": -11.238385518391928, + "step": 886 + }, + { + "epoch": 0.1620831429876656, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.90684242057498e-07, + "logits/chosen": -110974122.66666667, + "logits/rejected": -80332114.28571428, + "logps/chosen": -250.05623372395834, + "logps/rejected": -398.8191615513393, + "loss": 0.0412, + "rewards/chosen": 0.9841297997368706, + "rewards/margins": 13.767009220426043, + "rewards/rejected": -12.782879420689174, + "step": 887 + }, + { + "epoch": 0.1622658748286889, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 3.8394891443509554e-07, + "logits/chosen": -170088960.0, + "logits/rejected": -81269364.36363636, + "logps/chosen": -223.7287109375, + "logps/rejected": -544.1018288352273, + "loss": 0.0235, + "rewards/chosen": 1.3469314575195312, + "rewards/margins": 16.13349151611328, + "rewards/rejected": -14.78656005859375, + "step": 888 + }, + { + "epoch": 0.1624486066697122, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 3.772698319384349e-07, + "logits/chosen": -97592896.0, + "logits/rejected": -94396608.0, + "logps/chosen": -209.1016642252604, + "logps/rejected": -535.521484375, + "loss": 0.0339, + "rewards/chosen": 0.21869937578837076, + "rewards/margins": 13.77877017656962, + "rewards/rejected": -13.56007080078125, + "step": 889 + }, + { + "epoch": 0.1626313385107355, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 3.7064707595002636e-07, + "logits/chosen": -115672714.66666667, + "logits/rejected": -80250304.0, + "logps/chosen": -234.63533528645834, + "logps/rejected": -410.741748046875, + "loss": 0.0221, + "rewards/chosen": 1.758371353149414, + "rewards/margins": 14.384680557250977, + "rewards/rejected": -12.626309204101563, + "step": 890 + }, + { + "epoch": 0.1628140703517588, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 3.6408072716606346e-07, + "logits/chosen": -105923150.22222222, + "logits/rejected": -127089536.0, + "logps/chosen": -248.48478190104166, + "logps/rejected": -383.00174386160717, + "loss": 0.033, + "rewards/chosen": 1.656804296705458, + "rewards/margins": 12.154799945770748, + "rewards/rejected": -10.49799564906529, + "step": 891 + }, + { + "epoch": 0.1629968021927821, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 3.575708655954324e-07, + "logits/chosen": -152659328.0, + "logits/rejected": -31131652.57142857, + "logps/chosen": -284.8611111111111, + "logps/rejected": -434.4618443080357, + "loss": 0.0372, + "rewards/chosen": 1.3611794577704535, + "rewards/margins": 11.162090407477486, + "rewards/rejected": -9.800910949707031, + "step": 892 + }, + { + "epoch": 0.1631795340338054, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 3.511175705587433e-07, + "logits/chosen": -111812838.4, + "logits/rejected": -74654176.0, + "logps/chosen": -223.325146484375, + "logps/rejected": -280.1533203125, + "loss": 0.0443, + "rewards/chosen": 0.9639616012573242, + "rewards/margins": 13.056232770284018, + "rewards/rejected": -12.092271169026693, + "step": 893 + }, + { + "epoch": 0.1633622658748287, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 3.4472092068735917e-07, + "logits/chosen": -74494176.0, + "logits/rejected": -61665766.4, + "logps/chosen": -284.57696533203125, + "logps/rejected": -501.233935546875, + "loss": 0.0287, + "rewards/chosen": 0.9435033798217773, + "rewards/margins": 13.544577598571777, + "rewards/rejected": -12.60107421875, + "step": 894 + }, + { + "epoch": 0.163544997715852, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.3838099392243915e-07, + "logits/chosen": -100695376.0, + "logits/rejected": -95834512.0, + "logps/chosen": -171.62908935546875, + "logps/rejected": -723.3538818359375, + "loss": 0.0142, + "rewards/chosen": 2.9034690856933594, + "rewards/margins": 20.406450271606445, + "rewards/rejected": -17.502981185913086, + "step": 895 + }, + { + "epoch": 0.16372772955687528, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 3.320978675139919e-07, + "logits/chosen": -116449312.0, + "logits/rejected": -113665408.0, + "logps/chosen": -240.8985595703125, + "logps/rejected": -491.9715270996094, + "loss": 0.023, + "rewards/chosen": 1.650240421295166, + "rewards/margins": 16.621760845184326, + "rewards/rejected": -14.97152042388916, + "step": 896 + }, + { + "epoch": 0.16391046139789858, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.258716180199278e-07, + "logits/chosen": -136303115.63636363, + "logits/rejected": -69759168.0, + "logps/chosen": -225.65647194602272, + "logps/rejected": -364.461279296875, + "loss": 0.0479, + "rewards/chosen": 1.0138032219626687, + "rewards/margins": 13.475933959267355, + "rewards/rejected": -12.462130737304687, + "step": 897 + }, + { + "epoch": 0.16409319323892188, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 3.1970232130513365e-07, + "logits/chosen": -123368934.4, + "logits/rejected": -74486826.66666667, + "logps/chosen": -205.38251953125, + "logps/rejected": -365.8810221354167, + "loss": 0.057, + "rewards/chosen": 0.5546815872192383, + "rewards/margins": 8.279035377502442, + "rewards/rejected": -7.724353790283203, + "step": 898 + }, + { + "epoch": 0.16427592507994518, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 3.135900525405428e-07, + "logits/chosen": -12001288.0, + "logits/rejected": -102879432.0, + "logps/chosen": -282.4752197265625, + "logps/rejected": -404.7865905761719, + "loss": 0.0339, + "rewards/chosen": 0.8134231567382812, + "rewards/margins": 12.770259857177734, + "rewards/rejected": -11.956836700439453, + "step": 899 + }, + { + "epoch": 0.16445865692096848, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 3.0753488620222037e-07, + "logits/chosen": -93886169.6, + "logits/rejected": -67866199.27272727, + "logps/chosen": -172.4867919921875, + "logps/rejected": -413.32124467329544, + "loss": 0.0175, + "rewards/chosen": 1.562174415588379, + "rewards/margins": 14.175364112854004, + "rewards/rejected": -12.613189697265625, + "step": 900 + }, + { + "epoch": 0.16464138876199177, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 3.015368960704584e-07, + "logits/chosen": -100028496.0, + "logits/rejected": -59027452.0, + "logps/chosen": -174.2237548828125, + "logps/rejected": -511.83306884765625, + "loss": 0.0267, + "rewards/chosen": 1.4011775255203247, + "rewards/margins": 13.859946370124817, + "rewards/rejected": -12.458768844604492, + "step": 901 + }, + { + "epoch": 0.16482412060301507, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 2.9559615522887275e-07, + "logits/chosen": -164602163.2, + "logits/rejected": -111567441.45454545, + "logps/chosen": -349.265771484375, + "logps/rejected": -434.38401100852275, + "loss": 0.0152, + "rewards/chosen": 1.589792251586914, + "rewards/margins": 13.074500864202326, + "rewards/rejected": -11.484708612615412, + "step": 902 + }, + { + "epoch": 0.16500685244403837, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 2.8971273606351656e-07, + "logits/chosen": -113567637.33333333, + "logits/rejected": -102600428.8, + "logps/chosen": -261.7483317057292, + "logps/rejected": -448.655224609375, + "loss": 0.0215, + "rewards/chosen": 2.490199565887451, + "rewards/margins": 14.726534557342529, + "rewards/rejected": -12.236334991455077, + "step": 903 + }, + { + "epoch": 0.16518958428506167, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 2.838867102619952e-07, + "logits/chosen": -100590534.4, + "logits/rejected": -86945770.66666667, + "logps/chosen": -236.782666015625, + "logps/rejected": -461.2770589192708, + "loss": 0.0396, + "rewards/chosen": 1.5760257720947266, + "rewards/margins": 13.623148727416993, + "rewards/rejected": -12.047122955322266, + "step": 904 + }, + { + "epoch": 0.16537231612608497, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 2.7811814881259503e-07, + "logits/chosen": -99009749.33333333, + "logits/rejected": -137941232.0, + "logps/chosen": -274.69708251953125, + "logps/rejected": -447.8066101074219, + "loss": 0.0251, + "rewards/chosen": 2.1799675623575845, + "rewards/margins": 10.742473284403482, + "rewards/rejected": -8.562505722045898, + "step": 905 + }, + { + "epoch": 0.16555504796710827, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 2.724071220034158e-07, + "logits/chosen": -86300515.55555555, + "logits/rejected": -81906011.42857143, + "logps/chosen": -162.95692274305554, + "logps/rejected": -520.625, + "loss": 0.0231, + "rewards/chosen": 2.642585966322157, + "rewards/margins": 14.778121070256308, + "rewards/rejected": -12.135535103934151, + "step": 906 + }, + { + "epoch": 0.16573777980813156, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 2.6675369942151864e-07, + "logits/chosen": -127547093.33333333, + "logits/rejected": -154879250.2857143, + "logps/chosen": -282.86602105034723, + "logps/rejected": -500.3209751674107, + "loss": 0.0361, + "rewards/chosen": 1.0719051361083984, + "rewards/margins": 14.010715212140765, + "rewards/rejected": -12.938810076032366, + "step": 907 + }, + { + "epoch": 0.16592051164915486, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 2.611579499520722e-07, + "logits/chosen": -62581077.333333336, + "logits/rejected": -96450060.8, + "logps/chosen": -198.68778483072916, + "logps/rejected": -445.03779296875, + "loss": 0.0214, + "rewards/chosen": 1.2051794528961182, + "rewards/margins": 15.433838510513306, + "rewards/rejected": -14.228659057617188, + "step": 908 + }, + { + "epoch": 0.16610324349017816, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 2.556199417775174e-07, + "logits/chosen": -107914523.42857143, + "logits/rejected": -151837767.1111111, + "logps/chosen": -139.94417898995536, + "logps/rejected": -503.0490993923611, + "loss": 0.0138, + "rewards/chosen": 2.8423799787248885, + "rewards/margins": 15.10665542360336, + "rewards/rejected": -12.264275444878471, + "step": 909 + }, + { + "epoch": 0.16628597533120146, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 2.5013974237673824e-07, + "logits/chosen": -69934880.0, + "logits/rejected": -162771872.0, + "logps/chosen": -302.46728515625, + "logps/rejected": -387.87249755859375, + "loss": 0.0433, + "rewards/chosen": 0.898848831653595, + "rewards/margins": 12.825301468372345, + "rewards/rejected": -11.92645263671875, + "step": 910 + }, + { + "epoch": 0.16646870717222476, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.447174185242324e-07, + "logits/chosen": -132897877.33333333, + "logits/rejected": -59772992.0, + "logps/chosen": -225.49811469184027, + "logps/rejected": -336.19140625, + "loss": 0.0297, + "rewards/chosen": 1.324817763434516, + "rewards/margins": 11.308562793428937, + "rewards/rejected": -9.98374502999442, + "step": 911 + }, + { + "epoch": 0.16665143901324805, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 2.3935303628930705e-07, + "logits/chosen": -115290120.0, + "logits/rejected": -67100876.0, + "logps/chosen": -147.07467651367188, + "logps/rejected": -468.8291320800781, + "loss": 0.0297, + "rewards/chosen": 2.0127978324890137, + "rewards/margins": 13.695569515228271, + "rewards/rejected": -11.682771682739258, + "step": 912 + }, + { + "epoch": 0.16683417085427135, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 2.3404666103526542e-07, + "logits/chosen": -83440739.55555555, + "logits/rejected": -84277513.14285715, + "logps/chosen": -227.29861111111111, + "logps/rejected": -295.1926967075893, + "loss": 0.0233, + "rewards/chosen": 2.0261029137505426, + "rewards/margins": 11.117149928259472, + "rewards/rejected": -9.091047014508929, + "step": 913 + }, + { + "epoch": 0.16701690269529465, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 2.287983574186159e-07, + "logits/chosen": -105660176.0, + "logits/rejected": -77311936.0, + "logps/chosen": -199.88565063476562, + "logps/rejected": -518.664794921875, + "loss": 0.0121, + "rewards/chosen": 2.1675422191619873, + "rewards/margins": 16.449987490971886, + "rewards/rejected": -14.282445271809896, + "step": 914 + }, + { + "epoch": 0.16719963453631795, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 2.2360818938828189e-07, + "logits/chosen": -83215670.85714285, + "logits/rejected": -77876494.22222222, + "logps/chosen": -114.86095319475446, + "logps/rejected": -406.99614800347223, + "loss": 0.0136, + "rewards/chosen": 2.6879899161202565, + "rewards/margins": 12.339654650006976, + "rewards/rejected": -9.651664733886719, + "step": 915 + }, + { + "epoch": 0.16738236637734125, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 2.1847622018482283e-07, + "logits/chosen": -114017080.0, + "logits/rejected": -123856920.0, + "logps/chosen": -166.11312866210938, + "logps/rejected": -486.7297668457031, + "loss": 0.0226, + "rewards/chosen": 1.7202695608139038, + "rewards/margins": 12.354191184043884, + "rewards/rejected": -10.63392162322998, + "step": 916 + }, + { + "epoch": 0.16756509821836454, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 2.134025123396638e-07, + "logits/chosen": -134876650.66666666, + "logits/rejected": -112230208.0, + "logps/chosen": -201.67146809895834, + "logps/rejected": -453.83837890625, + "loss": 0.0603, + "rewards/chosen": 2.2153402964274087, + "rewards/margins": 12.840434137980143, + "rewards/rejected": -10.625093841552735, + "step": 917 + }, + { + "epoch": 0.16774783005938784, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 2.083871276743338e-07, + "logits/chosen": -37871723.428571425, + "logits/rejected": -115953080.8888889, + "logps/chosen": -215.6387939453125, + "logps/rejected": -397.03089735243054, + "loss": 0.0237, + "rewards/chosen": 1.4007552010672433, + "rewards/margins": 14.96189208257766, + "rewards/rejected": -13.561136881510416, + "step": 918 + }, + { + "epoch": 0.16793056190041114, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 2.0343012729971244e-07, + "logits/chosen": -100791541.33333333, + "logits/rejected": -32380684.8, + "logps/chosen": -159.7518310546875, + "logps/rejected": -346.1192626953125, + "loss": 0.041, + "rewards/chosen": 1.8542688687642415, + "rewards/margins": 12.885058816274007, + "rewards/rejected": -11.030789947509765, + "step": 919 + }, + { + "epoch": 0.16811329374143444, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 1.9853157161528468e-07, + "logits/chosen": -117121100.8, + "logits/rejected": -91454421.33333333, + "logps/chosen": -248.050048828125, + "logps/rejected": -449.6866455078125, + "loss": 0.0328, + "rewards/chosen": 1.4994428634643555, + "rewards/margins": 12.532128461201987, + "rewards/rejected": -11.03268559773763, + "step": 920 + }, + { + "epoch": 0.16829602558245774, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 1.9369152030840553e-07, + "logits/chosen": -117514712.0, + "logits/rejected": -106056104.0, + "logps/chosen": -295.4185485839844, + "logps/rejected": -464.64019775390625, + "loss": 0.033, + "rewards/chosen": 1.4673011302947998, + "rewards/margins": 12.404999494552612, + "rewards/rejected": -10.937698364257812, + "step": 921 + }, + { + "epoch": 0.16847875742348103, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.8891003235357307e-07, + "logits/chosen": -81072104.0, + "logits/rejected": -115564736.0, + "logps/chosen": -300.4124450683594, + "logps/rejected": -578.0819702148438, + "loss": 0.0421, + "rewards/chosen": 0.4471231698989868, + "rewards/margins": 11.67988646030426, + "rewards/rejected": -11.232763290405273, + "step": 922 + }, + { + "epoch": 0.16866148926450433, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.841871660117095e-07, + "logits/chosen": -126263449.6, + "logits/rejected": -118323498.66666667, + "logps/chosen": -245.1674072265625, + "logps/rejected": -653.448486328125, + "loss": 0.0434, + "rewards/chosen": 1.3794803619384766, + "rewards/margins": 14.345317204793295, + "rewards/rejected": -12.965836842854818, + "step": 923 + }, + { + "epoch": 0.16884422110552763, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 1.7952297882945e-07, + "logits/chosen": -123081064.0, + "logits/rejected": -82804760.0, + "logps/chosen": -210.9302215576172, + "logps/rejected": -575.6344604492188, + "loss": 0.0301, + "rewards/chosen": 1.1564841270446777, + "rewards/margins": 15.336616039276123, + "rewards/rejected": -14.180131912231445, + "step": 924 + }, + { + "epoch": 0.16902695294655093, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 1.7491752763844294e-07, + "logits/chosen": -98190246.4, + "logits/rejected": -57153850.666666664, + "logps/chosen": -135.7724853515625, + "logps/rejected": -483.813232421875, + "loss": 0.0376, + "rewards/chosen": 1.708419418334961, + "rewards/margins": 14.248313522338867, + "rewards/rejected": -12.539894104003906, + "step": 925 + }, + { + "epoch": 0.16920968478757423, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.7037086855465902e-07, + "logits/chosen": -161258268.44444445, + "logits/rejected": -55559140.571428575, + "logps/chosen": -183.15562608506946, + "logps/rejected": -677.853515625, + "loss": 0.039, + "rewards/chosen": 1.0454235076904297, + "rewards/margins": 18.26653970990862, + "rewards/rejected": -17.22111620221819, + "step": 926 + }, + { + "epoch": 0.16939241662859753, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 1.6588305697770313e-07, + "logits/chosen": -61004964.0, + "logits/rejected": -107785472.0, + "logps/chosen": -181.05035400390625, + "logps/rejected": -393.9998779296875, + "loss": 0.0415, + "rewards/chosen": 0.640805184841156, + "rewards/margins": 12.55094999074936, + "rewards/rejected": -11.910144805908203, + "step": 927 + }, + { + "epoch": 0.16957514846962082, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 1.6145414759014433e-07, + "logits/chosen": -157391644.44444445, + "logits/rejected": -167909668.57142857, + "logps/chosen": -314.5147298177083, + "logps/rejected": -375.495361328125, + "loss": 0.0235, + "rewards/chosen": 2.2219732072618275, + "rewards/margins": 11.441694320194305, + "rewards/rejected": -9.219721112932477, + "step": 928 + }, + { + "epoch": 0.16975788031064412, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.5708419435684463e-07, + "logits/chosen": -120403878.4, + "logits/rejected": -268685994.6666667, + "logps/chosen": -270.7590087890625, + "logps/rejected": -521.0355631510416, + "loss": 0.0325, + "rewards/chosen": 1.506359577178955, + "rewards/margins": 12.855448881785074, + "rewards/rejected": -11.34908930460612, + "step": 929 + }, + { + "epoch": 0.16994061215166742, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.5277325052430569e-07, + "logits/chosen": -93859529.14285715, + "logits/rejected": -35499324.44444445, + "logps/chosen": -191.78142438616072, + "logps/rejected": -390.7109646267361, + "loss": 0.0296, + "rewards/chosen": 1.0952136175973075, + "rewards/margins": 10.873848430694096, + "rewards/rejected": -9.778634813096788, + "step": 930 + }, + { + "epoch": 0.17012334399269072, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 1.4852136862001766e-07, + "logits/chosen": -122601773.71428572, + "logits/rejected": -49644650.666666664, + "logps/chosen": -205.27587890625, + "logps/rejected": -451.2661404079861, + "loss": 0.0158, + "rewards/chosen": 2.7381722586495534, + "rewards/margins": 14.987854367210751, + "rewards/rejected": -12.249682108561197, + "step": 931 + }, + { + "epoch": 0.17030607583371402, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.4432860045182019e-07, + "logits/chosen": -83855840.0, + "logits/rejected": -76362666.66666667, + "logps/chosen": -238.402490234375, + "logps/rejected": -466.5862630208333, + "loss": 0.0315, + "rewards/chosen": 1.4985852241516113, + "rewards/margins": 14.426609516143799, + "rewards/rejected": -12.928024291992188, + "step": 932 + }, + { + "epoch": 0.1704888076747373, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.4019499710726913e-07, + "logits/chosen": -137240563.2, + "logits/rejected": -107079925.33333333, + "logps/chosen": -168.97999267578126, + "logps/rejected": -436.0775553385417, + "loss": 0.0155, + "rewards/chosen": 2.4263895034790037, + "rewards/margins": 15.58713830312093, + "rewards/rejected": -13.160748799641928, + "step": 933 + }, + { + "epoch": 0.1706715395157606, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 1.3612060895301759e-07, + "logits/chosen": -87632248.0, + "logits/rejected": -88201000.0, + "logps/chosen": -250.74221801757812, + "logps/rejected": -479.6385498046875, + "loss": 0.024, + "rewards/chosen": 1.731784462928772, + "rewards/margins": 15.164584755897522, + "rewards/rejected": -13.43280029296875, + "step": 934 + }, + { + "epoch": 0.1708542713567839, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 1.3210548563419857e-07, + "logits/chosen": -107415398.4, + "logits/rejected": -93888465.45454545, + "logps/chosen": -162.2802734375, + "logps/rejected": -419.70010653409093, + "loss": 0.018, + "rewards/chosen": 1.9007139205932617, + "rewards/margins": 12.73309057409113, + "rewards/rejected": -10.83237665349787, + "step": 935 + }, + { + "epoch": 0.1710370031978072, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 1.2814967607382433e-07, + "logits/chosen": -63252121.6, + "logits/rejected": -147480203.63636363, + "logps/chosen": -413.719580078125, + "logps/rejected": -516.5614346590909, + "loss": 0.0234, + "rewards/chosen": 0.6700860977172851, + "rewards/margins": 13.976127468455921, + "rewards/rejected": -13.306041370738637, + "step": 936 + }, + { + "epoch": 0.1712197350388305, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.2425322847218368e-07, + "logits/chosen": -65929946.666666664, + "logits/rejected": -78952179.2, + "logps/chosen": -237.5601806640625, + "logps/rejected": -362.801513671875, + "loss": 0.0295, + "rewards/chosen": 1.1302984555562336, + "rewards/margins": 12.996775658925374, + "rewards/rejected": -11.86647720336914, + "step": 937 + }, + { + "epoch": 0.1714024668798538, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.2041619030626283e-07, + "logits/chosen": -104401465.6, + "logits/rejected": -81184032.0, + "logps/chosen": -244.539306640625, + "logps/rejected": -523.1985677083334, + "loss": 0.0261, + "rewards/chosen": 2.2217702865600586, + "rewards/margins": 14.626692771911621, + "rewards/rejected": -12.404922485351562, + "step": 938 + }, + { + "epoch": 0.1715851987208771, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.166386083291604e-07, + "logits/chosen": -68344441.6, + "logits/rejected": -54937408.0, + "logps/chosen": -165.1434326171875, + "logps/rejected": -357.2894398082386, + "loss": 0.0137, + "rewards/chosen": 1.7826251983642578, + "rewards/margins": 10.813161503184926, + "rewards/rejected": -9.030536304820668, + "step": 939 + }, + { + "epoch": 0.1717679305619004, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.1292052856952063e-07, + "logits/chosen": -79736140.8, + "logits/rejected": -74641056.0, + "logps/chosen": -238.4696044921875, + "logps/rejected": -395.7085774739583, + "loss": 0.0403, + "rewards/chosen": 1.0842785835266113, + "rewards/margins": 13.785067717234293, + "rewards/rejected": -12.700789133707682, + "step": 940 + }, + { + "epoch": 0.1719506624029237, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 1.0926199633097156e-07, + "logits/chosen": -130926944.0, + "logits/rejected": -72270176.0, + "logps/chosen": -230.25637817382812, + "logps/rejected": -487.4842529296875, + "loss": 0.0201, + "rewards/chosen": 2.552902936935425, + "rewards/margins": 14.010802030563354, + "rewards/rejected": -11.45789909362793, + "step": 941 + }, + { + "epoch": 0.172133394243947, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 1.0566305619157502e-07, + "logits/chosen": -148089536.0, + "logits/rejected": -171626382.2222222, + "logps/chosen": -260.45445033482144, + "logps/rejected": -620.5439995659722, + "loss": 0.0278, + "rewards/chosen": 1.64511353628976, + "rewards/margins": 14.401420502435593, + "rewards/rejected": -12.756306966145834, + "step": 942 + }, + { + "epoch": 0.1723161260849703, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 1.0212375200327973e-07, + "logits/chosen": -101392142.22222222, + "logits/rejected": -162869229.7142857, + "logps/chosen": -178.93222384982639, + "logps/rejected": -552.9181082589286, + "loss": 0.0397, + "rewards/chosen": 1.0918627844916449, + "rewards/margins": 14.805296186416868, + "rewards/rejected": -13.713433401925224, + "step": 943 + }, + { + "epoch": 0.1724988579259936, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.864412689139124e-08, + "logits/chosen": -102989480.0, + "logits/rejected": -122779648.0, + "logps/chosen": -287.61065673828125, + "logps/rejected": -494.07208251953125, + "loss": 0.0321, + "rewards/chosen": 0.9653797149658203, + "rewards/margins": 12.672992706298828, + "rewards/rejected": -11.707612991333008, + "step": 944 + }, + { + "epoch": 0.1726815897670169, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.522422325404234e-08, + "logits/chosen": -104844616.0, + "logits/rejected": -268504544.0, + "logps/chosen": -215.93060302734375, + "logps/rejected": -562.3094482421875, + "loss": 0.0399, + "rewards/chosen": 0.586142361164093, + "rewards/margins": 13.923035442829132, + "rewards/rejected": -13.336893081665039, + "step": 945 + }, + { + "epoch": 0.1728643216080402, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.186408276168012e-08, + "logits/chosen": -61812166.4, + "logits/rejected": 5060169.333333333, + "logps/chosen": -251.92900390625, + "logps/rejected": -496.4012451171875, + "loss": 0.049, + "rewards/chosen": 0.8614256858825684, + "rewards/margins": 15.755350399017335, + "rewards/rejected": -14.893924713134766, + "step": 946 + }, + { + "epoch": 0.1730470534490635, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 8.856374635655696e-08, + "logits/chosen": -135111121.45454547, + "logits/rejected": -124516326.4, + "logps/chosen": -145.34679066051137, + "logps/rejected": -462.13701171875, + "loss": 0.0364, + "rewards/chosen": 1.8378386064009233, + "rewards/margins": 12.42745194868608, + "rewards/rejected": -10.589613342285157, + "step": 947 + }, + { + "epoch": 0.17322978529008679, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.53232542522292e-08, + "logits/chosen": -128348544.0, + "logits/rejected": -150166357.33333334, + "logps/chosen": -217.1085205078125, + "logps/rejected": -428.292236328125, + "loss": 0.0379, + "rewards/chosen": 1.2898727416992188, + "rewards/margins": 12.959781901041666, + "rewards/rejected": -11.669909159342447, + "step": 948 + }, + { + "epoch": 0.17341251713111008, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 8.214264593307097e-08, + "logits/chosen": -118417254.4, + "logits/rejected": -202086805.33333334, + "logps/chosen": -248.246044921875, + "logps/rejected": -390.9695231119792, + "loss": 0.0358, + "rewards/chosen": 1.1675390243530273, + "rewards/margins": 11.860466066996256, + "rewards/rejected": -10.692927042643229, + "step": 949 + }, + { + "epoch": 0.17359524897213338, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 7.90219601537906e-08, + "logits/chosen": -128690969.6, + "logits/rejected": -106599466.66666667, + "logps/chosen": -176.4697509765625, + "logps/rejected": -600.3011067708334, + "loss": 0.0533, + "rewards/chosen": 0.7291600704193115, + "rewards/margins": 16.810881058375042, + "rewards/rejected": -16.08172098795573, + "step": 950 + }, + { + "epoch": 0.17377798081315668, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.59612349389599e-08, + "logits/chosen": -105409781.33333333, + "logits/rejected": -149054643.2, + "logps/chosen": -286.63006591796875, + "logps/rejected": -603.205810546875, + "loss": 0.0209, + "rewards/chosen": 1.2430238723754883, + "rewards/margins": 17.49993762969971, + "rewards/rejected": -16.25691375732422, + "step": 951 + }, + { + "epoch": 0.17396071265417998, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 7.296050758254958e-08, + "logits/chosen": -182349728.0, + "logits/rejected": -83408800.0, + "logps/chosen": -252.39126586914062, + "logps/rejected": -286.2412109375, + "loss": 0.0322, + "rewards/chosen": 1.3992063999176025, + "rewards/margins": 12.779823541641235, + "rewards/rejected": -11.380617141723633, + "step": 952 + }, + { + "epoch": 0.17414344449520328, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 7.001981464747565e-08, + "logits/chosen": -115355317.33333333, + "logits/rejected": -120137120.0, + "logps/chosen": -204.5675252278646, + "logps/rejected": -474.7695617675781, + "loss": 0.054, + "rewards/chosen": 1.2149845759073894, + "rewards/margins": 12.64708391825358, + "rewards/rejected": -11.432099342346191, + "step": 953 + }, + { + "epoch": 0.17432617633622657, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.713919196515317e-08, + "logits/chosen": -292417678.2222222, + "logits/rejected": -116003730.28571428, + "logps/chosen": -288.4884982638889, + "logps/rejected": -447.37357003348217, + "loss": 0.0477, + "rewards/chosen": 0.3485690752665202, + "rewards/margins": 11.075014954521542, + "rewards/rejected": -10.726445879255023, + "step": 954 + }, + { + "epoch": 0.1745089081772499, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 6.431867463506047e-08, + "logits/chosen": -148253056.0, + "logits/rejected": -147771366.4, + "logps/chosen": -159.89567057291666, + "logps/rejected": -421.28388671875, + "loss": 0.0145, + "rewards/chosen": 2.7151616414388022, + "rewards/margins": 14.609912618001303, + "rewards/rejected": -11.8947509765625, + "step": 955 + }, + { + "epoch": 0.1746916400182732, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 6.15582970243117e-08, + "logits/chosen": -120547413.33333333, + "logits/rejected": -130748081.23076923, + "logps/chosen": -156.70390828450522, + "logps/rejected": -436.7726487379808, + "loss": 0.0117, + "rewards/chosen": 1.1276240348815918, + "rewards/margins": 12.053142364208515, + "rewards/rejected": -10.925518329326923, + "step": 956 + }, + { + "epoch": 0.1748743718592965, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.8858092767236084e-08, + "logits/chosen": -137473298.2857143, + "logits/rejected": -106112483.55555555, + "logps/chosen": -175.62911551339286, + "logps/rejected": -506.5353190104167, + "loss": 0.0342, + "rewards/chosen": 0.8658638000488281, + "rewards/margins": 14.739746517605251, + "rewards/rejected": -13.873882717556423, + "step": 957 + }, + { + "epoch": 0.1750571037003198, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.621809476497098e-08, + "logits/chosen": -71784652.8, + "logits/rejected": -104337450.66666667, + "logps/chosen": -280.3009521484375, + "logps/rejected": -482.8302001953125, + "loss": 0.0499, + "rewards/chosen": 0.8810896873474121, + "rewards/margins": 12.185458024342855, + "rewards/rejected": -11.304368336995443, + "step": 958 + }, + { + "epoch": 0.1752398355413431, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.363833518505834e-08, + "logits/chosen": -125506897.45454545, + "logits/rejected": -134266137.6, + "logps/chosen": -160.6038263494318, + "logps/rejected": -368.881884765625, + "loss": 0.0611, + "rewards/chosen": 2.426489223133434, + "rewards/margins": 12.36475979198109, + "rewards/rejected": -9.938270568847656, + "step": 959 + }, + { + "epoch": 0.1754225673823664, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5.111884546105506e-08, + "logits/chosen": -123255877.81818181, + "logits/rejected": -67134214.4, + "logps/chosen": -206.2481356534091, + "logps/rejected": -374.956494140625, + "loss": 0.0334, + "rewards/chosen": 1.4032635255293413, + "rewards/margins": 12.480571417375044, + "rewards/rejected": -11.077307891845702, + "step": 960 + }, + { + "epoch": 0.1756052992233897, + "grad_norm": 8.25, + "kl": 0.38125038146972656, + "learning_rate": 4.865965629214819e-08, + "logits/chosen": -102042825.14285715, + "logits/rejected": -160889571.55555555, + "logps/chosen": -220.74609375, + "logps/rejected": -319.73328993055554, + "loss": 0.0228, + "rewards/chosen": 2.110917363848005, + "rewards/margins": 13.496859611026824, + "rewards/rejected": -11.38594224717882, + "step": 961 + }, + { + "epoch": 0.175788031064413, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 4.626079764278202e-08, + "logits/chosen": -163792524.8, + "logits/rejected": -70263104.0, + "logps/chosen": -138.14810791015626, + "logps/rejected": -370.1144612630208, + "loss": 0.0362, + "rewards/chosen": 1.6900953292846679, + "rewards/margins": 12.384384218851725, + "rewards/rejected": -10.694288889567057, + "step": 962 + }, + { + "epoch": 0.17597076290543628, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 4.392229874229159e-08, + "logits/chosen": -102111499.63636364, + "logits/rejected": -55827699.2, + "logps/chosen": -162.14665083451703, + "logps/rejected": -419.233984375, + "loss": 0.0421, + "rewards/chosen": 1.0444441708651455, + "rewards/margins": 14.94115895357999, + "rewards/rejected": -13.896714782714843, + "step": 963 + }, + { + "epoch": 0.17615349474645958, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 4.164418808454806e-08, + "logits/chosen": -77416192.0, + "logits/rejected": -61707104.0, + "logps/chosen": -207.54170735677084, + "logps/rejected": -559.2511858258929, + "loss": 0.0422, + "rewards/chosen": 0.770388232337104, + "rewards/margins": 14.184279615916903, + "rewards/rejected": -13.413891383579799, + "step": 964 + }, + { + "epoch": 0.17633622658748288, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 3.9426493427611177e-08, + "logits/chosen": -157423890.2857143, + "logits/rejected": -220221340.44444445, + "logps/chosen": -160.7808837890625, + "logps/rejected": -614.3823784722222, + "loss": 0.016, + "rewards/chosen": 1.958127430507115, + "rewards/margins": 16.585573923020135, + "rewards/rejected": -14.627446492513021, + "step": 965 + }, + { + "epoch": 0.17651895842850618, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 3.726924179339009e-08, + "logits/chosen": -123437856.0, + "logits/rejected": -125337120.0, + "logps/chosen": -206.15565490722656, + "logps/rejected": -503.1556701660156, + "loss": 0.0374, + "rewards/chosen": 1.5190833806991577, + "rewards/margins": 16.500202536582947, + "rewards/rejected": -14.981119155883789, + "step": 966 + }, + { + "epoch": 0.17670169026952948, + "grad_norm": 8.875, + "kl": 0.8326950073242188, + "learning_rate": 3.517245946731529e-08, + "logits/chosen": -114172986.18181819, + "logits/rejected": -94427865.6, + "logps/chosen": -212.80129172585228, + "logps/rejected": -677.412744140625, + "loss": 0.0334, + "rewards/chosen": 2.252361297607422, + "rewards/margins": 22.89334945678711, + "rewards/rejected": -20.640988159179688, + "step": 967 + }, + { + "epoch": 0.17688442211055277, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 3.313617199801777e-08, + "logits/chosen": -73932859.42857143, + "logits/rejected": -61112547.55555555, + "logps/chosen": -227.86368233816964, + "logps/rejected": -434.884521484375, + "loss": 0.0192, + "rewards/chosen": 2.3274552481515065, + "rewards/margins": 15.403009989904978, + "rewards/rejected": -13.075554741753471, + "step": 968 + }, + { + "epoch": 0.17706715395157607, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 3.1160404197018155e-08, + "logits/chosen": -124147620.57142857, + "logits/rejected": -140997162.66666666, + "logps/chosen": -276.0143345424107, + "logps/rejected": -492.7414279513889, + "loss": 0.0254, + "rewards/chosen": 1.3807251793997628, + "rewards/margins": 13.570798828488304, + "rewards/rejected": -12.190073649088541, + "step": 969 + }, + { + "epoch": 0.17724988579259937, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 2.9245180138423033e-08, + "logits/chosen": -130854641.77777778, + "logits/rejected": -179948233.14285713, + "logps/chosen": -311.44859483506946, + "logps/rejected": -320.3370884486607, + "loss": 0.0237, + "rewards/chosen": 1.6633758544921875, + "rewards/margins": 10.89337158203125, + "rewards/rejected": -9.229995727539062, + "step": 970 + }, + { + "epoch": 0.17743261763362267, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 2.7390523158633552e-08, + "logits/chosen": -130772736.0, + "logits/rejected": -126951936.0, + "logps/chosen": -203.17518199573863, + "logps/rejected": -502.421923828125, + "loss": 0.0331, + "rewards/chosen": 1.66065822948109, + "rewards/margins": 13.068156398426403, + "rewards/rejected": -11.407498168945313, + "step": 971 + }, + { + "epoch": 0.17761534947464597, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 2.5596455856058966e-08, + "logits/chosen": -104344742.4, + "logits/rejected": -146081226.66666666, + "logps/chosen": -201.5380615234375, + "logps/rejected": -519.0411376953125, + "loss": 0.0395, + "rewards/chosen": 1.0642199516296387, + "rewards/margins": 12.530484040578207, + "rewards/rejected": -11.466264088948568, + "step": 972 + }, + { + "epoch": 0.17779808131566927, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 2.386300009084408e-08, + "logits/chosen": -95280585.14285715, + "logits/rejected": -51022030.222222224, + "logps/chosen": -277.43748256138394, + "logps/rejected": -390.54155815972223, + "loss": 0.0449, + "rewards/chosen": -0.03016693251473563, + "rewards/margins": 11.362126361756099, + "rewards/rejected": -11.392293294270834, + "step": 973 + }, + { + "epoch": 0.17798081315669256, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 2.219017698460002e-08, + "logits/chosen": -98496369.77777778, + "logits/rejected": -128479250.28571428, + "logps/chosen": -289.107421875, + "logps/rejected": -680.5495256696429, + "loss": 0.0326, + "rewards/chosen": 1.2313110563490126, + "rewards/margins": 14.794613232688299, + "rewards/rejected": -13.563302176339286, + "step": 974 + }, + { + "epoch": 0.17816354499771586, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 2.057800692014833e-08, + "logits/chosen": -119788761.6, + "logits/rejected": -125884113.45454545, + "logps/chosen": -159.14873046875, + "logps/rejected": -403.8385120738636, + "loss": 0.0136, + "rewards/chosen": 1.9524654388427733, + "rewards/margins": 14.323005710948598, + "rewards/rejected": -12.370540272105824, + "step": 975 + }, + { + "epoch": 0.17834627683873916, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 1.9026509541272276e-08, + "logits/chosen": -86371416.0, + "logits/rejected": -68331456.0, + "logps/chosen": -187.97874450683594, + "logps/rejected": -419.40631103515625, + "loss": 0.0345, + "rewards/chosen": 1.0748865604400635, + "rewards/margins": 11.12924313545227, + "rewards/rejected": -10.054356575012207, + "step": 976 + }, + { + "epoch": 0.17852900867976246, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 1.753570375247815e-08, + "logits/chosen": -165643154.2857143, + "logits/rejected": -89813767.1111111, + "logps/chosen": -190.14842006138392, + "logps/rejected": -346.63487413194446, + "loss": 0.0251, + "rewards/chosen": 1.0717768669128418, + "rewards/margins": 13.072163422902426, + "rewards/rejected": -12.000386555989584, + "step": 977 + }, + { + "epoch": 0.17871174052078576, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 1.610560771876435e-08, + "logits/chosen": -159755840.0, + "logits/rejected": -87472008.0, + "logps/chosen": -259.26617431640625, + "logps/rejected": -457.3736877441406, + "loss": 0.0196, + "rewards/chosen": 2.1195902824401855, + "rewards/margins": 14.56647253036499, + "rewards/rejected": -12.446882247924805, + "step": 978 + }, + { + "epoch": 0.17889447236180905, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.4736238865398766e-08, + "logits/chosen": -143704394.66666666, + "logits/rejected": -74377120.0, + "logps/chosen": -151.11344401041666, + "logps/rejected": -479.10546875, + "loss": 0.0096, + "rewards/chosen": 2.665674845377604, + "rewards/margins": 18.766756693522137, + "rewards/rejected": -16.10108184814453, + "step": 979 + }, + { + "epoch": 0.17907720420283235, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 1.3427613877709523e-08, + "logits/chosen": -114779936.0, + "logits/rejected": -157222368.0, + "logps/chosen": -164.35972595214844, + "logps/rejected": -479.4175720214844, + "loss": 0.0301, + "rewards/chosen": 1.49855637550354, + "rewards/margins": 14.379334688186646, + "rewards/rejected": -12.880778312683105, + "step": 980 + }, + { + "epoch": 0.17925993604385565, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 1.2179748700879013e-08, + "logits/chosen": -106339245.71428572, + "logits/rejected": -127074631.1111111, + "logps/chosen": -205.08851841517858, + "logps/rejected": -356.88441297743054, + "loss": 0.0326, + "rewards/chosen": 1.0071250370570592, + "rewards/margins": 9.746128536406019, + "rewards/rejected": -8.739003499348959, + "step": 981 + }, + { + "epoch": 0.17944266788487895, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 1.0992658539750179e-08, + "logits/chosen": -122615370.66666667, + "logits/rejected": -114689036.8, + "logps/chosen": -237.3461710611979, + "logps/rejected": -536.707177734375, + "loss": 0.01, + "rewards/chosen": 2.4158143997192383, + "rewards/margins": 19.563568305969238, + "rewards/rejected": -17.14775390625, + "step": 982 + }, + { + "epoch": 0.17962539972590225, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.866357858642206e-09, + "logits/chosen": -110671305.14285715, + "logits/rejected": -84856789.33333333, + "logps/chosen": -195.31680733816964, + "logps/rejected": -467.71440972222223, + "loss": 0.026, + "rewards/chosen": 1.2106787817818778, + "rewards/margins": 14.199694996788388, + "rewards/rejected": -12.98901621500651, + "step": 983 + }, + { + "epoch": 0.17980813156692554, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 8.800860381173448e-09, + "logits/chosen": -139253708.8, + "logits/rejected": -49995552.0, + "logps/chosen": -191.4254638671875, + "logps/rejected": -626.2918701171875, + "loss": 0.0254, + "rewards/chosen": 2.1544660568237304, + "rewards/margins": 14.762238883972168, + "rewards/rejected": -12.607772827148438, + "step": 984 + }, + { + "epoch": 0.17999086340794884, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.796179090094891e-09, + "logits/chosen": -108942370.9090909, + "logits/rejected": -101120435.2, + "logps/chosen": -319.0049937855114, + "logps/rejected": -436.4533203125, + "loss": 0.0415, + "rewards/chosen": 1.259160041809082, + "rewards/margins": 15.15600757598877, + "rewards/rejected": -13.896847534179688, + "step": 985 + }, + { + "epoch": 0.18017359524897214, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 6.852326227130835e-09, + "logits/chosen": -144265113.6, + "logits/rejected": -158144917.33333334, + "logps/chosen": -233.412744140625, + "logps/rejected": -490.7886962890625, + "loss": 0.0562, + "rewards/chosen": 1.8684804916381836, + "rewards/margins": 14.685390281677247, + "rewards/rejected": -12.816909790039062, + "step": 986 + }, + { + "epoch": 0.18035632708999544, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5.969313292830126e-09, + "logits/chosen": -57477110.85714286, + "logits/rejected": -106314332.44444445, + "logps/chosen": -183.83255440848214, + "logps/rejected": -390.5205891927083, + "loss": 0.0264, + "rewards/chosen": 1.1666227068219865, + "rewards/margins": 13.685809786357577, + "rewards/rejected": -12.519187079535591, + "step": 987 + }, + { + "epoch": 0.18053905893101874, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5.147151046426824e-09, + "logits/chosen": -94735772.44444445, + "logits/rejected": -178242230.85714287, + "logps/chosen": -160.60297309027777, + "logps/rejected": -512.9553571428571, + "loss": 0.0346, + "rewards/chosen": 1.1115425957573786, + "rewards/margins": 13.592544313461062, + "rewards/rejected": -12.481001717703682, + "step": 988 + }, + { + "epoch": 0.18072179077204203, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.385849505708084e-09, + "logits/chosen": -140820522.66666666, + "logits/rejected": -220402560.0, + "logps/chosen": -202.892822265625, + "logps/rejected": -550.2410888671875, + "loss": 0.053, + "rewards/chosen": 0.865626335144043, + "rewards/margins": 15.90500545501709, + "rewards/rejected": -15.039379119873047, + "step": 989 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 3.685417946894254e-09, + "logits/chosen": -170398370.9090909, + "logits/rejected": -101677996.8, + "logps/chosen": -343.3392888849432, + "logps/rejected": -567.5314453125, + "loss": 0.0481, + "rewards/chosen": 1.5802607102827593, + "rewards/margins": 11.90514855818315, + "rewards/rejected": -10.32488784790039, + "step": 990 + }, + { + "epoch": 0.18108725445408863, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 3.0458649045211897e-09, + "logits/chosen": -136642459.42857143, + "logits/rejected": -187004359.1111111, + "logps/chosen": -167.53274972098214, + "logps/rejected": -480.55322265625, + "loss": 0.0272, + "rewards/chosen": 0.9585488183157784, + "rewards/margins": 12.868013336544944, + "rewards/rejected": -11.909464518229166, + "step": 991 + }, + { + "epoch": 0.18126998629511193, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.4671981713420003e-09, + "logits/chosen": -152808202.66666666, + "logits/rejected": -183498208.0, + "logps/chosen": -333.4043375651042, + "logps/rejected": -544.4529418945312, + "loss": 0.0528, + "rewards/chosen": 0.9341098467508951, + "rewards/margins": 15.779031912485758, + "rewards/rejected": -14.844922065734863, + "step": 992 + }, + { + "epoch": 0.18145271813613523, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 1.9494247982282386e-09, + "logits/chosen": -86402440.0, + "logits/rejected": -104357824.0, + "logps/chosen": -191.80682373046875, + "logps/rejected": -478.8999938964844, + "loss": 0.0278, + "rewards/chosen": 1.7965669631958008, + "rewards/margins": 15.134602546691895, + "rewards/rejected": -13.338035583496094, + "step": 993 + }, + { + "epoch": 0.18163544997715853, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 1.4925510940844157e-09, + "logits/chosen": -121498218.66666667, + "logits/rejected": -131954329.6, + "logps/chosen": -259.2331136067708, + "logps/rejected": -438.349560546875, + "loss": 0.0147, + "rewards/chosen": 2.4098734855651855, + "rewards/margins": 13.69671277999878, + "rewards/rejected": -11.286839294433594, + "step": 994 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 1.096582625772502e-09, + "logits/chosen": -87850464.0, + "logits/rejected": -160351840.0, + "logps/chosen": -299.846923828125, + "logps/rejected": -473.148681640625, + "loss": 0.0224, + "rewards/chosen": 1.613604187965393, + "rewards/margins": 13.807512879371643, + "rewards/rejected": -12.19390869140625, + "step": 995 + }, + { + "epoch": 0.18200091365920512, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 7.615242180436521e-10, + "logits/chosen": -129980558.22222222, + "logits/rejected": -180486308.57142857, + "logps/chosen": -184.26015896267361, + "logps/rejected": -561.9503696986607, + "loss": 0.0417, + "rewards/chosen": 0.6634567048814561, + "rewards/margins": 16.412671967158243, + "rewards/rejected": -15.749215262276786, + "step": 996 + }, + { + "epoch": 0.18218364550022842, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 4.87379953478806e-10, + "logits/chosen": -148970892.8, + "logits/rejected": -124454677.33333333, + "logps/chosen": -265.249853515625, + "logps/rejected": -401.5243326822917, + "loss": 0.0401, + "rewards/chosen": 1.052725124359131, + "rewards/margins": 13.833533891042075, + "rewards/rejected": -12.780808766682943, + "step": 997 + }, + { + "epoch": 0.18236637734125172, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 2.741531724392843e-10, + "logits/chosen": -88060437.33333333, + "logits/rejected": -123385499.42857143, + "logps/chosen": -283.5769314236111, + "logps/rejected": -511.84849330357144, + "loss": 0.0548, + "rewards/chosen": 0.5869033601548936, + "rewards/margins": 9.429970529344347, + "rewards/rejected": -8.843067169189453, + "step": 998 + }, + { + "epoch": 0.18254910918227502, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 1.2184647302626585e-10, + "logits/chosen": -145105720.8888889, + "logits/rejected": -205659026.2857143, + "logps/chosen": -258.89881727430554, + "logps/rejected": -383.13619559151783, + "loss": 0.0313, + "rewards/chosen": 1.457052018907335, + "rewards/margins": 12.775982508583674, + "rewards/rejected": -11.318930489676339, + "step": 999 + }, + { + "epoch": 0.1827318410232983, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 3.0461711048035415e-11, + "logits/chosen": -100680520.0, + "logits/rejected": -101818096.0, + "logps/chosen": -195.02532958984375, + "logps/rejected": -360.4969482421875, + "loss": 0.0364, + "rewards/chosen": 1.308640718460083, + "rewards/margins": 10.778756380081177, + "rewards/rejected": -9.470115661621094, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}