{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 100, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013333333333333334, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": 1803143936.0, "logps/chosen": -1764.083251953125, "loss": 0.632, "rewards/chosen": -0.6753141283988953, "step": 1 }, { "epoch": 0.13333333333333333, "grad_norm": 77.0, "kl": 2.966403007507324, "learning_rate": 9.88e-07, "logits/chosen": 1835470165.3333333, "logps/chosen": -1562.392795138889, "loss": 0.5078, "rewards/chosen": 0.2530868848164876, "step": 10 }, { "epoch": 0.26666666666666666, "grad_norm": 70.0, "kl": 17.817913055419922, "learning_rate": 9.746666666666666e-07, "logits/chosen": 1839740518.4, "logps/chosen": -1660.749609375, "loss": 0.4871, "rewards/chosen": 1.8496942520141602, "step": 20 }, { "epoch": 0.4, "grad_norm": 63.0, "kl": 30.11983871459961, "learning_rate": 9.613333333333334e-07, "logits/chosen": 1844791091.2, "logps/chosen": -1590.46630859375, "loss": 0.5174, "rewards/chosen": 2.9328380584716798, "step": 30 }, { "epoch": 0.5333333333333333, "grad_norm": 44.0, "kl": 39.447776794433594, "learning_rate": 9.479999999999999e-07, "logits/chosen": 1830737728.2397003, "logits/rejected": 1802774180.2264152, "logps/chosen": -1604.0685861423221, "logps/rejected": -1642.3963738207547, "loss": 0.497, "rewards/chosen": 4.055163894253277, "rewards/margins": 0.6231006295864137, "rewards/rejected": 3.4320632646668634, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 61.0, "kl": 27.023143768310547, "learning_rate": 9.346666666666666e-07, "logits/rejected": 1793442406.4, "logps/rejected": -1498.494921875, "loss": 0.4883, "rewards/rejected": 2.6893266677856444, "step": 50 }, { "epoch": 0.8, "grad_norm": 70.5, "kl": 16.976736068725586, "learning_rate": 9.213333333333333e-07, "logits/rejected": 1816138956.8, "logps/rejected": -1576.112890625, "loss": 0.4765, "rewards/rejected": 1.6485807418823242, "step": 60 }, { "epoch": 0.9333333333333333, "grad_norm": 78.0, "kl": 3.4582889080047607, "learning_rate": 9.08e-07, "logits/rejected": 1791525068.8, "logps/rejected": -1609.57080078125, "loss": 0.4898, "rewards/rejected": 0.33603610992431643, "step": 70 }, { "epoch": 1.0666666666666667, "grad_norm": 96.0, "kl": 4.4611639976501465, "learning_rate": 8.946666666666667e-07, "logits/chosen": 1793128038.4, "logits/rejected": 1786466099.2, "logps/chosen": -1679.2375, "logps/rejected": -1366.859375, "loss": 0.4474, "rewards/chosen": 0.8079164505004883, "rewards/margins": 1.4511647701263428, "rewards/rejected": -0.6432483196258545, "step": 80 }, { "epoch": 1.2, "grad_norm": 73.0, "kl": 20.16000747680664, "learning_rate": 8.813333333333332e-07, "logits/chosen": 1788976128.0, "logps/chosen": -1553.37900390625, "loss": 0.4946, "rewards/chosen": 2.074075126647949, "step": 90 }, { "epoch": 1.3333333333333333, "grad_norm": 55.25, "kl": 30.880172729492188, "learning_rate": 8.68e-07, "logits/chosen": 1800670822.4, "logps/chosen": -1629.187109375, "loss": 0.4936, "rewards/chosen": 3.1399845123291015, "step": 100 }, { "epoch": 1.4666666666666668, "grad_norm": 47.75, "kl": 42.36182403564453, "learning_rate": 8.546666666666666e-07, "logits/chosen": 1819451801.6, "logps/chosen": -1631.21865234375, "loss": 0.4832, "rewards/chosen": 4.322412109375, "step": 110 }, { "epoch": 1.6, "grad_norm": 65.0, "kl": 29.110525131225586, "learning_rate": 8.413333333333333e-07, "logits/chosen": 1786882220.2616823, "logits/rejected": 1767476998.0093896, "logps/chosen": -1454.260660046729, "logps/rejected": -1486.950850938967, "loss": 0.4809, "rewards/chosen": 4.8924052871276285, "rewards/margins": 2.9423321882151434, "rewards/rejected": 1.9500730989124853, "step": 120 }, { "epoch": 1.7333333333333334, "grad_norm": 68.0, "kl": 10.5321044921875, "learning_rate": 8.28e-07, "logits/rejected": 1757802086.4, "logps/rejected": -1584.4416015625, "loss": 0.4952, "rewards/rejected": 1.1045079231262207, "step": 130 }, { "epoch": 1.8666666666666667, "grad_norm": 74.5, "kl": 2.656236171722412, "learning_rate": 8.146666666666666e-07, "logits/rejected": 1784456601.6, "logps/rejected": -1606.92255859375, "loss": 0.4411, "rewards/rejected": 0.04395853877067566, "step": 140 }, { "epoch": 2.0, "grad_norm": 47.25, "kl": 0.02235652133822441, "learning_rate": 8.013333333333333e-07, "logits/rejected": 1755790950.4, "logps/rejected": -1492.91025390625, "loss": 0.3227, "rewards/rejected": -0.965308952331543, "step": 150 }, { "epoch": 2.1333333333333333, "grad_norm": 74.5, "kl": 22.667367935180664, "learning_rate": 7.88e-07, "logits/chosen": 1760644710.4, "logps/chosen": -1561.63623046875, "loss": 0.5121, "rewards/chosen": 2.2527976989746095, "step": 160 }, { "epoch": 2.2666666666666666, "grad_norm": 75.5, "kl": 35.21966552734375, "learning_rate": 7.746666666666666e-07, "logits/chosen": 1774286233.6, "logps/chosen": -1643.762109375, "loss": 0.4987, "rewards/chosen": 3.5484432220458983, "step": 170 }, { "epoch": 2.4, "grad_norm": 52.25, "kl": 43.11100387573242, "learning_rate": 7.613333333333333e-07, "logits/chosen": 1786122035.2, "logps/chosen": -1576.08076171875, "loss": 0.4918, "rewards/chosen": 4.371388244628906, "step": 180 }, { "epoch": 2.533333333333333, "grad_norm": 49.75, "kl": 43.39256286621094, "learning_rate": 7.48e-07, "logits/chosen": 1780707128.5692885, "logits/rejected": 1751267212.0754716, "logps/chosen": -1593.653675093633, "logps/rejected": -1665.3343160377358, "loss": 0.481, "rewards/chosen": 5.09666597173455, "rewards/margins": 3.9583941361195003, "rewards/rejected": 1.13827183561505, "step": 190 }, { "epoch": 2.6666666666666665, "grad_norm": 63.5, "kl": 5.4629011154174805, "learning_rate": 7.346666666666666e-07, "logits/rejected": 1737874841.6, "logps/rejected": -1520.10458984375, "loss": 0.4785, "rewards/rejected": 0.5283474445343017, "step": 200 }, { "epoch": 2.8, "grad_norm": 59.25, "kl": 1.2280102968215942, "learning_rate": 7.213333333333334e-07, "logits/rejected": 1761185177.6, "logps/rejected": -1594.97919921875, "loss": 0.4105, "rewards/rejected": -0.23803796768188476, "step": 210 }, { "epoch": 2.9333333333333336, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 7.079999999999999e-07, "logits/rejected": 1738660864.0, "logps/rejected": -1625.093359375, "loss": 0.2885, "rewards/rejected": -1.2162075996398927, "step": 220 }, { "epoch": 3.066666666666667, "grad_norm": 75.5, "kl": 14.01048469543457, "learning_rate": 6.946666666666666e-07, "logits/chosen": 1750113075.2, "logits/rejected": 1743708160.0, "logps/chosen": -1658.2826171875, "logps/rejected": -1375.04150390625, "loss": 0.3769, "rewards/chosen": 2.9034093856811523, "rewards/margins": 4.364873313903809, "rewards/rejected": -1.4614639282226562, "step": 230 }, { "epoch": 3.2, "grad_norm": 63.0, "kl": 37.20917510986328, "learning_rate": 6.813333333333333e-07, "logits/chosen": 1746606899.2, "logps/chosen": -1536.5771484375, "loss": 0.4982, "rewards/chosen": 3.75426025390625, "step": 240 }, { "epoch": 3.3333333333333335, "grad_norm": 54.5, "kl": 43.48841094970703, "learning_rate": 6.68e-07, "logits/chosen": 1757109657.6, "logps/chosen": -1617.16201171875, "loss": 0.5003, "rewards/chosen": 4.342501831054688, "step": 250 }, { "epoch": 3.466666666666667, "grad_norm": 52.5, "kl": 51.4334716796875, "learning_rate": 6.546666666666665e-07, "logits/chosen": 1780435148.8, "logps/chosen": -1622.2716796875, "loss": 0.4796, "rewards/chosen": 5.217120742797851, "step": 260 }, { "epoch": 3.6, "grad_norm": 67.5, "kl": 19.67037582397461, "learning_rate": 6.413333333333333e-07, "logits/chosen": 1755938050.3925233, "logits/rejected": 1727550343.8122065, "logps/chosen": -1448.7593457943926, "logps/rejected": -1504.9633215962442, "loss": 0.4334, "rewards/chosen": 5.442524277161215, "rewards/margins": 5.293697337761961, "rewards/rejected": 0.1488269393992536, "step": 270 }, { "epoch": 3.7333333333333334, "grad_norm": 51.75, "kl": 0.6852197647094727, "learning_rate": 6.28e-07, "logits/rejected": 1722981785.6, "logps/rejected": -1600.43251953125, "loss": 0.3926, "rewards/rejected": -0.49457273483276365, "step": 280 }, { "epoch": 3.8666666666666667, "grad_norm": 49.0, "kl": 0.2839541435241699, "learning_rate": 6.146666666666667e-07, "logits/rejected": 1752589516.8, "logps/rejected": -1619.22822265625, "loss": 0.2889, "rewards/rejected": -1.1866175651550293, "step": 290 }, { "epoch": 4.0, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 6.013333333333334e-07, "logits/rejected": 1728595353.6, "logps/rejected": -1498.5271484375, "loss": 0.2549, "rewards/rejected": -1.526987361907959, "step": 300 }, { "epoch": 4.133333333333334, "grad_norm": 54.0, "kl": 39.12127685546875, "learning_rate": 5.879999999999999e-07, "logits/chosen": 1737360384.0, "logps/chosen": -1544.43232421875, "loss": 0.4935, "rewards/chosen": 3.9731983184814452, "step": 310 }, { "epoch": 4.266666666666667, "grad_norm": 172.0, "kl": 46.1120491027832, "learning_rate": 5.746666666666667e-07, "logits/chosen": 1747944038.4, "logps/chosen": -1632.6375, "loss": 0.4933, "rewards/chosen": 4.660909652709961, "step": 320 }, { "epoch": 4.4, "grad_norm": 60.0, "kl": 50.143592834472656, "learning_rate": 5.613333333333333e-07, "logits/chosen": 1759613952.0, "logps/chosen": -1568.59072265625, "loss": 0.4785, "rewards/chosen": 5.1204078674316404, "step": 330 }, { "epoch": 4.533333333333333, "grad_norm": 42.5, "kl": 45.67478942871094, "learning_rate": 5.48e-07, "logits/chosen": 1756831406.5018728, "logits/rejected": 1722636635.7735848, "logps/chosen": -1589.1077949438202, "logps/rejected": -1681.454304245283, "loss": 0.4447, "rewards/chosen": 5.551258558637641, "rewards/margins": 6.024997390931457, "rewards/rejected": -0.47373883229381636, "step": 340 }, { "epoch": 4.666666666666667, "grad_norm": 217.0, "kl": 0.32102876901626587, "learning_rate": 5.346666666666666e-07, "logits/rejected": 1712375193.6, "logps/rejected": -1534.0259765625, "loss": 0.338, "rewards/rejected": -0.8637893676757813, "step": 350 }, { "epoch": 4.8, "grad_norm": 47.0, "kl": 0.029415320605039597, "learning_rate": 5.213333333333333e-07, "logits/rejected": 1739532697.6, "logps/rejected": -1605.47685546875, "loss": 0.2836, "rewards/rejected": -1.2877922058105469, "step": 360 }, { "epoch": 4.933333333333334, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5.079999999999999e-07, "logits/rejected": 1720070963.2, "logps/rejected": -1630.8001953125, "loss": 0.2262, "rewards/rejected": -1.7868902206420898, "step": 370 }, { "epoch": 5.066666666666666, "grad_norm": 70.5, "kl": 21.522884368896484, "learning_rate": 4.946666666666666e-07, "logits/chosen": 1738568908.8, "logits/rejected": 1726313472.0, "logps/chosen": -1644.7123046875, "logps/rejected": -1377.4642578125, "loss": 0.3804, "rewards/chosen": 4.260452651977539, "rewards/margins": 5.9641773223876955, "rewards/rejected": -1.7037246704101563, "step": 380 }, { "epoch": 5.2, "grad_norm": 49.75, "kl": 47.77645492553711, "learning_rate": 4.813333333333334e-07, "logits/chosen": 1732489625.6, "logps/chosen": -1525.93984375, "loss": 0.4944, "rewards/chosen": 4.817991256713867, "step": 390 }, { "epoch": 5.333333333333333, "grad_norm": 50.25, "kl": 49.042144775390625, "learning_rate": 4.68e-07, "logits/chosen": 1741244620.8, "logps/chosen": -1609.87353515625, "loss": 0.4644, "rewards/chosen": 5.071358489990234, "step": 400 }, { "epoch": 5.466666666666667, "grad_norm": 48.0, "kl": 55.76483154296875, "learning_rate": 4.5466666666666666e-07, "logits/chosen": 1760807936.0, "logps/chosen": -1618.9923828125, "loss": 0.5002, "rewards/chosen": 5.545055770874024, "step": 410 }, { "epoch": 5.6, "grad_norm": 50.0, "kl": 18.410531997680664, "learning_rate": 4.413333333333333e-07, "logits/chosen": 1736117544.672897, "logits/rejected": 1708876583.6619718, "logps/chosen": -1447.1404789719627, "logps/rejected": -1516.6236795774648, "loss": 0.3554, "rewards/chosen": 5.60440890588493, "rewards/margins": 6.6215995436077515, "rewards/rejected": -1.0171906377228213, "step": 420 }, { "epoch": 5.733333333333333, "grad_norm": 39.25, "kl": 0.028980541974306107, "learning_rate": 4.2799999999999997e-07, "logits/rejected": 1704340684.8, "logps/rejected": -1610.3271484375, "loss": 0.269, "rewards/rejected": -1.4840303421020509, "step": 430 }, { "epoch": 5.866666666666667, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 4.146666666666667e-07, "logits/rejected": 1737012224.0, "logps/rejected": -1625.72900390625, "loss": 0.2277, "rewards/rejected": -1.8366947174072266, "step": 440 }, { "epoch": 6.0, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 4.0133333333333333e-07, "logits/rejected": 1715832217.6, "logps/rejected": -1501.8396484375, "loss": 0.2283, "rewards/rejected": -1.8582498550415039, "step": 450 }, { "epoch": 6.133333333333334, "grad_norm": 55.25, "kl": 47.387760162353516, "learning_rate": 3.88e-07, "logits/chosen": 1729806540.8, "logps/chosen": -1536.3201171875, "loss": 0.4921, "rewards/chosen": 4.784424209594727, "step": 460 }, { "epoch": 6.266666666666667, "grad_norm": 77.0, "kl": 51.74030685424805, "learning_rate": 3.7466666666666663e-07, "logits/chosen": 1736568832.0, "logps/chosen": -1627.24794921875, "loss": 0.4934, "rewards/chosen": 5.199858093261719, "step": 470 }, { "epoch": 6.4, "grad_norm": 61.75, "kl": 53.5655517578125, "learning_rate": 3.6133333333333334e-07, "logits/chosen": 1746720563.2, "logps/chosen": -1565.3529296875, "loss": 0.4801, "rewards/chosen": 5.444185256958008, "step": 480 }, { "epoch": 6.533333333333333, "grad_norm": 40.75, "kl": 47.25954055786133, "learning_rate": 3.4799999999999994e-07, "logits/chosen": 1744727741.8426967, "logits/rejected": 1706023105.2075472, "logps/chosen": -1587.4740168539327, "logps/rejected": -1689.911704009434, "loss": 0.4287, "rewards/chosen": 5.714643957016619, "rewards/margins": 7.034099805566516, "rewards/rejected": -1.3194558485498968, "step": 490 }, { "epoch": 6.666666666666667, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 3.3466666666666665e-07, "logits/rejected": 1700372172.8, "logps/rejected": -1541.06552734375, "loss": 0.265, "rewards/rejected": -1.5677401542663574, "step": 500 }, { "epoch": 6.8, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 3.2133333333333335e-07, "logits/rejected": 1729151795.2, "logps/rejected": -1609.8298828125, "loss": 0.2389, "rewards/rejected": -1.7230974197387696, "step": 510 }, { "epoch": 6.933333333333334, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 3.08e-07, "logits/rejected": 1712730828.8, "logps/rejected": -1633.16318359375, "loss": 0.2112, "rewards/rejected": -2.023202896118164, "step": 520 }, { "epoch": 7.066666666666666, "grad_norm": 64.5, "kl": 25.04488754272461, "learning_rate": 2.9466666666666666e-07, "logits/chosen": 1731994419.2, "logits/rejected": 1720656691.2, "logps/chosen": -1637.28173828125, "logps/rejected": -1377.9115234375, "loss": 0.3763, "rewards/chosen": 5.003516006469726, "rewards/margins": 6.7519731521606445, "rewards/rejected": -1.748457145690918, "step": 530 }, { "epoch": 7.2, "grad_norm": 56.0, "kl": 52.43037033081055, "learning_rate": 2.813333333333333e-07, "logits/chosen": 1726875648.0, "logps/chosen": -1521.06044921875, "loss": 0.4829, "rewards/chosen": 5.305931854248047, "step": 540 }, { "epoch": 7.333333333333333, "grad_norm": 50.0, "kl": 51.872047424316406, "learning_rate": 2.68e-07, "logits/chosen": 1734085222.4, "logps/chosen": -1607.64345703125, "loss": 0.4712, "rewards/chosen": 5.294354629516602, "step": 550 }, { "epoch": 7.466666666666667, "grad_norm": 51.0, "kl": 55.77549362182617, "learning_rate": 2.546666666666666e-07, "logits/chosen": 1753063219.2, "logps/chosen": -1617.415625, "loss": 0.4773, "rewards/chosen": 5.702725982666015, "step": 560 }, { "epoch": 7.6, "grad_norm": 43.75, "kl": 18.287708282470703, "learning_rate": 2.413333333333333e-07, "logits/chosen": 1731032073.5700934, "logits/rejected": 1702629587.5305164, "logps/chosen": -1447.9690420560748, "logps/rejected": -1521.8135269953052, "loss": 0.3238, "rewards/chosen": 5.521567086193049, "rewards/margins": 7.057752337348419, "rewards/rejected": -1.5361852511553697, "step": 570 }, { "epoch": 7.733333333333333, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 2.28e-07, "logits/rejected": 1697045913.6, "logps/rejected": -1615.4837890625, "loss": 0.2158, "rewards/rejected": -1.9997014999389648, "step": 580 }, { "epoch": 7.866666666666667, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 2.1466666666666666e-07, "logits/rejected": 1732887756.8, "logps/rejected": -1627.6578125, "loss": 0.206, "rewards/rejected": -2.029564094543457, "step": 590 }, { "epoch": 8.0, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 2.0133333333333334e-07, "logits/rejected": 1715685171.2, "logps/rejected": -1501.2095703125, "loss": 0.241, "rewards/rejected": -1.795237922668457, "step": 600 }, { "epoch": 8.133333333333333, "grad_norm": 55.0, "kl": 51.67559051513672, "learning_rate": 1.88e-07, "logits/chosen": 1727099904.0, "logps/chosen": -1532.07939453125, "loss": 0.4961, "rewards/chosen": 5.208480453491211, "step": 610 }, { "epoch": 8.266666666666667, "grad_norm": 64.5, "kl": 54.74528884887695, "learning_rate": 1.7466666666666667e-07, "logits/chosen": 1733115904.0, "logps/chosen": -1624.519921875, "loss": 0.5013, "rewards/chosen": 5.472665786743164, "step": 620 }, { "epoch": 8.4, "grad_norm": 54.75, "kl": 54.551849365234375, "learning_rate": 1.6133333333333332e-07, "logits/chosen": 1742884659.2, "logps/chosen": -1564.97412109375, "loss": 0.4915, "rewards/chosen": 5.482054138183594, "step": 630 }, { "epoch": 8.533333333333333, "grad_norm": 30.625, "kl": 46.306522369384766, "learning_rate": 1.4799999999999998e-07, "logits/chosen": 1739496766.3220973, "logits/rejected": 1702134687.3962264, "logps/chosen": -1588.0999531835207, "logps/rejected": -1696.4740566037735, "loss": 0.4184, "rewards/chosen": 5.652020515127575, "rewards/margins": 7.627730826153518, "rewards/rejected": -1.9757103110259433, "step": 640 }, { "epoch": 8.666666666666666, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 1.3466666666666665e-07, "logits/rejected": 1696173260.8, "logps/rejected": -1544.7572265625, "loss": 0.2282, "rewards/rejected": -1.9369186401367187, "step": 650 }, { "epoch": 8.8, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 1.2133333333333333e-07, "logits/rejected": 1725317120.0, "logps/rejected": -1612.53291015625, "loss": 0.2132, "rewards/rejected": -1.9934148788452148, "step": 660 }, { "epoch": 8.933333333333334, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 1.0799999999999999e-07, "logits/rejected": 1711136563.2, "logps/rejected": -1633.72958984375, "loss": 0.2084, "rewards/rejected": -2.079827880859375, "step": 670 }, { "epoch": 9.066666666666666, "grad_norm": 74.5, "kl": 26.150564193725586, "learning_rate": 9.466666666666665e-08, "logits/chosen": 1734777036.8, "logits/rejected": 1720304844.8, "logps/chosen": -1634.3982421875, "logps/rejected": -1376.32314453125, "loss": 0.3847, "rewards/chosen": 5.2918556213378904, "rewards/margins": 6.881484413146973, "rewards/rejected": -1.5896287918090821, "step": 680 }, { "epoch": 9.2, "grad_norm": 45.0, "kl": 54.32807540893555, "learning_rate": 8.133333333333332e-08, "logits/chosen": 1726023270.4, "logps/chosen": -1519.271875, "loss": 0.4882, "rewards/chosen": 5.484774017333985, "step": 690 }, { "epoch": 9.333333333333334, "grad_norm": 51.75, "kl": 53.640716552734375, "learning_rate": 6.8e-08, "logits/chosen": 1733355929.6, "logps/chosen": -1606.2349609375, "loss": 0.4805, "rewards/chosen": 5.435222625732422, "step": 700 }, { "epoch": 9.466666666666667, "grad_norm": 46.75, "kl": 56.87353515625, "learning_rate": 5.4666666666666666e-08, "logits/chosen": 1751316684.8, "logps/chosen": -1617.39033203125, "loss": 0.4885, "rewards/chosen": 5.705249786376953, "step": 710 }, { "epoch": 9.6, "grad_norm": 38.25, "kl": 17.8724308013916, "learning_rate": 4.133333333333333e-08, "logits/chosen": 1729312212.9345794, "logits/rejected": 1699881844.5821595, "logps/chosen": -1447.3971962616822, "logps/rejected": -1524.6291079812206, "loss": 0.2937, "rewards/chosen": 5.578735921984521, "rewards/margins": 7.396489386947476, "rewards/rejected": -1.8177534649629548, "step": 720 }, { "epoch": 9.733333333333333, "grad_norm": 30.375, "kl": 0.0, "learning_rate": 2.8e-08, "logits/rejected": 1695596134.4, "logps/rejected": -1616.91201171875, "loss": 0.2028, "rewards/rejected": -2.142536735534668, "step": 730 }, { "epoch": 9.866666666666667, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 1.4666666666666666e-08, "logits/rejected": 1732339097.6, "logps/rejected": -1627.45458984375, "loss": 0.211, "rewards/rejected": -2.0092498779296877, "step": 740 }, { "epoch": 10.0, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 1.3333333333333333e-09, "logits/rejected": 1714343731.2, "logps/rejected": -1500.21591796875, "loss": 0.2487, "rewards/rejected": -1.6958515167236328, "step": 750 } ], "logging_steps": 10, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }