{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 65.065, "epoch": 0.02, "grad_norm": 10.375, "kl": 0.0006580278992169042, "learning_rate": 5.000000000000001e-07, "loss": -0.0, "match_ratio": 0.995, "reward": 0.8990143708884716, "reward_std": 0.46338544798083603, "rewards/reward_func": 0.8990143708884716, "step": 100 }, { "completion_length": 65.4625, "epoch": 0.04, "grad_norm": 6.875, "kl": 0.0006705577400316542, "learning_rate": 1.0000000000000002e-06, "loss": -0.0, "match_ratio": 1.0, "reward": 0.7165287194028497, "reward_std": 0.40350831425283107, "rewards/reward_func": 0.7165287194028497, "step": 200 }, { "completion_length": 61.175, "epoch": 0.06, "grad_norm": 8.4375, "kl": 0.0007126682825037278, "learning_rate": 1.5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7286543997749686, "reward_std": 0.4331769395247102, "rewards/reward_func": 0.7286543997749686, "step": 300 }, { "completion_length": 54.655, "epoch": 0.08, "grad_norm": 11.9375, "kl": 0.0008965998092025984, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7439908282458783, "reward_std": 0.4567913323547691, "rewards/reward_func": 0.7439908282458783, "step": 400 }, { "completion_length": 62.1025, "epoch": 0.1, "grad_norm": 15.8125, "kl": 0.0019434646295849235, "learning_rate": 2.5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8668537394329906, "reward_std": 0.3836937860772014, "rewards/reward_func": 0.8668537394329906, "step": 500 }, { "completion_length": 58.14, "epoch": 0.12, "grad_norm": 20.5, "kl": 0.0041620647069066765, "learning_rate": 3e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7577041421830654, "reward_std": 0.43559244139119985, "rewards/reward_func": 0.7577041421830654, "step": 600 }, { "completion_length": 60.0, "epoch": 0.14, "grad_norm": 10.6875, "kl": 0.008898616410442628, "learning_rate": 3.5e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8700515530258417, "reward_std": 0.45400316243059935, "rewards/reward_func": 0.8700515530258417, "step": 700 }, { "completion_length": 58.1275, "epoch": 0.16, "grad_norm": 7.375, "kl": 0.0189549465168966, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7298830785602332, "reward_std": 0.4231558512337506, "rewards/reward_func": 0.7298830785602332, "step": 800 }, { "completion_length": 55.55, "epoch": 0.18, "grad_norm": 13.3125, "kl": 0.04016699714120477, "learning_rate": 4.5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8830712201073766, "reward_std": 0.3306662117503583, "rewards/reward_func": 0.8830712201073766, "step": 900 }, { "completion_length": 55.0225, "epoch": 0.2, "grad_norm": 10.5625, "kl": 0.06088939258828759, "learning_rate": 5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7573206969350577, "reward_std": 0.32937701970338823, "rewards/reward_func": 0.7573206969350577, "step": 1000 }, { "completion_length": 59.6125, "epoch": 0.22, "grad_norm": 11.0625, "kl": 0.03887372653000057, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8389806092530488, "reward_std": 0.3803154364787042, "rewards/reward_func": 0.8389806092530488, "step": 1100 }, { "completion_length": 62.01, "epoch": 0.24, "grad_norm": 8.75, "kl": 0.9295957709802315, "learning_rate": 4.993910125649561e-06, "loss": 0.0001, "match_ratio": 0.9925, "reward": 0.8083837843686342, "reward_std": 0.4002057794481516, "rewards/reward_func": 0.8083837843686342, "step": 1200 }, { "completion_length": 61.6575, "epoch": 0.26, "grad_norm": 12.8125, "kl": 0.5295558683061973, "learning_rate": 4.986304738420684e-06, "loss": 0.0001, "match_ratio": 0.9925, "reward": 0.8700573812425136, "reward_std": 0.41881847178563475, "rewards/reward_func": 0.8700573812425136, "step": 1300 }, { "completion_length": 55.82, "epoch": 0.28, "grad_norm": 21.0, "kl": 0.2268725570756942, "learning_rate": 4.975670171853926e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7677264379709959, "reward_std": 0.4127253815624863, "rewards/reward_func": 0.7677264379709959, "step": 1400 }, { "completion_length": 61.605, "epoch": 0.3, "grad_norm": 15.25, "kl": 2.662307023219764, "learning_rate": 4.962019382530521e-06, "loss": 0.0003, "match_ratio": 0.9975, "reward": 0.8300903634727002, "reward_std": 0.330243071205914, "rewards/reward_func": 0.8300903634727002, "step": 1500 }, { "completion_length": 57.8975, "epoch": 0.32, "grad_norm": 24.25, "kl": 0.9898469369392842, "learning_rate": 4.9453690018345144e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8029376929998397, "reward_std": 0.3678751669218764, "rewards/reward_func": 0.8029376929998397, "step": 1600 }, { "completion_length": 52.7, "epoch": 0.34, "grad_norm": 3.453125, "kl": 116.13146778639405, "learning_rate": 4.925739315689991e-06, "loss": 0.0116, "match_ratio": 1.0, "reward": 0.8077524190768599, "reward_std": 0.33210189862176775, "rewards/reward_func": 0.8077524190768599, "step": 1700 }, { "completion_length": 54.5, "epoch": 0.36, "grad_norm": 12.25, "kl": 5.921828000650276, "learning_rate": 4.903154239845798e-06, "loss": 0.0006, "match_ratio": 1.0, "reward": 0.8151757456362247, "reward_std": 0.3074088580603711, "rewards/reward_func": 0.8151757456362247, "step": 1800 }, { "completion_length": 59.43, "epoch": 0.38, "grad_norm": 22.25, "kl": 0.22536800906993448, "learning_rate": 4.8776412907378845e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8543619333952666, "reward_std": 0.3683507715538144, "rewards/reward_func": 0.8543619333952666, "step": 1900 }, { "completion_length": 59.705, "epoch": 0.4, "grad_norm": 5.21875, "kl": 0.2842459925811272, "learning_rate": 4.849231551964771e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.9203101838380099, "reward_std": 0.3454422113858163, "rewards/reward_func": 0.9203101838380099, "step": 2000 }, { "completion_length": 65.095, "epoch": 0.42, "grad_norm": 9.125, "kl": 0.16623427679762245, "learning_rate": 4.817959636416969e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.9261023019999266, "reward_std": 0.3223581924289465, "rewards/reward_func": 0.9261023019999266, "step": 2100 }, { "completion_length": 62.8125, "epoch": 0.44, "grad_norm": 11.0625, "kl": 0.10455320389475674, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.9122521196585148, "reward_std": 0.37498483614996075, "rewards/reward_func": 0.9122521196585148, "step": 2200 }, { "completion_length": 71.1275, "epoch": 0.46, "grad_norm": 12.5, "kl": 0.5584425710327924, "learning_rate": 4.746985115747918e-06, "loss": 0.0001, "match_ratio": 0.9925, "reward": 0.792227897644043, "reward_std": 0.4075765323080123, "rewards/reward_func": 0.792227897644043, "step": 2300 }, { "completion_length": 68.27, "epoch": 0.48, "grad_norm": 10.625, "kl": 1.3814555319957436, "learning_rate": 4.707368982147318e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.8301830168347806, "reward_std": 0.36762866189703347, "rewards/reward_func": 0.8301830168347806, "step": 2400 }, { "completion_length": 66.965, "epoch": 0.5, "grad_norm": 12.6875, "kl": 0.8518368338712026, "learning_rate": 4.665063509461098e-06, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.8910126995295287, "reward_std": 0.3965667562186718, "rewards/reward_func": 0.8910126995295287, "step": 2500 }, { "completion_length": 70.73, "epoch": 0.52, "grad_norm": 17.75, "kl": 0.5207290647923947, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "match_ratio": 0.9925, "reward": 0.8480577088147402, "reward_std": 0.4258584909327328, "rewards/reward_func": 0.8480577088147402, "step": 2600 }, { "completion_length": 59.6575, "epoch": 0.54, "grad_norm": 11.375, "kl": 0.7336286423553247, "learning_rate": 4.572593931387604e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.928214335795492, "reward_std": 0.37614597208797934, "rewards/reward_func": 0.928214335795492, "step": 2700 }, { "completion_length": 63.8775, "epoch": 0.56, "grad_norm": 9.5625, "kl": 0.33108121431432663, "learning_rate": 4.522542485937369e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7988891634345054, "reward_std": 0.36210680682212115, "rewards/reward_func": 0.7988891634345054, "step": 2800 }, { "completion_length": 59.67, "epoch": 0.58, "grad_norm": 8.75, "kl": 72.35348623547704, "learning_rate": 4.470026884016805e-06, "loss": 0.0072, "match_ratio": 0.9975, "reward": 0.8135686150938273, "reward_std": 0.3942835557647049, "rewards/reward_func": 0.8135686150938273, "step": 2900 }, { "completion_length": 56.305, "epoch": 0.6, "grad_norm": 10.0, "kl": 0.14451225536875426, "learning_rate": 4.415111107797445e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8231964718922973, "reward_std": 0.3244973301887512, "rewards/reward_func": 0.8231964718922973, "step": 3000 }, { "completion_length": 59.705, "epoch": 0.62, "grad_norm": 10.0, "kl": 0.16642916494980453, "learning_rate": 4.357862063693486e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8778230049461127, "reward_std": 0.36886056323535743, "rewards/reward_func": 0.8778230049461127, "step": 3100 }, { "completion_length": 56.71, "epoch": 0.64, "grad_norm": 8.875, "kl": 0.18644527865573765, "learning_rate": 4.2983495008466285e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 0.7874290134198964, "reward_std": 0.38642477702349426, "rewards/reward_func": 0.7874290134198964, "step": 3200 }, { "completion_length": 56.68, "epoch": 0.66, "grad_norm": 27.0, "kl": 0.22375864623580127, "learning_rate": 4.236645926147493e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8582109183818102, "reward_std": 0.36429890371393414, "rewards/reward_func": 0.8582109183818102, "step": 3300 }, { "completion_length": 58.835, "epoch": 0.68, "grad_norm": 6.125, "kl": 0.22189826945774258, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7524572538957, "reward_std": 0.35272345967590807, "rewards/reward_func": 0.7524572538957, "step": 3400 }, { "completion_length": 50.8375, "epoch": 0.7, "grad_norm": 14.375, "kl": 0.9133326725219376, "learning_rate": 4.106969024216348e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8736884651333093, "reward_std": 0.3174928646720946, "rewards/reward_func": 0.8736884651333093, "step": 3500 }, { "completion_length": 60.4325, "epoch": 0.72, "grad_norm": 12.125, "kl": 0.16217968232464045, "learning_rate": 4.039153688314146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.9595573445409536, "reward_std": 0.34502996982075274, "rewards/reward_func": 0.9595573445409536, "step": 3600 }, { "completion_length": 63.43, "epoch": 0.74, "grad_norm": 12.9375, "kl": 0.290018264092505, "learning_rate": 3.969463130731183e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 0.7895570612326265, "reward_std": 0.35805795643478633, "rewards/reward_func": 0.7895570612326265, "step": 3700 }, { "completion_length": 55.66, "epoch": 0.76, "grad_norm": 9.0, "kl": 0.19286280857399107, "learning_rate": 3.897982258676867e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8514765882119536, "reward_std": 0.40829260389087724, "rewards/reward_func": 0.8514765882119536, "step": 3800 }, { "completion_length": 64.0375, "epoch": 0.78, "grad_norm": 6.9375, "kl": 0.3412795978039503, "learning_rate": 3.824798160583012e-06, "loss": 0.0, "match_ratio": 0.9925, "reward": 0.9143854442238808, "reward_std": 0.3976023513358086, "rewards/reward_func": 0.9143854442238808, "step": 3900 }, { "completion_length": 63.335, "epoch": 0.8, "grad_norm": 11.375, "kl": 0.27103981951251627, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7887253789231181, "reward_std": 0.4162597674317658, "rewards/reward_func": 0.7887253789231181, "step": 4000 }, { "completion_length": 59.8575, "epoch": 0.82, "grad_norm": 6.59375, "kl": 0.7605254784226417, "learning_rate": 3.6736789069647273e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.9478698487579823, "reward_std": 0.30987203000113367, "rewards/reward_func": 0.9478698487579823, "step": 4100 }, { "completion_length": 58.2175, "epoch": 0.84, "grad_norm": 12.3125, "kl": 1.3949576319474728, "learning_rate": 3.595927866972694e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9541199389472603, "reward_std": 0.3034708809526637, "rewards/reward_func": 0.9541199389472603, "step": 4200 }, { "completion_length": 63.415, "epoch": 0.86, "grad_norm": 27.75, "kl": 0.45249753130599857, "learning_rate": 3.516841607689501e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8886630642414093, "reward_std": 0.36593056879937647, "rewards/reward_func": 0.8886630642414093, "step": 4300 }, { "completion_length": 55.02, "epoch": 0.88, "grad_norm": 11.25, "kl": 68.24156057231593, "learning_rate": 3.436516483539781e-06, "loss": 0.0068, "match_ratio": 1.0, "reward": 0.866313117146492, "reward_std": 0.35739596346393226, "rewards/reward_func": 0.866313117146492, "step": 4400 }, { "completion_length": 61.2375, "epoch": 0.9, "grad_norm": 21.5, "kl": 0.557325184418587, "learning_rate": 3.3550503583141726e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.845514679402113, "reward_std": 0.3533631762489676, "rewards/reward_func": 0.845514679402113, "step": 4500 }, { "completion_length": 61.4575, "epoch": 0.92, "grad_norm": 10.9375, "kl": 0.31026113393716515, "learning_rate": 3.272542485937369e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8414725087583065, "reward_std": 0.31504234885796906, "rewards/reward_func": 0.8414725087583065, "step": 4600 }, { "completion_length": 55.6725, "epoch": 0.94, "grad_norm": 15.875, "kl": 0.25192020772024987, "learning_rate": 3.189093389542498e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.934078385848552, "reward_std": 0.3096505870204419, "rewards/reward_func": 0.934078385848552, "step": 4700 }, { "completion_length": 57.29, "epoch": 0.96, "grad_norm": 8.75, "kl": 0.6216541412565857, "learning_rate": 3.1048047389991693e-06, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.8389100107550621, "reward_std": 0.3749863849021494, "rewards/reward_func": 0.8389100107550621, "step": 4800 }, { "completion_length": 61.2525, "epoch": 0.98, "grad_norm": 8.125, "kl": 0.342362194955349, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8242513693869113, "reward_std": 0.36022061900235713, "rewards/reward_func": 0.8242513693869113, "step": 4900 }, { "completion_length": 59.7125, "epoch": 1.0, "grad_norm": 17.5, "kl": 0.41126792770577597, "learning_rate": 2.9341204441673267e-06, "loss": 0.0, "match_ratio": 0.9925, "reward": 0.8274249080568552, "reward_std": 0.4075367634743452, "rewards/reward_func": 0.8274249080568552, "step": 5000 }, { "completion_length": 60.6025, "epoch": 1.02, "grad_norm": 12.375, "kl": 0.23068872857838868, "learning_rate": 2.847932752400164e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8065017646364868, "reward_std": 0.36464907992631196, "rewards/reward_func": 0.8065017646364868, "step": 5100 }, { "completion_length": 59.0775, "epoch": 1.04, "grad_norm": 20.25, "kl": 0.42417401013895867, "learning_rate": 2.761321158169134e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7185550931096077, "reward_std": 0.40595400186255576, "rewards/reward_func": 0.7185550931096077, "step": 5200 }, { "completion_length": 63.42, "epoch": 1.06, "grad_norm": 23.875, "kl": 0.3724514145217836, "learning_rate": 2.6743911843603134e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.7712494351714849, "reward_std": 0.37587333597242834, "rewards/reward_func": 0.7712494351714849, "step": 5300 }, { "completion_length": 60.4, "epoch": 1.08, "grad_norm": 13.0, "kl": 0.41718232361599805, "learning_rate": 2.587248741756253e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8315356434136629, "reward_std": 0.35958164227195083, "rewards/reward_func": 0.8315356434136629, "step": 5400 }, { "completion_length": 64.535, "epoch": 1.1, "grad_norm": 13.0, "kl": 0.426719272416085, "learning_rate": 2.5e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8871817947924137, "reward_std": 0.35918335968628523, "rewards/reward_func": 0.8871817947924137, "step": 5500 }, { "completion_length": 63.825, "epoch": 1.12, "grad_norm": 12.6875, "kl": 10.818288787528873, "learning_rate": 2.4127512582437486e-06, "loss": 0.0011, "match_ratio": 0.9925, "reward": 0.8072922784090042, "reward_std": 0.41990237571299077, "rewards/reward_func": 0.8072922784090042, "step": 5600 }, { "completion_length": 61.5425, "epoch": 1.1400000000000001, "grad_norm": 8.5625, "kl": 1.7851288786903023, "learning_rate": 2.325608815639687e-06, "loss": 0.0002, "match_ratio": 0.9975, "reward": 0.7871765466406941, "reward_std": 0.3983499974012375, "rewards/reward_func": 0.7871765466406941, "step": 5700 }, { "completion_length": 64.99, "epoch": 1.16, "grad_norm": 83.5, "kl": 1.0691816475684754, "learning_rate": 2.238678841830867e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9227658536192029, "reward_std": 0.3650888724066317, "rewards/reward_func": 0.9227658536192029, "step": 5800 }, { "completion_length": 65.0375, "epoch": 1.18, "grad_norm": 9.8125, "kl": 0.38561805644072594, "learning_rate": 2.1520672475998374e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8525355974957347, "reward_std": 0.36985819303430617, "rewards/reward_func": 0.8525355974957347, "step": 5900 }, { "completion_length": 62.8275, "epoch": 1.2, "grad_norm": 9.3125, "kl": 0.3610994891449809, "learning_rate": 2.0658795558326745e-06, "loss": 0.0, "match_ratio": 0.995, "reward": 1.0021917837299406, "reward_std": 0.3685089880321175, "rewards/reward_func": 1.0021917837299406, "step": 6000 }, { "completion_length": 67.415, "epoch": 1.22, "grad_norm": 14.0, "kl": 0.48120407085865735, "learning_rate": 1.9802207729556023e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.980966807603836, "reward_std": 0.33038541514426467, "rewards/reward_func": 0.980966807603836, "step": 6100 }, { "completion_length": 61.275, "epoch": 1.24, "grad_norm": 17.125, "kl": 1.394161350093782, "learning_rate": 1.895195261000831e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8998113541305065, "reward_std": 0.37138622866943477, "rewards/reward_func": 0.8998113541305065, "step": 6200 }, { "completion_length": 61.4575, "epoch": 1.26, "grad_norm": 9.5625, "kl": 0.4891889825835824, "learning_rate": 1.8109066104575023e-06, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7821180200204253, "reward_std": 0.4117224833089858, "rewards/reward_func": 0.7821180200204253, "step": 6300 }, { "completion_length": 62.535, "epoch": 1.28, "grad_norm": 17.75, "kl": 0.701120622754097, "learning_rate": 1.7274575140626318e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.8505088457465172, "reward_std": 0.35196221828460694, "rewards/reward_func": 0.8505088457465172, "step": 6400 }, { "completion_length": 61.0425, "epoch": 1.3, "grad_norm": 10.4375, "kl": 0.8444961504405364, "learning_rate": 1.6449496416858285e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8533294384181499, "reward_std": 0.35340342290699484, "rewards/reward_func": 0.8533294384181499, "step": 6500 }, { "completion_length": 66.6525, "epoch": 1.32, "grad_norm": 16.375, "kl": 0.6673866561800241, "learning_rate": 1.56348351646022e-06, "loss": 0.0001, "match_ratio": 0.9925, "reward": 0.962776445467025, "reward_std": 0.39187521073035897, "rewards/reward_func": 0.962776445467025, "step": 6600 }, { "completion_length": 64.05, "epoch": 1.34, "grad_norm": 15.25, "kl": 0.5449268382415176, "learning_rate": 1.4831583923105e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.9354583528265357, "reward_std": 0.3569179131626152, "rewards/reward_func": 0.9354583528265357, "step": 6700 }, { "completion_length": 64.9625, "epoch": 1.3599999999999999, "grad_norm": 12.875, "kl": 0.9086799253150821, "learning_rate": 1.4040721330273063e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8598837627470494, "reward_std": 0.36035713417921217, "rewards/reward_func": 0.8598837627470494, "step": 6800 }, { "completion_length": 65.365, "epoch": 1.38, "grad_norm": 16.5, "kl": 0.5496124785766006, "learning_rate": 1.3263210930352737e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8178367885202169, "reward_std": 0.37116443024016915, "rewards/reward_func": 0.8178367885202169, "step": 6900 }, { "completion_length": 62.0475, "epoch": 1.4, "grad_norm": 26.875, "kl": 0.549699901342392, "learning_rate": 1.2500000000000007e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.9533282884210348, "reward_std": 0.3430164767615497, "rewards/reward_func": 0.9533282884210348, "step": 7000 }, { "completion_length": 60.79, "epoch": 1.42, "grad_norm": 13.25, "kl": 0.5086184279620647, "learning_rate": 1.1752018394169882e-06, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.8821756513416767, "reward_std": 0.3568952218815684, "rewards/reward_func": 0.8821756513416767, "step": 7100 }, { "completion_length": 62.3475, "epoch": 1.44, "grad_norm": 21.125, "kl": 0.8897788706421852, "learning_rate": 1.1020177413231334e-06, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.934985687956214, "reward_std": 0.32632746720686556, "rewards/reward_func": 0.934985687956214, "step": 7200 }, { "completion_length": 56.98, "epoch": 1.46, "grad_norm": 35.25, "kl": 0.7370296374708414, "learning_rate": 1.0305368692688175e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8605528651922941, "reward_std": 0.3732724652206525, "rewards/reward_func": 0.8605528651922941, "step": 7300 }, { "completion_length": 57.71, "epoch": 1.48, "grad_norm": 14.5, "kl": 0.7560949631407857, "learning_rate": 9.608463116858544e-07, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.8051678024046123, "reward_std": 0.3447662947047502, "rewards/reward_func": 0.8051678024046123, "step": 7400 }, { "completion_length": 69.1925, "epoch": 1.5, "grad_norm": 29.5, "kl": 0.6943446175381541, "learning_rate": 8.930309757836517e-07, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8813337843865157, "reward_std": 0.3512991077173501, "rewards/reward_func": 0.8813337843865157, "step": 7500 }, { "completion_length": 62.1225, "epoch": 1.52, "grad_norm": 11.25, "kl": 0.3682367965579033, "learning_rate": 8.271734841028553e-07, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8769916776567698, "reward_std": 0.35788299994543193, "rewards/reward_func": 0.8769916776567698, "step": 7600 }, { "completion_length": 62.5475, "epoch": 1.54, "grad_norm": 10.125, "kl": 0.35848211450036616, "learning_rate": 7.633540738525066e-07, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.9400549785792828, "reward_std": 0.36338255695067345, "rewards/reward_func": 0.9400549785792828, "step": 7700 }, { "completion_length": 61.32, "epoch": 1.56, "grad_norm": 13.625, "kl": 0.5323085347935558, "learning_rate": 7.016504991533727e-07, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.8762814123183489, "reward_std": 0.35265143546042965, "rewards/reward_func": 0.8762814123183489, "step": 7800 }, { "completion_length": 66.315, "epoch": 1.58, "grad_norm": 18.375, "kl": 0.40489839322865007, "learning_rate": 6.421379363065142e-07, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8989605332165956, "reward_std": 0.3447486224025488, "rewards/reward_func": 0.8989605332165956, "step": 7900 }, { "completion_length": 60.5325, "epoch": 1.6, "grad_norm": 11.75, "kl": 0.690227730597835, "learning_rate": 5.848888922025553e-07, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.8210568431764841, "reward_std": 0.3149324245750904, "rewards/reward_func": 0.8210568431764841, "step": 8000 }, { "completion_length": 66.7825, "epoch": 1.62, "grad_norm": 13.4375, "kl": 0.516472494918853, "learning_rate": 5.299731159831953e-07, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.8683985948190093, "reward_std": 0.345150127671659, "rewards/reward_func": 0.8683985948190093, "step": 8100 }, { "completion_length": 65.72, "epoch": 1.6400000000000001, "grad_norm": 12.875, "kl": 0.4457222482562065, "learning_rate": 4.774575140626317e-07, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.7730556976422668, "reward_std": 0.41231788201257585, "rewards/reward_func": 0.7730556976422668, "step": 8200 }, { "completion_length": 64.4875, "epoch": 1.6600000000000001, "grad_norm": 9.6875, "kl": 0.5901041788049042, "learning_rate": 4.27406068612396e-07, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.8864369177818299, "reward_std": 0.3609216751717031, "rewards/reward_func": 0.8864369177818299, "step": 8300 }, { "completion_length": 59.98, "epoch": 1.6800000000000002, "grad_norm": 30.5, "kl": 0.3067289407923818, "learning_rate": 3.798797596089351e-07, "loss": 0.0, "match_ratio": 0.9975, "reward": 0.8906753876060247, "reward_std": 0.2947716296184808, "rewards/reward_func": 0.8906753876060247, "step": 8400 }, { "completion_length": 68.6625, "epoch": 1.7, "grad_norm": 24.0, "kl": 2.096909821406007, "learning_rate": 3.3493649053890325e-07, "loss": 0.0002, "match_ratio": 1.0, "reward": 0.9327631609933451, "reward_std": 0.3544263231381774, "rewards/reward_func": 0.9327631609933451, "step": 8500 }, { "completion_length": 66.0125, "epoch": 1.72, "grad_norm": 10.9375, "kl": 0.5123670964688063, "learning_rate": 2.9263101785268253e-07, "loss": 0.0001, "match_ratio": 0.995, "reward": 0.9494618388265371, "reward_std": 0.35082788893952965, "rewards/reward_func": 0.9494618388265371, "step": 8600 }, { "completion_length": 57.8475, "epoch": 1.74, "grad_norm": 15.8125, "kl": 1.2104839562997223, "learning_rate": 2.53014884252083e-07, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9292678725533188, "reward_std": 0.3112880502641201, "rewards/reward_func": 0.9292678725533188, "step": 8700 }, { "completion_length": 63.66, "epoch": 1.76, "grad_norm": 10.9375, "kl": 0.5559730716235936, "learning_rate": 2.1613635589349756e-07, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.9810863409936428, "reward_std": 0.36776383105432614, "rewards/reward_func": 0.9810863409936428, "step": 8800 }, { "completion_length": 59.8675, "epoch": 1.78, "grad_norm": 13.25, "kl": 29.805520134083928, "learning_rate": 1.8204036358303173e-07, "loss": 0.003, "match_ratio": 1.0, "reward": 0.8572433185577393, "reward_std": 0.3437820218596607, "rewards/reward_func": 0.8572433185577393, "step": 8900 }, { "completion_length": 60.3075, "epoch": 1.8, "grad_norm": 9.1875, "kl": 0.8873812770657241, "learning_rate": 1.507684480352292e-07, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.919341038018465, "reward_std": 0.33141862623393537, "rewards/reward_func": 0.919341038018465, "step": 9000 }, { "completion_length": 62.265, "epoch": 1.8199999999999998, "grad_norm": 12.125, "kl": 0.6630043520405888, "learning_rate": 1.223587092621162e-07, "loss": 0.0001, "match_ratio": 0.9975, "reward": 1.029205017723143, "reward_std": 0.3310457341000438, "rewards/reward_func": 1.029205017723143, "step": 9100 }, { "completion_length": 62.0575, "epoch": 1.8399999999999999, "grad_norm": 11.0, "kl": 0.5343834590911866, "learning_rate": 9.684576015420277e-08, "loss": 0.0001, "match_ratio": 0.9975, "reward": 0.8843235304579139, "reward_std": 0.35643110671080647, "rewards/reward_func": 0.8843235304579139, "step": 9200 }, { "completion_length": 64.51, "epoch": 1.8599999999999999, "grad_norm": 13.0, "kl": 0.47331944581121205, "learning_rate": 7.426068431000883e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.8205329022929072, "reward_std": 0.33430186320096256, "rewards/reward_func": 0.8205329022929072, "step": 9300 }, { "completion_length": 64.395, "epoch": 1.88, "grad_norm": 42.0, "kl": 1.8421870478987694, "learning_rate": 5.463099816548578e-08, "loss": 0.0002, "match_ratio": 0.9975, "reward": 0.9012074111029506, "reward_std": 0.33403355406597257, "rewards/reward_func": 0.9012074111029506, "step": 9400 }, { "completion_length": 61.4425, "epoch": 1.9, "grad_norm": 12.0, "kl": 0.5331659988686442, "learning_rate": 3.798061746947995e-08, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.7935629660636186, "reward_std": 0.3497585416212678, "rewards/reward_func": 0.7935629660636186, "step": 9500 }, { "completion_length": 51.77, "epoch": 1.92, "grad_norm": 10.875, "kl": 0.7931823456101119, "learning_rate": 2.4329828146074096e-08, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9315874481201172, "reward_std": 0.44919231578707697, "rewards/reward_func": 0.9315874481201172, "step": 9600 }, { "completion_length": 57.5275, "epoch": 1.94, "grad_norm": 13.9375, "kl": 0.6070253856666387, "learning_rate": 1.3695261579316776e-08, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9431364990770816, "reward_std": 0.3222667661588639, "rewards/reward_func": 0.9431364990770816, "step": 9700 }, { "completion_length": 59.37, "epoch": 1.96, "grad_norm": 14.875, "kl": 0.5334895004890859, "learning_rate": 6.089874350439507e-09, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.9747491884231567, "reward_std": 0.29319573145825417, "rewards/reward_func": 0.9747491884231567, "step": 9800 }, { "completion_length": 63.385, "epoch": 1.98, "grad_norm": 12.125, "kl": 0.4961644561961293, "learning_rate": 1.5229324522605949e-09, "loss": 0.0, "match_ratio": 0.995, "reward": 0.8896720813587308, "reward_std": 0.4128014264255762, "rewards/reward_func": 0.8896720813587308, "step": 9900 }, { "completion_length": 67.28, "epoch": 2.0, "grad_norm": 10.8125, "kl": 0.30117323972284793, "learning_rate": 0.0, "loss": 0.0, "match_ratio": 0.995, "reward": 0.7791323178261519, "reward_std": 0.35405952845700084, "rewards/reward_func": 0.7791323178261519, "step": 10000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }