| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 65.065, |
| "epoch": 0.02, |
| "grad_norm": 10.375, |
| "kl": 0.0006580278992169042, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": -0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8990143708884716, |
| "reward_std": 0.46338544798083603, |
| "rewards/reward_func": 0.8990143708884716, |
| "step": 100 |
| }, |
| { |
| "completion_length": 65.4625, |
| "epoch": 0.04, |
| "grad_norm": 6.875, |
| "kl": 0.0006705577400316542, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": -0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7165287194028497, |
| "reward_std": 0.40350831425283107, |
| "rewards/reward_func": 0.7165287194028497, |
| "step": 200 |
| }, |
| { |
| "completion_length": 61.175, |
| "epoch": 0.06, |
| "grad_norm": 8.4375, |
| "kl": 0.0007126682825037278, |
| "learning_rate": 1.5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7286543997749686, |
| "reward_std": 0.4331769395247102, |
| "rewards/reward_func": 0.7286543997749686, |
| "step": 300 |
| }, |
| { |
| "completion_length": 54.655, |
| "epoch": 0.08, |
| "grad_norm": 11.9375, |
| "kl": 0.0008965998092025984, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7439908282458783, |
| "reward_std": 0.4567913323547691, |
| "rewards/reward_func": 0.7439908282458783, |
| "step": 400 |
| }, |
| { |
| "completion_length": 62.1025, |
| "epoch": 0.1, |
| "grad_norm": 15.8125, |
| "kl": 0.0019434646295849235, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8668537394329906, |
| "reward_std": 0.3836937860772014, |
| "rewards/reward_func": 0.8668537394329906, |
| "step": 500 |
| }, |
| { |
| "completion_length": 58.14, |
| "epoch": 0.12, |
| "grad_norm": 20.5, |
| "kl": 0.0041620647069066765, |
| "learning_rate": 3e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7577041421830654, |
| "reward_std": 0.43559244139119985, |
| "rewards/reward_func": 0.7577041421830654, |
| "step": 600 |
| }, |
| { |
| "completion_length": 60.0, |
| "epoch": 0.14, |
| "grad_norm": 10.6875, |
| "kl": 0.008898616410442628, |
| "learning_rate": 3.5e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8700515530258417, |
| "reward_std": 0.45400316243059935, |
| "rewards/reward_func": 0.8700515530258417, |
| "step": 700 |
| }, |
| { |
| "completion_length": 58.1275, |
| "epoch": 0.16, |
| "grad_norm": 7.375, |
| "kl": 0.0189549465168966, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7298830785602332, |
| "reward_std": 0.4231558512337506, |
| "rewards/reward_func": 0.7298830785602332, |
| "step": 800 |
| }, |
| { |
| "completion_length": 55.55, |
| "epoch": 0.18, |
| "grad_norm": 13.3125, |
| "kl": 0.04016699714120477, |
| "learning_rate": 4.5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8830712201073766, |
| "reward_std": 0.3306662117503583, |
| "rewards/reward_func": 0.8830712201073766, |
| "step": 900 |
| }, |
| { |
| "completion_length": 55.0225, |
| "epoch": 0.2, |
| "grad_norm": 10.5625, |
| "kl": 0.06088939258828759, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7573206969350577, |
| "reward_std": 0.32937701970338823, |
| "rewards/reward_func": 0.7573206969350577, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 59.6125, |
| "epoch": 0.22, |
| "grad_norm": 11.0625, |
| "kl": 0.03887372653000057, |
| "learning_rate": 4.99847706754774e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8389806092530488, |
| "reward_std": 0.3803154364787042, |
| "rewards/reward_func": 0.8389806092530488, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 62.01, |
| "epoch": 0.24, |
| "grad_norm": 8.75, |
| "kl": 0.9295957709802315, |
| "learning_rate": 4.993910125649561e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9925, |
| "reward": 0.8083837843686342, |
| "reward_std": 0.4002057794481516, |
| "rewards/reward_func": 0.8083837843686342, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 61.6575, |
| "epoch": 0.26, |
| "grad_norm": 12.8125, |
| "kl": 0.5295558683061973, |
| "learning_rate": 4.986304738420684e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9925, |
| "reward": 0.8700573812425136, |
| "reward_std": 0.41881847178563475, |
| "rewards/reward_func": 0.8700573812425136, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 55.82, |
| "epoch": 0.28, |
| "grad_norm": 21.0, |
| "kl": 0.2268725570756942, |
| "learning_rate": 4.975670171853926e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7677264379709959, |
| "reward_std": 0.4127253815624863, |
| "rewards/reward_func": 0.7677264379709959, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 61.605, |
| "epoch": 0.3, |
| "grad_norm": 15.25, |
| "kl": 2.662307023219764, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.9975, |
| "reward": 0.8300903634727002, |
| "reward_std": 0.330243071205914, |
| "rewards/reward_func": 0.8300903634727002, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 57.8975, |
| "epoch": 0.32, |
| "grad_norm": 24.25, |
| "kl": 0.9898469369392842, |
| "learning_rate": 4.9453690018345144e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8029376929998397, |
| "reward_std": 0.3678751669218764, |
| "rewards/reward_func": 0.8029376929998397, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 52.7, |
| "epoch": 0.34, |
| "grad_norm": 3.453125, |
| "kl": 116.13146778639405, |
| "learning_rate": 4.925739315689991e-06, |
| "loss": 0.0116, |
| "match_ratio": 1.0, |
| "reward": 0.8077524190768599, |
| "reward_std": 0.33210189862176775, |
| "rewards/reward_func": 0.8077524190768599, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 54.5, |
| "epoch": 0.36, |
| "grad_norm": 12.25, |
| "kl": 5.921828000650276, |
| "learning_rate": 4.903154239845798e-06, |
| "loss": 0.0006, |
| "match_ratio": 1.0, |
| "reward": 0.8151757456362247, |
| "reward_std": 0.3074088580603711, |
| "rewards/reward_func": 0.8151757456362247, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 59.43, |
| "epoch": 0.38, |
| "grad_norm": 22.25, |
| "kl": 0.22536800906993448, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8543619333952666, |
| "reward_std": 0.3683507715538144, |
| "rewards/reward_func": 0.8543619333952666, |
| "step": 1900 |
| }, |
| { |
| "completion_length": 59.705, |
| "epoch": 0.4, |
| "grad_norm": 5.21875, |
| "kl": 0.2842459925811272, |
| "learning_rate": 4.849231551964771e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.9203101838380099, |
| "reward_std": 0.3454422113858163, |
| "rewards/reward_func": 0.9203101838380099, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 65.095, |
| "epoch": 0.42, |
| "grad_norm": 9.125, |
| "kl": 0.16623427679762245, |
| "learning_rate": 4.817959636416969e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.9261023019999266, |
| "reward_std": 0.3223581924289465, |
| "rewards/reward_func": 0.9261023019999266, |
| "step": 2100 |
| }, |
| { |
| "completion_length": 62.8125, |
| "epoch": 0.44, |
| "grad_norm": 11.0625, |
| "kl": 0.10455320389475674, |
| "learning_rate": 4.783863644106502e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.9122521196585148, |
| "reward_std": 0.37498483614996075, |
| "rewards/reward_func": 0.9122521196585148, |
| "step": 2200 |
| }, |
| { |
| "completion_length": 71.1275, |
| "epoch": 0.46, |
| "grad_norm": 12.5, |
| "kl": 0.5584425710327924, |
| "learning_rate": 4.746985115747918e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9925, |
| "reward": 0.792227897644043, |
| "reward_std": 0.4075765323080123, |
| "rewards/reward_func": 0.792227897644043, |
| "step": 2300 |
| }, |
| { |
| "completion_length": 68.27, |
| "epoch": 0.48, |
| "grad_norm": 10.625, |
| "kl": 1.3814555319957436, |
| "learning_rate": 4.707368982147318e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.8301830168347806, |
| "reward_std": 0.36762866189703347, |
| "rewards/reward_func": 0.8301830168347806, |
| "step": 2400 |
| }, |
| { |
| "completion_length": 66.965, |
| "epoch": 0.5, |
| "grad_norm": 12.6875, |
| "kl": 0.8518368338712026, |
| "learning_rate": 4.665063509461098e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.8910126995295287, |
| "reward_std": 0.3965667562186718, |
| "rewards/reward_func": 0.8910126995295287, |
| "step": 2500 |
| }, |
| { |
| "completion_length": 70.73, |
| "epoch": 0.52, |
| "grad_norm": 17.75, |
| "kl": 0.5207290647923947, |
| "learning_rate": 4.620120240391065e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9925, |
| "reward": 0.8480577088147402, |
| "reward_std": 0.4258584909327328, |
| "rewards/reward_func": 0.8480577088147402, |
| "step": 2600 |
| }, |
| { |
| "completion_length": 59.6575, |
| "epoch": 0.54, |
| "grad_norm": 11.375, |
| "kl": 0.7336286423553247, |
| "learning_rate": 4.572593931387604e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.928214335795492, |
| "reward_std": 0.37614597208797934, |
| "rewards/reward_func": 0.928214335795492, |
| "step": 2700 |
| }, |
| { |
| "completion_length": 63.8775, |
| "epoch": 0.56, |
| "grad_norm": 9.5625, |
| "kl": 0.33108121431432663, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7988891634345054, |
| "reward_std": 0.36210680682212115, |
| "rewards/reward_func": 0.7988891634345054, |
| "step": 2800 |
| }, |
| { |
| "completion_length": 59.67, |
| "epoch": 0.58, |
| "grad_norm": 8.75, |
| "kl": 72.35348623547704, |
| "learning_rate": 4.470026884016805e-06, |
| "loss": 0.0072, |
| "match_ratio": 0.9975, |
| "reward": 0.8135686150938273, |
| "reward_std": 0.3942835557647049, |
| "rewards/reward_func": 0.8135686150938273, |
| "step": 2900 |
| }, |
| { |
| "completion_length": 56.305, |
| "epoch": 0.6, |
| "grad_norm": 10.0, |
| "kl": 0.14451225536875426, |
| "learning_rate": 4.415111107797445e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8231964718922973, |
| "reward_std": 0.3244973301887512, |
| "rewards/reward_func": 0.8231964718922973, |
| "step": 3000 |
| }, |
| { |
| "completion_length": 59.705, |
| "epoch": 0.62, |
| "grad_norm": 10.0, |
| "kl": 0.16642916494980453, |
| "learning_rate": 4.357862063693486e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8778230049461127, |
| "reward_std": 0.36886056323535743, |
| "rewards/reward_func": 0.8778230049461127, |
| "step": 3100 |
| }, |
| { |
| "completion_length": 56.71, |
| "epoch": 0.64, |
| "grad_norm": 8.875, |
| "kl": 0.18644527865573765, |
| "learning_rate": 4.2983495008466285e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.7874290134198964, |
| "reward_std": 0.38642477702349426, |
| "rewards/reward_func": 0.7874290134198964, |
| "step": 3200 |
| }, |
| { |
| "completion_length": 56.68, |
| "epoch": 0.66, |
| "grad_norm": 27.0, |
| "kl": 0.22375864623580127, |
| "learning_rate": 4.236645926147493e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8582109183818102, |
| "reward_std": 0.36429890371393414, |
| "rewards/reward_func": 0.8582109183818102, |
| "step": 3300 |
| }, |
| { |
| "completion_length": 58.835, |
| "epoch": 0.68, |
| "grad_norm": 6.125, |
| "kl": 0.22189826945774258, |
| "learning_rate": 4.172826515897146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7524572538957, |
| "reward_std": 0.35272345967590807, |
| "rewards/reward_func": 0.7524572538957, |
| "step": 3400 |
| }, |
| { |
| "completion_length": 50.8375, |
| "epoch": 0.7, |
| "grad_norm": 14.375, |
| "kl": 0.9133326725219376, |
| "learning_rate": 4.106969024216348e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8736884651333093, |
| "reward_std": 0.3174928646720946, |
| "rewards/reward_func": 0.8736884651333093, |
| "step": 3500 |
| }, |
| { |
| "completion_length": 60.4325, |
| "epoch": 0.72, |
| "grad_norm": 12.125, |
| "kl": 0.16217968232464045, |
| "learning_rate": 4.039153688314146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.9595573445409536, |
| "reward_std": 0.34502996982075274, |
| "rewards/reward_func": 0.9595573445409536, |
| "step": 3600 |
| }, |
| { |
| "completion_length": 63.43, |
| "epoch": 0.74, |
| "grad_norm": 12.9375, |
| "kl": 0.290018264092505, |
| "learning_rate": 3.969463130731183e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.7895570612326265, |
| "reward_std": 0.35805795643478633, |
| "rewards/reward_func": 0.7895570612326265, |
| "step": 3700 |
| }, |
| { |
| "completion_length": 55.66, |
| "epoch": 0.76, |
| "grad_norm": 9.0, |
| "kl": 0.19286280857399107, |
| "learning_rate": 3.897982258676867e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8514765882119536, |
| "reward_std": 0.40829260389087724, |
| "rewards/reward_func": 0.8514765882119536, |
| "step": 3800 |
| }, |
| { |
| "completion_length": 64.0375, |
| "epoch": 0.78, |
| "grad_norm": 6.9375, |
| "kl": 0.3412795978039503, |
| "learning_rate": 3.824798160583012e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9925, |
| "reward": 0.9143854442238808, |
| "reward_std": 0.3976023513358086, |
| "rewards/reward_func": 0.9143854442238808, |
| "step": 3900 |
| }, |
| { |
| "completion_length": 63.335, |
| "epoch": 0.8, |
| "grad_norm": 11.375, |
| "kl": 0.27103981951251627, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7887253789231181, |
| "reward_std": 0.4162597674317658, |
| "rewards/reward_func": 0.7887253789231181, |
| "step": 4000 |
| }, |
| { |
| "completion_length": 59.8575, |
| "epoch": 0.82, |
| "grad_norm": 6.59375, |
| "kl": 0.7605254784226417, |
| "learning_rate": 3.6736789069647273e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.9478698487579823, |
| "reward_std": 0.30987203000113367, |
| "rewards/reward_func": 0.9478698487579823, |
| "step": 4100 |
| }, |
| { |
| "completion_length": 58.2175, |
| "epoch": 0.84, |
| "grad_norm": 12.3125, |
| "kl": 1.3949576319474728, |
| "learning_rate": 3.595927866972694e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9541199389472603, |
| "reward_std": 0.3034708809526637, |
| "rewards/reward_func": 0.9541199389472603, |
| "step": 4200 |
| }, |
| { |
| "completion_length": 63.415, |
| "epoch": 0.86, |
| "grad_norm": 27.75, |
| "kl": 0.45249753130599857, |
| "learning_rate": 3.516841607689501e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8886630642414093, |
| "reward_std": 0.36593056879937647, |
| "rewards/reward_func": 0.8886630642414093, |
| "step": 4300 |
| }, |
| { |
| "completion_length": 55.02, |
| "epoch": 0.88, |
| "grad_norm": 11.25, |
| "kl": 68.24156057231593, |
| "learning_rate": 3.436516483539781e-06, |
| "loss": 0.0068, |
| "match_ratio": 1.0, |
| "reward": 0.866313117146492, |
| "reward_std": 0.35739596346393226, |
| "rewards/reward_func": 0.866313117146492, |
| "step": 4400 |
| }, |
| { |
| "completion_length": 61.2375, |
| "epoch": 0.9, |
| "grad_norm": 21.5, |
| "kl": 0.557325184418587, |
| "learning_rate": 3.3550503583141726e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.845514679402113, |
| "reward_std": 0.3533631762489676, |
| "rewards/reward_func": 0.845514679402113, |
| "step": 4500 |
| }, |
| { |
| "completion_length": 61.4575, |
| "epoch": 0.92, |
| "grad_norm": 10.9375, |
| "kl": 0.31026113393716515, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8414725087583065, |
| "reward_std": 0.31504234885796906, |
| "rewards/reward_func": 0.8414725087583065, |
| "step": 4600 |
| }, |
| { |
| "completion_length": 55.6725, |
| "epoch": 0.94, |
| "grad_norm": 15.875, |
| "kl": 0.25192020772024987, |
| "learning_rate": 3.189093389542498e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.934078385848552, |
| "reward_std": 0.3096505870204419, |
| "rewards/reward_func": 0.934078385848552, |
| "step": 4700 |
| }, |
| { |
| "completion_length": 57.29, |
| "epoch": 0.96, |
| "grad_norm": 8.75, |
| "kl": 0.6216541412565857, |
| "learning_rate": 3.1048047389991693e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.8389100107550621, |
| "reward_std": 0.3749863849021494, |
| "rewards/reward_func": 0.8389100107550621, |
| "step": 4800 |
| }, |
| { |
| "completion_length": 61.2525, |
| "epoch": 0.98, |
| "grad_norm": 8.125, |
| "kl": 0.342362194955349, |
| "learning_rate": 3.019779227044398e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8242513693869113, |
| "reward_std": 0.36022061900235713, |
| "rewards/reward_func": 0.8242513693869113, |
| "step": 4900 |
| }, |
| { |
| "completion_length": 59.7125, |
| "epoch": 1.0, |
| "grad_norm": 17.5, |
| "kl": 0.41126792770577597, |
| "learning_rate": 2.9341204441673267e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9925, |
| "reward": 0.8274249080568552, |
| "reward_std": 0.4075367634743452, |
| "rewards/reward_func": 0.8274249080568552, |
| "step": 5000 |
| }, |
| { |
| "completion_length": 60.6025, |
| "epoch": 1.02, |
| "grad_norm": 12.375, |
| "kl": 0.23068872857838868, |
| "learning_rate": 2.847932752400164e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8065017646364868, |
| "reward_std": 0.36464907992631196, |
| "rewards/reward_func": 0.8065017646364868, |
| "step": 5100 |
| }, |
| { |
| "completion_length": 59.0775, |
| "epoch": 1.04, |
| "grad_norm": 20.25, |
| "kl": 0.42417401013895867, |
| "learning_rate": 2.761321158169134e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7185550931096077, |
| "reward_std": 0.40595400186255576, |
| "rewards/reward_func": 0.7185550931096077, |
| "step": 5200 |
| }, |
| { |
| "completion_length": 63.42, |
| "epoch": 1.06, |
| "grad_norm": 23.875, |
| "kl": 0.3724514145217836, |
| "learning_rate": 2.6743911843603134e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.7712494351714849, |
| "reward_std": 0.37587333597242834, |
| "rewards/reward_func": 0.7712494351714849, |
| "step": 5300 |
| }, |
| { |
| "completion_length": 60.4, |
| "epoch": 1.08, |
| "grad_norm": 13.0, |
| "kl": 0.41718232361599805, |
| "learning_rate": 2.587248741756253e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8315356434136629, |
| "reward_std": 0.35958164227195083, |
| "rewards/reward_func": 0.8315356434136629, |
| "step": 5400 |
| }, |
| { |
| "completion_length": 64.535, |
| "epoch": 1.1, |
| "grad_norm": 13.0, |
| "kl": 0.426719272416085, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8871817947924137, |
| "reward_std": 0.35918335968628523, |
| "rewards/reward_func": 0.8871817947924137, |
| "step": 5500 |
| }, |
| { |
| "completion_length": 63.825, |
| "epoch": 1.12, |
| "grad_norm": 12.6875, |
| "kl": 10.818288787528873, |
| "learning_rate": 2.4127512582437486e-06, |
| "loss": 0.0011, |
| "match_ratio": 0.9925, |
| "reward": 0.8072922784090042, |
| "reward_std": 0.41990237571299077, |
| "rewards/reward_func": 0.8072922784090042, |
| "step": 5600 |
| }, |
| { |
| "completion_length": 61.5425, |
| "epoch": 1.1400000000000001, |
| "grad_norm": 8.5625, |
| "kl": 1.7851288786903023, |
| "learning_rate": 2.325608815639687e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.9975, |
| "reward": 0.7871765466406941, |
| "reward_std": 0.3983499974012375, |
| "rewards/reward_func": 0.7871765466406941, |
| "step": 5700 |
| }, |
| { |
| "completion_length": 64.99, |
| "epoch": 1.16, |
| "grad_norm": 83.5, |
| "kl": 1.0691816475684754, |
| "learning_rate": 2.238678841830867e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9227658536192029, |
| "reward_std": 0.3650888724066317, |
| "rewards/reward_func": 0.9227658536192029, |
| "step": 5800 |
| }, |
| { |
| "completion_length": 65.0375, |
| "epoch": 1.18, |
| "grad_norm": 9.8125, |
| "kl": 0.38561805644072594, |
| "learning_rate": 2.1520672475998374e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8525355974957347, |
| "reward_std": 0.36985819303430617, |
| "rewards/reward_func": 0.8525355974957347, |
| "step": 5900 |
| }, |
| { |
| "completion_length": 62.8275, |
| "epoch": 1.2, |
| "grad_norm": 9.3125, |
| "kl": 0.3610994891449809, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 1.0021917837299406, |
| "reward_std": 0.3685089880321175, |
| "rewards/reward_func": 1.0021917837299406, |
| "step": 6000 |
| }, |
| { |
| "completion_length": 67.415, |
| "epoch": 1.22, |
| "grad_norm": 14.0, |
| "kl": 0.48120407085865735, |
| "learning_rate": 1.9802207729556023e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.980966807603836, |
| "reward_std": 0.33038541514426467, |
| "rewards/reward_func": 0.980966807603836, |
| "step": 6100 |
| }, |
| { |
| "completion_length": 61.275, |
| "epoch": 1.24, |
| "grad_norm": 17.125, |
| "kl": 1.394161350093782, |
| "learning_rate": 1.895195261000831e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8998113541305065, |
| "reward_std": 0.37138622866943477, |
| "rewards/reward_func": 0.8998113541305065, |
| "step": 6200 |
| }, |
| { |
| "completion_length": 61.4575, |
| "epoch": 1.26, |
| "grad_norm": 9.5625, |
| "kl": 0.4891889825835824, |
| "learning_rate": 1.8109066104575023e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7821180200204253, |
| "reward_std": 0.4117224833089858, |
| "rewards/reward_func": 0.7821180200204253, |
| "step": 6300 |
| }, |
| { |
| "completion_length": 62.535, |
| "epoch": 1.28, |
| "grad_norm": 17.75, |
| "kl": 0.701120622754097, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.8505088457465172, |
| "reward_std": 0.35196221828460694, |
| "rewards/reward_func": 0.8505088457465172, |
| "step": 6400 |
| }, |
| { |
| "completion_length": 61.0425, |
| "epoch": 1.3, |
| "grad_norm": 10.4375, |
| "kl": 0.8444961504405364, |
| "learning_rate": 1.6449496416858285e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8533294384181499, |
| "reward_std": 0.35340342290699484, |
| "rewards/reward_func": 0.8533294384181499, |
| "step": 6500 |
| }, |
| { |
| "completion_length": 66.6525, |
| "epoch": 1.32, |
| "grad_norm": 16.375, |
| "kl": 0.6673866561800241, |
| "learning_rate": 1.56348351646022e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9925, |
| "reward": 0.962776445467025, |
| "reward_std": 0.39187521073035897, |
| "rewards/reward_func": 0.962776445467025, |
| "step": 6600 |
| }, |
| { |
| "completion_length": 64.05, |
| "epoch": 1.34, |
| "grad_norm": 15.25, |
| "kl": 0.5449268382415176, |
| "learning_rate": 1.4831583923105e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.9354583528265357, |
| "reward_std": 0.3569179131626152, |
| "rewards/reward_func": 0.9354583528265357, |
| "step": 6700 |
| }, |
| { |
| "completion_length": 64.9625, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 12.875, |
| "kl": 0.9086799253150821, |
| "learning_rate": 1.4040721330273063e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8598837627470494, |
| "reward_std": 0.36035713417921217, |
| "rewards/reward_func": 0.8598837627470494, |
| "step": 6800 |
| }, |
| { |
| "completion_length": 65.365, |
| "epoch": 1.38, |
| "grad_norm": 16.5, |
| "kl": 0.5496124785766006, |
| "learning_rate": 1.3263210930352737e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8178367885202169, |
| "reward_std": 0.37116443024016915, |
| "rewards/reward_func": 0.8178367885202169, |
| "step": 6900 |
| }, |
| { |
| "completion_length": 62.0475, |
| "epoch": 1.4, |
| "grad_norm": 26.875, |
| "kl": 0.549699901342392, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.9533282884210348, |
| "reward_std": 0.3430164767615497, |
| "rewards/reward_func": 0.9533282884210348, |
| "step": 7000 |
| }, |
| { |
| "completion_length": 60.79, |
| "epoch": 1.42, |
| "grad_norm": 13.25, |
| "kl": 0.5086184279620647, |
| "learning_rate": 1.1752018394169882e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.8821756513416767, |
| "reward_std": 0.3568952218815684, |
| "rewards/reward_func": 0.8821756513416767, |
| "step": 7100 |
| }, |
| { |
| "completion_length": 62.3475, |
| "epoch": 1.44, |
| "grad_norm": 21.125, |
| "kl": 0.8897788706421852, |
| "learning_rate": 1.1020177413231334e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.934985687956214, |
| "reward_std": 0.32632746720686556, |
| "rewards/reward_func": 0.934985687956214, |
| "step": 7200 |
| }, |
| { |
| "completion_length": 56.98, |
| "epoch": 1.46, |
| "grad_norm": 35.25, |
| "kl": 0.7370296374708414, |
| "learning_rate": 1.0305368692688175e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8605528651922941, |
| "reward_std": 0.3732724652206525, |
| "rewards/reward_func": 0.8605528651922941, |
| "step": 7300 |
| }, |
| { |
| "completion_length": 57.71, |
| "epoch": 1.48, |
| "grad_norm": 14.5, |
| "kl": 0.7560949631407857, |
| "learning_rate": 9.608463116858544e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.8051678024046123, |
| "reward_std": 0.3447662947047502, |
| "rewards/reward_func": 0.8051678024046123, |
| "step": 7400 |
| }, |
| { |
| "completion_length": 69.1925, |
| "epoch": 1.5, |
| "grad_norm": 29.5, |
| "kl": 0.6943446175381541, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8813337843865157, |
| "reward_std": 0.3512991077173501, |
| "rewards/reward_func": 0.8813337843865157, |
| "step": 7500 |
| }, |
| { |
| "completion_length": 62.1225, |
| "epoch": 1.52, |
| "grad_norm": 11.25, |
| "kl": 0.3682367965579033, |
| "learning_rate": 8.271734841028553e-07, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8769916776567698, |
| "reward_std": 0.35788299994543193, |
| "rewards/reward_func": 0.8769916776567698, |
| "step": 7600 |
| }, |
| { |
| "completion_length": 62.5475, |
| "epoch": 1.54, |
| "grad_norm": 10.125, |
| "kl": 0.35848211450036616, |
| "learning_rate": 7.633540738525066e-07, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.9400549785792828, |
| "reward_std": 0.36338255695067345, |
| "rewards/reward_func": 0.9400549785792828, |
| "step": 7700 |
| }, |
| { |
| "completion_length": 61.32, |
| "epoch": 1.56, |
| "grad_norm": 13.625, |
| "kl": 0.5323085347935558, |
| "learning_rate": 7.016504991533727e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.8762814123183489, |
| "reward_std": 0.35265143546042965, |
| "rewards/reward_func": 0.8762814123183489, |
| "step": 7800 |
| }, |
| { |
| "completion_length": 66.315, |
| "epoch": 1.58, |
| "grad_norm": 18.375, |
| "kl": 0.40489839322865007, |
| "learning_rate": 6.421379363065142e-07, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8989605332165956, |
| "reward_std": 0.3447486224025488, |
| "rewards/reward_func": 0.8989605332165956, |
| "step": 7900 |
| }, |
| { |
| "completion_length": 60.5325, |
| "epoch": 1.6, |
| "grad_norm": 11.75, |
| "kl": 0.690227730597835, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.8210568431764841, |
| "reward_std": 0.3149324245750904, |
| "rewards/reward_func": 0.8210568431764841, |
| "step": 8000 |
| }, |
| { |
| "completion_length": 66.7825, |
| "epoch": 1.62, |
| "grad_norm": 13.4375, |
| "kl": 0.516472494918853, |
| "learning_rate": 5.299731159831953e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.8683985948190093, |
| "reward_std": 0.345150127671659, |
| "rewards/reward_func": 0.8683985948190093, |
| "step": 8100 |
| }, |
| { |
| "completion_length": 65.72, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 12.875, |
| "kl": 0.4457222482562065, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.7730556976422668, |
| "reward_std": 0.41231788201257585, |
| "rewards/reward_func": 0.7730556976422668, |
| "step": 8200 |
| }, |
| { |
| "completion_length": 64.4875, |
| "epoch": 1.6600000000000001, |
| "grad_norm": 9.6875, |
| "kl": 0.5901041788049042, |
| "learning_rate": 4.27406068612396e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.8864369177818299, |
| "reward_std": 0.3609216751717031, |
| "rewards/reward_func": 0.8864369177818299, |
| "step": 8300 |
| }, |
| { |
| "completion_length": 59.98, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 30.5, |
| "kl": 0.3067289407923818, |
| "learning_rate": 3.798797596089351e-07, |
| "loss": 0.0, |
| "match_ratio": 0.9975, |
| "reward": 0.8906753876060247, |
| "reward_std": 0.2947716296184808, |
| "rewards/reward_func": 0.8906753876060247, |
| "step": 8400 |
| }, |
| { |
| "completion_length": 68.6625, |
| "epoch": 1.7, |
| "grad_norm": 24.0, |
| "kl": 2.096909821406007, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0002, |
| "match_ratio": 1.0, |
| "reward": 0.9327631609933451, |
| "reward_std": 0.3544263231381774, |
| "rewards/reward_func": 0.9327631609933451, |
| "step": 8500 |
| }, |
| { |
| "completion_length": 66.0125, |
| "epoch": 1.72, |
| "grad_norm": 10.9375, |
| "kl": 0.5123670964688063, |
| "learning_rate": 2.9263101785268253e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.995, |
| "reward": 0.9494618388265371, |
| "reward_std": 0.35082788893952965, |
| "rewards/reward_func": 0.9494618388265371, |
| "step": 8600 |
| }, |
| { |
| "completion_length": 57.8475, |
| "epoch": 1.74, |
| "grad_norm": 15.8125, |
| "kl": 1.2104839562997223, |
| "learning_rate": 2.53014884252083e-07, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9292678725533188, |
| "reward_std": 0.3112880502641201, |
| "rewards/reward_func": 0.9292678725533188, |
| "step": 8700 |
| }, |
| { |
| "completion_length": 63.66, |
| "epoch": 1.76, |
| "grad_norm": 10.9375, |
| "kl": 0.5559730716235936, |
| "learning_rate": 2.1613635589349756e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.9810863409936428, |
| "reward_std": 0.36776383105432614, |
| "rewards/reward_func": 0.9810863409936428, |
| "step": 8800 |
| }, |
| { |
| "completion_length": 59.8675, |
| "epoch": 1.78, |
| "grad_norm": 13.25, |
| "kl": 29.805520134083928, |
| "learning_rate": 1.8204036358303173e-07, |
| "loss": 0.003, |
| "match_ratio": 1.0, |
| "reward": 0.8572433185577393, |
| "reward_std": 0.3437820218596607, |
| "rewards/reward_func": 0.8572433185577393, |
| "step": 8900 |
| }, |
| { |
| "completion_length": 60.3075, |
| "epoch": 1.8, |
| "grad_norm": 9.1875, |
| "kl": 0.8873812770657241, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.919341038018465, |
| "reward_std": 0.33141862623393537, |
| "rewards/reward_func": 0.919341038018465, |
| "step": 9000 |
| }, |
| { |
| "completion_length": 62.265, |
| "epoch": 1.8199999999999998, |
| "grad_norm": 12.125, |
| "kl": 0.6630043520405888, |
| "learning_rate": 1.223587092621162e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 1.029205017723143, |
| "reward_std": 0.3310457341000438, |
| "rewards/reward_func": 1.029205017723143, |
| "step": 9100 |
| }, |
| { |
| "completion_length": 62.0575, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 11.0, |
| "kl": 0.5343834590911866, |
| "learning_rate": 9.684576015420277e-08, |
| "loss": 0.0001, |
| "match_ratio": 0.9975, |
| "reward": 0.8843235304579139, |
| "reward_std": 0.35643110671080647, |
| "rewards/reward_func": 0.8843235304579139, |
| "step": 9200 |
| }, |
| { |
| "completion_length": 64.51, |
| "epoch": 1.8599999999999999, |
| "grad_norm": 13.0, |
| "kl": 0.47331944581121205, |
| "learning_rate": 7.426068431000883e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.8205329022929072, |
| "reward_std": 0.33430186320096256, |
| "rewards/reward_func": 0.8205329022929072, |
| "step": 9300 |
| }, |
| { |
| "completion_length": 64.395, |
| "epoch": 1.88, |
| "grad_norm": 42.0, |
| "kl": 1.8421870478987694, |
| "learning_rate": 5.463099816548578e-08, |
| "loss": 0.0002, |
| "match_ratio": 0.9975, |
| "reward": 0.9012074111029506, |
| "reward_std": 0.33403355406597257, |
| "rewards/reward_func": 0.9012074111029506, |
| "step": 9400 |
| }, |
| { |
| "completion_length": 61.4425, |
| "epoch": 1.9, |
| "grad_norm": 12.0, |
| "kl": 0.5331659988686442, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.7935629660636186, |
| "reward_std": 0.3497585416212678, |
| "rewards/reward_func": 0.7935629660636186, |
| "step": 9500 |
| }, |
| { |
| "completion_length": 51.77, |
| "epoch": 1.92, |
| "grad_norm": 10.875, |
| "kl": 0.7931823456101119, |
| "learning_rate": 2.4329828146074096e-08, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9315874481201172, |
| "reward_std": 0.44919231578707697, |
| "rewards/reward_func": 0.9315874481201172, |
| "step": 9600 |
| }, |
| { |
| "completion_length": 57.5275, |
| "epoch": 1.94, |
| "grad_norm": 13.9375, |
| "kl": 0.6070253856666387, |
| "learning_rate": 1.3695261579316776e-08, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9431364990770816, |
| "reward_std": 0.3222667661588639, |
| "rewards/reward_func": 0.9431364990770816, |
| "step": 9700 |
| }, |
| { |
| "completion_length": 59.37, |
| "epoch": 1.96, |
| "grad_norm": 14.875, |
| "kl": 0.5334895004890859, |
| "learning_rate": 6.089874350439507e-09, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.9747491884231567, |
| "reward_std": 0.29319573145825417, |
| "rewards/reward_func": 0.9747491884231567, |
| "step": 9800 |
| }, |
| { |
| "completion_length": 63.385, |
| "epoch": 1.98, |
| "grad_norm": 12.125, |
| "kl": 0.4961644561961293, |
| "learning_rate": 1.5229324522605949e-09, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.8896720813587308, |
| "reward_std": 0.4128014264255762, |
| "rewards/reward_func": 0.8896720813587308, |
| "step": 9900 |
| }, |
| { |
| "completion_length": 67.28, |
| "epoch": 2.0, |
| "grad_norm": 10.8125, |
| "kl": 0.30117323972284793, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "match_ratio": 0.995, |
| "reward": 0.7791323178261519, |
| "reward_std": 0.35405952845700084, |
| "rewards/reward_func": 0.7791323178261519, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|