jhoh2525's picture
Upload folder using huggingface_hub
75aaa57 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 65.065,
"epoch": 0.02,
"grad_norm": 10.375,
"kl": 0.0006580278992169042,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0,
"match_ratio": 0.995,
"reward": 0.8990143708884716,
"reward_std": 0.46338544798083603,
"rewards/reward_func": 0.8990143708884716,
"step": 100
},
{
"completion_length": 65.4625,
"epoch": 0.04,
"grad_norm": 6.875,
"kl": 0.0006705577400316542,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.0,
"match_ratio": 1.0,
"reward": 0.7165287194028497,
"reward_std": 0.40350831425283107,
"rewards/reward_func": 0.7165287194028497,
"step": 200
},
{
"completion_length": 61.175,
"epoch": 0.06,
"grad_norm": 8.4375,
"kl": 0.0007126682825037278,
"learning_rate": 1.5e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7286543997749686,
"reward_std": 0.4331769395247102,
"rewards/reward_func": 0.7286543997749686,
"step": 300
},
{
"completion_length": 54.655,
"epoch": 0.08,
"grad_norm": 11.9375,
"kl": 0.0008965998092025984,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7439908282458783,
"reward_std": 0.4567913323547691,
"rewards/reward_func": 0.7439908282458783,
"step": 400
},
{
"completion_length": 62.1025,
"epoch": 0.1,
"grad_norm": 15.8125,
"kl": 0.0019434646295849235,
"learning_rate": 2.5e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8668537394329906,
"reward_std": 0.3836937860772014,
"rewards/reward_func": 0.8668537394329906,
"step": 500
},
{
"completion_length": 58.14,
"epoch": 0.12,
"grad_norm": 20.5,
"kl": 0.0041620647069066765,
"learning_rate": 3e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7577041421830654,
"reward_std": 0.43559244139119985,
"rewards/reward_func": 0.7577041421830654,
"step": 600
},
{
"completion_length": 60.0,
"epoch": 0.14,
"grad_norm": 10.6875,
"kl": 0.008898616410442628,
"learning_rate": 3.5e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8700515530258417,
"reward_std": 0.45400316243059935,
"rewards/reward_func": 0.8700515530258417,
"step": 700
},
{
"completion_length": 58.1275,
"epoch": 0.16,
"grad_norm": 7.375,
"kl": 0.0189549465168966,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7298830785602332,
"reward_std": 0.4231558512337506,
"rewards/reward_func": 0.7298830785602332,
"step": 800
},
{
"completion_length": 55.55,
"epoch": 0.18,
"grad_norm": 13.3125,
"kl": 0.04016699714120477,
"learning_rate": 4.5e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8830712201073766,
"reward_std": 0.3306662117503583,
"rewards/reward_func": 0.8830712201073766,
"step": 900
},
{
"completion_length": 55.0225,
"epoch": 0.2,
"grad_norm": 10.5625,
"kl": 0.06088939258828759,
"learning_rate": 5e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7573206969350577,
"reward_std": 0.32937701970338823,
"rewards/reward_func": 0.7573206969350577,
"step": 1000
},
{
"completion_length": 59.6125,
"epoch": 0.22,
"grad_norm": 11.0625,
"kl": 0.03887372653000057,
"learning_rate": 4.99847706754774e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8389806092530488,
"reward_std": 0.3803154364787042,
"rewards/reward_func": 0.8389806092530488,
"step": 1100
},
{
"completion_length": 62.01,
"epoch": 0.24,
"grad_norm": 8.75,
"kl": 0.9295957709802315,
"learning_rate": 4.993910125649561e-06,
"loss": 0.0001,
"match_ratio": 0.9925,
"reward": 0.8083837843686342,
"reward_std": 0.4002057794481516,
"rewards/reward_func": 0.8083837843686342,
"step": 1200
},
{
"completion_length": 61.6575,
"epoch": 0.26,
"grad_norm": 12.8125,
"kl": 0.5295558683061973,
"learning_rate": 4.986304738420684e-06,
"loss": 0.0001,
"match_ratio": 0.9925,
"reward": 0.8700573812425136,
"reward_std": 0.41881847178563475,
"rewards/reward_func": 0.8700573812425136,
"step": 1300
},
{
"completion_length": 55.82,
"epoch": 0.28,
"grad_norm": 21.0,
"kl": 0.2268725570756942,
"learning_rate": 4.975670171853926e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7677264379709959,
"reward_std": 0.4127253815624863,
"rewards/reward_func": 0.7677264379709959,
"step": 1400
},
{
"completion_length": 61.605,
"epoch": 0.3,
"grad_norm": 15.25,
"kl": 2.662307023219764,
"learning_rate": 4.962019382530521e-06,
"loss": 0.0003,
"match_ratio": 0.9975,
"reward": 0.8300903634727002,
"reward_std": 0.330243071205914,
"rewards/reward_func": 0.8300903634727002,
"step": 1500
},
{
"completion_length": 57.8975,
"epoch": 0.32,
"grad_norm": 24.25,
"kl": 0.9898469369392842,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8029376929998397,
"reward_std": 0.3678751669218764,
"rewards/reward_func": 0.8029376929998397,
"step": 1600
},
{
"completion_length": 52.7,
"epoch": 0.34,
"grad_norm": 3.453125,
"kl": 116.13146778639405,
"learning_rate": 4.925739315689991e-06,
"loss": 0.0116,
"match_ratio": 1.0,
"reward": 0.8077524190768599,
"reward_std": 0.33210189862176775,
"rewards/reward_func": 0.8077524190768599,
"step": 1700
},
{
"completion_length": 54.5,
"epoch": 0.36,
"grad_norm": 12.25,
"kl": 5.921828000650276,
"learning_rate": 4.903154239845798e-06,
"loss": 0.0006,
"match_ratio": 1.0,
"reward": 0.8151757456362247,
"reward_std": 0.3074088580603711,
"rewards/reward_func": 0.8151757456362247,
"step": 1800
},
{
"completion_length": 59.43,
"epoch": 0.38,
"grad_norm": 22.25,
"kl": 0.22536800906993448,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8543619333952666,
"reward_std": 0.3683507715538144,
"rewards/reward_func": 0.8543619333952666,
"step": 1900
},
{
"completion_length": 59.705,
"epoch": 0.4,
"grad_norm": 5.21875,
"kl": 0.2842459925811272,
"learning_rate": 4.849231551964771e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.9203101838380099,
"reward_std": 0.3454422113858163,
"rewards/reward_func": 0.9203101838380099,
"step": 2000
},
{
"completion_length": 65.095,
"epoch": 0.42,
"grad_norm": 9.125,
"kl": 0.16623427679762245,
"learning_rate": 4.817959636416969e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.9261023019999266,
"reward_std": 0.3223581924289465,
"rewards/reward_func": 0.9261023019999266,
"step": 2100
},
{
"completion_length": 62.8125,
"epoch": 0.44,
"grad_norm": 11.0625,
"kl": 0.10455320389475674,
"learning_rate": 4.783863644106502e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.9122521196585148,
"reward_std": 0.37498483614996075,
"rewards/reward_func": 0.9122521196585148,
"step": 2200
},
{
"completion_length": 71.1275,
"epoch": 0.46,
"grad_norm": 12.5,
"kl": 0.5584425710327924,
"learning_rate": 4.746985115747918e-06,
"loss": 0.0001,
"match_ratio": 0.9925,
"reward": 0.792227897644043,
"reward_std": 0.4075765323080123,
"rewards/reward_func": 0.792227897644043,
"step": 2300
},
{
"completion_length": 68.27,
"epoch": 0.48,
"grad_norm": 10.625,
"kl": 1.3814555319957436,
"learning_rate": 4.707368982147318e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.8301830168347806,
"reward_std": 0.36762866189703347,
"rewards/reward_func": 0.8301830168347806,
"step": 2400
},
{
"completion_length": 66.965,
"epoch": 0.5,
"grad_norm": 12.6875,
"kl": 0.8518368338712026,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.8910126995295287,
"reward_std": 0.3965667562186718,
"rewards/reward_func": 0.8910126995295287,
"step": 2500
},
{
"completion_length": 70.73,
"epoch": 0.52,
"grad_norm": 17.75,
"kl": 0.5207290647923947,
"learning_rate": 4.620120240391065e-06,
"loss": 0.0001,
"match_ratio": 0.9925,
"reward": 0.8480577088147402,
"reward_std": 0.4258584909327328,
"rewards/reward_func": 0.8480577088147402,
"step": 2600
},
{
"completion_length": 59.6575,
"epoch": 0.54,
"grad_norm": 11.375,
"kl": 0.7336286423553247,
"learning_rate": 4.572593931387604e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.928214335795492,
"reward_std": 0.37614597208797934,
"rewards/reward_func": 0.928214335795492,
"step": 2700
},
{
"completion_length": 63.8775,
"epoch": 0.56,
"grad_norm": 9.5625,
"kl": 0.33108121431432663,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7988891634345054,
"reward_std": 0.36210680682212115,
"rewards/reward_func": 0.7988891634345054,
"step": 2800
},
{
"completion_length": 59.67,
"epoch": 0.58,
"grad_norm": 8.75,
"kl": 72.35348623547704,
"learning_rate": 4.470026884016805e-06,
"loss": 0.0072,
"match_ratio": 0.9975,
"reward": 0.8135686150938273,
"reward_std": 0.3942835557647049,
"rewards/reward_func": 0.8135686150938273,
"step": 2900
},
{
"completion_length": 56.305,
"epoch": 0.6,
"grad_norm": 10.0,
"kl": 0.14451225536875426,
"learning_rate": 4.415111107797445e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8231964718922973,
"reward_std": 0.3244973301887512,
"rewards/reward_func": 0.8231964718922973,
"step": 3000
},
{
"completion_length": 59.705,
"epoch": 0.62,
"grad_norm": 10.0,
"kl": 0.16642916494980453,
"learning_rate": 4.357862063693486e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8778230049461127,
"reward_std": 0.36886056323535743,
"rewards/reward_func": 0.8778230049461127,
"step": 3100
},
{
"completion_length": 56.71,
"epoch": 0.64,
"grad_norm": 8.875,
"kl": 0.18644527865573765,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.7874290134198964,
"reward_std": 0.38642477702349426,
"rewards/reward_func": 0.7874290134198964,
"step": 3200
},
{
"completion_length": 56.68,
"epoch": 0.66,
"grad_norm": 27.0,
"kl": 0.22375864623580127,
"learning_rate": 4.236645926147493e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8582109183818102,
"reward_std": 0.36429890371393414,
"rewards/reward_func": 0.8582109183818102,
"step": 3300
},
{
"completion_length": 58.835,
"epoch": 0.68,
"grad_norm": 6.125,
"kl": 0.22189826945774258,
"learning_rate": 4.172826515897146e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7524572538957,
"reward_std": 0.35272345967590807,
"rewards/reward_func": 0.7524572538957,
"step": 3400
},
{
"completion_length": 50.8375,
"epoch": 0.7,
"grad_norm": 14.375,
"kl": 0.9133326725219376,
"learning_rate": 4.106969024216348e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8736884651333093,
"reward_std": 0.3174928646720946,
"rewards/reward_func": 0.8736884651333093,
"step": 3500
},
{
"completion_length": 60.4325,
"epoch": 0.72,
"grad_norm": 12.125,
"kl": 0.16217968232464045,
"learning_rate": 4.039153688314146e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.9595573445409536,
"reward_std": 0.34502996982075274,
"rewards/reward_func": 0.9595573445409536,
"step": 3600
},
{
"completion_length": 63.43,
"epoch": 0.74,
"grad_norm": 12.9375,
"kl": 0.290018264092505,
"learning_rate": 3.969463130731183e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.7895570612326265,
"reward_std": 0.35805795643478633,
"rewards/reward_func": 0.7895570612326265,
"step": 3700
},
{
"completion_length": 55.66,
"epoch": 0.76,
"grad_norm": 9.0,
"kl": 0.19286280857399107,
"learning_rate": 3.897982258676867e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8514765882119536,
"reward_std": 0.40829260389087724,
"rewards/reward_func": 0.8514765882119536,
"step": 3800
},
{
"completion_length": 64.0375,
"epoch": 0.78,
"grad_norm": 6.9375,
"kl": 0.3412795978039503,
"learning_rate": 3.824798160583012e-06,
"loss": 0.0,
"match_ratio": 0.9925,
"reward": 0.9143854442238808,
"reward_std": 0.3976023513358086,
"rewards/reward_func": 0.9143854442238808,
"step": 3900
},
{
"completion_length": 63.335,
"epoch": 0.8,
"grad_norm": 11.375,
"kl": 0.27103981951251627,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7887253789231181,
"reward_std": 0.4162597674317658,
"rewards/reward_func": 0.7887253789231181,
"step": 4000
},
{
"completion_length": 59.8575,
"epoch": 0.82,
"grad_norm": 6.59375,
"kl": 0.7605254784226417,
"learning_rate": 3.6736789069647273e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.9478698487579823,
"reward_std": 0.30987203000113367,
"rewards/reward_func": 0.9478698487579823,
"step": 4100
},
{
"completion_length": 58.2175,
"epoch": 0.84,
"grad_norm": 12.3125,
"kl": 1.3949576319474728,
"learning_rate": 3.595927866972694e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9541199389472603,
"reward_std": 0.3034708809526637,
"rewards/reward_func": 0.9541199389472603,
"step": 4200
},
{
"completion_length": 63.415,
"epoch": 0.86,
"grad_norm": 27.75,
"kl": 0.45249753130599857,
"learning_rate": 3.516841607689501e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8886630642414093,
"reward_std": 0.36593056879937647,
"rewards/reward_func": 0.8886630642414093,
"step": 4300
},
{
"completion_length": 55.02,
"epoch": 0.88,
"grad_norm": 11.25,
"kl": 68.24156057231593,
"learning_rate": 3.436516483539781e-06,
"loss": 0.0068,
"match_ratio": 1.0,
"reward": 0.866313117146492,
"reward_std": 0.35739596346393226,
"rewards/reward_func": 0.866313117146492,
"step": 4400
},
{
"completion_length": 61.2375,
"epoch": 0.9,
"grad_norm": 21.5,
"kl": 0.557325184418587,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.845514679402113,
"reward_std": 0.3533631762489676,
"rewards/reward_func": 0.845514679402113,
"step": 4500
},
{
"completion_length": 61.4575,
"epoch": 0.92,
"grad_norm": 10.9375,
"kl": 0.31026113393716515,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8414725087583065,
"reward_std": 0.31504234885796906,
"rewards/reward_func": 0.8414725087583065,
"step": 4600
},
{
"completion_length": 55.6725,
"epoch": 0.94,
"grad_norm": 15.875,
"kl": 0.25192020772024987,
"learning_rate": 3.189093389542498e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.934078385848552,
"reward_std": 0.3096505870204419,
"rewards/reward_func": 0.934078385848552,
"step": 4700
},
{
"completion_length": 57.29,
"epoch": 0.96,
"grad_norm": 8.75,
"kl": 0.6216541412565857,
"learning_rate": 3.1048047389991693e-06,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.8389100107550621,
"reward_std": 0.3749863849021494,
"rewards/reward_func": 0.8389100107550621,
"step": 4800
},
{
"completion_length": 61.2525,
"epoch": 0.98,
"grad_norm": 8.125,
"kl": 0.342362194955349,
"learning_rate": 3.019779227044398e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8242513693869113,
"reward_std": 0.36022061900235713,
"rewards/reward_func": 0.8242513693869113,
"step": 4900
},
{
"completion_length": 59.7125,
"epoch": 1.0,
"grad_norm": 17.5,
"kl": 0.41126792770577597,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.0,
"match_ratio": 0.9925,
"reward": 0.8274249080568552,
"reward_std": 0.4075367634743452,
"rewards/reward_func": 0.8274249080568552,
"step": 5000
},
{
"completion_length": 60.6025,
"epoch": 1.02,
"grad_norm": 12.375,
"kl": 0.23068872857838868,
"learning_rate": 2.847932752400164e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8065017646364868,
"reward_std": 0.36464907992631196,
"rewards/reward_func": 0.8065017646364868,
"step": 5100
},
{
"completion_length": 59.0775,
"epoch": 1.04,
"grad_norm": 20.25,
"kl": 0.42417401013895867,
"learning_rate": 2.761321158169134e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7185550931096077,
"reward_std": 0.40595400186255576,
"rewards/reward_func": 0.7185550931096077,
"step": 5200
},
{
"completion_length": 63.42,
"epoch": 1.06,
"grad_norm": 23.875,
"kl": 0.3724514145217836,
"learning_rate": 2.6743911843603134e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.7712494351714849,
"reward_std": 0.37587333597242834,
"rewards/reward_func": 0.7712494351714849,
"step": 5300
},
{
"completion_length": 60.4,
"epoch": 1.08,
"grad_norm": 13.0,
"kl": 0.41718232361599805,
"learning_rate": 2.587248741756253e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8315356434136629,
"reward_std": 0.35958164227195083,
"rewards/reward_func": 0.8315356434136629,
"step": 5400
},
{
"completion_length": 64.535,
"epoch": 1.1,
"grad_norm": 13.0,
"kl": 0.426719272416085,
"learning_rate": 2.5e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8871817947924137,
"reward_std": 0.35918335968628523,
"rewards/reward_func": 0.8871817947924137,
"step": 5500
},
{
"completion_length": 63.825,
"epoch": 1.12,
"grad_norm": 12.6875,
"kl": 10.818288787528873,
"learning_rate": 2.4127512582437486e-06,
"loss": 0.0011,
"match_ratio": 0.9925,
"reward": 0.8072922784090042,
"reward_std": 0.41990237571299077,
"rewards/reward_func": 0.8072922784090042,
"step": 5600
},
{
"completion_length": 61.5425,
"epoch": 1.1400000000000001,
"grad_norm": 8.5625,
"kl": 1.7851288786903023,
"learning_rate": 2.325608815639687e-06,
"loss": 0.0002,
"match_ratio": 0.9975,
"reward": 0.7871765466406941,
"reward_std": 0.3983499974012375,
"rewards/reward_func": 0.7871765466406941,
"step": 5700
},
{
"completion_length": 64.99,
"epoch": 1.16,
"grad_norm": 83.5,
"kl": 1.0691816475684754,
"learning_rate": 2.238678841830867e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9227658536192029,
"reward_std": 0.3650888724066317,
"rewards/reward_func": 0.9227658536192029,
"step": 5800
},
{
"completion_length": 65.0375,
"epoch": 1.18,
"grad_norm": 9.8125,
"kl": 0.38561805644072594,
"learning_rate": 2.1520672475998374e-06,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8525355974957347,
"reward_std": 0.36985819303430617,
"rewards/reward_func": 0.8525355974957347,
"step": 5900
},
{
"completion_length": 62.8275,
"epoch": 1.2,
"grad_norm": 9.3125,
"kl": 0.3610994891449809,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 1.0021917837299406,
"reward_std": 0.3685089880321175,
"rewards/reward_func": 1.0021917837299406,
"step": 6000
},
{
"completion_length": 67.415,
"epoch": 1.22,
"grad_norm": 14.0,
"kl": 0.48120407085865735,
"learning_rate": 1.9802207729556023e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.980966807603836,
"reward_std": 0.33038541514426467,
"rewards/reward_func": 0.980966807603836,
"step": 6100
},
{
"completion_length": 61.275,
"epoch": 1.24,
"grad_norm": 17.125,
"kl": 1.394161350093782,
"learning_rate": 1.895195261000831e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8998113541305065,
"reward_std": 0.37138622866943477,
"rewards/reward_func": 0.8998113541305065,
"step": 6200
},
{
"completion_length": 61.4575,
"epoch": 1.26,
"grad_norm": 9.5625,
"kl": 0.4891889825835824,
"learning_rate": 1.8109066104575023e-06,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7821180200204253,
"reward_std": 0.4117224833089858,
"rewards/reward_func": 0.7821180200204253,
"step": 6300
},
{
"completion_length": 62.535,
"epoch": 1.28,
"grad_norm": 17.75,
"kl": 0.701120622754097,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.8505088457465172,
"reward_std": 0.35196221828460694,
"rewards/reward_func": 0.8505088457465172,
"step": 6400
},
{
"completion_length": 61.0425,
"epoch": 1.3,
"grad_norm": 10.4375,
"kl": 0.8444961504405364,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8533294384181499,
"reward_std": 0.35340342290699484,
"rewards/reward_func": 0.8533294384181499,
"step": 6500
},
{
"completion_length": 66.6525,
"epoch": 1.32,
"grad_norm": 16.375,
"kl": 0.6673866561800241,
"learning_rate": 1.56348351646022e-06,
"loss": 0.0001,
"match_ratio": 0.9925,
"reward": 0.962776445467025,
"reward_std": 0.39187521073035897,
"rewards/reward_func": 0.962776445467025,
"step": 6600
},
{
"completion_length": 64.05,
"epoch": 1.34,
"grad_norm": 15.25,
"kl": 0.5449268382415176,
"learning_rate": 1.4831583923105e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.9354583528265357,
"reward_std": 0.3569179131626152,
"rewards/reward_func": 0.9354583528265357,
"step": 6700
},
{
"completion_length": 64.9625,
"epoch": 1.3599999999999999,
"grad_norm": 12.875,
"kl": 0.9086799253150821,
"learning_rate": 1.4040721330273063e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8598837627470494,
"reward_std": 0.36035713417921217,
"rewards/reward_func": 0.8598837627470494,
"step": 6800
},
{
"completion_length": 65.365,
"epoch": 1.38,
"grad_norm": 16.5,
"kl": 0.5496124785766006,
"learning_rate": 1.3263210930352737e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8178367885202169,
"reward_std": 0.37116443024016915,
"rewards/reward_func": 0.8178367885202169,
"step": 6900
},
{
"completion_length": 62.0475,
"epoch": 1.4,
"grad_norm": 26.875,
"kl": 0.549699901342392,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.9533282884210348,
"reward_std": 0.3430164767615497,
"rewards/reward_func": 0.9533282884210348,
"step": 7000
},
{
"completion_length": 60.79,
"epoch": 1.42,
"grad_norm": 13.25,
"kl": 0.5086184279620647,
"learning_rate": 1.1752018394169882e-06,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.8821756513416767,
"reward_std": 0.3568952218815684,
"rewards/reward_func": 0.8821756513416767,
"step": 7100
},
{
"completion_length": 62.3475,
"epoch": 1.44,
"grad_norm": 21.125,
"kl": 0.8897788706421852,
"learning_rate": 1.1020177413231334e-06,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.934985687956214,
"reward_std": 0.32632746720686556,
"rewards/reward_func": 0.934985687956214,
"step": 7200
},
{
"completion_length": 56.98,
"epoch": 1.46,
"grad_norm": 35.25,
"kl": 0.7370296374708414,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8605528651922941,
"reward_std": 0.3732724652206525,
"rewards/reward_func": 0.8605528651922941,
"step": 7300
},
{
"completion_length": 57.71,
"epoch": 1.48,
"grad_norm": 14.5,
"kl": 0.7560949631407857,
"learning_rate": 9.608463116858544e-07,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.8051678024046123,
"reward_std": 0.3447662947047502,
"rewards/reward_func": 0.8051678024046123,
"step": 7400
},
{
"completion_length": 69.1925,
"epoch": 1.5,
"grad_norm": 29.5,
"kl": 0.6943446175381541,
"learning_rate": 8.930309757836517e-07,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8813337843865157,
"reward_std": 0.3512991077173501,
"rewards/reward_func": 0.8813337843865157,
"step": 7500
},
{
"completion_length": 62.1225,
"epoch": 1.52,
"grad_norm": 11.25,
"kl": 0.3682367965579033,
"learning_rate": 8.271734841028553e-07,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8769916776567698,
"reward_std": 0.35788299994543193,
"rewards/reward_func": 0.8769916776567698,
"step": 7600
},
{
"completion_length": 62.5475,
"epoch": 1.54,
"grad_norm": 10.125,
"kl": 0.35848211450036616,
"learning_rate": 7.633540738525066e-07,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.9400549785792828,
"reward_std": 0.36338255695067345,
"rewards/reward_func": 0.9400549785792828,
"step": 7700
},
{
"completion_length": 61.32,
"epoch": 1.56,
"grad_norm": 13.625,
"kl": 0.5323085347935558,
"learning_rate": 7.016504991533727e-07,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.8762814123183489,
"reward_std": 0.35265143546042965,
"rewards/reward_func": 0.8762814123183489,
"step": 7800
},
{
"completion_length": 66.315,
"epoch": 1.58,
"grad_norm": 18.375,
"kl": 0.40489839322865007,
"learning_rate": 6.421379363065142e-07,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8989605332165956,
"reward_std": 0.3447486224025488,
"rewards/reward_func": 0.8989605332165956,
"step": 7900
},
{
"completion_length": 60.5325,
"epoch": 1.6,
"grad_norm": 11.75,
"kl": 0.690227730597835,
"learning_rate": 5.848888922025553e-07,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.8210568431764841,
"reward_std": 0.3149324245750904,
"rewards/reward_func": 0.8210568431764841,
"step": 8000
},
{
"completion_length": 66.7825,
"epoch": 1.62,
"grad_norm": 13.4375,
"kl": 0.516472494918853,
"learning_rate": 5.299731159831953e-07,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.8683985948190093,
"reward_std": 0.345150127671659,
"rewards/reward_func": 0.8683985948190093,
"step": 8100
},
{
"completion_length": 65.72,
"epoch": 1.6400000000000001,
"grad_norm": 12.875,
"kl": 0.4457222482562065,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.7730556976422668,
"reward_std": 0.41231788201257585,
"rewards/reward_func": 0.7730556976422668,
"step": 8200
},
{
"completion_length": 64.4875,
"epoch": 1.6600000000000001,
"grad_norm": 9.6875,
"kl": 0.5901041788049042,
"learning_rate": 4.27406068612396e-07,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.8864369177818299,
"reward_std": 0.3609216751717031,
"rewards/reward_func": 0.8864369177818299,
"step": 8300
},
{
"completion_length": 59.98,
"epoch": 1.6800000000000002,
"grad_norm": 30.5,
"kl": 0.3067289407923818,
"learning_rate": 3.798797596089351e-07,
"loss": 0.0,
"match_ratio": 0.9975,
"reward": 0.8906753876060247,
"reward_std": 0.2947716296184808,
"rewards/reward_func": 0.8906753876060247,
"step": 8400
},
{
"completion_length": 68.6625,
"epoch": 1.7,
"grad_norm": 24.0,
"kl": 2.096909821406007,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.0002,
"match_ratio": 1.0,
"reward": 0.9327631609933451,
"reward_std": 0.3544263231381774,
"rewards/reward_func": 0.9327631609933451,
"step": 8500
},
{
"completion_length": 66.0125,
"epoch": 1.72,
"grad_norm": 10.9375,
"kl": 0.5123670964688063,
"learning_rate": 2.9263101785268253e-07,
"loss": 0.0001,
"match_ratio": 0.995,
"reward": 0.9494618388265371,
"reward_std": 0.35082788893952965,
"rewards/reward_func": 0.9494618388265371,
"step": 8600
},
{
"completion_length": 57.8475,
"epoch": 1.74,
"grad_norm": 15.8125,
"kl": 1.2104839562997223,
"learning_rate": 2.53014884252083e-07,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9292678725533188,
"reward_std": 0.3112880502641201,
"rewards/reward_func": 0.9292678725533188,
"step": 8700
},
{
"completion_length": 63.66,
"epoch": 1.76,
"grad_norm": 10.9375,
"kl": 0.5559730716235936,
"learning_rate": 2.1613635589349756e-07,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.9810863409936428,
"reward_std": 0.36776383105432614,
"rewards/reward_func": 0.9810863409936428,
"step": 8800
},
{
"completion_length": 59.8675,
"epoch": 1.78,
"grad_norm": 13.25,
"kl": 29.805520134083928,
"learning_rate": 1.8204036358303173e-07,
"loss": 0.003,
"match_ratio": 1.0,
"reward": 0.8572433185577393,
"reward_std": 0.3437820218596607,
"rewards/reward_func": 0.8572433185577393,
"step": 8900
},
{
"completion_length": 60.3075,
"epoch": 1.8,
"grad_norm": 9.1875,
"kl": 0.8873812770657241,
"learning_rate": 1.507684480352292e-07,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.919341038018465,
"reward_std": 0.33141862623393537,
"rewards/reward_func": 0.919341038018465,
"step": 9000
},
{
"completion_length": 62.265,
"epoch": 1.8199999999999998,
"grad_norm": 12.125,
"kl": 0.6630043520405888,
"learning_rate": 1.223587092621162e-07,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 1.029205017723143,
"reward_std": 0.3310457341000438,
"rewards/reward_func": 1.029205017723143,
"step": 9100
},
{
"completion_length": 62.0575,
"epoch": 1.8399999999999999,
"grad_norm": 11.0,
"kl": 0.5343834590911866,
"learning_rate": 9.684576015420277e-08,
"loss": 0.0001,
"match_ratio": 0.9975,
"reward": 0.8843235304579139,
"reward_std": 0.35643110671080647,
"rewards/reward_func": 0.8843235304579139,
"step": 9200
},
{
"completion_length": 64.51,
"epoch": 1.8599999999999999,
"grad_norm": 13.0,
"kl": 0.47331944581121205,
"learning_rate": 7.426068431000883e-08,
"loss": 0.0,
"match_ratio": 1.0,
"reward": 0.8205329022929072,
"reward_std": 0.33430186320096256,
"rewards/reward_func": 0.8205329022929072,
"step": 9300
},
{
"completion_length": 64.395,
"epoch": 1.88,
"grad_norm": 42.0,
"kl": 1.8421870478987694,
"learning_rate": 5.463099816548578e-08,
"loss": 0.0002,
"match_ratio": 0.9975,
"reward": 0.9012074111029506,
"reward_std": 0.33403355406597257,
"rewards/reward_func": 0.9012074111029506,
"step": 9400
},
{
"completion_length": 61.4425,
"epoch": 1.9,
"grad_norm": 12.0,
"kl": 0.5331659988686442,
"learning_rate": 3.798061746947995e-08,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.7935629660636186,
"reward_std": 0.3497585416212678,
"rewards/reward_func": 0.7935629660636186,
"step": 9500
},
{
"completion_length": 51.77,
"epoch": 1.92,
"grad_norm": 10.875,
"kl": 0.7931823456101119,
"learning_rate": 2.4329828146074096e-08,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9315874481201172,
"reward_std": 0.44919231578707697,
"rewards/reward_func": 0.9315874481201172,
"step": 9600
},
{
"completion_length": 57.5275,
"epoch": 1.94,
"grad_norm": 13.9375,
"kl": 0.6070253856666387,
"learning_rate": 1.3695261579316776e-08,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9431364990770816,
"reward_std": 0.3222667661588639,
"rewards/reward_func": 0.9431364990770816,
"step": 9700
},
{
"completion_length": 59.37,
"epoch": 1.96,
"grad_norm": 14.875,
"kl": 0.5334895004890859,
"learning_rate": 6.089874350439507e-09,
"loss": 0.0001,
"match_ratio": 1.0,
"reward": 0.9747491884231567,
"reward_std": 0.29319573145825417,
"rewards/reward_func": 0.9747491884231567,
"step": 9800
},
{
"completion_length": 63.385,
"epoch": 1.98,
"grad_norm": 12.125,
"kl": 0.4961644561961293,
"learning_rate": 1.5229324522605949e-09,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.8896720813587308,
"reward_std": 0.4128014264255762,
"rewards/reward_func": 0.8896720813587308,
"step": 9900
},
{
"completion_length": 67.28,
"epoch": 2.0,
"grad_norm": 10.8125,
"kl": 0.30117323972284793,
"learning_rate": 0.0,
"loss": 0.0,
"match_ratio": 0.995,
"reward": 0.7791323178261519,
"reward_std": 0.35405952845700084,
"rewards/reward_func": 0.7791323178261519,
"step": 10000
}
],
"logging_steps": 100,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}