Lansechen's picture
Model save
98b9500 verified
raw
history blame
54.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9850746268656716,
"eval_steps": 100,
"global_step": 132,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 527.3593978881836,
"epoch": 0.014925373134328358,
"grad_norm": 0.40167078375816345,
"learning_rate": 7.142857142857142e-08,
"loss": 0.0181,
"num_tokens": 614922.0,
"reward": 0.28459822572767735,
"reward_std": 0.3557117339223623,
"rewards/accuracy_reward": 0.2845982164144516,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 501.1183319091797,
"epoch": 0.029850746268656716,
"grad_norm": 0.35646918416023254,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0126,
"num_tokens": 1210628.0,
"reward": 0.29017858393490314,
"reward_std": 0.3156624883413315,
"rewards/accuracy_reward": 0.29017857648432255,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 512.6149787902832,
"epoch": 0.04477611940298507,
"grad_norm": 0.5019609928131104,
"learning_rate": 2.1428571428571426e-07,
"loss": 0.0133,
"num_tokens": 1832067.0,
"reward": 0.2935268022119999,
"reward_std": 0.3709743171930313,
"rewards/accuracy_reward": 0.29352678544819355,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 546.1875190734863,
"epoch": 0.05970149253731343,
"grad_norm": 0.3622360825538635,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0023,
"num_tokens": 2466195.0,
"reward": 0.25334822200238705,
"reward_std": 0.3327263258397579,
"rewards/accuracy_reward": 0.25334821455180645,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 528.721004486084,
"epoch": 0.07462686567164178,
"grad_norm": 0.43017202615737915,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.0169,
"num_tokens": 3092665.0,
"reward": 0.28348215855658054,
"reward_std": 0.3276054132729769,
"rewards/accuracy_reward": 0.2834821371361613,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 481.4542579650879,
"epoch": 0.08955223880597014,
"grad_norm": 0.4641503393650055,
"learning_rate": 4.285714285714285e-07,
"loss": -0.0264,
"num_tokens": 3663768.0,
"reward": 0.2912946604192257,
"reward_std": 0.3355025686323643,
"rewards/accuracy_reward": 0.29129463620483875,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 502.98106384277344,
"epoch": 0.1044776119402985,
"grad_norm": 0.43246936798095703,
"learning_rate": 5e-07,
"loss": 0.0148,
"num_tokens": 4258719.0,
"reward": 0.30133930034935474,
"reward_std": 0.34083990193903446,
"rewards/accuracy_reward": 0.3013392835855484,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 494.30359649658203,
"epoch": 0.11940298507462686,
"grad_norm": 0.47649499773979187,
"learning_rate": 5.714285714285714e-07,
"loss": 0.0064,
"num_tokens": 4834495.0,
"reward": 0.3147321566939354,
"reward_std": 0.36961813643574715,
"rewards/accuracy_reward": 0.31473213993012905,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 496.06921768188477,
"epoch": 0.13432835820895522,
"grad_norm": 0.3420712947845459,
"learning_rate": 6.428571428571429e-07,
"loss": -0.0137,
"num_tokens": 5419541.0,
"reward": 0.3325892984867096,
"reward_std": 0.3436175622045994,
"rewards/accuracy_reward": 0.3325892873108387,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 494.09265518188477,
"epoch": 0.14925373134328357,
"grad_norm": 0.5828936696052551,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0259,
"num_tokens": 6002120.0,
"reward": 0.3482143022119999,
"reward_std": 0.3884076401591301,
"rewards/accuracy_reward": 0.34821428172290325,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 538.0357360839844,
"epoch": 0.16417910447761194,
"grad_norm": 0.5086678266525269,
"learning_rate": 7.857142857142856e-07,
"loss": -0.0038,
"num_tokens": 6629488.0,
"reward": 0.3716518059372902,
"reward_std": 0.3423391878604889,
"rewards/accuracy_reward": 0.37165178544819355,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 528.3069458007812,
"epoch": 0.1791044776119403,
"grad_norm": 0.34488609433174133,
"learning_rate": 8.57142857142857e-07,
"loss": 0.014,
"num_tokens": 7248163.0,
"reward": 0.3917410857975483,
"reward_std": 0.3631545342504978,
"rewards/accuracy_reward": 0.3917410708963871,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 574.1094093322754,
"epoch": 0.19402985074626866,
"grad_norm": 0.5286847949028015,
"learning_rate": 9.285714285714285e-07,
"loss": -0.0135,
"num_tokens": 7900981.0,
"reward": 0.4866071715950966,
"reward_std": 0.35308703035116196,
"rewards/accuracy_reward": 0.4866071417927742,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 509.4152069091797,
"epoch": 0.208955223880597,
"grad_norm": 0.5608493685722351,
"learning_rate": 1e-06,
"loss": 0.049,
"num_tokens": 8496521.0,
"reward": 0.5814732350409031,
"reward_std": 0.354553097859025,
"rewards/accuracy_reward": 0.5814732164144516,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 506.89064025878906,
"epoch": 0.22388059701492538,
"grad_norm": 0.415044903755188,
"learning_rate": 9.998286624877785e-07,
"loss": -0.0023,
"num_tokens": 9080431.0,
"reward": 0.5881696678698063,
"reward_std": 0.33997591957449913,
"rewards/accuracy_reward": 0.5881696380674839,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 564.6194458007812,
"epoch": 0.23880597014925373,
"grad_norm": 0.4049900472164154,
"learning_rate": 9.99314767377287e-07,
"loss": 0.0035,
"num_tokens": 9724930.0,
"reward": 0.616071455180645,
"reward_std": 0.30958189629018307,
"rewards/accuracy_reward": 0.6160714253783226,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 554.748908996582,
"epoch": 0.2537313432835821,
"grad_norm": 0.2018774151802063,
"learning_rate": 9.98458666866564e-07,
"loss": 0.0057,
"num_tokens": 10386817.0,
"reward": 0.6763393059372902,
"reward_std": 0.2512755785137415,
"rewards/accuracy_reward": 0.676339291036129,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 594.0759201049805,
"epoch": 0.26865671641791045,
"grad_norm": 0.21940495073795319,
"learning_rate": 9.972609476841365e-07,
"loss": 0.02,
"num_tokens": 11070389.0,
"reward": 0.6908482387661934,
"reward_std": 0.2618648335337639,
"rewards/accuracy_reward": 0.6908482164144516,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 615.6250267028809,
"epoch": 0.2835820895522388,
"grad_norm": 0.22275717556476593,
"learning_rate": 9.957224306869053e-07,
"loss": 0.0155,
"num_tokens": 11762013.0,
"reward": 0.7031250298023224,
"reward_std": 0.21245348826050758,
"rewards/accuracy_reward": 0.7031249925494194,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 590.3951225280762,
"epoch": 0.29850746268656714,
"grad_norm": 0.17150121927261353,
"learning_rate": 9.938441702975689e-07,
"loss": 0.0286,
"num_tokens": 12433383.0,
"reward": 0.768973246216774,
"reward_std": 0.20385023020207882,
"rewards/accuracy_reward": 0.768973208963871,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 662.1127624511719,
"epoch": 0.31343283582089554,
"grad_norm": 0.13017700612545013,
"learning_rate": 9.916274537819773e-07,
"loss": 0.0282,
"num_tokens": 13166340.0,
"reward": 0.709821455180645,
"reward_std": 0.18888892605900764,
"rewards/accuracy_reward": 0.7098214253783226,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 582.2143135070801,
"epoch": 0.3283582089552239,
"grad_norm": 0.161940336227417,
"learning_rate": 9.890738003669027e-07,
"loss": 0.0415,
"num_tokens": 13832700.0,
"reward": 0.7511161044239998,
"reward_std": 0.23555724322795868,
"rewards/accuracy_reward": 0.7511160671710968,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 577.9185562133789,
"epoch": 0.34328358208955223,
"grad_norm": 0.30369359254837036,
"learning_rate": 9.861849601988383e-07,
"loss": 0.0423,
"num_tokens": 14496323.0,
"reward": 0.7354910969734192,
"reward_std": 0.15988358668982983,
"rewards/accuracy_reward": 0.7354910746216774,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 671.771240234375,
"epoch": 0.3582089552238806,
"grad_norm": 0.2169778198003769,
"learning_rate": 9.82962913144534e-07,
"loss": 0.0421,
"num_tokens": 15249582.0,
"reward": 0.6964286044239998,
"reward_std": 0.21053165383636951,
"rewards/accuracy_reward": 0.6964285597205162,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 625.8772583007812,
"epoch": 0.373134328358209,
"grad_norm": 0.20843251049518585,
"learning_rate": 9.794098674340966e-07,
"loss": 0.0417,
"num_tokens": 15940760.0,
"reward": 0.800223246216774,
"reward_std": 0.14455711469054222,
"rewards/accuracy_reward": 0.800223208963871,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 673.678596496582,
"epoch": 0.3880597014925373,
"grad_norm": 0.14218851923942566,
"learning_rate": 9.755282581475767e-07,
"loss": 0.0623,
"num_tokens": 16697072.0,
"reward": 0.7087053805589676,
"reward_std": 0.1666869930922985,
"rewards/accuracy_reward": 0.7087053582072258,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 670.2087326049805,
"epoch": 0.40298507462686567,
"grad_norm": 0.14040139317512512,
"learning_rate": 9.713207455460892e-07,
"loss": 0.0681,
"num_tokens": 17439499.0,
"reward": 0.675223246216774,
"reward_std": 0.20043206959962845,
"rewards/accuracy_reward": 0.675223208963871,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 620.5960083007812,
"epoch": 0.417910447761194,
"grad_norm": 0.17488999664783478,
"learning_rate": 9.667902132486008e-07,
"loss": 0.0639,
"num_tokens": 18130177.0,
"reward": 0.7008928880095482,
"reward_std": 0.20606223493814468,
"rewards/accuracy_reward": 0.7008928656578064,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 701.1161117553711,
"epoch": 0.43283582089552236,
"grad_norm": 0.1295379400253296,
"learning_rate": 9.619397662556433e-07,
"loss": 0.037,
"num_tokens": 18912849.0,
"reward": 0.6618303880095482,
"reward_std": 0.2049681916832924,
"rewards/accuracy_reward": 0.6618303582072258,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 666.1741409301758,
"epoch": 0.44776119402985076,
"grad_norm": 0.12175110727548599,
"learning_rate": 9.567727288213004e-07,
"loss": 0.0543,
"num_tokens": 19647693.0,
"reward": 0.7388393208384514,
"reward_std": 0.2054214347153902,
"rewards/accuracy_reward": 0.738839291036129,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 657.982177734375,
"epoch": 0.4626865671641791,
"grad_norm": 0.11877922713756561,
"learning_rate": 9.512926421749303e-07,
"loss": 0.0172,
"num_tokens": 20376269.0,
"reward": 0.7031250298023224,
"reward_std": 0.15015378780663013,
"rewards/accuracy_reward": 0.7031249925494194,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 621.2243576049805,
"epoch": 0.47761194029850745,
"grad_norm": 0.2129260003566742,
"learning_rate": 9.455032620941839e-07,
"loss": 0.0567,
"num_tokens": 21076854.0,
"reward": 0.7354911044239998,
"reward_std": 0.17187250033020973,
"rewards/accuracy_reward": 0.7354910746216774,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 687.317008972168,
"epoch": 0.4925373134328358,
"grad_norm": 0.1212068498134613,
"learning_rate": 9.394085563309826e-07,
"loss": 0.0524,
"num_tokens": 21833434.0,
"reward": 0.6997768133878708,
"reward_std": 0.17784573789685965,
"rewards/accuracy_reward": 0.699776791036129,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 683.4386520385742,
"epoch": 0.5074626865671642,
"grad_norm": 0.1783527433872223,
"learning_rate": 9.330127018922193e-07,
"loss": 0.034,
"num_tokens": 22591323.0,
"reward": 0.7120535895228386,
"reward_std": 0.1899106204509735,
"rewards/accuracy_reward": 0.712053582072258,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 648.4263763427734,
"epoch": 0.5223880597014925,
"grad_norm": 0.2292163074016571,
"learning_rate": 9.26320082177046e-07,
"loss": 0.0355,
"num_tokens": 23327401.0,
"reward": 0.7366071790456772,
"reward_std": 0.1552294003777206,
"rewards/accuracy_reward": 0.7366071492433548,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 651.7165374755859,
"epoch": 0.5373134328358209,
"grad_norm": 0.24602435529232025,
"learning_rate": 9.19335283972712e-07,
"loss": 0.0426,
"num_tokens": 24070107.0,
"reward": 0.7287946790456772,
"reward_std": 0.18044043332338333,
"rewards/accuracy_reward": 0.7287946417927742,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 616.3973541259766,
"epoch": 0.5522388059701493,
"grad_norm": 0.24687685072422028,
"learning_rate": 9.120630943110077e-07,
"loss": 0.0284,
"num_tokens": 24765055.0,
"reward": 0.7399553954601288,
"reward_std": 0.15244218427687883,
"rewards/accuracy_reward": 0.7399553582072258,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 639.3995895385742,
"epoch": 0.5671641791044776,
"grad_norm": 0.42036116123199463,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0429,
"num_tokens": 25498973.0,
"reward": 0.7455357536673546,
"reward_std": 0.1501858728006482,
"rewards/accuracy_reward": 0.745535708963871,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 626.0937843322754,
"epoch": 0.582089552238806,
"grad_norm": 0.18518702685832977,
"learning_rate": 8.966766701456176e-07,
"loss": 0.0309,
"num_tokens": 26201849.0,
"reward": 0.6930803842842579,
"reward_std": 0.1755145499482751,
"rewards/accuracy_reward": 0.6930803470313549,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 725.0134201049805,
"epoch": 0.5970149253731343,
"grad_norm": 0.11620926111936569,
"learning_rate": 8.885729807284854e-07,
"loss": 0.0474,
"num_tokens": 26990389.0,
"reward": 0.7142857536673546,
"reward_std": 0.18479816243052483,
"rewards/accuracy_reward": 0.7142857164144516,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 641.0569534301758,
"epoch": 0.6119402985074627,
"grad_norm": 0.17211312055587769,
"learning_rate": 8.802029828000155e-07,
"loss": 0.0455,
"num_tokens": 27720616.0,
"reward": 0.6741071715950966,
"reward_std": 0.2003892920911312,
"rewards/accuracy_reward": 0.6741071417927742,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 604.3359603881836,
"epoch": 0.6268656716417911,
"grad_norm": 0.15277408063411713,
"learning_rate": 8.71572412738697e-07,
"loss": 0.0497,
"num_tokens": 28399813.0,
"reward": 0.8292411118745804,
"reward_std": 0.13516291230916977,
"rewards/accuracy_reward": 0.8292410746216774,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 622.8638687133789,
"epoch": 0.6417910447761194,
"grad_norm": 0.12167245894670486,
"learning_rate": 8.626871855061437e-07,
"loss": 0.0265,
"num_tokens": 29108747.0,
"reward": 0.7633928954601288,
"reward_std": 0.16634858772158623,
"rewards/accuracy_reward": 0.7633928582072258,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 607.7199020385742,
"epoch": 0.6567164179104478,
"grad_norm": 0.11686498671770096,
"learning_rate": 8.535533905932737e-07,
"loss": 0.0335,
"num_tokens": 29795408.0,
"reward": 0.7488839626312256,
"reward_std": 0.15169290080666542,
"rewards/accuracy_reward": 0.7488839328289032,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 619.7567291259766,
"epoch": 0.6716417910447762,
"grad_norm": 0.09357903152704239,
"learning_rate": 8.441772878468769e-07,
"loss": 0.0355,
"num_tokens": 30492150.0,
"reward": 0.7790178954601288,
"reward_std": 0.1362890424206853,
"rewards/accuracy_reward": 0.7790178507566452,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 701.7835083007812,
"epoch": 0.6865671641791045,
"grad_norm": 0.1133616715669632,
"learning_rate": 8.34565303179429e-07,
"loss": 0.0535,
"num_tokens": 31269972.0,
"reward": 0.6875000298023224,
"reward_std": 0.1872752346098423,
"rewards/accuracy_reward": 0.6875,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 640.8549346923828,
"epoch": 0.7014925373134329,
"grad_norm": 0.08914685249328613,
"learning_rate": 8.247240241650917e-07,
"loss": 0.0132,
"num_tokens": 31975466.0,
"reward": 0.7589286118745804,
"reward_std": 0.12584246136248112,
"rewards/accuracy_reward": 0.7589285671710968,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 637.5033836364746,
"epoch": 0.7164179104477612,
"grad_norm": 0.10743953287601471,
"learning_rate": 8.146601955249187e-07,
"loss": 0.0135,
"num_tokens": 32688717.0,
"reward": 0.7801339626312256,
"reward_std": 0.14560949243605137,
"rewards/accuracy_reward": 0.7801339253783226,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 655.8404159545898,
"epoch": 0.7313432835820896,
"grad_norm": 0.11401466280221939,
"learning_rate": 8.043807145043603e-07,
"loss": 0.0236,
"num_tokens": 33427982.0,
"reward": 0.7265625223517418,
"reward_std": 0.1881414633244276,
"rewards/accuracy_reward": 0.7265625,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 619.4062728881836,
"epoch": 0.746268656716418,
"grad_norm": 0.16785398125648499,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0325,
"num_tokens": 34136354.0,
"reward": 0.6930803954601288,
"reward_std": 0.16814983636140823,
"rewards/accuracy_reward": 0.6930803582072258,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 603.1082992553711,
"epoch": 0.7611940298507462,
"grad_norm": 0.09643765538930893,
"learning_rate": 7.832031184624164e-07,
"loss": 0.0244,
"num_tokens": 34829755.0,
"reward": 0.7366071715950966,
"reward_std": 0.15785480476915836,
"rewards/accuracy_reward": 0.7366071492433548,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 617.9788246154785,
"epoch": 0.7761194029850746,
"grad_norm": 0.1667710691690445,
"learning_rate": 7.723195175075135e-07,
"loss": 0.0208,
"num_tokens": 35527344.0,
"reward": 0.7198661044239998,
"reward_std": 0.11693863570690155,
"rewards/accuracy_reward": 0.7198660671710968,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 620.7823944091797,
"epoch": 0.7910447761194029,
"grad_norm": 0.18700341880321503,
"learning_rate": 7.612492823579744e-07,
"loss": 0.0049,
"num_tokens": 36226469.0,
"reward": 0.7020089626312256,
"reward_std": 0.16777005605399609,
"rewards/accuracy_reward": 0.7020089328289032,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 591.4152069091797,
"epoch": 0.8059701492537313,
"grad_norm": 0.10829013586044312,
"learning_rate": 7.5e-07,
"loss": 0.0084,
"num_tokens": 36906441.0,
"reward": 0.7477678880095482,
"reward_std": 0.14278795383870602,
"rewards/accuracy_reward": 0.7477678507566452,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 602.3995742797852,
"epoch": 0.8208955223880597,
"grad_norm": 0.1018439456820488,
"learning_rate": 7.385793801298042e-07,
"loss": 0.0166,
"num_tokens": 37600543.0,
"reward": 0.7566964626312256,
"reward_std": 0.1503392457962036,
"rewards/accuracy_reward": 0.756696417927742,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 617.087085723877,
"epoch": 0.835820895522388,
"grad_norm": 0.1039031594991684,
"learning_rate": 7.269952498697734e-07,
"loss": 0.0266,
"num_tokens": 38299741.0,
"reward": 0.7165178880095482,
"reward_std": 0.17983242496848106,
"rewards/accuracy_reward": 0.7165178582072258,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 588.6105194091797,
"epoch": 0.8507462686567164,
"grad_norm": 0.09185818582773209,
"learning_rate": 7.152555484041475e-07,
"loss": 0.0155,
"num_tokens": 38965368.0,
"reward": 0.7834821790456772,
"reward_std": 0.13771093636751175,
"rewards/accuracy_reward": 0.7834821343421936,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 611.9263687133789,
"epoch": 0.8656716417910447,
"grad_norm": 0.1022716760635376,
"learning_rate": 7.033683215379002e-07,
"loss": 0.0119,
"num_tokens": 39654854.0,
"reward": 0.7209821715950966,
"reward_std": 0.13534655049443245,
"rewards/accuracy_reward": 0.7209821492433548,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 615.6919937133789,
"epoch": 0.8805970149253731,
"grad_norm": 0.1027907207608223,
"learning_rate": 6.913417161825449e-07,
"loss": 0.0009,
"num_tokens": 40347154.0,
"reward": 0.7879464626312256,
"reward_std": 0.1539812944829464,
"rewards/accuracy_reward": 0.7879464328289032,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 579.677490234375,
"epoch": 0.8955223880597015,
"grad_norm": 0.09836462140083313,
"learning_rate": 6.7918397477265e-07,
"loss": 0.0268,
"num_tokens": 41016673.0,
"reward": 0.7477678805589676,
"reward_std": 0.1540593858808279,
"rewards/accuracy_reward": 0.7477678656578064,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 636.4364013671875,
"epoch": 0.9104477611940298,
"grad_norm": 0.0979558676481247,
"learning_rate": 6.669034296168854e-07,
"loss": 0.0155,
"num_tokens": 41744328.0,
"reward": 0.734375037252903,
"reward_std": 0.13192950701341033,
"rewards/accuracy_reward": 0.7343749925494194,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 593.4297180175781,
"epoch": 0.9253731343283582,
"grad_norm": 0.1013529971241951,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0192,
"num_tokens": 42416689.0,
"reward": 0.8013393208384514,
"reward_std": 0.15724862087517977,
"rewards/accuracy_reward": 0.801339291036129,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 653.9620895385742,
"epoch": 0.9402985074626866,
"grad_norm": 0.10375187546014786,
"learning_rate": 6.420076723519614e-07,
"loss": 0.0264,
"num_tokens": 43147383.0,
"reward": 0.7466518208384514,
"reward_std": 0.19015955366194248,
"rewards/accuracy_reward": 0.7466517835855484,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 568.0390853881836,
"epoch": 0.9552238805970149,
"grad_norm": 0.10790595412254333,
"learning_rate": 6.294095225512604e-07,
"loss": 0.0073,
"num_tokens": 43808258.0,
"reward": 0.7767857536673546,
"reward_std": 0.15112948138266802,
"rewards/accuracy_reward": 0.776785708963871,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 572.7187805175781,
"epoch": 0.9701492537313433,
"grad_norm": 0.10618384927511215,
"learning_rate": 6.167226819279527e-07,
"loss": 0.0079,
"num_tokens": 44469838.0,
"reward": 0.7399553954601288,
"reward_std": 0.15288866125047207,
"rewards/accuracy_reward": 0.7399553507566452,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 647.2789344787598,
"epoch": 0.9850746268656716,
"grad_norm": 0.09327281266450882,
"learning_rate": 6.039558454088795e-07,
"loss": 0.016,
"num_tokens": 45187944.0,
"reward": 0.7433036044239998,
"reward_std": 0.1536117848008871,
"rewards/accuracy_reward": 0.7433035746216774,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 594.9609642028809,
"epoch": 1.0149253731343284,
"grad_norm": 0.08279585093259811,
"learning_rate": 5.911177627460738e-07,
"loss": 0.0129,
"num_tokens": 45854645.0,
"reward": 0.7912946864962578,
"reward_std": 0.1345644756220281,
"rewards/accuracy_reward": 0.7912946492433548,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 649.5837326049805,
"epoch": 1.0298507462686568,
"grad_norm": 0.08794418722391129,
"learning_rate": 5.782172325201155e-07,
"loss": 0.0213,
"num_tokens": 46584856.0,
"reward": 0.7645089626312256,
"reward_std": 0.12895571067929268,
"rewards/accuracy_reward": 0.7645089328289032,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 580.2131881713867,
"epoch": 1.044776119402985,
"grad_norm": 0.10208556056022644,
"learning_rate": 5.652630961100258e-07,
"loss": 0.0136,
"num_tokens": 47245295.0,
"reward": 0.7622768208384514,
"reward_std": 0.12223039288073778,
"rewards/accuracy_reward": 0.7622767835855484,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 577.7857437133789,
"epoch": 1.0597014925373134,
"grad_norm": 0.10996542125940323,
"learning_rate": 5.522642316338268e-07,
"loss": 0.0203,
"num_tokens": 47905519.0,
"reward": 0.7935268357396126,
"reward_std": 0.13906850665807724,
"rewards/accuracy_reward": 0.7935267835855484,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 575.6328315734863,
"epoch": 1.0746268656716418,
"grad_norm": 0.10389033704996109,
"learning_rate": 5.392295478639225e-07,
"loss": 0.0182,
"num_tokens": 48571510.0,
"reward": 0.7901786118745804,
"reward_std": 0.1126860505901277,
"rewards/accuracy_reward": 0.7901785746216774,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 606.5357360839844,
"epoch": 1.0895522388059702,
"grad_norm": 0.09268435835838318,
"learning_rate": 5.26167978121472e-07,
"loss": 0.0128,
"num_tokens": 49252990.0,
"reward": 0.7868303954601288,
"reward_std": 0.14030383061617613,
"rewards/accuracy_reward": 0.7868303433060646,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 605.1551513671875,
"epoch": 1.1044776119402986,
"grad_norm": 0.11817800253629684,
"learning_rate": 5.130884741539366e-07,
"loss": 0.0319,
"num_tokens": 49943121.0,
"reward": 0.6997768208384514,
"reward_std": 0.17450424656271935,
"rewards/accuracy_reward": 0.6997767873108387,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 575.7511291503906,
"epoch": 1.1194029850746268,
"grad_norm": 0.11024966835975647,
"learning_rate": 5e-07,
"loss": 0.0089,
"num_tokens": 50587458.0,
"reward": 0.7901786118745804,
"reward_std": 0.15706316381692886,
"rewards/accuracy_reward": 0.7901785671710968,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 588.0446662902832,
"epoch": 1.1343283582089552,
"grad_norm": 0.10334320366382599,
"learning_rate": 4.869115258460634e-07,
"loss": 0.0111,
"num_tokens": 51253962.0,
"reward": 0.7678571790456772,
"reward_std": 0.13136567501351237,
"rewards/accuracy_reward": 0.7678571492433548,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 579.7600708007812,
"epoch": 1.1492537313432836,
"grad_norm": 0.0887996256351471,
"learning_rate": 4.7383202187852804e-07,
"loss": 0.0138,
"num_tokens": 51915475.0,
"reward": 0.8258928954601288,
"reward_std": 0.11524363234639168,
"rewards/accuracy_reward": 0.8258928582072258,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 502.2779235839844,
"epoch": 1.164179104477612,
"grad_norm": 0.10169437527656555,
"learning_rate": 4.6077045213607755e-07,
"loss": 0.009,
"num_tokens": 52506180.0,
"reward": 0.8459821790456772,
"reward_std": 0.09597061527892947,
"rewards/accuracy_reward": 0.8459821343421936,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 619.1116333007812,
"epoch": 1.1791044776119404,
"grad_norm": 0.0879545658826828,
"learning_rate": 4.477357683661733e-07,
"loss": 0.0187,
"num_tokens": 53202944.0,
"reward": 0.7243303954601288,
"reward_std": 0.14004420721903443,
"rewards/accuracy_reward": 0.7243303507566452,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 620.1060485839844,
"epoch": 1.1940298507462686,
"grad_norm": 0.10439935326576233,
"learning_rate": 4.347369038899743e-07,
"loss": 0.0064,
"num_tokens": 53904935.0,
"reward": 0.7511161044239998,
"reward_std": 0.1467349175363779,
"rewards/accuracy_reward": 0.7511160634458065,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 634.0725631713867,
"epoch": 1.208955223880597,
"grad_norm": 0.10633595287799835,
"learning_rate": 4.2178276747988444e-07,
"loss": 0.0142,
"num_tokens": 54614712.0,
"reward": 0.697544664144516,
"reward_std": 0.14913531299680471,
"rewards/accuracy_reward": 0.6975446492433548,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 590.0245780944824,
"epoch": 1.2238805970149254,
"grad_norm": 0.09192880988121033,
"learning_rate": 4.0888223725392624e-07,
"loss": 0.0119,
"num_tokens": 55283518.0,
"reward": 0.7555803880095482,
"reward_std": 0.11814074404537678,
"rewards/accuracy_reward": 0.7555803656578064,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 551.6027030944824,
"epoch": 1.2388059701492538,
"grad_norm": 0.09458038955926895,
"learning_rate": 3.960441545911204e-07,
"loss": 0.0167,
"num_tokens": 55930090.0,
"reward": 0.7935268208384514,
"reward_std": 0.15564057044684887,
"rewards/accuracy_reward": 0.793526791036129,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 599.2087249755859,
"epoch": 1.2537313432835822,
"grad_norm": 0.2296370565891266,
"learning_rate": 3.8327731807204744e-07,
"loss": 0.0074,
"num_tokens": 56610981.0,
"reward": 0.7232143133878708,
"reward_std": 0.1519518168643117,
"rewards/accuracy_reward": 0.723214291036129,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 599.6797103881836,
"epoch": 1.2686567164179103,
"grad_norm": 0.11104385554790497,
"learning_rate": 3.7059047744873955e-07,
"loss": 0.0174,
"num_tokens": 57302150.0,
"reward": 0.7466518208384514,
"reward_std": 0.1660846211016178,
"rewards/accuracy_reward": 0.746651791036129,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 581.4330596923828,
"epoch": 1.2835820895522387,
"grad_norm": 0.08728226274251938,
"learning_rate": 3.5799232764803867e-07,
"loss": 0.0137,
"num_tokens": 57956850.0,
"reward": 0.8035714626312256,
"reward_std": 0.13602055981755257,
"rewards/accuracy_reward": 0.8035714328289032,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 594.2500228881836,
"epoch": 1.2985074626865671,
"grad_norm": 0.07698384672403336,
"learning_rate": 3.454915028125263e-07,
"loss": 0.0094,
"num_tokens": 58627226.0,
"reward": 0.7165178880095482,
"reward_std": 0.10528812417760491,
"rewards/accuracy_reward": 0.7165178582072258,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 612.5201187133789,
"epoch": 1.3134328358208955,
"grad_norm": 0.09053296595811844,
"learning_rate": 3.330965703831146e-07,
"loss": 0.0179,
"num_tokens": 59323436.0,
"reward": 0.7477678805589676,
"reward_std": 0.128175038844347,
"rewards/accuracy_reward": 0.7477678656578064,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 564.0803680419922,
"epoch": 1.328358208955224,
"grad_norm": 0.1006743460893631,
"learning_rate": 3.2081602522734985e-07,
"loss": 0.0006,
"num_tokens": 59982468.0,
"reward": 0.8537946864962578,
"reward_std": 0.11889003310352564,
"rewards/accuracy_reward": 0.8537946417927742,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 543.8102951049805,
"epoch": 1.3432835820895521,
"grad_norm": 0.10308939963579178,
"learning_rate": 3.086582838174551e-07,
"loss": 0.0275,
"num_tokens": 60610842.0,
"reward": 0.8113839700818062,
"reward_std": 0.13978207390755415,
"rewards/accuracy_reward": 0.811383917927742,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 607.444221496582,
"epoch": 1.3582089552238805,
"grad_norm": 0.10902010649442673,
"learning_rate": 2.9663167846209996e-07,
"loss": 0.0158,
"num_tokens": 61303472.0,
"reward": 0.714285746216774,
"reward_std": 0.14834184758365154,
"rewards/accuracy_reward": 0.714285708963871,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 543.1886367797852,
"epoch": 1.373134328358209,
"grad_norm": 0.10252662748098373,
"learning_rate": 2.847444515958523e-07,
"loss": 0.0222,
"num_tokens": 61937529.0,
"reward": 0.8281250447034836,
"reward_std": 0.136213474906981,
"rewards/accuracy_reward": 0.828125,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 602.7846336364746,
"epoch": 1.3880597014925373,
"grad_norm": 0.08475417643785477,
"learning_rate": 2.730047501302266e-07,
"loss": 0.0157,
"num_tokens": 62618920.0,
"reward": 0.7968750298023224,
"reward_std": 0.12377202790230513,
"rewards/accuracy_reward": 0.796875,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 605.2578353881836,
"epoch": 1.4029850746268657,
"grad_norm": 0.0900595411658287,
"learning_rate": 2.6142061987019574e-07,
"loss": 0.0148,
"num_tokens": 63296367.0,
"reward": 0.784598246216774,
"reward_std": 0.13316552620381117,
"rewards/accuracy_reward": 0.7845982238650322,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 563.7623062133789,
"epoch": 1.417910447761194,
"grad_norm": 0.08642500638961792,
"learning_rate": 2.500000000000001e-07,
"loss": 0.0067,
"num_tokens": 63959314.0,
"reward": 0.7645089626312256,
"reward_std": 0.12305664969608188,
"rewards/accuracy_reward": 0.7645089253783226,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 602.0781555175781,
"epoch": 1.4328358208955223,
"grad_norm": 0.10483755171298981,
"learning_rate": 2.387507176420256e-07,
"loss": 0.0038,
"num_tokens": 64645648.0,
"reward": 0.7388393133878708,
"reward_std": 0.12692945264279842,
"rewards/accuracy_reward": 0.738839291036129,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 634.3705749511719,
"epoch": 1.4477611940298507,
"grad_norm": 0.08823440223932266,
"learning_rate": 2.2768048249248644e-07,
"loss": 0.0214,
"num_tokens": 65358500.0,
"reward": 0.7645089626312256,
"reward_std": 0.12505152076482773,
"rewards/accuracy_reward": 0.7645089253783226,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 555.154052734375,
"epoch": 1.462686567164179,
"grad_norm": 0.11077926307916641,
"learning_rate": 2.167968815375837e-07,
"loss": 0.0077,
"num_tokens": 66006806.0,
"reward": 0.7756696715950966,
"reward_std": 0.12779708206653595,
"rewards/accuracy_reward": 0.7756696492433548,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 632.3147735595703,
"epoch": 1.4776119402985075,
"grad_norm": 0.13761980831623077,
"learning_rate": 2.0610737385376348e-07,
"loss": 0.0282,
"num_tokens": 66730488.0,
"reward": 0.7299107536673546,
"reward_std": 0.15939321741461754,
"rewards/accuracy_reward": 0.729910708963871,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 654.2366409301758,
"epoch": 1.4925373134328357,
"grad_norm": 0.11041553318500519,
"learning_rate": 1.9561928549563966e-07,
"loss": 0.0084,
"num_tokens": 67470004.0,
"reward": 0.670758955180645,
"reward_std": 0.15143328253179789,
"rewards/accuracy_reward": 0.6707589365541935,
"step": 99
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.09152326732873917,
"learning_rate": 1.8533980447508135e-07,
"loss": 0.0088,
"step": 100
},
{
"epoch": 1.5074626865671643,
"eval_clip_ratio": 0.0,
"eval_completion_length": 581.8313814301731,
"eval_loss": 0.016322219744324684,
"eval_num_tokens": 68121683.0,
"eval_reward": 0.7219673107123242,
"eval_reward_std": 0.16760184048732232,
"eval_rewards/accuracy_reward": 0.7219672777466268,
"eval_runtime": 7556.7096,
"eval_samples_per_second": 0.662,
"eval_steps_per_second": 0.006,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 590.9570579528809,
"epoch": 1.5223880597014925,
"grad_norm": 0.09221798926591873,
"learning_rate": 1.7527597583490823e-07,
"loss": 0.0013,
"num_tokens": 68805103.0,
"reward": 0.7901786044239998,
"reward_std": 0.1202246134635061,
"rewards/accuracy_reward": 0.7901785746216774,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 568.4319305419922,
"epoch": 1.537313432835821,
"grad_norm": 0.08379292488098145,
"learning_rate": 1.6543469682057104e-07,
"loss": 0.0114,
"num_tokens": 69465698.0,
"reward": 0.7633928954601288,
"reward_std": 0.10716536361724138,
"rewards/accuracy_reward": 0.7633928582072258,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 626.8917694091797,
"epoch": 1.5522388059701493,
"grad_norm": 0.09183470904827118,
"learning_rate": 1.5582271215312293e-07,
"loss": 0.0145,
"num_tokens": 70169185.0,
"reward": 0.753348246216774,
"reward_std": 0.12400025688111782,
"rewards/accuracy_reward": 0.7533482164144516,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 561.1886329650879,
"epoch": 1.5671641791044775,
"grad_norm": 0.11465712636709213,
"learning_rate": 1.4644660940672627e-07,
"loss": 0.0233,
"num_tokens": 70823314.0,
"reward": 0.7723214700818062,
"reward_std": 0.14902723766863346,
"rewards/accuracy_reward": 0.7723214328289032,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 595.260066986084,
"epoch": 1.582089552238806,
"grad_norm": 0.10910345613956451,
"learning_rate": 1.3731281449385628e-07,
"loss": 0.0132,
"num_tokens": 71509203.0,
"reward": 0.7834821790456772,
"reward_std": 0.13775580935180187,
"rewards/accuracy_reward": 0.7834821417927742,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 622.6629753112793,
"epoch": 1.5970149253731343,
"grad_norm": 0.10859620571136475,
"learning_rate": 1.284275872613028e-07,
"loss": -0.0033,
"num_tokens": 72213229.0,
"reward": 0.7332589700818062,
"reward_std": 0.13835103251039982,
"rewards/accuracy_reward": 0.7332589253783226,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 647.8147659301758,
"epoch": 1.6119402985074627,
"grad_norm": 0.12190267443656921,
"learning_rate": 1.1979701719998454e-07,
"loss": 0.0389,
"num_tokens": 72943663.0,
"reward": 0.7343750298023224,
"reward_std": 0.20019244402647018,
"rewards/accuracy_reward": 0.7343749925494194,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 584.2020416259766,
"epoch": 1.626865671641791,
"grad_norm": 0.09703335165977478,
"learning_rate": 1.1142701927151454e-07,
"loss": 0.0177,
"num_tokens": 73602268.0,
"reward": 0.8080357536673546,
"reward_std": 0.14263570308685303,
"rewards/accuracy_reward": 0.808035708963871,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 606.7857437133789,
"epoch": 1.6417910447761193,
"grad_norm": 0.10704245418310165,
"learning_rate": 1.0332332985438247e-07,
"loss": 0.0166,
"num_tokens": 74295444.0,
"reward": 0.7779018208384514,
"reward_std": 0.15495909843593836,
"rewards/accuracy_reward": 0.777901791036129,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 562.9475708007812,
"epoch": 1.6567164179104479,
"grad_norm": 0.08908524364233017,
"learning_rate": 9.549150281252632e-08,
"loss": 0.0095,
"num_tokens": 74940989.0,
"reward": 0.7834821790456772,
"reward_std": 0.10505919344723225,
"rewards/accuracy_reward": 0.7834821492433548,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 624.4922180175781,
"epoch": 1.671641791044776,
"grad_norm": 0.09753753989934921,
"learning_rate": 8.793690568899215e-08,
"loss": 0.0206,
"num_tokens": 75639342.0,
"reward": 0.7767857536673546,
"reward_std": 0.14068038668483496,
"rewards/accuracy_reward": 0.776785708963871,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 622.2422256469727,
"epoch": 1.6865671641791045,
"grad_norm": 0.09617064148187637,
"learning_rate": 8.066471602728803e-08,
"loss": 0.0037,
"num_tokens": 76329767.0,
"reward": 0.7946428954601288,
"reward_std": 0.11434714496135712,
"rewards/accuracy_reward": 0.7946428582072258,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 600.4765892028809,
"epoch": 1.7014925373134329,
"grad_norm": 0.08512350916862488,
"learning_rate": 7.36799178229539e-08,
"loss": 0.0183,
"num_tokens": 77010834.0,
"reward": 0.7366071715950966,
"reward_std": 0.10942125041037798,
"rewards/accuracy_reward": 0.7366071343421936,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 561.8080596923828,
"epoch": 1.716417910447761,
"grad_norm": 0.1167522743344307,
"learning_rate": 6.698729810778064e-08,
"loss": 0.0032,
"num_tokens": 77661150.0,
"reward": 0.7801339700818062,
"reward_std": 0.1291090790182352,
"rewards/accuracy_reward": 0.7801339253783226,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 627.9944458007812,
"epoch": 1.7313432835820897,
"grad_norm": 0.09695158898830414,
"learning_rate": 6.059144366901736e-08,
"loss": 0.0176,
"num_tokens": 78367609.0,
"reward": 0.7332589700818062,
"reward_std": 0.12523557152599096,
"rewards/accuracy_reward": 0.733258917927742,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 562.5022583007812,
"epoch": 1.7462686567164178,
"grad_norm": 0.08717180788516998,
"learning_rate": 5.44967379058161e-08,
"loss": 0.0087,
"num_tokens": 79007427.0,
"reward": 0.7689732387661934,
"reward_std": 0.10855362564325333,
"rewards/accuracy_reward": 0.768973208963871,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 622.9989051818848,
"epoch": 1.7611940298507462,
"grad_norm": 0.12031041085720062,
"learning_rate": 4.870735782506979e-08,
"loss": 0.0127,
"num_tokens": 79736314.0,
"reward": 0.725446455180645,
"reward_std": 0.16799716651439667,
"rewards/accuracy_reward": 0.725446417927742,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 629.5993537902832,
"epoch": 1.7761194029850746,
"grad_norm": 0.11032404005527496,
"learning_rate": 4.322727117869951e-08,
"loss": 0.0073,
"num_tokens": 80442075.0,
"reward": 0.7622768208384514,
"reward_std": 0.14162609539926052,
"rewards/accuracy_reward": 0.7622767835855484,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 597.2299423217773,
"epoch": 1.7910447761194028,
"grad_norm": 0.09072305262088776,
"learning_rate": 3.806023374435663e-08,
"loss": 0.0189,
"num_tokens": 81117833.0,
"reward": 0.7522321715950966,
"reward_std": 0.12486787885427475,
"rewards/accuracy_reward": 0.7522321417927742,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 599.5346298217773,
"epoch": 1.8059701492537314,
"grad_norm": 0.10171713680028915,
"learning_rate": 3.3209786751399184e-08,
"loss": 0.009,
"num_tokens": 81809056.0,
"reward": 0.7946428954601288,
"reward_std": 0.1557179531082511,
"rewards/accuracy_reward": 0.7946428507566452,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 579.971004486084,
"epoch": 1.8208955223880596,
"grad_norm": 0.09700023382902145,
"learning_rate": 2.8679254453910785e-08,
"loss": 0.0068,
"num_tokens": 82474246.0,
"reward": 0.7354911044239998,
"reward_std": 0.1312894057482481,
"rewards/accuracy_reward": 0.7354910746216774,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 620.3493499755859,
"epoch": 1.835820895522388,
"grad_norm": 0.09332066029310226,
"learning_rate": 2.4471741852423233e-08,
"loss": 0.0129,
"num_tokens": 83191831.0,
"reward": 0.7555803880095482,
"reward_std": 0.1526290439069271,
"rewards/accuracy_reward": 0.7555803582072258,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 618.1640853881836,
"epoch": 1.8507462686567164,
"grad_norm": 0.1387184113264084,
"learning_rate": 2.0590132565903473e-08,
"loss": 0.0281,
"num_tokens": 83893978.0,
"reward": 0.7700893133878708,
"reward_std": 0.17633870337158442,
"rewards/accuracy_reward": 0.770089291036129,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 581.0748023986816,
"epoch": 1.8656716417910446,
"grad_norm": 0.11000484973192215,
"learning_rate": 1.7037086855465898e-08,
"loss": 0.01,
"num_tokens": 84554317.0,
"reward": 0.7511161118745804,
"reward_std": 0.13362086936831474,
"rewards/accuracy_reward": 0.7511160671710968,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 628.025707244873,
"epoch": 1.8805970149253732,
"grad_norm": 0.0943480134010315,
"learning_rate": 1.3815039801161722e-08,
"loss": 0.0185,
"num_tokens": 85259028.0,
"reward": 0.7377232313156128,
"reward_std": 0.1586036691442132,
"rewards/accuracy_reward": 0.7377232238650322,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 545.2154235839844,
"epoch": 1.8955223880597014,
"grad_norm": 0.11781350523233414,
"learning_rate": 1.0926199633097154e-08,
"loss": 0.023,
"num_tokens": 85888949.0,
"reward": 0.787946455180645,
"reward_std": 0.13380562094971538,
"rewards/accuracy_reward": 0.7879464328289032,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 609.3426513671875,
"epoch": 1.9104477611940298,
"grad_norm": 0.11208699643611908,
"learning_rate": 8.372546218022746e-09,
"loss": 0.0213,
"num_tokens": 86586952.0,
"reward": 0.7243303805589676,
"reward_std": 0.1444790279492736,
"rewards/accuracy_reward": 0.7243303656578064,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 581.1094207763672,
"epoch": 1.9253731343283582,
"grad_norm": 0.09035325050354004,
"learning_rate": 6.15582970243117e-09,
"loss": 0.0098,
"num_tokens": 87253226.0,
"reward": 0.8147321790456772,
"reward_std": 0.11614769464358687,
"rewards/accuracy_reward": 0.8147321492433548,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 607.8404235839844,
"epoch": 1.9402985074626866,
"grad_norm": 0.11255651712417603,
"learning_rate": 4.277569313094809e-09,
"loss": 0.0127,
"num_tokens": 87936411.0,
"reward": 0.7700893208384514,
"reward_std": 0.17164426296949387,
"rewards/accuracy_reward": 0.7700892761349678,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 625.9174423217773,
"epoch": 1.955223880597015,
"grad_norm": 0.09853976219892502,
"learning_rate": 2.739052315863355e-09,
"loss": 0.0062,
"num_tokens": 88654209.0,
"reward": 0.7131696715950966,
"reward_std": 0.1297498755156994,
"rewards/accuracy_reward": 0.7131696417927742,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 566.3649787902832,
"epoch": 1.9701492537313432,
"grad_norm": 0.11755723506212234,
"learning_rate": 1.541333133436018e-09,
"loss": 0.003,
"num_tokens": 89303936.0,
"reward": 0.7968750447034836,
"reward_std": 0.16296614985913038,
"rewards/accuracy_reward": 0.7968750074505806,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 602.0601844787598,
"epoch": 1.9850746268656716,
"grad_norm": 0.09191662073135376,
"learning_rate": 6.852326227130833e-10,
"loss": 0.0104,
"num_tokens": 89996910.0,
"reward": 0.7433036118745804,
"reward_std": 0.12050722911953926,
"rewards/accuracy_reward": 0.7433035671710968,
"step": 132
},
{
"epoch": 1.9850746268656716,
"step": 132,
"total_flos": 0.0,
"train_loss": 0.019297132296328942,
"train_runtime": 35491.2022,
"train_samples_per_second": 0.423,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1,
"max_steps": 134,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}