chenggong
Model save
9e921c7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.997867803837953,
"eval_steps": 117,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 642.2232513427734,
"epoch": 0.008528784648187633,
"grad_norm": 0.29757431149482727,
"kl": 0.0,
"learning_rate": 6.382978723404255e-08,
"loss": 0.0472,
"reward": 0.6718750298023224,
"reward_std": 0.3231801837682724,
"rewards/accuracy_reward": 0.6640625298023224,
"rewards/format_reward": 0.007812500349245965,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 595.3912038803101,
"epoch": 0.042643923240938165,
"grad_norm": 0.35624048113822937,
"kl": 0.0001455843448638916,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.0595,
"reward": 0.6908482443541288,
"reward_std": 0.35730565479025245,
"rewards/accuracy_reward": 0.6819196743890643,
"rewards/format_reward": 0.008928571827709675,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 602.0033729553222,
"epoch": 0.08528784648187633,
"grad_norm": 0.2918407618999481,
"kl": 0.00026810169219970703,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0569,
"reward": 0.6535714589059353,
"reward_std": 0.3369171965867281,
"rewards/accuracy_reward": 0.6473214581608773,
"rewards/format_reward": 0.006250000232830644,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 589.1335098266602,
"epoch": 0.1279317697228145,
"grad_norm": 0.4812746047973633,
"kl": 0.24185171127319335,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0832,
"reward": 0.6883928894996643,
"reward_std": 0.3438419926911592,
"rewards/accuracy_reward": 0.6819196753203869,
"rewards/format_reward": 0.006473214668221772,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 601.767658996582,
"epoch": 0.17057569296375266,
"grad_norm": 0.31131526827812195,
"kl": 0.0008780479431152344,
"learning_rate": 1.276595744680851e-06,
"loss": 0.0705,
"reward": 0.6939732521772385,
"reward_std": 0.3367977850139141,
"rewards/accuracy_reward": 0.6879464671015739,
"rewards/format_reward": 0.006026786030270159,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 588.0732414245606,
"epoch": 0.21321961620469082,
"grad_norm": 0.2977014183998108,
"kl": 0.0019563674926757813,
"learning_rate": 1.5957446808510639e-06,
"loss": 0.0833,
"reward": 0.7379464626312255,
"reward_std": 0.3033597592264414,
"rewards/accuracy_reward": 0.7339286029338836,
"rewards/format_reward": 0.004017857322469354,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 601.1149803161621,
"epoch": 0.255863539445629,
"grad_norm": 0.7156150937080383,
"kl": 0.015515518188476563,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.0826,
"reward": 0.7363839671015739,
"reward_std": 0.28360147699713706,
"rewards/accuracy_reward": 0.7348214641213417,
"rewards/format_reward": 0.0015625000698491931,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 591.7576164245605,
"epoch": 0.29850746268656714,
"grad_norm": 0.339713990688324,
"kl": 0.008817577362060547,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.0708,
"reward": 0.7671875342726707,
"reward_std": 0.24262561108916997,
"rewards/accuracy_reward": 0.766294677555561,
"rewards/format_reward": 0.0008928571827709675,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 583.7185531616211,
"epoch": 0.3411513859275053,
"grad_norm": 1.7096885442733765,
"kl": 0.027852249145507813,
"learning_rate": 2.553191489361702e-06,
"loss": 0.0584,
"reward": 0.7622768223285675,
"reward_std": 0.2263224059715867,
"rewards/accuracy_reward": 0.7602678939700127,
"rewards/format_reward": 0.002008928661234677,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 596.3366325378418,
"epoch": 0.3837953091684435,
"grad_norm": 0.3490237295627594,
"kl": 0.004991340637207031,
"learning_rate": 2.872340425531915e-06,
"loss": 0.0615,
"reward": 0.7779018193483352,
"reward_std": 0.23310719933360816,
"rewards/accuracy_reward": 0.7723214626312256,
"rewards/format_reward": 0.005580357392318547,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 584.2732421875,
"epoch": 0.42643923240938164,
"grad_norm": 0.8813576698303223,
"kl": 0.008109092712402344,
"learning_rate": 2.9996241442585123e-06,
"loss": 0.0564,
"reward": 0.7683036044239998,
"reward_std": 0.2527444614097476,
"rewards/accuracy_reward": 0.7486607477068901,
"rewards/format_reward": 0.019642858114093543,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 575.4134185791015,
"epoch": 0.4690831556503198,
"grad_norm": 1.0579967498779297,
"kl": 0.013035964965820313,
"learning_rate": 2.9973279301399446e-06,
"loss": 0.0374,
"reward": 0.8272321790456771,
"reward_std": 0.3260661941021681,
"rewards/accuracy_reward": 0.7531250283122063,
"rewards/format_reward": 0.07410714614670724,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 591.4556053161621,
"epoch": 0.511727078891258,
"grad_norm": 0.3846156895160675,
"kl": 0.018450927734375,
"learning_rate": 2.992947502998804e-06,
"loss": 0.0393,
"reward": 0.8857143223285675,
"reward_std": 0.37481620348989964,
"rewards/accuracy_reward": 0.7491071775555611,
"rewards/format_reward": 0.1366071492433548,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 565.9245796203613,
"epoch": 0.5543710021321961,
"grad_norm": 0.6243420243263245,
"kl": 0.02995452880859375,
"learning_rate": 2.9864889601923268e-06,
"loss": 0.0221,
"reward": 0.9781250506639481,
"reward_std": 0.44858795329928397,
"rewards/accuracy_reward": 0.7267857432365418,
"rewards/format_reward": 0.2513392990455031,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 586.0239158630371,
"epoch": 0.5970149253731343,
"grad_norm": 26.86927032470703,
"kl": 0.04704437255859375,
"learning_rate": 2.977961291721137e-06,
"loss": 0.0185,
"reward": 1.0526786223053932,
"reward_std": 0.482559996843338,
"rewards/accuracy_reward": 0.710044676065445,
"rewards/format_reward": 0.34263394549489024,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 589.8480171203613,
"epoch": 0.6396588486140725,
"grad_norm": 0.9474160075187683,
"kl": 0.060797119140625,
"learning_rate": 2.9673763677155655e-06,
"loss": 0.0388,
"reward": 1.138169701397419,
"reward_std": 0.5718358919024468,
"rewards/accuracy_reward": 0.6812500283122063,
"rewards/format_reward": 0.4569196656346321,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 596.6254699707031,
"epoch": 0.6823027718550106,
"grad_norm": 3.176877975463867,
"kl": 0.4295654296875,
"learning_rate": 2.9547489219129666e-06,
"loss": 0.0459,
"reward": 0.9727679073810578,
"reward_std": 0.7046953186392784,
"rewards/accuracy_reward": 0.4734375223517418,
"rewards/format_reward": 0.4993303783237934,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 633.0022583007812,
"epoch": 0.7249466950959488,
"grad_norm": 16.31062889099121,
"kl": 0.807666015625,
"learning_rate": 2.9400965311490175e-06,
"loss": 0.067,
"reward": 1.0857143267989158,
"reward_std": 0.7520077109336853,
"rewards/accuracy_reward": 0.5133928790688514,
"rewards/format_reward": 0.5723214574158192,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 640.6631980895996,
"epoch": 0.767590618336887,
"grad_norm": 4.319429397583008,
"kl": 0.7376708984375,
"learning_rate": 2.9234395908915565e-06,
"loss": 0.1219,
"reward": 1.3002232700586318,
"reward_std": 0.6729222685098648,
"rewards/accuracy_reward": 0.6341518126428127,
"rewards/format_reward": 0.6660714603960514,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 637.9712371826172,
"epoch": 0.8102345415778252,
"grad_norm": 24.02198028564453,
"kl": 6.8146484375,
"learning_rate": 2.904801286851009e-06,
"loss": 0.5369,
"reward": 1.4430804237723351,
"reward_std": 0.5716752491891384,
"rewards/accuracy_reward": 0.6941964641213417,
"rewards/format_reward": 0.7488839581608773,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 592.6529304504395,
"epoch": 0.8528784648187633,
"grad_norm": 4.384208679199219,
"kl": 0.373583984375,
"learning_rate": 2.884207562706925e-06,
"loss": 0.0912,
"reward": 1.5466518580913544,
"reward_std": 0.4958520784974098,
"rewards/accuracy_reward": 0.7368303924798966,
"rewards/format_reward": 0.8098214656114578,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 570.5218971252441,
"epoch": 0.8955223880597015,
"grad_norm": 2.8563225269317627,
"kl": 1.043408203125,
"learning_rate": 2.8616870839955444e-06,
"loss": 0.0561,
"reward": 1.5991072207689285,
"reward_std": 0.4629118986427784,
"rewards/accuracy_reward": 0.7482143253087997,
"rewards/format_reward": 0.850892896950245,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 607.3071670532227,
"epoch": 0.9381663113006397,
"grad_norm": 2.282275676727295,
"kl": 1.5532958984375,
"learning_rate": 2.837271198208662e-06,
"loss": 0.0403,
"reward": 1.5997768580913543,
"reward_std": 0.48762847371399404,
"rewards/accuracy_reward": 0.7437500342726707,
"rewards/format_reward": 0.8560268253087997,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 635.0283782958984,
"epoch": 0.9808102345415778,
"grad_norm": 3.15806245803833,
"kl": 2.1205078125,
"learning_rate": 2.8109938911593322e-06,
"loss": 0.0641,
"reward": 1.5205357849597931,
"reward_std": 0.5536904223263264,
"rewards/accuracy_reward": 0.7011161044239997,
"rewards/format_reward": 0.8194196805357933,
"step": 115
},
{
"epoch": 0.997867803837953,
"eval_clip_ratio": 0.0,
"eval_completion_length": 669.1385478670635,
"eval_kl": 4.540426587301587,
"eval_loss": 0.20850437879562378,
"eval_reward": 1.462868539113847,
"eval_reward_std": 0.5733831548501575,
"eval_rewards/accuracy_reward": 0.661422934797075,
"eval_rewards/format_reward": 0.8014456156700377,
"eval_runtime": 903.45,
"eval_samples_per_second": 0.553,
"eval_steps_per_second": 0.006,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 688.7388717651368,
"epoch": 1.0255863539445629,
"grad_norm": 5.824474811553955,
"kl": 1.586767578125,
"learning_rate": 2.7828917396751474e-06,
"loss": 0.065,
"reward": 1.5044643521308898,
"reward_std": 0.5775617159903049,
"rewards/accuracy_reward": 0.703794676065445,
"rewards/format_reward": 0.8006696850061417,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 726.9741363525391,
"epoch": 1.068230277185501,
"grad_norm": 5.099586009979248,
"kl": 6.495361328125,
"learning_rate": 2.753003860684943e-06,
"loss": 0.1463,
"reward": 1.5174107819795608,
"reward_std": 0.5776193253695965,
"rewards/accuracy_reward": 0.6948661051690579,
"rewards/format_reward": 0.8225446805357933,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 741.8879806518555,
"epoch": 1.1108742004264391,
"grad_norm": 4.33757209777832,
"kl": 2.81171875,
"learning_rate": 2.721371856769793e-06,
"loss": 0.1539,
"reward": 1.4988839864730834,
"reward_std": 0.5920813702046871,
"rewards/accuracy_reward": 0.6816964618861675,
"rewards/format_reward": 0.8171875372529029,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 712.3254806518555,
"epoch": 1.1535181236673775,
"grad_norm": 3.2068488597869873,
"kl": 1.8530517578125,
"learning_rate": 2.688039758254093e-06,
"loss": 0.0988,
"reward": 1.574107214808464,
"reward_std": 0.5515650272369385,
"rewards/accuracy_reward": 0.7183035999536515,
"rewards/format_reward": 0.8558036118745804,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 703.3225753784179,
"epoch": 1.1961620469083156,
"grad_norm": 1.6553071737289429,
"kl": 0.72064208984375,
"learning_rate": 2.65305396191733e-06,
"loss": 0.0742,
"reward": 1.5618304312229156,
"reward_std": 0.5021443270146847,
"rewards/accuracy_reward": 0.6995536029338837,
"rewards/format_reward": 0.8622768253087998,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 685.3361877441406,
"epoch": 1.2388059701492538,
"grad_norm": 0.40964633226394653,
"kl": 0.66866455078125,
"learning_rate": 2.61646316641186e-06,
"loss": 0.0625,
"reward": 1.5828125685453416,
"reward_std": 0.47754584178328513,
"rewards/accuracy_reward": 0.7189732477068901,
"rewards/format_reward": 0.8638393208384514,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 653.287085723877,
"epoch": 1.2814498933901919,
"grad_norm": 1.1264029741287231,
"kl": 0.331591796875,
"learning_rate": 2.5783183044765715e-06,
"loss": 0.0573,
"reward": 1.6180804371833801,
"reward_std": 0.44253902062773703,
"rewards/accuracy_reward": 0.7350446723401547,
"rewards/format_reward": 0.8830357536673545,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 651.8187789916992,
"epoch": 1.32409381663113,
"grad_norm": 0.36573150753974915,
"kl": 0.68359375,
"learning_rate": 2.5386724720408135e-06,
"loss": 0.0651,
"reward": 1.6392857909202576,
"reward_std": 0.39724560379981994,
"rewards/accuracy_reward": 0.7477678887546062,
"rewards/format_reward": 0.8915178969502449,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 644.0553833007813,
"epoch": 1.3667377398720681,
"grad_norm": 0.3527699410915375,
"kl": 0.289678955078125,
"learning_rate": 2.49758085431725e-06,
"loss": 0.0217,
"reward": 1.7120536595582962,
"reward_std": 0.3335069250315428,
"rewards/accuracy_reward": 0.7790178924798965,
"rewards/format_reward": 0.9330357566475869,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 634.3732429504395,
"epoch": 1.4093816631130065,
"grad_norm": 2.3950164318084717,
"kl": 0.7220977783203125,
"learning_rate": 2.455100648986533e-06,
"loss": 0.04,
"reward": 1.703571507334709,
"reward_std": 0.31777132861316204,
"rewards/accuracy_reward": 0.7852678909897804,
"rewards/format_reward": 0.9183036163449287,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 649.6995819091796,
"epoch": 1.4520255863539446,
"grad_norm": 0.4751293361186981,
"kl": 0.80157470703125,
"learning_rate": 2.4112909865807053e-06,
"loss": 0.0806,
"reward": 1.6685268580913544,
"reward_std": 0.35685542970895767,
"rewards/accuracy_reward": 0.7654018208384514,
"rewards/format_reward": 0.9031250387430191,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 662.9319503784179,
"epoch": 1.4946695095948828,
"grad_norm": 3.0650556087493896,
"kl": 0.2190185546875,
"learning_rate": 2.366212848176164e-06,
"loss": 0.0385,
"reward": 1.6214286386966705,
"reward_std": 0.3651090878993273,
"rewards/accuracy_reward": 0.7325893193483353,
"rewards/format_reward": 0.8888393297791481,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 638.3734603881836,
"epoch": 1.537313432835821,
"grad_norm": 1.002023458480835,
"kl": 0.2518310546875,
"learning_rate": 2.319928980510752e-06,
"loss": 0.0424,
"reward": 1.6872768700122833,
"reward_std": 0.36593810133635996,
"rewards/accuracy_reward": 0.7772321775555611,
"rewards/format_reward": 0.9100446820259094,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 647.8556060791016,
"epoch": 1.579957356076759,
"grad_norm": 1.074832558631897,
"kl": 0.247601318359375,
"learning_rate": 2.272503808643123e-06,
"loss": 0.0509,
"reward": 1.68526793718338,
"reward_std": 0.3566482378169894,
"rewards/accuracy_reward": 0.7754464611411095,
"rewards/format_reward": 0.9098214745521546,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 655.2254753112793,
"epoch": 1.6226012793176974,
"grad_norm": 0.493257999420166,
"kl": 0.286126708984375,
"learning_rate": 2.2240033462759628e-06,
"loss": 0.0513,
"reward": 1.6654018700122832,
"reward_std": 0.39258838426321746,
"rewards/accuracy_reward": 0.7604911088943481,
"rewards/format_reward": 0.9049107551574707,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 649.5230148315429,
"epoch": 1.6652452025586353,
"grad_norm": 1.0757510662078857,
"kl": 0.28671875,
"learning_rate": 2.1744951038678905e-06,
"loss": 0.0627,
"reward": 1.6694197177886962,
"reward_std": 0.36208211332559587,
"rewards/accuracy_reward": 0.7629464589059353,
"rewards/format_reward": 0.9064732536673545,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 642.9241394042969,
"epoch": 1.7078891257995736,
"grad_norm": 2.5317819118499756,
"kl": 0.55604248046875,
"learning_rate": 2.124047994661941e-06,
"loss": 0.0672,
"reward": 1.636160781979561,
"reward_std": 0.3948578182607889,
"rewards/accuracy_reward": 0.7459821730852128,
"rewards/format_reward": 0.8901786103844642,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 632.1971221923828,
"epoch": 1.7505330490405118,
"grad_norm": 1.68379545211792,
"kl": 0.6220703125,
"learning_rate": 2.072732238761434e-06,
"loss": 0.0781,
"reward": 1.5113840013742448,
"reward_std": 0.4758596081286669,
"rewards/accuracy_reward": 0.6984375290572643,
"rewards/format_reward": 0.8129464641213417,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 623.4424430847168,
"epoch": 1.79317697228145,
"grad_norm": 3.5116868019104004,
"kl": 0.685400390625,
"learning_rate": 2.0206192653867536e-06,
"loss": 0.0961,
"reward": 1.4203125566244126,
"reward_std": 0.5285798832774162,
"rewards/accuracy_reward": 0.6421875290572643,
"rewards/format_reward": 0.7781250327825546,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 614.3951126098633,
"epoch": 1.835820895522388,
"grad_norm": 1.31536865234375,
"kl": 0.9739013671875,
"learning_rate": 1.967781613449095e-06,
"loss": 0.128,
"reward": 1.3087054163217544,
"reward_std": 0.6106407694518566,
"rewards/accuracy_reward": 0.6040178872644901,
"rewards/format_reward": 0.7046875283122063,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 604.7236915588379,
"epoch": 1.8784648187633262,
"grad_norm": 2.7701919078826904,
"kl": 1.267333984375,
"learning_rate": 1.9142928305795637e-06,
"loss": 0.1807,
"reward": 1.0964286282658577,
"reward_std": 0.6850748583674431,
"rewards/accuracy_reward": 0.5145089499652385,
"rewards/format_reward": 0.5819196663796902,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 570.4515830993653,
"epoch": 1.9211087420042645,
"grad_norm": 1.383189082145691,
"kl": 0.865283203125,
"learning_rate": 1.8602273707541886e-06,
"loss": 0.1254,
"reward": 1.1287946969270706,
"reward_std": 0.6823262549936772,
"rewards/accuracy_reward": 0.5196428805589676,
"rewards/format_reward": 0.6091518118977547,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 618.1281509399414,
"epoch": 1.9637526652452024,
"grad_norm": 1.875443935394287,
"kl": 1.5638671875,
"learning_rate": 1.8056604906573418e-06,
"loss": 0.1544,
"reward": 1.001562552154064,
"reward_std": 0.6830957509577275,
"rewards/accuracy_reward": 0.4718750223517418,
"rewards/format_reward": 0.5296875208616256,
"step": 230
},
{
"epoch": 1.997867803837953,
"eval_clip_ratio": 0.0,
"eval_completion_length": 566.5657886323474,
"eval_kl": 1.0294828869047619,
"eval_loss": 0.09497759491205215,
"eval_reward": 1.096938827681163,
"eval_reward_std": 0.70041947516184,
"eval_rewards/accuracy_reward": 0.5144558056952462,
"eval_rewards/format_reward": 0.5824830167823367,
"eval_runtime": 810.3084,
"eval_samples_per_second": 0.617,
"eval_steps_per_second": 0.006,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 581.7695983886719,
"epoch": 2.008528784648188,
"grad_norm": 2.1314055919647217,
"kl": 0.802685546875,
"learning_rate": 1.7506681449278226e-06,
"loss": 0.1221,
"reward": 1.1642857670783997,
"reward_std": 0.700273784250021,
"rewards/accuracy_reward": 0.5546875216066838,
"rewards/format_reward": 0.6095982432365418,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 590.5163177490234,
"epoch": 2.0511727078891258,
"grad_norm": 4.411650657653809,
"kl": 1.93984375,
"learning_rate": 1.6953268804334257e-06,
"loss": 0.1485,
"reward": 0.9665178924798965,
"reward_std": 0.7336546629667282,
"rewards/accuracy_reward": 0.4665178798139095,
"rewards/format_reward": 0.5000000223517418,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 588.587077331543,
"epoch": 2.093816631130064,
"grad_norm": 1.1795450448989868,
"kl": 1.27900390625,
"learning_rate": 1.6397137297211436e-06,
"loss": 0.1474,
"reward": 0.9912946864962577,
"reward_std": 0.7124211765825749,
"rewards/accuracy_reward": 0.48437502309679986,
"rewards/format_reward": 0.5069196656346321,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 577.4261459350586,
"epoch": 2.136460554371002,
"grad_norm": 1.2404223680496216,
"kl": 1.08720703125,
"learning_rate": 1.5839061037913395e-06,
"loss": 0.0908,
"reward": 0.9372768178582191,
"reward_std": 0.6845876269042492,
"rewards/accuracy_reward": 0.4435268074274063,
"rewards/format_reward": 0.4937500201165676,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 538.9498016357422,
"epoch": 2.1791044776119404,
"grad_norm": 5.910031318664551,
"kl": 1.4826171875,
"learning_rate": 1.527981684345115e-06,
"loss": 0.0402,
"reward": 1.0573661237955094,
"reward_std": 0.7098784282803535,
"rewards/accuracy_reward": 0.5082589484751224,
"rewards/format_reward": 0.5491071656346321,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 497.1082817077637,
"epoch": 2.2217484008528783,
"grad_norm": 1.6820718050003052,
"kl": 1.875634765625,
"learning_rate": 1.4720183156548855e-06,
"loss": -0.0382,
"reward": 1.1102679178118706,
"reward_std": 0.6880769707262516,
"rewards/accuracy_reward": 0.528125026077032,
"rewards/format_reward": 0.5821428827941417,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 541.0748046875,
"epoch": 2.2643923240938166,
"grad_norm": 2.385998249053955,
"kl": 2.6990234375,
"learning_rate": 1.4160938962086612e-06,
"loss": 0.0057,
"reward": 1.12098218947649,
"reward_std": 0.6896743580698967,
"rewards/accuracy_reward": 0.5265625223517418,
"rewards/format_reward": 0.5944196678698063,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 556.8611907958984,
"epoch": 2.307036247334755,
"grad_norm": 2.2787282466888428,
"kl": 3.4162109375,
"learning_rate": 1.3602862702788567e-06,
"loss": 0.0818,
"reward": 1.064062552154064,
"reward_std": 0.6750895470380783,
"rewards/accuracy_reward": 0.4955357354134321,
"rewards/format_reward": 0.5685268133878708,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 584.2448959350586,
"epoch": 2.349680170575693,
"grad_norm": 2.604217290878296,
"kl": 3.58203125,
"learning_rate": 1.3046731195665748e-06,
"loss": 0.1074,
"reward": 1.0066964760422707,
"reward_std": 0.7123509004712105,
"rewards/accuracy_reward": 0.47566966339945793,
"rewards/format_reward": 0.5310268089175224,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 574.1310523986816,
"epoch": 2.3923240938166312,
"grad_norm": 2.9245572090148926,
"kl": 3.5134765625,
"learning_rate": 1.2493318550721775e-06,
"loss": 0.1235,
"reward": 1.004464328289032,
"reward_std": 0.7083872497081757,
"rewards/accuracy_reward": 0.4734375201165676,
"rewards/format_reward": 0.5310268081724644,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 589.7712295532226,
"epoch": 2.434968017057569,
"grad_norm": 7.597701072692871,
"kl": 3.9734375,
"learning_rate": 1.1943395093426585e-06,
"loss": 0.2007,
"reward": 1.0401786118745804,
"reward_std": 0.7139236360788346,
"rewards/accuracy_reward": 0.4946428775787354,
"rewards/format_reward": 0.5455357372760773,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 598.5185546875,
"epoch": 2.4776119402985075,
"grad_norm": 16.614126205444336,
"kl": 4.1701171875,
"learning_rate": 1.1397726292458115e-06,
"loss": 0.2304,
"reward": 1.018303619325161,
"reward_std": 0.715107049047947,
"rewards/accuracy_reward": 0.4877232380211353,
"rewards/format_reward": 0.5305803798139095,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 613.7118576049804,
"epoch": 2.520255863539446,
"grad_norm": 10.53600788116455,
"kl": 2.83359375,
"learning_rate": 1.085707169420437e-06,
"loss": 0.1637,
"reward": 0.9716518297791481,
"reward_std": 0.7042301401495934,
"rewards/accuracy_reward": 0.46294644884765146,
"rewards/format_reward": 0.5087053842842579,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 615.3317260742188,
"epoch": 2.5628997867803838,
"grad_norm": 8.81041431427002,
"kl": 4.831640625,
"learning_rate": 1.0322183865509054e-06,
"loss": 0.2796,
"reward": 1.0026786133646965,
"reward_std": 0.7037848606705666,
"rewards/accuracy_reward": 0.47232145331799985,
"rewards/format_reward": 0.5303571715950965,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 602.5529251098633,
"epoch": 2.605543710021322,
"grad_norm": 35.90947723388672,
"kl": 4.2283203125,
"learning_rate": 9.793807346132464e-07,
"loss": 0.2417,
"reward": 0.9839286230504513,
"reward_std": 0.728691854327917,
"rewards/accuracy_reward": 0.4665178794413805,
"rewards/format_reward": 0.5174107391387224,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 605.5623016357422,
"epoch": 2.64818763326226,
"grad_norm": 13.354021072387695,
"kl": 4.1529296875,
"learning_rate": 9.272677612385667e-07,
"loss": 0.2264,
"reward": 1.0729911237955094,
"reward_std": 0.7242146201431752,
"rewards/accuracy_reward": 0.5120535925030708,
"rewards/format_reward": 0.5609375245869159,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 609.2933280944824,
"epoch": 2.6908315565031984,
"grad_norm": 12.610240936279297,
"kl": 4.536328125,
"learning_rate": 8.759520053380591e-07,
"loss": 0.2337,
"reward": 1.0473214656114578,
"reward_std": 0.7167247369885444,
"rewards/accuracy_reward": 0.5008928783237934,
"rewards/format_reward": 0.5464285984635353,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 600.3977935791015,
"epoch": 2.7334754797441363,
"grad_norm": 4.791992664337158,
"kl": 3.415234375,
"learning_rate": 8.255048961321088e-07,
"loss": 0.1756,
"reward": 1.094419687986374,
"reward_std": 0.6878940530121327,
"rewards/accuracy_reward": 0.5276786014437675,
"rewards/format_reward": 0.5667410984635353,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 602.0634201049804,
"epoch": 2.7761194029850746,
"grad_norm": 6.484261512756348,
"kl": 4.11328125,
"learning_rate": 7.759966537240373e-07,
"loss": 0.2121,
"reward": 1.0223214700818062,
"reward_std": 0.7285358726978302,
"rewards/accuracy_reward": 0.48816966116428373,
"rewards/format_reward": 0.5341518081724643,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 589.43842086792,
"epoch": 2.818763326226013,
"grad_norm": 2.419487714767456,
"kl": 5.057421875,
"learning_rate": 7.274961913568773e-07,
"loss": 0.2439,
"reward": 0.9620536170899868,
"reward_std": 0.692897405475378,
"rewards/accuracy_reward": 0.45825895071029665,
"rewards/format_reward": 0.503794664889574,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 586.2620796203613,
"epoch": 2.861407249466951,
"grad_norm": 4.212592124938965,
"kl": 3.05234375,
"learning_rate": 6.800710194892484e-07,
"loss": 0.1322,
"reward": 1.040625052154064,
"reward_std": 0.6996075876057148,
"rewards/accuracy_reward": 0.4897321633994579,
"rewards/format_reward": 0.550892885029316,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 587.5268157958984,
"epoch": 2.9040511727078893,
"grad_norm": 7.860717296600342,
"kl": 4.29765625,
"learning_rate": 6.33787151823836e-07,
"loss": 0.1941,
"reward": 1.003794687986374,
"reward_std": 0.7180271342396736,
"rewards/accuracy_reward": 0.4808035932481289,
"rewards/format_reward": 0.522991093993187,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 592.5397575378418,
"epoch": 2.946695095948827,
"grad_norm": 2.772329092025757,
"kl": 3.987109375,
"learning_rate": 5.887090134192947e-07,
"loss": 0.2082,
"reward": 1.0125000461935998,
"reward_std": 0.6709666073322296,
"rewards/accuracy_reward": 0.48437502384185793,
"rewards/format_reward": 0.5281250223517417,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 584.3073959350586,
"epoch": 2.9893390191897655,
"grad_norm": 5.136852264404297,
"kl": 3.92734375,
"learning_rate": 5.448993510134669e-07,
"loss": 0.1783,
"reward": 0.9609375447034836,
"reward_std": 0.6889998987317085,
"rewards/accuracy_reward": 0.4497768072411418,
"rewards/format_reward": 0.5111607395112514,
"step": 350
},
{
"epoch": 2.997867803837953,
"eval_clip_ratio": 0.0,
"eval_completion_length": 572.8553040519594,
"eval_kl": 5.3444940476190474,
"eval_loss": 0.2360815405845642,
"eval_reward": 0.9090136440973433,
"eval_reward_std": 0.6795897848076291,
"eval_rewards/accuracy_reward": 0.42772110799948376,
"eval_rewards/format_reward": 0.48129253917270237,
"eval_runtime": 816.0021,
"eval_samples_per_second": 0.613,
"eval_steps_per_second": 0.006,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 568.4361267089844,
"epoch": 3.0341151385927505,
"grad_norm": 9.054045677185059,
"kl": 4.098046875,
"learning_rate": 5.024191456827498e-07,
"loss": 0.1624,
"reward": 0.9511161103844643,
"reward_std": 0.6624684408307076,
"rewards/accuracy_reward": 0.46138395071029664,
"rewards/format_reward": 0.4897321678698063,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 568.6207885742188,
"epoch": 3.076759061833689,
"grad_norm": 9.051790237426758,
"kl": 3.809765625,
"learning_rate": 4.6132752795918667e-07,
"loss": 0.1558,
"reward": 1.0131696924567222,
"reward_std": 0.6962591715157032,
"rewards/accuracy_reward": 0.48772323653101923,
"rewards/format_reward": 0.5254464507102966,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 557.1560546875,
"epoch": 3.1194029850746268,
"grad_norm": 3.945366144180298,
"kl": 4.57890625,
"learning_rate": 4.2168169552342905e-07,
"loss": 0.1879,
"reward": 0.968080396950245,
"reward_std": 0.6995945557951927,
"rewards/accuracy_reward": 0.4647321619093418,
"rewards/format_reward": 0.5033482357859611,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 568.9433326721191,
"epoch": 3.162046908315565,
"grad_norm": 8.444324493408203,
"kl": 3.914453125,
"learning_rate": 3.8353683358814046e-07,
"loss": 0.158,
"reward": 0.9988839834928512,
"reward_std": 0.701323488354683,
"rewards/accuracy_reward": 0.47254466637969017,
"rewards/format_reward": 0.5263393051922322,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 570.7756988525391,
"epoch": 3.204690831556503,
"grad_norm": 1.7602710723876953,
"kl": 4.1578125,
"learning_rate": 3.469460380826697e-07,
"loss": 0.1662,
"reward": 0.9948661148548126,
"reward_std": 0.6981454014778137,
"rewards/accuracy_reward": 0.48147323727607727,
"rewards/format_reward": 0.5133928775787353,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 563.1116333007812,
"epoch": 3.2473347547974414,
"grad_norm": 4.784106731414795,
"kl": 3.6669921875,
"learning_rate": 3.119602417459075e-07,
"loss": 0.1403,
"reward": 1.0319196850061416,
"reward_std": 0.699262548983097,
"rewards/accuracy_reward": 0.4930803768336773,
"rewards/format_reward": 0.5388393096625805,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 563.9397567749023,
"epoch": 3.2899786780383797,
"grad_norm": 4.571471691131592,
"kl": 4.50703125,
"learning_rate": 2.786281432302071e-07,
"loss": 0.1753,
"reward": 1.053348256647587,
"reward_std": 0.7323236554861069,
"rewards/accuracy_reward": 0.5071428805589676,
"rewards/format_reward": 0.5462053820490838,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 546.5582817077636,
"epoch": 3.3326226012793176,
"grad_norm": 8.077947616577148,
"kl": 3.4154296875,
"learning_rate": 2.46996139315057e-07,
"loss": 0.115,
"reward": 0.9888393305242061,
"reward_std": 0.6875140987336635,
"rewards/accuracy_reward": 0.470982164517045,
"rewards/format_reward": 0.5178571619093418,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 555.0024826049805,
"epoch": 3.375266524520256,
"grad_norm": 6.658145427703857,
"kl": 3.830078125,
"learning_rate": 2.1710826032485286e-07,
"loss": 0.1539,
"reward": 1.0189732655882835,
"reward_std": 0.7024340078234672,
"rewards/accuracy_reward": 0.49129465967416763,
"rewards/format_reward": 0.5276785992085934,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 540.7460098266602,
"epoch": 3.417910447761194,
"grad_norm": 5.9571213722229,
"kl": 4.2994140625,
"learning_rate": 1.8900610884066817e-07,
"loss": 0.1664,
"reward": 1.0589286148548127,
"reward_std": 0.704091303050518,
"rewards/accuracy_reward": 0.5145089514553547,
"rewards/format_reward": 0.5444196730852127,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 552.391983795166,
"epoch": 3.4605543710021323,
"grad_norm": 1.934449553489685,
"kl": 3.8435546875,
"learning_rate": 1.627288017913383e-07,
"loss": 0.1552,
"reward": 1.032812552154064,
"reward_std": 0.7122370585799217,
"rewards/accuracy_reward": 0.49486609250307084,
"rewards/format_reward": 0.5379464536905288,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 556.8665435791015,
"epoch": 3.50319829424307,
"grad_norm": 2.779977798461914,
"kl": 3.684375,
"learning_rate": 1.3831291600445573e-07,
"loss": 0.1404,
"reward": 1.0571429044008256,
"reward_std": 0.706598898023367,
"rewards/accuracy_reward": 0.5091518111526966,
"rewards/format_reward": 0.5479910977184772,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 541.1424331665039,
"epoch": 3.5458422174840085,
"grad_norm": 2.621488094329834,
"kl": 4.400390625,
"learning_rate": 1.1579243729307487e-07,
"loss": 0.1684,
"reward": 0.9808036178350449,
"reward_std": 0.676436859369278,
"rewards/accuracy_reward": 0.466294664144516,
"rewards/format_reward": 0.514508954435587,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 559.8049331665039,
"epoch": 3.588486140724947,
"grad_norm": 1.7964094877243042,
"kl": 3.96171875,
"learning_rate": 9.519871314899092e-08,
"loss": 0.1398,
"reward": 0.9908482581377029,
"reward_std": 0.6953834608197212,
"rewards/accuracy_reward": 0.47723216786980627,
"rewards/format_reward": 0.5136160954833031,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 554.9152008056641,
"epoch": 3.631130063965885,
"grad_norm": 3.415715456008911,
"kl": 3.9677734375,
"learning_rate": 7.656040910844358e-08,
"loss": 0.1558,
"reward": 1.0417411237955094,
"reward_std": 0.7144467569887638,
"rewards/accuracy_reward": 0.5002232410013676,
"rewards/format_reward": 0.541517885774374,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 550.6768142700196,
"epoch": 3.673773987206823,
"grad_norm": 2.8959734439849854,
"kl": 3.8453125,
"learning_rate": 5.990346885098235e-08,
"loss": 0.1582,
"reward": 1.0953125476837158,
"reward_std": 0.7153457693755627,
"rewards/accuracy_reward": 0.529241094365716,
"rewards/format_reward": 0.5660714536905289,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 537.2187751770019,
"epoch": 3.716417910447761,
"grad_norm": 1.9467185735702515,
"kl": 4.092578125,
"learning_rate": 4.5251078087033493e-08,
"loss": 0.1599,
"reward": 1.00379468947649,
"reward_std": 0.6785944283008576,
"rewards/accuracy_reward": 0.4821428760886192,
"rewards/format_reward": 0.5216518089175224,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 535.1417694091797,
"epoch": 3.7590618336886994,
"grad_norm": 2.523850917816162,
"kl": 4.1220703125,
"learning_rate": 3.262363228443427e-08,
"loss": 0.1558,
"reward": 0.9935268282890319,
"reward_std": 0.6972592443227768,
"rewards/accuracy_reward": 0.4779018059372902,
"rewards/format_reward": 0.5156250230967998,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 532.5212303161621,
"epoch": 3.8017057569296373,
"grad_norm": 1.803943157196045,
"kl": 4.0076171875,
"learning_rate": 2.2038708278862952e-08,
"loss": 0.165,
"reward": 0.9732143253087997,
"reward_std": 0.7266230128705502,
"rewards/accuracy_reward": 0.47075894847512245,
"rewards/format_reward": 0.5024553813040257,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 527.6984603881835,
"epoch": 3.8443496801705757,
"grad_norm": 5.807371616363525,
"kl": 3.9375,
"learning_rate": 1.3511039807673209e-08,
"loss": 0.1457,
"reward": 1.0058036133646966,
"reward_std": 0.7280482016503811,
"rewards/accuracy_reward": 0.4801339514553547,
"rewards/format_reward": 0.5256696626543998,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 547.6140846252441,
"epoch": 3.886993603411514,
"grad_norm": 2.526625394821167,
"kl": 3.7375,
"learning_rate": 7.0524970011963675e-09,
"loss": 0.1027,
"reward": 0.996428620815277,
"reward_std": 0.6856365635991096,
"rewards/accuracy_reward": 0.4709821633994579,
"rewards/format_reward": 0.5254464507102966,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 535.9491348266602,
"epoch": 3.929637526652452,
"grad_norm": 1.175413727760315,
"kl": 3.96484375,
"learning_rate": 2.6720698600553595e-09,
"loss": 0.1365,
"reward": 1.0109375432133674,
"reward_std": 0.7361011810600757,
"rewards/accuracy_reward": 0.4879464481025934,
"rewards/format_reward": 0.522991093993187,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 549.8350677490234,
"epoch": 3.9722814498933903,
"grad_norm": 3.024672269821167,
"kl": 3.743359375,
"learning_rate": 3.7585574148779613e-10,
"loss": 0.1276,
"reward": 1.035491119325161,
"reward_std": 0.7265422374010087,
"rewards/accuracy_reward": 0.4973214499652386,
"rewards/format_reward": 0.5381696693599224,
"step": 465
},
{
"epoch": 3.997867803837953,
"eval_clip_ratio": 0.0,
"eval_completion_length": 531.8020145476811,
"eval_kl": 4.111359126984127,
"eval_loss": 0.11702829599380493,
"eval_reward": 0.9311224893918113,
"eval_reward_std": 0.7056213607863774,
"eval_rewards/accuracy_reward": 0.4399093160080531,
"eval_rewards/format_reward": 0.4912131717280736,
"eval_runtime": 797.4216,
"eval_samples_per_second": 0.627,
"eval_steps_per_second": 0.006,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 537.9717648824056,
"epoch": 3.997867803837953,
"kl": 3.7174479166666665,
"reward": 1.0145089824994404,
"reward_std": 0.6882362899680933,
"rewards/accuracy_reward": 0.4813988270858924,
"rewards/format_reward": 0.5331101417541504,
"step": 468,
"total_flos": 0.0,
"train_loss": 0.1188597827950795,
"train_runtime": 66948.3883,
"train_samples_per_second": 0.448,
"train_steps_per_second": 0.007
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}