| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.997867803837953, |
| "eval_steps": 117, |
| "global_step": 468, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.2232513427734, |
| "epoch": 0.008528784648187633, |
| "grad_norm": 0.29757431149482727, |
| "kl": 0.0, |
| "learning_rate": 6.382978723404255e-08, |
| "loss": 0.0472, |
| "reward": 0.6718750298023224, |
| "reward_std": 0.3231801837682724, |
| "rewards/accuracy_reward": 0.6640625298023224, |
| "rewards/format_reward": 0.007812500349245965, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.3912038803101, |
| "epoch": 0.042643923240938165, |
| "grad_norm": 0.35624048113822937, |
| "kl": 0.0001455843448638916, |
| "learning_rate": 3.1914893617021275e-07, |
| "loss": 0.0595, |
| "reward": 0.6908482443541288, |
| "reward_std": 0.35730565479025245, |
| "rewards/accuracy_reward": 0.6819196743890643, |
| "rewards/format_reward": 0.008928571827709675, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.0033729553222, |
| "epoch": 0.08528784648187633, |
| "grad_norm": 0.2918407618999481, |
| "kl": 0.00026810169219970703, |
| "learning_rate": 6.382978723404255e-07, |
| "loss": 0.0569, |
| "reward": 0.6535714589059353, |
| "reward_std": 0.3369171965867281, |
| "rewards/accuracy_reward": 0.6473214581608773, |
| "rewards/format_reward": 0.006250000232830644, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.1335098266602, |
| "epoch": 0.1279317697228145, |
| "grad_norm": 0.4812746047973633, |
| "kl": 0.24185171127319335, |
| "learning_rate": 9.574468085106384e-07, |
| "loss": 0.0832, |
| "reward": 0.6883928894996643, |
| "reward_std": 0.3438419926911592, |
| "rewards/accuracy_reward": 0.6819196753203869, |
| "rewards/format_reward": 0.006473214668221772, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.767658996582, |
| "epoch": 0.17057569296375266, |
| "grad_norm": 0.31131526827812195, |
| "kl": 0.0008780479431152344, |
| "learning_rate": 1.276595744680851e-06, |
| "loss": 0.0705, |
| "reward": 0.6939732521772385, |
| "reward_std": 0.3367977850139141, |
| "rewards/accuracy_reward": 0.6879464671015739, |
| "rewards/format_reward": 0.006026786030270159, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.0732414245606, |
| "epoch": 0.21321961620469082, |
| "grad_norm": 0.2977014183998108, |
| "kl": 0.0019563674926757813, |
| "learning_rate": 1.5957446808510639e-06, |
| "loss": 0.0833, |
| "reward": 0.7379464626312255, |
| "reward_std": 0.3033597592264414, |
| "rewards/accuracy_reward": 0.7339286029338836, |
| "rewards/format_reward": 0.004017857322469354, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.1149803161621, |
| "epoch": 0.255863539445629, |
| "grad_norm": 0.7156150937080383, |
| "kl": 0.015515518188476563, |
| "learning_rate": 1.9148936170212767e-06, |
| "loss": 0.0826, |
| "reward": 0.7363839671015739, |
| "reward_std": 0.28360147699713706, |
| "rewards/accuracy_reward": 0.7348214641213417, |
| "rewards/format_reward": 0.0015625000698491931, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.7576164245605, |
| "epoch": 0.29850746268656714, |
| "grad_norm": 0.339713990688324, |
| "kl": 0.008817577362060547, |
| "learning_rate": 2.2340425531914894e-06, |
| "loss": 0.0708, |
| "reward": 0.7671875342726707, |
| "reward_std": 0.24262561108916997, |
| "rewards/accuracy_reward": 0.766294677555561, |
| "rewards/format_reward": 0.0008928571827709675, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.7185531616211, |
| "epoch": 0.3411513859275053, |
| "grad_norm": 1.7096885442733765, |
| "kl": 0.027852249145507813, |
| "learning_rate": 2.553191489361702e-06, |
| "loss": 0.0584, |
| "reward": 0.7622768223285675, |
| "reward_std": 0.2263224059715867, |
| "rewards/accuracy_reward": 0.7602678939700127, |
| "rewards/format_reward": 0.002008928661234677, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.3366325378418, |
| "epoch": 0.3837953091684435, |
| "grad_norm": 0.3490237295627594, |
| "kl": 0.004991340637207031, |
| "learning_rate": 2.872340425531915e-06, |
| "loss": 0.0615, |
| "reward": 0.7779018193483352, |
| "reward_std": 0.23310719933360816, |
| "rewards/accuracy_reward": 0.7723214626312256, |
| "rewards/format_reward": 0.005580357392318547, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.2732421875, |
| "epoch": 0.42643923240938164, |
| "grad_norm": 0.8813576698303223, |
| "kl": 0.008109092712402344, |
| "learning_rate": 2.9996241442585123e-06, |
| "loss": 0.0564, |
| "reward": 0.7683036044239998, |
| "reward_std": 0.2527444614097476, |
| "rewards/accuracy_reward": 0.7486607477068901, |
| "rewards/format_reward": 0.019642858114093543, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.4134185791015, |
| "epoch": 0.4690831556503198, |
| "grad_norm": 1.0579967498779297, |
| "kl": 0.013035964965820313, |
| "learning_rate": 2.9973279301399446e-06, |
| "loss": 0.0374, |
| "reward": 0.8272321790456771, |
| "reward_std": 0.3260661941021681, |
| "rewards/accuracy_reward": 0.7531250283122063, |
| "rewards/format_reward": 0.07410714614670724, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.4556053161621, |
| "epoch": 0.511727078891258, |
| "grad_norm": 0.3846156895160675, |
| "kl": 0.018450927734375, |
| "learning_rate": 2.992947502998804e-06, |
| "loss": 0.0393, |
| "reward": 0.8857143223285675, |
| "reward_std": 0.37481620348989964, |
| "rewards/accuracy_reward": 0.7491071775555611, |
| "rewards/format_reward": 0.1366071492433548, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.9245796203613, |
| "epoch": 0.5543710021321961, |
| "grad_norm": 0.6243420243263245, |
| "kl": 0.02995452880859375, |
| "learning_rate": 2.9864889601923268e-06, |
| "loss": 0.0221, |
| "reward": 0.9781250506639481, |
| "reward_std": 0.44858795329928397, |
| "rewards/accuracy_reward": 0.7267857432365418, |
| "rewards/format_reward": 0.2513392990455031, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.0239158630371, |
| "epoch": 0.5970149253731343, |
| "grad_norm": 26.86927032470703, |
| "kl": 0.04704437255859375, |
| "learning_rate": 2.977961291721137e-06, |
| "loss": 0.0185, |
| "reward": 1.0526786223053932, |
| "reward_std": 0.482559996843338, |
| "rewards/accuracy_reward": 0.710044676065445, |
| "rewards/format_reward": 0.34263394549489024, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.8480171203613, |
| "epoch": 0.6396588486140725, |
| "grad_norm": 0.9474160075187683, |
| "kl": 0.060797119140625, |
| "learning_rate": 2.9673763677155655e-06, |
| "loss": 0.0388, |
| "reward": 1.138169701397419, |
| "reward_std": 0.5718358919024468, |
| "rewards/accuracy_reward": 0.6812500283122063, |
| "rewards/format_reward": 0.4569196656346321, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.6254699707031, |
| "epoch": 0.6823027718550106, |
| "grad_norm": 3.176877975463867, |
| "kl": 0.4295654296875, |
| "learning_rate": 2.9547489219129666e-06, |
| "loss": 0.0459, |
| "reward": 0.9727679073810578, |
| "reward_std": 0.7046953186392784, |
| "rewards/accuracy_reward": 0.4734375223517418, |
| "rewards/format_reward": 0.4993303783237934, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.0022583007812, |
| "epoch": 0.7249466950959488, |
| "grad_norm": 16.31062889099121, |
| "kl": 0.807666015625, |
| "learning_rate": 2.9400965311490175e-06, |
| "loss": 0.067, |
| "reward": 1.0857143267989158, |
| "reward_std": 0.7520077109336853, |
| "rewards/accuracy_reward": 0.5133928790688514, |
| "rewards/format_reward": 0.5723214574158192, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.6631980895996, |
| "epoch": 0.767590618336887, |
| "grad_norm": 4.319429397583008, |
| "kl": 0.7376708984375, |
| "learning_rate": 2.9234395908915565e-06, |
| "loss": 0.1219, |
| "reward": 1.3002232700586318, |
| "reward_std": 0.6729222685098648, |
| "rewards/accuracy_reward": 0.6341518126428127, |
| "rewards/format_reward": 0.6660714603960514, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.9712371826172, |
| "epoch": 0.8102345415778252, |
| "grad_norm": 24.02198028564453, |
| "kl": 6.8146484375, |
| "learning_rate": 2.904801286851009e-06, |
| "loss": 0.5369, |
| "reward": 1.4430804237723351, |
| "reward_std": 0.5716752491891384, |
| "rewards/accuracy_reward": 0.6941964641213417, |
| "rewards/format_reward": 0.7488839581608773, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.6529304504395, |
| "epoch": 0.8528784648187633, |
| "grad_norm": 4.384208679199219, |
| "kl": 0.373583984375, |
| "learning_rate": 2.884207562706925e-06, |
| "loss": 0.0912, |
| "reward": 1.5466518580913544, |
| "reward_std": 0.4958520784974098, |
| "rewards/accuracy_reward": 0.7368303924798966, |
| "rewards/format_reward": 0.8098214656114578, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.5218971252441, |
| "epoch": 0.8955223880597015, |
| "grad_norm": 2.8563225269317627, |
| "kl": 1.043408203125, |
| "learning_rate": 2.8616870839955444e-06, |
| "loss": 0.0561, |
| "reward": 1.5991072207689285, |
| "reward_std": 0.4629118986427784, |
| "rewards/accuracy_reward": 0.7482143253087997, |
| "rewards/format_reward": 0.850892896950245, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.3071670532227, |
| "epoch": 0.9381663113006397, |
| "grad_norm": 2.282275676727295, |
| "kl": 1.5532958984375, |
| "learning_rate": 2.837271198208662e-06, |
| "loss": 0.0403, |
| "reward": 1.5997768580913543, |
| "reward_std": 0.48762847371399404, |
| "rewards/accuracy_reward": 0.7437500342726707, |
| "rewards/format_reward": 0.8560268253087997, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.0283782958984, |
| "epoch": 0.9808102345415778, |
| "grad_norm": 3.15806245803833, |
| "kl": 2.1205078125, |
| "learning_rate": 2.8109938911593322e-06, |
| "loss": 0.0641, |
| "reward": 1.5205357849597931, |
| "reward_std": 0.5536904223263264, |
| "rewards/accuracy_reward": 0.7011161044239997, |
| "rewards/format_reward": 0.8194196805357933, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.997867803837953, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 669.1385478670635, |
| "eval_kl": 4.540426587301587, |
| "eval_loss": 0.20850437879562378, |
| "eval_reward": 1.462868539113847, |
| "eval_reward_std": 0.5733831548501575, |
| "eval_rewards/accuracy_reward": 0.661422934797075, |
| "eval_rewards/format_reward": 0.8014456156700377, |
| "eval_runtime": 903.45, |
| "eval_samples_per_second": 0.553, |
| "eval_steps_per_second": 0.006, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.7388717651368, |
| "epoch": 1.0255863539445629, |
| "grad_norm": 5.824474811553955, |
| "kl": 1.586767578125, |
| "learning_rate": 2.7828917396751474e-06, |
| "loss": 0.065, |
| "reward": 1.5044643521308898, |
| "reward_std": 0.5775617159903049, |
| "rewards/accuracy_reward": 0.703794676065445, |
| "rewards/format_reward": 0.8006696850061417, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 726.9741363525391, |
| "epoch": 1.068230277185501, |
| "grad_norm": 5.099586009979248, |
| "kl": 6.495361328125, |
| "learning_rate": 2.753003860684943e-06, |
| "loss": 0.1463, |
| "reward": 1.5174107819795608, |
| "reward_std": 0.5776193253695965, |
| "rewards/accuracy_reward": 0.6948661051690579, |
| "rewards/format_reward": 0.8225446805357933, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.8879806518555, |
| "epoch": 1.1108742004264391, |
| "grad_norm": 4.33757209777832, |
| "kl": 2.81171875, |
| "learning_rate": 2.721371856769793e-06, |
| "loss": 0.1539, |
| "reward": 1.4988839864730834, |
| "reward_std": 0.5920813702046871, |
| "rewards/accuracy_reward": 0.6816964618861675, |
| "rewards/format_reward": 0.8171875372529029, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 712.3254806518555, |
| "epoch": 1.1535181236673775, |
| "grad_norm": 3.2068488597869873, |
| "kl": 1.8530517578125, |
| "learning_rate": 2.688039758254093e-06, |
| "loss": 0.0988, |
| "reward": 1.574107214808464, |
| "reward_std": 0.5515650272369385, |
| "rewards/accuracy_reward": 0.7183035999536515, |
| "rewards/format_reward": 0.8558036118745804, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 703.3225753784179, |
| "epoch": 1.1961620469083156, |
| "grad_norm": 1.6553071737289429, |
| "kl": 0.72064208984375, |
| "learning_rate": 2.65305396191733e-06, |
| "loss": 0.0742, |
| "reward": 1.5618304312229156, |
| "reward_std": 0.5021443270146847, |
| "rewards/accuracy_reward": 0.6995536029338837, |
| "rewards/format_reward": 0.8622768253087998, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.3361877441406, |
| "epoch": 1.2388059701492538, |
| "grad_norm": 0.40964633226394653, |
| "kl": 0.66866455078125, |
| "learning_rate": 2.61646316641186e-06, |
| "loss": 0.0625, |
| "reward": 1.5828125685453416, |
| "reward_std": 0.47754584178328513, |
| "rewards/accuracy_reward": 0.7189732477068901, |
| "rewards/format_reward": 0.8638393208384514, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.287085723877, |
| "epoch": 1.2814498933901919, |
| "grad_norm": 1.1264029741287231, |
| "kl": 0.331591796875, |
| "learning_rate": 2.5783183044765715e-06, |
| "loss": 0.0573, |
| "reward": 1.6180804371833801, |
| "reward_std": 0.44253902062773703, |
| "rewards/accuracy_reward": 0.7350446723401547, |
| "rewards/format_reward": 0.8830357536673545, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.8187789916992, |
| "epoch": 1.32409381663113, |
| "grad_norm": 0.36573150753974915, |
| "kl": 0.68359375, |
| "learning_rate": 2.5386724720408135e-06, |
| "loss": 0.0651, |
| "reward": 1.6392857909202576, |
| "reward_std": 0.39724560379981994, |
| "rewards/accuracy_reward": 0.7477678887546062, |
| "rewards/format_reward": 0.8915178969502449, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.0553833007813, |
| "epoch": 1.3667377398720681, |
| "grad_norm": 0.3527699410915375, |
| "kl": 0.289678955078125, |
| "learning_rate": 2.49758085431725e-06, |
| "loss": 0.0217, |
| "reward": 1.7120536595582962, |
| "reward_std": 0.3335069250315428, |
| "rewards/accuracy_reward": 0.7790178924798965, |
| "rewards/format_reward": 0.9330357566475869, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.3732429504395, |
| "epoch": 1.4093816631130065, |
| "grad_norm": 2.3950164318084717, |
| "kl": 0.7220977783203125, |
| "learning_rate": 2.455100648986533e-06, |
| "loss": 0.04, |
| "reward": 1.703571507334709, |
| "reward_std": 0.31777132861316204, |
| "rewards/accuracy_reward": 0.7852678909897804, |
| "rewards/format_reward": 0.9183036163449287, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.6995819091796, |
| "epoch": 1.4520255863539446, |
| "grad_norm": 0.4751293361186981, |
| "kl": 0.80157470703125, |
| "learning_rate": 2.4112909865807053e-06, |
| "loss": 0.0806, |
| "reward": 1.6685268580913544, |
| "reward_std": 0.35685542970895767, |
| "rewards/accuracy_reward": 0.7654018208384514, |
| "rewards/format_reward": 0.9031250387430191, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.9319503784179, |
| "epoch": 1.4946695095948828, |
| "grad_norm": 3.0650556087493896, |
| "kl": 0.2190185546875, |
| "learning_rate": 2.366212848176164e-06, |
| "loss": 0.0385, |
| "reward": 1.6214286386966705, |
| "reward_std": 0.3651090878993273, |
| "rewards/accuracy_reward": 0.7325893193483353, |
| "rewards/format_reward": 0.8888393297791481, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.3734603881836, |
| "epoch": 1.537313432835821, |
| "grad_norm": 1.002023458480835, |
| "kl": 0.2518310546875, |
| "learning_rate": 2.319928980510752e-06, |
| "loss": 0.0424, |
| "reward": 1.6872768700122833, |
| "reward_std": 0.36593810133635996, |
| "rewards/accuracy_reward": 0.7772321775555611, |
| "rewards/format_reward": 0.9100446820259094, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.8556060791016, |
| "epoch": 1.579957356076759, |
| "grad_norm": 1.074832558631897, |
| "kl": 0.247601318359375, |
| "learning_rate": 2.272503808643123e-06, |
| "loss": 0.0509, |
| "reward": 1.68526793718338, |
| "reward_std": 0.3566482378169894, |
| "rewards/accuracy_reward": 0.7754464611411095, |
| "rewards/format_reward": 0.9098214745521546, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.2254753112793, |
| "epoch": 1.6226012793176974, |
| "grad_norm": 0.493257999420166, |
| "kl": 0.286126708984375, |
| "learning_rate": 2.2240033462759628e-06, |
| "loss": 0.0513, |
| "reward": 1.6654018700122832, |
| "reward_std": 0.39258838426321746, |
| "rewards/accuracy_reward": 0.7604911088943481, |
| "rewards/format_reward": 0.9049107551574707, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.5230148315429, |
| "epoch": 1.6652452025586353, |
| "grad_norm": 1.0757510662078857, |
| "kl": 0.28671875, |
| "learning_rate": 2.1744951038678905e-06, |
| "loss": 0.0627, |
| "reward": 1.6694197177886962, |
| "reward_std": 0.36208211332559587, |
| "rewards/accuracy_reward": 0.7629464589059353, |
| "rewards/format_reward": 0.9064732536673545, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.9241394042969, |
| "epoch": 1.7078891257995736, |
| "grad_norm": 2.5317819118499756, |
| "kl": 0.55604248046875, |
| "learning_rate": 2.124047994661941e-06, |
| "loss": 0.0672, |
| "reward": 1.636160781979561, |
| "reward_std": 0.3948578182607889, |
| "rewards/accuracy_reward": 0.7459821730852128, |
| "rewards/format_reward": 0.8901786103844642, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.1971221923828, |
| "epoch": 1.7505330490405118, |
| "grad_norm": 1.68379545211792, |
| "kl": 0.6220703125, |
| "learning_rate": 2.072732238761434e-06, |
| "loss": 0.0781, |
| "reward": 1.5113840013742448, |
| "reward_std": 0.4758596081286669, |
| "rewards/accuracy_reward": 0.6984375290572643, |
| "rewards/format_reward": 0.8129464641213417, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.4424430847168, |
| "epoch": 1.79317697228145, |
| "grad_norm": 3.5116868019104004, |
| "kl": 0.685400390625, |
| "learning_rate": 2.0206192653867536e-06, |
| "loss": 0.0961, |
| "reward": 1.4203125566244126, |
| "reward_std": 0.5285798832774162, |
| "rewards/accuracy_reward": 0.6421875290572643, |
| "rewards/format_reward": 0.7781250327825546, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.3951126098633, |
| "epoch": 1.835820895522388, |
| "grad_norm": 1.31536865234375, |
| "kl": 0.9739013671875, |
| "learning_rate": 1.967781613449095e-06, |
| "loss": 0.128, |
| "reward": 1.3087054163217544, |
| "reward_std": 0.6106407694518566, |
| "rewards/accuracy_reward": 0.6040178872644901, |
| "rewards/format_reward": 0.7046875283122063, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 604.7236915588379, |
| "epoch": 1.8784648187633262, |
| "grad_norm": 2.7701919078826904, |
| "kl": 1.267333984375, |
| "learning_rate": 1.9142928305795637e-06, |
| "loss": 0.1807, |
| "reward": 1.0964286282658577, |
| "reward_std": 0.6850748583674431, |
| "rewards/accuracy_reward": 0.5145089499652385, |
| "rewards/format_reward": 0.5819196663796902, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.4515830993653, |
| "epoch": 1.9211087420042645, |
| "grad_norm": 1.383189082145691, |
| "kl": 0.865283203125, |
| "learning_rate": 1.8602273707541886e-06, |
| "loss": 0.1254, |
| "reward": 1.1287946969270706, |
| "reward_std": 0.6823262549936772, |
| "rewards/accuracy_reward": 0.5196428805589676, |
| "rewards/format_reward": 0.6091518118977547, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.1281509399414, |
| "epoch": 1.9637526652452024, |
| "grad_norm": 1.875443935394287, |
| "kl": 1.5638671875, |
| "learning_rate": 1.8056604906573418e-06, |
| "loss": 0.1544, |
| "reward": 1.001562552154064, |
| "reward_std": 0.6830957509577275, |
| "rewards/accuracy_reward": 0.4718750223517418, |
| "rewards/format_reward": 0.5296875208616256, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.997867803837953, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 566.5657886323474, |
| "eval_kl": 1.0294828869047619, |
| "eval_loss": 0.09497759491205215, |
| "eval_reward": 1.096938827681163, |
| "eval_reward_std": 0.70041947516184, |
| "eval_rewards/accuracy_reward": 0.5144558056952462, |
| "eval_rewards/format_reward": 0.5824830167823367, |
| "eval_runtime": 810.3084, |
| "eval_samples_per_second": 0.617, |
| "eval_steps_per_second": 0.006, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 581.7695983886719, |
| "epoch": 2.008528784648188, |
| "grad_norm": 2.1314055919647217, |
| "kl": 0.802685546875, |
| "learning_rate": 1.7506681449278226e-06, |
| "loss": 0.1221, |
| "reward": 1.1642857670783997, |
| "reward_std": 0.700273784250021, |
| "rewards/accuracy_reward": 0.5546875216066838, |
| "rewards/format_reward": 0.6095982432365418, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.5163177490234, |
| "epoch": 2.0511727078891258, |
| "grad_norm": 4.411650657653809, |
| "kl": 1.93984375, |
| "learning_rate": 1.6953268804334257e-06, |
| "loss": 0.1485, |
| "reward": 0.9665178924798965, |
| "reward_std": 0.7336546629667282, |
| "rewards/accuracy_reward": 0.4665178798139095, |
| "rewards/format_reward": 0.5000000223517418, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.587077331543, |
| "epoch": 2.093816631130064, |
| "grad_norm": 1.1795450448989868, |
| "kl": 1.27900390625, |
| "learning_rate": 1.6397137297211436e-06, |
| "loss": 0.1474, |
| "reward": 0.9912946864962577, |
| "reward_std": 0.7124211765825749, |
| "rewards/accuracy_reward": 0.48437502309679986, |
| "rewards/format_reward": 0.5069196656346321, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.4261459350586, |
| "epoch": 2.136460554371002, |
| "grad_norm": 1.2404223680496216, |
| "kl": 1.08720703125, |
| "learning_rate": 1.5839061037913395e-06, |
| "loss": 0.0908, |
| "reward": 0.9372768178582191, |
| "reward_std": 0.6845876269042492, |
| "rewards/accuracy_reward": 0.4435268074274063, |
| "rewards/format_reward": 0.4937500201165676, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 538.9498016357422, |
| "epoch": 2.1791044776119404, |
| "grad_norm": 5.910031318664551, |
| "kl": 1.4826171875, |
| "learning_rate": 1.527981684345115e-06, |
| "loss": 0.0402, |
| "reward": 1.0573661237955094, |
| "reward_std": 0.7098784282803535, |
| "rewards/accuracy_reward": 0.5082589484751224, |
| "rewards/format_reward": 0.5491071656346321, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.1082817077637, |
| "epoch": 2.2217484008528783, |
| "grad_norm": 1.6820718050003052, |
| "kl": 1.875634765625, |
| "learning_rate": 1.4720183156548855e-06, |
| "loss": -0.0382, |
| "reward": 1.1102679178118706, |
| "reward_std": 0.6880769707262516, |
| "rewards/accuracy_reward": 0.528125026077032, |
| "rewards/format_reward": 0.5821428827941417, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.0748046875, |
| "epoch": 2.2643923240938166, |
| "grad_norm": 2.385998249053955, |
| "kl": 2.6990234375, |
| "learning_rate": 1.4160938962086612e-06, |
| "loss": 0.0057, |
| "reward": 1.12098218947649, |
| "reward_std": 0.6896743580698967, |
| "rewards/accuracy_reward": 0.5265625223517418, |
| "rewards/format_reward": 0.5944196678698063, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 556.8611907958984, |
| "epoch": 2.307036247334755, |
| "grad_norm": 2.2787282466888428, |
| "kl": 3.4162109375, |
| "learning_rate": 1.3602862702788567e-06, |
| "loss": 0.0818, |
| "reward": 1.064062552154064, |
| "reward_std": 0.6750895470380783, |
| "rewards/accuracy_reward": 0.4955357354134321, |
| "rewards/format_reward": 0.5685268133878708, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.2448959350586, |
| "epoch": 2.349680170575693, |
| "grad_norm": 2.604217290878296, |
| "kl": 3.58203125, |
| "learning_rate": 1.3046731195665748e-06, |
| "loss": 0.1074, |
| "reward": 1.0066964760422707, |
| "reward_std": 0.7123509004712105, |
| "rewards/accuracy_reward": 0.47566966339945793, |
| "rewards/format_reward": 0.5310268089175224, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.1310523986816, |
| "epoch": 2.3923240938166312, |
| "grad_norm": 2.9245572090148926, |
| "kl": 3.5134765625, |
| "learning_rate": 1.2493318550721775e-06, |
| "loss": 0.1235, |
| "reward": 1.004464328289032, |
| "reward_std": 0.7083872497081757, |
| "rewards/accuracy_reward": 0.4734375201165676, |
| "rewards/format_reward": 0.5310268081724644, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.7712295532226, |
| "epoch": 2.434968017057569, |
| "grad_norm": 7.597701072692871, |
| "kl": 3.9734375, |
| "learning_rate": 1.1943395093426585e-06, |
| "loss": 0.2007, |
| "reward": 1.0401786118745804, |
| "reward_std": 0.7139236360788346, |
| "rewards/accuracy_reward": 0.4946428775787354, |
| "rewards/format_reward": 0.5455357372760773, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.5185546875, |
| "epoch": 2.4776119402985075, |
| "grad_norm": 16.614126205444336, |
| "kl": 4.1701171875, |
| "learning_rate": 1.1397726292458115e-06, |
| "loss": 0.2304, |
| "reward": 1.018303619325161, |
| "reward_std": 0.715107049047947, |
| "rewards/accuracy_reward": 0.4877232380211353, |
| "rewards/format_reward": 0.5305803798139095, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.7118576049804, |
| "epoch": 2.520255863539446, |
| "grad_norm": 10.53600788116455, |
| "kl": 2.83359375, |
| "learning_rate": 1.085707169420437e-06, |
| "loss": 0.1637, |
| "reward": 0.9716518297791481, |
| "reward_std": 0.7042301401495934, |
| "rewards/accuracy_reward": 0.46294644884765146, |
| "rewards/format_reward": 0.5087053842842579, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.3317260742188, |
| "epoch": 2.5628997867803838, |
| "grad_norm": 8.81041431427002, |
| "kl": 4.831640625, |
| "learning_rate": 1.0322183865509054e-06, |
| "loss": 0.2796, |
| "reward": 1.0026786133646965, |
| "reward_std": 0.7037848606705666, |
| "rewards/accuracy_reward": 0.47232145331799985, |
| "rewards/format_reward": 0.5303571715950965, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.5529251098633, |
| "epoch": 2.605543710021322, |
| "grad_norm": 35.90947723388672, |
| "kl": 4.2283203125, |
| "learning_rate": 9.793807346132464e-07, |
| "loss": 0.2417, |
| "reward": 0.9839286230504513, |
| "reward_std": 0.728691854327917, |
| "rewards/accuracy_reward": 0.4665178794413805, |
| "rewards/format_reward": 0.5174107391387224, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.5623016357422, |
| "epoch": 2.64818763326226, |
| "grad_norm": 13.354021072387695, |
| "kl": 4.1529296875, |
| "learning_rate": 9.272677612385667e-07, |
| "loss": 0.2264, |
| "reward": 1.0729911237955094, |
| "reward_std": 0.7242146201431752, |
| "rewards/accuracy_reward": 0.5120535925030708, |
| "rewards/format_reward": 0.5609375245869159, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.2933280944824, |
| "epoch": 2.6908315565031984, |
| "grad_norm": 12.610240936279297, |
| "kl": 4.536328125, |
| "learning_rate": 8.759520053380591e-07, |
| "loss": 0.2337, |
| "reward": 1.0473214656114578, |
| "reward_std": 0.7167247369885444, |
| "rewards/accuracy_reward": 0.5008928783237934, |
| "rewards/format_reward": 0.5464285984635353, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.3977935791015, |
| "epoch": 2.7334754797441363, |
| "grad_norm": 4.791992664337158, |
| "kl": 3.415234375, |
| "learning_rate": 8.255048961321088e-07, |
| "loss": 0.1756, |
| "reward": 1.094419687986374, |
| "reward_std": 0.6878940530121327, |
| "rewards/accuracy_reward": 0.5276786014437675, |
| "rewards/format_reward": 0.5667410984635353, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.0634201049804, |
| "epoch": 2.7761194029850746, |
| "grad_norm": 6.484261512756348, |
| "kl": 4.11328125, |
| "learning_rate": 7.759966537240373e-07, |
| "loss": 0.2121, |
| "reward": 1.0223214700818062, |
| "reward_std": 0.7285358726978302, |
| "rewards/accuracy_reward": 0.48816966116428373, |
| "rewards/format_reward": 0.5341518081724643, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.43842086792, |
| "epoch": 2.818763326226013, |
| "grad_norm": 2.419487714767456, |
| "kl": 5.057421875, |
| "learning_rate": 7.274961913568773e-07, |
| "loss": 0.2439, |
| "reward": 0.9620536170899868, |
| "reward_std": 0.692897405475378, |
| "rewards/accuracy_reward": 0.45825895071029665, |
| "rewards/format_reward": 0.503794664889574, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.2620796203613, |
| "epoch": 2.861407249466951, |
| "grad_norm": 4.212592124938965, |
| "kl": 3.05234375, |
| "learning_rate": 6.800710194892484e-07, |
| "loss": 0.1322, |
| "reward": 1.040625052154064, |
| "reward_std": 0.6996075876057148, |
| "rewards/accuracy_reward": 0.4897321633994579, |
| "rewards/format_reward": 0.550892885029316, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.5268157958984, |
| "epoch": 2.9040511727078893, |
| "grad_norm": 7.860717296600342, |
| "kl": 4.29765625, |
| "learning_rate": 6.33787151823836e-07, |
| "loss": 0.1941, |
| "reward": 1.003794687986374, |
| "reward_std": 0.7180271342396736, |
| "rewards/accuracy_reward": 0.4808035932481289, |
| "rewards/format_reward": 0.522991093993187, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.5397575378418, |
| "epoch": 2.946695095948827, |
| "grad_norm": 2.772329092025757, |
| "kl": 3.987109375, |
| "learning_rate": 5.887090134192947e-07, |
| "loss": 0.2082, |
| "reward": 1.0125000461935998, |
| "reward_std": 0.6709666073322296, |
| "rewards/accuracy_reward": 0.48437502384185793, |
| "rewards/format_reward": 0.5281250223517417, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.3073959350586, |
| "epoch": 2.9893390191897655, |
| "grad_norm": 5.136852264404297, |
| "kl": 3.92734375, |
| "learning_rate": 5.448993510134669e-07, |
| "loss": 0.1783, |
| "reward": 0.9609375447034836, |
| "reward_std": 0.6889998987317085, |
| "rewards/accuracy_reward": 0.4497768072411418, |
| "rewards/format_reward": 0.5111607395112514, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.997867803837953, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 572.8553040519594, |
| "eval_kl": 5.3444940476190474, |
| "eval_loss": 0.2360815405845642, |
| "eval_reward": 0.9090136440973433, |
| "eval_reward_std": 0.6795897848076291, |
| "eval_rewards/accuracy_reward": 0.42772110799948376, |
| "eval_rewards/format_reward": 0.48129253917270237, |
| "eval_runtime": 816.0021, |
| "eval_samples_per_second": 0.613, |
| "eval_steps_per_second": 0.006, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.4361267089844, |
| "epoch": 3.0341151385927505, |
| "grad_norm": 9.054045677185059, |
| "kl": 4.098046875, |
| "learning_rate": 5.024191456827498e-07, |
| "loss": 0.1624, |
| "reward": 0.9511161103844643, |
| "reward_std": 0.6624684408307076, |
| "rewards/accuracy_reward": 0.46138395071029664, |
| "rewards/format_reward": 0.4897321678698063, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.6207885742188, |
| "epoch": 3.076759061833689, |
| "grad_norm": 9.051790237426758, |
| "kl": 3.809765625, |
| "learning_rate": 4.6132752795918667e-07, |
| "loss": 0.1558, |
| "reward": 1.0131696924567222, |
| "reward_std": 0.6962591715157032, |
| "rewards/accuracy_reward": 0.48772323653101923, |
| "rewards/format_reward": 0.5254464507102966, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.1560546875, |
| "epoch": 3.1194029850746268, |
| "grad_norm": 3.945366144180298, |
| "kl": 4.57890625, |
| "learning_rate": 4.2168169552342905e-07, |
| "loss": 0.1879, |
| "reward": 0.968080396950245, |
| "reward_std": 0.6995945557951927, |
| "rewards/accuracy_reward": 0.4647321619093418, |
| "rewards/format_reward": 0.5033482357859611, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.9433326721191, |
| "epoch": 3.162046908315565, |
| "grad_norm": 8.444324493408203, |
| "kl": 3.914453125, |
| "learning_rate": 3.8353683358814046e-07, |
| "loss": 0.158, |
| "reward": 0.9988839834928512, |
| "reward_std": 0.701323488354683, |
| "rewards/accuracy_reward": 0.47254466637969017, |
| "rewards/format_reward": 0.5263393051922322, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.7756988525391, |
| "epoch": 3.204690831556503, |
| "grad_norm": 1.7602710723876953, |
| "kl": 4.1578125, |
| "learning_rate": 3.469460380826697e-07, |
| "loss": 0.1662, |
| "reward": 0.9948661148548126, |
| "reward_std": 0.6981454014778137, |
| "rewards/accuracy_reward": 0.48147323727607727, |
| "rewards/format_reward": 0.5133928775787353, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 563.1116333007812, |
| "epoch": 3.2473347547974414, |
| "grad_norm": 4.784106731414795, |
| "kl": 3.6669921875, |
| "learning_rate": 3.119602417459075e-07, |
| "loss": 0.1403, |
| "reward": 1.0319196850061416, |
| "reward_std": 0.699262548983097, |
| "rewards/accuracy_reward": 0.4930803768336773, |
| "rewards/format_reward": 0.5388393096625805, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 563.9397567749023, |
| "epoch": 3.2899786780383797, |
| "grad_norm": 4.571471691131592, |
| "kl": 4.50703125, |
| "learning_rate": 2.786281432302071e-07, |
| "loss": 0.1753, |
| "reward": 1.053348256647587, |
| "reward_std": 0.7323236554861069, |
| "rewards/accuracy_reward": 0.5071428805589676, |
| "rewards/format_reward": 0.5462053820490838, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.5582817077636, |
| "epoch": 3.3326226012793176, |
| "grad_norm": 8.077947616577148, |
| "kl": 3.4154296875, |
| "learning_rate": 2.46996139315057e-07, |
| "loss": 0.115, |
| "reward": 0.9888393305242061, |
| "reward_std": 0.6875140987336635, |
| "rewards/accuracy_reward": 0.470982164517045, |
| "rewards/format_reward": 0.5178571619093418, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 555.0024826049805, |
| "epoch": 3.375266524520256, |
| "grad_norm": 6.658145427703857, |
| "kl": 3.830078125, |
| "learning_rate": 2.1710826032485286e-07, |
| "loss": 0.1539, |
| "reward": 1.0189732655882835, |
| "reward_std": 0.7024340078234672, |
| "rewards/accuracy_reward": 0.49129465967416763, |
| "rewards/format_reward": 0.5276785992085934, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.7460098266602, |
| "epoch": 3.417910447761194, |
| "grad_norm": 5.9571213722229, |
| "kl": 4.2994140625, |
| "learning_rate": 1.8900610884066817e-07, |
| "loss": 0.1664, |
| "reward": 1.0589286148548127, |
| "reward_std": 0.704091303050518, |
| "rewards/accuracy_reward": 0.5145089514553547, |
| "rewards/format_reward": 0.5444196730852127, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.391983795166, |
| "epoch": 3.4605543710021323, |
| "grad_norm": 1.934449553489685, |
| "kl": 3.8435546875, |
| "learning_rate": 1.627288017913383e-07, |
| "loss": 0.1552, |
| "reward": 1.032812552154064, |
| "reward_std": 0.7122370585799217, |
| "rewards/accuracy_reward": 0.49486609250307084, |
| "rewards/format_reward": 0.5379464536905288, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 556.8665435791015, |
| "epoch": 3.50319829424307, |
| "grad_norm": 2.779977798461914, |
| "kl": 3.684375, |
| "learning_rate": 1.3831291600445573e-07, |
| "loss": 0.1404, |
| "reward": 1.0571429044008256, |
| "reward_std": 0.706598898023367, |
| "rewards/accuracy_reward": 0.5091518111526966, |
| "rewards/format_reward": 0.5479910977184772, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.1424331665039, |
| "epoch": 3.5458422174840085, |
| "grad_norm": 2.621488094329834, |
| "kl": 4.400390625, |
| "learning_rate": 1.1579243729307487e-07, |
| "loss": 0.1684, |
| "reward": 0.9808036178350449, |
| "reward_std": 0.676436859369278, |
| "rewards/accuracy_reward": 0.466294664144516, |
| "rewards/format_reward": 0.514508954435587, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.8049331665039, |
| "epoch": 3.588486140724947, |
| "grad_norm": 1.7964094877243042, |
| "kl": 3.96171875, |
| "learning_rate": 9.519871314899092e-08, |
| "loss": 0.1398, |
| "reward": 0.9908482581377029, |
| "reward_std": 0.6953834608197212, |
| "rewards/accuracy_reward": 0.47723216786980627, |
| "rewards/format_reward": 0.5136160954833031, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 554.9152008056641, |
| "epoch": 3.631130063965885, |
| "grad_norm": 3.415715456008911, |
| "kl": 3.9677734375, |
| "learning_rate": 7.656040910844358e-08, |
| "loss": 0.1558, |
| "reward": 1.0417411237955094, |
| "reward_std": 0.7144467569887638, |
| "rewards/accuracy_reward": 0.5002232410013676, |
| "rewards/format_reward": 0.541517885774374, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.6768142700196, |
| "epoch": 3.673773987206823, |
| "grad_norm": 2.8959734439849854, |
| "kl": 3.8453125, |
| "learning_rate": 5.990346885098235e-08, |
| "loss": 0.1582, |
| "reward": 1.0953125476837158, |
| "reward_std": 0.7153457693755627, |
| "rewards/accuracy_reward": 0.529241094365716, |
| "rewards/format_reward": 0.5660714536905289, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.2187751770019, |
| "epoch": 3.716417910447761, |
| "grad_norm": 1.9467185735702515, |
| "kl": 4.092578125, |
| "learning_rate": 4.5251078087033493e-08, |
| "loss": 0.1599, |
| "reward": 1.00379468947649, |
| "reward_std": 0.6785944283008576, |
| "rewards/accuracy_reward": 0.4821428760886192, |
| "rewards/format_reward": 0.5216518089175224, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.1417694091797, |
| "epoch": 3.7590618336886994, |
| "grad_norm": 2.523850917816162, |
| "kl": 4.1220703125, |
| "learning_rate": 3.262363228443427e-08, |
| "loss": 0.1558, |
| "reward": 0.9935268282890319, |
| "reward_std": 0.6972592443227768, |
| "rewards/accuracy_reward": 0.4779018059372902, |
| "rewards/format_reward": 0.5156250230967998, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.5212303161621, |
| "epoch": 3.8017057569296373, |
| "grad_norm": 1.803943157196045, |
| "kl": 4.0076171875, |
| "learning_rate": 2.2038708278862952e-08, |
| "loss": 0.165, |
| "reward": 0.9732143253087997, |
| "reward_std": 0.7266230128705502, |
| "rewards/accuracy_reward": 0.47075894847512245, |
| "rewards/format_reward": 0.5024553813040257, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.6984603881835, |
| "epoch": 3.8443496801705757, |
| "grad_norm": 5.807371616363525, |
| "kl": 3.9375, |
| "learning_rate": 1.3511039807673209e-08, |
| "loss": 0.1457, |
| "reward": 1.0058036133646966, |
| "reward_std": 0.7280482016503811, |
| "rewards/accuracy_reward": 0.4801339514553547, |
| "rewards/format_reward": 0.5256696626543998, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.6140846252441, |
| "epoch": 3.886993603411514, |
| "grad_norm": 2.526625394821167, |
| "kl": 3.7375, |
| "learning_rate": 7.0524970011963675e-09, |
| "loss": 0.1027, |
| "reward": 0.996428620815277, |
| "reward_std": 0.6856365635991096, |
| "rewards/accuracy_reward": 0.4709821633994579, |
| "rewards/format_reward": 0.5254464507102966, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.9491348266602, |
| "epoch": 3.929637526652452, |
| "grad_norm": 1.175413727760315, |
| "kl": 3.96484375, |
| "learning_rate": 2.6720698600553595e-09, |
| "loss": 0.1365, |
| "reward": 1.0109375432133674, |
| "reward_std": 0.7361011810600757, |
| "rewards/accuracy_reward": 0.4879464481025934, |
| "rewards/format_reward": 0.522991093993187, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.8350677490234, |
| "epoch": 3.9722814498933903, |
| "grad_norm": 3.024672269821167, |
| "kl": 3.743359375, |
| "learning_rate": 3.7585574148779613e-10, |
| "loss": 0.1276, |
| "reward": 1.035491119325161, |
| "reward_std": 0.7265422374010087, |
| "rewards/accuracy_reward": 0.4973214499652386, |
| "rewards/format_reward": 0.5381696693599224, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.997867803837953, |
| "eval_clip_ratio": 0.0, |
| "eval_completion_length": 531.8020145476811, |
| "eval_kl": 4.111359126984127, |
| "eval_loss": 0.11702829599380493, |
| "eval_reward": 0.9311224893918113, |
| "eval_reward_std": 0.7056213607863774, |
| "eval_rewards/accuracy_reward": 0.4399093160080531, |
| "eval_rewards/format_reward": 0.4912131717280736, |
| "eval_runtime": 797.4216, |
| "eval_samples_per_second": 0.627, |
| "eval_steps_per_second": 0.006, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.9717648824056, |
| "epoch": 3.997867803837953, |
| "kl": 3.7174479166666665, |
| "reward": 1.0145089824994404, |
| "reward_std": 0.6882362899680933, |
| "rewards/accuracy_reward": 0.4813988270858924, |
| "rewards/format_reward": 0.5331101417541504, |
| "step": 468, |
| "total_flos": 0.0, |
| "train_loss": 0.1188597827950795, |
| "train_runtime": 66948.3883, |
| "train_samples_per_second": 0.448, |
| "train_steps_per_second": 0.007 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 468, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|