{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.374772190851782,
"eval_steps": 100,
"global_step": 1566,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 562.7477951049805,
"epoch": 0.0019121030145499089,
"grad_norm": 0.022798627614974976,
"kl": 0.0,
"learning_rate": 1.5923566878980894e-08,
"loss": 0.002,
"num_tokens": 3752220.0,
"reward": 0.011439732770668343,
"reward_std": 0.019404857070185244,
"rewards/pure_accuracy_reward_math": 0.011439732537837699,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.0038242060290998177,
"grad_norm": 0.02280641719698906,
"kl": 0.0,
"learning_rate": 3.184713375796179e-08,
"loss": 0.002,
"step": 2
},
{
"clip_ratio": 7.760171627069212e-05,
"epoch": 0.005736309043649726,
"grad_norm": 0.02249608002603054,
"kl": 0.00034177303314208984,
"learning_rate": 4.777070063694268e-08,
"loss": 0.002,
"step": 3
},
{
"clip_ratio": 7.010291557207893e-05,
"epoch": 0.0076484120581996355,
"grad_norm": 0.022546162828803062,
"kl": 0.0003476440906524658,
"learning_rate": 6.369426751592358e-08,
"loss": 0.002,
"step": 4
},
{
"clip_ratio": 6.121935876990392e-05,
"epoch": 0.009560515072749545,
"grad_norm": 0.022293007001280785,
"kl": 0.00034675002098083496,
"learning_rate": 7.961783439490447e-08,
"loss": 0.002,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 569.8786544799805,
"epoch": 0.011472618087299453,
"grad_norm": 0.04666038230061531,
"kl": 0.000458449125289917,
"learning_rate": 9.554140127388536e-08,
"loss": 0.0036,
"num_tokens": 7526881.0,
"reward": 0.010323661233996972,
"reward_std": 0.01923220051685348,
"rewards/pure_accuracy_reward_math": 0.01032366111758165,
"step": 6
},
{
"clip_ratio": 9.284320668712098e-05,
"epoch": 0.013384721101849363,
"grad_norm": 0.03701707720756531,
"kl": 0.0004444718360900879,
"learning_rate": 1.1146496815286625e-07,
"loss": 0.0037,
"step": 7
},
{
"clip_ratio": 0.00010049525423028172,
"epoch": 0.015296824116399271,
"grad_norm": 0.05443934351205826,
"kl": 0.0004649162292480469,
"learning_rate": 1.2738853503184715e-07,
"loss": 0.0037,
"step": 8
},
{
"clip_ratio": 9.395023369052069e-05,
"epoch": 0.01720892713094918,
"grad_norm": 0.0357414111495018,
"kl": 0.0004501640796661377,
"learning_rate": 1.4331210191082803e-07,
"loss": 0.0037,
"step": 9
},
{
"clip_ratio": 0.00010371651984542041,
"epoch": 0.01912103014549909,
"grad_norm": 0.05199029669165611,
"kl": 0.0004614591598510742,
"learning_rate": 1.5923566878980893e-07,
"loss": 0.0037,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 570.0694994926453,
"epoch": 0.021033133160048997,
"grad_norm": 0.022845298051834106,
"kl": 0.00035685300827026367,
"learning_rate": 1.751592356687898e-07,
"loss": 0.0025,
"num_tokens": 11302358.0,
"reward": 0.00948660756694153,
"reward_std": 0.017558093182742596,
"rewards/pure_accuracy_reward_math": 0.00948660756694153,
"step": 11
},
{
"clip_ratio": 7.08361723127382e-05,
"epoch": 0.022945236174598906,
"grad_norm": 0.02234972082078457,
"kl": 0.0003580451011657715,
"learning_rate": 1.9108280254777072e-07,
"loss": 0.0025,
"step": 12
},
{
"clip_ratio": 6.80922717606336e-05,
"epoch": 0.024857339189148814,
"grad_norm": 0.021554963663220406,
"kl": 0.00035765767097473145,
"learning_rate": 2.070063694267516e-07,
"loss": 0.0024,
"step": 13
},
{
"clip_ratio": 7.82350492158912e-05,
"epoch": 0.026769442203698725,
"grad_norm": 0.02103673666715622,
"kl": 0.000364154577255249,
"learning_rate": 2.229299363057325e-07,
"loss": 0.0025,
"step": 14
},
{
"clip_ratio": 7.339451894949889e-05,
"epoch": 0.028681545218248634,
"grad_norm": 0.023219415917992592,
"kl": 0.00036078691482543945,
"learning_rate": 2.3885350318471343e-07,
"loss": 0.0025,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 560.8797726631165,
"epoch": 0.030593648232798542,
"grad_norm": 0.024746257811784744,
"kl": 0.0003574490547180176,
"learning_rate": 2.547770700636943e-07,
"loss": 0.0041,
"num_tokens": 15044695.0,
"reward": 0.011160714813740924,
"reward_std": 0.0194911856087856,
"rewards/pure_accuracy_reward_math": 0.011160714755533263,
"step": 16
},
{
"clip_ratio": 9.0199953319825e-05,
"epoch": 0.032505751247348454,
"grad_norm": 0.02409624680876732,
"kl": 0.0003629624843597412,
"learning_rate": 2.707006369426752e-07,
"loss": 0.0042,
"step": 17
},
{
"clip_ratio": 8.157364351291108e-05,
"epoch": 0.03441785426189836,
"grad_norm": 0.023118698969483376,
"kl": 0.0003673136234283447,
"learning_rate": 2.8662420382165606e-07,
"loss": 0.0041,
"step": 18
},
{
"clip_ratio": 9.048881202033954e-05,
"epoch": 0.03632995727644827,
"grad_norm": 0.02316245064139366,
"kl": 0.00036725401878356934,
"learning_rate": 3.02547770700637e-07,
"loss": 0.0041,
"step": 19
},
{
"clip_ratio": 8.188984941170929e-05,
"epoch": 0.03824206029099818,
"grad_norm": 0.021714523434638977,
"kl": 0.0003698766231536865,
"learning_rate": 3.1847133757961787e-07,
"loss": 0.0041,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 554.6908774375916,
"epoch": 0.040154163305548086,
"grad_norm": 0.021168457344174385,
"kl": 0.000368267297744751,
"learning_rate": 3.3439490445859875e-07,
"loss": 0.0026,
"num_tokens": 18758275.0,
"reward": 0.010044643335277215,
"reward_std": 0.018202457285951823,
"rewards/pure_accuracy_reward_math": 0.010044643277069554,
"step": 21
},
{
"clip_ratio": 7.562077911416054e-05,
"epoch": 0.042066266320097995,
"grad_norm": 0.020001132041215897,
"kl": 0.00037425756454467773,
"learning_rate": 3.503184713375796e-07,
"loss": 0.0026,
"step": 22
},
{
"clip_ratio": 7.507880479806772e-05,
"epoch": 0.0439783693346479,
"grad_norm": 0.019386926665902138,
"kl": 0.0003781616687774658,
"learning_rate": 3.6624203821656055e-07,
"loss": 0.0026,
"step": 23
},
{
"clip_ratio": 7.805726602327923e-05,
"epoch": 0.04589047234919781,
"grad_norm": 0.018619129434227943,
"kl": 0.0003878176212310791,
"learning_rate": 3.8216560509554143e-07,
"loss": 0.0026,
"step": 24
},
{
"clip_ratio": 6.671031508176384e-05,
"epoch": 0.04780257536374772,
"grad_norm": 0.01833859272301197,
"kl": 0.00040024518966674805,
"learning_rate": 3.980891719745223e-07,
"loss": 0.0026,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 552.0828938484192,
"epoch": 0.04971467837829763,
"grad_norm": 0.02587960660457611,
"kl": 0.00041344761848449707,
"learning_rate": 4.140127388535032e-07,
"loss": 0.0024,
"num_tokens": 22468764.0,
"reward": 0.012276786321308464,
"reward_std": 0.022195036057382822,
"rewards/pure_accuracy_reward_math": 0.012276786204893142,
"step": 26
},
{
"clip_ratio": 9.613389988771814e-05,
"epoch": 0.05162678139284754,
"grad_norm": 0.02422533929347992,
"kl": 0.00043016672134399414,
"learning_rate": 4.2993630573248406e-07,
"loss": 0.0024,
"step": 27
},
{
"clip_ratio": 8.45099556840978e-05,
"epoch": 0.05353888440739745,
"grad_norm": 0.023998353630304337,
"kl": 0.0004411041736602783,
"learning_rate": 4.45859872611465e-07,
"loss": 0.0024,
"step": 28
},
{
"clip_ratio": 9.715859295056362e-05,
"epoch": 0.05545098742194736,
"grad_norm": 0.023024486377835274,
"kl": 0.0004749894142150879,
"learning_rate": 4.6178343949044587e-07,
"loss": 0.0024,
"step": 29
},
{
"clip_ratio": 9.816014483021718e-05,
"epoch": 0.05736309043649727,
"grad_norm": 0.022171439602971077,
"kl": 0.0005015134811401367,
"learning_rate": 4.777070063694269e-07,
"loss": 0.0024,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 549.791042804718,
"epoch": 0.059275193451047176,
"grad_norm": 0.027614159509539604,
"kl": 0.0005223453044891357,
"learning_rate": 4.936305732484077e-07,
"loss": 0.0029,
"num_tokens": 26170579.0,
"reward": 0.017299108090810478,
"reward_std": 0.03018019301816821,
"rewards/pure_accuracy_reward_math": 0.017299107741564512,
"step": 31
},
{
"clip_ratio": 0.00012569415866892086,
"epoch": 0.061187296465597084,
"grad_norm": 0.02653171494603157,
"kl": 0.0005522072315216064,
"learning_rate": 5.095541401273886e-07,
"loss": 0.0029,
"step": 32
},
{
"clip_ratio": 0.00012863677034147258,
"epoch": 0.06309939948014699,
"grad_norm": 0.0255680400878191,
"kl": 0.0005916953086853027,
"learning_rate": 5.254777070063695e-07,
"loss": 0.0029,
"step": 33
},
{
"clip_ratio": 0.00012797017114962728,
"epoch": 0.06501150249469691,
"grad_norm": 0.02455417811870575,
"kl": 0.0006306171417236328,
"learning_rate": 5.414012738853504e-07,
"loss": 0.0029,
"step": 34
},
{
"clip_ratio": 0.00012855784757448419,
"epoch": 0.06692360550924681,
"grad_norm": 0.024154040962457657,
"kl": 0.0006751418113708496,
"learning_rate": 5.573248407643312e-07,
"loss": 0.0029,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 552.728542804718,
"epoch": 0.06883570852379672,
"grad_norm": 0.023450903594493866,
"kl": 0.000738978385925293,
"learning_rate": 5.732484076433121e-07,
"loss": 0.0034,
"num_tokens": 29883398.0,
"reward": 0.018415179511066526,
"reward_std": 0.030997214023955166,
"rewards/pure_accuracy_reward_math": 0.01841517922002822,
"step": 36
},
{
"clip_ratio": 0.00012425195308196635,
"epoch": 0.07074781153834662,
"grad_norm": 0.023070134222507477,
"kl": 0.0007783770561218262,
"learning_rate": 5.89171974522293e-07,
"loss": 0.0034,
"step": 37
},
{
"clip_ratio": 0.00012334759713894528,
"epoch": 0.07265991455289654,
"grad_norm": 0.023447532206773758,
"kl": 0.0008447170257568359,
"learning_rate": 6.05095541401274e-07,
"loss": 0.0034,
"step": 38
},
{
"clip_ratio": 0.00012615493608336692,
"epoch": 0.07457201756744644,
"grad_norm": 0.024682210758328438,
"kl": 0.0009213089942932129,
"learning_rate": 6.210191082802549e-07,
"loss": 0.0034,
"step": 39
},
{
"clip_ratio": 0.00012461718182521508,
"epoch": 0.07648412058199636,
"grad_norm": 0.02555885910987854,
"kl": 0.000977635383605957,
"learning_rate": 6.369426751592357e-07,
"loss": 0.0033,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 527.030158996582,
"epoch": 0.07839622359654626,
"grad_norm": 0.059237755835056305,
"kl": 0.001125633716583252,
"learning_rate": 6.528662420382166e-07,
"loss": 0.0031,
"num_tokens": 33502406.0,
"reward": 0.024832590454025194,
"reward_std": 0.04194520629243925,
"rewards/pure_accuracy_reward_math": 0.02483259010477923,
"step": 41
},
{
"clip_ratio": 0.00016323180295785278,
"epoch": 0.08030832661109617,
"grad_norm": 0.029172642156481743,
"kl": 0.0011183619499206543,
"learning_rate": 6.687898089171975e-07,
"loss": 0.0031,
"step": 42
},
{
"clip_ratio": 0.0001751068371618203,
"epoch": 0.08222042962564609,
"grad_norm": 0.030453085899353027,
"kl": 0.0011813640594482422,
"learning_rate": 6.847133757961784e-07,
"loss": 0.0031,
"step": 43
},
{
"clip_ratio": 0.00018521026674989116,
"epoch": 0.08413253264019599,
"grad_norm": 0.03091653250157833,
"kl": 0.0012224912643432617,
"learning_rate": 7.006369426751592e-07,
"loss": 0.0031,
"step": 44
},
{
"clip_ratio": 0.00017049979595640252,
"epoch": 0.0860446356547459,
"grad_norm": 0.030593233183026314,
"kl": 0.0012733936309814453,
"learning_rate": 7.165605095541401e-07,
"loss": 0.0031,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 529.7360739707947,
"epoch": 0.0879567386692958,
"grad_norm": 0.03371572494506836,
"kl": 0.0012704133987426758,
"learning_rate": 7.324840764331211e-07,
"loss": 0.0039,
"num_tokens": 37133676.0,
"reward": 0.029017858760198578,
"reward_std": 0.04838265001308173,
"rewards/pure_accuracy_reward_math": 0.029017858061706647,
"step": 46
},
{
"clip_ratio": 0.000227557278265067,
"epoch": 0.08986884168384572,
"grad_norm": 0.033185359090566635,
"kl": 0.0012688040733337402,
"learning_rate": 7.48407643312102e-07,
"loss": 0.0039,
"step": 47
},
{
"clip_ratio": 0.0002238695693677073,
"epoch": 0.09178094469839562,
"grad_norm": 0.03329231217503548,
"kl": 0.0013200044631958008,
"learning_rate": 7.643312101910829e-07,
"loss": 0.0039,
"step": 48
},
{
"clip_ratio": 0.00021458888153347289,
"epoch": 0.09369304771294554,
"grad_norm": 0.03329336270689964,
"kl": 0.0013244152069091797,
"learning_rate": 7.802547770700637e-07,
"loss": 0.0039,
"step": 49
},
{
"clip_ratio": 0.0002193794426830209,
"epoch": 0.09560515072749544,
"grad_norm": 0.0323607362806797,
"kl": 0.0013269782066345215,
"learning_rate": 7.961783439490446e-07,
"loss": 0.0039,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 535.9352931976318,
"epoch": 0.09751725374204535,
"grad_norm": 0.030199358239769936,
"kl": 0.0013506412506103516,
"learning_rate": 8.121019108280255e-07,
"loss": 0.0042,
"num_tokens": 40789896.0,
"reward": 0.030970983498264104,
"reward_std": 0.0486878992523998,
"rewards/pure_accuracy_reward_math": 0.030970983090810478,
"step": 51
},
{
"clip_ratio": 0.00019589511845197194,
"epoch": 0.09942935675659526,
"grad_norm": 0.029786745086312294,
"kl": 0.001370549201965332,
"learning_rate": 8.280254777070064e-07,
"loss": 0.0042,
"step": 52
},
{
"clip_ratio": 0.00021279048064570816,
"epoch": 0.10134145977114517,
"grad_norm": 0.029834378510713577,
"kl": 0.0013399124145507812,
"learning_rate": 8.439490445859872e-07,
"loss": 0.0042,
"step": 53
},
{
"clip_ratio": 0.000190277668878025,
"epoch": 0.10325356278569509,
"grad_norm": 0.029410598799586296,
"kl": 0.00139617919921875,
"learning_rate": 8.598726114649681e-07,
"loss": 0.0042,
"step": 54
},
{
"clip_ratio": 0.00019459096591845082,
"epoch": 0.10516566580024499,
"grad_norm": 0.02935440093278885,
"kl": 0.0014204978942871094,
"learning_rate": 8.757961783439491e-07,
"loss": 0.0042,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 536.9548239707947,
"epoch": 0.1070777688147949,
"grad_norm": 0.02805081568658352,
"kl": 0.0014168024063110352,
"learning_rate": 8.9171974522293e-07,
"loss": 0.0048,
"num_tokens": 44444894.0,
"reward": 0.027901787223527208,
"reward_std": 0.04121451411629096,
"rewards/pure_accuracy_reward_math": 0.02790178669965826,
"step": 56
},
{
"clip_ratio": 0.00016821016617996065,
"epoch": 0.1089898718293448,
"grad_norm": 0.02779608964920044,
"kl": 0.0014551877975463867,
"learning_rate": 9.076433121019109e-07,
"loss": 0.0048,
"step": 57
},
{
"clip_ratio": 0.00018197509814399382,
"epoch": 0.11090197484389472,
"grad_norm": 0.02721741609275341,
"kl": 0.0014206171035766602,
"learning_rate": 9.235668789808917e-07,
"loss": 0.0048,
"step": 58
},
{
"clip_ratio": 0.00016919344039934003,
"epoch": 0.11281407785844462,
"grad_norm": 0.02676265314221382,
"kl": 0.0014575719833374023,
"learning_rate": 9.394904458598727e-07,
"loss": 0.0048,
"step": 59
},
{
"clip_ratio": 0.00017069062050723005,
"epoch": 0.11472618087299453,
"grad_norm": 0.027010478079319,
"kl": 0.0014843940734863281,
"learning_rate": 9.554140127388537e-07,
"loss": 0.0048,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 522.1998043060303,
"epoch": 0.11663828388754444,
"grad_norm": 0.030231643468141556,
"kl": 0.0015106201171875,
"learning_rate": 9.713375796178345e-07,
"loss": 0.0029,
"num_tokens": 48046694.0,
"reward": 0.02762276935391128,
"reward_std": 0.04623683576937765,
"rewards/pure_accuracy_reward_math": 0.02762276877183467,
"step": 61
},
{
"clip_ratio": 0.0001882643781527804,
"epoch": 0.11855038690209435,
"grad_norm": 0.030413959175348282,
"kl": 0.0015065670013427734,
"learning_rate": 9.872611464968155e-07,
"loss": 0.0029,
"step": 62
},
{
"clip_ratio": 0.00019050979824442038,
"epoch": 0.12046248991664425,
"grad_norm": 0.029997214674949646,
"kl": 0.0014984607696533203,
"learning_rate": 1.0031847133757962e-06,
"loss": 0.0029,
"step": 63
},
{
"clip_ratio": 0.0001963579389325787,
"epoch": 0.12237459293119417,
"grad_norm": 0.02927768975496292,
"kl": 0.0014634132385253906,
"learning_rate": 1.0191082802547772e-06,
"loss": 0.0029,
"step": 64
},
{
"clip_ratio": 0.0002130206620449826,
"epoch": 0.12428669594574408,
"grad_norm": 0.028719380497932434,
"kl": 0.0014470815658569336,
"learning_rate": 1.035031847133758e-06,
"loss": 0.0029,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 516.4283137321472,
"epoch": 0.12619879896029398,
"grad_norm": 0.031215306371450424,
"kl": 0.0014127492904663086,
"learning_rate": 1.050955414012739e-06,
"loss": 0.0038,
"num_tokens": 51628501.0,
"reward": 0.03487723405123688,
"reward_std": 0.05173706269124523,
"rewards/pure_accuracy_reward_math": 0.03487723323632963,
"step": 66
},
{
"clip_ratio": 0.00019433782460964721,
"epoch": 0.1281109019748439,
"grad_norm": 0.03108724020421505,
"kl": 0.0014324188232421875,
"learning_rate": 1.06687898089172e-06,
"loss": 0.0038,
"step": 67
},
{
"clip_ratio": 0.00020085336353758976,
"epoch": 0.13002300498939381,
"grad_norm": 0.030220478773117065,
"kl": 0.0014306306838989258,
"learning_rate": 1.0828025477707007e-06,
"loss": 0.0038,
"step": 68
},
{
"clip_ratio": 0.00021161197844321578,
"epoch": 0.1319351080039437,
"grad_norm": 0.030320733785629272,
"kl": 0.001450181007385254,
"learning_rate": 1.0987261146496817e-06,
"loss": 0.0038,
"step": 69
},
{
"clip_ratio": 0.00019352555551677142,
"epoch": 0.13384721101849362,
"grad_norm": 0.02980073168873787,
"kl": 0.0014796257019042969,
"learning_rate": 1.1146496815286625e-06,
"loss": 0.0038,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 525.2854599952698,
"epoch": 0.13575931403304353,
"grad_norm": 0.0338500440120697,
"kl": 0.0015006065368652344,
"learning_rate": 1.1305732484076435e-06,
"loss": 0.006,
"num_tokens": 55247180.0,
"reward": 0.03710937674622983,
"reward_std": 0.05426825548056513,
"rewards/pure_accuracy_reward_math": 0.037109375989530236,
"step": 71
},
{
"clip_ratio": 0.0002256608086668166,
"epoch": 0.13767141704759345,
"grad_norm": 0.03328324109315872,
"kl": 0.0015664100646972656,
"learning_rate": 1.1464968152866242e-06,
"loss": 0.006,
"step": 72
},
{
"clip_ratio": 0.0002166868289350532,
"epoch": 0.13958352006214333,
"grad_norm": 0.03267475962638855,
"kl": 0.0016113519668579102,
"learning_rate": 1.1624203821656052e-06,
"loss": 0.006,
"step": 73
},
{
"clip_ratio": 0.00024709346627105333,
"epoch": 0.14149562307669325,
"grad_norm": 0.032320376485586166,
"kl": 0.0017037391662597656,
"learning_rate": 1.178343949044586e-06,
"loss": 0.006,
"step": 74
},
{
"clip_ratio": 0.00021453456992048814,
"epoch": 0.14340772609124317,
"grad_norm": 0.0322573184967041,
"kl": 0.0017703771591186523,
"learning_rate": 1.194267515923567e-06,
"loss": 0.006,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 539.314199924469,
"epoch": 0.14531982910579308,
"grad_norm": 0.03833702206611633,
"kl": 0.0018039941787719727,
"learning_rate": 1.210191082802548e-06,
"loss": 0.0055,
"num_tokens": 58912938.0,
"reward": 0.04045759144355543,
"reward_std": 0.060838291130494326,
"rewards/pure_accuracy_reward_math": 0.040457590454025194,
"step": 76
},
{
"clip_ratio": 0.0002450900424548763,
"epoch": 0.147231932120343,
"grad_norm": 0.03705858439207077,
"kl": 0.0018303394317626953,
"learning_rate": 1.2261146496815287e-06,
"loss": 0.0055,
"step": 77
},
{
"clip_ratio": 0.0002520209266094753,
"epoch": 0.14914403513489288,
"grad_norm": 0.03624257072806358,
"kl": 0.0019118785858154297,
"learning_rate": 1.2420382165605097e-06,
"loss": 0.0055,
"step": 78
},
{
"clip_ratio": 0.00023157394139161624,
"epoch": 0.1510561381494428,
"grad_norm": 0.03626013919711113,
"kl": 0.001949906349182129,
"learning_rate": 1.2579617834394905e-06,
"loss": 0.0055,
"step": 79
},
{
"clip_ratio": 0.0002889583781211513,
"epoch": 0.1529682411639927,
"grad_norm": 0.03634464740753174,
"kl": 0.001984238624572754,
"learning_rate": 1.2738853503184715e-06,
"loss": 0.0055,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 531.2025918960571,
"epoch": 0.15488034417854263,
"grad_norm": 0.032439347356557846,
"kl": 0.0019190311431884766,
"learning_rate": 1.2898089171974522e-06,
"loss": 0.0067,
"num_tokens": 62551992.0,
"reward": 0.03766741280560382,
"reward_std": 0.0572711571585387,
"rewards/pure_accuracy_reward_math": 0.0376674119324889,
"step": 81
},
{
"clip_ratio": 0.00025730342139240747,
"epoch": 0.15679244719309252,
"grad_norm": 0.03198026493191719,
"kl": 0.001917123794555664,
"learning_rate": 1.3057324840764332e-06,
"loss": 0.0067,
"step": 82
},
{
"clip_ratio": 0.0002504205738205201,
"epoch": 0.15870455020764243,
"grad_norm": 0.02998184598982334,
"kl": 0.0019073486328125,
"learning_rate": 1.3216560509554142e-06,
"loss": 0.0067,
"step": 83
},
{
"clip_ratio": 0.00025362581419585695,
"epoch": 0.16061665322219235,
"grad_norm": 0.029601849615573883,
"kl": 0.0019354820251464844,
"learning_rate": 1.337579617834395e-06,
"loss": 0.0067,
"step": 84
},
{
"clip_ratio": 0.0003167184295307379,
"epoch": 0.16252875623674226,
"grad_norm": 0.030052170157432556,
"kl": 0.0019598007202148438,
"learning_rate": 1.353503184713376e-06,
"loss": 0.0067,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 527.9562168121338,
"epoch": 0.16444085925129218,
"grad_norm": 0.03331635147333145,
"kl": 0.002047300338745117,
"learning_rate": 1.3694267515923567e-06,
"loss": 0.0076,
"num_tokens": 66182275.0,
"reward": 0.04045759132714011,
"reward_std": 0.06074576545506716,
"rewards/pure_accuracy_reward_math": 0.04045759068685584,
"step": 86
},
{
"clip_ratio": 0.0002471263709367122,
"epoch": 0.16635296226584206,
"grad_norm": 0.03298444300889969,
"kl": 0.0020711421966552734,
"learning_rate": 1.3853503184713377e-06,
"loss": 0.0076,
"step": 87
},
{
"clip_ratio": 0.00024866302578629984,
"epoch": 0.16826506528039198,
"grad_norm": 0.03206898272037506,
"kl": 0.0020384788513183594,
"learning_rate": 1.4012738853503185e-06,
"loss": 0.0076,
"step": 88
},
{
"clip_ratio": 0.00026278120321876486,
"epoch": 0.1701771682949419,
"grad_norm": 0.03115510568022728,
"kl": 0.002008795738220215,
"learning_rate": 1.4171974522292995e-06,
"loss": 0.0076,
"step": 89
},
{
"clip_ratio": 0.000245522400405207,
"epoch": 0.1720892713094918,
"grad_norm": 0.030577220022678375,
"kl": 0.0019922256469726562,
"learning_rate": 1.4331210191082802e-06,
"loss": 0.0076,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 523.3948340415955,
"epoch": 0.1740013743240417,
"grad_norm": 0.0348118431866169,
"kl": 0.0019588470458984375,
"learning_rate": 1.4490445859872612e-06,
"loss": 0.0046,
"num_tokens": 69793534.0,
"reward": 0.04436384132714011,
"reward_std": 0.059376906079705805,
"rewards/pure_accuracy_reward_math": 0.044363840570440516,
"step": 91
},
{
"clip_ratio": 0.00021377558331892033,
"epoch": 0.1759134773385916,
"grad_norm": 0.03493114933371544,
"kl": 0.0019345283508300781,
"learning_rate": 1.4649681528662422e-06,
"loss": 0.0046,
"step": 92
},
{
"clip_ratio": 0.00023636125789039397,
"epoch": 0.17782558035314153,
"grad_norm": 0.03362264111638069,
"kl": 0.0019860267639160156,
"learning_rate": 1.480891719745223e-06,
"loss": 0.0046,
"step": 93
},
{
"clip_ratio": 0.00022836430440520417,
"epoch": 0.17973768336769144,
"grad_norm": 0.03336656093597412,
"kl": 0.002032160758972168,
"learning_rate": 1.496815286624204e-06,
"loss": 0.0045,
"step": 94
},
{
"clip_ratio": 0.00024139108904819295,
"epoch": 0.18164978638224133,
"grad_norm": 0.03235051408410072,
"kl": 0.0021082162857055664,
"learning_rate": 1.5127388535031847e-06,
"loss": 0.0045,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 530.8058252334595,
"epoch": 0.18356188939679124,
"grad_norm": 0.03482769802212715,
"kl": 0.0021872520446777344,
"learning_rate": 1.5286624203821657e-06,
"loss": 0.0075,
"num_tokens": 73427974.0,
"reward": 0.04101562709547579,
"reward_std": 0.06101094774203375,
"rewards/pure_accuracy_reward_math": 0.04101562616415322,
"step": 96
},
{
"clip_ratio": 0.00024072786442275174,
"epoch": 0.18547399241134116,
"grad_norm": 0.03345990553498268,
"kl": 0.002261519432067871,
"learning_rate": 1.5445859872611465e-06,
"loss": 0.0075,
"step": 97
},
{
"clip_ratio": 0.00024480573915752757,
"epoch": 0.18738609542589107,
"grad_norm": 0.03318383917212486,
"kl": 0.0022890567779541016,
"learning_rate": 1.5605095541401275e-06,
"loss": 0.0075,
"step": 98
},
{
"clip_ratio": 0.00027489714915418517,
"epoch": 0.189298198440441,
"grad_norm": 0.03230712562799454,
"kl": 0.0023267269134521484,
"learning_rate": 1.5764331210191083e-06,
"loss": 0.0074,
"step": 99
},
{
"clip_ratio": 0.00029621877195040724,
"epoch": 0.19121030145499088,
"grad_norm": 0.03260359168052673,
"kl": 0.002334117889404297,
"learning_rate": 1.5923566878980892e-06,
"loss": 0.0074,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 526.358283996582,
"epoch": 0.0019121030145499089,
"grad_norm": 0.30763256549835205,
"kl": 0.0024461746215820312,
"learning_rate": 1.6082802547770702e-06,
"loss": 0.0053,
"num_tokens": 3621800.0,
"reward": 0.0546875023865141,
"reward_std": 0.06997958500869572,
"rewards/pure_accuracy_reward_math": 0.054687501629814506,
"step": 101
},
{
"clip_ratio": 0.00028505406811518696,
"epoch": 0.0038242060290998177,
"grad_norm": 0.7424792647361755,
"kl": 0.005189061164855957,
"learning_rate": 1.624203821656051e-06,
"loss": 0.0054,
"step": 102
},
{
"clip_ratio": 0.000307778484739174,
"epoch": 0.005736309043649726,
"grad_norm": 0.5747273564338684,
"kl": 0.005206584930419922,
"learning_rate": 1.640127388535032e-06,
"loss": 0.0054,
"step": 103
},
{
"clip_ratio": 0.0003712488735345687,
"epoch": 0.0076484120581996355,
"grad_norm": 0.15304483473300934,
"kl": 0.0026189088821411133,
"learning_rate": 1.6560509554140127e-06,
"loss": 0.0053,
"step": 104
},
{
"clip_ratio": 0.00037476027159755176,
"epoch": 0.009560515072749545,
"grad_norm": 0.2118157148361206,
"kl": 0.00246584415435791,
"learning_rate": 1.6719745222929937e-06,
"loss": 0.0053,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 527.46431016922,
"epoch": 0.011472618087299453,
"grad_norm": 0.2036779820919037,
"kl": 0.0037467479705810547,
"learning_rate": 1.6878980891719745e-06,
"loss": 0.0067,
"num_tokens": 7244448.0,
"reward": 0.05161830596625805,
"reward_std": 0.06822534638922662,
"rewards/pure_accuracy_reward_math": 0.05161830474389717,
"step": 106
},
{
"clip_ratio": 0.0002751678786125922,
"epoch": 0.013384721101849363,
"grad_norm": 0.1858554631471634,
"kl": 0.0035070180892944336,
"learning_rate": 1.7038216560509555e-06,
"loss": 0.0067,
"step": 107
},
{
"clip_ratio": 0.0002901391828800115,
"epoch": 0.015296824116399271,
"grad_norm": 0.06319136172533035,
"kl": 0.0033702850341796875,
"learning_rate": 1.7197452229299363e-06,
"loss": 0.0067,
"step": 108
},
{
"clip_ratio": 0.00029408001091724145,
"epoch": 0.01720892713094918,
"grad_norm": 0.061827220022678375,
"kl": 0.00351715087890625,
"learning_rate": 1.7356687898089172e-06,
"loss": 0.0067,
"step": 109
},
{
"clip_ratio": 0.0002710100695253459,
"epoch": 0.01912103014549909,
"grad_norm": 0.13167870044708252,
"kl": 0.0036835670471191406,
"learning_rate": 1.7515923566878982e-06,
"loss": 0.0067,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 515.4391984939575,
"epoch": 0.021033133160048997,
"grad_norm": 0.034568388015031815,
"kl": 0.0024957656860351562,
"learning_rate": 1.767515923566879e-06,
"loss": 0.0068,
"num_tokens": 10824130.0,
"reward": 0.0468750023865141,
"reward_std": 0.06221334758447483,
"rewards/pure_accuracy_reward_math": 0.04687500116415322,
"step": 111
},
{
"clip_ratio": 0.00025272632768746917,
"epoch": 0.022945236174598906,
"grad_norm": 0.03421744704246521,
"kl": 0.002499222755432129,
"learning_rate": 1.78343949044586e-06,
"loss": 0.0068,
"step": 112
},
{
"clip_ratio": 0.00025192988658773174,
"epoch": 0.024857339189148814,
"grad_norm": 0.03444651514291763,
"kl": 0.002528548240661621,
"learning_rate": 1.7993630573248407e-06,
"loss": 0.0068,
"step": 113
},
{
"clip_ratio": 0.0002639102876287325,
"epoch": 0.026769442203698725,
"grad_norm": 0.033966146409511566,
"kl": 0.0025298595428466797,
"learning_rate": 1.8152866242038217e-06,
"loss": 0.0067,
"step": 114
},
{
"clip_ratio": 0.0002613060296994263,
"epoch": 0.028681545218248634,
"grad_norm": 0.03252725675702095,
"kl": 0.0025829076766967773,
"learning_rate": 1.8312101910828025e-06,
"loss": 0.0067,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 511.59126138687134,
"epoch": 0.030593648232798542,
"grad_norm": 0.04161737114191055,
"kl": 0.002777099609375,
"learning_rate": 1.8471337579617835e-06,
"loss": 0.0084,
"num_tokens": 14389817.0,
"reward": 0.04464285934227519,
"reward_std": 0.0631567623349838,
"rewards/pure_accuracy_reward_math": 0.044642858469160274,
"step": 116
},
{
"clip_ratio": 0.0002685248994680478,
"epoch": 0.032505751247348454,
"grad_norm": 0.03920653462409973,
"kl": 0.002690911293029785,
"learning_rate": 1.8630573248407643e-06,
"loss": 0.0084,
"step": 117
},
{
"clip_ratio": 0.00028247613772691693,
"epoch": 0.03441785426189836,
"grad_norm": 0.037915512919425964,
"kl": 0.0026444196701049805,
"learning_rate": 1.8789808917197455e-06,
"loss": 0.0084,
"step": 118
},
{
"clip_ratio": 0.00028578577973803476,
"epoch": 0.03632995727644827,
"grad_norm": 0.03727024793624878,
"kl": 0.002573251724243164,
"learning_rate": 1.8949044585987264e-06,
"loss": 0.0083,
"step": 119
},
{
"clip_ratio": 0.0003107314861381383,
"epoch": 0.03824206029099818,
"grad_norm": 0.03734543174505234,
"kl": 0.002534151077270508,
"learning_rate": 1.9108280254777074e-06,
"loss": 0.0083,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 511.96654319763184,
"epoch": 0.040154163305548086,
"grad_norm": 18.524425506591797,
"kl": 0.05213046073913574,
"learning_rate": 1.926751592356688e-06,
"loss": 0.0067,
"num_tokens": 17950273.0,
"reward": 0.044642859254963696,
"reward_std": 0.0572310917195864,
"rewards/pure_accuracy_reward_math": 0.04464285838184878,
"step": 121
},
{
"clip_ratio": 0.00024330438452579983,
"epoch": 0.042066266320097995,
"grad_norm": 0.06961806118488312,
"kl": 0.0025354623794555664,
"learning_rate": 1.942675159235669e-06,
"loss": 0.0047,
"step": 122
},
{
"clip_ratio": 0.00023799908234423128,
"epoch": 0.0439783693346479,
"grad_norm": 0.038592379540205,
"kl": 0.0024437904357910156,
"learning_rate": 1.95859872611465e-06,
"loss": 0.0047,
"step": 123
},
{
"clip_ratio": 0.00023513944393016573,
"epoch": 0.04589047234919781,
"grad_norm": 0.036785636097192764,
"kl": 0.002588033676147461,
"learning_rate": 1.974522292993631e-06,
"loss": 0.0047,
"step": 124
},
{
"clip_ratio": 0.0002449645085107477,
"epoch": 0.04780257536374772,
"grad_norm": 0.03537231311202049,
"kl": 0.002721548080444336,
"learning_rate": 1.9904458598726117e-06,
"loss": 0.0047,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 513.618884563446,
"epoch": 0.04971467837829763,
"grad_norm": 0.03746291249990463,
"kl": 0.0026862621307373047,
"learning_rate": 2.0063694267515925e-06,
"loss": 0.0063,
"num_tokens": 21522907.0,
"reward": 0.04492187732830644,
"reward_std": 0.061436392075847834,
"rewards/pure_accuracy_reward_math": 0.04492187616415322,
"step": 126
},
{
"clip_ratio": 0.0002821582585283977,
"epoch": 0.05162678139284754,
"grad_norm": 0.036032602190971375,
"kl": 0.0027321577072143555,
"learning_rate": 2.0222929936305737e-06,
"loss": 0.0063,
"step": 127
},
{
"clip_ratio": 0.0002675421079629814,
"epoch": 0.05353888440739745,
"grad_norm": 0.03723033517599106,
"kl": 0.002848386764526367,
"learning_rate": 2.0382165605095544e-06,
"loss": 0.0062,
"step": 128
},
{
"clip_ratio": 0.00030748845301786787,
"epoch": 0.05545098742194736,
"grad_norm": 0.03697400540113449,
"kl": 0.002881765365600586,
"learning_rate": 2.054140127388535e-06,
"loss": 0.0062,
"step": 129
},
{
"clip_ratio": 0.0003087153630758621,
"epoch": 0.05736309043649727,
"grad_norm": 0.03756724298000336,
"kl": 0.002836942672729492,
"learning_rate": 2.070063694267516e-06,
"loss": 0.0062,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 518.1615762710571,
"epoch": 0.059275193451047176,
"grad_norm": 0.039371710270643234,
"kl": 0.00270843505859375,
"learning_rate": 2.085987261146497e-06,
"loss": 0.0064,
"num_tokens": 25111362.0,
"reward": 0.05106027016881853,
"reward_std": 0.06736206263303757,
"rewards/pure_accuracy_reward_math": 0.051060269062872976,
"step": 131
},
{
"clip_ratio": 0.0002896036380661826,
"epoch": 0.061187296465597084,
"grad_norm": 0.03780793026089668,
"kl": 0.0027250051498413086,
"learning_rate": 2.101910828025478e-06,
"loss": 0.0064,
"step": 132
},
{
"clip_ratio": 0.0002853632216783808,
"epoch": 0.06309939948014699,
"grad_norm": 0.03720535710453987,
"kl": 0.0027070045471191406,
"learning_rate": 2.1178343949044587e-06,
"loss": 0.0064,
"step": 133
},
{
"clip_ratio": 0.0002896762144928289,
"epoch": 0.06501150249469691,
"grad_norm": 0.036468133330345154,
"kl": 0.0027469396591186523,
"learning_rate": 2.13375796178344e-06,
"loss": 0.0064,
"step": 134
},
{
"clip_ratio": 0.0003120482754184195,
"epoch": 0.06692360550924681,
"grad_norm": 0.03586801886558533,
"kl": 0.002748727798461914,
"learning_rate": 2.1496815286624207e-06,
"loss": 0.0063,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 528.0351796150208,
"epoch": 0.06883570852379672,
"grad_norm": 0.03092282824218273,
"kl": 0.002766728401184082,
"learning_rate": 2.1656050955414015e-06,
"loss": 0.0056,
"num_tokens": 28735680.0,
"reward": 0.04017857348662801,
"reward_std": 0.05289319949224591,
"rewards/pure_accuracy_reward_math": 0.040178572438890114,
"step": 136
},
{
"clip_ratio": 0.00020221989311153266,
"epoch": 0.07074781153834662,
"grad_norm": 0.030703941360116005,
"kl": 0.0028089284896850586,
"learning_rate": 2.1815286624203822e-06,
"loss": 0.0056,
"step": 137
},
{
"clip_ratio": 0.00019867721590571819,
"epoch": 0.07265991455289654,
"grad_norm": 0.030248478055000305,
"kl": 0.0027884244918823242,
"learning_rate": 2.1974522292993634e-06,
"loss": 0.0056,
"step": 138
},
{
"clip_ratio": 0.00021304549886735913,
"epoch": 0.07457201756744644,
"grad_norm": 0.029539138078689575,
"kl": 0.002767205238342285,
"learning_rate": 2.213375796178344e-06,
"loss": 0.0056,
"step": 139
},
{
"clip_ratio": 0.00021535260020755231,
"epoch": 0.07648412058199636,
"grad_norm": 0.02955791726708412,
"kl": 0.002725839614868164,
"learning_rate": 2.229299363057325e-06,
"loss": 0.0055,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 525.7212834358215,
"epoch": 0.07839622359654626,
"grad_norm": 0.060058850795030594,
"kl": 0.0032591819763183594,
"learning_rate": 2.245222929936306e-06,
"loss": 0.0071,
"num_tokens": 32349997.0,
"reward": 0.048828127211891115,
"reward_std": 0.056028691527899355,
"rewards/pure_accuracy_reward_math": 0.04882812628056854,
"step": 141
},
{
"clip_ratio": 0.00022036872547914754,
"epoch": 0.08030832661109617,
"grad_norm": 0.03533012047410011,
"kl": 0.002978205680847168,
"learning_rate": 2.261146496815287e-06,
"loss": 0.0071,
"step": 142
},
{
"clip_ratio": 0.0002158615123448726,
"epoch": 0.08222042962564609,
"grad_norm": 0.029908612370491028,
"kl": 0.002841353416442871,
"learning_rate": 2.2770700636942677e-06,
"loss": 0.0071,
"step": 143
},
{
"clip_ratio": 0.0002112481060976279,
"epoch": 0.08413253264019599,
"grad_norm": 0.028638474643230438,
"kl": 0.002796173095703125,
"learning_rate": 2.2929936305732485e-06,
"loss": 0.0071,
"step": 144
},
{
"clip_ratio": 0.00022246911356660348,
"epoch": 0.0860446356547459,
"grad_norm": 0.02828238159418106,
"kl": 0.0027240514755249023,
"learning_rate": 2.3089171974522297e-06,
"loss": 0.007,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 534.0318322181702,
"epoch": 0.0879567386692958,
"grad_norm": 3.060509443283081,
"kl": 0.022321224212646484,
"learning_rate": 2.3248407643312104e-06,
"loss": 0.0062,
"num_tokens": 35996663.0,
"reward": 0.04436384144355543,
"reward_std": 0.06130999844754115,
"rewards/pure_accuracy_reward_math": 0.04436384039581753,
"step": 146
},
{
"clip_ratio": 0.00023404289771633557,
"epoch": 0.08986884168384572,
"grad_norm": 0.28904739022254944,
"kl": 0.004893064498901367,
"learning_rate": 2.3407643312101912e-06,
"loss": 0.0055,
"step": 147
},
{
"clip_ratio": 0.00024259101735424338,
"epoch": 0.09178094469839562,
"grad_norm": 0.03826431185007095,
"kl": 0.0027625560760498047,
"learning_rate": 2.356687898089172e-06,
"loss": 0.0054,
"step": 148
},
{
"clip_ratio": 0.0002517821457672653,
"epoch": 0.09369304771294554,
"grad_norm": 0.03572425991296768,
"kl": 0.002875208854675293,
"learning_rate": 2.372611464968153e-06,
"loss": 0.0054,
"step": 149
},
{
"clip_ratio": 0.00024034848578935453,
"epoch": 0.09560515072749544,
"grad_norm": 0.036431849002838135,
"kl": 0.0031164884567260742,
"learning_rate": 2.388535031847134e-06,
"loss": 0.0054,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 538.2168254852295,
"epoch": 0.09751725374204535,
"grad_norm": 0.03362743556499481,
"kl": 0.002684354782104492,
"learning_rate": 2.4044585987261147e-06,
"loss": 0.0027,
"num_tokens": 39661060.0,
"reward": 0.05133928792201914,
"reward_std": 0.06672389659797773,
"rewards/pure_accuracy_reward_math": 0.05133928681607358,
"step": 151
},
{
"clip_ratio": 0.0002668876670099962,
"epoch": 0.09942935675659526,
"grad_norm": 0.033922772854566574,
"kl": 0.002791762351989746,
"learning_rate": 2.420382165605096e-06,
"loss": 0.0027,
"step": 152
},
{
"clip_ratio": 0.0002435101382616267,
"epoch": 0.10134145977114517,
"grad_norm": 0.03526493161916733,
"kl": 0.002907991409301758,
"learning_rate": 2.4363057324840767e-06,
"loss": 0.0027,
"step": 153
},
{
"clip_ratio": 0.00025345294346834635,
"epoch": 0.10325356278569509,
"grad_norm": 0.034125424921512604,
"kl": 0.0029108524322509766,
"learning_rate": 2.4522292993630575e-06,
"loss": 0.0027,
"step": 154
},
{
"clip_ratio": 0.0002378649581942227,
"epoch": 0.10516566580024499,
"grad_norm": 0.033436987549066544,
"kl": 0.002874612808227539,
"learning_rate": 2.4681528662420382e-06,
"loss": 0.0027,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 541.2424907684326,
"epoch": 0.1070777688147949,
"grad_norm": 0.031592246145009995,
"kl": 0.002785801887512207,
"learning_rate": 2.4840764331210194e-06,
"loss": 0.005,
"num_tokens": 43331425.0,
"reward": 0.044363841181620955,
"reward_std": 0.05607495462754741,
"rewards/pure_accuracy_reward_math": 0.044363840599544346,
"step": 156
},
{
"clip_ratio": 0.00019312051063025137,
"epoch": 0.1089898718293448,
"grad_norm": 0.030642936006188393,
"kl": 0.0027495622634887695,
"learning_rate": 2.5e-06,
"loss": 0.005,
"step": 157
},
{
"clip_ratio": 0.0002267159566713417,
"epoch": 0.11090197484389472,
"grad_norm": 0.03025418519973755,
"kl": 0.002672433853149414,
"learning_rate": 2.515923566878981e-06,
"loss": 0.0049,
"step": 158
},
{
"clip_ratio": 0.00023296605036193796,
"epoch": 0.11281407785844462,
"grad_norm": 0.03024701401591301,
"kl": 0.0026074647903442383,
"learning_rate": 2.531847133757962e-06,
"loss": 0.0049,
"step": 159
},
{
"clip_ratio": 0.00024551542321660236,
"epoch": 0.11472618087299453,
"grad_norm": 0.03065372072160244,
"kl": 0.0025725364685058594,
"learning_rate": 2.547770700636943e-06,
"loss": 0.0049,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 531.590705871582,
"epoch": 0.11663828388754444,
"grad_norm": 0.03286377340555191,
"kl": 0.002618551254272461,
"learning_rate": 2.5636942675159237e-06,
"loss": 0.0032,
"num_tokens": 46966882.0,
"reward": 0.0401785735739395,
"reward_std": 0.05864621384534985,
"rewards/pure_accuracy_reward_math": 0.04017857258440927,
"step": 161
},
{
"clip_ratio": 0.000249601399104904,
"epoch": 0.11855038690209435,
"grad_norm": 0.03168044239282608,
"kl": 0.0025817155838012695,
"learning_rate": 2.5796178343949045e-06,
"loss": 0.0032,
"step": 162
},
{
"clip_ratio": 0.0002426054838338132,
"epoch": 0.12046248991664425,
"grad_norm": 0.03161012753844261,
"kl": 0.0025763511657714844,
"learning_rate": 2.5955414012738857e-06,
"loss": 0.0032,
"step": 163
},
{
"clip_ratio": 0.0002400714004124893,
"epoch": 0.12237459293119417,
"grad_norm": 0.031408168375492096,
"kl": 0.002588987350463867,
"learning_rate": 2.6114649681528665e-06,
"loss": 0.0032,
"step": 164
},
{
"clip_ratio": 0.00024877328468164706,
"epoch": 0.12428669594574408,
"grad_norm": 0.030564049258828163,
"kl": 0.0026369094848632812,
"learning_rate": 2.6273885350318472e-06,
"loss": 0.0031,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 525.5862393379211,
"epoch": 0.12619879896029398,
"grad_norm": 0.03767310827970505,
"kl": 0.0026383399963378906,
"learning_rate": 2.6433121019108284e-06,
"loss": 0.0062,
"num_tokens": 50581511.0,
"reward": 0.04966518055880442,
"reward_std": 0.06985319027444348,
"rewards/pure_accuracy_reward_math": 0.04966517968568951,
"step": 166
},
{
"clip_ratio": 0.0002872111982696879,
"epoch": 0.1281109019748439,
"grad_norm": 0.03578091412782669,
"kl": 0.0027115345001220703,
"learning_rate": 2.659235668789809e-06,
"loss": 0.0062,
"step": 167
},
{
"clip_ratio": 0.0002957127134664006,
"epoch": 0.13002300498939381,
"grad_norm": 0.03471493721008301,
"kl": 0.0028066635131835938,
"learning_rate": 2.67515923566879e-06,
"loss": 0.0062,
"step": 168
},
{
"clip_ratio": 0.0003112256898702981,
"epoch": 0.1319351080039437,
"grad_norm": 0.035491716116666794,
"kl": 0.0028966665267944336,
"learning_rate": 2.6910828025477707e-06,
"loss": 0.0062,
"step": 169
},
{
"clip_ratio": 0.0003354581235726073,
"epoch": 0.13384721101849362,
"grad_norm": 0.03574714809656143,
"kl": 0.0029289722442626953,
"learning_rate": 2.707006369426752e-06,
"loss": 0.0061,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 527.8571667671204,
"epoch": 0.13575931403304353,
"grad_norm": 0.03648287057876587,
"kl": 0.0030307769775390625,
"learning_rate": 2.7229299363057327e-06,
"loss": 0.0061,
"num_tokens": 54209407.0,
"reward": 0.05161830587894656,
"reward_std": 0.06465821276651695,
"rewards/pure_accuracy_reward_math": 0.05161830494762398,
"step": 171
},
{
"clip_ratio": 0.0002587431810354701,
"epoch": 0.13767141704759345,
"grad_norm": 0.03615426644682884,
"kl": 0.0030341148376464844,
"learning_rate": 2.7388535031847135e-06,
"loss": 0.0061,
"step": 172
},
{
"clip_ratio": 0.0002548517101104153,
"epoch": 0.13958352006214333,
"grad_norm": 0.03565597161650658,
"kl": 0.002932310104370117,
"learning_rate": 2.7547770700636942e-06,
"loss": 0.0061,
"step": 173
},
{
"clip_ratio": 0.00027394448250106507,
"epoch": 0.14149562307669325,
"grad_norm": 0.035612594336271286,
"kl": 0.0029175281524658203,
"learning_rate": 2.7707006369426754e-06,
"loss": 0.0061,
"step": 174
},
{
"clip_ratio": 0.00027776476230201297,
"epoch": 0.14340772609124317,
"grad_norm": 0.036588992923498154,
"kl": 0.002942800521850586,
"learning_rate": 2.786624203821656e-06,
"loss": 0.006,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 529.2430500984192,
"epoch": 0.14531982910579308,
"grad_norm": 0.03312006592750549,
"kl": 0.0028772354125976562,
"learning_rate": 2.802547770700637e-06,
"loss": 0.0056,
"num_tokens": 57839070.0,
"reward": 0.04854910931317136,
"reward_std": 0.05881887051509693,
"rewards/pure_accuracy_reward_math": 0.048549108614679426,
"step": 176
},
{
"clip_ratio": 0.00022063881021949783,
"epoch": 0.147231932120343,
"grad_norm": 0.0327099934220314,
"kl": 0.002942681312561035,
"learning_rate": 2.818471337579618e-06,
"loss": 0.0056,
"step": 177
},
{
"clip_ratio": 0.00021944492368675128,
"epoch": 0.14914403513489288,
"grad_norm": 0.03261202201247215,
"kl": 0.002986431121826172,
"learning_rate": 2.834394904458599e-06,
"loss": 0.0056,
"step": 178
},
{
"clip_ratio": 0.0002127133307396889,
"epoch": 0.1510561381494428,
"grad_norm": 0.03220335766673088,
"kl": 0.002970457077026367,
"learning_rate": 2.8503184713375797e-06,
"loss": 0.0056,
"step": 179
},
{
"clip_ratio": 0.0001991192841614975,
"epoch": 0.1529682411639927,
"grad_norm": 0.03179548308253288,
"kl": 0.0029560327529907227,
"learning_rate": 2.8662420382165605e-06,
"loss": 0.0056,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 522.1275358200073,
"epoch": 0.15488034417854263,
"grad_norm": 0.030966561287641525,
"kl": 0.0029218196868896484,
"learning_rate": 2.8821656050955417e-06,
"loss": 0.0048,
"num_tokens": 61445599.0,
"reward": 0.04352678795112297,
"reward_std": 0.05598862626357004,
"rewards/pure_accuracy_reward_math": 0.043526787078008056,
"step": 181
},
{
"clip_ratio": 0.00021554413663693595,
"epoch": 0.15679244719309252,
"grad_norm": 0.030419446527957916,
"kl": 0.0029065608978271484,
"learning_rate": 2.8980891719745225e-06,
"loss": 0.0048,
"step": 182
},
{
"clip_ratio": 0.0002025423377176594,
"epoch": 0.15870455020764243,
"grad_norm": 0.030062729492783546,
"kl": 0.0028995275497436523,
"learning_rate": 2.9140127388535032e-06,
"loss": 0.0048,
"step": 183
},
{
"clip_ratio": 0.00023064417456453157,
"epoch": 0.16061665322219235,
"grad_norm": 0.029301613569259644,
"kl": 0.002888321876525879,
"learning_rate": 2.9299363057324844e-06,
"loss": 0.0048,
"step": 184
},
{
"clip_ratio": 0.0002338091023261768,
"epoch": 0.16252875623674226,
"grad_norm": 0.029127391055226326,
"kl": 0.0028772354125976562,
"learning_rate": 2.945859872611465e-06,
"loss": 0.0047,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 533.0510892868042,
"epoch": 0.16444085925129218,
"grad_norm": 0.036479271948337555,
"kl": 0.002923727035522461,
"learning_rate": 2.961783439490446e-06,
"loss": 0.0063,
"num_tokens": 65094142.0,
"reward": 0.05022321717115119,
"reward_std": 0.06538890500087291,
"rewards/pure_accuracy_reward_math": 0.050223215424921364,
"step": 186
},
{
"clip_ratio": 0.00026048495129771254,
"epoch": 0.16635296226584206,
"grad_norm": 0.036232370883226395,
"kl": 0.0029561519622802734,
"learning_rate": 2.9777070063694267e-06,
"loss": 0.0063,
"step": 187
},
{
"clip_ratio": 0.0002226464382033555,
"epoch": 0.16826506528039198,
"grad_norm": 0.03523917496204376,
"kl": 0.003048419952392578,
"learning_rate": 2.993630573248408e-06,
"loss": 0.0063,
"step": 188
},
{
"clip_ratio": 0.0002362887615845466,
"epoch": 0.1701771682949419,
"grad_norm": 0.03477315977215767,
"kl": 0.003025054931640625,
"learning_rate": 3.0095541401273887e-06,
"loss": 0.0062,
"step": 189
},
{
"clip_ratio": 0.00023160997727700305,
"epoch": 0.1720892713094918,
"grad_norm": 0.03342609107494354,
"kl": 0.0030221939086914062,
"learning_rate": 3.0254777070063695e-06,
"loss": 0.0062,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 529.7776494026184,
"epoch": 0.1740013743240417,
"grad_norm": 0.03668810427188873,
"kl": 0.0029752254486083984,
"learning_rate": 3.0414012738853503e-06,
"loss": 0.0066,
"num_tokens": 68728277.0,
"reward": 0.04994419863214716,
"reward_std": 0.06135626137256622,
"rewards/pure_accuracy_reward_math": 0.04994419787544757,
"step": 191
},
{
"clip_ratio": 0.0002391185845453947,
"epoch": 0.1759134773385916,
"grad_norm": 0.035618141293525696,
"kl": 0.0029642581939697266,
"learning_rate": 3.0573248407643314e-06,
"loss": 0.0066,
"step": 192
},
{
"clip_ratio": 0.00024402707180115613,
"epoch": 0.17782558035314153,
"grad_norm": 0.032588809728622437,
"kl": 0.002981424331665039,
"learning_rate": 3.0732484076433122e-06,
"loss": 0.0066,
"step": 193
},
{
"clip_ratio": 0.0002546731577126593,
"epoch": 0.17973768336769144,
"grad_norm": 0.0323190875351429,
"kl": 0.0030133724212646484,
"learning_rate": 3.089171974522293e-06,
"loss": 0.0066,
"step": 194
},
{
"clip_ratio": 0.0002784079450179888,
"epoch": 0.18164978638224133,
"grad_norm": 0.03181909769773483,
"kl": 0.002997159957885742,
"learning_rate": 3.105095541401274e-06,
"loss": 0.0065,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 536.7226796150208,
"epoch": 0.18356188939679124,
"grad_norm": 0.034835390746593475,
"kl": 0.003053426742553711,
"learning_rate": 3.121019108280255e-06,
"loss": 0.0053,
"num_tokens": 72383923.0,
"reward": 0.04352678789291531,
"reward_std": 0.06164911447558552,
"rewards/pure_accuracy_reward_math": 0.043526787078008056,
"step": 196
},
{
"clip_ratio": 0.00022759345233680506,
"epoch": 0.18547399241134116,
"grad_norm": 0.03316686674952507,
"kl": 0.003064870834350586,
"learning_rate": 3.1369426751592357e-06,
"loss": 0.0053,
"step": 197
},
{
"clip_ratio": 0.00024183520912401946,
"epoch": 0.18738609542589107,
"grad_norm": 0.0329214446246624,
"kl": 0.003040313720703125,
"learning_rate": 3.1528662420382165e-06,
"loss": 0.0053,
"step": 198
},
{
"clip_ratio": 0.0002539973459079192,
"epoch": 0.189298198440441,
"grad_norm": 0.031231405213475227,
"kl": 0.0030624866485595703,
"learning_rate": 3.1687898089171977e-06,
"loss": 0.0052,
"step": 199
},
{
"clip_ratio": 0.0002776768195076329,
"epoch": 0.19121030145499088,
"grad_norm": 0.031124714761972427,
"kl": 0.0030813217163085938,
"learning_rate": 3.1847133757961785e-06,
"loss": 0.0052,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 522.4048767089844,
"epoch": 0.1931224044695408,
"grad_norm": 0.03386811539530754,
"kl": 0.003122568130493164,
"learning_rate": 3.2006369426751592e-06,
"loss": 0.0052,
"num_tokens": 75984438.0,
"reward": 0.04771205616998486,
"reward_std": 0.06319682823959738,
"rewards/pure_accuracy_reward_math": 0.04771205471479334,
"step": 201
},
{
"clip_ratio": 0.00024403837670661233,
"epoch": 0.1950345074840907,
"grad_norm": 0.03252818062901497,
"kl": 0.003181934356689453,
"learning_rate": 3.2165605095541404e-06,
"loss": 0.0052,
"step": 202
},
{
"clip_ratio": 0.0002548924753114079,
"epoch": 0.19694661049864062,
"grad_norm": 0.03233063966035843,
"kl": 0.0032570362091064453,
"learning_rate": 3.232484076433121e-06,
"loss": 0.0052,
"step": 203
},
{
"clip_ratio": 0.0003048134046252926,
"epoch": 0.1988587135131905,
"grad_norm": 0.032457806169986725,
"kl": 0.0032837390899658203,
"learning_rate": 3.248407643312102e-06,
"loss": 0.0051,
"step": 204
},
{
"clip_ratio": 0.0003034327668842707,
"epoch": 0.20077081652774043,
"grad_norm": 0.03239855542778969,
"kl": 0.0032906532287597656,
"learning_rate": 3.2643312101910827e-06,
"loss": 0.0051,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 509.6657609939575,
"epoch": 0.20268291954229034,
"grad_norm": 0.0370325967669487,
"kl": 0.0033235549926757812,
"learning_rate": 3.280254777070064e-06,
"loss": 0.0075,
"num_tokens": 79548556.0,
"reward": 0.052176341792801395,
"reward_std": 0.06135626166360453,
"rewards/pure_accuracy_reward_math": 0.0521763407450635,
"step": 206
},
{
"clip_ratio": 0.00026798775621728055,
"epoch": 0.20459502255684026,
"grad_norm": 0.03616202250123024,
"kl": 0.0032608509063720703,
"learning_rate": 3.2961783439490447e-06,
"loss": 0.0075,
"step": 207
},
{
"clip_ratio": 0.0002652346859690624,
"epoch": 0.20650712557139017,
"grad_norm": 0.03537038713693619,
"kl": 0.0032129287719726562,
"learning_rate": 3.3121019108280255e-06,
"loss": 0.0074,
"step": 208
},
{
"clip_ratio": 0.00026950107780976396,
"epoch": 0.20841922858594006,
"grad_norm": 0.03502323478460312,
"kl": 0.0031485557556152344,
"learning_rate": 3.3280254777070063e-06,
"loss": 0.0074,
"step": 209
},
{
"clip_ratio": 0.00025725525091502277,
"epoch": 0.21033133160048997,
"grad_norm": 0.03380832076072693,
"kl": 0.0031027793884277344,
"learning_rate": 3.3439490445859875e-06,
"loss": 0.0074,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 511.36637449264526,
"epoch": 0.2122434346150399,
"grad_norm": 1.5961617231369019,
"kl": 0.007061004638671875,
"learning_rate": 3.3598726114649682e-06,
"loss": 0.0062,
"num_tokens": 83116585.0,
"reward": 0.05078125247382559,
"reward_std": 0.06568795558996499,
"rewards/pure_accuracy_reward_math": 0.05078125119325705,
"step": 211
},
{
"clip_ratio": 0.0002800602194383828,
"epoch": 0.2141555376295898,
"grad_norm": 0.04389820247888565,
"kl": 0.004379749298095703,
"learning_rate": 3.375796178343949e-06,
"loss": 0.0061,
"step": 212
},
{
"clip_ratio": 0.0002803218378630845,
"epoch": 0.2160676406441397,
"grad_norm": 0.04022788628935814,
"kl": 0.0043125152587890625,
"learning_rate": 3.39171974522293e-06,
"loss": 0.0061,
"step": 213
},
{
"clip_ratio": 0.0002704095267631601,
"epoch": 0.2179797436586896,
"grad_norm": 0.041697319597005844,
"kl": 0.004408597946166992,
"learning_rate": 3.407643312101911e-06,
"loss": 0.0061,
"step": 214
},
{
"clip_ratio": 0.0003097587871820906,
"epoch": 0.21989184667323952,
"grad_norm": 0.04933662340044975,
"kl": 0.004500150680541992,
"learning_rate": 3.4235668789808917e-06,
"loss": 0.006,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 520.8122463226318,
"epoch": 0.22180394968778944,
"grad_norm": 0.03384365886449814,
"kl": 0.0032858848571777344,
"learning_rate": 3.4394904458598725e-06,
"loss": 0.0069,
"num_tokens": 86710660.0,
"reward": 0.041015627270098776,
"reward_std": 0.05345123494043946,
"rewards/pure_accuracy_reward_math": 0.041015626047737896,
"step": 216
},
{
"clip_ratio": 0.00022953049290208583,
"epoch": 0.22371605270233935,
"grad_norm": 0.03259577602148056,
"kl": 0.003277301788330078,
"learning_rate": 3.4554140127388537e-06,
"loss": 0.0069,
"step": 217
},
{
"clip_ratio": 0.00024143920052210888,
"epoch": 0.22562815571688924,
"grad_norm": 0.031054330989718437,
"kl": 0.0031991004943847656,
"learning_rate": 3.4713375796178345e-06,
"loss": 0.0069,
"step": 218
},
{
"clip_ratio": 0.0002552373456978785,
"epoch": 0.22754025873143915,
"grad_norm": 0.031755171716213226,
"kl": 0.003099679946899414,
"learning_rate": 3.4872611464968152e-06,
"loss": 0.0069,
"step": 219
},
{
"clip_ratio": 0.0002681780064790473,
"epoch": 0.22945236174598907,
"grad_norm": 0.031188273802399635,
"kl": 0.003045320510864258,
"learning_rate": 3.5031847133757964e-06,
"loss": 0.0068,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 516.6825013160706,
"epoch": 0.23136446476053898,
"grad_norm": 0.03775335103273392,
"kl": 0.003011941909790039,
"learning_rate": 3.5191082802547772e-06,
"loss": 0.0063,
"num_tokens": 90291858.0,
"reward": 0.04715401996509172,
"reward_std": 0.06113734241807833,
"rewards/pure_accuracy_reward_math": 0.04715401915018447,
"step": 221
},
{
"clip_ratio": 0.0002582234144483664,
"epoch": 0.23327656777508887,
"grad_norm": 0.03602875769138336,
"kl": 0.002973794937133789,
"learning_rate": 3.535031847133758e-06,
"loss": 0.0063,
"step": 222
},
{
"clip_ratio": 0.0002264754746761355,
"epoch": 0.2351886707896388,
"grad_norm": 0.03449266403913498,
"kl": 0.002980470657348633,
"learning_rate": 3.5509554140127388e-06,
"loss": 0.0063,
"step": 223
},
{
"clip_ratio": 0.00025999376231311544,
"epoch": 0.2371007738041887,
"grad_norm": 0.0329199843108654,
"kl": 0.002971053123474121,
"learning_rate": 3.56687898089172e-06,
"loss": 0.0062,
"step": 224
},
{
"clip_ratio": 0.000296181439978227,
"epoch": 0.23901287681873862,
"grad_norm": 0.033409375697374344,
"kl": 0.0030214786529541016,
"learning_rate": 3.5828025477707007e-06,
"loss": 0.0062,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 530.4132494926453,
"epoch": 0.2409249798332885,
"grad_norm": 0.03549947962164879,
"kl": 0.004068970680236816,
"learning_rate": 3.5987261146496815e-06,
"loss": 0.0083,
"num_tokens": 93927655.0,
"reward": 0.039899555675219744,
"reward_std": 0.05890519870445132,
"rewards/pure_accuracy_reward_math": 0.03989955480210483,
"step": 226
},
{
"clip_ratio": 0.00024125495940552355,
"epoch": 0.24283708284783842,
"grad_norm": 0.033262889832258224,
"kl": 0.0040683746337890625,
"learning_rate": 3.6146496815286623e-06,
"loss": 0.0083,
"step": 227
},
{
"clip_ratio": 0.00024547909194438944,
"epoch": 0.24474918586238834,
"grad_norm": 0.03303634375333786,
"kl": 0.004040956497192383,
"learning_rate": 3.6305732484076435e-06,
"loss": 0.0083,
"step": 228
},
{
"clip_ratio": 0.0002773670349256463,
"epoch": 0.24666128887693825,
"grad_norm": 0.03389015421271324,
"kl": 0.00404667854309082,
"learning_rate": 3.6464968152866242e-06,
"loss": 0.0083,
"step": 229
},
{
"clip_ratio": 0.000270649900215858,
"epoch": 0.24857339189148817,
"grad_norm": 0.035877879709005356,
"kl": 0.0038802623748779297,
"learning_rate": 3.662420382165605e-06,
"loss": 0.0082,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 527.9338984489441,
"epoch": 0.25048549490603805,
"grad_norm": 0.032850634306669235,
"kl": 0.0030744075775146484,
"learning_rate": 3.678343949044586e-06,
"loss": 0.0064,
"num_tokens": 97554714.0,
"reward": 0.04743303795112297,
"reward_std": 0.061522720265202224,
"rewards/pure_accuracy_reward_math": 0.047433037019800395,
"step": 231
},
{
"clip_ratio": 0.00024459305313939694,
"epoch": 0.25239759792058797,
"grad_norm": 0.03185749799013138,
"kl": 0.00302886962890625,
"learning_rate": 3.694267515923567e-06,
"loss": 0.0064,
"step": 232
},
{
"clip_ratio": 0.00025332184179660544,
"epoch": 0.2543097009351379,
"grad_norm": 0.03135737404227257,
"kl": 0.002967357635498047,
"learning_rate": 3.7101910828025477e-06,
"loss": 0.0064,
"step": 233
},
{
"clip_ratio": 0.0002861271710798974,
"epoch": 0.2562218039496878,
"grad_norm": 0.030725885182619095,
"kl": 0.0029573440551757812,
"learning_rate": 3.7261146496815285e-06,
"loss": 0.0064,
"step": 234
},
{
"clip_ratio": 0.0002841630366674508,
"epoch": 0.2581339069642377,
"grad_norm": 0.030670415610074997,
"kl": 0.002954721450805664,
"learning_rate": 3.7420382165605097e-06,
"loss": 0.0063,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 531.4467325210571,
"epoch": 0.26004600997878763,
"grad_norm": 0.03534790128469467,
"kl": 0.003011465072631836,
"learning_rate": 3.757961783439491e-06,
"loss": 0.0041,
"num_tokens": 101193143.0,
"reward": 0.04631696638534777,
"reward_std": 0.0601075982558541,
"rewards/pure_accuracy_reward_math": 0.046316965454025194,
"step": 236
},
{
"clip_ratio": 0.00022260297603793333,
"epoch": 0.2619581129933375,
"grad_norm": 0.03438499942421913,
"kl": 0.0030508041381835938,
"learning_rate": 3.773885350318472e-06,
"loss": 0.0041,
"step": 237
},
{
"clip_ratio": 0.00024397839513312647,
"epoch": 0.2638702160078874,
"grad_norm": 0.032804593443870544,
"kl": 0.0030994415283203125,
"learning_rate": 3.789808917197453e-06,
"loss": 0.0041,
"step": 238
},
{
"clip_ratio": 0.0002508007286223801,
"epoch": 0.2657823190224373,
"grad_norm": 0.03402625024318695,
"kl": 0.0031244754791259766,
"learning_rate": 3.8057324840764336e-06,
"loss": 0.004,
"step": 239
},
{
"clip_ratio": 0.00025242620182552855,
"epoch": 0.26769442203698723,
"grad_norm": 0.03291900083422661,
"kl": 0.003187417984008789,
"learning_rate": 3.821656050955415e-06,
"loss": 0.004,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 536.9913763999939,
"epoch": 0.26960652505153715,
"grad_norm": 0.033690325915813446,
"kl": 0.003125429153442383,
"learning_rate": 3.837579617834396e-06,
"loss": 0.0089,
"num_tokens": 104860392.0,
"reward": 0.05496652069268748,
"reward_std": 0.07028483308386058,
"rewards/pure_accuracy_reward_math": 0.05496651929570362,
"step": 241
},
{
"clip_ratio": 0.0002661047830088137,
"epoch": 0.27151862806608706,
"grad_norm": 0.03227640688419342,
"kl": 0.0031244754791259766,
"learning_rate": 3.853503184713376e-06,
"loss": 0.009,
"step": 242
},
{
"clip_ratio": 0.00027503777869242185,
"epoch": 0.273430731080637,
"grad_norm": 0.03168897703289986,
"kl": 0.003157377243041992,
"learning_rate": 3.869426751592357e-06,
"loss": 0.0089,
"step": 243
},
{
"clip_ratio": 0.00029653536631712996,
"epoch": 0.2753428340951869,
"grad_norm": 0.03222280368208885,
"kl": 0.0031862258911132812,
"learning_rate": 3.885350318471338e-06,
"loss": 0.0089,
"step": 244
},
{
"clip_ratio": 0.0003081631187455969,
"epoch": 0.2772549371097368,
"grad_norm": 0.03176514804363251,
"kl": 0.0032341480255126953,
"learning_rate": 3.901273885350319e-06,
"loss": 0.0088,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 513.5616898536682,
"epoch": 0.27916704012428667,
"grad_norm": 0.037929438054561615,
"kl": 0.0035233497619628906,
"learning_rate": 3.9171974522293e-06,
"loss": 0.0075,
"num_tokens": 108427949.0,
"reward": 0.0544084852153901,
"reward_std": 0.0659469406818971,
"rewards/pure_accuracy_reward_math": 0.054408483527367935,
"step": 246
},
{
"clip_ratio": 0.0002633177949178389,
"epoch": 0.2810791431388366,
"grad_norm": 0.03561301901936531,
"kl": 0.0035467147827148438,
"learning_rate": 3.933121019108281e-06,
"loss": 0.0075,
"step": 247
},
{
"clip_ratio": 0.0003005996498472996,
"epoch": 0.2829912461533865,
"grad_norm": 0.035342708230018616,
"kl": 0.003578662872314453,
"learning_rate": 3.949044585987262e-06,
"loss": 0.0075,
"step": 248
},
{
"clip_ratio": 0.0003206986277177748,
"epoch": 0.2849033491679364,
"grad_norm": 0.03841444477438927,
"kl": 0.0036001205444335938,
"learning_rate": 3.964968152866243e-06,
"loss": 0.0075,
"step": 249
},
{
"clip_ratio": 0.00030761192169848073,
"epoch": 0.28681545218248633,
"grad_norm": 0.03515273705124855,
"kl": 0.003624439239501953,
"learning_rate": 3.980891719745223e-06,
"loss": 0.0074,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 504.73858308792114,
"epoch": 0.28872755519703625,
"grad_norm": 0.04030496999621391,
"kl": 0.003686189651489258,
"learning_rate": 3.996815286624204e-06,
"loss": 0.0081,
"num_tokens": 111975532.0,
"reward": 0.0647321458964143,
"reward_std": 0.07547981187235564,
"rewards/pure_accuracy_reward_math": 0.06473214420839213,
"step": 251
},
{
"clip_ratio": 0.00031485489739679906,
"epoch": 0.29063965821158616,
"grad_norm": 0.04058763012290001,
"kl": 0.003683328628540039,
"learning_rate": 4.012738853503185e-06,
"loss": 0.0081,
"step": 252
},
{
"clip_ratio": 0.0003329372994471669,
"epoch": 0.2925517612261361,
"grad_norm": 0.039948880672454834,
"kl": 0.003644227981567383,
"learning_rate": 4.0286624203821666e-06,
"loss": 0.0081,
"step": 253
},
{
"clip_ratio": 0.00031999613804600813,
"epoch": 0.294463864240686,
"grad_norm": 0.038771189749240875,
"kl": 0.003670930862426758,
"learning_rate": 4.044585987261147e-06,
"loss": 0.008,
"step": 254
},
{
"clip_ratio": 0.0003391868065136805,
"epoch": 0.29637596725523585,
"grad_norm": 0.03820183873176575,
"kl": 0.0036439895629882812,
"learning_rate": 4.060509554140128e-06,
"loss": 0.0079,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 507.980770111084,
"epoch": 0.29828807026978577,
"grad_norm": 0.0373733825981617,
"kl": 0.003556966781616211,
"learning_rate": 4.076433121019109e-06,
"loss": 0.0047,
"num_tokens": 115530899.0,
"reward": 0.05217634199652821,
"reward_std": 0.06624599173665047,
"rewards/pure_accuracy_reward_math": 0.05217634030850604,
"step": 256
},
{
"clip_ratio": 0.0002444871162765594,
"epoch": 0.3002001732843357,
"grad_norm": 0.03655192255973816,
"kl": 0.003623485565185547,
"learning_rate": 4.09235668789809e-06,
"loss": 0.0047,
"step": 257
},
{
"clip_ratio": 0.0002544127338524049,
"epoch": 0.3021122762988856,
"grad_norm": 0.035692181438207626,
"kl": 0.003640890121459961,
"learning_rate": 4.10828025477707e-06,
"loss": 0.0046,
"step": 258
},
{
"clip_ratio": 0.0002950017506577751,
"epoch": 0.3040243793134355,
"grad_norm": 0.03550735488533974,
"kl": 0.0036733150482177734,
"learning_rate": 4.124203821656051e-06,
"loss": 0.0046,
"step": 259
},
{
"clip_ratio": 0.0002894491571510116,
"epoch": 0.3059364823279854,
"grad_norm": 0.03471330925822258,
"kl": 0.00366973876953125,
"learning_rate": 4.140127388535032e-06,
"loss": 0.0045,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 517.2921543121338,
"epoch": 0.30784858534253534,
"grad_norm": 0.6632264852523804,
"kl": 0.007929325103759766,
"learning_rate": 4.156050955414014e-06,
"loss": 0.0041,
"num_tokens": 119123970.0,
"reward": 0.046875002153683454,
"reward_std": 0.06358220643596724,
"rewards/pure_accuracy_reward_math": 0.04687500122236088,
"step": 261
},
{
"clip_ratio": 0.00027907352409783925,
"epoch": 0.30976068835708526,
"grad_norm": 0.03735750913619995,
"kl": 0.0038709640502929688,
"learning_rate": 4.171974522292994e-06,
"loss": 0.004,
"step": 262
},
{
"clip_ratio": 0.000277261100677606,
"epoch": 0.31167279137163517,
"grad_norm": 0.03806532546877861,
"kl": 0.004002094268798828,
"learning_rate": 4.187898089171975e-06,
"loss": 0.004,
"step": 263
},
{
"clip_ratio": 0.00026404397090118437,
"epoch": 0.31358489438618503,
"grad_norm": 0.03587675094604492,
"kl": 0.00407719612121582,
"learning_rate": 4.203821656050956e-06,
"loss": 0.0039,
"step": 264
},
{
"clip_ratio": 0.0003132741497324787,
"epoch": 0.31549699740073495,
"grad_norm": 0.03516336902976036,
"kl": 0.004099607467651367,
"learning_rate": 4.219745222929937e-06,
"loss": 0.0039,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 528.596004486084,
"epoch": 0.31740910041528486,
"grad_norm": 0.038204919546842575,
"kl": 0.0035691261291503906,
"learning_rate": 4.2356687898089174e-06,
"loss": 0.006,
"num_tokens": 122758966.0,
"reward": 0.054966520925518125,
"reward_std": 0.06770737608894706,
"rewards/pure_accuracy_reward_math": 0.05496651912108064,
"step": 266
},
{
"clip_ratio": 0.00026713599251593223,
"epoch": 0.3193212034298348,
"grad_norm": 0.03804617002606392,
"kl": 0.003623485565185547,
"learning_rate": 4.251592356687898e-06,
"loss": 0.006,
"step": 267
},
{
"clip_ratio": 0.00027288361513910786,
"epoch": 0.3212333064443847,
"grad_norm": 0.03765474632382393,
"kl": 0.003659486770629883,
"learning_rate": 4.26751592356688e-06,
"loss": 0.006,
"step": 268
},
{
"clip_ratio": 0.0002754389876429286,
"epoch": 0.3231454094589346,
"grad_norm": 0.037356842309236526,
"kl": 0.0036840438842773438,
"learning_rate": 4.283439490445861e-06,
"loss": 0.0059,
"step": 269
},
{
"clip_ratio": 0.0002686067065269526,
"epoch": 0.3250575124734845,
"grad_norm": 0.03656876087188721,
"kl": 0.003694295883178711,
"learning_rate": 4.299363057324841e-06,
"loss": 0.0059,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 543.1746897697449,
"epoch": 0.32696961548803444,
"grad_norm": 0.03417838364839554,
"kl": 0.0035529136657714844,
"learning_rate": 4.315286624203822e-06,
"loss": 0.0076,
"num_tokens": 126443800.0,
"reward": 0.04882812697906047,
"reward_std": 0.05766273388871923,
"rewards/pure_accuracy_reward_math": 0.04882812616415322,
"step": 271
},
{
"clip_ratio": 0.0002270729566475893,
"epoch": 0.32888171850258435,
"grad_norm": 0.03328363224864006,
"kl": 0.0035278797149658203,
"learning_rate": 4.331210191082803e-06,
"loss": 0.0076,
"step": 272
},
{
"clip_ratio": 0.0002132950650661769,
"epoch": 0.3307938215171342,
"grad_norm": 0.03230879083275795,
"kl": 0.0034902095794677734,
"learning_rate": 4.347133757961784e-06,
"loss": 0.0076,
"step": 273
},
{
"clip_ratio": 0.0002096330554195447,
"epoch": 0.3327059245316841,
"grad_norm": 0.031601596623659134,
"kl": 0.003440380096435547,
"learning_rate": 4.3630573248407645e-06,
"loss": 0.0076,
"step": 274
},
{
"clip_ratio": 0.00027223577194490645,
"epoch": 0.33461802754623404,
"grad_norm": 0.033090248703956604,
"kl": 0.003412485122680664,
"learning_rate": 4.378980891719746e-06,
"loss": 0.0075,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 535.8942775726318,
"epoch": 0.33653013056078396,
"grad_norm": 0.03229549527168274,
"kl": 0.003350973129272461,
"learning_rate": 4.394904458598727e-06,
"loss": 0.0057,
"num_tokens": 130099677.0,
"reward": 0.04966518087894656,
"reward_std": 0.060705700190737844,
"rewards/pure_accuracy_reward_math": 0.049665179773001,
"step": 276
},
{
"clip_ratio": 0.00025271691475836633,
"epoch": 0.3384422335753339,
"grad_norm": 0.03214692696928978,
"kl": 0.0033435821533203125,
"learning_rate": 4.410828025477708e-06,
"loss": 0.0057,
"step": 277
},
{
"clip_ratio": 0.00023837689644778948,
"epoch": 0.3403543365898838,
"grad_norm": 0.03055053949356079,
"kl": 0.003403902053833008,
"learning_rate": 4.426751592356688e-06,
"loss": 0.0057,
"step": 278
},
{
"clip_ratio": 0.0002586998209039848,
"epoch": 0.3422664396044337,
"grad_norm": 0.030119990929961205,
"kl": 0.003477334976196289,
"learning_rate": 4.442675159235669e-06,
"loss": 0.0057,
"step": 279
},
{
"clip_ratio": 0.00026621688834893575,
"epoch": 0.3441785426189836,
"grad_norm": 0.030735207721590996,
"kl": 0.0035724639892578125,
"learning_rate": 4.45859872611465e-06,
"loss": 0.0056,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 542.7466769218445,
"epoch": 0.34609064563353353,
"grad_norm": 0.033374350517988205,
"kl": 0.003545999526977539,
"learning_rate": 4.474522292993631e-06,
"loss": 0.0036,
"num_tokens": 133773381.0,
"reward": 0.051339288300368935,
"reward_std": 0.06345581240020692,
"rewards/pure_accuracy_reward_math": 0.05133928690338507,
"step": 281
},
{
"clip_ratio": 0.0002734534241994879,
"epoch": 0.3480027486480834,
"grad_norm": 0.03312847390770912,
"kl": 0.0035567283630371094,
"learning_rate": 4.490445859872612e-06,
"loss": 0.0036,
"step": 282
},
{
"clip_ratio": 0.00022532319422907676,
"epoch": 0.3499148516626333,
"grad_norm": 0.03281605243682861,
"kl": 0.0035707950592041016,
"learning_rate": 4.506369426751593e-06,
"loss": 0.0035,
"step": 283
},
{
"clip_ratio": 0.0002544033526419298,
"epoch": 0.3518269546771832,
"grad_norm": 0.032299675047397614,
"kl": 0.003595113754272461,
"learning_rate": 4.522292993630574e-06,
"loss": 0.0035,
"step": 284
},
{
"clip_ratio": 0.00024219880805276262,
"epoch": 0.35373905769173314,
"grad_norm": 0.031959276646375656,
"kl": 0.0035622119903564453,
"learning_rate": 4.538216560509555e-06,
"loss": 0.0035,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 536.3122510910034,
"epoch": 0.35565116070628305,
"grad_norm": 0.035966720432043076,
"kl": 0.003755331039428711,
"learning_rate": 4.554140127388535e-06,
"loss": 0.0076,
"num_tokens": 137425032.0,
"reward": 0.05524553809664212,
"reward_std": 0.07191267621237785,
"rewards/pure_accuracy_reward_math": 0.055245536990696564,
"step": 286
},
{
"clip_ratio": 0.00029696975889237365,
"epoch": 0.35756326372083297,
"grad_norm": 0.03485076501965523,
"kl": 0.0036923885345458984,
"learning_rate": 4.570063694267516e-06,
"loss": 0.0076,
"step": 287
},
{
"clip_ratio": 0.0003252405772968814,
"epoch": 0.3594753667353829,
"grad_norm": 0.03465472534298897,
"kl": 0.003720998764038086,
"learning_rate": 4.585987261146497e-06,
"loss": 0.0076,
"step": 288
},
{
"clip_ratio": 0.0003269365803362234,
"epoch": 0.3613874697499328,
"grad_norm": 0.033384956419467926,
"kl": 0.003762483596801758,
"learning_rate": 4.601910828025479e-06,
"loss": 0.0075,
"step": 289
},
{
"clip_ratio": 0.0003269619904813226,
"epoch": 0.36329957276448266,
"grad_norm": 0.03343256562948227,
"kl": 0.0037889480590820312,
"learning_rate": 4.617834394904459e-06,
"loss": 0.0075,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 533.6155371665955,
"epoch": 0.3652116757790326,
"grad_norm": 0.035127099603414536,
"kl": 0.0037310123443603516,
"learning_rate": 4.63375796178344e-06,
"loss": 0.0084,
"num_tokens": 141070278.0,
"reward": 0.05580357421422377,
"reward_std": 0.06861072615720332,
"rewards/pure_accuracy_reward_math": 0.05580357281723991,
"step": 291
},
{
"clip_ratio": 0.00026876470258230256,
"epoch": 0.3671237787935825,
"grad_norm": 0.034193847328424454,
"kl": 0.0037539005279541016,
"learning_rate": 4.649681528662421e-06,
"loss": 0.0084,
"step": 292
},
{
"clip_ratio": 0.00024497293054537295,
"epoch": 0.3690358818081324,
"grad_norm": 0.033800724893808365,
"kl": 0.0037734508514404297,
"learning_rate": 4.665605095541402e-06,
"loss": 0.0084,
"step": 293
},
{
"clip_ratio": 0.0002538224067620831,
"epoch": 0.3709479848226823,
"grad_norm": 0.03376767784357071,
"kl": 0.003782033920288086,
"learning_rate": 4.6815286624203824e-06,
"loss": 0.0083,
"step": 294
},
{
"clip_ratio": 0.00027697558522277177,
"epoch": 0.37286008783723223,
"grad_norm": 0.03229675441980362,
"kl": 0.003787994384765625,
"learning_rate": 4.697452229299363e-06,
"loss": 0.0083,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 532.5739660263062,
"epoch": 0.37477219085178215,
"grad_norm": 0.035769619047641754,
"kl": 0.0037794113159179688,
"learning_rate": 4.713375796178344e-06,
"loss": 0.0057,
"num_tokens": 144715023.0,
"reward": 0.05915178812574595,
"reward_std": 0.07096926274243742,
"rewards/pure_accuracy_reward_math": 0.059151787019800395,
"step": 296
},
{
"clip_ratio": 0.00030428163654505624,
"epoch": 0.37668429386633207,
"grad_norm": 0.035648081451654434,
"kl": 0.003717660903930664,
"learning_rate": 4.729299363057326e-06,
"loss": 0.0057,
"step": 297
},
{
"clip_ratio": 0.00029741515106707084,
"epoch": 0.378596396880882,
"grad_norm": 0.03551783785223961,
"kl": 0.0036716461181640625,
"learning_rate": 4.745222929936306e-06,
"loss": 0.0057,
"step": 298
},
{
"clip_ratio": 0.0003008591765478741,
"epoch": 0.38050849989543184,
"grad_norm": 0.03452136367559433,
"kl": 0.0036542415618896484,
"learning_rate": 4.761146496815287e-06,
"loss": 0.0056,
"step": 299
},
{
"clip_ratio": 0.00032588979291858777,
"epoch": 0.38242060290998175,
"grad_norm": 0.03325437009334564,
"kl": 0.003694295883178711,
"learning_rate": 4.777070063694268e-06,
"loss": 0.0056,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 519.2416524887085,
"epoch": 0.38433270592453167,
"grad_norm": 0.04327908158302307,
"kl": 0.004815816879272461,
"learning_rate": 4.792993630573249e-06,
"loss": 0.0041,
"num_tokens": 148307505.0,
"reward": 0.05329241341678426,
"reward_std": 0.061954362492542714,
"rewards/pure_accuracy_reward_math": 0.0532924123108387,
"step": 301
},
{
"clip_ratio": 0.0002521659018839273,
"epoch": 0.3862448089390816,
"grad_norm": 0.041329506784677505,
"kl": 0.004758596420288086,
"learning_rate": 4.8089171974522295e-06,
"loss": 0.0041,
"step": 302
},
{
"clip_ratio": 0.0002661041191913682,
"epoch": 0.3881569119536315,
"grad_norm": 0.03914090245962143,
"kl": 0.0045318603515625,
"learning_rate": 4.82484076433121e-06,
"loss": 0.0041,
"step": 303
},
{
"clip_ratio": 0.0002647961523507547,
"epoch": 0.3900690149681814,
"grad_norm": 0.0363956093788147,
"kl": 0.0043642520904541016,
"learning_rate": 4.840764331210192e-06,
"loss": 0.004,
"step": 304
},
{
"clip_ratio": 0.00030025097066754824,
"epoch": 0.39198111798273133,
"grad_norm": 0.05623022839426994,
"kl": 0.00441288948059082,
"learning_rate": 4.856687898089173e-06,
"loss": 0.004,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 529.0496897697449,
"epoch": 0.39389322099728125,
"grad_norm": 0.03662995249032974,
"kl": 0.0038270950317382812,
"learning_rate": 4.872611464968153e-06,
"loss": 0.0077,
"num_tokens": 151936939.0,
"reward": 0.0560825914144516,
"reward_std": 0.061781705473549664,
"rewards/pure_accuracy_reward_math": 0.05608259071595967,
"step": 306
},
{
"clip_ratio": 0.00025576306325092446,
"epoch": 0.39580532401183116,
"grad_norm": 0.03553188219666481,
"kl": 0.00376129150390625,
"learning_rate": 4.888535031847134e-06,
"loss": 0.0076,
"step": 307
},
{
"clip_ratio": 0.00027371336784653977,
"epoch": 0.397717427026381,
"grad_norm": 0.035399794578552246,
"kl": 0.0036725997924804688,
"learning_rate": 4.904458598726115e-06,
"loss": 0.0076,
"step": 308
},
{
"clip_ratio": 0.0002955471370569285,
"epoch": 0.39962953004093094,
"grad_norm": 0.03487352281808853,
"kl": 0.003664731979370117,
"learning_rate": 4.920382165605096e-06,
"loss": 0.0076,
"step": 309
},
{
"clip_ratio": 0.00030850259520320833,
"epoch": 0.40154163305548085,
"grad_norm": 0.03433185815811157,
"kl": 0.003676176071166992,
"learning_rate": 4.9363057324840765e-06,
"loss": 0.0075,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 524.8312191963196,
"epoch": 0.40345373607003077,
"grad_norm": 0.03824182599782944,
"kl": 0.003762483596801758,
"learning_rate": 4.952229299363058e-06,
"loss": 0.0062,
"num_tokens": 155550782.0,
"reward": 0.05496652075089514,
"reward_std": 0.0689961050520651,
"rewards/pure_accuracy_reward_math": 0.0549665194703266,
"step": 311
},
{
"clip_ratio": 0.0002548059320588436,
"epoch": 0.4053658390845807,
"grad_norm": 0.036028265953063965,
"kl": 0.003760099411010742,
"learning_rate": 4.968152866242039e-06,
"loss": 0.0062,
"step": 312
},
{
"clip_ratio": 0.00029642158040132927,
"epoch": 0.4072779420991306,
"grad_norm": 0.03537724167108536,
"kl": 0.0038378238677978516,
"learning_rate": 4.98407643312102e-06,
"loss": 0.0062,
"step": 313
},
{
"clip_ratio": 0.00030970463706125884,
"epoch": 0.4091900451136805,
"grad_norm": 0.03521754965186119,
"kl": 0.003871440887451172,
"learning_rate": 5e-06,
"loss": 0.0062,
"step": 314
},
{
"clip_ratio": 0.000315766970174991,
"epoch": 0.4111021481282304,
"grad_norm": 0.034070126712322235,
"kl": 0.0037851333618164062,
"learning_rate": 4.999992129526286e-06,
"loss": 0.0061,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 528.3727917671204,
"epoch": 0.41301425114278034,
"grad_norm": 0.12440560013055801,
"kl": 0.005699872970581055,
"learning_rate": 4.999968518154701e-06,
"loss": 0.0041,
"num_tokens": 159174918.0,
"reward": 0.05050223457510583,
"reward_std": 0.06435916194459423,
"rewards/pure_accuracy_reward_math": 0.050502233527367935,
"step": 316
},
{
"clip_ratio": 0.0002532021657657424,
"epoch": 0.4149263541573302,
"grad_norm": 0.05440036952495575,
"kl": 0.005144357681274414,
"learning_rate": 4.99992916603391e-06,
"loss": 0.004,
"step": 317
},
{
"clip_ratio": 0.00025051761485883617,
"epoch": 0.4168384571718801,
"grad_norm": 0.051424141973257065,
"kl": 0.005103111267089844,
"learning_rate": 4.999874073411688e-06,
"loss": 0.004,
"step": 318
},
{
"clip_ratio": 0.0002561948363677402,
"epoch": 0.41875056018643003,
"grad_norm": 0.06930891424417496,
"kl": 0.004969120025634766,
"learning_rate": 4.9998032406349205e-06,
"loss": 0.0039,
"step": 319
},
{
"clip_ratio": 0.0002573228107394243,
"epoch": 0.42066266320097995,
"grad_norm": 0.06900722533464432,
"kl": 0.004853248596191406,
"learning_rate": 4.9997166681495975e-06,
"loss": 0.0039,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 517.6638069152832,
"epoch": 0.42257476621552986,
"grad_norm": 0.03829098492860794,
"kl": 0.0038361549377441406,
"learning_rate": 4.999614356500811e-06,
"loss": 0.0072,
"num_tokens": 162764497.0,
"reward": 0.06110491356230341,
"reward_std": 0.07393209857400507,
"rewards/pure_accuracy_reward_math": 0.06110491222352721,
"step": 321
},
{
"clip_ratio": 0.0002886460991931017,
"epoch": 0.4244868692300798,
"grad_norm": 0.03761793673038483,
"kl": 0.0038406848907470703,
"learning_rate": 4.999496306332755e-06,
"loss": 0.0072,
"step": 322
},
{
"clip_ratio": 0.00029219654425105546,
"epoch": 0.4263989722446297,
"grad_norm": 0.03714153915643692,
"kl": 0.003914356231689453,
"learning_rate": 4.999362518388718e-06,
"loss": 0.0071,
"step": 323
},
{
"clip_ratio": 0.0003099845329757045,
"epoch": 0.4283110752591796,
"grad_norm": 0.03610815480351448,
"kl": 0.0039288997650146484,
"learning_rate": 4.99921299351108e-06,
"loss": 0.0071,
"step": 324
},
{
"clip_ratio": 0.0003404705674370234,
"epoch": 0.4302231782737295,
"grad_norm": 0.03599926084280014,
"kl": 0.003935813903808594,
"learning_rate": 4.999047732641305e-06,
"loss": 0.007,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 510.4832811355591,
"epoch": 0.4321352812882794,
"grad_norm": 0.04078551381826401,
"kl": 0.003900766372680664,
"learning_rate": 4.998866736819938e-06,
"loss": 0.0063,
"num_tokens": 166324161.0,
"reward": 0.059151788242161274,
"reward_std": 0.07354671962093562,
"rewards/pure_accuracy_reward_math": 0.05915178725263104,
"step": 326
},
{
"clip_ratio": 0.00026936357801332633,
"epoch": 0.4340473843028293,
"grad_norm": 0.03855260834097862,
"kl": 0.003957986831665039,
"learning_rate": 4.998670007186599e-06,
"loss": 0.0063,
"step": 327
},
{
"clip_ratio": 0.0002843770836875592,
"epoch": 0.4359594873173792,
"grad_norm": 0.03724536672234535,
"kl": 0.0039751529693603516,
"learning_rate": 4.998457544979971e-06,
"loss": 0.0062,
"step": 328
},
{
"clip_ratio": 0.0003156123698886404,
"epoch": 0.43787159033192913,
"grad_norm": 0.03662634268403053,
"kl": 0.0040798187255859375,
"learning_rate": 4.998229351537797e-06,
"loss": 0.0062,
"step": 329
},
{
"clip_ratio": 0.0003457550078564964,
"epoch": 0.43978369334647904,
"grad_norm": 0.03598077967762947,
"kl": 0.004061460494995117,
"learning_rate": 4.997985428296869e-06,
"loss": 0.0061,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 528.4207811355591,
"epoch": 0.44169579636102896,
"grad_norm": 0.08678283542394638,
"kl": 0.008905410766601562,
"learning_rate": 4.997725776793021e-06,
"loss": 0.0058,
"num_tokens": 169950285.0,
"reward": 0.05636160948779434,
"reward_std": 0.07148723275167868,
"rewards/pure_accuracy_reward_math": 0.05636160867288709,
"step": 331
},
{
"clip_ratio": 0.00029096677934603576,
"epoch": 0.4436078993755789,
"grad_norm": 0.09512893110513687,
"kl": 0.007820606231689453,
"learning_rate": 4.997450398661117e-06,
"loss": 0.0058,
"step": 332
},
{
"clip_ratio": 0.00029938158724007735,
"epoch": 0.4455200023901288,
"grad_norm": 0.24316293001174927,
"kl": 0.007544517517089844,
"learning_rate": 4.9971592956350405e-06,
"loss": 0.0057,
"step": 333
},
{
"clip_ratio": 0.00032061134919558754,
"epoch": 0.4474321054046787,
"grad_norm": 0.07169396430253983,
"kl": 0.006528377532958984,
"learning_rate": 4.996852469547688e-06,
"loss": 0.0057,
"step": 334
},
{
"clip_ratio": 0.00034978831735088534,
"epoch": 0.44934420841922856,
"grad_norm": 0.06073050945997238,
"kl": 0.0060198307037353516,
"learning_rate": 4.996529922330954e-06,
"loss": 0.0056,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 535.8259167671204,
"epoch": 0.4512563114337785,
"grad_norm": 0.034031759947538376,
"kl": 0.0037636756896972656,
"learning_rate": 4.996191656015715e-06,
"loss": 0.0063,
"num_tokens": 173606605.0,
"reward": 0.05273437770665623,
"reward_std": 0.061655311612412333,
"rewards/pure_accuracy_reward_math": 0.05273437625146471,
"step": 336
},
{
"clip_ratio": 0.0002175188884052659,
"epoch": 0.4531684144483284,
"grad_norm": 0.03333257883787155,
"kl": 0.0038194656372070312,
"learning_rate": 4.995837672731827e-06,
"loss": 0.0063,
"step": 337
},
{
"clip_ratio": 0.00022021491247414815,
"epoch": 0.4550805174628783,
"grad_norm": 0.032678041607141495,
"kl": 0.0038101673126220703,
"learning_rate": 4.9954679747081e-06,
"loss": 0.0063,
"step": 338
},
{
"clip_ratio": 0.000264580338352971,
"epoch": 0.4569926204774282,
"grad_norm": 0.032030362635850906,
"kl": 0.0037910938262939453,
"learning_rate": 4.995082564272295e-06,
"loss": 0.0062,
"step": 339
},
{
"clip_ratio": 0.00027159255438391483,
"epoch": 0.45890472349197814,
"grad_norm": 0.031298909336328506,
"kl": 0.0038001537322998047,
"learning_rate": 4.994681443851102e-06,
"loss": 0.0062,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 527.6174931526184,
"epoch": 0.46081682650652805,
"grad_norm": 0.04015278443694115,
"kl": 0.004010200500488281,
"learning_rate": 4.994264615970126e-06,
"loss": 0.0062,
"num_tokens": 177226454.0,
"reward": 0.056361609895247966,
"reward_std": 0.06633232033345848,
"rewards/pure_accuracy_reward_math": 0.05636160867288709,
"step": 341
},
{
"clip_ratio": 0.00026669438159387937,
"epoch": 0.46272892952107797,
"grad_norm": 0.03813392296433449,
"kl": 0.0039997100830078125,
"learning_rate": 4.993832083253874e-06,
"loss": 0.0062,
"step": 342
},
{
"clip_ratio": 0.0003048689098363866,
"epoch": 0.46464103253562783,
"grad_norm": 0.03776548057794571,
"kl": 0.004065752029418945,
"learning_rate": 4.993383848425736e-06,
"loss": 0.0061,
"step": 343
},
{
"clip_ratio": 0.0003051352168768062,
"epoch": 0.46655313555017774,
"grad_norm": 0.03955227509140968,
"kl": 0.0041925907135009766,
"learning_rate": 4.992919914307969e-06,
"loss": 0.0061,
"step": 344
},
{
"clip_ratio": 0.00030118576887616655,
"epoch": 0.46846523856472766,
"grad_norm": 0.036648593842983246,
"kl": 0.00420832633972168,
"learning_rate": 4.992440283821676e-06,
"loss": 0.006,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 527.5131411552429,
"epoch": 0.4703773415792776,
"grad_norm": 13.381791114807129,
"kl": 0.1310877799987793,
"learning_rate": 4.991944959986793e-06,
"loss": 0.018,
"num_tokens": 180852413.0,
"reward": 0.06138393163564615,
"reward_std": 0.07144096971023828,
"rewards/pure_accuracy_reward_math": 0.061383930064039305,
"step": 346
},
{
"clip_ratio": 0.00030088673440786806,
"epoch": 0.4722894445938275,
"grad_norm": 1.359532356262207,
"kl": 0.01866316795349121,
"learning_rate": 4.991433945922068e-06,
"loss": 0.0135,
"step": 347
},
{
"clip_ratio": 0.0003527746957843192,
"epoch": 0.4742015476083774,
"grad_norm": 0.050763800740242004,
"kl": 0.005962371826171875,
"learning_rate": 4.9909072448450386e-06,
"loss": 0.013,
"step": 348
},
{
"clip_ratio": 0.0003426602560239189,
"epoch": 0.4761136506229273,
"grad_norm": 0.0476795993745327,
"kl": 0.006250858306884766,
"learning_rate": 4.990364860072014e-06,
"loss": 0.013,
"step": 349
},
{
"clip_ratio": 0.00033057811066328213,
"epoch": 0.47802575363747724,
"grad_norm": 0.04783082380890846,
"kl": 0.0066144466400146484,
"learning_rate": 4.989806795018054e-06,
"loss": 0.013,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 522.409900188446,
"epoch": 0.47993785665202715,
"grad_norm": 0.036505699157714844,
"kl": 0.0040128231048583984,
"learning_rate": 4.989233053196948e-06,
"loss": 0.0024,
"num_tokens": 184454394.0,
"reward": 0.04771205602446571,
"reward_std": 0.05920424917712808,
"rewards/pure_accuracy_reward_math": 0.047712054976727813,
"step": 351
},
{
"clip_ratio": 0.00023261837060317703,
"epoch": 0.481849959666577,
"grad_norm": 0.037214819341897964,
"kl": 0.004108428955078125,
"learning_rate": 4.988643638221193e-06,
"loss": 0.0024,
"step": 352
},
{
"clip_ratio": 0.0002573013600795093,
"epoch": 0.4837620626811269,
"grad_norm": 0.03702811896800995,
"kl": 0.004202127456665039,
"learning_rate": 4.9880385538019665e-06,
"loss": 0.0024,
"step": 353
},
{
"clip_ratio": 0.0002758479482167786,
"epoch": 0.48567416569567684,
"grad_norm": 0.03838437795639038,
"kl": 0.004250764846801758,
"learning_rate": 4.987417803749112e-06,
"loss": 0.0023,
"step": 354
},
{
"clip_ratio": 0.00024451872050690326,
"epoch": 0.48758626871022676,
"grad_norm": 0.035314518958330154,
"kl": 0.00424647331237793,
"learning_rate": 4.986781391971105e-06,
"loss": 0.0023,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 522.8789310455322,
"epoch": 0.48949837172477667,
"grad_norm": 0.038822874426841736,
"kl": 0.004703998565673828,
"learning_rate": 4.986129322475037e-06,
"loss": 0.006,
"num_tokens": 188061244.0,
"reward": 0.05887277075089514,
"reward_std": 0.0715272988891229,
"rewards/pure_accuracy_reward_math": 0.058872769062872976,
"step": 356
},
{
"clip_ratio": 0.0003040988601696881,
"epoch": 0.4914104747393266,
"grad_norm": 0.03750370442867279,
"kl": 0.004604816436767578,
"learning_rate": 4.985461599366583e-06,
"loss": 0.006,
"step": 357
},
{
"clip_ratio": 0.0003311016299676339,
"epoch": 0.4933225777538765,
"grad_norm": 0.03735021874308586,
"kl": 0.004613637924194336,
"learning_rate": 4.984778226849983e-06,
"loss": 0.0059,
"step": 358
},
{
"clip_ratio": 0.00031427563314423423,
"epoch": 0.4952346807684264,
"grad_norm": 0.037090424448251724,
"kl": 0.00463104248046875,
"learning_rate": 4.984079209228007e-06,
"loss": 0.0059,
"step": 359
},
{
"clip_ratio": 0.0003153682554284387,
"epoch": 0.49714678378297633,
"grad_norm": 0.03496375307440758,
"kl": 0.004604816436767578,
"learning_rate": 4.983364550901936e-06,
"loss": 0.0058,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 523.5016980171204,
"epoch": 0.4990588867975262,
"grad_norm": 1978.1619873046875,
"kl": 5.663617134094238,
"learning_rate": 4.982634256371529e-06,
"loss": 0.2313,
"num_tokens": 191670522.0,
"reward": 0.05943080599536188,
"reward_std": 0.06242607004242018,
"rewards/pure_accuracy_reward_math": 0.059430805064039305,
"step": 361
},
{
"clip_ratio": 0.0003008291907349303,
"epoch": 0.5009709898120761,
"grad_norm": 6.705481052398682,
"kl": 0.07292413711547852,
"learning_rate": 4.981888330234998e-06,
"loss": 0.0076,
"step": 362
},
{
"clip_ratio": 0.00038137949604788446,
"epoch": 0.502883092826626,
"grad_norm": 0.4056338369846344,
"kl": 0.013193130493164062,
"learning_rate": 4.981126777188976e-06,
"loss": 0.0053,
"step": 363
},
{
"clip_ratio": 0.00039371675529764616,
"epoch": 0.5047951958411759,
"grad_norm": 0.40032151341438293,
"kl": 0.009969472885131836,
"learning_rate": 4.980349602028489e-06,
"loss": 0.0052,
"step": 364
},
{
"clip_ratio": 0.0003270253398568457,
"epoch": 0.5067072988557259,
"grad_norm": 0.08224909007549286,
"kl": 0.010345458984375,
"learning_rate": 4.979556809646928e-06,
"loss": 0.0051,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 534.6082878112793,
"epoch": 0.5086194018702758,
"grad_norm": 0.036373648792505264,
"kl": 0.003941535949707031,
"learning_rate": 4.978748405036014e-06,
"loss": 0.0071,
"num_tokens": 195317270.0,
"reward": 0.05552455584984273,
"reward_std": 0.06775363947963342,
"rewards/pure_accuracy_reward_math": 0.05552455486031249,
"step": 366
},
{
"clip_ratio": 0.00027453447256675645,
"epoch": 0.5105315048848257,
"grad_norm": 0.03525104746222496,
"kl": 0.0039365291595458984,
"learning_rate": 4.977924393285767e-06,
"loss": 0.0072,
"step": 367
},
{
"clip_ratio": 0.0003015769660521528,
"epoch": 0.5124436078993756,
"grad_norm": 0.03737647458910942,
"kl": 0.0039522647857666016,
"learning_rate": 4.977084779584479e-06,
"loss": 0.0071,
"step": 368
},
{
"clip_ratio": 0.0002889172319555655,
"epoch": 0.5143557109139255,
"grad_norm": 0.03506501764059067,
"kl": 0.0039052963256835938,
"learning_rate": 4.976229569218676e-06,
"loss": 0.0071,
"step": 369
},
{
"clip_ratio": 0.0002910121094146234,
"epoch": 0.5162678139284754,
"grad_norm": 0.03558839485049248,
"kl": 0.003898143768310547,
"learning_rate": 4.975358767573085e-06,
"loss": 0.007,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 523.1417660713196,
"epoch": 0.5181799169430253,
"grad_norm": 9.403284072875977,
"kl": 0.0705575942993164,
"learning_rate": 4.974472380130605e-06,
"loss": 0.0078,
"num_tokens": 198926094.0,
"reward": 0.06305803885334171,
"reward_std": 0.0737193762906827,
"rewards/pure_accuracy_reward_math": 0.06305803733994253,
"step": 371
},
{
"clip_ratio": 0.00028168898450076085,
"epoch": 0.5200920199575753,
"grad_norm": 0.10174906253814697,
"kl": 0.005540609359741211,
"learning_rate": 4.9735704124722665e-06,
"loss": 0.0053,
"step": 372
},
{
"clip_ratio": 0.00026055807722968893,
"epoch": 0.5220041229721252,
"grad_norm": 0.036394841969013214,
"kl": 0.004784584045410156,
"learning_rate": 4.9726528702771985e-06,
"loss": 0.0052,
"step": 373
},
{
"clip_ratio": 0.0003154287535949152,
"epoch": 0.523916225986675,
"grad_norm": 0.03702308237552643,
"kl": 0.004788875579833984,
"learning_rate": 4.971719759322596e-06,
"loss": 0.0052,
"step": 374
},
{
"clip_ratio": 0.000301387064496339,
"epoch": 0.5258283290012249,
"grad_norm": 0.03516030311584473,
"kl": 0.004770994186401367,
"learning_rate": 4.97077108548368e-06,
"loss": 0.0051,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 520.7994132041931,
"epoch": 0.5277404320157748,
"grad_norm": 0.04183080792427063,
"kl": 0.006031513214111328,
"learning_rate": 4.969806854733658e-06,
"loss": 0.0091,
"num_tokens": 202522419.0,
"reward": 0.0638950924621895,
"reward_std": 0.07990403153235093,
"rewards/pure_accuracy_reward_math": 0.0638950903667137,
"step": 376
},
{
"clip_ratio": 0.00032519385399609746,
"epoch": 0.5296525350303247,
"grad_norm": 0.0407201424241066,
"kl": 0.005979061126708984,
"learning_rate": 4.968827073143694e-06,
"loss": 0.0091,
"step": 377
},
{
"clip_ratio": 0.00031682528469900717,
"epoch": 0.5315646380448746,
"grad_norm": 0.040043942630290985,
"kl": 0.005922555923461914,
"learning_rate": 4.967831746882863e-06,
"loss": 0.0091,
"step": 378
},
{
"clip_ratio": 0.00033513708405052967,
"epoch": 0.5334767410594246,
"grad_norm": 0.03983679041266441,
"kl": 0.005841970443725586,
"learning_rate": 4.966820882218118e-06,
"loss": 0.009,
"step": 379
},
{
"clip_ratio": 0.00034104771594911654,
"epoch": 0.5353888440739745,
"grad_norm": 0.03983955457806587,
"kl": 0.005755186080932617,
"learning_rate": 4.965794485514245e-06,
"loss": 0.0089,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 520.5067186355591,
"epoch": 0.5373009470885244,
"grad_norm": 0.034092146903276443,
"kl": 0.0043926239013671875,
"learning_rate": 4.964752563233826e-06,
"loss": 0.008,
"num_tokens": 206122403.0,
"reward": 0.055803573748562485,
"reward_std": 0.05980854749213904,
"rewards/pure_accuracy_reward_math": 0.05580357275903225,
"step": 381
},
{
"clip_ratio": 0.00025422318708478997,
"epoch": 0.5392130501030743,
"grad_norm": 0.03263320028781891,
"kl": 0.0043218135833740234,
"learning_rate": 4.9636951219372e-06,
"loss": 0.008,
"step": 382
},
{
"clip_ratio": 0.00025885856206286917,
"epoch": 0.5411251531176242,
"grad_norm": 0.032487623393535614,
"kl": 0.004242420196533203,
"learning_rate": 4.962622168282416e-06,
"loss": 0.008,
"step": 383
},
{
"clip_ratio": 0.0002850476581102157,
"epoch": 0.5430372561321741,
"grad_norm": 0.032427769154310226,
"kl": 0.004185199737548828,
"learning_rate": 4.961533709025199e-06,
"loss": 0.0079,
"step": 384
},
{
"clip_ratio": 0.00029774147623129466,
"epoch": 0.544949359146724,
"grad_norm": 0.031092027202248573,
"kl": 0.004144430160522461,
"learning_rate": 4.960429751018901e-06,
"loss": 0.0079,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 522.9258050918579,
"epoch": 0.546861462161274,
"grad_norm": 0.6398438811302185,
"kl": 0.013398170471191406,
"learning_rate": 4.959310301214458e-06,
"loss": 0.0048,
"num_tokens": 209727833.0,
"reward": 0.06668527127476409,
"reward_std": 0.07586519059259444,
"rewards/pure_accuracy_reward_math": 0.06668526941211894,
"step": 386
},
{
"clip_ratio": 0.0002956847454242961,
"epoch": 0.5487735651758239,
"grad_norm": 0.09603609144687653,
"kl": 0.006535530090332031,
"learning_rate": 4.958175366660352e-06,
"loss": 0.0045,
"step": 387
},
{
"clip_ratio": 0.00032585520455086225,
"epoch": 0.5506856681903738,
"grad_norm": 0.042251698672771454,
"kl": 0.004881858825683594,
"learning_rate": 4.95702495450256e-06,
"loss": 0.0045,
"step": 388
},
{
"clip_ratio": 0.00030688931195754776,
"epoch": 0.5525977712049237,
"grad_norm": 0.03725959733128548,
"kl": 0.00462651252746582,
"learning_rate": 4.955859071984512e-06,
"loss": 0.0044,
"step": 389
},
{
"clip_ratio": 0.0002833517196449975,
"epoch": 0.5545098742194736,
"grad_norm": 0.03557269275188446,
"kl": 0.004591941833496094,
"learning_rate": 4.954677726447049e-06,
"loss": 0.0044,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 529.50141954422,
"epoch": 0.5564219772340235,
"grad_norm": 0.03767434135079384,
"kl": 0.0041730403900146484,
"learning_rate": 4.953480925328369e-06,
"loss": 0.0053,
"num_tokens": 213359594.0,
"reward": 0.05636160998255946,
"reward_std": 0.06873711966909468,
"rewards/pure_accuracy_reward_math": 0.05636160829453729,
"step": 391
},
{
"clip_ratio": 0.0002943199858691514,
"epoch": 0.5583340802485733,
"grad_norm": 0.03691519424319267,
"kl": 0.004199981689453125,
"learning_rate": 4.952268676163984e-06,
"loss": 0.0053,
"step": 392
},
{
"clip_ratio": 0.00028674039270981666,
"epoch": 0.5602461832631233,
"grad_norm": 0.036044176667928696,
"kl": 0.004216432571411133,
"learning_rate": 4.951040986586676e-06,
"loss": 0.0053,
"step": 393
},
{
"clip_ratio": 0.0003071572371595721,
"epoch": 0.5621582862776732,
"grad_norm": 0.0358373187482357,
"kl": 0.004226207733154297,
"learning_rate": 4.949797864326442e-06,
"loss": 0.0053,
"step": 394
},
{
"clip_ratio": 0.000308680556543095,
"epoch": 0.5640703892922231,
"grad_norm": 0.0356404110789299,
"kl": 0.004263877868652344,
"learning_rate": 4.9485393172104525e-06,
"loss": 0.0052,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 528.1506924629211,
"epoch": 0.565982492306773,
"grad_norm": 0.03425108641386032,
"kl": 0.004232645034790039,
"learning_rate": 4.947265353162997e-06,
"loss": 0.0047,
"num_tokens": 216984490.0,
"reward": 0.05831473466241732,
"reward_std": 0.06912249873857945,
"rewards/pure_accuracy_reward_math": 0.058314733556471765,
"step": 396
},
{
"clip_ratio": 0.0002443079777663115,
"epoch": 0.5678945953213229,
"grad_norm": 0.03406741842627525,
"kl": 0.004246950149536133,
"learning_rate": 4.945975980205435e-06,
"loss": 0.0046,
"step": 397
},
{
"clip_ratio": 0.00025582832455484095,
"epoch": 0.5698066983358728,
"grad_norm": 0.033892109990119934,
"kl": 0.004239320755004883,
"learning_rate": 4.944671206456148e-06,
"loss": 0.0046,
"step": 398
},
{
"clip_ratio": 0.0002801110364885062,
"epoch": 0.5717188013504227,
"grad_norm": 0.03294463828206062,
"kl": 0.0042018890380859375,
"learning_rate": 4.943351040130485e-06,
"loss": 0.0046,
"step": 399
},
{
"clip_ratio": 0.00030015600407296006,
"epoch": 0.5736309043649727,
"grad_norm": 0.03228214010596275,
"kl": 0.004125118255615234,
"learning_rate": 4.942015489540715e-06,
"loss": 0.0045,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 527.8225684165955,
"epoch": 0.5755430073795226,
"grad_norm": 0.037567272782325745,
"kl": 0.005152702331542969,
"learning_rate": 4.94066456309597e-06,
"loss": 0.0071,
"num_tokens": 220604938.0,
"reward": 0.06166294886497781,
"reward_std": 0.07311507751001045,
"rewards/pure_accuracy_reward_math": 0.06166294764261693,
"step": 401
},
{
"clip_ratio": 0.0002694410874823916,
"epoch": 0.5774551103940725,
"grad_norm": 0.036373041570186615,
"kl": 0.005210161209106445,
"learning_rate": 4.939298269302194e-06,
"loss": 0.0071,
"step": 402
},
{
"clip_ratio": 0.0002891406058438406,
"epoch": 0.5793672134086224,
"grad_norm": 0.03582580015063286,
"kl": 0.0052187442779541016,
"learning_rate": 4.9379166167620915e-06,
"loss": 0.007,
"step": 403
},
{
"clip_ratio": 0.00030127688086167836,
"epoch": 0.5812793164231723,
"grad_norm": 0.035248763859272,
"kl": 0.005229949951171875,
"learning_rate": 4.93651961417507e-06,
"loss": 0.007,
"step": 404
},
{
"clip_ratio": 0.00031262176707969047,
"epoch": 0.5831914194377222,
"grad_norm": 0.03461577743291855,
"kl": 0.00519251823425293,
"learning_rate": 4.9351072703371885e-06,
"loss": 0.0069,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 534.0067219734192,
"epoch": 0.5851035224522722,
"grad_norm": 0.0363302007317543,
"kl": 0.004278659820556641,
"learning_rate": 4.933679594141096e-06,
"loss": 0.0041,
"num_tokens": 224253906.0,
"reward": 0.06222098533180542,
"reward_std": 0.07462272536940873,
"rewards/pure_accuracy_reward_math": 0.06222098329453729,
"step": 406
},
{
"clip_ratio": 0.0002887690876320903,
"epoch": 0.5870156254668221,
"grad_norm": 0.03538454696536064,
"kl": 0.004297971725463867,
"learning_rate": 4.932236594575986e-06,
"loss": 0.0041,
"step": 407
},
{
"clip_ratio": 0.00029836769689950415,
"epoch": 0.588927728481372,
"grad_norm": 0.03521309420466423,
"kl": 0.004305362701416016,
"learning_rate": 4.9307782807275304e-06,
"loss": 0.0041,
"step": 408
},
{
"clip_ratio": 0.0003077857980144927,
"epoch": 0.5908398314959219,
"grad_norm": 0.03468110039830208,
"kl": 0.004298210144042969,
"learning_rate": 4.929304661777823e-06,
"loss": 0.0041,
"step": 409
},
{
"clip_ratio": 0.00030735837987094783,
"epoch": 0.5927519345104717,
"grad_norm": 0.03504593297839165,
"kl": 0.004282474517822266,
"learning_rate": 4.9278157470053305e-06,
"loss": 0.004,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 531.0987973213196,
"epoch": 0.5946640375250216,
"grad_norm": 0.03893313929438591,
"kl": 0.004411935806274414,
"learning_rate": 4.926311545784823e-06,
"loss": 0.0081,
"num_tokens": 227887088.0,
"reward": 0.06138393160654232,
"reward_std": 0.07560620526783168,
"rewards/pure_accuracy_reward_math": 0.061383930034935474,
"step": 411
},
{
"clip_ratio": 0.0003015478255292692,
"epoch": 0.5965761405395715,
"grad_norm": 0.03745520859956741,
"kl": 0.004415750503540039,
"learning_rate": 4.924792067587321e-06,
"loss": 0.0081,
"step": 412
},
{
"clip_ratio": 0.00033068407248038056,
"epoch": 0.5984882435541214,
"grad_norm": 0.037219781428575516,
"kl": 0.004396915435791016,
"learning_rate": 4.923257321980036e-06,
"loss": 0.0081,
"step": 413
},
{
"clip_ratio": 0.00037280973344877566,
"epoch": 0.6004003465686714,
"grad_norm": 0.03754372149705887,
"kl": 0.0044384002685546875,
"learning_rate": 4.9217073186263075e-06,
"loss": 0.0081,
"step": 414
},
{
"clip_ratio": 0.0003646712993372603,
"epoch": 0.6023124495832213,
"grad_norm": 0.03602118790149689,
"kl": 0.004477262496948242,
"learning_rate": 4.920142067285544e-06,
"loss": 0.008,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 508.44282722473145,
"epoch": 0.6042245525977712,
"grad_norm": 0.039943527430295944,
"kl": 0.004469871520996094,
"learning_rate": 4.9185615778131614e-06,
"loss": 0.0078,
"num_tokens": 231443183.0,
"reward": 0.0705915211874526,
"reward_std": 0.07968511217040941,
"rewards/pure_accuracy_reward_math": 0.07059151926659979,
"step": 416
},
{
"clip_ratio": 0.00031770144798315414,
"epoch": 0.6061366556123211,
"grad_norm": 0.039055656641721725,
"kl": 0.004549264907836914,
"learning_rate": 4.916965860160521e-06,
"loss": 0.0078,
"step": 417
},
{
"clip_ratio": 0.00030108455553090607,
"epoch": 0.608048758626871,
"grad_norm": 0.03719799593091011,
"kl": 0.004551410675048828,
"learning_rate": 4.915354924374864e-06,
"loss": 0.0078,
"step": 418
},
{
"clip_ratio": 0.0003208976940527464,
"epoch": 0.6099608616414209,
"grad_norm": 0.03626833111047745,
"kl": 0.004576444625854492,
"learning_rate": 4.913728780599254e-06,
"loss": 0.0077,
"step": 419
},
{
"clip_ratio": 0.00030395733068644404,
"epoch": 0.6118729646559709,
"grad_norm": 0.035672470927238464,
"kl": 0.004616498947143555,
"learning_rate": 4.912087439072508e-06,
"loss": 0.0077,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 519.3401436805725,
"epoch": 0.6137850676705208,
"grad_norm": 0.035979609936475754,
"kl": 0.004936695098876953,
"learning_rate": 4.9104309101291345e-06,
"loss": 0.008,
"num_tokens": 235040570.0,
"reward": 0.0558035739522893,
"reward_std": 0.06414644059259444,
"rewards/pure_accuracy_reward_math": 0.05580357278813608,
"step": 421
},
{
"clip_ratio": 0.0002606460908509689,
"epoch": 0.6156971706850707,
"grad_norm": 0.034824173897504807,
"kl": 0.004873991012573242,
"learning_rate": 4.908759204199268e-06,
"loss": 0.008,
"step": 422
},
{
"clip_ratio": 0.0002711625579081556,
"epoch": 0.6176092736996206,
"grad_norm": 0.034011878073215485,
"kl": 0.00480341911315918,
"learning_rate": 4.907072331808602e-06,
"loss": 0.008,
"step": 423
},
{
"clip_ratio": 0.0002719364555332504,
"epoch": 0.6195213767141705,
"grad_norm": 0.0330798402428627,
"kl": 0.00470733642578125,
"learning_rate": 4.905370303578324e-06,
"loss": 0.0079,
"step": 424
},
{
"clip_ratio": 0.0003164075427548596,
"epoch": 0.6214334797287204,
"grad_norm": 0.03356935828924179,
"kl": 0.004645586013793945,
"learning_rate": 4.903653130225049e-06,
"loss": 0.0079,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 524.4051547050476,
"epoch": 0.6233455827432703,
"grad_norm": 0.037987031042575836,
"kl": 0.004395723342895508,
"learning_rate": 4.901920822560753e-06,
"loss": 0.004,
"num_tokens": 238650146.0,
"reward": 0.056082592491293326,
"reward_std": 0.06946781190345064,
"rewards/pure_accuracy_reward_math": 0.05608259033760987,
"step": 426
},
{
"clip_ratio": 0.0002752577877913609,
"epoch": 0.6252576857578201,
"grad_norm": 0.03711739555001259,
"kl": 0.0043413639068603516,
"learning_rate": 4.900173391492698e-06,
"loss": 0.004,
"step": 427
},
{
"clip_ratio": 0.0002780464546390249,
"epoch": 0.6271697887723701,
"grad_norm": 0.03583519160747528,
"kl": 0.004349231719970703,
"learning_rate": 4.898410848023374e-06,
"loss": 0.004,
"step": 428
},
{
"clip_ratio": 0.0002759867400072835,
"epoch": 0.62908189178692,
"grad_norm": 0.035115331411361694,
"kl": 0.0043909549713134766,
"learning_rate": 4.896633203250424e-06,
"loss": 0.0039,
"step": 429
},
{
"clip_ratio": 0.0002873923492074937,
"epoch": 0.6309939948014699,
"grad_norm": 0.03465187922120094,
"kl": 0.004460573196411133,
"learning_rate": 4.89484046836657e-06,
"loss": 0.0039,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 527.1116304397583,
"epoch": 0.6329060978160198,
"grad_norm": 0.03591939061880112,
"kl": 0.004395723342895508,
"learning_rate": 4.893032654659554e-06,
"loss": 0.0068,
"num_tokens": 242275198.0,
"reward": 0.05859375320142135,
"reward_std": 0.06461814750218764,
"rewards/pure_accuracy_reward_math": 0.05859375110594556,
"step": 431
},
{
"clip_ratio": 0.00021255032419276176,
"epoch": 0.6348182008305697,
"grad_norm": 0.03488593176007271,
"kl": 0.0043849945068359375,
"learning_rate": 4.891209773512054e-06,
"loss": 0.0068,
"step": 432
},
{
"clip_ratio": 0.00023523596212271514,
"epoch": 0.6367303038451196,
"grad_norm": 0.03410722687840462,
"kl": 0.004419565200805664,
"learning_rate": 4.889371836401621e-06,
"loss": 0.0067,
"step": 433
},
{
"clip_ratio": 0.00024576090385153293,
"epoch": 0.6386424068596696,
"grad_norm": 0.03335421159863472,
"kl": 0.004421710968017578,
"learning_rate": 4.887518854900603e-06,
"loss": 0.0067,
"step": 434
},
{
"clip_ratio": 0.0002828803910119859,
"epoch": 0.6405545098742195,
"grad_norm": 0.03240649402141571,
"kl": 0.004340171813964844,
"learning_rate": 4.885650840676074e-06,
"loss": 0.0066,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 532.2051043510437,
"epoch": 0.6424666128887694,
"grad_norm": 0.03588009625673294,
"kl": 0.0044574737548828125,
"learning_rate": 4.88376780548976e-06,
"loss": 0.0041,
"num_tokens": 245917009.0,
"reward": 0.05775669912691228,
"reward_std": 0.06611959752626717,
"rewards/pure_accuracy_reward_math": 0.05775669778813608,
"step": 436
},
{
"clip_ratio": 0.0002524082638899472,
"epoch": 0.6443787159033193,
"grad_norm": 0.03471923619508743,
"kl": 0.0044062137603759766,
"learning_rate": 4.881869761197963e-06,
"loss": 0.0041,
"step": 437
},
{
"clip_ratio": 0.0002889056303843063,
"epoch": 0.6462908189178692,
"grad_norm": 0.03379988297820091,
"kl": 0.004372119903564453,
"learning_rate": 4.879956719751491e-06,
"loss": 0.004,
"step": 438
},
{
"clip_ratio": 0.0003009145272017122,
"epoch": 0.6482029219324191,
"grad_norm": 0.03446533530950546,
"kl": 0.004400730133056641,
"learning_rate": 4.878028693195577e-06,
"loss": 0.004,
"step": 439
},
{
"clip_ratio": 0.00030466545126728306,
"epoch": 0.650115024946969,
"grad_norm": 0.03484022617340088,
"kl": 0.004462242126464844,
"learning_rate": 4.876085693669806e-06,
"loss": 0.0039,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 517.0904240608215,
"epoch": 0.652027127961519,
"grad_norm": 0.0366295725107193,
"kl": 0.004509925842285156,
"learning_rate": 4.8741277334080405e-06,
"loss": 0.0066,
"num_tokens": 249502673.0,
"reward": 0.05719866382423788,
"reward_std": 0.06594694149680436,
"rewards/pure_accuracy_reward_math": 0.057198662078008056,
"step": 441
},
{
"clip_ratio": 0.00023539985437537325,
"epoch": 0.6539392309760689,
"grad_norm": 0.03590084984898567,
"kl": 0.0045740604400634766,
"learning_rate": 4.87215482473834e-06,
"loss": 0.0066,
"step": 442
},
{
"clip_ratio": 0.00022167488214108744,
"epoch": 0.6558513339906188,
"grad_norm": 0.03433714434504509,
"kl": 0.004676342010498047,
"learning_rate": 4.870166980082885e-06,
"loss": 0.0066,
"step": 443
},
{
"clip_ratio": 0.0002476425726172238,
"epoch": 0.6577634370051687,
"grad_norm": 0.03389691188931465,
"kl": 0.004789113998413086,
"learning_rate": 4.868164211957899e-06,
"loss": 0.0065,
"step": 444
},
{
"clip_ratio": 0.00025810993128061455,
"epoch": 0.6596755400197185,
"grad_norm": 0.03417885676026344,
"kl": 0.004879474639892578,
"learning_rate": 4.866146532973569e-06,
"loss": 0.0064,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 520.3697214126587,
"epoch": 0.6615876430342684,
"grad_norm": 0.03560737892985344,
"kl": 0.00455927848815918,
"learning_rate": 4.864113955833967e-06,
"loss": 0.0056,
"num_tokens": 253104314.0,
"reward": 0.06584821722935885,
"reward_std": 0.07672227645525709,
"rewards/pure_accuracy_reward_math": 0.06584821565775201,
"step": 446
},
{
"clip_ratio": 0.00029780695723502504,
"epoch": 0.6634997460488183,
"grad_norm": 0.034836821258068085,
"kl": 0.0045278072357177734,
"learning_rate": 4.862066493336967e-06,
"loss": 0.0056,
"step": 447
},
{
"clip_ratio": 0.00030120932990485016,
"epoch": 0.6654118490633683,
"grad_norm": 0.03460467606782913,
"kl": 0.0045435428619384766,
"learning_rate": 4.860004158374172e-06,
"loss": 0.0055,
"step": 448
},
{
"clip_ratio": 0.000313081463019671,
"epoch": 0.6673239520779182,
"grad_norm": 0.03467562422156334,
"kl": 0.004552364349365234,
"learning_rate": 4.857926963930822e-06,
"loss": 0.0055,
"step": 449
},
{
"clip_ratio": 0.00031086072692687594,
"epoch": 0.6692360550924681,
"grad_norm": 0.03409102186560631,
"kl": 0.004626035690307617,
"learning_rate": 4.855834923085721e-06,
"loss": 0.0054,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 514.4771447181702,
"epoch": 0.671148158107018,
"grad_norm": 0.03815117105841637,
"kl": 0.005002737045288086,
"learning_rate": 4.853728049011151e-06,
"loss": 0.0091,
"num_tokens": 256687388.0,
"reward": 0.06556919938884676,
"reward_std": 0.07874169782735407,
"rewards/pure_accuracy_reward_math": 0.06556919787544757,
"step": 451
},
{
"clip_ratio": 0.0003133106871473501,
"epoch": 0.6730602611215679,
"grad_norm": 0.03761136531829834,
"kl": 0.005041837692260742,
"learning_rate": 4.851606354972791e-06,
"loss": 0.0091,
"step": 452
},
{
"clip_ratio": 0.00034106033973557714,
"epoch": 0.6749723641361178,
"grad_norm": 0.0372379869222641,
"kl": 0.0050508975982666016,
"learning_rate": 4.849469854329629e-06,
"loss": 0.0091,
"step": 453
},
{
"clip_ratio": 0.00033749614277667206,
"epoch": 0.6768844671506677,
"grad_norm": 0.03686762601137161,
"kl": 0.005095005035400391,
"learning_rate": 4.847318560533882e-06,
"loss": 0.009,
"step": 454
},
{
"clip_ratio": 0.00035140375177888927,
"epoch": 0.6787965701652177,
"grad_norm": 0.036469750106334686,
"kl": 0.005120754241943359,
"learning_rate": 4.845152487130914e-06,
"loss": 0.009,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 512.4866299629211,
"epoch": 0.6807086731797676,
"grad_norm": 0.037901297211647034,
"kl": 0.004809379577636719,
"learning_rate": 4.842971647759142e-06,
"loss": 0.0063,
"num_tokens": 260253700.0,
"reward": 0.05775669912691228,
"reward_std": 0.06710927549283952,
"rewards/pure_accuracy_reward_math": 0.05775669767172076,
"step": 456
},
{
"clip_ratio": 0.00026634283756266086,
"epoch": 0.6826207761943175,
"grad_norm": 0.03568252548575401,
"kl": 0.0047724246978759766,
"learning_rate": 4.840776056149957e-06,
"loss": 0.0063,
"step": 457
},
{
"clip_ratio": 0.00027518686636085476,
"epoch": 0.6845328792088674,
"grad_norm": 0.0351024754345417,
"kl": 0.004754543304443359,
"learning_rate": 4.838565726127636e-06,
"loss": 0.0063,
"step": 458
},
{
"clip_ratio": 0.0003387172891393675,
"epoch": 0.6864449822234173,
"grad_norm": 0.03477272391319275,
"kl": 0.004698753356933594,
"learning_rate": 4.836340671609255e-06,
"loss": 0.0062,
"step": 459
},
{
"clip_ratio": 0.0003592506114102889,
"epoch": 0.6883570852379672,
"grad_norm": 0.035812895745038986,
"kl": 0.004735708236694336,
"learning_rate": 4.834100906604601e-06,
"loss": 0.0062,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 536.1403703689575,
"epoch": 0.6902691882525172,
"grad_norm": 0.03566034138202667,
"kl": 0.004418611526489258,
"learning_rate": 4.831846445216082e-06,
"loss": 0.0056,
"num_tokens": 263902651.0,
"reward": 0.05161830614088103,
"reward_std": 0.06899610540131107,
"rewards/pure_accuracy_reward_math": 0.051618304976727813,
"step": 461
},
{
"clip_ratio": 0.00028340513017610647,
"epoch": 0.6921812912670671,
"grad_norm": 0.03495897352695465,
"kl": 0.004414081573486328,
"learning_rate": 4.829577301638642e-06,
"loss": 0.0056,
"step": 462
},
{
"clip_ratio": 0.0002825141077664739,
"epoch": 0.6940933942816169,
"grad_norm": 0.034486111253499985,
"kl": 0.004411220550537109,
"learning_rate": 4.827293490159668e-06,
"loss": 0.0056,
"step": 463
},
{
"clip_ratio": 0.00031019614829119746,
"epoch": 0.6960054972961668,
"grad_norm": 0.035884980112314224,
"kl": 0.004367351531982422,
"learning_rate": 4.824995025158903e-06,
"loss": 0.0055,
"step": 464
},
{
"clip_ratio": 0.0003045983889933268,
"epoch": 0.6979176003107167,
"grad_norm": 0.03378836810588837,
"kl": 0.004292488098144531,
"learning_rate": 4.822681921108355e-06,
"loss": 0.0055,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 525.3783731460571,
"epoch": 0.6998297033252666,
"grad_norm": 0.03726997971534729,
"kl": 0.0065157413482666016,
"learning_rate": 4.8203541925722016e-06,
"loss": 0.0017,
"num_tokens": 267508687.0,
"reward": 0.06724330646102317,
"reward_std": 0.07591145433252677,
"rewards/pure_accuracy_reward_math": 0.06724330500583164,
"step": 466
},
{
"clip_ratio": 0.00026273680936128585,
"epoch": 0.7017418063398165,
"grad_norm": 0.03638988733291626,
"kl": 0.0064983367919921875,
"learning_rate": 4.818011854206706e-06,
"loss": 0.0017,
"step": 467
},
{
"clip_ratio": 0.0002903113285128711,
"epoch": 0.7036539093543664,
"grad_norm": 0.0360158272087574,
"kl": 0.006509542465209961,
"learning_rate": 4.815654920760117e-06,
"loss": 0.0016,
"step": 468
},
{
"clip_ratio": 0.0002849762186087901,
"epoch": 0.7055660123689164,
"grad_norm": 0.03577370196580887,
"kl": 0.006470680236816406,
"learning_rate": 4.81328340707258e-06,
"loss": 0.0016,
"step": 469
},
{
"clip_ratio": 0.00031370155647891806,
"epoch": 0.7074781153834663,
"grad_norm": 0.03484919294714928,
"kl": 0.006468772888183594,
"learning_rate": 4.810897328076045e-06,
"loss": 0.0015,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 530.1677136421204,
"epoch": 0.7093902183980162,
"grad_norm": 0.04198005422949791,
"kl": 0.004724264144897461,
"learning_rate": 4.808496698794171e-06,
"loss": 0.0046,
"num_tokens": 271138708.0,
"reward": 0.07310268204309978,
"reward_std": 0.07646948879119009,
"rewards/pure_accuracy_reward_math": 0.07310267994762398,
"step": 471
},
{
"clip_ratio": 0.00028702764876697984,
"epoch": 0.7113023214125661,
"grad_norm": 0.04015243798494339,
"kl": 0.004670619964599609,
"learning_rate": 4.8060815343422265e-06,
"loss": 0.0045,
"step": 472
},
{
"clip_ratio": 0.0002947892680822406,
"epoch": 0.713214424427116,
"grad_norm": 0.0385352224111557,
"kl": 0.0046727657318115234,
"learning_rate": 4.803651849927004e-06,
"loss": 0.0045,
"step": 473
},
{
"clip_ratio": 0.00036661511779811917,
"epoch": 0.7151265274416659,
"grad_norm": 0.03803607076406479,
"kl": 0.00463414192199707,
"learning_rate": 4.801207660846717e-06,
"loss": 0.0044,
"step": 474
},
{
"clip_ratio": 0.00040073674449558894,
"epoch": 0.7170386304562159,
"grad_norm": 0.03870271518826485,
"kl": 0.00464320182800293,
"learning_rate": 4.798748982490908e-06,
"loss": 0.0044,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 539.262857913971,
"epoch": 0.7189507334707658,
"grad_norm": 0.0374424010515213,
"kl": 0.0045392513275146484,
"learning_rate": 4.796275830340344e-06,
"loss": 0.0081,
"num_tokens": 274802094.0,
"reward": 0.061941967433085665,
"reward_std": 0.07401842664694414,
"rewards/pure_accuracy_reward_math": 0.06194196522119455,
"step": 476
},
{
"clip_ratio": 0.00026828293908920386,
"epoch": 0.7208628364853157,
"grad_norm": 0.03758076950907707,
"kl": 0.004576683044433594,
"learning_rate": 4.793788219966931e-06,
"loss": 0.0081,
"step": 477
},
{
"clip_ratio": 0.0002991793934654652,
"epoch": 0.7227749394998656,
"grad_norm": 0.03570091351866722,
"kl": 0.0045130252838134766,
"learning_rate": 4.7912861670336065e-06,
"loss": 0.008,
"step": 478
},
{
"clip_ratio": 0.00031140293214093617,
"epoch": 0.7246870425144155,
"grad_norm": 0.034991368651390076,
"kl": 0.0044956207275390625,
"learning_rate": 4.788769687294243e-06,
"loss": 0.008,
"step": 479
},
{
"clip_ratio": 0.00034215352269484356,
"epoch": 0.7265991455289653,
"grad_norm": 0.03517301753163338,
"kl": 0.00450587272644043,
"learning_rate": 4.7862387965935504e-06,
"loss": 0.0079,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 535.2455615997314,
"epoch": 0.7285112485435152,
"grad_norm": 0.03517255187034607,
"kl": 0.004718780517578125,
"learning_rate": 4.783693510866977e-06,
"loss": 0.0066,
"num_tokens": 278455030.0,
"reward": 0.06222098530270159,
"reward_std": 0.069766862958204,
"rewards/pure_accuracy_reward_math": 0.062220983498264104,
"step": 481
},
{
"clip_ratio": 0.00026954136529866446,
"epoch": 0.7304233515580651,
"grad_norm": 0.03456445038318634,
"kl": 0.004766225814819336,
"learning_rate": 4.781133846140606e-06,
"loss": 0.0066,
"step": 482
},
{
"clip_ratio": 0.000250861422671278,
"epoch": 0.7323354545726151,
"grad_norm": 0.033632129430770874,
"kl": 0.004829883575439453,
"learning_rate": 4.778559818531055e-06,
"loss": 0.0066,
"step": 483
},
{
"clip_ratio": 0.0002590245896385568,
"epoch": 0.734247557587165,
"grad_norm": 0.03314875811338425,
"kl": 0.00486445426940918,
"learning_rate": 4.775971444245379e-06,
"loss": 0.0065,
"step": 484
},
{
"clip_ratio": 0.0002899982684425595,
"epoch": 0.7361596606017149,
"grad_norm": 0.03288432955741882,
"kl": 0.004921674728393555,
"learning_rate": 4.773368739580963e-06,
"loss": 0.0065,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 524.4258046150208,
"epoch": 0.7380717636162648,
"grad_norm": 0.08309170603752136,
"kl": 0.006993293762207031,
"learning_rate": 4.770751720925422e-06,
"loss": 0.0023,
"num_tokens": 282068152.0,
"reward": 0.06222098495345563,
"reward_std": 0.0712282478925772,
"rewards/pure_accuracy_reward_math": 0.06222098338184878,
"step": 486
},
{
"clip_ratio": 0.0002442373284452515,
"epoch": 0.7399838666308147,
"grad_norm": 0.042120546102523804,
"kl": 0.006081581115722656,
"learning_rate": 4.768120404756497e-06,
"loss": 0.0023,
"step": 487
},
{
"clip_ratio": 0.0002956131474434187,
"epoch": 0.7418959696453646,
"grad_norm": 0.036061204969882965,
"kl": 0.0057599544525146484,
"learning_rate": 4.765474807641951e-06,
"loss": 0.0022,
"step": 488
},
{
"clip_ratio": 0.00030389728723889675,
"epoch": 0.7438080726599146,
"grad_norm": 0.03613469749689102,
"kl": 0.005738019943237305,
"learning_rate": 4.762814946239468e-06,
"loss": 0.0022,
"step": 489
},
{
"clip_ratio": 0.00033159017920070255,
"epoch": 0.7457201756744645,
"grad_norm": 0.0360892117023468,
"kl": 0.00572967529296875,
"learning_rate": 4.760140837296542e-06,
"loss": 0.0021,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 550.3144750595093,
"epoch": 0.7476322786890144,
"grad_norm": 0.03636733815073967,
"kl": 0.004332542419433594,
"learning_rate": 4.757452497650377e-06,
"loss": 0.0072,
"num_tokens": 285770403.0,
"reward": 0.055803573777666315,
"reward_std": 0.07161362667102367,
"rewards/pure_accuracy_reward_math": 0.05580357278813608,
"step": 491
},
{
"clip_ratio": 0.00027637260956225873,
"epoch": 0.7495443817035643,
"grad_norm": 0.035727791488170624,
"kl": 0.004361629486083984,
"learning_rate": 4.754749944227777e-06,
"loss": 0.0072,
"step": 492
},
{
"clip_ratio": 0.0002587454115428045,
"epoch": 0.7514564847181142,
"grad_norm": 0.03512200713157654,
"kl": 0.0043697357177734375,
"learning_rate": 4.752033194045044e-06,
"loss": 0.0072,
"step": 493
},
{
"clip_ratio": 0.00025780797875540884,
"epoch": 0.7533685877326641,
"grad_norm": 0.033817108720541,
"kl": 0.0043947696685791016,
"learning_rate": 4.7493022642078654e-06,
"loss": 0.0071,
"step": 494
},
{
"clip_ratio": 0.00029674232627030506,
"epoch": 0.755280690747214,
"grad_norm": 0.03317062556743622,
"kl": 0.004454851150512695,
"learning_rate": 4.746557171911211e-06,
"loss": 0.0071,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 543.0692186355591,
"epoch": 0.757192793761764,
"grad_norm": 0.05020016431808472,
"kl": 0.0062062740325927734,
"learning_rate": 4.7437979344392236e-06,
"loss": 0.0059,
"num_tokens": 289451319.0,
"reward": 0.0616629492433276,
"reward_std": 0.07071027776692063,
"rewards/pure_accuracy_reward_math": 0.06166294778813608,
"step": 496
},
{
"clip_ratio": 0.00028460744590574905,
"epoch": 0.7591048967763139,
"grad_norm": 0.03948064520955086,
"kl": 0.0061266422271728516,
"learning_rate": 4.741024569165105e-06,
"loss": 0.0059,
"step": 497
},
{
"clip_ratio": 0.0002803450769306437,
"epoch": 0.7610169997908637,
"grad_norm": 0.03621263429522514,
"kl": 0.00614476203918457,
"learning_rate": 4.7382370935510165e-06,
"loss": 0.0059,
"step": 498
},
{
"clip_ratio": 0.0003022695020717947,
"epoch": 0.7629291028054136,
"grad_norm": 0.037622902542352676,
"kl": 0.006256580352783203,
"learning_rate": 4.73543552514796e-06,
"loss": 0.0058,
"step": 499
},
{
"clip_ratio": 0.00030265802058693225,
"epoch": 0.7648412058199635,
"grad_norm": 0.03813454508781433,
"kl": 0.006264209747314453,
"learning_rate": 4.732619881595672e-06,
"loss": 0.0057,
"step": 500
},
{
"clip_ratio": 0.0,
"completion_length": 543.3538174629211,
"epoch": 0.7667533088345134,
"grad_norm": 0.07500133663415909,
"kl": 0.005916118621826172,
"learning_rate": 4.729790180622512e-06,
"loss": 0.0072,
"num_tokens": 293127839.0,
"reward": 0.0513392879802268,
"reward_std": 0.06792009877972305,
"rewards/pure_accuracy_reward_math": 0.051339287048904225,
"step": 501
},
{
"clip_ratio": 0.0002826226679530919,
"epoch": 0.7686654118490633,
"grad_norm": 0.03498294949531555,
"kl": 0.0057086944580078125,
"learning_rate": 4.726946440045348e-06,
"loss": 0.0072,
"step": 502
},
{
"clip_ratio": 0.000292762170943206,
"epoch": 0.7705775148636133,
"grad_norm": 0.0338723324239254,
"kl": 0.0054700374603271484,
"learning_rate": 4.7240886777694495e-06,
"loss": 0.0071,
"step": 503
},
{
"clip_ratio": 0.00031638332251304746,
"epoch": 0.7724896178781632,
"grad_norm": 0.03360189124941826,
"kl": 0.00526118278503418,
"learning_rate": 4.721216911788371e-06,
"loss": 0.0071,
"step": 504
},
{
"clip_ratio": 0.0003445502737804418,
"epoch": 0.7744017208927131,
"grad_norm": 0.03321666270494461,
"kl": 0.005108356475830078,
"learning_rate": 4.71833116018384e-06,
"loss": 0.007,
"step": 505
},
{
"clip_ratio": 0.0,
"completion_length": 525.3041553497314,
"epoch": 0.776313823907263,
"grad_norm": 0.039082907140254974,
"kl": 0.0048329830169677734,
"learning_rate": 4.715431441125639e-06,
"loss": 0.0072,
"num_tokens": 296745449.0,
"reward": 0.056640627823071554,
"reward_std": 0.066464910923969,
"rewards/pure_accuracy_reward_math": 0.05664062636788003,
"step": 506
},
{
"clip_ratio": 0.0002697859709428485,
"epoch": 0.7782259269218129,
"grad_norm": 0.036139652132987976,
"kl": 0.0048868656158447266,
"learning_rate": 4.712517772871503e-06,
"loss": 0.0072,
"step": 507
},
{
"clip_ratio": 0.0002602223319172481,
"epoch": 0.7801380299363628,
"grad_norm": 0.03708622604608536,
"kl": 0.004920244216918945,
"learning_rate": 4.709590173766988e-06,
"loss": 0.0072,
"step": 508
},
{
"clip_ratio": 0.00030563702995323183,
"epoch": 0.7820501329509127,
"grad_norm": 0.03873802721500397,
"kl": 0.004922151565551758,
"learning_rate": 4.706648662245368e-06,
"loss": 0.0071,
"step": 509
},
{
"clip_ratio": 0.00027421732914945096,
"epoch": 0.7839622359654627,
"grad_norm": 0.0337008535861969,
"kl": 0.004686117172241211,
"learning_rate": 4.703693256827515e-06,
"loss": 0.0071,
"step": 510
},
{
"clip_ratio": 0.0,
"completion_length": 544.4595675468445,
"epoch": 0.7858743389800126,
"grad_norm": 0.032148003578186035,
"kl": 0.004284381866455078,
"learning_rate": 4.700723976121782e-06,
"loss": 0.0079,
"num_tokens": 300427724.0,
"reward": 0.05998884211294353,
"reward_std": 0.06822534691309556,
"rewards/pure_accuracy_reward_math": 0.059988840483129025,
"step": 511
},
{
"clip_ratio": 0.00023266997004611767,
"epoch": 0.7877864419945625,
"grad_norm": 0.03213036060333252,
"kl": 0.004235267639160156,
"learning_rate": 4.697740838823884e-06,
"loss": 0.0079,
"step": 512
},
{
"clip_ratio": 0.00023210655439243055,
"epoch": 0.7896985450091124,
"grad_norm": 0.03171762451529503,
"kl": 0.004268169403076172,
"learning_rate": 4.694743863716784e-06,
"loss": 0.0078,
"step": 513
},
{
"clip_ratio": 0.0002433597992990144,
"epoch": 0.7916106480236623,
"grad_norm": 0.030378276482224464,
"kl": 0.004282712936401367,
"learning_rate": 4.691733069670575e-06,
"loss": 0.0078,
"step": 514
},
{
"clip_ratio": 0.00024098603546462982,
"epoch": 0.7935227510382122,
"grad_norm": 0.030135801061987877,
"kl": 0.004299640655517578,
"learning_rate": 4.688708475642356e-06,
"loss": 0.0078,
"step": 515
},
{
"clip_ratio": 0.0,
"completion_length": 520.5064425468445,
"epoch": 0.795434854052762,
"grad_norm": 0.03758488968014717,
"kl": 0.004748344421386719,
"learning_rate": 4.685670100676117e-06,
"loss": 0.0056,
"num_tokens": 304030899.0,
"reward": 0.059151788300368935,
"reward_std": 0.06615966308163479,
"rewards/pure_accuracy_reward_math": 0.05915178684517741,
"step": 516
},
{
"clip_ratio": 0.00024922658519699326,
"epoch": 0.797346957067312,
"grad_norm": 0.03667794167995453,
"kl": 0.004762172698974609,
"learning_rate": 4.6826179639026185e-06,
"loss": 0.0056,
"step": 517
},
{
"clip_ratio": 0.00024439046995894387,
"epoch": 0.7992590600818619,
"grad_norm": 0.03566230833530426,
"kl": 0.004770755767822266,
"learning_rate": 4.679552084539271e-06,
"loss": 0.0055,
"step": 518
},
{
"clip_ratio": 0.00025443012202686077,
"epoch": 0.8011711630964118,
"grad_norm": 0.03555983304977417,
"kl": 0.004889011383056641,
"learning_rate": 4.676472481890012e-06,
"loss": 0.0055,
"step": 519
},
{
"clip_ratio": 0.0002555244412860702,
"epoch": 0.8030832661109617,
"grad_norm": 0.03477266803383827,
"kl": 0.004910707473754883,
"learning_rate": 4.673379175345187e-06,
"loss": 0.0054,
"step": 520
},
{
"clip_ratio": 0.0,
"completion_length": 530.2039861679077,
"epoch": 0.8049953691255116,
"grad_norm": 0.03352927044034004,
"kl": 0.004728078842163086,
"learning_rate": 4.670272184381426e-06,
"loss": 0.0064,
"num_tokens": 307666714.0,
"reward": 0.05106027063447982,
"reward_std": 0.061781705473549664,
"rewards/pure_accuracy_reward_math": 0.05106026888824999,
"step": 521
},
{
"clip_ratio": 0.00022480493561261028,
"epoch": 0.8069074721400615,
"grad_norm": 0.0328591950237751,
"kl": 0.004677772521972656,
"learning_rate": 4.667151528561522e-06,
"loss": 0.0064,
"step": 522
},
{
"clip_ratio": 0.0002208993353463029,
"epoch": 0.8088195751546114,
"grad_norm": 0.0323566235601902,
"kl": 0.004681825637817383,
"learning_rate": 4.664017227534308e-06,
"loss": 0.0064,
"step": 523
},
{
"clip_ratio": 0.0002261604544742113,
"epoch": 0.8107316781691614,
"grad_norm": 0.03178941085934639,
"kl": 0.004633665084838867,
"learning_rate": 4.6608693010345285e-06,
"loss": 0.0063,
"step": 524
},
{
"clip_ratio": 0.0002347389614101303,
"epoch": 0.8126437811837113,
"grad_norm": 0.03144075721502304,
"kl": 0.004633426666259766,
"learning_rate": 4.657707768882723e-06,
"loss": 0.0063,
"step": 525
},
{
"clip_ratio": 0.0,
"completion_length": 534.2078919410706,
"epoch": 0.8145558841982612,
"grad_norm": 36658.046875,
"kl": 696.0046517848969,
"learning_rate": 4.6545326509850965e-06,
"loss": 27.8583,
"num_tokens": 311314491.0,
"reward": 0.05747768114088103,
"reward_std": 0.06521624798187986,
"rewards/pure_accuracy_reward_math": 0.057477680093143135,
"step": 526
},
{
"clip_ratio": 0.0006453408203128674,
"epoch": 0.8164679872128111,
"grad_norm": 3234.42724609375,
"kl": 42.254658937454224,
"learning_rate": 4.651343967333394e-06,
"loss": 1.7021,
"step": 527
},
{
"clip_ratio": 0.0006781478184620937,
"epoch": 0.818380090227361,
"grad_norm": 430.01318359375,
"kl": 0.21270966529846191,
"learning_rate": 4.648141738004776e-06,
"loss": 0.256,
"step": 528
},
{
"clip_ratio": 0.0006916913723671314,
"epoch": 0.8202921932419109,
"grad_norm": 457.1385803222656,
"kl": 0.1541590690612793,
"learning_rate": 4.644925983161691e-06,
"loss": 0.3118,
"step": 529
},
{
"clip_ratio": 0.0007114471513887111,
"epoch": 0.8222042962564609,
"grad_norm": 61.02793884277344,
"kl": 1.6688117980957031,
"learning_rate": 4.641696723051753e-06,
"loss": 0.1081,
"step": 530
},
{
"clip_ratio": 0.0,
"completion_length": 544.7664904594421,
"epoch": 0.8241163992710108,
"grad_norm": 0.03665775805711746,
"kl": 0.0046710968017578125,
"learning_rate": 4.638453978007606e-06,
"loss": 0.0033,
"num_tokens": 315000186.0,
"reward": 0.05691964577999897,
"reward_std": 0.06766731111565605,
"rewards/pure_accuracy_reward_math": 0.056919643975561485,
"step": 531
},
{
"clip_ratio": 0.000247030089042255,
"epoch": 0.8260285022855607,
"grad_norm": 0.03543345257639885,
"kl": 0.004717826843261719,
"learning_rate": 4.635197768446799e-06,
"loss": 0.0033,
"step": 532
},
{
"clip_ratio": 0.00024415442914005325,
"epoch": 0.8279406053001105,
"grad_norm": 0.034531209617853165,
"kl": 0.004744768142700195,
"learning_rate": 4.631928114871667e-06,
"loss": 0.0032,
"step": 533
},
{
"clip_ratio": 0.0002580326566032909,
"epoch": 0.8298527083146604,
"grad_norm": 0.03323632851243019,
"kl": 0.004830360412597656,
"learning_rate": 4.628645037869183e-06,
"loss": 0.0032,
"step": 534
},
{
"clip_ratio": 0.00029695888167680096,
"epoch": 0.8317648113292103,
"grad_norm": 0.03470376506447792,
"kl": 0.0048847198486328125,
"learning_rate": 4.625348558110846e-06,
"loss": 0.0031,
"step": 535
},
{
"clip_ratio": 0.0,
"completion_length": 543.506443977356,
"epoch": 0.8336769143437602,
"grad_norm": 33.48581314086914,
"kl": 0.7041072845458984,
"learning_rate": 4.6220386963525425e-06,
"loss": 0.0349,
"num_tokens": 318683697.0,
"reward": 0.06333705675206147,
"reward_std": 0.0759915838134475,
"rewards/pure_accuracy_reward_math": 0.063337054773001,
"step": 536
},
{
"clip_ratio": 0.00030500417074108555,
"epoch": 0.8355890173583101,
"grad_norm": 5.391356468200684,
"kl": 0.12163639068603516,
"learning_rate": 4.6187154734344144e-06,
"loss": 0.0115,
"step": 537
},
{
"clip_ratio": 0.0003094891900445873,
"epoch": 0.8375011203728601,
"grad_norm": 0.24674992263317108,
"kl": 0.011260032653808594,
"learning_rate": 4.615378910280735e-06,
"loss": 0.007,
"step": 538
},
{
"clip_ratio": 0.0003443351265559613,
"epoch": 0.83941322338741,
"grad_norm": 0.040490083396434784,
"kl": 0.0068547725677490234,
"learning_rate": 4.61202902789977e-06,
"loss": 0.0068,
"step": 539
},
{
"clip_ratio": 0.0003249310258297555,
"epoch": 0.8413253264019599,
"grad_norm": 0.037383101880550385,
"kl": 0.006977081298828125,
"learning_rate": 4.608665847383646e-06,
"loss": 0.0068,
"step": 540
},
{
"clip_ratio": 0.0,
"completion_length": 528.8432207107544,
"epoch": 0.8432374294165098,
"grad_norm": 0.0408562608063221,
"kl": 0.005080223083496094,
"learning_rate": 4.6052893899082244e-06,
"loss": 0.0092,
"num_tokens": 322311955.0,
"reward": 0.07505580695578828,
"reward_std": 0.08672685426427051,
"rewards/pure_accuracy_reward_math": 0.07505580462748185,
"step": 541
},
{
"clip_ratio": 0.0003254984287082152,
"epoch": 0.8451495324310597,
"grad_norm": 0.03888032212853432,
"kl": 0.005081653594970703,
"learning_rate": 4.60189967673296e-06,
"loss": 0.0091,
"step": 542
},
{
"clip_ratio": 0.00032150591908930437,
"epoch": 0.8470616354456096,
"grad_norm": 0.03769301995635033,
"kl": 0.005054950714111328,
"learning_rate": 4.598496729200772e-06,
"loss": 0.0091,
"step": 543
},
{
"clip_ratio": 0.0003807161001532222,
"epoch": 0.8489737384601596,
"grad_norm": 0.03671475872397423,
"kl": 0.005011320114135742,
"learning_rate": 4.595080568737907e-06,
"loss": 0.009,
"step": 544
},
{
"clip_ratio": 0.00040073374452731514,
"epoch": 0.8508858414747095,
"grad_norm": 0.03656642884016037,
"kl": 0.004985332489013672,
"learning_rate": 4.591651216853808e-06,
"loss": 0.009,
"step": 545
},
{
"clip_ratio": 0.0,
"completion_length": 521.1850123405457,
"epoch": 0.8527979444892594,
"grad_norm": 0.04072614386677742,
"kl": 0.005250692367553711,
"learning_rate": 4.588208695140972e-06,
"loss": 0.008,
"num_tokens": 325915646.0,
"reward": 0.06891741379513405,
"reward_std": 0.07457646180409938,
"rewards/pure_accuracy_reward_math": 0.0689174119324889,
"step": 546
},
{
"clip_ratio": 0.0002774237623270892,
"epoch": 0.8547100475038093,
"grad_norm": 0.03891909867525101,
"kl": 0.005267620086669922,
"learning_rate": 4.5847530252748206e-06,
"loss": 0.008,
"step": 547
},
{
"clip_ratio": 0.0003099276901821213,
"epoch": 0.8566221505183592,
"grad_norm": 0.03776893764734268,
"kl": 0.005312681198120117,
"learning_rate": 4.581284229013561e-06,
"loss": 0.008,
"step": 548
},
{
"clip_ratio": 0.0003329096458060121,
"epoch": 0.8585342535329091,
"grad_norm": 0.03786613792181015,
"kl": 0.0053446292877197266,
"learning_rate": 4.57780232819805e-06,
"loss": 0.0079,
"step": 549
},
{
"clip_ratio": 0.0003465502328481307,
"epoch": 0.860446356547459,
"grad_norm": 0.03782954812049866,
"kl": 0.00535893440246582,
"learning_rate": 4.574307344751654e-06,
"loss": 0.0079,
"step": 550
},
{
"clip_ratio": 0.0,
"completion_length": 512.2042679786682,
"epoch": 0.8623584595620088,
"grad_norm": 0.04082540422677994,
"kl": 0.005150318145751953,
"learning_rate": 4.570799300680112e-06,
"loss": 0.0061,
"num_tokens": 329486142.0,
"reward": 0.06696428914438002,
"reward_std": 0.07865536911413074,
"rewards/pure_accuracy_reward_math": 0.06696428681607358,
"step": 551
},
{
"clip_ratio": 0.0002784457984148503,
"epoch": 0.8642705625765588,
"grad_norm": 0.039590511471033096,
"kl": 0.005137205123901367,
"learning_rate": 4.5672782180714005e-06,
"loss": 0.0061,
"step": 552
},
{
"clip_ratio": 0.0003210699376268167,
"epoch": 0.8661826655911087,
"grad_norm": 0.03983275964856148,
"kl": 0.005161285400390625,
"learning_rate": 4.56374411909559e-06,
"loss": 0.0061,
"step": 553
},
{
"clip_ratio": 0.00032905748116718314,
"epoch": 0.8680947686056586,
"grad_norm": 0.03924131765961647,
"kl": 0.0051097869873046875,
"learning_rate": 4.560197026004706e-06,
"loss": 0.006,
"step": 554
},
{
"clip_ratio": 0.00036174511694753164,
"epoch": 0.8700068716202085,
"grad_norm": 0.03864859789609909,
"kl": 0.0051233768463134766,
"learning_rate": 4.556636961132591e-06,
"loss": 0.0059,
"step": 555
},
{
"clip_ratio": 0.0,
"completion_length": 524.8490724563599,
"epoch": 0.8719189746347584,
"grad_norm": 0.03831901028752327,
"kl": 0.005173921585083008,
"learning_rate": 4.553063946894765e-06,
"loss": 0.0089,
"num_tokens": 333101169.0,
"reward": 0.05970982427243143,
"reward_std": 0.06925509037682787,
"rewards/pure_accuracy_reward_math": 0.05970982293365523,
"step": 556
},
{
"clip_ratio": 0.00024058804717697058,
"epoch": 0.8738310776493083,
"grad_norm": 0.03815346583724022,
"kl": 0.005152463912963867,
"learning_rate": 4.549478005788276e-06,
"loss": 0.0088,
"step": 557
},
{
"clip_ratio": 0.0002689754076072859,
"epoch": 0.8757431806638583,
"grad_norm": 0.03663227707147598,
"kl": 0.00511932373046875,
"learning_rate": 4.5458791603915695e-06,
"loss": 0.0088,
"step": 558
},
{
"clip_ratio": 0.0002769273295371022,
"epoch": 0.8776552836784082,
"grad_norm": 0.03534897044301033,
"kl": 0.005173921585083008,
"learning_rate": 4.5422674333643415e-06,
"loss": 0.0087,
"step": 559
},
{
"clip_ratio": 0.0003186316080245888,
"epoch": 0.8795673866929581,
"grad_norm": 0.03454131633043289,
"kl": 0.005182981491088867,
"learning_rate": 4.538642847447393e-06,
"loss": 0.0087,
"step": 560
},
{
"clip_ratio": 0.0,
"completion_length": 499.49025869369507,
"epoch": 0.881479489707508,
"grad_norm": 0.03870520368218422,
"kl": 0.005303621292114258,
"learning_rate": 4.53500542546249e-06,
"loss": 0.0063,
"num_tokens": 336621146.0,
"reward": 0.06724330663564615,
"reward_std": 0.07539348350837827,
"rewards/pure_accuracy_reward_math": 0.0672433050640393,
"step": 561
},
{
"clip_ratio": 0.0002930208739826412,
"epoch": 0.8833915927220579,
"grad_norm": 0.03670111671090126,
"kl": 0.005410432815551758,
"learning_rate": 4.5313551903122195e-06,
"loss": 0.0063,
"step": 562
},
{
"clip_ratio": 0.00033625421181682214,
"epoch": 0.8853036957366078,
"grad_norm": 0.03873737156391144,
"kl": 0.0054399967193603516,
"learning_rate": 4.5276921649798475e-06,
"loss": 0.0063,
"step": 563
},
{
"clip_ratio": 0.0003349392310383337,
"epoch": 0.8872157987511577,
"grad_norm": 0.038494061678647995,
"kl": 0.0053806304931640625,
"learning_rate": 4.524016372529168e-06,
"loss": 0.0062,
"step": 564
},
{
"clip_ratio": 0.00031196477385719845,
"epoch": 0.8891279017657077,
"grad_norm": 0.03559175133705139,
"kl": 0.005260467529296875,
"learning_rate": 4.520327836104363e-06,
"loss": 0.0061,
"step": 565
},
{
"clip_ratio": 0.0,
"completion_length": 521.2452793121338,
"epoch": 0.8910400047802576,
"grad_norm": 0.033526018261909485,
"kl": 0.0050280094146728516,
"learning_rate": 4.516626578929857e-06,
"loss": 0.0083,
"num_tokens": 340217537.0,
"reward": 0.05970982470898889,
"reward_std": 0.06920882686972618,
"rewards/pure_accuracy_reward_math": 0.059709822555305436,
"step": 566
},
{
"clip_ratio": 0.0002854210310374583,
"epoch": 0.8929521077948075,
"grad_norm": 0.03320698440074921,
"kl": 0.00494694709777832,
"learning_rate": 4.512912624310166e-06,
"loss": 0.0083,
"step": 567
},
{
"clip_ratio": 0.00028784406134718665,
"epoch": 0.8948642108093574,
"grad_norm": 0.0334990993142128,
"kl": 0.004927158355712891,
"learning_rate": 4.509185995629758e-06,
"loss": 0.0083,
"step": 568
},
{
"clip_ratio": 0.00028731861192454744,
"epoch": 0.8967763138239072,
"grad_norm": 0.032721105962991714,
"kl": 0.004916667938232422,
"learning_rate": 4.505446716352898e-06,
"loss": 0.0083,
"step": 569
},
{
"clip_ratio": 0.0003211342911981774,
"epoch": 0.8986884168384571,
"grad_norm": 0.031691305339336395,
"kl": 0.0050427913665771484,
"learning_rate": 4.501694810023506e-06,
"loss": 0.0082,
"step": 570
},
{
"clip_ratio": 0.0,
"completion_length": 513.3175444602966,
"epoch": 0.900600519853007,
"grad_norm": 0.039067283272743225,
"kl": 0.0051767826080322266,
"learning_rate": 4.497930300265005e-06,
"loss": 0.0062,
"num_tokens": 343792675.0,
"reward": 0.07254464668221772,
"reward_std": 0.07260330504504964,
"rewards/pure_accuracy_reward_math": 0.07254464394645765,
"step": 571
},
{
"clip_ratio": 0.000284439854624452,
"epoch": 0.902512622867557,
"grad_norm": 0.03746037185192108,
"kl": 0.0051670074462890625,
"learning_rate": 4.494153210780177e-06,
"loss": 0.0062,
"step": 572
},
{
"clip_ratio": 0.0002894837679718876,
"epoch": 0.9044247258821069,
"grad_norm": 0.0363248772919178,
"kl": 0.0051119327545166016,
"learning_rate": 4.490363565351007e-06,
"loss": 0.0061,
"step": 573
},
{
"clip_ratio": 0.00029392389137683494,
"epoch": 0.9063368288966568,
"grad_norm": 0.03513769805431366,
"kl": 0.005059242248535156,
"learning_rate": 4.486561387838539e-06,
"loss": 0.0061,
"step": 574
},
{
"clip_ratio": 0.0003296555175325011,
"epoch": 0.9082489319112067,
"grad_norm": 0.03513012453913689,
"kl": 0.005059242248535156,
"learning_rate": 4.482746702182725e-06,
"loss": 0.006,
"step": 575
},
{
"clip_ratio": 0.0,
"completion_length": 520.8926033973694,
"epoch": 0.9101610349257566,
"grad_norm": 0.049145400524139404,
"kl": 0.011604547500610352,
"learning_rate": 4.478919532402271e-06,
"loss": 0.0046,
"num_tokens": 347395370.0,
"reward": 0.07170759254950099,
"reward_std": 0.0817445982247591,
"rewards/pure_accuracy_reward_math": 0.07170759091968648,
"step": 576
},
{
"clip_ratio": 0.00030760892423131736,
"epoch": 0.9120731379403065,
"grad_norm": 0.04954507574439049,
"kl": 0.011447906494140625,
"learning_rate": 4.4750799025944866e-06,
"loss": 0.0045,
"step": 577
},
{
"clip_ratio": 0.0003202956161487691,
"epoch": 0.9139852409548564,
"grad_norm": 0.04883984476327896,
"kl": 0.010998249053955078,
"learning_rate": 4.471227836935139e-06,
"loss": 0.0045,
"step": 578
},
{
"clip_ratio": 0.0003312723312660637,
"epoch": 0.9158973439694064,
"grad_norm": 0.049066606909036636,
"kl": 0.010381698608398438,
"learning_rate": 4.467363359678291e-06,
"loss": 0.0044,
"step": 579
},
{
"clip_ratio": 0.00041312941800697445,
"epoch": 0.9178094469839563,
"grad_norm": 0.053418997675180435,
"kl": 0.009602546691894531,
"learning_rate": 4.463486495156157e-06,
"loss": 0.0043,
"step": 580
},
{
"clip_ratio": 0.0,
"completion_length": 539.5678267478943,
"epoch": 0.9197215499985062,
"grad_norm": 0.03747523948550224,
"kl": 0.004802227020263672,
"learning_rate": 4.459597267778945e-06,
"loss": 0.0041,
"num_tokens": 351065793.0,
"reward": 0.062220984895247966,
"reward_std": 0.07298868335783482,
"rewards/pure_accuracy_reward_math": 0.0622209832072258,
"step": 581
},
{
"clip_ratio": 0.0002890200073579763,
"epoch": 0.9216336530130561,
"grad_norm": 0.03557584062218666,
"kl": 0.004851579666137695,
"learning_rate": 4.455695702034705e-06,
"loss": 0.0041,
"step": 582
},
{
"clip_ratio": 0.00031045296407228307,
"epoch": 0.923545756027606,
"grad_norm": 0.034734807908535004,
"kl": 0.004895925521850586,
"learning_rate": 4.451781822489173e-06,
"loss": 0.0041,
"step": 583
},
{
"clip_ratio": 0.00032734786560695284,
"epoch": 0.9254578590421559,
"grad_norm": 0.03634972497820854,
"kl": 0.004976511001586914,
"learning_rate": 4.447855653785617e-06,
"loss": 0.004,
"step": 584
},
{
"clip_ratio": 0.00036698238614008005,
"epoch": 0.9273699620567059,
"grad_norm": 0.036671172827482224,
"kl": 0.004954338073730469,
"learning_rate": 4.4439172206446845e-06,
"loss": 0.0039,
"step": 585
},
{
"clip_ratio": 0.0,
"completion_length": 538.6261405944824,
"epoch": 0.9292820650712557,
"grad_norm": 0.03805253654718399,
"kl": 0.005060434341430664,
"learning_rate": 4.439966547864243e-06,
"loss": 0.0061,
"num_tokens": 354732057.0,
"reward": 0.06194196725846268,
"reward_std": 0.07766569184605032,
"rewards/pure_accuracy_reward_math": 0.06194196580327116,
"step": 586
},
{
"clip_ratio": 0.0002944122598478316,
"epoch": 0.9311941680858056,
"grad_norm": 0.03603314608335495,
"kl": 0.005051136016845703,
"learning_rate": 4.436003660319224e-06,
"loss": 0.0061,
"step": 587
},
{
"clip_ratio": 0.0003042620955966413,
"epoch": 0.9331062711003555,
"grad_norm": 0.035505130887031555,
"kl": 0.005032539367675781,
"learning_rate": 4.432028582961472e-06,
"loss": 0.006,
"step": 588
},
{
"clip_ratio": 0.00032173160303727855,
"epoch": 0.9350183741149054,
"grad_norm": 0.03633759915828705,
"kl": 0.00509190559387207,
"learning_rate": 4.428041340819579e-06,
"loss": 0.006,
"step": 589
},
{
"clip_ratio": 0.00038377046530513326,
"epoch": 0.9369304771294553,
"grad_norm": 0.03761395812034607,
"kl": 0.005148649215698242,
"learning_rate": 4.424041958998732e-06,
"loss": 0.0059,
"step": 590
},
{
"clip_ratio": 0.0,
"completion_length": 540.8948354721069,
"epoch": 0.9388425801440052,
"grad_norm": 0.04139011353254318,
"kl": 0.005031585693359375,
"learning_rate": 4.420030462680554e-06,
"loss": 0.007,
"num_tokens": 358409840.0,
"reward": 0.0714285749127157,
"reward_std": 0.07565246830927208,
"rewards/pure_accuracy_reward_math": 0.07142857275903225,
"step": 591
},
{
"clip_ratio": 0.0002982392526291733,
"epoch": 0.9407546831585551,
"grad_norm": 0.03948375955224037,
"kl": 0.005082845687866211,
"learning_rate": 4.416006877122948e-06,
"loss": 0.007,
"step": 592
},
{
"clip_ratio": 0.00033647330587882607,
"epoch": 0.9426667861731051,
"grad_norm": 0.041717879474163055,
"kl": 0.005113363265991211,
"learning_rate": 4.411971227659933e-06,
"loss": 0.0069,
"step": 593
},
{
"clip_ratio": 0.00036752876485479646,
"epoch": 0.944578889187655,
"grad_norm": 0.04109462723135948,
"kl": 0.005068063735961914,
"learning_rate": 4.407923539701486e-06,
"loss": 0.0069,
"step": 594
},
{
"clip_ratio": 0.0003528254699176614,
"epoch": 0.9464909922022049,
"grad_norm": 0.03620041161775589,
"kl": 0.0049245357513427734,
"learning_rate": 4.403863838733386e-06,
"loss": 0.0068,
"step": 595
},
{
"clip_ratio": 0.0,
"completion_length": 545.2444491386414,
"epoch": 0.9484030952167548,
"grad_norm": 42.05046463012695,
"kl": 0.3311493396759033,
"learning_rate": 4.399792150317048e-06,
"loss": 0.0203,
"num_tokens": 362096328.0,
"reward": 0.06026786071015522,
"reward_std": 0.07324766798410565,
"rewards/pure_accuracy_reward_math": 0.06026785832364112,
"step": 596
},
{
"clip_ratio": 0.0003009684866128737,
"epoch": 0.9503151982313047,
"grad_norm": 0.575372040271759,
"kl": 0.01551508903503418,
"learning_rate": 4.395708500089366e-06,
"loss": 0.0076,
"step": 597
},
{
"clip_ratio": 0.0003299758830053179,
"epoch": 0.9522273012458546,
"grad_norm": 0.052088066935539246,
"kl": 0.01082468032836914,
"learning_rate": 4.391612913762549e-06,
"loss": 0.0074,
"step": 598
},
{
"clip_ratio": 0.00032988658261956516,
"epoch": 0.9541394042604046,
"grad_norm": 0.046673182398080826,
"kl": 0.011472225189208984,
"learning_rate": 4.38750541712396e-06,
"loss": 0.0074,
"step": 599
},
{
"clip_ratio": 0.00031585949000145774,
"epoch": 0.9560515072749545,
"grad_norm": 0.04350757598876953,
"kl": 0.011662006378173828,
"learning_rate": 4.383386036035956e-06,
"loss": 0.0074,
"step": 600
},
{
"clip_ratio": 0.0,
"completion_length": 539.0309958457947,
"epoch": 0.9579636102895044,
"grad_norm": 0.04193362593650818,
"kl": 0.005011081695556641,
"learning_rate": 4.379254796435719e-06,
"loss": 0.0085,
"num_tokens": 365761119.0,
"reward": 0.06696428923169151,
"reward_std": 0.08311965479515493,
"rewards/pure_accuracy_reward_math": 0.06696428667055443,
"step": 601
},
{
"clip_ratio": 0.0003076634293392999,
"epoch": 0.9598757133040543,
"grad_norm": 0.04204736277461052,
"kl": 0.005095720291137695,
"learning_rate": 4.375111724335102e-06,
"loss": 0.0085,
"step": 602
},
{
"clip_ratio": 0.0002991189727481469,
"epoch": 0.9617878163186042,
"grad_norm": 0.041649866849184036,
"kl": 0.00509333610534668,
"learning_rate": 4.370956845820455e-06,
"loss": 0.0085,
"step": 603
},
{
"clip_ratio": 0.0003053998929090085,
"epoch": 0.963699919333154,
"grad_norm": 0.03969484567642212,
"kl": 0.005100727081298828,
"learning_rate": 4.366790187052468e-06,
"loss": 0.0084,
"step": 604
},
{
"clip_ratio": 0.0003063883330014505,
"epoch": 0.9656120223477039,
"grad_norm": 0.03833401948213577,
"kl": 0.005064487457275391,
"learning_rate": 4.362611774266005e-06,
"loss": 0.0083,
"step": 605
},
{
"clip_ratio": 0.0,
"completion_length": 534.4046006202698,
"epoch": 0.9675241253622539,
"grad_norm": 0.038279399275779724,
"kl": 0.005177021026611328,
"learning_rate": 4.358421633769934e-06,
"loss": 0.0061,
"num_tokens": 369412689.0,
"reward": 0.07087053885334171,
"reward_std": 0.08299326128326356,
"rewards/pure_accuracy_reward_math": 0.0708705369324889,
"step": 606
},
{
"clip_ratio": 0.00030927538728064974,
"epoch": 0.9694362283768038,
"grad_norm": 0.037665851414203644,
"kl": 0.005164146423339844,
"learning_rate": 4.35421979194697e-06,
"loss": 0.0061,
"step": 607
},
{
"clip_ratio": 0.0003293242310178357,
"epoch": 0.9713483313913537,
"grad_norm": 0.036888375878334045,
"kl": 0.005212306976318359,
"learning_rate": 4.3500062752535e-06,
"loss": 0.006,
"step": 608
},
{
"clip_ratio": 0.0003369250752029984,
"epoch": 0.9732604344059036,
"grad_norm": 0.03607965633273125,
"kl": 0.005278587341308594,
"learning_rate": 4.3457811102194225e-06,
"loss": 0.006,
"step": 609
},
{
"clip_ratio": 0.00034393194414406025,
"epoch": 0.9751725374204535,
"grad_norm": 0.036863330751657486,
"kl": 0.005379676818847656,
"learning_rate": 4.341544323447978e-06,
"loss": 0.0059,
"step": 610
},
{
"clip_ratio": 0.0,
"completion_length": 527.9905385971069,
"epoch": 0.9770846404350034,
"grad_norm": 0.03825363516807556,
"kl": 0.005227804183959961,
"learning_rate": 4.33729594161558e-06,
"loss": 0.0103,
"num_tokens": 373041503.0,
"reward": 0.07254464607103728,
"reward_std": 0.07848271250259131,
"rewards/pure_accuracy_reward_math": 0.07254464444122277,
"step": 611
},
{
"clip_ratio": 0.0002938344064205012,
"epoch": 0.9789967434495533,
"grad_norm": 0.037028077989816666,
"kl": 0.005240917205810547,
"learning_rate": 4.333035991471653e-06,
"loss": 0.0102,
"step": 612
},
{
"clip_ratio": 0.00029232190240691125,
"epoch": 0.9809088464641033,
"grad_norm": 0.03623189404606819,
"kl": 0.005187034606933594,
"learning_rate": 4.328764499838456e-06,
"loss": 0.0102,
"step": 613
},
{
"clip_ratio": 0.000318144969014611,
"epoch": 0.9828209494786532,
"grad_norm": 0.036878351122140884,
"kl": 0.005211830139160156,
"learning_rate": 4.324481493610919e-06,
"loss": 0.0101,
"step": 614
},
{
"clip_ratio": 0.0003371401809317831,
"epoch": 0.9847330524932031,
"grad_norm": 0.036278340965509415,
"kl": 0.0051462650299072266,
"learning_rate": 4.320186999756473e-06,
"loss": 0.0101,
"step": 615
},
{
"clip_ratio": 0.0,
"completion_length": 513.4927659034729,
"epoch": 0.986645155507753,
"grad_norm": 0.037584077566862106,
"kl": 0.005333662033081055,
"learning_rate": 4.315881045314878e-06,
"loss": 0.007,
"num_tokens": 376615645.0,
"reward": 0.07087053899886087,
"reward_std": 0.07342032523592934,
"rewards/pure_accuracy_reward_math": 0.0708705370198004,
"step": 616
},
{
"clip_ratio": 0.0002886684330292155,
"epoch": 0.9885572585223029,
"grad_norm": 0.035872798413038254,
"kl": 0.005288362503051758,
"learning_rate": 4.311563657398056e-06,
"loss": 0.007,
"step": 617
},
{
"clip_ratio": 0.0002961605097766551,
"epoch": 0.9904693615368528,
"grad_norm": 0.034989748150110245,
"kl": 0.0052263736724853516,
"learning_rate": 4.307234863189917e-06,
"loss": 0.007,
"step": 618
},
{
"clip_ratio": 0.0003532402791392997,
"epoch": 0.9923814645514027,
"grad_norm": 0.0338488332927227,
"kl": 0.005165576934814453,
"learning_rate": 4.302894689946189e-06,
"loss": 0.0069,
"step": 619
},
{
"clip_ratio": 0.00035387994120128496,
"epoch": 0.9942935675659527,
"grad_norm": 0.03370453417301178,
"kl": 0.005126953125,
"learning_rate": 4.298543164994249e-06,
"loss": 0.0069,
"step": 620
},
{
"clip_ratio": 0.0,
"completion_length": 526.433337688446,
"epoch": 1.00191210301455,
"grad_norm": 0.0355641208589077,
"kl": 0.004958152770996094,
"learning_rate": 4.294180315732946e-06,
"loss": 0.0063,
"num_tokens": 380233970.0,
"reward": 0.05412946696742438,
"reward_std": 0.06637858302565292,
"rewards/pure_accuracy_reward_math": 0.0541294657450635,
"step": 621
},
{
"clip_ratio": 0.0002793830541349962,
"epoch": 1.0038242060290998,
"grad_norm": 0.034697938710451126,
"kl": 0.004967689514160156,
"learning_rate": 4.289806169632434e-06,
"loss": 0.0063,
"step": 622
},
{
"clip_ratio": 0.00026950584020823953,
"epoch": 1.0057363090436497,
"grad_norm": 0.034267228096723557,
"kl": 0.005029439926147461,
"learning_rate": 4.285420754233992e-06,
"loss": 0.0062,
"step": 623
},
{
"clip_ratio": 0.0002694177366606709,
"epoch": 1.0076484120581997,
"grad_norm": 0.03245500102639198,
"kl": 0.005047798156738281,
"learning_rate": 4.2810240971498594e-06,
"loss": 0.0062,
"step": 624
},
{
"clip_ratio": 0.0002762260926942872,
"epoch": 1.0095605150727496,
"grad_norm": 0.03143523633480072,
"kl": 0.005035400390625,
"learning_rate": 4.276616226063055e-06,
"loss": 0.0061,
"step": 625
},
{
"clip_ratio": 0.0,
"completion_length": 528.094889163971,
"epoch": 1.0114726180872995,
"grad_norm": 0.03780335932970047,
"kl": 0.005240440368652344,
"learning_rate": 4.272197168727204e-06,
"loss": 0.0082,
"num_tokens": 383858818.0,
"reward": 0.06891741388244554,
"reward_std": 0.07891435397323221,
"rewards/pure_accuracy_reward_math": 0.06891741207800806,
"step": 626
},
{
"clip_ratio": 0.0002971897219481434,
"epoch": 1.0133847211018494,
"grad_norm": 0.03676832467317581,
"kl": 0.005240440368652344,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0082,
"step": 627
},
{
"clip_ratio": 0.00032256075144232454,
"epoch": 1.0152968241163993,
"grad_norm": 0.03722486272454262,
"kl": 0.005322933197021484,
"learning_rate": 4.263325606674865e-06,
"loss": 0.0082,
"step": 628
},
{
"clip_ratio": 0.00031109488622860226,
"epoch": 1.0172089271309492,
"grad_norm": 0.036808740347623825,
"kl": 0.0054111480712890625,
"learning_rate": 4.258873157817093e-06,
"loss": 0.0081,
"step": 629
},
{
"clip_ratio": 0.00032292150183366175,
"epoch": 1.0191210301454992,
"grad_norm": 0.03518703579902649,
"kl": 0.005442619323730469,
"learning_rate": 4.254409634427356e-06,
"loss": 0.008,
"step": 630
},
{
"clip_ratio": 0.0,
"completion_length": 515.6958961486816,
"epoch": 1.021033133160049,
"grad_norm": 0.03399791195988655,
"kl": 0.005387306213378906,
"learning_rate": 4.249935064609692e-06,
"loss": 0.0031,
"num_tokens": 387438928.0,
"reward": 0.06250000285217538,
"reward_std": 0.06757478544022888,
"rewards/pure_accuracy_reward_math": 0.06250000145519152,
"step": 631
},
{
"clip_ratio": 0.0002553542814212051,
"epoch": 1.022945236174599,
"grad_norm": 0.03381386399269104,
"kl": 0.005375385284423828,
"learning_rate": 4.245449476537685e-06,
"loss": 0.0031,
"step": 632
},
{
"clip_ratio": 0.00023506408626872144,
"epoch": 1.024857339189149,
"grad_norm": 0.03337083011865616,
"kl": 0.00537109375,
"learning_rate": 4.2409528984543e-06,
"loss": 0.003,
"step": 633
},
{
"clip_ratio": 0.0002632986112871549,
"epoch": 1.0267694422036988,
"grad_norm": 0.03213095664978027,
"kl": 0.005321979522705078,
"learning_rate": 4.236445358671696e-06,
"loss": 0.003,
"step": 634
},
{
"clip_ratio": 0.00025607587781451,
"epoch": 1.0286815452182487,
"grad_norm": 0.03154142573475838,
"kl": 0.005255699157714844,
"learning_rate": 4.23192688557105e-06,
"loss": 0.0029,
"step": 635
},
{
"clip_ratio": 0.0,
"completion_length": 524.272346496582,
"epoch": 1.0305936482327986,
"grad_norm": 0.039318569004535675,
"kl": 0.005155801773071289,
"learning_rate": 4.2273975076023835e-06,
"loss": 0.0075,
"num_tokens": 391053556.0,
"reward": 0.06473214598372579,
"reward_std": 0.07401842583203688,
"rewards/pure_accuracy_reward_math": 0.06473214412108064,
"step": 636
},
{
"clip_ratio": 0.0003024499371804268,
"epoch": 1.0325057512473486,
"grad_norm": 0.03726111724972725,
"kl": 0.0050776004791259766,
"learning_rate": 4.222857253284376e-06,
"loss": 0.0075,
"step": 637
},
{
"clip_ratio": 0.0003151753968495541,
"epoch": 1.0344178542618985,
"grad_norm": 0.03595959022641182,
"kl": 0.005060434341430664,
"learning_rate": 4.218306151204188e-06,
"loss": 0.0074,
"step": 638
},
{
"clip_ratio": 0.0003387899199083222,
"epoch": 1.0363299572764482,
"grad_norm": 0.03628028184175491,
"kl": 0.005034923553466797,
"learning_rate": 4.213744230017283e-06,
"loss": 0.0074,
"step": 639
},
{
"clip_ratio": 0.00037899152403042535,
"epoch": 1.038242060290998,
"grad_norm": 0.03670131787657738,
"kl": 0.005095720291137695,
"learning_rate": 4.209171518447248e-06,
"loss": 0.0073,
"step": 640
},
{
"clip_ratio": 0.0,
"completion_length": 536.5907049179077,
"epoch": 1.040154163305548,
"grad_norm": 0.03938442841172218,
"kl": 0.0051763057708740234,
"learning_rate": 4.204588045285607e-06,
"loss": 0.0022,
"num_tokens": 394708581.0,
"reward": 0.06333705710130744,
"reward_std": 0.07792467664694414,
"rewards/pure_accuracy_reward_math": 0.06333705500583164,
"step": 641
},
{
"clip_ratio": 0.0002767174905216052,
"epoch": 1.042066266320098,
"grad_norm": 0.037835828959941864,
"kl": 0.005267143249511719,
"learning_rate": 4.1999938393916424e-06,
"loss": 0.0022,
"step": 642
},
{
"clip_ratio": 0.0003277845591469486,
"epoch": 1.0439783693346478,
"grad_norm": 0.03832162916660309,
"kl": 0.005464792251586914,
"learning_rate": 4.195388929692217e-06,
"loss": 0.0022,
"step": 643
},
{
"clip_ratio": 0.00035426640954483446,
"epoch": 1.0458904723491977,
"grad_norm": 0.03823033347725868,
"kl": 0.005482673645019531,
"learning_rate": 4.190773345181587e-06,
"loss": 0.0021,
"step": 644
},
{
"clip_ratio": 0.0003763593267649412,
"epoch": 1.0478025753637477,
"grad_norm": 0.036984797567129135,
"kl": 0.005467653274536133,
"learning_rate": 4.186147114921221e-06,
"loss": 0.002,
"step": 645
},
{
"clip_ratio": 0.0,
"completion_length": 528.9266424179077,
"epoch": 1.0497146783782976,
"grad_norm": 0.0355878509581089,
"kl": 0.005333423614501953,
"learning_rate": 4.18151026803962e-06,
"loss": 0.0056,
"num_tokens": 398334618.0,
"reward": 0.06305803850409575,
"reward_std": 0.06942774693015963,
"rewards/pure_accuracy_reward_math": 0.06305803699069656,
"step": 646
},
{
"clip_ratio": 0.00024814905674475085,
"epoch": 1.0516267813928475,
"grad_norm": 0.034741513431072235,
"kl": 0.005269289016723633,
"learning_rate": 4.176862833732127e-06,
"loss": 0.0056,
"step": 647
},
{
"clip_ratio": 0.00027503305113896204,
"epoch": 1.0535388844073974,
"grad_norm": 0.03375249356031418,
"kl": 0.005173683166503906,
"learning_rate": 4.1722048412607495e-06,
"loss": 0.0055,
"step": 648
},
{
"clip_ratio": 0.0002895867207826086,
"epoch": 1.0554509874219473,
"grad_norm": 0.0341072678565979,
"kl": 0.005132198333740234,
"learning_rate": 4.167536319953976e-06,
"loss": 0.0055,
"step": 649
},
{
"clip_ratio": 0.0003005371929134526,
"epoch": 1.0573630904364972,
"grad_norm": 0.033096957951784134,
"kl": 0.005170345306396484,
"learning_rate": 4.162857299206584e-06,
"loss": 0.0054,
"step": 650
},
{
"clip_ratio": 0.0,
"completion_length": 538.7528138160706,
"epoch": 1.0592751934510471,
"grad_norm": 0.03696604445576668,
"kl": 0.0052814483642578125,
"learning_rate": 4.158167808479461e-06,
"loss": 0.0097,
"num_tokens": 401997276.0,
"reward": 0.05943080657743849,
"reward_std": 0.07388583471765742,
"rewards/pure_accuracy_reward_math": 0.05943080494762398,
"step": 651
},
{
"clip_ratio": 0.00029416859939601636,
"epoch": 1.061187296465597,
"grad_norm": 0.03565770015120506,
"kl": 0.005290031433105469,
"learning_rate": 4.153467877299419e-06,
"loss": 0.0097,
"step": 652
},
{
"clip_ratio": 0.00029473524284640007,
"epoch": 1.063099399480147,
"grad_norm": 0.03546367585659027,
"kl": 0.005368709564208984,
"learning_rate": 4.148757535259004e-06,
"loss": 0.0096,
"step": 653
},
{
"clip_ratio": 0.00032781071104182047,
"epoch": 1.065011502494697,
"grad_norm": 0.03601039946079254,
"kl": 0.005382061004638672,
"learning_rate": 4.144036812016317e-06,
"loss": 0.0096,
"step": 654
},
{
"clip_ratio": 0.0003433626044397897,
"epoch": 1.0669236055092468,
"grad_norm": 0.035073794424533844,
"kl": 0.0053446292877197266,
"learning_rate": 4.139305737294818e-06,
"loss": 0.0095,
"step": 655
},
{
"clip_ratio": 0.0,
"completion_length": 520.1163725852966,
"epoch": 1.0688357085237967,
"grad_norm": 0.03852629289031029,
"kl": 0.005383491516113281,
"learning_rate": 4.134564340883148e-06,
"loss": 0.0083,
"num_tokens": 405593985.0,
"reward": 0.06445312793948688,
"reward_std": 0.07135464163729921,
"rewards/pure_accuracy_reward_math": 0.06445312654250301,
"step": 656
},
{
"clip_ratio": 0.0002591365355897324,
"epoch": 1.0707478115383466,
"grad_norm": 0.03745557367801666,
"kl": 0.0053327083587646484,
"learning_rate": 4.129812652634936e-06,
"loss": 0.0083,
"step": 657
},
{
"clip_ratio": 0.0003071958567772981,
"epoch": 1.0726599145528966,
"grad_norm": 0.037043727934360504,
"kl": 0.00532078742980957,
"learning_rate": 4.1250507024686115e-06,
"loss": 0.0083,
"step": 658
},
{
"clip_ratio": 0.00029935286954696494,
"epoch": 1.0745720175674465,
"grad_norm": 0.03582773730158806,
"kl": 0.005355358123779297,
"learning_rate": 4.120278520367217e-06,
"loss": 0.0082,
"step": 659
},
{
"clip_ratio": 0.0003111159166451216,
"epoch": 1.0764841205819964,
"grad_norm": 0.035313159227371216,
"kl": 0.005402326583862305,
"learning_rate": 4.115496136378219e-06,
"loss": 0.0081,
"step": 660
},
{
"clip_ratio": 0.0,
"completion_length": 509.2994108200073,
"epoch": 1.0783962235965463,
"grad_norm": 0.041104141622781754,
"kl": 0.005465507507324219,
"learning_rate": 4.110703580613321e-06,
"loss": 0.0074,
"num_tokens": 409156330.0,
"reward": 0.0641741098370403,
"reward_std": 0.08329231233801693,
"rewards/pure_accuracy_reward_math": 0.06417410826543346,
"step": 661
},
{
"clip_ratio": 0.0003218170786567498,
"epoch": 1.0803083266110962,
"grad_norm": 0.03970121592283249,
"kl": 0.005608558654785156,
"learning_rate": 4.105900883248269e-06,
"loss": 0.0074,
"step": 662
},
{
"clip_ratio": 0.00032362689415776913,
"epoch": 1.0822204296256461,
"grad_norm": 0.039676353335380554,
"kl": 0.005734920501708984,
"learning_rate": 4.101088074522667e-06,
"loss": 0.0074,
"step": 663
},
{
"clip_ratio": 0.000323468098201829,
"epoch": 1.084132532640196,
"grad_norm": 0.03883183002471924,
"kl": 0.005713939666748047,
"learning_rate": 4.096265184739781e-06,
"loss": 0.0073,
"step": 664
},
{
"clip_ratio": 0.00033196881122421473,
"epoch": 1.086044635654746,
"grad_norm": 0.037281692028045654,
"kl": 0.0056934356689453125,
"learning_rate": 4.091432244266354e-06,
"loss": 0.0072,
"step": 665
},
{
"clip_ratio": 0.0,
"completion_length": 522.48774766922,
"epoch": 1.0879567386692959,
"grad_norm": 0.037982553243637085,
"kl": 0.005854606628417969,
"learning_rate": 4.08658928353241e-06,
"loss": 0.0086,
"num_tokens": 412758914.0,
"reward": 0.06835937799769454,
"reward_std": 0.07526708859950304,
"rewards/pure_accuracy_reward_math": 0.06835937630967237,
"step": 666
},
{
"clip_ratio": 0.0002976899445457093,
"epoch": 1.0898688416838458,
"grad_norm": 0.03663322329521179,
"kl": 0.005788326263427734,
"learning_rate": 4.081736333031066e-06,
"loss": 0.0086,
"step": 667
},
{
"clip_ratio": 0.0002965517393818118,
"epoch": 1.0917809446983957,
"grad_norm": 0.03593512997031212,
"kl": 0.005764484405517578,
"learning_rate": 4.0768734233183376e-06,
"loss": 0.0085,
"step": 668
},
{
"clip_ratio": 0.0003466513953753747,
"epoch": 1.0936930477129456,
"grad_norm": 0.03643948212265968,
"kl": 0.005777835845947266,
"learning_rate": 4.072000585012947e-06,
"loss": 0.0085,
"step": 669
},
{
"clip_ratio": 0.00037185640462666925,
"epoch": 1.0956051507274955,
"grad_norm": 0.03601692244410515,
"kl": 0.0058193206787109375,
"learning_rate": 4.06711784879613e-06,
"loss": 0.0084,
"step": 670
},
{
"clip_ratio": 0.0,
"completion_length": 526.0530390739441,
"epoch": 1.0975172537420455,
"grad_norm": 0.03892623260617256,
"kl": 0.005596637725830078,
"learning_rate": 4.062225245411444e-06,
"loss": 0.007,
"num_tokens": 416383588.0,
"reward": 0.061104913387680426,
"reward_std": 0.07539348275167868,
"rewards/pure_accuracy_reward_math": 0.06110491187428124,
"step": 671
},
{
"clip_ratio": 0.0003017952032280391,
"epoch": 1.0994293567565951,
"grad_norm": 0.0375184491276741,
"kl": 0.0056912899017333984,
"learning_rate": 4.057322805664576e-06,
"loss": 0.007,
"step": 672
},
{
"clip_ratio": 0.0002928147856096075,
"epoch": 1.1013414597711453,
"grad_norm": 0.03731007128953934,
"kl": 0.0057830810546875,
"learning_rate": 4.0524105604231435e-06,
"loss": 0.0069,
"step": 673
},
{
"clip_ratio": 0.000317500726794151,
"epoch": 1.103253562785695,
"grad_norm": 0.03885798528790474,
"kl": 0.005819559097290039,
"learning_rate": 4.047488540616503e-06,
"loss": 0.0069,
"step": 674
},
{
"clip_ratio": 0.0003141532706649741,
"epoch": 1.105165665800245,
"grad_norm": 0.03583172708749771,
"kl": 0.005753278732299805,
"learning_rate": 4.042556777235558e-06,
"loss": 0.0068,
"step": 675
},
{
"clip_ratio": 0.0,
"completion_length": 523.9950060844421,
"epoch": 1.1070777688147948,
"grad_norm": 0.03652811422944069,
"kl": 0.005724668502807617,
"learning_rate": 4.037615301332559e-06,
"loss": 0.0088,
"num_tokens": 419993906.0,
"reward": 0.061383931315504014,
"reward_std": 0.07067021139664575,
"rewards/pure_accuracy_reward_math": 0.06138392974389717,
"step": 676
},
{
"clip_ratio": 0.00028260578790195723,
"epoch": 1.1089898718293447,
"grad_norm": 0.035632383078336716,
"kl": 0.0056421756744384766,
"learning_rate": 4.0326641440209114e-06,
"loss": 0.0088,
"step": 677
},
{
"clip_ratio": 0.0002882395116614589,
"epoch": 1.1109019748438946,
"grad_norm": 0.03453977406024933,
"kl": 0.005593061447143555,
"learning_rate": 4.027703336474979e-06,
"loss": 0.0087,
"step": 678
},
{
"clip_ratio": 0.000319835560901538,
"epoch": 1.1128140778584446,
"grad_norm": 0.03415689244866371,
"kl": 0.005594968795776367,
"learning_rate": 4.022732909929883e-06,
"loss": 0.0087,
"step": 679
},
{
"clip_ratio": 0.00033849146848297096,
"epoch": 1.1147261808729945,
"grad_norm": 0.03406994044780731,
"kl": 0.005631208419799805,
"learning_rate": 4.017752895681315e-06,
"loss": 0.0086,
"step": 680
},
{
"clip_ratio": 0.0,
"completion_length": 521.6057720184326,
"epoch": 1.1166382838875444,
"grad_norm": 0.06026715040206909,
"kl": 0.005751848220825195,
"learning_rate": 4.012763325085332e-06,
"loss": 0.0067,
"num_tokens": 423598941.0,
"reward": 0.07198661082657054,
"reward_std": 0.08763020345941186,
"rewards/pure_accuracy_reward_math": 0.07198660844005644,
"step": 681
},
{
"clip_ratio": 0.00031779767027728667,
"epoch": 1.1185503869020943,
"grad_norm": 2.6160011291503906,
"kl": 0.005651235580444336,
"learning_rate": 4.0077642295581605e-06,
"loss": 0.007,
"step": 682
},
{
"clip_ratio": 0.00035409004277653366,
"epoch": 1.1204624899166442,
"grad_norm": 6.490725994110107,
"kl": 0.04636049270629883,
"learning_rate": 4.002755640576002e-06,
"loss": 0.0083,
"step": 683
},
{
"clip_ratio": 0.000386831109835839,
"epoch": 1.1223745929311941,
"grad_norm": 0.13183599710464478,
"kl": 0.0063648223876953125,
"learning_rate": 3.997737589674828e-06,
"loss": 0.0067,
"step": 684
},
{
"clip_ratio": 0.00042002629169246575,
"epoch": 1.124286695945744,
"grad_norm": 61.113468170166016,
"kl": 0.00571751594543457,
"learning_rate": 3.992710108450192e-06,
"loss": 0.0205,
"step": 685
},
{
"clip_ratio": 0.0,
"completion_length": 534.679431438446,
"epoch": 1.126198798960294,
"grad_norm": 0.0341753326356411,
"kl": 0.006865501403808594,
"learning_rate": 3.987673228557017e-06,
"loss": 0.0032,
"num_tokens": 427249916.0,
"reward": 0.056919645285233855,
"reward_std": 0.06538890511728823,
"rewards/pure_accuracy_reward_math": 0.05691964429570362,
"step": 686
},
{
"clip_ratio": 0.00022898520234093667,
"epoch": 1.1281109019748439,
"grad_norm": 0.03356679156422615,
"kl": 0.006783246994018555,
"learning_rate": 3.982626981709412e-06,
"loss": 0.0032,
"step": 687
},
{
"clip_ratio": 0.00023695471924156664,
"epoch": 1.1300230049893938,
"grad_norm": 0.03283276781439781,
"kl": 0.006662845611572266,
"learning_rate": 3.977571399680457e-06,
"loss": 0.0031,
"step": 688
},
{
"clip_ratio": 0.000234549945901108,
"epoch": 1.1319351080039437,
"grad_norm": 0.032041046768426895,
"kl": 0.00657343864440918,
"learning_rate": 3.972506514302013e-06,
"loss": 0.0031,
"step": 689
},
{
"clip_ratio": 0.00026119674055280484,
"epoch": 1.1338472110184936,
"grad_norm": 0.03098335862159729,
"kl": 0.006501674652099609,
"learning_rate": 3.967432357464518e-06,
"loss": 0.003,
"step": 690
},
{
"clip_ratio": 0.0,
"completion_length": 533.4330596923828,
"epoch": 1.1357593140330435,
"grad_norm": 0.03648236393928528,
"kl": 0.005389690399169922,
"learning_rate": 3.962348961116786e-06,
"loss": 0.0075,
"num_tokens": 430894100.0,
"reward": 0.059151788300368935,
"reward_std": 0.06680402747588232,
"rewards/pure_accuracy_reward_math": 0.059151787078008056,
"step": 691
},
{
"clip_ratio": 0.00024069582485708452,
"epoch": 1.1376714170475934,
"grad_norm": 0.03502041473984718,
"kl": 0.005405902862548828,
"learning_rate": 3.957256357265806e-06,
"loss": 0.0075,
"step": 692
},
{
"clip_ratio": 0.00026108162376203836,
"epoch": 1.1395835200621434,
"grad_norm": 0.03438780456781387,
"kl": 0.0054416656494140625,
"learning_rate": 3.952154577976543e-06,
"loss": 0.0075,
"step": 693
},
{
"clip_ratio": 0.0002536772994972125,
"epoch": 1.1414956230766933,
"grad_norm": 0.03388332575559616,
"kl": 0.005480289459228516,
"learning_rate": 3.947043655371734e-06,
"loss": 0.0075,
"step": 694
},
{
"clip_ratio": 0.00027197748300977764,
"epoch": 1.1434077260912432,
"grad_norm": 0.03378571942448616,
"kl": 0.005473136901855469,
"learning_rate": 3.941923621631683e-06,
"loss": 0.0074,
"step": 695
},
{
"clip_ratio": 0.0,
"completion_length": 523.0050506591797,
"epoch": 1.145319829105793,
"grad_norm": 0.040138646960258484,
"kl": 0.005397796630859375,
"learning_rate": 3.936794508994062e-06,
"loss": 0.0033,
"num_tokens": 434502306.0,
"reward": 0.07142857456346974,
"reward_std": 0.08093377470504493,
"rewards/pure_accuracy_reward_math": 0.07142857316648588,
"step": 696
},
{
"clip_ratio": 0.00026038982610998573,
"epoch": 1.147231932120343,
"grad_norm": 0.03855022042989731,
"kl": 0.005437135696411133,
"learning_rate": 3.931656349753709e-06,
"loss": 0.0033,
"step": 697
},
{
"clip_ratio": 0.0002577857798655714,
"epoch": 1.149144035134893,
"grad_norm": 0.03805391117930412,
"kl": 0.005386829376220703,
"learning_rate": 3.9265091762624225e-06,
"loss": 0.0032,
"step": 698
},
{
"clip_ratio": 0.0002938498616913421,
"epoch": 1.1510561381494429,
"grad_norm": 0.03830750659108162,
"kl": 0.005461931228637695,
"learning_rate": 3.921353020928756e-06,
"loss": 0.0032,
"step": 699
},
{
"clip_ratio": 0.00026367085320089245,
"epoch": 1.1529682411639928,
"grad_norm": 0.03759397566318512,
"kl": 0.0055010318756103516,
"learning_rate": 3.916187916217818e-06,
"loss": 0.0031,
"step": 700
},
{
"clip_ratio": 0.0,
"completion_length": 532.7466740608215,
"epoch": 1.1548803441785427,
"grad_norm": 0.03618447855114937,
"kl": 0.0054166316986083984,
"learning_rate": 3.911013894651067e-06,
"loss": 0.0066,
"num_tokens": 438144462.0,
"reward": 0.06501116344588809,
"reward_std": 0.07457646209513769,
"rewards/pure_accuracy_reward_math": 0.06501116175786592,
"step": 701
},
{
"clip_ratio": 0.00028753443712048465,
"epoch": 1.1567924471930926,
"grad_norm": 0.035918354988098145,
"kl": 0.005413532257080078,
"learning_rate": 3.905830988806101e-06,
"loss": 0.0066,
"step": 702
},
{
"clip_ratio": 0.0002842856440565811,
"epoch": 1.1587045502076425,
"grad_norm": 0.03422370180487633,
"kl": 0.005442619323730469,
"learning_rate": 3.90063923131646e-06,
"loss": 0.0066,
"step": 703
},
{
"clip_ratio": 0.0002819241568090547,
"epoch": 1.1606166532221924,
"grad_norm": 0.03359530121088028,
"kl": 0.00537109375,
"learning_rate": 3.895438654871416e-06,
"loss": 0.0065,
"step": 704
},
{
"clip_ratio": 0.0003241457142166837,
"epoch": 1.1625287562367423,
"grad_norm": 0.033465541899204254,
"kl": 0.0053484439849853516,
"learning_rate": 3.890229292215773e-06,
"loss": 0.0065,
"step": 705
},
{
"clip_ratio": 0.0,
"completion_length": 526.7639741897583,
"epoch": 1.1644408592512923,
"grad_norm": 0.03731166943907738,
"kl": 0.00535893440246582,
"learning_rate": 3.885011176149647e-06,
"loss": 0.0071,
"num_tokens": 441760876.0,
"reward": 0.06612723506987095,
"reward_std": 0.06822534691309556,
"rewards/pure_accuracy_reward_math": 0.06612723367288709,
"step": 706
},
{
"clip_ratio": 0.00025104734473302415,
"epoch": 1.166352962265842,
"grad_norm": 0.03429851680994034,
"kl": 0.005263566970825195,
"learning_rate": 3.879784339528277e-06,
"loss": 0.0071,
"step": 707
},
{
"clip_ratio": 0.0002501190919019791,
"epoch": 1.168265065280392,
"grad_norm": 0.034958597272634506,
"kl": 0.0052831172943115234,
"learning_rate": 3.874548815261809e-06,
"loss": 0.0071,
"step": 708
},
{
"clip_ratio": 0.0002633173795629773,
"epoch": 1.1701771682949418,
"grad_norm": 0.032111622393131256,
"kl": 0.005318403244018555,
"learning_rate": 3.869304636315085e-06,
"loss": 0.007,
"step": 709
},
{
"clip_ratio": 0.00028521847832507774,
"epoch": 1.172089271309492,
"grad_norm": 0.03191748261451721,
"kl": 0.005407810211181641,
"learning_rate": 3.864051835707444e-06,
"loss": 0.007,
"step": 710
},
{
"clip_ratio": 0.0,
"completion_length": 522.3457269668579,
"epoch": 1.1740013743240416,
"grad_norm": 0.05126773193478584,
"kl": 0.01187896728515625,
"learning_rate": 3.85879044651251e-06,
"loss": 0.0066,
"num_tokens": 445370959.0,
"reward": 0.06863839653669856,
"reward_std": 0.07951865292852744,
"rewards/pure_accuracy_reward_math": 0.06863839438301511,
"step": 711
},
{
"clip_ratio": 0.00028669004558423694,
"epoch": 1.1759134773385915,
"grad_norm": 0.051731474697589874,
"kl": 0.011458396911621094,
"learning_rate": 3.853520501857981e-06,
"loss": 0.0066,
"step": 712
},
{
"clip_ratio": 0.0003143258599038745,
"epoch": 1.1778255803531414,
"grad_norm": 0.051190439611673355,
"kl": 0.010621786117553711,
"learning_rate": 3.848242034925429e-06,
"loss": 0.0065,
"step": 713
},
{
"clip_ratio": 0.00033165596249773444,
"epoch": 1.1797376833676914,
"grad_norm": 0.04840007424354553,
"kl": 0.009693622589111328,
"learning_rate": 3.842955078950079e-06,
"loss": 0.0064,
"step": 714
},
{
"clip_ratio": 0.00035113433239075675,
"epoch": 1.1816497863822413,
"grad_norm": 0.048264067620038986,
"kl": 0.008889198303222656,
"learning_rate": 3.837659667220612e-06,
"loss": 0.0063,
"step": 715
},
{
"clip_ratio": 0.0,
"completion_length": 547.5633645057678,
"epoch": 1.1835618893967912,
"grad_norm": 0.03458649665117264,
"kl": 0.005284786224365234,
"learning_rate": 3.832355833078945e-06,
"loss": 0.0047,
"num_tokens": 449069046.0,
"reward": 0.05691964572179131,
"reward_std": 0.06861072586616501,
"rewards/pure_accuracy_reward_math": 0.05691964415018447,
"step": 716
},
{
"clip_ratio": 0.0002876185501463624,
"epoch": 1.185473992411341,
"grad_norm": 0.033646877855062485,
"kl": 0.005215167999267578,
"learning_rate": 3.82704360992003e-06,
"loss": 0.0047,
"step": 717
},
{
"clip_ratio": 0.0003252235952686533,
"epoch": 1.187386095425891,
"grad_norm": 0.03455204889178276,
"kl": 0.0051419734954833984,
"learning_rate": 3.8217230311916365e-06,
"loss": 0.0046,
"step": 718
},
{
"clip_ratio": 0.0003351885409870192,
"epoch": 1.189298198440441,
"grad_norm": 0.033362697809934616,
"kl": 0.0050907135009765625,
"learning_rate": 3.816394130394142e-06,
"loss": 0.0046,
"step": 719
},
{
"clip_ratio": 0.00032723310141591355,
"epoch": 1.1912103014549908,
"grad_norm": 0.03211547061800957,
"kl": 0.0051004886627197266,
"learning_rate": 3.811056941080329e-06,
"loss": 0.0045,
"step": 720
},
{
"clip_ratio": 0.0,
"completion_length": 537.3167090415955,
"epoch": 1.1931224044695408,
"grad_norm": 0.03566175699234009,
"kl": 0.0053424835205078125,
"learning_rate": 3.805711496855161e-06,
"loss": 0.009,
"num_tokens": 452726381.0,
"reward": 0.06054687776486389,
"reward_std": 0.07264336961088702,
"rewards/pure_accuracy_reward_math": 0.06054687677533366,
"step": 721
},
{
"clip_ratio": 0.00029346574888222676,
"epoch": 1.1950345074840907,
"grad_norm": 0.03476826474070549,
"kl": 0.005379438400268555,
"learning_rate": 3.800357831375583e-06,
"loss": 0.009,
"step": 722
},
{
"clip_ratio": 0.00027920183202923,
"epoch": 1.1969466104986406,
"grad_norm": 0.03446114435791969,
"kl": 0.005425691604614258,
"learning_rate": 3.794995978350301e-06,
"loss": 0.009,
"step": 723
},
{
"clip_ratio": 0.00031396149876172785,
"epoch": 1.1988587135131905,
"grad_norm": 0.0340140238404274,
"kl": 0.005489826202392578,
"learning_rate": 3.7896259715395727e-06,
"loss": 0.0089,
"step": 724
},
{
"clip_ratio": 0.0002986833567888425,
"epoch": 1.2007708165277404,
"grad_norm": 0.03497212752699852,
"kl": 0.005522489547729492,
"learning_rate": 3.784247844754997e-06,
"loss": 0.0088,
"step": 725
},
{
"clip_ratio": 0.0,
"completion_length": 548.8044338226318,
"epoch": 1.2026829195422903,
"grad_norm": 0.04050953686237335,
"kl": 0.005362510681152344,
"learning_rate": 3.778861631859298e-06,
"loss": 0.0112,
"num_tokens": 456433388.0,
"reward": 0.06696428879513405,
"reward_std": 0.08140548242954537,
"rewards/pure_accuracy_reward_math": 0.06696428728173487,
"step": 726
},
{
"clip_ratio": 0.0003468562302373357,
"epoch": 1.2045950225568403,
"grad_norm": 0.03805195167660713,
"kl": 0.005377531051635742,
"learning_rate": 3.7734673667661133e-06,
"loss": 0.0112,
"step": 727
},
{
"clip_ratio": 0.00037477223943938043,
"epoch": 1.2065071255713902,
"grad_norm": 0.03666882589459419,
"kl": 0.005417585372924805,
"learning_rate": 3.7680650834397804e-06,
"loss": 0.0112,
"step": 728
},
{
"clip_ratio": 0.0003945930936311015,
"epoch": 1.20841922858594,
"grad_norm": 0.03651399165391922,
"kl": 0.005425453186035156,
"learning_rate": 3.762654815895122e-06,
"loss": 0.0111,
"step": 729
},
{
"clip_ratio": 0.0004650242010484362,
"epoch": 1.21033133160049,
"grad_norm": 0.03792130947113037,
"kl": 0.005422115325927734,
"learning_rate": 3.7572365981972335e-06,
"loss": 0.0111,
"step": 730
},
{
"clip_ratio": 0.0,
"completion_length": 528.6861305236816,
"epoch": 1.21224343461504,
"grad_norm": 0.0365571565926075,
"kl": 0.005487203598022461,
"learning_rate": 3.7518104644612663e-06,
"loss": 0.0098,
"num_tokens": 460061367.0,
"reward": 0.06417411062284373,
"reward_std": 0.07478918455308303,
"rewards/pure_accuracy_reward_math": 0.06417410852736793,
"step": 731
},
{
"clip_ratio": 0.0002798708824229834,
"epoch": 1.2141555376295898,
"grad_norm": 0.036456115543842316,
"kl": 0.005484342575073242,
"learning_rate": 3.746376448852216e-06,
"loss": 0.0098,
"step": 732
},
{
"clip_ratio": 0.0003001830394850913,
"epoch": 1.2160676406441397,
"grad_norm": 0.036120470613241196,
"kl": 0.005544900894165039,
"learning_rate": 3.740934585584702e-06,
"loss": 0.0098,
"step": 733
},
{
"clip_ratio": 0.00028155883609315424,
"epoch": 1.2179797436586897,
"grad_norm": 0.03475060313940048,
"kl": 0.005614042282104492,
"learning_rate": 3.735484908922759e-06,
"loss": 0.0097,
"step": 734
},
{
"clip_ratio": 0.00027523975251142474,
"epoch": 1.2198918466732396,
"grad_norm": 0.03388671204447746,
"kl": 0.005706310272216797,
"learning_rate": 3.730027453179617e-06,
"loss": 0.0096,
"step": 735
},
{
"clip_ratio": 0.0,
"completion_length": 518.6091203689575,
"epoch": 1.2218039496877895,
"grad_norm": 0.039098870009183884,
"kl": 0.005930900573730469,
"learning_rate": 3.7245622527174858e-06,
"loss": 0.0072,
"num_tokens": 463651718.0,
"reward": 0.06277902098372579,
"reward_std": 0.06552149687195197,
"rewards/pure_accuracy_reward_math": 0.06277901912108064,
"step": 736
},
{
"clip_ratio": 0.000267848483247235,
"epoch": 1.2237160527023394,
"grad_norm": 0.03896670043468475,
"kl": 0.005952358245849609,
"learning_rate": 3.719089341947337e-06,
"loss": 0.0072,
"step": 737
},
{
"clip_ratio": 0.00026333254504606884,
"epoch": 1.2256281557168893,
"grad_norm": 0.03838280960917473,
"kl": 0.005873680114746094,
"learning_rate": 3.7136087553286916e-06,
"loss": 0.0072,
"step": 738
},
{
"clip_ratio": 0.0002850479507969794,
"epoch": 1.2275402587314392,
"grad_norm": 0.03708336502313614,
"kl": 0.005741596221923828,
"learning_rate": 3.7081205273694005e-06,
"loss": 0.0071,
"step": 739
},
{
"clip_ratio": 0.00030947004142944934,
"epoch": 1.2294523617459892,
"grad_norm": 0.03616032376885414,
"kl": 0.005689144134521484,
"learning_rate": 3.702624692625427e-06,
"loss": 0.007,
"step": 740
},
{
"clip_ratio": 0.0,
"completion_length": 515.3027577400208,
"epoch": 1.231364464760539,
"grad_norm": 473.16009521484375,
"kl": 7.4117608070373535,
"learning_rate": 3.6971212857006277e-06,
"loss": 0.3027,
"num_tokens": 467231411.0,
"reward": 0.07003348527359776,
"reward_std": 0.07058388437144458,
"rewards/pure_accuracy_reward_math": 0.07003348364378326,
"step": 741
},
{
"clip_ratio": 0.00048789031319529386,
"epoch": 1.2332765677750888,
"grad_norm": 15.009349822998047,
"kl": 0.3277552127838135,
"learning_rate": 3.6916103412465405e-06,
"loss": 0.0207,
"step": 742
},
{
"clip_ratio": 0.0005436847095552366,
"epoch": 1.235188670789639,
"grad_norm": 34.010345458984375,
"kl": 0.01839423179626465,
"learning_rate": 3.6860918939621586e-06,
"loss": 0.0299,
"step": 743
},
{
"clip_ratio": 0.000597593801558105,
"epoch": 1.2371007738041886,
"grad_norm": 13.507566452026367,
"kl": 0.02814960479736328,
"learning_rate": 3.6805659785937176e-06,
"loss": 0.0188,
"step": 744
},
{
"clip_ratio": 0.0005609532486232638,
"epoch": 1.2390128768187387,
"grad_norm": 6.263442516326904,
"kl": 0.20073914527893066,
"learning_rate": 3.675032629934475e-06,
"loss": 0.0163,
"step": 745
},
{
"clip_ratio": 0.0,
"completion_length": 530.3340101242065,
"epoch": 1.2409249798332884,
"grad_norm": 0.051358480006456375,
"kl": 0.0063626766204833984,
"learning_rate": 3.6694918828244923e-06,
"loss": 0.0095,
"num_tokens": 470866344.0,
"reward": 0.06333705666474998,
"reward_std": 0.07530095760012046,
"rewards/pure_accuracy_reward_math": 0.06333705509314314,
"step": 746
},
{
"clip_ratio": 0.00029982604212364095,
"epoch": 1.2428370828478383,
"grad_norm": 0.03713027015328407,
"kl": 0.006081342697143555,
"learning_rate": 3.6639437721504108e-06,
"loss": 0.0095,
"step": 747
},
{
"clip_ratio": 0.0002941023938660692,
"epoch": 1.2447491858623883,
"grad_norm": 0.03500093147158623,
"kl": 0.006156444549560547,
"learning_rate": 3.65838833284524e-06,
"loss": 0.0095,
"step": 748
},
{
"clip_ratio": 0.0002858027814340858,
"epoch": 1.2466612888769382,
"grad_norm": 0.03525420278310776,
"kl": 0.006234169006347656,
"learning_rate": 3.652825599888129e-06,
"loss": 0.0094,
"step": 749
},
{
"clip_ratio": 0.0002950350276478275,
"epoch": 1.248573391891488,
"grad_norm": 0.03545543923974037,
"kl": 0.006281852722167969,
"learning_rate": 3.647255608304154e-06,
"loss": 0.0093,
"step": 750
},
{
"clip_ratio": 0.0,
"completion_length": 530.79438829422,
"epoch": 1.250485494906038,
"grad_norm": 0.03711007162928581,
"kl": 0.005670070648193359,
"learning_rate": 3.641678393164092e-06,
"loss": 0.0131,
"num_tokens": 474505191.0,
"reward": 0.07170759318978526,
"reward_std": 0.07251697574974969,
"rewards/pure_accuracy_reward_math": 0.0717075907450635,
"step": 751
},
{
"clip_ratio": 0.00029345202176500607,
"epoch": 1.252397597920588,
"grad_norm": 0.036423034965991974,
"kl": 0.005608320236206055,
"learning_rate": 3.636093989584204e-06,
"loss": 0.0131,
"step": 752
},
{
"clip_ratio": 0.00030187425932126644,
"epoch": 1.2543097009351378,
"grad_norm": 0.03613322973251343,
"kl": 0.005610466003417969,
"learning_rate": 3.630502432726012e-06,
"loss": 0.013,
"step": 753
},
{
"clip_ratio": 0.0003275847485610939,
"epoch": 1.2562218039496877,
"grad_norm": 0.03452349826693535,
"kl": 0.0057184696197509766,
"learning_rate": 3.6249037577960744e-06,
"loss": 0.013,
"step": 754
},
{
"clip_ratio": 0.00034663524741063156,
"epoch": 1.2581339069642377,
"grad_norm": 0.034864939749240875,
"kl": 0.005825996398925781,
"learning_rate": 3.619298000045773e-06,
"loss": 0.0129,
"step": 755
},
{
"clip_ratio": 0.0,
"completion_length": 495.8814425468445,
"epoch": 1.2600460099787876,
"grad_norm": 528.279052734375,
"kl": 9.193241596221924,
"learning_rate": 3.6136851947710804e-06,
"loss": 0.3749,
"num_tokens": 478011678.0,
"reward": 0.07979911071015522,
"reward_std": 0.07470905361697078,
"rewards/pure_accuracy_reward_math": 0.0797991082072258,
"step": 756
},
{
"clip_ratio": 0.00028275052295612113,
"epoch": 1.2619581129933375,
"grad_norm": 44.662696838378906,
"kl": 1.2635960578918457,
"learning_rate": 3.608065377312348e-06,
"loss": 0.057,
"step": 757
},
{
"clip_ratio": 0.00029553008619132015,
"epoch": 1.2638702160078874,
"grad_norm": 4.775911808013916,
"kl": 0.1474595069885254,
"learning_rate": 3.6024385830540758e-06,
"loss": 0.0123,
"step": 758
},
{
"clip_ratio": 0.00033371773997714627,
"epoch": 1.2657823190224373,
"grad_norm": 0.30982905626296997,
"kl": 0.01830148696899414,
"learning_rate": 3.5968048474246925e-06,
"loss": 0.0071,
"step": 759
},
{
"clip_ratio": 0.0003257711730952906,
"epoch": 1.2676944220369872,
"grad_norm": 0.05356259644031525,
"kl": 0.011959552764892578,
"learning_rate": 3.591164205896332e-06,
"loss": 0.0068,
"step": 760
},
{
"clip_ratio": 0.0,
"completion_length": 519.9149203300476,
"epoch": 1.2696065250515371,
"grad_norm": 0.04138460382819176,
"kl": 0.00600886344909668,
"learning_rate": 3.585516693984612e-06,
"loss": 0.0061,
"num_tokens": 481610981.0,
"reward": 0.07059152136207558,
"reward_std": 0.07616424100706354,
"rewards/pure_accuracy_reward_math": 0.07059151938301511,
"step": 761
},
{
"clip_ratio": 0.00029173931721970803,
"epoch": 1.271518628066087,
"grad_norm": 0.04057340323925018,
"kl": 0.0059850215911865234,
"learning_rate": 3.5798623472484074e-06,
"loss": 0.006,
"step": 762
},
{
"clip_ratio": 0.00031361054851686276,
"epoch": 1.273430731080637,
"grad_norm": 0.0383637472987175,
"kl": 0.005931377410888672,
"learning_rate": 3.5742012012896273e-06,
"loss": 0.006,
"step": 763
},
{
"clip_ratio": 0.000302841177983737,
"epoch": 1.275342834095187,
"grad_norm": 0.037009891122579575,
"kl": 0.005960226058959961,
"learning_rate": 3.5685332917529936e-06,
"loss": 0.0059,
"step": 764
},
{
"clip_ratio": 0.00032496250122449055,
"epoch": 1.2772549371097368,
"grad_norm": 0.036052413284778595,
"kl": 0.0060160160064697266,
"learning_rate": 3.5628586543258116e-06,
"loss": 0.0058,
"step": 765
},
{
"clip_ratio": 0.0,
"completion_length": 505.19645166397095,
"epoch": 1.2791670401242867,
"grad_norm": 0.039108723402023315,
"kl": 0.0060214996337890625,
"learning_rate": 3.5571773247377495e-06,
"loss": 0.0077,
"num_tokens": 485155493.0,
"reward": 0.06473214537254535,
"reward_std": 0.07595151849091053,
"rewards/pure_accuracy_reward_math": 0.06473214438301511,
"step": 766
},
{
"clip_ratio": 0.00031215936860462534,
"epoch": 1.2810791431388366,
"grad_norm": 0.03890209272503853,
"kl": 0.0060939788818359375,
"learning_rate": 3.5514893387606113e-06,
"loss": 0.0078,
"step": 767
},
{
"clip_ratio": 0.00029648321913100517,
"epoch": 1.2829912461533866,
"grad_norm": 0.038266174495220184,
"kl": 0.0061397552490234375,
"learning_rate": 3.5457947322081126e-06,
"loss": 0.0077,
"step": 768
},
{
"clip_ratio": 0.0002988063008615427,
"epoch": 1.2849033491679365,
"grad_norm": 0.03760776296257973,
"kl": 0.006152629852294922,
"learning_rate": 3.5400935409356534e-06,
"loss": 0.0076,
"step": 769
},
{
"clip_ratio": 0.00032748817852734646,
"epoch": 1.2868154521824864,
"grad_norm": 0.037058234214782715,
"kl": 0.006194591522216797,
"learning_rate": 3.5343858008400955e-06,
"loss": 0.0076,
"step": 770
},
{
"clip_ratio": 0.0,
"completion_length": 513.085681438446,
"epoch": 1.2887275551970363,
"grad_norm": 0.04272163286805153,
"kl": 0.006904125213623047,
"learning_rate": 3.5286715478595335e-06,
"loss": 0.0066,
"num_tokens": 488731916.0,
"reward": 0.06668527112924494,
"reward_std": 0.07779828266939148,
"rewards/pure_accuracy_reward_math": 0.0666852695576381,
"step": 771
},
{
"clip_ratio": 0.0002989328136209224,
"epoch": 1.2906396582115862,
"grad_norm": 0.039898019284009933,
"kl": 0.006760597229003906,
"learning_rate": 3.52295081797307e-06,
"loss": 0.0066,
"step": 772
},
{
"clip_ratio": 0.0003237332452385999,
"epoch": 1.2925517612261361,
"grad_norm": 0.0380416214466095,
"kl": 0.006653547286987305,
"learning_rate": 3.5172236472005866e-06,
"loss": 0.0065,
"step": 773
},
{
"clip_ratio": 0.0004160679777100995,
"epoch": 1.294463864240686,
"grad_norm": 0.03860335052013397,
"kl": 0.006639003753662109,
"learning_rate": 3.511490071602523e-06,
"loss": 0.0065,
"step": 774
},
{
"clip_ratio": 0.0004345110206713798,
"epoch": 1.2963759672552357,
"grad_norm": 0.0405069962143898,
"kl": 0.006697654724121094,
"learning_rate": 3.505750127279643e-06,
"loss": 0.0064,
"step": 775
},
{
"clip_ratio": 0.0,
"completion_length": 529.7695565223694,
"epoch": 1.2982880702697859,
"grad_norm": 0.040585048496723175,
"kl": 0.006101369857788086,
"learning_rate": 3.500003850372811e-06,
"loss": 0.0043,
"num_tokens": 492363370.0,
"reward": 0.07477678926079534,
"reward_std": 0.08466117118950933,
"rewards/pure_accuracy_reward_math": 0.07477678704890423,
"step": 776
},
{
"clip_ratio": 0.0003347315081327906,
"epoch": 1.3002001732843356,
"grad_norm": 0.039613205939531326,
"kl": 0.0060977935791015625,
"learning_rate": 3.4942512770627655e-06,
"loss": 0.0043,
"step": 777
},
{
"clip_ratio": 0.0003803396672310555,
"epoch": 1.3021122762988857,
"grad_norm": 0.03965132310986519,
"kl": 0.006110668182373047,
"learning_rate": 3.4884924435698875e-06,
"loss": 0.0042,
"step": 778
},
{
"clip_ratio": 0.00035469116983222193,
"epoch": 1.3040243793134354,
"grad_norm": 0.038701362907886505,
"kl": 0.005974292755126953,
"learning_rate": 3.482727386153974e-06,
"loss": 0.0041,
"step": 779
},
{
"clip_ratio": 0.00038596760680320585,
"epoch": 1.3059364823279855,
"grad_norm": 0.03767050802707672,
"kl": 0.0059070587158203125,
"learning_rate": 3.4769561411140123e-06,
"loss": 0.0041,
"step": 780
},
{
"clip_ratio": 0.0,
"completion_length": 528.3593993186951,
"epoch": 1.3078485853425352,
"grad_norm": 0.04520969092845917,
"kl": 0.015022039413452148,
"learning_rate": 3.471178744787948e-06,
"loss": 0.0107,
"num_tokens": 495988466.0,
"reward": 0.07449777098372579,
"reward_std": 0.08161820413079113,
"rewards/pure_accuracy_reward_math": 0.07449777016881853,
"step": 781
},
{
"clip_ratio": 0.00032587463357458546,
"epoch": 1.3097606883570854,
"grad_norm": 0.04337235167622566,
"kl": 0.01485586166381836,
"learning_rate": 3.465395233552458e-06,
"loss": 0.0107,
"step": 782
},
{
"clip_ratio": 0.00031156001216459117,
"epoch": 1.311672791371635,
"grad_norm": 0.04306100681424141,
"kl": 0.014668941497802734,
"learning_rate": 3.459605643822721e-06,
"loss": 0.0106,
"step": 783
},
{
"clip_ratio": 0.00031179932597069637,
"epoch": 1.313584894386185,
"grad_norm": 0.04292943701148033,
"kl": 0.014333724975585938,
"learning_rate": 3.4538100120521884e-06,
"loss": 0.0106,
"step": 784
},
{
"clip_ratio": 0.00034586368491318353,
"epoch": 1.315496997400735,
"grad_norm": 0.04207218065857887,
"kl": 0.013885498046875,
"learning_rate": 3.4480083747323527e-06,
"loss": 0.0105,
"step": 785
},
{
"clip_ratio": 0.0,
"completion_length": 521.3471217155457,
"epoch": 1.3174091004152848,
"grad_norm": 0.04057139530777931,
"kl": 0.006026268005371094,
"learning_rate": 3.4422007683925224e-06,
"loss": 0.0119,
"num_tokens": 499590878.0,
"reward": 0.08091518239234574,
"reward_std": 0.08763020328478888,
"rewards/pure_accuracy_reward_math": 0.08091518023866229,
"step": 786
},
{
"clip_ratio": 0.00030802900647586284,
"epoch": 1.3193212034298347,
"grad_norm": 0.039306215941905975,
"kl": 0.00603485107421875,
"learning_rate": 3.436387229599587e-06,
"loss": 0.0119,
"step": 787
},
{
"clip_ratio": 0.00034579116845634417,
"epoch": 1.3212333064443846,
"grad_norm": 0.03839893266558647,
"kl": 0.006104469299316406,
"learning_rate": 3.4305677949577915e-06,
"loss": 0.0118,
"step": 788
},
{
"clip_ratio": 0.00036078316020393686,
"epoch": 1.3231454094589346,
"grad_norm": 0.03700988367199898,
"kl": 0.006115436553955078,
"learning_rate": 3.4247425011084993e-06,
"loss": 0.0118,
"step": 789
},
{
"clip_ratio": 0.0003916456239494437,
"epoch": 1.3250575124734845,
"grad_norm": 0.03749685734510422,
"kl": 0.006115436553955078,
"learning_rate": 3.418911384729971e-06,
"loss": 0.0117,
"step": 790
},
{
"clip_ratio": 0.0,
"completion_length": 502.7112407684326,
"epoch": 1.3269696154880344,
"grad_norm": 0.03917763754725456,
"kl": 0.009302139282226562,
"learning_rate": 3.413074482537123e-06,
"loss": 0.0077,
"num_tokens": 503128079.0,
"reward": 0.07059152112924494,
"reward_std": 0.07702752505429089,
"rewards/pure_accuracy_reward_math": 0.07059151944122277,
"step": 791
},
{
"clip_ratio": 0.0002787132019079763,
"epoch": 1.3288817185025843,
"grad_norm": 0.03894754871726036,
"kl": 0.009203910827636719,
"learning_rate": 3.4072318312813044e-06,
"loss": 0.0077,
"step": 792
},
{
"clip_ratio": 0.00031091465683630304,
"epoch": 1.3307938215171342,
"grad_norm": 0.03774462640285492,
"kl": 0.008921146392822266,
"learning_rate": 3.4013834677500612e-06,
"loss": 0.0077,
"step": 793
},
{
"clip_ratio": 0.00030987418773520403,
"epoch": 1.3327059245316841,
"grad_norm": 0.03737964481115341,
"kl": 0.008791923522949219,
"learning_rate": 3.395529428766907e-06,
"loss": 0.0076,
"step": 794
},
{
"clip_ratio": 0.0003597256319380904,
"epoch": 1.334618027546234,
"grad_norm": 0.03793202340602875,
"kl": 0.008593559265136719,
"learning_rate": 3.3896697511910898e-06,
"loss": 0.0075,
"step": 795
},
{
"clip_ratio": 0.0,
"completion_length": 516.8552160263062,
"epoch": 1.336530130560784,
"grad_norm": 0.03877223655581474,
"kl": 0.005873441696166992,
"learning_rate": 3.3838044719173603e-06,
"loss": 0.0086,
"num_tokens": 506711636.0,
"reward": 0.06529018195578828,
"reward_std": 0.06942774722119793,
"rewards/pure_accuracy_reward_math": 0.06529017997672781,
"step": 796
},
{
"clip_ratio": 0.0002862633294853367,
"epoch": 1.3384422335753339,
"grad_norm": 0.0376199446618557,
"kl": 0.005820274353027344,
"learning_rate": 3.377933627875739e-06,
"loss": 0.0086,
"step": 797
},
{
"clip_ratio": 0.0002861461452994263,
"epoch": 1.3403543365898838,
"grad_norm": 0.036890070885419846,
"kl": 0.005822658538818359,
"learning_rate": 3.3720572560312854e-06,
"loss": 0.0086,
"step": 798
},
{
"clip_ratio": 0.0003201163677317709,
"epoch": 1.3422664396044337,
"grad_norm": 0.03669756278395653,
"kl": 0.005821704864501953,
"learning_rate": 3.366175393383863e-06,
"loss": 0.0085,
"step": 799
},
{
"clip_ratio": 0.0003494162402830625,
"epoch": 1.3441785426189836,
"grad_norm": 0.03721420839428902,
"kl": 0.005818843841552734,
"learning_rate": 3.360288076967909e-06,
"loss": 0.0084,
"step": 800
},
{
"clip_ratio": 0.0,
"completion_length": 505.6105146408081,
"epoch": 1.3460906456335335,
"grad_norm": 0.040034398436546326,
"kl": 0.006266117095947266,
"learning_rate": 3.3543953438521983e-06,
"loss": 0.0091,
"num_tokens": 510255728.0,
"reward": 0.0675223250000272,
"reward_std": 0.07577886182116345,
"rewards/pure_accuracy_reward_math": 0.06752232249709778,
"step": 801
},
{
"clip_ratio": 0.00027677676553139463,
"epoch": 1.3480027486480834,
"grad_norm": 0.038657769560813904,
"kl": 0.006215572357177734,
"learning_rate": 3.3484972311396114e-06,
"loss": 0.0091,
"step": 802
},
{
"clip_ratio": 0.0002909586188479807,
"epoch": 1.3499148516626334,
"grad_norm": 0.036970507353544235,
"kl": 0.006129741668701172,
"learning_rate": 3.342593775966901e-06,
"loss": 0.009,
"step": 803
},
{
"clip_ratio": 0.0003427068459700422,
"epoch": 1.3518269546771833,
"grad_norm": 0.03707785904407501,
"kl": 0.006056785583496094,
"learning_rate": 3.3366850155044595e-06,
"loss": 0.009,
"step": 804
},
{
"clip_ratio": 0.00038909467849634893,
"epoch": 1.3537390576917332,
"grad_norm": 0.03700149059295654,
"kl": 0.005985736846923828,
"learning_rate": 3.33077098695608e-06,
"loss": 0.0089,
"step": 805
},
{
"clip_ratio": 0.0,
"completion_length": 527.0212287902832,
"epoch": 1.355651160706283,
"grad_norm": 0.04373861476778984,
"kl": 0.005824565887451172,
"learning_rate": 3.3248517275587292e-06,
"loss": 0.0094,
"num_tokens": 513879112.0,
"reward": 0.0703125029685907,
"reward_std": 0.08085364429280162,
"rewards/pure_accuracy_reward_math": 0.07031250145519152,
"step": 806
},
{
"clip_ratio": 0.00031092700191948097,
"epoch": 1.357563263720833,
"grad_norm": 0.04273909702897072,
"kl": 0.0058460235595703125,
"learning_rate": 3.318927274582307e-06,
"loss": 0.0094,
"step": 807
},
{
"clip_ratio": 0.0003359753473546334,
"epoch": 1.359475366735383,
"grad_norm": 0.04217194393277168,
"kl": 0.005980014801025391,
"learning_rate": 3.312997665329414e-06,
"loss": 0.0093,
"step": 808
},
{
"clip_ratio": 0.0003392697701940506,
"epoch": 1.3613874697499329,
"grad_norm": 0.04189891368150711,
"kl": 0.0061492919921875,
"learning_rate": 3.3070629371351176e-06,
"loss": 0.0093,
"step": 809
},
{
"clip_ratio": 0.0003985974152556082,
"epoch": 1.3632995727644825,
"grad_norm": 0.04113880172371864,
"kl": 0.0062618255615234375,
"learning_rate": 3.3011231273667155e-06,
"loss": 0.0092,
"step": 810
},
{
"clip_ratio": 0.0,
"completion_length": 523.8002490997314,
"epoch": 1.3652116757790327,
"grad_norm": 0.039511535316705704,
"kl": 0.007502555847167969,
"learning_rate": 3.295178273423501e-06,
"loss": 0.0065,
"num_tokens": 517489928.0,
"reward": 0.06835937840514816,
"reward_std": 0.0761642413563095,
"rewards/pure_accuracy_reward_math": 0.06835937636788003,
"step": 811
},
{
"clip_ratio": 0.00033993283830113796,
"epoch": 1.3671237787935824,
"grad_norm": 0.03911852091550827,
"kl": 0.0074634552001953125,
"learning_rate": 3.2892284127365277e-06,
"loss": 0.0065,
"step": 812
},
{
"clip_ratio": 0.00029188678922764666,
"epoch": 1.3690358818081325,
"grad_norm": 0.038789719343185425,
"kl": 0.007461071014404297,
"learning_rate": 3.2832735827683733e-06,
"loss": 0.0064,
"step": 813
},
{
"clip_ratio": 0.00031692377649505943,
"epoch": 1.3709479848226822,
"grad_norm": 0.03795900195837021,
"kl": 0.007411956787109375,
"learning_rate": 3.2773138210129037e-06,
"loss": 0.0063,
"step": 814
},
{
"clip_ratio": 0.0003394908647464945,
"epoch": 1.3728600878372323,
"grad_norm": 0.03683575242757797,
"kl": 0.0073795318603515625,
"learning_rate": 3.2713491649950375e-06,
"loss": 0.0063,
"step": 815
},
{
"clip_ratio": 0.0,
"completion_length": 527.1018648147583,
"epoch": 1.374772190851782,
"grad_norm": 0.036948177963495255,
"kl": 0.0058441162109375,
"learning_rate": 3.26537965227051e-06,
"loss": 0.0062,
"num_tokens": 521113961.0,
"reward": 0.06333705675206147,
"reward_std": 0.07041122711962089,
"rewards/pure_accuracy_reward_math": 0.06333705494762398,
"step": 816
},
{
"clip_ratio": 0.0002517415915690435,
"epoch": 1.3766842938663322,
"grad_norm": 0.03634682297706604,
"kl": 0.005847454071044922,
"learning_rate": 3.2594053204256344e-06,
"loss": 0.0062,
"step": 817
},
{
"clip_ratio": 0.00027403954436522326,
"epoch": 1.3785963968808819,
"grad_norm": 0.034690070897340775,
"kl": 0.005870342254638672,
"learning_rate": 3.253426207077069e-06,
"loss": 0.0062,
"step": 818
},
{
"clip_ratio": 0.0002389855896467452,
"epoch": 1.3805084998954318,
"grad_norm": 0.034505974501371384,
"kl": 0.005900382995605469,
"learning_rate": 3.2474423498715772e-06,
"loss": 0.0061,
"step": 819
},
{
"clip_ratio": 0.000287152882663122,
"epoch": 1.3824206029099817,
"grad_norm": 0.03524321690201759,
"kl": 0.005913734436035156,
"learning_rate": 3.241453786485792e-06,
"loss": 0.0061,
"step": 820
},
{
"clip_ratio": 0.0,
"completion_length": 509.66520071029663,
"epoch": 1.3843327059245316,
"grad_norm": 0.039214182645082474,
"kl": 0.006892681121826172,
"learning_rate": 3.2354605546259777e-06,
"loss": 0.0032,
"num_tokens": 524677265.0,
"reward": 0.07979911041911691,
"reward_std": 0.07959878293331712,
"rewards/pure_accuracy_reward_math": 0.07979910867288709,
"step": 821
},
{
"clip_ratio": 0.0002965318878409562,
"epoch": 1.3862448089390815,
"grad_norm": 0.037640273571014404,
"kl": 0.0067348480224609375,
"learning_rate": 3.2294626920277928e-06,
"loss": 0.0031,
"step": 822
},
{
"clip_ratio": 0.00035153192868619954,
"epoch": 1.3881569119536314,
"grad_norm": 0.038182858377695084,
"kl": 0.006665706634521484,
"learning_rate": 3.2234602364560543e-06,
"loss": 0.0031,
"step": 823
},
{
"clip_ratio": 0.0003338070732752385,
"epoch": 1.3900690149681814,
"grad_norm": 0.038163840770721436,
"kl": 0.00667572021484375,
"learning_rate": 3.2174532257044957e-06,
"loss": 0.003,
"step": 824
},
{
"clip_ratio": 0.0003418834434683049,
"epoch": 1.3919811179827313,
"grad_norm": 0.03628409281373024,
"kl": 0.0067596435546875,
"learning_rate": 3.2114416975955347e-06,
"loss": 0.003,
"step": 825
},
{
"clip_ratio": 0.0,
"completion_length": 519.1027045249939,
"epoch": 1.3938932209972812,
"grad_norm": 0.037393856793642044,
"kl": 0.005987644195556641,
"learning_rate": 3.20542568998003e-06,
"loss": 0.0097,
"num_tokens": 528270425.0,
"reward": 0.07784598556463607,
"reward_std": 0.0774529695045203,
"rewards/pure_accuracy_reward_math": 0.07784598329453729,
"step": 826
},
{
"clip_ratio": 0.0002753000243274073,
"epoch": 1.395805324011831,
"grad_norm": 0.03632253408432007,
"kl": 0.00603485107421875,
"learning_rate": 3.199405240737045e-06,
"loss": 0.0097,
"step": 827
},
{
"clip_ratio": 0.00028145005671831314,
"epoch": 1.397717427026381,
"grad_norm": 0.035320475697517395,
"kl": 0.0060482025146484375,
"learning_rate": 3.1933803877736103e-06,
"loss": 0.0097,
"step": 828
},
{
"clip_ratio": 0.00029773840276448027,
"epoch": 1.399629530040931,
"grad_norm": 0.03532904013991356,
"kl": 0.006001472473144531,
"learning_rate": 3.187351169024483e-06,
"loss": 0.0096,
"step": 829
},
{
"clip_ratio": 0.0003131672060590063,
"epoch": 1.4015416330554809,
"grad_norm": 0.03497399017214775,
"kl": 0.0059299468994140625,
"learning_rate": 3.181317622451909e-06,
"loss": 0.0095,
"step": 830
},
{
"clip_ratio": 0.0,
"completion_length": 519.5547099113464,
"epoch": 1.4034537360700308,
"grad_norm": 0.03596203401684761,
"kl": 0.005957126617431641,
"learning_rate": 3.1752797860453854e-06,
"loss": 0.0099,
"num_tokens": 531863545.0,
"reward": 0.06584821754950099,
"reward_std": 0.07359298237133771,
"rewards/pure_accuracy_reward_math": 0.06584821580327116,
"step": 831
},
{
"clip_ratio": 0.0002871401754873659,
"epoch": 1.4053658390845807,
"grad_norm": 0.03569914028048515,
"kl": 0.005918025970458984,
"learning_rate": 3.169237697821417e-06,
"loss": 0.0099,
"step": 832
},
{
"clip_ratio": 0.0002649255456503852,
"epoch": 1.4072779420991306,
"grad_norm": 0.035189539194107056,
"kl": 0.005944252014160156,
"learning_rate": 3.163191395823281e-06,
"loss": 0.0098,
"step": 833
},
{
"clip_ratio": 0.0002522150609252094,
"epoch": 1.4091900451136805,
"grad_norm": 0.03371162712574005,
"kl": 0.006028652191162109,
"learning_rate": 3.1571409181207867e-06,
"loss": 0.0098,
"step": 834
},
{
"clip_ratio": 0.00028182740913962334,
"epoch": 1.4111021481282304,
"grad_norm": 0.03411802276968956,
"kl": 0.006129264831542969,
"learning_rate": 3.151086302810035e-06,
"loss": 0.0097,
"step": 835
},
{
"clip_ratio": 0.0,
"completion_length": 509.0455017089844,
"epoch": 1.4130142511427803,
"grad_norm": 0.042647283524274826,
"kl": 0.006505012512207031,
"learning_rate": 3.1450275880131782e-06,
"loss": 0.0051,
"num_tokens": 535420068.0,
"reward": 0.06919643201399595,
"reward_std": 0.06989945442182943,
"rewards/pure_accuracy_reward_math": 0.06919642980210483,
"step": 836
},
{
"clip_ratio": 0.0002792542761653749,
"epoch": 1.4149263541573303,
"grad_norm": 0.03879564628005028,
"kl": 0.006262302398681641,
"learning_rate": 3.1389648118781795e-06,
"loss": 0.0051,
"step": 837
},
{
"clip_ratio": 0.00032867032479089175,
"epoch": 1.4168384571718802,
"grad_norm": 0.03632555902004242,
"kl": 0.006078004837036133,
"learning_rate": 3.132898012578577e-06,
"loss": 0.005,
"step": 838
},
{
"clip_ratio": 0.0003705890379706034,
"epoch": 1.41875056018643,
"grad_norm": 0.03687159717082977,
"kl": 0.0058705806732177734,
"learning_rate": 3.1268272283132374e-06,
"loss": 0.005,
"step": 839
},
{
"clip_ratio": 0.00039090512018447043,
"epoch": 1.42066266320098,
"grad_norm": 0.03681857883930206,
"kl": 0.005755186080932617,
"learning_rate": 3.1207524973061183e-06,
"loss": 0.0049,
"step": 840
},
{
"clip_ratio": 0.0,
"completion_length": 528.0865178108215,
"epoch": 1.42257476621553,
"grad_norm": 0.077212393283844,
"kl": 0.006708621978759766,
"learning_rate": 3.1146738578060293e-06,
"loss": 0.0034,
"num_tokens": 539042994.0,
"reward": 0.05468750235741027,
"reward_std": 0.06221334764268249,
"rewards/pure_accuracy_reward_math": 0.05468750130967237,
"step": 841
},
{
"clip_ratio": 0.00023407521496210393,
"epoch": 1.4244868692300798,
"grad_norm": 0.03766750544309616,
"kl": 0.005887508392333984,
"learning_rate": 3.108591348086388e-06,
"loss": 0.0034,
"step": 842
},
{
"clip_ratio": 0.00021864835269980176,
"epoch": 1.4263989722446297,
"grad_norm": 0.03435171768069267,
"kl": 0.0057353973388671875,
"learning_rate": 3.102505006444981e-06,
"loss": 0.0033,
"step": 843
},
{
"clip_ratio": 0.0002327330819866802,
"epoch": 1.4283110752591797,
"grad_norm": 0.03385370597243309,
"kl": 0.005730628967285156,
"learning_rate": 3.096414871203721e-06,
"loss": 0.0033,
"step": 844
},
{
"clip_ratio": 0.00025595308994752486,
"epoch": 1.4302231782737296,
"grad_norm": 0.0320701077580452,
"kl": 0.005660533905029297,
"learning_rate": 3.0903209807084085e-06,
"loss": 0.0032,
"step": 845
},
{
"clip_ratio": 0.0,
"completion_length": 532.2009177207947,
"epoch": 1.4321352812882795,
"grad_norm": 0.035687774419784546,
"kl": 0.006323099136352539,
"learning_rate": 3.0842233733284866e-06,
"loss": 0.0055,
"num_tokens": 542686090.0,
"reward": 0.06389509252039716,
"reward_std": 0.06839800346642733,
"rewards/pure_accuracy_reward_math": 0.06389509059954435,
"step": 846
},
{
"clip_ratio": 0.0002455309293054597,
"epoch": 1.4340473843028292,
"grad_norm": 0.03433489799499512,
"kl": 0.006294965744018555,
"learning_rate": 3.078122087456802e-06,
"loss": 0.0055,
"step": 847
},
{
"clip_ratio": 0.0003179283777399178,
"epoch": 1.4359594873173793,
"grad_norm": 0.03377856686711311,
"kl": 0.00630497932434082,
"learning_rate": 3.072017161509364e-06,
"loss": 0.0054,
"step": 848
},
{
"clip_ratio": 0.00030606188772708265,
"epoch": 1.437871590331929,
"grad_norm": 0.03379327058792114,
"kl": 0.006325483322143555,
"learning_rate": 3.065908633925099e-06,
"loss": 0.0054,
"step": 849
},
{
"clip_ratio": 0.00029904921905199444,
"epoch": 1.4397836933464792,
"grad_norm": 0.03319833427667618,
"kl": 0.006340742111206055,
"learning_rate": 3.0597965431656125e-06,
"loss": 0.0053,
"step": 850
},
{
"clip_ratio": 0.0,
"completion_length": 520.9991841316223,
"epoch": 1.00191210301455,
"grad_norm": 0.03730909898877144,
"kl": 0.005851268768310547,
"learning_rate": 3.0536809277149433e-06,
"loss": 0.0058,
"num_tokens": 3602593.0,
"reward": 0.061662948777666315,
"reward_std": 0.0712745109340176,
"rewards/pure_accuracy_reward_math": 0.06166294767172076,
"step": 851
},
{
"clip_ratio": 0.0002445870232463676,
"epoch": 1.0038242060290998,
"grad_norm": 0.036420926451683044,
"kl": 0.005807399749755859,
"learning_rate": 3.047561826079324e-06,
"loss": 0.0057,
"step": 852
},
{
"clip_ratio": 0.0002342841784184202,
"epoch": 1.0057363090436497,
"grad_norm": 0.03534744307398796,
"kl": 0.005809783935546875,
"learning_rate": 3.041439276786937e-06,
"loss": 0.0057,
"step": 853
},
{
"clip_ratio": 0.0003130897791834286,
"epoch": 1.0076484120581997,
"grad_norm": 0.03456578403711319,
"kl": 0.005836963653564453,
"learning_rate": 3.0353133183876745e-06,
"loss": 0.0056,
"step": 854
},
{
"clip_ratio": 0.0003235736477336104,
"epoch": 1.0095605150727496,
"grad_norm": 0.03683493658900261,
"kl": 0.00588226318359375,
"learning_rate": 3.0291839894528907e-06,
"loss": 0.0056,
"step": 855
},
{
"clip_ratio": 0.0,
"completion_length": 529.2422127723694,
"epoch": 1.0114726180872995,
"grad_norm": 3.6328346729278564,
"kl": 0.07409882545471191,
"learning_rate": 3.023051328575164e-06,
"loss": 0.0092,
"num_tokens": 7231613.0,
"reward": 0.06696428847499192,
"reward_std": 0.07320140569936484,
"rewards/pure_accuracy_reward_math": 0.06696428725263104,
"step": 856
},
{
"clip_ratio": 0.0002944787788692338,
"epoch": 1.0133847211018494,
"grad_norm": 0.23805810511112213,
"kl": 0.01258087158203125,
"learning_rate": 3.016915374368052e-06,
"loss": 0.0068,
"step": 857
},
{
"clip_ratio": 0.000328014534943577,
"epoch": 1.0152968241163993,
"grad_norm": 0.038860052824020386,
"kl": 0.008163928985595703,
"learning_rate": 3.0107761654658464e-06,
"loss": 0.0066,
"step": 858
},
{
"clip_ratio": 0.00033978425187797257,
"epoch": 1.0172089271309492,
"grad_norm": 0.037539608776569366,
"kl": 0.008237600326538086,
"learning_rate": 3.0046337405233334e-06,
"loss": 0.0065,
"step": 859
},
{
"clip_ratio": 0.0003289994185706746,
"epoch": 1.0191210301454992,
"grad_norm": 0.03649570420384407,
"kl": 0.008342981338500977,
"learning_rate": 2.9984881382155484e-06,
"loss": 0.0065,
"step": 860
},
{
"clip_ratio": 0.0,
"completion_length": 539.7709541320801,
"epoch": 1.021033133160049,
"grad_norm": 0.03506062552332878,
"kl": 0.0056056976318359375,
"learning_rate": 2.9923393972375337e-06,
"loss": 0.0075,
"num_tokens": 10898500.0,
"reward": 0.06389509155997075,
"reward_std": 0.07427741104038432,
"rewards/pure_accuracy_reward_math": 0.06389509086147882,
"step": 861
},
{
"clip_ratio": 0.00025894983372154456,
"epoch": 1.022945236174599,
"grad_norm": 0.03387964144349098,
"kl": 0.005673408508300781,
"learning_rate": 2.986187556304091e-06,
"loss": 0.0075,
"step": 862
},
{
"clip_ratio": 0.00026048227840647087,
"epoch": 1.024857339189149,
"grad_norm": 0.0339200459420681,
"kl": 0.005715370178222656,
"learning_rate": 2.9800326541495427e-06,
"loss": 0.0074,
"step": 863
},
{
"clip_ratio": 0.000286817725225319,
"epoch": 1.0267694422036988,
"grad_norm": 0.033578090369701385,
"kl": 0.0057220458984375,
"learning_rate": 2.973874729527486e-06,
"loss": 0.0074,
"step": 864
},
{
"clip_ratio": 0.00031288620994018856,
"epoch": 1.0286815452182487,
"grad_norm": 0.03253786265850067,
"kl": 0.005726814270019531,
"learning_rate": 2.967713821210547e-06,
"loss": 0.0073,
"step": 865
},
{
"clip_ratio": 0.0,
"completion_length": 532.484959602356,
"epoch": 1.0305936482327986,
"grad_norm": 0.040393006056547165,
"kl": 0.005712032318115234,
"learning_rate": 2.961549967990139e-06,
"loss": 0.0094,
"num_tokens": 14539070.0,
"reward": 0.0700334852153901,
"reward_std": 0.07968511193757877,
"rewards/pure_accuracy_reward_math": 0.07003348364378326,
"step": 866
},
{
"clip_ratio": 0.00034418605622477116,
"epoch": 1.0325057512473486,
"grad_norm": 0.03829828277230263,
"kl": 0.00571441650390625,
"learning_rate": 2.95538320867622e-06,
"loss": 0.0094,
"step": 867
},
{
"clip_ratio": 0.0003270462358386794,
"epoch": 1.0344178542618985,
"grad_norm": 0.03763904795050621,
"kl": 0.005820035934448242,
"learning_rate": 2.949213582097042e-06,
"loss": 0.0094,
"step": 868
},
{
"clip_ratio": 0.00039861036464117205,
"epoch": 1.0363299572764482,
"grad_norm": 0.03893045708537102,
"kl": 0.005897045135498047,
"learning_rate": 2.9430411270989112e-06,
"loss": 0.0093,
"step": 869
},
{
"clip_ratio": 0.0004073582798014286,
"epoch": 1.038242060290998,
"grad_norm": 0.03808417171239853,
"kl": 0.0059051513671875,
"learning_rate": 2.9368658825459452e-06,
"loss": 0.0092,
"step": 870
},
{
"clip_ratio": 0.0,
"completion_length": 518.7159852981567,
"epoch": 1.040154163305548,
"grad_norm": 0.03680076450109482,
"kl": 0.006183147430419922,
"learning_rate": 2.9306878873198227e-06,
"loss": 0.0073,
"num_tokens": 18123716.0,
"reward": 0.06975446810247377,
"reward_std": 0.07255704078124836,
"rewards/pure_accuracy_reward_math": 0.06975446600699797,
"step": 871
},
{
"clip_ratio": 0.00025267474336487794,
"epoch": 1.042066266320098,
"grad_norm": 0.036574870347976685,
"kl": 0.006196498870849609,
"learning_rate": 2.9245071803195435e-06,
"loss": 0.0072,
"step": 872
},
{
"clip_ratio": 0.0002888958638322947,
"epoch": 1.0439783693346478,
"grad_norm": 0.03539302200078964,
"kl": 0.006276130676269531,
"learning_rate": 2.9183238004611815e-06,
"loss": 0.0072,
"step": 873
},
{
"clip_ratio": 0.00027933804358326597,
"epoch": 1.0458904723491977,
"grad_norm": 0.03457676246762276,
"kl": 0.00629425048828125,
"learning_rate": 2.912137786677639e-06,
"loss": 0.0071,
"step": 874
},
{
"clip_ratio": 0.00026495220328115465,
"epoch": 1.0478025753637477,
"grad_norm": 0.034882258623838425,
"kl": 0.006371974945068359,
"learning_rate": 2.905949177918403e-06,
"loss": 0.0071,
"step": 875
},
{
"clip_ratio": 0.0,
"completion_length": 516.4989104270935,
"epoch": 1.0497146783782976,
"grad_norm": 0.04403652995824814,
"kl": 0.0064754486083984375,
"learning_rate": 2.8997580131493004e-06,
"loss": 0.0104,
"num_tokens": 21706672.0,
"reward": 0.07421875311410986,
"reward_std": 0.08282060426427051,
"rewards/pure_accuracy_reward_math": 0.07421875130967237,
"step": 876
},
{
"clip_ratio": 0.00034863107299543117,
"epoch": 1.0516267813928475,
"grad_norm": 0.040730468928813934,
"kl": 0.006359100341796875,
"learning_rate": 2.89356433135225e-06,
"loss": 0.0104,
"step": 877
},
{
"clip_ratio": 0.0003696895219036378,
"epoch": 1.0535388844073974,
"grad_norm": 0.040028344839811325,
"kl": 0.006321430206298828,
"learning_rate": 2.8873681715250197e-06,
"loss": 0.0104,
"step": 878
},
{
"clip_ratio": 0.00041197048278718285,
"epoch": 1.0554509874219473,
"grad_norm": 0.04009086638689041,
"kl": 0.0062351226806640625,
"learning_rate": 2.881169572680981e-06,
"loss": 0.0103,
"step": 879
},
{
"clip_ratio": 0.0004460485272943515,
"epoch": 1.0573630904364972,
"grad_norm": 0.03965138643980026,
"kl": 0.006242275238037109,
"learning_rate": 2.87496857384886e-06,
"loss": 0.0102,
"step": 880
},
{
"clip_ratio": 0.0,
"completion_length": 524.4285945892334,
"epoch": 1.0592751934510471,
"grad_norm": 0.03920762613415718,
"kl": 0.005979061126708984,
"learning_rate": 2.868765214072495e-06,
"loss": 0.0082,
"num_tokens": 25317588.0,
"reward": 0.07338170023285784,
"reward_std": 0.0805021328269504,
"rewards/pure_accuracy_reward_math": 0.07338169755530544,
"step": 881
},
{
"clip_ratio": 0.0003169273815046836,
"epoch": 1.061187296465597,
"grad_norm": 0.03858224302530289,
"kl": 0.006028175354003906,
"learning_rate": 2.8625595324105925e-06,
"loss": 0.0082,
"step": 882
},
{
"clip_ratio": 0.0003076135093351695,
"epoch": 1.063099399480147,
"grad_norm": 0.03754101321101189,
"kl": 0.006089687347412109,
"learning_rate": 2.8563515679364733e-06,
"loss": 0.0081,
"step": 883
},
{
"clip_ratio": 0.0003307215861809709,
"epoch": 1.065011502494697,
"grad_norm": 0.03692120686173439,
"kl": 0.006084442138671875,
"learning_rate": 2.850141359737836e-06,
"loss": 0.008,
"step": 884
},
{
"clip_ratio": 0.0003362660154380137,
"epoch": 1.0669236055092468,
"grad_norm": 0.03691774606704712,
"kl": 0.006087303161621094,
"learning_rate": 2.843928946916504e-06,
"loss": 0.008,
"step": 885
},
{
"clip_ratio": 0.0,
"completion_length": 541.91938829422,
"epoch": 1.0688357085237967,
"grad_norm": 0.03421162813901901,
"kl": 0.005934238433837891,
"learning_rate": 2.8377143685881835e-06,
"loss": 0.0048,
"num_tokens": 28991667.0,
"reward": 0.06138393090805039,
"reward_std": 0.05770279868738726,
"rewards/pure_accuracy_reward_math": 0.06138392991852015,
"step": 886
},
{
"clip_ratio": 0.00021627708133564738,
"epoch": 1.0707478115383466,
"grad_norm": 0.0331665463745594,
"kl": 0.005833148956298828,
"learning_rate": 2.8314976638822145e-06,
"loss": 0.0048,
"step": 887
},
{
"clip_ratio": 0.00023772416773226723,
"epoch": 1.0726599145528966,
"grad_norm": 0.03265010192990303,
"kl": 0.00572967529296875,
"learning_rate": 2.825278871941325e-06,
"loss": 0.0048,
"step": 888
},
{
"clip_ratio": 0.000255867875353033,
"epoch": 1.0745720175674465,
"grad_norm": 0.031934551894664764,
"kl": 0.0056514739990234375,
"learning_rate": 2.819058031921387e-06,
"loss": 0.0047,
"step": 889
},
{
"clip_ratio": 0.0002752940895334177,
"epoch": 1.0764841205819964,
"grad_norm": 0.03180062025785446,
"kl": 0.005589008331298828,
"learning_rate": 2.812835182991166e-06,
"loss": 0.0047,
"step": 890
},
{
"clip_ratio": 0.0,
"completion_length": 541.6253051757812,
"epoch": 1.0783962235965463,
"grad_norm": 0.0352044515311718,
"kl": 0.006504535675048828,
"learning_rate": 2.8066103643320774e-06,
"loss": 0.005,
"num_tokens": 32662984.0,
"reward": 0.07003348544822074,
"reward_std": 0.07148103549843654,
"rewards/pure_accuracy_reward_math": 0.07003348341095261,
"step": 891
},
{
"clip_ratio": 0.0002908879878305015,
"epoch": 1.0803083266110962,
"grad_norm": 0.03477974981069565,
"kl": 0.006473064422607422,
"learning_rate": 2.800383615137939e-06,
"loss": 0.0049,
"step": 892
},
{
"clip_ratio": 0.00027559091887496834,
"epoch": 1.0822204296256461,
"grad_norm": 0.03371204808354378,
"kl": 0.006519317626953125,
"learning_rate": 2.7941549746147234e-06,
"loss": 0.0049,
"step": 893
},
{
"clip_ratio": 0.00026331023877901316,
"epoch": 1.084132532640196,
"grad_norm": 0.03233867511153221,
"kl": 0.00655364990234375,
"learning_rate": 2.7879244819803104e-06,
"loss": 0.0048,
"step": 894
},
{
"clip_ratio": 0.0003059378379361988,
"epoch": 1.086044635654746,
"grad_norm": 0.032591916620731354,
"kl": 0.006562709808349609,
"learning_rate": 2.781692176464244e-06,
"loss": 0.0048,
"step": 895
},
{
"clip_ratio": 0.0,
"completion_length": 538.9467296600342,
"epoch": 1.0879567386692959,
"grad_norm": 0.0399605967104435,
"kl": 0.007935047149658203,
"learning_rate": 2.7754580973074817e-06,
"loss": 0.0078,
"num_tokens": 36327265.0,
"reward": 0.06640625328873284,
"reward_std": 0.07582512497901917,
"rewards/pure_accuracy_reward_math": 0.06640625142608769,
"step": 896
},
{
"clip_ratio": 0.00029080147635340836,
"epoch": 1.0898688416838458,
"grad_norm": 0.036669787019491196,
"kl": 0.007892131805419922,
"learning_rate": 2.769222283762148e-06,
"loss": 0.0077,
"step": 897
},
{
"clip_ratio": 0.0003202801690349588,
"epoch": 1.0917809446983957,
"grad_norm": 0.036093369126319885,
"kl": 0.007870197296142578,
"learning_rate": 2.7629847750912885e-06,
"loss": 0.0077,
"step": 898
},
{
"clip_ratio": 0.00034906711715620986,
"epoch": 1.0936930477129456,
"grad_norm": 0.036899976432323456,
"kl": 0.007824897766113281,
"learning_rate": 2.756745610568622e-06,
"loss": 0.0076,
"step": 899
},
{
"clip_ratio": 0.0003909627172333785,
"epoch": 1.0956051507274955,
"grad_norm": 0.03607386723160744,
"kl": 0.00782632827758789,
"learning_rate": 2.7505048294782914e-06,
"loss": 0.0076,
"step": 900
},
{
"clip_ratio": 0.0,
"completion_length": 519.9687776565552,
"epoch": 1.0975172537420455,
"grad_norm": 0.04138408601284027,
"kl": 0.006854534149169922,
"learning_rate": 2.7442624711146206e-06,
"loss": 0.0105,
"num_tokens": 39926261.0,
"reward": 0.07561384263681248,
"reward_std": 0.08660046180011705,
"rewards/pure_accuracy_reward_math": 0.07561384089058265,
"step": 901
},
{
"clip_ratio": 0.0003407098130878694,
"epoch": 1.0994293567565951,
"grad_norm": 0.04008745029568672,
"kl": 0.006922245025634766,
"learning_rate": 2.7380185747818628e-06,
"loss": 0.0105,
"step": 902
},
{
"clip_ratio": 0.0003345158028196238,
"epoch": 1.1013414597711453,
"grad_norm": 0.039206936955451965,
"kl": 0.006981372833251953,
"learning_rate": 2.7317731797939566e-06,
"loss": 0.0104,
"step": 903
},
{
"clip_ratio": 0.0003512224284918375,
"epoch": 1.103253562785695,
"grad_norm": 0.03816502168774605,
"kl": 0.006984233856201172,
"learning_rate": 2.7255263254742746e-06,
"loss": 0.0103,
"step": 904
},
{
"clip_ratio": 0.00038539456500075175,
"epoch": 1.105165665800245,
"grad_norm": 0.03802499175071716,
"kl": 0.006890773773193359,
"learning_rate": 2.71927805115538e-06,
"loss": 0.0103,
"step": 905
},
{
"clip_ratio": 0.0,
"completion_length": 522.6635279655457,
"epoch": 1.1070777688147948,
"grad_norm": 0.03780652955174446,
"kl": 0.005947589874267578,
"learning_rate": 2.713028396178776e-06,
"loss": 0.0044,
"num_tokens": 43530039.0,
"reward": 0.0691964318684768,
"reward_std": 0.0774129043566063,
"rewards/pure_accuracy_reward_math": 0.06919642988941632,
"step": 906
},
{
"clip_ratio": 0.0002883933650537074,
"epoch": 1.1089898718293447,
"grad_norm": 0.03706151619553566,
"kl": 0.005948543548583984,
"learning_rate": 2.706777399894656e-06,
"loss": 0.0044,
"step": 907
},
{
"clip_ratio": 0.0003032470573316459,
"epoch": 1.1109019748438946,
"grad_norm": 0.03684515878558159,
"kl": 0.005936622619628906,
"learning_rate": 2.700525101661665e-06,
"loss": 0.0044,
"step": 908
},
{
"clip_ratio": 0.0003385747261290817,
"epoch": 1.1128140778584446,
"grad_norm": 0.03632361814379692,
"kl": 0.005986690521240234,
"learning_rate": 2.6942715408466406e-06,
"loss": 0.0043,
"step": 909
},
{
"clip_ratio": 0.00035084231319615355,
"epoch": 1.1147261808729945,
"grad_norm": 0.0364714041352272,
"kl": 0.005983829498291016,
"learning_rate": 2.6880167568243716e-06,
"loss": 0.0042,
"step": 910
},
{
"clip_ratio": 0.0,
"completion_length": 524.6629705429077,
"epoch": 1.1166382838875444,
"grad_norm": 0.037073228508234024,
"kl": 0.006183624267578125,
"learning_rate": 2.681760788977349e-06,
"loss": 0.0075,
"num_tokens": 47140667.0,
"reward": 0.06166294956346974,
"reward_std": 0.07140090485336259,
"rewards/pure_accuracy_reward_math": 0.061662947526201606,
"step": 911
},
{
"clip_ratio": 0.00026335007953548484,
"epoch": 1.1185503869020943,
"grad_norm": 0.03628791868686676,
"kl": 0.006221771240234375,
"learning_rate": 2.6755036766955172e-06,
"loss": 0.0075,
"step": 912
},
{
"clip_ratio": 0.00029098790395210017,
"epoch": 1.1204624899166442,
"grad_norm": 0.03659017011523247,
"kl": 0.006258964538574219,
"learning_rate": 2.6692454593760255e-06,
"loss": 0.0075,
"step": 913
},
{
"clip_ratio": 0.00033703100632465066,
"epoch": 1.1223745929311941,
"grad_norm": 0.0357106551527977,
"kl": 0.006211757659912109,
"learning_rate": 2.6629861764229824e-06,
"loss": 0.0074,
"step": 914
},
{
"clip_ratio": 0.0003104925490902133,
"epoch": 1.124286695945744,
"grad_norm": 0.03461490571498871,
"kl": 0.006183624267578125,
"learning_rate": 2.6567258672472064e-06,
"loss": 0.0073,
"step": 915
},
{
"clip_ratio": 0.0,
"completion_length": 519.3962297439575,
"epoch": 1.126198798960294,
"grad_norm": 0.038919847458601,
"kl": 0.0060977935791015625,
"learning_rate": 2.650464571265975e-06,
"loss": 0.0062,
"num_tokens": 50733111.0,
"reward": 0.06584821734577417,
"reward_std": 0.07367311330744997,
"rewards/pure_accuracy_reward_math": 0.06584821583237499,
"step": 916
},
{
"clip_ratio": 0.0002951280029606096,
"epoch": 1.1281109019748439,
"grad_norm": 0.038201622664928436,
"kl": 0.0060329437255859375,
"learning_rate": 2.6442023279027805e-06,
"loss": 0.0061,
"step": 917
},
{
"clip_ratio": 0.00029004437487856194,
"epoch": 1.1300230049893938,
"grad_norm": 0.03696547448635101,
"kl": 0.006039619445800781,
"learning_rate": 2.6379391765870828e-06,
"loss": 0.0061,
"step": 918
},
{
"clip_ratio": 0.0003163389113183257,
"epoch": 1.1319351080039437,
"grad_norm": 0.03571280464529991,
"kl": 0.006005764007568359,
"learning_rate": 2.6316751567540527e-06,
"loss": 0.006,
"step": 919
},
{
"clip_ratio": 0.0003592208154259424,
"epoch": 1.1338472110184936,
"grad_norm": 0.03568287193775177,
"kl": 0.005993366241455078,
"learning_rate": 2.625410307844335e-06,
"loss": 0.006,
"step": 920
},
{
"clip_ratio": 0.0,
"completion_length": 538.2659268379211,
"epoch": 1.1357593140330435,
"grad_norm": 0.03899242356419563,
"kl": 0.005813121795654297,
"learning_rate": 2.6191446693037924e-06,
"loss": 0.0071,
"num_tokens": 54398312.0,
"reward": 0.07226562857977115,
"reward_std": 0.07861530320951715,
"rewards/pure_accuracy_reward_math": 0.07226562648429535,
"step": 921
},
{
"clip_ratio": 0.00029711308371815903,
"epoch": 1.1376714170475934,
"grad_norm": 0.038164544850587845,
"kl": 0.0058841705322265625,
"learning_rate": 2.6128782805832605e-06,
"loss": 0.0071,
"step": 922
},
{
"clip_ratio": 0.0003027216810664868,
"epoch": 1.1395835200621434,
"grad_norm": 0.03706645965576172,
"kl": 0.005882740020751953,
"learning_rate": 2.606611181138295e-06,
"loss": 0.007,
"step": 923
},
{
"clip_ratio": 0.00032618250162386175,
"epoch": 1.1414956230766933,
"grad_norm": 0.036637816578149796,
"kl": 0.005909442901611328,
"learning_rate": 2.600343410428931e-06,
"loss": 0.007,
"step": 924
},
{
"clip_ratio": 0.00032713054685018506,
"epoch": 1.1434077260912432,
"grad_norm": 0.036758605390787125,
"kl": 0.005947589874267578,
"learning_rate": 2.5940750079194275e-06,
"loss": 0.0069,
"step": 925
},
{
"clip_ratio": 0.0,
"completion_length": 542.0072803497314,
"epoch": 1.145319829105793,
"grad_norm": 0.03791532665491104,
"kl": 0.0061702728271484375,
"learning_rate": 2.5878060130780225e-06,
"loss": 0.0074,
"num_tokens": 58073722.0,
"reward": 0.06835937863797881,
"reward_std": 0.07715391897363588,
"rewards/pure_accuracy_reward_math": 0.06835937636788003,
"step": 926
},
{
"clip_ratio": 0.00030884258325158953,
"epoch": 1.147231932120343,
"grad_norm": 0.03749171644449234,
"kl": 0.006160736083984375,
"learning_rate": 2.581536465376684e-06,
"loss": 0.0074,
"step": 927
},
{
"clip_ratio": 0.000279198229350186,
"epoch": 1.149144035134893,
"grad_norm": 0.03681938722729683,
"kl": 0.006136417388916016,
"learning_rate": 2.575266404290859e-06,
"loss": 0.0073,
"step": 928
},
{
"clip_ratio": 0.0002930849948370451,
"epoch": 1.1510561381494429,
"grad_norm": 0.035750068724155426,
"kl": 0.006227970123291016,
"learning_rate": 2.5689958692992284e-06,
"loss": 0.0072,
"step": 929
},
{
"clip_ratio": 0.00028936977611238035,
"epoch": 1.1529682411639928,
"grad_norm": 0.03503425419330597,
"kl": 0.006281375885009766,
"learning_rate": 2.562724899883458e-06,
"loss": 0.0072,
"step": 930
},
{
"clip_ratio": 0.0,
"completion_length": 531.6188879013062,
"epoch": 1.1548803441785427,
"grad_norm": 0.05187267065048218,
"kl": 0.007277965545654297,
"learning_rate": 2.5564535355279464e-06,
"loss": 0.0072,
"num_tokens": 61714268.0,
"reward": 0.07505580713041127,
"reward_std": 0.08531173289520666,
"rewards/pure_accuracy_reward_math": 0.07505580491852015,
"step": 931
},
{
"clip_ratio": 0.00033635866333270314,
"epoch": 1.1567924471930926,
"grad_norm": 0.039655230939388275,
"kl": 0.0072231292724609375,
"learning_rate": 2.550181815719581e-06,
"loss": 0.0072,
"step": 932
},
{
"clip_ratio": 0.00035109808851530033,
"epoch": 1.1587045502076425,
"grad_norm": 0.038757406175136566,
"kl": 0.007157802581787109,
"learning_rate": 2.5439097799474867e-06,
"loss": 0.0072,
"step": 933
},
{
"clip_ratio": 0.00037538493586453114,
"epoch": 1.1606166532221924,
"grad_norm": 0.03841486573219299,
"kl": 0.007115840911865234,
"learning_rate": 2.537637467702777e-06,
"loss": 0.0071,
"step": 934
},
{
"clip_ratio": 0.0003936579208243529,
"epoch": 1.1625287562367423,
"grad_norm": 0.038453541696071625,
"kl": 0.0070896148681640625,
"learning_rate": 2.531364918478308e-06,
"loss": 0.007,
"step": 935
},
{
"clip_ratio": 0.0,
"completion_length": 547.6250252723694,
"epoch": 1.1644408592512923,
"grad_norm": 0.03738933801651001,
"kl": 0.00615692138671875,
"learning_rate": 2.5250921717684247e-06,
"loss": 0.0061,
"num_tokens": 65415044.0,
"reward": 0.07561384260770865,
"reward_std": 0.07745296956272796,
"rewards/pure_accuracy_reward_math": 0.07561384062864818,
"step": 936
},
{
"clip_ratio": 0.0002929231292227996,
"epoch": 1.166352962265842,
"grad_norm": 0.03690778836607933,
"kl": 0.006189823150634766,
"learning_rate": 2.5188192670687186e-06,
"loss": 0.0061,
"step": 937
},
{
"clip_ratio": 0.000294325235870474,
"epoch": 1.168265065280392,
"grad_norm": 0.03613179549574852,
"kl": 0.006130695343017578,
"learning_rate": 2.512546243875776e-06,
"loss": 0.0061,
"step": 938
},
{
"clip_ratio": 0.00031920797795237377,
"epoch": 1.1701771682949418,
"grad_norm": 0.03461304306983948,
"kl": 0.006014347076416016,
"learning_rate": 2.5062731416869267e-06,
"loss": 0.006,
"step": 939
},
{
"clip_ratio": 0.00037188214912475814,
"epoch": 1.172089271309492,
"grad_norm": 0.03454398363828659,
"kl": 0.005980968475341797,
"learning_rate": 2.5e-06,
"loss": 0.0059,
"step": 940
},
{
"clip_ratio": 0.0,
"completion_length": 532.1423244476318,
"epoch": 1.1740013743240416,
"grad_norm": 0.03934042155742645,
"kl": 0.006266117095947266,
"learning_rate": 2.493726858313074e-06,
"loss": 0.0078,
"num_tokens": 69057654.0,
"reward": 0.07477678928989917,
"reward_std": 0.08299326134147123,
"rewards/pure_accuracy_reward_math": 0.07477678690338507,
"step": 941
},
{
"clip_ratio": 0.00031629414758072016,
"epoch": 1.1759134773385915,
"grad_norm": 0.03872406855225563,
"kl": 0.0062713623046875,
"learning_rate": 2.4874537561242253e-06,
"loss": 0.0078,
"step": 942
},
{
"clip_ratio": 0.0003434862284166229,
"epoch": 1.1778255803531414,
"grad_norm": 0.03723340108990669,
"kl": 0.00623321533203125,
"learning_rate": 2.481180732931282e-06,
"loss": 0.0077,
"step": 943
},
{
"clip_ratio": 0.00034986940886483353,
"epoch": 1.1797376833676914,
"grad_norm": 0.03732794523239136,
"kl": 0.006276607513427734,
"learning_rate": 2.4749078282315757e-06,
"loss": 0.0076,
"step": 944
},
{
"clip_ratio": 0.0003579597876637308,
"epoch": 1.1816497863822413,
"grad_norm": 0.03668594732880592,
"kl": 0.006198883056640625,
"learning_rate": 2.468635081521693e-06,
"loss": 0.0076,
"step": 945
},
{
"clip_ratio": 0.0,
"completion_length": 528.1718993186951,
"epoch": 1.1835618893967912,
"grad_norm": 0.03715552017092705,
"kl": 0.006759166717529297,
"learning_rate": 2.462362532297224e-06,
"loss": 0.0079,
"num_tokens": 72682654.0,
"reward": 0.06891741449362598,
"reward_std": 0.08248148870188743,
"rewards/pure_accuracy_reward_math": 0.06891741199069656,
"step": 946
},
{
"clip_ratio": 0.0003075862115053951,
"epoch": 1.185473992411341,
"grad_norm": 0.03616279736161232,
"kl": 0.006741523742675781,
"learning_rate": 2.456090220052514e-06,
"loss": 0.0079,
"step": 947
},
{
"clip_ratio": 0.00027696539024191225,
"epoch": 1.187386095425891,
"grad_norm": 0.03556762635707855,
"kl": 0.006789684295654297,
"learning_rate": 2.44981818428042e-06,
"loss": 0.0079,
"step": 948
},
{
"clip_ratio": 0.0002739789470638243,
"epoch": 1.189298198440441,
"grad_norm": 0.03486724570393562,
"kl": 0.006869316101074219,
"learning_rate": 2.4435464644720544e-06,
"loss": 0.0078,
"step": 949
},
{
"clip_ratio": 0.00031816330425726846,
"epoch": 1.1912103014549908,
"grad_norm": 0.03446395695209503,
"kl": 0.006869316101074219,
"learning_rate": 2.4372751001165427e-06,
"loss": 0.0077,
"step": 950
},
{
"clip_ratio": 0.0,
"completion_length": 528.6573901176453,
"epoch": 1.1931224044695408,
"grad_norm": 0.03734345734119415,
"kl": 0.006131649017333984,
"learning_rate": 2.4310041307007716e-06,
"loss": 0.0062,
"num_tokens": 76305578.0,
"reward": 0.07114955657743849,
"reward_std": 0.07526708883233368,
"rewards/pure_accuracy_reward_math": 0.07114955488941632,
"step": 951
},
{
"clip_ratio": 0.00029005661951941875,
"epoch": 1.1950345074840907,
"grad_norm": 0.036443449556827545,
"kl": 0.006079196929931641,
"learning_rate": 2.4247335957091418e-06,
"loss": 0.0062,
"step": 952
},
{
"clip_ratio": 0.0002579906781647878,
"epoch": 1.1969466104986406,
"grad_norm": 0.034940823912620544,
"kl": 0.006037235260009766,
"learning_rate": 2.4184635346233166e-06,
"loss": 0.0061,
"step": 953
},
{
"clip_ratio": 0.00032199256943954424,
"epoch": 1.1988587135131905,
"grad_norm": 0.03445851802825928,
"kl": 0.006024360656738281,
"learning_rate": 2.4121939869219784e-06,
"loss": 0.0061,
"step": 954
},
{
"clip_ratio": 0.0003193520489048751,
"epoch": 1.2007708165277404,
"grad_norm": 0.03448885306715965,
"kl": 0.005992889404296875,
"learning_rate": 2.405924992080573e-06,
"loss": 0.006,
"step": 955
},
{
"clip_ratio": 0.0,
"completion_length": 519.4358487129211,
"epoch": 1.2026829195422903,
"grad_norm": 0.11665105819702148,
"kl": 0.008374214172363281,
"learning_rate": 2.3996565895710692e-06,
"loss": 0.0065,
"num_tokens": 79904712.0,
"reward": 0.07366071760770865,
"reward_std": 0.08458104060264304,
"rewards/pure_accuracy_reward_math": 0.07366071591968648,
"step": 956
},
{
"clip_ratio": 0.00031160829769305565,
"epoch": 1.2045950225568403,
"grad_norm": 0.04096413403749466,
"kl": 0.006944179534912109,
"learning_rate": 2.3933888188617054e-06,
"loss": 0.0064,
"step": 957
},
{
"clip_ratio": 0.00032232171946589006,
"epoch": 1.2065071255713902,
"grad_norm": 0.04049144312739372,
"kl": 0.006976127624511719,
"learning_rate": 2.3871217194167407e-06,
"loss": 0.0063,
"step": 958
},
{
"clip_ratio": 0.0003416440970340773,
"epoch": 1.20841922858594,
"grad_norm": 0.039766065776348114,
"kl": 0.007042884826660156,
"learning_rate": 2.380855330696208e-06,
"loss": 0.0063,
"step": 959
},
{
"clip_ratio": 0.0003523347779150754,
"epoch": 1.21033133160049,
"grad_norm": 0.03884311020374298,
"kl": 0.007153987884521484,
"learning_rate": 2.3745896921556656e-06,
"loss": 0.0062,
"step": 960
},
{
"clip_ratio": 0.0,
"completion_length": 530.392322063446,
"epoch": 1.21224343461504,
"grad_norm": 0.04043371230363846,
"kl": 0.008221149444580078,
"learning_rate": 2.368324843245948e-06,
"loss": 0.0086,
"num_tokens": 83540930.0,
"reward": 0.07952009316068143,
"reward_std": 0.08836089639225975,
"rewards/pure_accuracy_reward_math": 0.0795200911234133,
"step": 961
},
{
"clip_ratio": 0.0003234188988017195,
"epoch": 1.2141555376295898,
"grad_norm": 0.039239391684532166,
"kl": 0.008275985717773438,
"learning_rate": 2.362060823412919e-06,
"loss": 0.0086,
"step": 962
},
{
"clip_ratio": 0.00033211900500873526,
"epoch": 1.2160676406441397,
"grad_norm": 0.03923904523253441,
"kl": 0.008409500122070312,
"learning_rate": 2.355797672097219e-06,
"loss": 0.0086,
"step": 963
},
{
"clip_ratio": 0.00036667373893806143,
"epoch": 1.2179797436586897,
"grad_norm": 0.038865529000759125,
"kl": 0.008434295654296875,
"learning_rate": 2.349535428734026e-06,
"loss": 0.0085,
"step": 964
},
{
"clip_ratio": 0.0003816600048480723,
"epoch": 1.2198918466732396,
"grad_norm": 0.037728771567344666,
"kl": 0.00834512710571289,
"learning_rate": 2.343274132752795e-06,
"loss": 0.0084,
"step": 965
},
{
"clip_ratio": 0.0,
"completion_length": 535.4799346923828,
"epoch": 1.2218039496877895,
"grad_norm": 0.03813539817929268,
"kl": 0.005985260009765625,
"learning_rate": 2.3370138235770184e-06,
"loss": 0.0088,
"num_tokens": 87187574.0,
"reward": 0.060267860419116914,
"reward_std": 0.07384576939512044,
"rewards/pure_accuracy_reward_math": 0.060267858498264104,
"step": 966
},
{
"clip_ratio": 0.0002719826344446119,
"epoch": 1.2237160527023394,
"grad_norm": 0.03676025941967964,
"kl": 0.006021976470947266,
"learning_rate": 2.330754540623975e-06,
"loss": 0.0088,
"step": 967
},
{
"clip_ratio": 0.0002730399019696961,
"epoch": 1.2256281557168893,
"grad_norm": 0.03579593822360039,
"kl": 0.006060123443603516,
"learning_rate": 2.324496323304484e-06,
"loss": 0.0088,
"step": 968
},
{
"clip_ratio": 0.0002800920712502375,
"epoch": 1.2275402587314392,
"grad_norm": 0.0353357158601284,
"kl": 0.0061092376708984375,
"learning_rate": 2.318239211022651e-06,
"loss": 0.0087,
"step": 969
},
{
"clip_ratio": 0.0003294056899108,
"epoch": 1.2294523617459892,
"grad_norm": 0.03521355986595154,
"kl": 0.006182193756103516,
"learning_rate": 2.3119832431756284e-06,
"loss": 0.0086,
"step": 970
},
{
"clip_ratio": 0.0,
"completion_length": 513.8870182037354,
"epoch": 1.231364464760539,
"grad_norm": 0.03882085531949997,
"kl": 0.006420135498046875,
"learning_rate": 2.3057284591533598e-06,
"loss": 0.0093,
"num_tokens": 90758753.0,
"reward": 0.07505580718861893,
"reward_std": 0.07715391827514395,
"rewards/pure_accuracy_reward_math": 0.0750558051513508,
"step": 971
},
{
"clip_ratio": 0.0003045887907546785,
"epoch": 1.2332765677750888,
"grad_norm": 0.03775356709957123,
"kl": 0.006350040435791016,
"learning_rate": 2.299474898338336e-06,
"loss": 0.0093,
"step": 972
},
{
"clip_ratio": 0.0003195773986703898,
"epoch": 1.235188670789639,
"grad_norm": 0.03639310225844383,
"kl": 0.006343841552734375,
"learning_rate": 2.2932226001053444e-06,
"loss": 0.0092,
"step": 973
},
{
"clip_ratio": 0.0003582680616318612,
"epoch": 1.2371007738041886,
"grad_norm": 0.036272380501031876,
"kl": 0.006300926208496094,
"learning_rate": 2.286971603821226e-06,
"loss": 0.0092,
"step": 974
},
{
"clip_ratio": 0.0003946863821511215,
"epoch": 1.2390128768187387,
"grad_norm": 0.03584066033363342,
"kl": 0.006391048431396484,
"learning_rate": 2.280721948844621e-06,
"loss": 0.0091,
"step": 975
},
{
"clip_ratio": 0.0,
"completion_length": 522.3044323921204,
"epoch": 1.2409249798332884,
"grad_norm": 0.038236722350120544,
"kl": 0.006694316864013672,
"learning_rate": 2.274473674525726e-06,
"loss": 0.0094,
"num_tokens": 94365488.0,
"reward": 0.06556919953436591,
"reward_std": 0.07405849196948111,
"rewards/pure_accuracy_reward_math": 0.06556919802096672,
"step": 976
},
{
"clip_ratio": 0.00029697347130763774,
"epoch": 1.2428370828478383,
"grad_norm": 0.0369977168738842,
"kl": 0.006660938262939453,
"learning_rate": 2.268226820206044e-06,
"loss": 0.0094,
"step": 977
},
{
"clip_ratio": 0.000319464833580696,
"epoch": 1.2447491858623883,
"grad_norm": 0.03550850227475166,
"kl": 0.006519794464111328,
"learning_rate": 2.261981425218138e-06,
"loss": 0.0094,
"step": 978
},
{
"clip_ratio": 0.0003469139706453461,
"epoch": 1.2466612888769382,
"grad_norm": 0.03525082767009735,
"kl": 0.006406307220458984,
"learning_rate": 2.2557375288853803e-06,
"loss": 0.0093,
"step": 979
},
{
"clip_ratio": 0.0003654695393606744,
"epoch": 1.248573391891488,
"grad_norm": 0.0355265848338604,
"kl": 0.006331443786621094,
"learning_rate": 2.2494951705217095e-06,
"loss": 0.0092,
"step": 980
},
{
"clip_ratio": 0.0,
"completion_length": 516.76704454422,
"epoch": 1.250485494906038,
"grad_norm": 0.03745350241661072,
"kl": 0.0065135955810546875,
"learning_rate": 2.2432543894313797e-06,
"loss": 0.0042,
"num_tokens": 97952525.0,
"reward": 0.06501116385334171,
"reward_std": 0.07316133996937424,
"rewards/pure_accuracy_reward_math": 0.06501116222352721,
"step": 981
},
{
"clip_ratio": 0.00029299165072416145,
"epoch": 1.252397597920588,
"grad_norm": 0.03690091893076897,
"kl": 0.006426095962524414,
"learning_rate": 2.2370152249087114e-06,
"loss": 0.0042,
"step": 982
},
{
"clip_ratio": 0.0003187885846500649,
"epoch": 1.2543097009351378,
"grad_norm": 0.03645962476730347,
"kl": 0.006396055221557617,
"learning_rate": 2.2307777162378523e-06,
"loss": 0.0042,
"step": 983
},
{
"clip_ratio": 0.00033352292155086616,
"epoch": 1.2562218039496877,
"grad_norm": 0.03598187491297722,
"kl": 0.006333351135253906,
"learning_rate": 2.2245419026925187e-06,
"loss": 0.0041,
"step": 984
},
{
"clip_ratio": 0.0003533332319989313,
"epoch": 1.2581339069642377,
"grad_norm": 0.03577181696891785,
"kl": 0.006278276443481445,
"learning_rate": 2.218307823535757e-06,
"loss": 0.004,
"step": 985
},
{
"clip_ratio": 0.0,
"completion_length": 522.8172650337219,
"epoch": 1.2600460099787876,
"grad_norm": 0.03590444475412369,
"kl": 0.005995273590087891,
"learning_rate": 2.2120755180196904e-06,
"loss": 0.0045,
"num_tokens": 101560026.0,
"reward": 0.06054687811410986,
"reward_std": 0.06865079078124836,
"rewards/pure_accuracy_reward_math": 0.06054687619325705,
"step": 986
},
{
"clip_ratio": 0.00024842098838462334,
"epoch": 1.2619581129933375,
"grad_norm": 0.03513624891638756,
"kl": 0.0059719085693359375,
"learning_rate": 2.2058450253852783e-06,
"loss": 0.0045,
"step": 987
},
{
"clip_ratio": 0.000271169978702801,
"epoch": 1.2638702160078874,
"grad_norm": 0.03392768278717995,
"kl": 0.005938529968261719,
"learning_rate": 2.1996163848620612e-06,
"loss": 0.0044,
"step": 988
},
{
"clip_ratio": 0.0002971922116898895,
"epoch": 1.2657823190224373,
"grad_norm": 0.03286145627498627,
"kl": 0.0060443878173828125,
"learning_rate": 2.1933896356679226e-06,
"loss": 0.0044,
"step": 989
},
{
"clip_ratio": 0.0003229031350429068,
"epoch": 1.2676944220369872,
"grad_norm": 0.032496001571416855,
"kl": 0.006091594696044922,
"learning_rate": 2.1871648170088347e-06,
"loss": 0.0043,
"step": 990
},
{
"clip_ratio": 0.0,
"completion_length": 535.8125224113464,
"epoch": 1.2696065250515371,
"grad_norm": 0.21526122093200684,
"kl": 0.007075309753417969,
"learning_rate": 2.1809419680786143e-06,
"loss": 0.0072,
"num_tokens": 105223050.0,
"reward": 0.07421875381260179,
"reward_std": 0.08054219774203375,
"rewards/pure_accuracy_reward_math": 0.07421875130967237,
"step": 991
},
{
"clip_ratio": 0.00032863151136552915,
"epoch": 1.271518628066087,
"grad_norm": 0.03788222745060921,
"kl": 0.006428241729736328,
"learning_rate": 2.1747211280586758e-06,
"loss": 0.0072,
"step": 992
},
{
"clip_ratio": 0.00034688404628013814,
"epoch": 1.273430731080637,
"grad_norm": 0.03719337284564972,
"kl": 0.0064296722412109375,
"learning_rate": 2.168502336117787e-06,
"loss": 0.0071,
"step": 993
},
{
"clip_ratio": 0.00034599834629034376,
"epoch": 1.275342834095187,
"grad_norm": 0.036535993218421936,
"kl": 0.006348133087158203,
"learning_rate": 2.1622856314118178e-06,
"loss": 0.0071,
"step": 994
},
{
"clip_ratio": 0.00036459101005448247,
"epoch": 1.2772549371097368,
"grad_norm": 0.03548647463321686,
"kl": 0.006353855133056641,
"learning_rate": 2.156071053083496e-06,
"loss": 0.007,
"step": 995
},
{
"clip_ratio": 0.0,
"completion_length": 529.536018371582,
"epoch": 1.2791670401242867,
"grad_norm": 0.03945273160934448,
"kl": 0.006157398223876953,
"learning_rate": 2.1498586402621646e-06,
"loss": 0.0062,
"num_tokens": 108847859.0,
"reward": 0.07366071807336994,
"reward_std": 0.072430647269357,
"rewards/pure_accuracy_reward_math": 0.07366071533760987,
"step": 996
},
{
"clip_ratio": 0.0002439655858097467,
"epoch": 1.2810791431388366,
"grad_norm": 0.03839760273694992,
"kl": 0.006161689758300781,
"learning_rate": 2.1436484320635275e-06,
"loss": 0.0061,
"step": 997
},
{
"clip_ratio": 0.0002514519866281262,
"epoch": 1.2829912461533866,
"grad_norm": 0.03733210638165474,
"kl": 0.0061798095703125,
"learning_rate": 2.1374404675894083e-06,
"loss": 0.0061,
"step": 998
},
{
"clip_ratio": 0.0002774860670342605,
"epoch": 1.2849033491679365,
"grad_norm": 0.03640332072973251,
"kl": 0.006183147430419922,
"learning_rate": 2.131234785927505e-06,
"loss": 0.006,
"step": 999
},
{
"clip_ratio": 0.0002877332713069336,
"epoch": 1.2868154521824864,
"grad_norm": 0.03559413552284241,
"kl": 0.006213665008544922,
"learning_rate": 2.1250314261511414e-06,
"loss": 0.0059,
"step": 1000
},
{
"clip_ratio": 0.0,
"completion_length": 528.9492444992065,
"epoch": 1.2887275551970363,
"grad_norm": 0.04216492921113968,
"kl": 0.0073282718658447266,
"learning_rate": 2.1188304273190196e-06,
"loss": 0.0102,
"num_tokens": 112482213.0,
"reward": 0.0772879500000272,
"reward_std": 0.07908701087580994,
"rewards/pure_accuracy_reward_math": 0.07728794772992842,
"step": 1001
},
{
"clip_ratio": 0.0003075964003755871,
"epoch": 1.2906396582115862,
"grad_norm": 0.039000045508146286,
"kl": 0.007200002670288086,
"learning_rate": 2.1126318284749807e-06,
"loss": 0.0102,
"step": 1002
},
{
"clip_ratio": 0.0003138856436635251,
"epoch": 1.2925517612261361,
"grad_norm": 0.036585696041584015,
"kl": 0.00716710090637207,
"learning_rate": 2.106435668647751e-06,
"loss": 0.0101,
"step": 1003
},
{
"clip_ratio": 0.00033263966838603665,
"epoch": 1.294463864240686,
"grad_norm": 0.03634057566523552,
"kl": 0.007274150848388672,
"learning_rate": 2.1002419868507005e-06,
"loss": 0.01,
"step": 1004
},
{
"clip_ratio": 0.00035104663936635916,
"epoch": 1.2963759672552357,
"grad_norm": 0.03524275869131088,
"kl": 0.0072422027587890625,
"learning_rate": 2.0940508220815978e-06,
"loss": 0.01,
"step": 1005
},
{
"clip_ratio": 0.0,
"completion_length": 519.5226221084595,
"epoch": 1.2982880702697859,
"grad_norm": 0.04047563299536705,
"kl": 0.006965160369873047,
"learning_rate": 2.087862213322362e-06,
"loss": 0.0078,
"num_tokens": 116078946.0,
"reward": 0.06752232470898889,
"reward_std": 0.08269421081058681,
"rewards/pure_accuracy_reward_math": 0.0675223229045514,
"step": 1006
},
{
"clip_ratio": 0.00033451643105308904,
"epoch": 1.3002001732843356,
"grad_norm": 0.03818976879119873,
"kl": 0.0069293975830078125,
"learning_rate": 2.0816761995388198e-06,
"loss": 0.0078,
"step": 1007
},
{
"clip_ratio": 0.0003828123747666723,
"epoch": 1.3021122762988857,
"grad_norm": 0.03969357907772064,
"kl": 0.006967067718505859,
"learning_rate": 2.075492819680457e-06,
"loss": 0.0078,
"step": 1008
},
{
"clip_ratio": 0.0003832018163620887,
"epoch": 1.3040243793134354,
"grad_norm": 0.040100231766700745,
"kl": 0.007086753845214844,
"learning_rate": 2.0693121126801778e-06,
"loss": 0.0077,
"step": 1009
},
{
"clip_ratio": 0.0003569153510625256,
"epoch": 1.3059364823279855,
"grad_norm": 0.037368252873420715,
"kl": 0.007195472717285156,
"learning_rate": 2.063134117454055e-06,
"loss": 0.0076,
"step": 1010
},
{
"clip_ratio": 0.0,
"completion_length": 514.7126340866089,
"epoch": 1.3078485853425352,
"grad_norm": 0.0401712991297245,
"kl": 0.00678253173828125,
"learning_rate": 2.0569588729010896e-06,
"loss": 0.0063,
"num_tokens": 119662772.0,
"reward": 0.0705915214784909,
"reward_std": 0.08484002540353686,
"rewards/pure_accuracy_reward_math": 0.0705915190919768,
"step": 1011
},
{
"clip_ratio": 0.0003401347770477514,
"epoch": 1.3097606883570854,
"grad_norm": 0.03972383588552475,
"kl": 0.006781578063964844,
"learning_rate": 2.0507864179029592e-06,
"loss": 0.0062,
"step": 1012
},
{
"clip_ratio": 0.00040657852025560715,
"epoch": 1.311672791371635,
"grad_norm": 0.04063359647989273,
"kl": 0.006711006164550781,
"learning_rate": 2.044616791323781e-06,
"loss": 0.0062,
"step": 1013
},
{
"clip_ratio": 0.0004189488300880839,
"epoch": 1.313584894386185,
"grad_norm": 0.03818094730377197,
"kl": 0.006552696228027344,
"learning_rate": 2.0384500320098604e-06,
"loss": 0.0061,
"step": 1014
},
{
"clip_ratio": 0.000448550158978378,
"epoch": 1.315496997400735,
"grad_norm": 0.03749743476510048,
"kl": 0.0064678192138671875,
"learning_rate": 2.032286178789454e-06,
"loss": 0.006,
"step": 1015
},
{
"clip_ratio": 0.0,
"completion_length": 529.0069990158081,
"epoch": 1.3174091004152848,
"grad_norm": 0.03775123134255409,
"kl": 0.006552696228027344,
"learning_rate": 2.0261252704725143e-06,
"loss": 0.0047,
"num_tokens": 123299241.0,
"reward": 0.06919643163564615,
"reward_std": 0.0781373989302665,
"rewards/pure_accuracy_reward_math": 0.06919642994762398,
"step": 1016
},
{
"clip_ratio": 0.0003128642913452495,
"epoch": 1.3193212034298347,
"grad_norm": 0.03666616231203079,
"kl": 0.006560325622558594,
"learning_rate": 2.0199673458504577e-06,
"loss": 0.0047,
"step": 1017
},
{
"clip_ratio": 0.00030665075905744743,
"epoch": 1.3212333064443846,
"grad_norm": 0.035805702209472656,
"kl": 0.006537437438964844,
"learning_rate": 2.01381244369591e-06,
"loss": 0.0046,
"step": 1018
},
{
"clip_ratio": 0.0003063842187316368,
"epoch": 1.3231454094589346,
"grad_norm": 0.03492369130253792,
"kl": 0.006512641906738281,
"learning_rate": 2.0076606027624676e-06,
"loss": 0.0046,
"step": 1019
},
{
"clip_ratio": 0.00033027163379983904,
"epoch": 1.3250575124734845,
"grad_norm": 0.03507117182016373,
"kl": 0.006590366363525391,
"learning_rate": 2.0015118617844516e-06,
"loss": 0.0045,
"step": 1020
},
{
"clip_ratio": 0.0,
"completion_length": 536.10493516922,
"epoch": 1.3269696154880344,
"grad_norm": 0.04077515751123428,
"kl": 0.006287097930908203,
"learning_rate": 1.9953662594766675e-06,
"loss": 0.007,
"num_tokens": 126958737.0,
"reward": 0.0756138427532278,
"reward_std": 0.08067478984594345,
"rewards/pure_accuracy_reward_math": 0.07561384083237499,
"step": 1021
},
{
"clip_ratio": 0.0003038725464534764,
"epoch": 1.3288817185025843,
"grad_norm": 0.03825462609529495,
"kl": 0.0063266754150390625,
"learning_rate": 1.9892238345341544e-06,
"loss": 0.007,
"step": 1022
},
{
"clip_ratio": 0.0003366774006963169,
"epoch": 1.3307938215171342,
"grad_norm": 0.03734288364648819,
"kl": 0.006364345550537109,
"learning_rate": 1.983084625631949e-06,
"loss": 0.0069,
"step": 1023
},
{
"clip_ratio": 0.0003749641306853846,
"epoch": 1.3327059245316841,
"grad_norm": 0.03799683600664139,
"kl": 0.006411075592041016,
"learning_rate": 1.9769486714248367e-06,
"loss": 0.0068,
"step": 1024
},
{
"clip_ratio": 0.0003729545476289786,
"epoch": 1.334618027546234,
"grad_norm": 0.03601997718214989,
"kl": 0.006434917449951172,
"learning_rate": 1.9708160105471105e-06,
"loss": 0.0068,
"step": 1025
},
{
"clip_ratio": 0.0,
"completion_length": 529.7709493637085,
"epoch": 1.336530130560784,
"grad_norm": 0.04102141782641411,
"kl": 0.006857395172119141,
"learning_rate": 1.964686681612327e-06,
"loss": 0.0055,
"num_tokens": 130592668.0,
"reward": 0.06556919959257357,
"reward_std": 0.06470447563333437,
"rewards/pure_accuracy_reward_math": 0.0655691981955897,
"step": 1026
},
{
"clip_ratio": 0.00021823535962539609,
"epoch": 1.3384422335753339,
"grad_norm": 0.03428492322564125,
"kl": 0.006598472595214844,
"learning_rate": 1.9585607232130636e-06,
"loss": 0.0054,
"step": 1027
},
{
"clip_ratio": 0.00024637427833340553,
"epoch": 1.3403543365898838,
"grad_norm": 0.032555270940065384,
"kl": 0.006415843963623047,
"learning_rate": 1.952438173920677e-06,
"loss": 0.0054,
"step": 1028
},
{
"clip_ratio": 0.0002563797440870985,
"epoch": 1.3422664396044337,
"grad_norm": 0.03202388435602188,
"kl": 0.006371498107910156,
"learning_rate": 1.946319072285058e-06,
"loss": 0.0053,
"step": 1029
},
{
"clip_ratio": 0.0002687414232696028,
"epoch": 1.3441785426189836,
"grad_norm": 0.03169838339090347,
"kl": 0.006340980529785156,
"learning_rate": 1.9402034568343888e-06,
"loss": 0.0053,
"step": 1030
},
{
"clip_ratio": 0.0,
"completion_length": 549.2184953689575,
"epoch": 1.3460906456335335,
"grad_norm": 0.054084766656160355,
"kl": 0.006264686584472656,
"learning_rate": 1.9340913660749015e-06,
"loss": 0.0071,
"num_tokens": 134289567.0,
"reward": 0.06668527112924494,
"reward_std": 0.07140090392204002,
"rewards/pure_accuracy_reward_math": 0.06668526903376915,
"step": 1031
},
{
"clip_ratio": 0.00022883353369707038,
"epoch": 1.3480027486480834,
"grad_norm": 0.03612653911113739,
"kl": 0.006344318389892578,
"learning_rate": 1.9279828384906373e-06,
"loss": 0.0071,
"step": 1032
},
{
"clip_ratio": 0.0002760976024376305,
"epoch": 1.3499148516626334,
"grad_norm": 0.036703869700431824,
"kl": 0.006397724151611328,
"learning_rate": 1.921877912543198e-06,
"loss": 0.0071,
"step": 1033
},
{
"clip_ratio": 0.00027991523592163503,
"epoch": 1.3518269546771833,
"grad_norm": 0.036445919424295425,
"kl": 0.006428718566894531,
"learning_rate": 1.9157766266715142e-06,
"loss": 0.007,
"step": 1034
},
{
"clip_ratio": 0.0003110420944381076,
"epoch": 1.3537390576917332,
"grad_norm": 0.032879918813705444,
"kl": 0.006253242492675781,
"learning_rate": 1.909679019291592e-06,
"loss": 0.0069,
"step": 1035
},
{
"clip_ratio": 0.0,
"completion_length": 525.200918674469,
"epoch": 1.355651160706283,
"grad_norm": 0.0374806709587574,
"kl": 0.006623744964599609,
"learning_rate": 1.9035851287962797e-06,
"loss": 0.0088,
"num_tokens": 137901395.0,
"reward": 0.07170759295695461,
"reward_std": 0.0834249026956968,
"rewards/pure_accuracy_reward_math": 0.0717075907450635,
"step": 1036
},
{
"clip_ratio": 0.0002719677876825699,
"epoch": 1.357563263720833,
"grad_norm": 0.03692527487874031,
"kl": 0.006625652313232422,
"learning_rate": 1.8974949935550202e-06,
"loss": 0.0088,
"step": 1037
},
{
"clip_ratio": 0.0003176050505544481,
"epoch": 1.359475366735383,
"grad_norm": 0.03605135530233383,
"kl": 0.006484031677246094,
"learning_rate": 1.8914086519136133e-06,
"loss": 0.0088,
"step": 1038
},
{
"clip_ratio": 0.0003420261080577802,
"epoch": 1.3613874697499329,
"grad_norm": 0.03582129627466202,
"kl": 0.006468296051025391,
"learning_rate": 1.8853261421939718e-06,
"loss": 0.0087,
"step": 1039
},
{
"clip_ratio": 0.00034158617637558564,
"epoch": 1.3632995727644825,
"grad_norm": 0.0346604622900486,
"kl": 0.006458282470703125,
"learning_rate": 1.8792475026938823e-06,
"loss": 0.0086,
"step": 1040
},
{
"clip_ratio": 0.0,
"completion_length": 525.6152620315552,
"epoch": 1.3652116757790327,
"grad_norm": 0.03809192404150963,
"kl": 0.006644248962402344,
"learning_rate": 1.8731727716867632e-06,
"loss": 0.0098,
"num_tokens": 141517968.0,
"reward": 0.07477678963914514,
"reward_std": 0.0749618403497152,
"rewards/pure_accuracy_reward_math": 0.07477678678696975,
"step": 1041
},
{
"clip_ratio": 0.0002677642194726104,
"epoch": 1.3671237787935824,
"grad_norm": 0.0377020426094532,
"kl": 0.0066089630126953125,
"learning_rate": 1.8671019874214237e-06,
"loss": 0.0098,
"step": 1042
},
{
"clip_ratio": 0.0002758102658617645,
"epoch": 1.3690358818081325,
"grad_norm": 0.03678804636001587,
"kl": 0.006642341613769531,
"learning_rate": 1.8610351881218211e-06,
"loss": 0.0098,
"step": 1043
},
{
"clip_ratio": 0.0002790037015074631,
"epoch": 1.3709479848226822,
"grad_norm": 0.03615477308630943,
"kl": 0.006649971008300781,
"learning_rate": 1.8549724119868235e-06,
"loss": 0.0097,
"step": 1044
},
{
"clip_ratio": 0.0002795595634097481,
"epoch": 1.3728600878372323,
"grad_norm": 0.03598296642303467,
"kl": 0.006653785705566406,
"learning_rate": 1.8489136971899658e-06,
"loss": 0.0096,
"step": 1045
},
{
"clip_ratio": 0.0,
"completion_length": 539.382839679718,
"epoch": 1.374772190851782,
"grad_norm": 0.03458879515528679,
"kl": 0.0064601898193359375,
"learning_rate": 1.8428590818792135e-06,
"loss": 0.0038,
"num_tokens": 145187116.0,
"reward": 0.06584821731667034,
"reward_std": 0.07200520334299654,
"rewards/pure_accuracy_reward_math": 0.06584821562864818,
"step": 1046
},
{
"clip_ratio": 0.00023162108237784196,
"epoch": 1.3766842938663322,
"grad_norm": 0.03385276347398758,
"kl": 0.006392478942871094,
"learning_rate": 1.836808604176719e-06,
"loss": 0.0038,
"step": 1047
},
{
"clip_ratio": 0.00026906593984676874,
"epoch": 1.3785963968808819,
"grad_norm": 0.0331512950360775,
"kl": 0.0062427520751953125,
"learning_rate": 1.8307623021785837e-06,
"loss": 0.0037,
"step": 1048
},
{
"clip_ratio": 0.00025022312701139526,
"epoch": 1.3805084998954318,
"grad_norm": 0.032765790820121765,
"kl": 0.006190299987792969,
"learning_rate": 1.8247202139546155e-06,
"loss": 0.0037,
"step": 1049
},
{
"clip_ratio": 0.0002507307134465009,
"epoch": 1.3824206029099817,
"grad_norm": 0.0325283482670784,
"kl": 0.006188869476318359,
"learning_rate": 1.8186823775480917e-06,
"loss": 0.0036,
"step": 1050
},
{
"clip_ratio": 0.0,
"completion_length": 539.5159296989441,
"epoch": 1.3843327059245316,
"grad_norm": 0.03628634661436081,
"kl": 0.007945537567138672,
"learning_rate": 1.8126488309755178e-06,
"loss": 0.0101,
"num_tokens": 148852261.0,
"reward": 0.06194196696742438,
"reward_std": 0.06792009872151539,
"rewards/pure_accuracy_reward_math": 0.06194196580327116,
"step": 1051
},
{
"clip_ratio": 0.00025563780241100176,
"epoch": 1.3862448089390815,
"grad_norm": 0.035264719277620316,
"kl": 0.007953643798828125,
"learning_rate": 1.80661961222639e-06,
"loss": 0.0101,
"step": 1052
},
{
"clip_ratio": 0.0002401949207069265,
"epoch": 1.3881569119536314,
"grad_norm": 0.034110233187675476,
"kl": 0.007923126220703125,
"learning_rate": 1.8005947592629551e-06,
"loss": 0.0101,
"step": 1053
},
{
"clip_ratio": 0.00026547102737595196,
"epoch": 1.3900690149681814,
"grad_norm": 0.03364601358771324,
"kl": 0.00788116455078125,
"learning_rate": 1.7945743100199706e-06,
"loss": 0.01,
"step": 1054
},
{
"clip_ratio": 0.0002951583905996813,
"epoch": 1.3919811179827313,
"grad_norm": 0.03397928550839424,
"kl": 0.007859230041503906,
"learning_rate": 1.788558302404466e-06,
"loss": 0.0099,
"step": 1055
},
{
"clip_ratio": 0.0,
"completion_length": 530.25337266922,
"epoch": 1.3938932209972812,
"grad_norm": 0.03863634541630745,
"kl": 0.006538867950439453,
"learning_rate": 1.7825467742955052e-06,
"loss": 0.0066,
"num_tokens": 152486009.0,
"reward": 0.06780134289874695,
"reward_std": 0.06736206321511418,
"rewards/pure_accuracy_reward_math": 0.06780134057044052,
"step": 1056
},
{
"clip_ratio": 0.00027592373527340897,
"epoch": 1.395805324011831,
"grad_norm": 0.036583587527275085,
"kl": 0.0065402984619140625,
"learning_rate": 1.7765397635439468e-06,
"loss": 0.0066,
"step": 1057
},
{
"clip_ratio": 0.0002849266509201698,
"epoch": 1.397717427026381,
"grad_norm": 0.03605053946375847,
"kl": 0.006500244140625,
"learning_rate": 1.7705373079722083e-06,
"loss": 0.0065,
"step": 1058
},
{
"clip_ratio": 0.0003116865132142266,
"epoch": 1.399629530040931,
"grad_norm": 0.03675729036331177,
"kl": 0.006489276885986328,
"learning_rate": 1.7645394453740227e-06,
"loss": 0.0064,
"step": 1059
},
{
"clip_ratio": 0.0003249485117748918,
"epoch": 1.4015416330554809,
"grad_norm": 0.03623329848051071,
"kl": 0.006478786468505859,
"learning_rate": 1.7585462135142083e-06,
"loss": 0.0064,
"step": 1060
},
{
"clip_ratio": 0.0,
"completion_length": 520.029598236084,
"epoch": 1.4034537360700308,
"grad_norm": 0.03506990894675255,
"kl": 0.006392955780029297,
"learning_rate": 1.752557650128423e-06,
"loss": 0.0096,
"num_tokens": 156082643.0,
"reward": 0.06194196664728224,
"reward_std": 0.07560620515141636,
"rewards/pure_accuracy_reward_math": 0.061941966181620955,
"step": 1061
},
{
"clip_ratio": 0.0002744606111662051,
"epoch": 1.4053658390845807,
"grad_norm": 0.03450053185224533,
"kl": 0.006424903869628906,
"learning_rate": 1.7465737929229317e-06,
"loss": 0.0096,
"step": 1062
},
{
"clip_ratio": 0.00027279697263793423,
"epoch": 1.4072779420991306,
"grad_norm": 0.033764585852622986,
"kl": 0.006496906280517578,
"learning_rate": 1.7405946795743665e-06,
"loss": 0.0096,
"step": 1063
},
{
"clip_ratio": 0.000298209258943416,
"epoch": 1.4091900451136805,
"grad_norm": 0.03335048630833626,
"kl": 0.0065898895263671875,
"learning_rate": 1.7346203477294916e-06,
"loss": 0.0095,
"step": 1064
},
{
"clip_ratio": 0.00030832760762677935,
"epoch": 1.4111021481282304,
"grad_norm": 0.03299354016780853,
"kl": 0.006653308868408203,
"learning_rate": 1.7286508350049627e-06,
"loss": 0.0094,
"step": 1065
},
{
"clip_ratio": 0.0,
"completion_length": 525.4023675918579,
"epoch": 1.4130142511427803,
"grad_norm": 0.04127517342567444,
"kl": 0.010558605194091797,
"learning_rate": 1.722686178987097e-06,
"loss": 0.0076,
"num_tokens": 159696133.0,
"reward": 0.06640625282307155,
"reward_std": 0.07264956791186705,
"rewards/pure_accuracy_reward_math": 0.06640625101863407,
"step": 1066
},
{
"clip_ratio": 0.00030437137564831573,
"epoch": 1.4149263541573303,
"grad_norm": 0.039496634155511856,
"kl": 0.010538101196289062,
"learning_rate": 1.7167264172316273e-06,
"loss": 0.0076,
"step": 1067
},
{
"clip_ratio": 0.0003244270092181978,
"epoch": 1.4168384571718802,
"grad_norm": 0.039376117289066315,
"kl": 0.010515689849853516,
"learning_rate": 1.7107715872634731e-06,
"loss": 0.0075,
"step": 1068
},
{
"clip_ratio": 0.0003491952173817481,
"epoch": 1.41875056018643,
"grad_norm": 0.03863466531038284,
"kl": 0.01038360595703125,
"learning_rate": 1.7048217265764993e-06,
"loss": 0.0075,
"step": 1069
},
{
"clip_ratio": 0.00037865171140083476,
"epoch": 1.42066266320098,
"grad_norm": 0.03795957565307617,
"kl": 0.010157585144042969,
"learning_rate": 1.6988768726332856e-06,
"loss": 0.0074,
"step": 1070
},
{
"clip_ratio": 0.0,
"completion_length": 512.8691644668579,
"epoch": 1.42257476621553,
"grad_norm": 0.04360206797719002,
"kl": 0.0067138671875,
"learning_rate": 1.6929370628648828e-06,
"loss": 0.0086,
"num_tokens": 163268528.0,
"reward": 0.08565848623402417,
"reward_std": 0.08861368341604248,
"rewards/pure_accuracy_reward_math": 0.08565848384751007,
"step": 1071
},
{
"clip_ratio": 0.00031944918799808875,
"epoch": 1.4244868692300798,
"grad_norm": 0.04292250797152519,
"kl": 0.006737709045410156,
"learning_rate": 1.6870023346705866e-06,
"loss": 0.0085,
"step": 1072
},
{
"clip_ratio": 0.00031442818647064996,
"epoch": 1.4263989722446297,
"grad_norm": 0.04044810310006142,
"kl": 0.006873607635498047,
"learning_rate": 1.6810727254176937e-06,
"loss": 0.0085,
"step": 1073
},
{
"clip_ratio": 0.0003650832475727839,
"epoch": 1.4283110752591797,
"grad_norm": 0.04156485199928284,
"kl": 0.006984233856201172,
"learning_rate": 1.6751482724412716e-06,
"loss": 0.0084,
"step": 1074
},
{
"clip_ratio": 0.0003947964444250829,
"epoch": 1.4302231782737296,
"grad_norm": 0.04023054987192154,
"kl": 0.007004737854003906,
"learning_rate": 1.669229013043921e-06,
"loss": 0.0083,
"step": 1075
},
{
"clip_ratio": 0.0,
"completion_length": 512.7343969345093,
"epoch": 1.4321352812882795,
"grad_norm": 0.03780645504593849,
"kl": 0.006886005401611328,
"learning_rate": 1.6633149844955415e-06,
"loss": 0.0094,
"num_tokens": 166836260.0,
"reward": 0.0797991111758165,
"reward_std": 0.08157813875004649,
"rewards/pure_accuracy_reward_math": 0.07979910867288709,
"step": 1076
},
{
"clip_ratio": 0.0002608302990552147,
"epoch": 1.4340473843028292,
"grad_norm": 0.03681138530373573,
"kl": 0.006786823272705078,
"learning_rate": 1.6574062240330996e-06,
"loss": 0.0093,
"step": 1077
},
{
"clip_ratio": 0.00031450060896531795,
"epoch": 1.4359594873173793,
"grad_norm": 0.036778852343559265,
"kl": 0.0066986083984375,
"learning_rate": 1.651502768860389e-06,
"loss": 0.0093,
"step": 1078
},
{
"clip_ratio": 0.0003176571812559814,
"epoch": 1.437871590331929,
"grad_norm": 0.03592304140329361,
"kl": 0.006758213043212891,
"learning_rate": 1.6456046561478023e-06,
"loss": 0.0092,
"step": 1079
},
{
"clip_ratio": 0.0003236016519281293,
"epoch": 1.4397836933464792,
"grad_norm": 0.03520684316754341,
"kl": 0.006850242614746094,
"learning_rate": 1.6397119230320919e-06,
"loss": 0.0092,
"step": 1080
},
{
"clip_ratio": 0.0,
"completion_length": 508.80498933792114,
"epoch": 1.4416957963610288,
"grad_norm": 0.04630957916378975,
"kl": 0.01150655746459961,
"learning_rate": 1.633824606616138e-06,
"loss": 0.008,
"num_tokens": 170392081.0,
"reward": 0.07589286129223183,
"reward_std": 0.08140548272058368,
"rewards/pure_accuracy_reward_math": 0.07589285844005644,
"step": 1081
},
{
"clip_ratio": 0.00028873196572476445,
"epoch": 1.443607899375579,
"grad_norm": 0.04534924402832985,
"kl": 0.01107931137084961,
"learning_rate": 1.6279427439687154e-06,
"loss": 0.008,
"step": 1082
},
{
"clip_ratio": 0.000319909158235987,
"epoch": 1.4455200023901287,
"grad_norm": 0.044707395136356354,
"kl": 0.010364532470703125,
"learning_rate": 1.622066372124262e-06,
"loss": 0.0079,
"step": 1083
},
{
"clip_ratio": 0.0003388643909829625,
"epoch": 1.4474321054046788,
"grad_norm": 0.038643479347229004,
"kl": 0.009525775909423828,
"learning_rate": 1.6161955280826399e-06,
"loss": 0.0078,
"step": 1084
},
{
"clip_ratio": 0.0003223289492098047,
"epoch": 1.4493442084192285,
"grad_norm": 0.12098709493875504,
"kl": 0.010370254516601562,
"learning_rate": 1.6103302488089104e-06,
"loss": 0.0078,
"step": 1085
},
{
"clip_ratio": 0.0,
"completion_length": 520.3169894218445,
"epoch": 1.4512563114337784,
"grad_norm": 0.03693209961056709,
"kl": 0.006680965423583984,
"learning_rate": 1.6044705712330932e-06,
"loss": 0.0059,
"num_tokens": 173992817.0,
"reward": 0.07031250311410986,
"reward_std": 0.07530715462053195,
"rewards/pure_accuracy_reward_math": 0.07031250142608769,
"step": 1086
},
{
"clip_ratio": 0.0002918191117657898,
"epoch": 1.4531684144483283,
"grad_norm": 0.03641385957598686,
"kl": 0.0065898895263671875,
"learning_rate": 1.5986165322499398e-06,
"loss": 0.0059,
"step": 1087
},
{
"clip_ratio": 0.0002921736467840219,
"epoch": 1.4550805174628783,
"grad_norm": 0.03598758950829506,
"kl": 0.006548881530761719,
"learning_rate": 1.5927681687186964e-06,
"loss": 0.0058,
"step": 1088
},
{
"clip_ratio": 0.0003169650843233285,
"epoch": 1.4569926204774282,
"grad_norm": 0.036268141120672226,
"kl": 0.006561756134033203,
"learning_rate": 1.5869255174628778e-06,
"loss": 0.0058,
"step": 1089
},
{
"clip_ratio": 0.0003259218068478731,
"epoch": 1.458904723491978,
"grad_norm": 0.03529893979430199,
"kl": 0.006597042083740234,
"learning_rate": 1.5810886152700302e-06,
"loss": 0.0057,
"step": 1090
},
{
"clip_ratio": 0.0,
"completion_length": 533.391206741333,
"epoch": 1.460816826506528,
"grad_norm": 0.04034799709916115,
"kl": 0.006509304046630859,
"learning_rate": 1.5752574988915004e-06,
"loss": 0.0066,
"num_tokens": 177633359.0,
"reward": 0.07477678920258768,
"reward_std": 0.0747891838545911,
"rewards/pure_accuracy_reward_math": 0.07477678699069656,
"step": 1091
},
{
"clip_ratio": 0.0002679697158214367,
"epoch": 1.462728929521078,
"grad_norm": 0.039328683167696,
"kl": 0.006606101989746094,
"learning_rate": 1.5694322050422096e-06,
"loss": 0.0066,
"step": 1092
},
{
"clip_ratio": 0.0002975759220475993,
"epoch": 1.4646410325356278,
"grad_norm": 0.03947217017412186,
"kl": 0.00665283203125,
"learning_rate": 1.5636127704004133e-06,
"loss": 0.0065,
"step": 1093
},
{
"clip_ratio": 0.0003127538088278925,
"epoch": 1.4665531355501777,
"grad_norm": 0.03733786940574646,
"kl": 0.006627559661865234,
"learning_rate": 1.5577992316074783e-06,
"loss": 0.0064,
"step": 1094
},
{
"clip_ratio": 0.00035554791872982605,
"epoch": 1.4684652385647277,
"grad_norm": 0.03660706803202629,
"kl": 0.0065364837646484375,
"learning_rate": 1.5519916252676482e-06,
"loss": 0.0064,
"step": 1095
},
{
"clip_ratio": 0.0,
"completion_length": 530.1163763999939,
"epoch": 1.4703773415792776,
"grad_norm": 0.06871657073497772,
"kl": 0.010003089904785156,
"learning_rate": 1.5461899879478133e-06,
"loss": 0.0057,
"num_tokens": 181268648.0,
"reward": 0.0744977711874526,
"reward_std": 0.08333237702026963,
"rewards/pure_accuracy_reward_math": 0.0744977695576381,
"step": 1096
},
{
"clip_ratio": 0.00032988911306119917,
"epoch": 1.4722894445938275,
"grad_norm": 0.04868275299668312,
"kl": 0.009030342102050781,
"learning_rate": 1.5403943561772789e-06,
"loss": 0.0057,
"step": 1097
},
{
"clip_ratio": 0.0003833602018517013,
"epoch": 1.4742015476083774,
"grad_norm": 0.04073934629559517,
"kl": 0.00842428207397461,
"learning_rate": 1.5346047664475422e-06,
"loss": 0.0056,
"step": 1098
},
{
"clip_ratio": 0.00040459603366116426,
"epoch": 1.4761136506229273,
"grad_norm": 0.04011493921279907,
"kl": 0.008179187774658203,
"learning_rate": 1.5288212552120524e-06,
"loss": 0.0055,
"step": 1099
},
{
"clip_ratio": 0.0004078742092019638,
"epoch": 1.4780257536374772,
"grad_norm": 0.03785649687051773,
"kl": 0.008193016052246094,
"learning_rate": 1.5230438588859881e-06,
"loss": 0.0054,
"step": 1100
},
{
"clip_ratio": 0.0,
"completion_length": 541.5837321281433,
"epoch": 1.4799378566520272,
"grad_norm": 0.04047717526555061,
"kl": 0.007642269134521484,
"learning_rate": 1.517272613846027e-06,
"loss": 0.0051,
"num_tokens": 184939348.0,
"reward": 0.06863839572179131,
"reward_std": 0.07131457631476223,
"rewards/pure_accuracy_reward_math": 0.06863839420839213,
"step": 1101
},
{
"clip_ratio": 0.00026072144959243815,
"epoch": 1.481849959666577,
"grad_norm": 0.037731293588876724,
"kl": 0.007551670074462891,
"learning_rate": 1.511507556430114e-06,
"loss": 0.0051,
"step": 1102
},
{
"clip_ratio": 0.00029216510773721893,
"epoch": 1.483762062681127,
"grad_norm": 0.03771767392754555,
"kl": 0.007477760314941406,
"learning_rate": 1.5057487229372347e-06,
"loss": 0.0051,
"step": 1103
},
{
"clip_ratio": 0.0003181908435294645,
"epoch": 1.485674165695677,
"grad_norm": 0.03619125112891197,
"kl": 0.0074062347412109375,
"learning_rate": 1.4999961496271889e-06,
"loss": 0.005,
"step": 1104
},
{
"clip_ratio": 0.0003646736843165854,
"epoch": 1.4875862687102268,
"grad_norm": 0.035048868507146835,
"kl": 0.007380008697509766,
"learning_rate": 1.4942498727203578e-06,
"loss": 0.0049,
"step": 1105
},
{
"clip_ratio": 0.0,
"completion_length": 541.8585615158081,
"epoch": 1.4894983717247767,
"grad_norm": 0.0386812798678875,
"kl": 0.006747245788574219,
"learning_rate": 1.4885099283974774e-06,
"loss": 0.0071,
"num_tokens": 188614221.0,
"reward": 0.07198661062284373,
"reward_std": 0.08140548341907561,
"rewards/pure_accuracy_reward_math": 0.07198660864378326,
"step": 1106
},
{
"clip_ratio": 0.0003357146362077401,
"epoch": 1.4914104747393266,
"grad_norm": 0.03723128139972687,
"kl": 0.006694316864013672,
"learning_rate": 1.482776352799414e-06,
"loss": 0.0071,
"step": 1107
},
{
"clip_ratio": 0.0003692662889989151,
"epoch": 1.4933225777538766,
"grad_norm": 0.038370903581380844,
"kl": 0.006665706634521484,
"learning_rate": 1.4770491820269317e-06,
"loss": 0.007,
"step": 1108
},
{
"clip_ratio": 0.00040588962588117283,
"epoch": 1.4952346807684265,
"grad_norm": 0.037489671260118484,
"kl": 0.006663322448730469,
"learning_rate": 1.4713284521404678e-06,
"loss": 0.0069,
"step": 1109
},
{
"clip_ratio": 0.00039138679812822375,
"epoch": 1.4971467837829764,
"grad_norm": 0.03641659393906593,
"kl": 0.006697654724121094,
"learning_rate": 1.465614199159905e-06,
"loss": 0.0069,
"step": 1110
},
{
"clip_ratio": 0.0,
"completion_length": 520.476583480835,
"epoch": 1.4990588867975263,
"grad_norm": 1.8961507081985474,
"kl": 0.03508758544921875,
"learning_rate": 1.4599064590643472e-06,
"loss": 0.0056,
"num_tokens": 192212657.0,
"reward": 0.0753348250000272,
"reward_std": 0.07783834805013612,
"rewards/pure_accuracy_reward_math": 0.07533482302096672,
"step": 1111
},
{
"clip_ratio": 0.00029740781877762856,
"epoch": 1.500970989812076,
"grad_norm": 0.08476530015468597,
"kl": 0.011601448059082031,
"learning_rate": 1.4542052677918885e-06,
"loss": 0.0047,
"step": 1112
},
{
"clip_ratio": 0.0003210891072171762,
"epoch": 1.5028830928266261,
"grad_norm": 0.04907820373773575,
"kl": 0.010628223419189453,
"learning_rate": 1.4485106612393897e-06,
"loss": 0.0046,
"step": 1113
},
{
"clip_ratio": 0.00033912417364945213,
"epoch": 1.5047951958411758,
"grad_norm": 0.04438456520438194,
"kl": 0.010659217834472656,
"learning_rate": 1.4428226752622509e-06,
"loss": 0.0046,
"step": 1114
},
{
"clip_ratio": 0.0003756833369834567,
"epoch": 1.506707298855726,
"grad_norm": 0.0422808900475502,
"kl": 0.010442733764648438,
"learning_rate": 1.437141345674189e-06,
"loss": 0.0045,
"step": 1115
},
{
"clip_ratio": 0.0,
"completion_length": 535.0778713226318,
"epoch": 1.5086194018702757,
"grad_norm": 0.048265133053064346,
"kl": 0.007592678070068359,
"learning_rate": 1.4314667082470064e-06,
"loss": 0.0086,
"num_tokens": 195861088.0,
"reward": 0.07142857479630038,
"reward_std": 0.08346496871672571,
"rewards/pure_accuracy_reward_math": 0.07142857287544757,
"step": 1116
},
{
"clip_ratio": 0.0003429410510875641,
"epoch": 1.5105315048848258,
"grad_norm": 0.04287589713931084,
"kl": 0.007152557373046875,
"learning_rate": 1.4257987987103727e-06,
"loss": 0.0085,
"step": 1117
},
{
"clip_ratio": 0.0003726668836634417,
"epoch": 1.5124436078993755,
"grad_norm": 0.0397462397813797,
"kl": 0.006825447082519531,
"learning_rate": 1.420137652751593e-06,
"loss": 0.0085,
"step": 1118
},
{
"clip_ratio": 0.0003763367328133427,
"epoch": 1.5143557109139256,
"grad_norm": 0.03851110488176346,
"kl": 0.006707668304443359,
"learning_rate": 1.4144833060153887e-06,
"loss": 0.0084,
"step": 1119
},
{
"clip_ratio": 0.0003624607439292049,
"epoch": 1.5162678139284753,
"grad_norm": 0.03720558434724808,
"kl": 0.00676727294921875,
"learning_rate": 1.408835794103669e-06,
"loss": 0.0083,
"step": 1120
},
{
"clip_ratio": 0.0,
"completion_length": 524.7569994926453,
"epoch": 1.5181799169430255,
"grad_norm": 0.03832938149571419,
"kl": 0.008425712585449219,
"learning_rate": 1.4031951525753088e-06,
"loss": 0.0071,
"num_tokens": 199475701.0,
"reward": 0.08565848635043949,
"reward_std": 0.08179086120799184,
"rewards/pure_accuracy_reward_math": 0.08565848338184878,
"step": 1121
},
{
"clip_ratio": 0.00028257126655262255,
"epoch": 1.5200920199575751,
"grad_norm": 0.038414496928453445,
"kl": 0.008458137512207031,
"learning_rate": 1.3975614169459253e-06,
"loss": 0.0071,
"step": 1122
},
{
"clip_ratio": 0.0003134008442202685,
"epoch": 1.5220041229721253,
"grad_norm": 0.03928304836153984,
"kl": 0.008496284484863281,
"learning_rate": 1.391934622687652e-06,
"loss": 0.0071,
"step": 1123
},
{
"clip_ratio": 0.00030222541431612626,
"epoch": 1.523916225986675,
"grad_norm": 0.038087427616119385,
"kl": 0.008494377136230469,
"learning_rate": 1.38631480522892e-06,
"loss": 0.007,
"step": 1124
},
{
"clip_ratio": 0.0002927070846396873,
"epoch": 1.525828329001225,
"grad_norm": 0.03641984984278679,
"kl": 0.008457183837890625,
"learning_rate": 1.3807019999542287e-06,
"loss": 0.0069,
"step": 1125
},
{
"clip_ratio": 0.0,
"completion_length": 531.1537666320801,
"epoch": 1.5277404320157748,
"grad_norm": 0.040940940380096436,
"kl": 0.006596565246582031,
"learning_rate": 1.3750962422039269e-06,
"loss": 0.0058,
"num_tokens": 203109136.0,
"reward": 0.07254464621655643,
"reward_std": 0.08217623952077702,
"rewards/pure_accuracy_reward_math": 0.07254464400466532,
"step": 1126
},
{
"clip_ratio": 0.00031519718078243386,
"epoch": 1.5296525350303247,
"grad_norm": 0.038493506610393524,
"kl": 0.006714344024658203,
"learning_rate": 1.369497567273989e-06,
"loss": 0.0058,
"step": 1127
},
{
"clip_ratio": 0.0003513000764314711,
"epoch": 1.5315646380448746,
"grad_norm": 0.039495162665843964,
"kl": 0.006772041320800781,
"learning_rate": 1.3639060104157964e-06,
"loss": 0.0057,
"step": 1128
},
{
"clip_ratio": 0.00033387296190312554,
"epoch": 1.5334767410594246,
"grad_norm": 0.03875305503606796,
"kl": 0.006872653961181641,
"learning_rate": 1.3583216068359078e-06,
"loss": 0.0057,
"step": 1129
},
{
"clip_ratio": 0.00036185752793471693,
"epoch": 1.5353888440739745,
"grad_norm": 0.03817266598343849,
"kl": 0.006899356842041016,
"learning_rate": 1.3527443916958466e-06,
"loss": 0.0056,
"step": 1130
},
{
"clip_ratio": 0.0,
"completion_length": 537.4143671989441,
"epoch": 1.5373009470885244,
"grad_norm": 0.035565100610256195,
"kl": 0.006679058074951172,
"learning_rate": 1.3471744001118718e-06,
"loss": 0.0091,
"num_tokens": 206769717.0,
"reward": 0.07533482497092336,
"reward_std": 0.07436373975360766,
"rewards/pure_accuracy_reward_math": 0.07533482293365523,
"step": 1131
},
{
"clip_ratio": 0.00028060592541123697,
"epoch": 1.5392130501030743,
"grad_norm": 0.036901701241731644,
"kl": 0.006720542907714844,
"learning_rate": 1.3416116671547613e-06,
"loss": 0.0091,
"step": 1132
},
{
"clip_ratio": 0.00034766932589036514,
"epoch": 1.5411251531176242,
"grad_norm": 0.03489091992378235,
"kl": 0.006618499755859375,
"learning_rate": 1.3360562278495899e-06,
"loss": 0.009,
"step": 1133
},
{
"clip_ratio": 0.0003513962886927402,
"epoch": 1.5430372561321741,
"grad_norm": 0.035007573664188385,
"kl": 0.0066070556640625,
"learning_rate": 1.3305081171755092e-06,
"loss": 0.009,
"step": 1134
},
{
"clip_ratio": 0.00036896456708745973,
"epoch": 1.544949359146724,
"grad_norm": 0.03363417461514473,
"kl": 0.006587028503417969,
"learning_rate": 1.3249673700655246e-06,
"loss": 0.0089,
"step": 1135
},
{
"clip_ratio": 0.0,
"completion_length": 531.2251925468445,
"epoch": 1.546861462161274,
"grad_norm": 0.037738338112831116,
"kl": 0.006687164306640625,
"learning_rate": 1.3194340214062828e-06,
"loss": 0.0066,
"num_tokens": 210404892.0,
"reward": 0.07477678978466429,
"reward_std": 0.08492635452421382,
"rewards/pure_accuracy_reward_math": 0.07477678699069656,
"step": 1136
},
{
"clip_ratio": 0.0003166603274848967,
"epoch": 1.5487735651758239,
"grad_norm": 0.03711307421326637,
"kl": 0.0067272186279296875,
"learning_rate": 1.3139081060378423e-06,
"loss": 0.0066,
"step": 1137
},
{
"clip_ratio": 0.00032532861348499864,
"epoch": 1.5506856681903738,
"grad_norm": 0.0381547249853611,
"kl": 0.006831169128417969,
"learning_rate": 1.3083896587534606e-06,
"loss": 0.0065,
"step": 1138
},
{
"clip_ratio": 0.0003168874280845557,
"epoch": 1.5525977712049237,
"grad_norm": 0.03702245280146599,
"kl": 0.0068492889404296875,
"learning_rate": 1.3028787142993723e-06,
"loss": 0.0064,
"step": 1139
},
{
"clip_ratio": 0.00031372528076190065,
"epoch": 1.5545098742194736,
"grad_norm": 0.035462986677885056,
"kl": 0.0068511962890625,
"learning_rate": 1.297375307374574e-06,
"loss": 0.0063,
"step": 1140
},
{
"clip_ratio": 0.0,
"completion_length": 528.9913792610168,
"epoch": 1.5564219772340235,
"grad_norm": 0.0402364507317543,
"kl": 0.006835460662841797,
"learning_rate": 1.2918794726306003e-06,
"loss": 0.0099,
"num_tokens": 214034825.0,
"reward": 0.07310268151923083,
"reward_std": 0.07917333993827924,
"rewards/pure_accuracy_reward_math": 0.07310268000583164,
"step": 1141
},
{
"clip_ratio": 0.0003137970834359294,
"epoch": 1.5583340802485734,
"grad_norm": 0.03920648992061615,
"kl": 0.006829738616943359,
"learning_rate": 1.2863912446713084e-06,
"loss": 0.0098,
"step": 1142
},
{
"clip_ratio": 0.00032378236608110456,
"epoch": 1.5602461832631231,
"grad_norm": 0.03806397691369057,
"kl": 0.006905078887939453,
"learning_rate": 1.2809106580526636e-06,
"loss": 0.0098,
"step": 1143
},
{
"clip_ratio": 0.0003143088524097948,
"epoch": 1.5621582862776733,
"grad_norm": 0.03801356628537178,
"kl": 0.006966590881347656,
"learning_rate": 1.2754377472825153e-06,
"loss": 0.0097,
"step": 1144
},
{
"clip_ratio": 0.00035796050920566813,
"epoch": 1.564070389292223,
"grad_norm": 0.036964964121580124,
"kl": 0.006992816925048828,
"learning_rate": 1.2699725468203832e-06,
"loss": 0.0096,
"step": 1145
},
{
"clip_ratio": 0.0,
"completion_length": 538.6370244026184,
"epoch": 1.565982492306773,
"grad_norm": 0.045449208468198776,
"kl": 0.007224559783935547,
"learning_rate": 1.2645150910772413e-06,
"loss": 0.0043,
"num_tokens": 217697304.0,
"reward": 0.07393973600119352,
"reward_std": 0.08620888477889821,
"rewards/pure_accuracy_reward_math": 0.07393973361467943,
"step": 1146
},
{
"clip_ratio": 0.0003596847872131548,
"epoch": 1.5678945953213228,
"grad_norm": 0.03882161155343056,
"kl": 0.006949901580810547,
"learning_rate": 1.2590654144152992e-06,
"loss": 0.0043,
"step": 1147
},
{
"clip_ratio": 0.0004527134210547956,
"epoch": 1.569806698335873,
"grad_norm": 0.03764580935239792,
"kl": 0.00691986083984375,
"learning_rate": 1.2536235511477852e-06,
"loss": 0.0043,
"step": 1148
},
{
"clip_ratio": 0.0005161078099717997,
"epoch": 1.5717188013504226,
"grad_norm": 0.03833252564072609,
"kl": 0.006892681121826172,
"learning_rate": 1.2481895355387341e-06,
"loss": 0.0042,
"step": 1149
},
{
"clip_ratio": 0.0005320426059824968,
"epoch": 1.5736309043649728,
"grad_norm": 0.03876457363367081,
"kl": 0.006943702697753906,
"learning_rate": 1.2427634018027673e-06,
"loss": 0.0041,
"step": 1150
},
{
"clip_ratio": 0.0,
"completion_length": 529.9707288742065,
"epoch": 1.5755430073795225,
"grad_norm": 0.03937402740120888,
"kl": 0.007305145263671875,
"learning_rate": 1.2373451841048781e-06,
"loss": 0.0078,
"num_tokens": 221325451.0,
"reward": 0.08258928963914514,
"reward_std": 0.08058846154017374,
"rewards/pure_accuracy_reward_math": 0.08258928655413911,
"step": 1151
},
{
"clip_ratio": 0.0002857717965980555,
"epoch": 1.5774551103940726,
"grad_norm": 0.03863917291164398,
"kl": 0.007287502288818359,
"learning_rate": 1.2319349165602202e-06,
"loss": 0.0078,
"step": 1152
},
{
"clip_ratio": 0.0002796752659151025,
"epoch": 1.5793672134086223,
"grad_norm": 0.03722836822271347,
"kl": 0.007286548614501953,
"learning_rate": 1.2265326332338875e-06,
"loss": 0.0077,
"step": 1153
},
{
"clip_ratio": 0.00034041513032434523,
"epoch": 1.5812793164231724,
"grad_norm": 0.03688417002558708,
"kl": 0.007335662841796875,
"learning_rate": 1.2211383681407022e-06,
"loss": 0.0076,
"step": 1154
},
{
"clip_ratio": 0.0003595712430524145,
"epoch": 1.5831914194377221,
"grad_norm": 0.037124987691640854,
"kl": 0.007359981536865234,
"learning_rate": 1.2157521552450035e-06,
"loss": 0.0076,
"step": 1155
},
{
"clip_ratio": 0.0,
"completion_length": 540.098798751831,
"epoch": 1.5851035224522723,
"grad_norm": 0.03577388823032379,
"kl": 0.0069561004638671875,
"learning_rate": 1.210374028460428e-06,
"loss": 0.0065,
"num_tokens": 224996253.0,
"reward": 0.06863839607103728,
"reward_std": 0.07376563857542351,
"rewards/pure_accuracy_reward_math": 0.06863839426659979,
"step": 1156
},
{
"clip_ratio": 0.00025091522741149674,
"epoch": 1.587015625466822,
"grad_norm": 0.03386949375271797,
"kl": 0.006894588470458984,
"learning_rate": 1.2050040216497e-06,
"loss": 0.0065,
"step": 1157
},
{
"clip_ratio": 0.00029767470277874963,
"epoch": 1.588927728481372,
"grad_norm": 0.033231545239686966,
"kl": 0.0068531036376953125,
"learning_rate": 1.1996421686244179e-06,
"loss": 0.0064,
"step": 1158
},
{
"clip_ratio": 0.00030627386024661973,
"epoch": 1.5908398314959218,
"grad_norm": 0.0327543206512928,
"kl": 0.006781578063964844,
"learning_rate": 1.1942885031448397e-06,
"loss": 0.0064,
"step": 1159
},
{
"clip_ratio": 0.00032285955057886895,
"epoch": 1.5927519345104717,
"grad_norm": 0.03283894062042236,
"kl": 0.006725788116455078,
"learning_rate": 1.1889430589196727e-06,
"loss": 0.0063,
"step": 1160
},
{
"clip_ratio": 0.0,
"completion_length": 540.7405333518982,
"epoch": 1.5946640375250216,
"grad_norm": 0.04240734875202179,
"kl": 0.006897449493408203,
"learning_rate": 1.183605869605858e-06,
"loss": 0.0064,
"num_tokens": 228663991.0,
"reward": 0.08091518227593042,
"reward_std": 0.08951703325146809,
"rewards/pure_accuracy_reward_math": 0.08091518018045463,
"step": 1161
},
{
"clip_ratio": 0.00035278943187222467,
"epoch": 1.5965761405395715,
"grad_norm": 0.04050403833389282,
"kl": 0.006961345672607422,
"learning_rate": 1.1782769688083647e-06,
"loss": 0.0064,
"step": 1162
},
{
"clip_ratio": 0.00034535837551175064,
"epoch": 1.5984882435541214,
"grad_norm": 0.03872028365731239,
"kl": 0.007065296173095703,
"learning_rate": 1.1729563900799695e-06,
"loss": 0.0063,
"step": 1163
},
{
"clip_ratio": 0.00037939938943054585,
"epoch": 1.6004003465686714,
"grad_norm": 0.039447493851184845,
"kl": 0.007191181182861328,
"learning_rate": 1.1676441669210543e-06,
"loss": 0.0063,
"step": 1164
},
{
"clip_ratio": 0.00037003348657549395,
"epoch": 1.6023124495832213,
"grad_norm": 0.03724885359406471,
"kl": 0.0071163177490234375,
"learning_rate": 1.1623403327793881e-06,
"loss": 0.0061,
"step": 1165
},
{
"clip_ratio": 0.0,
"completion_length": 531.3211750984192,
"epoch": 1.6042245525977712,
"grad_norm": 0.9447879791259766,
"kl": 0.03227043151855469,
"learning_rate": 1.1570449210499213e-06,
"loss": 0.0085,
"num_tokens": 232302082.0,
"reward": 0.07756696781143546,
"reward_std": 0.0780110054765828,
"rewards/pure_accuracy_reward_math": 0.07756696577416733,
"step": 1166
},
{
"clip_ratio": 0.00036849399879201883,
"epoch": 1.606136655612321,
"grad_norm": 0.26742058992385864,
"kl": 0.011518478393554688,
"learning_rate": 1.1517579650745713e-06,
"loss": 0.0079,
"step": 1167
},
{
"clip_ratio": 0.00029733346730154153,
"epoch": 1.608048758626871,
"grad_norm": 0.3907225728034973,
"kl": 0.017581462860107422,
"learning_rate": 1.1464794981420187e-06,
"loss": 0.0079,
"step": 1168
},
{
"clip_ratio": 0.0003680569542439116,
"epoch": 1.609960861641421,
"grad_norm": 0.1778813600540161,
"kl": 0.010699748992919922,
"learning_rate": 1.1412095534874912e-06,
"loss": 0.0077,
"step": 1169
},
{
"clip_ratio": 0.0003726620370798628,
"epoch": 1.6118729646559709,
"grad_norm": 0.2035137563943863,
"kl": 0.01429891586303711,
"learning_rate": 1.135948164292557e-06,
"loss": 0.0077,
"step": 1170
},
{
"clip_ratio": 0.0,
"completion_length": 519.0362968444824,
"epoch": 1.6137850676705208,
"grad_norm": 0.040138401091098785,
"kl": 0.008060932159423828,
"learning_rate": 1.130695363684916e-06,
"loss": 0.0096,
"num_tokens": 235898380.0,
"reward": 0.0630580390279647,
"reward_std": 0.07195894001051784,
"rewards/pure_accuracy_reward_math": 0.06305803687428124,
"step": 1171
},
{
"clip_ratio": 0.0002708259837049809,
"epoch": 1.6156971706850707,
"grad_norm": 0.03859123960137367,
"kl": 0.008191585540771484,
"learning_rate": 1.1254511847381922e-06,
"loss": 0.0096,
"step": 1172
},
{
"clip_ratio": 0.00029455311903348047,
"epoch": 1.6176092736996206,
"grad_norm": 0.03898981586098671,
"kl": 0.008168697357177734,
"learning_rate": 1.1202156604717234e-06,
"loss": 0.0095,
"step": 1173
},
{
"clip_ratio": 0.0003440694692926627,
"epoch": 1.6195213767141705,
"grad_norm": 0.0370321087539196,
"kl": 0.00800466537475586,
"learning_rate": 1.1149888238503537e-06,
"loss": 0.0094,
"step": 1174
},
{
"clip_ratio": 0.00040963905792068545,
"epoch": 1.6214334797287204,
"grad_norm": 0.03698049858212471,
"kl": 0.007803440093994141,
"learning_rate": 1.109770707784229e-06,
"loss": 0.0094,
"step": 1175
},
{
"clip_ratio": 0.0,
"completion_length": 525.937527179718,
"epoch": 1.6233455827432703,
"grad_norm": 0.039002615958452225,
"kl": 0.007039546966552734,
"learning_rate": 1.1045613451285837e-06,
"loss": 0.0074,
"num_tokens": 239513448.0,
"reward": 0.06584821754950099,
"reward_std": 0.07595151895657182,
"rewards/pure_accuracy_reward_math": 0.06584821516298689,
"step": 1176
},
{
"clip_ratio": 0.0003209126220440339,
"epoch": 1.6252576857578203,
"grad_norm": 0.038693126291036606,
"kl": 0.0069637298583984375,
"learning_rate": 1.0993607686835408e-06,
"loss": 0.0074,
"step": 1177
},
{
"clip_ratio": 0.0003234959946212257,
"epoch": 1.62716978877237,
"grad_norm": 0.03805870935320854,
"kl": 0.006987094879150391,
"learning_rate": 1.0941690111939002e-06,
"loss": 0.0073,
"step": 1178
},
{
"clip_ratio": 0.0003316311403978034,
"epoch": 1.62908189178692,
"grad_norm": 0.03687576577067375,
"kl": 0.0070285797119140625,
"learning_rate": 1.0889861053489341e-06,
"loss": 0.0072,
"step": 1179
},
{
"clip_ratio": 0.00033663610071243966,
"epoch": 1.6309939948014698,
"grad_norm": 0.03717907890677452,
"kl": 0.007116794586181641,
"learning_rate": 1.0838120837821814e-06,
"loss": 0.0071,
"step": 1180
},
{
"clip_ratio": 0.0,
"completion_length": 514.2112393379211,
"epoch": 1.63290609781602,
"grad_norm": 0.04346395656466484,
"kl": 0.007472515106201172,
"learning_rate": 1.0786469790712441e-06,
"loss": 0.0059,
"num_tokens": 243092265.0,
"reward": 0.07700893233413808,
"reward_std": 0.07526089128805324,
"rewards/pure_accuracy_reward_math": 0.07700893029686995,
"step": 1181
},
{
"clip_ratio": 0.0002878125141592136,
"epoch": 1.6348182008305696,
"grad_norm": 0.03890342637896538,
"kl": 0.007323265075683594,
"learning_rate": 1.0734908237375783e-06,
"loss": 0.0059,
"step": 1182
},
{
"clip_ratio": 0.00031910790164602076,
"epoch": 1.6367303038451197,
"grad_norm": 0.03748926892876625,
"kl": 0.007243156433105469,
"learning_rate": 1.0683436502462915e-06,
"loss": 0.0058,
"step": 1183
},
{
"clip_ratio": 0.00036283263597169935,
"epoch": 1.6386424068596694,
"grad_norm": 0.037570755928754807,
"kl": 0.007138252258300781,
"learning_rate": 1.0632054910059391e-06,
"loss": 0.0058,
"step": 1184
},
{
"clip_ratio": 0.00039574184188495565,
"epoch": 1.6405545098742196,
"grad_norm": 0.038306284695863724,
"kl": 0.007193088531494141,
"learning_rate": 1.0580763783683187e-06,
"loss": 0.0057,
"step": 1185
},
{
"clip_ratio": 0.0,
"completion_length": 518.925525188446,
"epoch": 1.6424666128887693,
"grad_norm": 0.04251728951931,
"kl": 0.007372379302978516,
"learning_rate": 1.0529563446282665e-06,
"loss": 0.01,
"num_tokens": 246686482.0,
"reward": 0.08537946754950099,
"reward_std": 0.08939063869183883,
"rewards/pure_accuracy_reward_math": 0.08537946551223285,
"step": 1186
},
{
"clip_ratio": 0.0003136689152256622,
"epoch": 1.6443787159033194,
"grad_norm": 0.04087135195732117,
"kl": 0.007419109344482422,
"learning_rate": 1.0478454220234568e-06,
"loss": 0.0099,
"step": 1187
},
{
"clip_ratio": 0.0003467907941399062,
"epoch": 1.646290818917869,
"grad_norm": 0.039666056632995605,
"kl": 0.007442951202392578,
"learning_rate": 1.0427436427341939e-06,
"loss": 0.0099,
"step": 1188
},
{
"clip_ratio": 0.00038431568484043055,
"epoch": 1.6482029219324192,
"grad_norm": 0.0389142706990242,
"kl": 0.007426738739013672,
"learning_rate": 1.0376510388832147e-06,
"loss": 0.0098,
"step": 1189
},
{
"clip_ratio": 0.000490980125164242,
"epoch": 1.650115024946969,
"grad_norm": 0.03956843912601471,
"kl": 0.007406711578369141,
"learning_rate": 1.0325676425354828e-06,
"loss": 0.0097,
"step": 1190
},
{
"clip_ratio": 0.0,
"completion_length": 508.4835596084595,
"epoch": 1.652027127961519,
"grad_norm": 0.04898946359753609,
"kl": 0.008952617645263672,
"learning_rate": 1.0274934856979876e-06,
"loss": 0.0069,
"num_tokens": 250241299.0,
"reward": 0.07868303955183364,
"reward_std": 0.08381028211442754,
"rewards/pure_accuracy_reward_math": 0.07868303728173487,
"step": 1191
},
{
"clip_ratio": 0.0002854310730526777,
"epoch": 1.6539392309760688,
"grad_norm": 0.04304199293255806,
"kl": 0.008716106414794922,
"learning_rate": 1.0224286003195437e-06,
"loss": 0.0069,
"step": 1192
},
{
"clip_ratio": 0.00029722766299755676,
"epoch": 1.655851333990619,
"grad_norm": 0.039751190692186356,
"kl": 0.008554935455322266,
"learning_rate": 1.017373018290588e-06,
"loss": 0.0068,
"step": 1193
},
{
"clip_ratio": 0.00036785421832519205,
"epoch": 1.6577634370051686,
"grad_norm": 0.039316095411777496,
"kl": 0.00851297378540039,
"learning_rate": 1.0123267714429826e-06,
"loss": 0.0067,
"step": 1194
},
{
"clip_ratio": 0.0003976103018885624,
"epoch": 1.6596755400197185,
"grad_norm": 0.03880908712744713,
"kl": 0.008470535278320312,
"learning_rate": 1.0072898915498094e-06,
"loss": 0.0067,
"step": 1195
},
{
"clip_ratio": 0.0,
"completion_length": 514.2179379463196,
"epoch": 1.6615876430342684,
"grad_norm": 0.04073133319616318,
"kl": 0.0076427459716796875,
"learning_rate": 1.0022624103251727e-06,
"loss": 0.0095,
"num_tokens": 253820892.0,
"reward": 0.08593750416184776,
"reward_std": 0.08978221646975726,
"rewards/pure_accuracy_reward_math": 0.08593750165891834,
"step": 1196
},
{
"clip_ratio": 0.0003768215759691884,
"epoch": 1.6634997460488183,
"grad_norm": 0.039870597422122955,
"kl": 0.007634639739990234,
"learning_rate": 9.972443594239997e-07,
"loss": 0.0095,
"step": 1197
},
{
"clip_ratio": 0.00033531371116168884,
"epoch": 1.6654118490633683,
"grad_norm": 0.039165791124105453,
"kl": 0.007609367370605469,
"learning_rate": 9.922357704418394e-07,
"loss": 0.0094,
"step": 1198
},
{
"clip_ratio": 0.0003830786464504854,
"epoch": 1.6673239520779182,
"grad_norm": 0.0393473282456398,
"kl": 0.0076847076416015625,
"learning_rate": 9.872366749146684e-07,
"loss": 0.0094,
"step": 1199
},
{
"clip_ratio": 0.0003766370310813727,
"epoch": 1.669236055092468,
"grad_norm": 0.037378448992967606,
"kl": 0.007641792297363281,
"learning_rate": 9.822471043186846e-07,
"loss": 0.0093,
"step": 1200
},
{
"clip_ratio": 0.0,
"completion_length": 502.35381841659546,
"epoch": 1.671148158107018,
"grad_norm": 0.051170479506254196,
"kl": 0.008347511291503906,
"learning_rate": 9.772670900701172e-07,
"loss": 0.0074,
"num_tokens": 257360516.0,
"reward": 0.08537946784053929,
"reward_std": 0.09248606633627787,
"rewards/pure_accuracy_reward_math": 0.0853794660361018,
"step": 1201
},
{
"clip_ratio": 0.00036896339207714846,
"epoch": 1.673060261121568,
"grad_norm": 0.04540196433663368,
"kl": 0.008112430572509766,
"learning_rate": 9.722966635250222e-07,
"loss": 0.0074,
"step": 1202
},
{
"clip_ratio": 0.00040850058093155894,
"epoch": 1.6749723641361178,
"grad_norm": 0.0428830124437809,
"kl": 0.007869243621826172,
"learning_rate": 9.673358559790892e-07,
"loss": 0.0073,
"step": 1203
},
{
"clip_ratio": 0.0004735397765216476,
"epoch": 1.6768844671506677,
"grad_norm": 0.04445512220263481,
"kl": 0.007699012756347656,
"learning_rate": 9.623846986674417e-07,
"loss": 0.0072,
"step": 1204
},
{
"clip_ratio": 0.00047387216932293086,
"epoch": 1.6787965701652177,
"grad_norm": 0.04317403957247734,
"kl": 0.0076007843017578125,
"learning_rate": 9.574432227644432e-07,
"loss": 0.0071,
"step": 1205
},
{
"clip_ratio": 0.0,
"completion_length": 511.88367557525635,
"epoch": 1.6807086731797676,
"grad_norm": 0.041338611394166946,
"kl": 0.007639884948730469,
"learning_rate": 9.525114593834975e-07,
"loss": 0.0077,
"num_tokens": 260924667.0,
"reward": 0.07617187869618647,
"reward_std": 0.08037573983892798,
"rewards/pure_accuracy_reward_math": 0.0761718759604264,
"step": 1206
},
{
"clip_ratio": 0.00029646307336861355,
"epoch": 1.6826207761943175,
"grad_norm": 0.040457833558321,
"kl": 0.007670402526855469,
"learning_rate": 9.475894395768579e-07,
"loss": 0.0077,
"step": 1207
},
{
"clip_ratio": 0.0003306309376966965,
"epoch": 1.6845328792088674,
"grad_norm": 0.03946809470653534,
"kl": 0.0076751708984375,
"learning_rate": 9.426771943354249e-07,
"loss": 0.0076,
"step": 1208
},
{
"clip_ratio": 0.0003582578942200598,
"epoch": 1.6864449822234173,
"grad_norm": 0.04006471857428551,
"kl": 0.007700443267822266,
"learning_rate": 9.377747545885569e-07,
"loss": 0.0075,
"step": 1209
},
{
"clip_ratio": 0.00040392828321955676,
"epoch": 1.6883570852379672,
"grad_norm": 0.04037889465689659,
"kl": 0.007681369781494141,
"learning_rate": 9.328821512038716e-07,
"loss": 0.0074,
"step": 1210
},
{
"clip_ratio": 0.0,
"completion_length": 533.6010298728943,
"epoch": 1.6902691882525172,
"grad_norm": 0.03628333657979965,
"kl": 0.006788730621337891,
"learning_rate": 9.279994149870539e-07,
"loss": 0.0073,
"num_tokens": 264564517.0,
"reward": 0.06110491382423788,
"reward_std": 0.06693661888130009,
"rewards/pure_accuracy_reward_math": 0.06110491219442338,
"step": 1211
},
{
"clip_ratio": 0.0002594580842014693,
"epoch": 1.692181291267067,
"grad_norm": 0.034194085747003555,
"kl": 0.006678581237792969,
"learning_rate": 9.231265766816619e-07,
"loss": 0.0073,
"step": 1212
},
{
"clip_ratio": 0.0003170226998463477,
"epoch": 1.6940933942816168,
"grad_norm": 0.035113800317049026,
"kl": 0.006625652313232422,
"learning_rate": 9.182636669689335e-07,
"loss": 0.0073,
"step": 1213
},
{
"clip_ratio": 0.0003448430217076748,
"epoch": 1.696005497296167,
"grad_norm": 0.03626548498868942,
"kl": 0.006573200225830078,
"learning_rate": 9.134107164675898e-07,
"loss": 0.0072,
"step": 1214
},
{
"clip_ratio": 0.00033195262278695736,
"epoch": 1.6979176003107166,
"grad_norm": 0.03465663269162178,
"kl": 0.006582736968994141,
"learning_rate": 9.085677557336465e-07,
"loss": 0.0071,
"step": 1215
},
{
"clip_ratio": 0.0,
"completion_length": 527.8440546989441,
"epoch": 1.6998297033252667,
"grad_norm": 0.038788389414548874,
"kl": 0.009612560272216797,
"learning_rate": 9.037348152602199e-07,
"loss": 0.0052,
"num_tokens": 268179390.0,
"reward": 0.07756696798605844,
"reward_std": 0.0852254037745297,
"rewards/pure_accuracy_reward_math": 0.07756696571595967,
"step": 1216
},
{
"clip_ratio": 0.00027092215094626226,
"epoch": 1.7017418063398164,
"grad_norm": 0.038229282945394516,
"kl": 0.009754657745361328,
"learning_rate": 8.989119254773343e-07,
"loss": 0.0052,
"step": 1217
},
{
"clip_ratio": 0.00027246196253827293,
"epoch": 1.7036539093543666,
"grad_norm": 0.03782220929861069,
"kl": 0.009780406951904297,
"learning_rate": 8.940991167517313e-07,
"loss": 0.0051,
"step": 1218
},
{
"clip_ratio": 0.0003069629718197575,
"epoch": 1.7055660123689163,
"grad_norm": 0.03707100450992584,
"kl": 0.00977468490600586,
"learning_rate": 8.892964193866799e-07,
"loss": 0.005,
"step": 1219
},
{
"clip_ratio": 0.0003035257008150438,
"epoch": 1.7074781153834664,
"grad_norm": 0.03552490472793579,
"kl": 0.009665966033935547,
"learning_rate": 8.845038636217818e-07,
"loss": 0.0049,
"step": 1220
},
{
"clip_ratio": 0.0,
"completion_length": 529.9601240158081,
"epoch": 1.709390218398016,
"grad_norm": 0.04051567241549492,
"kl": 0.007312297821044922,
"learning_rate": 8.797214796327843e-07,
"loss": 0.0079,
"num_tokens": 271808667.0,
"reward": 0.08733259368455037,
"reward_std": 0.08496641932288185,
"rewards/pure_accuracy_reward_math": 0.0873325903667137,
"step": 1221
},
{
"clip_ratio": 0.00033132852740891394,
"epoch": 1.7113023214125662,
"grad_norm": 0.03887411206960678,
"kl": 0.007235527038574219,
"learning_rate": 8.749492975313897e-07,
"loss": 0.0079,
"step": 1222
},
{
"clip_ratio": 0.0003587238066984355,
"epoch": 1.713214424427116,
"grad_norm": 0.04010055959224701,
"kl": 0.007251739501953125,
"learning_rate": 8.701873473650643e-07,
"loss": 0.0079,
"step": 1223
},
{
"clip_ratio": 0.0003504625653079074,
"epoch": 1.715126527441666,
"grad_norm": 0.039550576359033585,
"kl": 0.007262229919433594,
"learning_rate": 8.654356591168522e-07,
"loss": 0.0078,
"step": 1224
},
{
"clip_ratio": 0.0003497420942721874,
"epoch": 1.7170386304562157,
"grad_norm": 0.03883340209722519,
"kl": 0.007348537445068359,
"learning_rate": 8.60694262705182e-07,
"loss": 0.0077,
"step": 1225
},
{
"clip_ratio": 0.0,
"completion_length": 530.5396447181702,
"epoch": 1.7189507334707659,
"grad_norm": 0.037610165774822235,
"kl": 0.007049083709716797,
"learning_rate": 8.559631879836838e-07,
"loss": 0.0065,
"num_tokens": 275440789.0,
"reward": 0.07896205675206147,
"reward_std": 0.07938606152310967,
"rewards/pure_accuracy_reward_math": 0.07896205494762398,
"step": 1226
},
{
"clip_ratio": 0.0002787316387298233,
"epoch": 1.7208628364853156,
"grad_norm": 0.03763109818100929,
"kl": 0.007136821746826172,
"learning_rate": 8.512424647409964e-07,
"loss": 0.0065,
"step": 1227
},
{
"clip_ratio": 0.0003178273858566172,
"epoch": 1.7227749394998657,
"grad_norm": 0.037824735045433044,
"kl": 0.007121562957763672,
"learning_rate": 8.465321227005823e-07,
"loss": 0.0065,
"step": 1228
},
{
"clip_ratio": 0.0002866029928725311,
"epoch": 1.7246870425144154,
"grad_norm": 0.03616493567824364,
"kl": 0.00708770751953125,
"learning_rate": 8.418321915205399e-07,
"loss": 0.0064,
"step": 1229
},
{
"clip_ratio": 0.00031164622902224437,
"epoch": 1.7265991455289653,
"grad_norm": 0.03562076762318611,
"kl": 0.007038593292236328,
"learning_rate": 8.371427007934174e-07,
"loss": 0.0063,
"step": 1230
},
{
"clip_ratio": 0.0,
"completion_length": 536.3178272247314,
"epoch": 1.7285112485435152,
"grad_norm": 0.03759186714887619,
"kl": 0.006800651550292969,
"learning_rate": 8.324636800460242e-07,
"loss": 0.0071,
"num_tokens": 279097568.0,
"reward": 0.07728794903960079,
"reward_std": 0.07732657541055232,
"rewards/pure_accuracy_reward_math": 0.07728794822469354,
"step": 1231
},
{
"clip_ratio": 0.00028705537579298834,
"epoch": 1.7304233515580651,
"grad_norm": 0.036786679178476334,
"kl": 0.006786346435546875,
"learning_rate": 8.277951587392505e-07,
"loss": 0.0071,
"step": 1232
},
{
"clip_ratio": 0.000303516245821811,
"epoch": 1.732335454572615,
"grad_norm": 0.03563455864787102,
"kl": 0.0068149566650390625,
"learning_rate": 8.231371662678741e-07,
"loss": 0.0071,
"step": 1233
},
{
"clip_ratio": 0.0003096325264095867,
"epoch": 1.734247557587165,
"grad_norm": 0.03413652628660202,
"kl": 0.006861209869384766,
"learning_rate": 8.184897319603813e-07,
"loss": 0.007,
"step": 1234
},
{
"clip_ratio": 0.0003550405467649398,
"epoch": 1.736159660601715,
"grad_norm": 0.03433661162853241,
"kl": 0.006935596466064453,
"learning_rate": 8.138528850787792e-07,
"loss": 0.0069,
"step": 1235
},
{
"clip_ratio": 0.0,
"completion_length": 516.8069453239441,
"epoch": 1.7380717636162648,
"grad_norm": 0.2546544671058655,
"kl": 0.012326240539550781,
"learning_rate": 8.092266548184139e-07,
"loss": 0.011,
"num_tokens": 282683384.0,
"reward": 0.07477678873692639,
"reward_std": 0.08165826951153576,
"rewards/pure_accuracy_reward_math": 0.07477678751456551,
"step": 1236
},
{
"clip_ratio": 0.00030172572752462656,
"epoch": 1.7399838666308147,
"grad_norm": 0.042716413736343384,
"kl": 0.0078887939453125,
"learning_rate": 8.046110703077839e-07,
"loss": 0.0108,
"step": 1237
},
{
"clip_ratio": 0.00029401268267292835,
"epoch": 1.7418959696453646,
"grad_norm": 0.038783252239227295,
"kl": 0.007707118988037109,
"learning_rate": 8.000061606083579e-07,
"loss": 0.0107,
"step": 1238
},
{
"clip_ratio": 0.00028625389199987694,
"epoch": 1.7438080726599146,
"grad_norm": 0.0381159707903862,
"kl": 0.007790088653564453,
"learning_rate": 7.954119547143935e-07,
"loss": 0.0107,
"step": 1239
},
{
"clip_ratio": 0.00034677153644224745,
"epoch": 1.7457201756744645,
"grad_norm": 0.038590554147958755,
"kl": 0.007785797119140625,
"learning_rate": 7.90828481552752e-07,
"loss": 0.0106,
"step": 1240
},
{
"clip_ratio": 0.0,
"completion_length": 517.8047132492065,
"epoch": 1.7476322786890144,
"grad_norm": 0.03943649306893349,
"kl": 0.007458209991455078,
"learning_rate": 7.862557699827167e-07,
"loss": 0.0092,
"num_tokens": 286269120.0,
"reward": 0.06640625282307155,
"reward_std": 0.07607791275950149,
"rewards/pure_accuracy_reward_math": 0.06640625130967237,
"step": 1241
},
{
"clip_ratio": 0.00031282668544463377,
"epoch": 1.7495443817035643,
"grad_norm": 0.0388050340116024,
"kl": 0.007348060607910156,
"learning_rate": 7.816938487958131e-07,
"loss": 0.0092,
"step": 1242
},
{
"clip_ratio": 0.0003194147345197962,
"epoch": 1.7514564847181142,
"grad_norm": 0.038322921842336655,
"kl": 0.007298946380615234,
"learning_rate": 7.771427467156256e-07,
"loss": 0.0091,
"step": 1243
},
{
"clip_ratio": 0.0003203335651846828,
"epoch": 1.7533685877326641,
"grad_norm": 0.037499312311410904,
"kl": 0.007254600524902344,
"learning_rate": 7.726024923976169e-07,
"loss": 0.009,
"step": 1244
},
{
"clip_ratio": 0.00032696440513291236,
"epoch": 1.755280690747214,
"grad_norm": 0.03671669587492943,
"kl": 0.007252693176269531,
"learning_rate": 7.680731144289505e-07,
"loss": 0.009,
"step": 1245
},
{
"clip_ratio": 0.0,
"completion_length": 514.8644180297852,
"epoch": 1.757192793761764,
"grad_norm": 0.04826434701681137,
"kl": 0.0094451904296875,
"learning_rate": 7.635546413283054e-07,
"loss": 0.0078,
"num_tokens": 289848950.0,
"reward": 0.07421875323052518,
"reward_std": 0.07818366138963029,
"rewards/pure_accuracy_reward_math": 0.074218751717126,
"step": 1246
},
{
"clip_ratio": 0.000299703156713349,
"epoch": 1.7591048967763139,
"grad_norm": 0.03791136294603348,
"kl": 0.009324073791503906,
"learning_rate": 7.590471015457002e-07,
"loss": 0.0077,
"step": 1247
},
{
"clip_ratio": 0.00030542989918558305,
"epoch": 1.7610169997908636,
"grad_norm": 0.03703403100371361,
"kl": 0.009335517883300781,
"learning_rate": 7.545505234623152e-07,
"loss": 0.0077,
"step": 1248
},
{
"clip_ratio": 0.0002983629839832247,
"epoch": 1.7629291028054137,
"grad_norm": 0.0363752581179142,
"kl": 0.009361743927001953,
"learning_rate": 7.500649353903092e-07,
"loss": 0.0076,
"step": 1249
},
{
"clip_ratio": 0.0002923785563098136,
"epoch": 1.7648412058199634,
"grad_norm": 0.03587965667247772,
"kl": 0.009373664855957031,
"learning_rate": 7.455903655726437e-07,
"loss": 0.0075,
"step": 1250
},
{
"clip_ratio": 0.0,
"completion_length": 510.6543188095093,
"epoch": 1.7667533088345135,
"grad_norm": 0.03651593253016472,
"kl": 0.008678436279296875,
"learning_rate": 7.411268421829076e-07,
"loss": 0.0059,
"num_tokens": 293408275.0,
"reward": 0.07031250264844857,
"reward_std": 0.07401842658873647,
"rewards/pure_accuracy_reward_math": 0.07031250160071068,
"step": 1251
},
{
"clip_ratio": 0.000244510552590782,
"epoch": 1.7686654118490632,
"grad_norm": 0.03525623679161072,
"kl": 0.008609294891357422,
"learning_rate": 7.366743933251349e-07,
"loss": 0.0059,
"step": 1252
},
{
"clip_ratio": 0.000242228649824483,
"epoch": 1.7705775148636134,
"grad_norm": 0.035115260630846024,
"kl": 0.008548259735107422,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0058,
"step": 1253
},
{
"clip_ratio": 0.0002641637478291159,
"epoch": 1.772489617878163,
"grad_norm": 0.03518166393041611,
"kl": 0.008442401885986328,
"learning_rate": 7.278028312727961e-07,
"loss": 0.0058,
"step": 1254
},
{
"clip_ratio": 0.0002555919315909705,
"epoch": 1.7744017208927132,
"grad_norm": 0.03385892137885094,
"kl": 0.00841379165649414,
"learning_rate": 7.233837739369462e-07,
"loss": 0.0057,
"step": 1255
},
{
"clip_ratio": 0.0,
"completion_length": 513.7271451950073,
"epoch": 1.776313823907263,
"grad_norm": 0.03341628611087799,
"kl": 0.006855964660644531,
"learning_rate": 7.189759028501417e-07,
"loss": 0.0062,
"num_tokens": 296984393.0,
"reward": 0.06556919915601611,
"reward_std": 0.06311669771093875,
"rewards/pure_accuracy_reward_math": 0.06556919775903225,
"step": 1256
},
{
"clip_ratio": 0.0002122660096688378,
"epoch": 1.778225926921813,
"grad_norm": 0.03227659687399864,
"kl": 0.006803989410400391,
"learning_rate": 7.145792457660083e-07,
"loss": 0.0062,
"step": 1257
},
{
"clip_ratio": 0.00023682935608348998,
"epoch": 1.7801380299363627,
"grad_norm": 0.03206360712647438,
"kl": 0.006758213043212891,
"learning_rate": 7.101938303675674e-07,
"loss": 0.0062,
"step": 1258
},
{
"clip_ratio": 0.0002413284565250251,
"epoch": 1.7820501329509129,
"grad_norm": 0.031279318034648895,
"kl": 0.006762981414794922,
"learning_rate": 7.058196842670548e-07,
"loss": 0.0061,
"step": 1259
},
{
"clip_ratio": 0.0002680151189338176,
"epoch": 1.7839622359654626,
"grad_norm": 0.031049314886331558,
"kl": 0.006676197052001953,
"learning_rate": 7.014568350057516e-07,
"loss": 0.0061,
"step": 1260
},
{
"clip_ratio": 0.0,
"completion_length": 532.2553224563599,
"epoch": 1.7858743389800127,
"grad_norm": 0.03635333850979805,
"kl": 0.007339000701904297,
"learning_rate": 6.971053100538116e-07,
"loss": 0.0066,
"num_tokens": 300622928.0,
"reward": 0.0711495568684768,
"reward_std": 0.07668221119092777,
"rewards/pure_accuracy_reward_math": 0.07114955512224697,
"step": 1261
},
{
"clip_ratio": 0.00025942773436327116,
"epoch": 1.7877864419945624,
"grad_norm": 0.03595859929919243,
"kl": 0.007373332977294922,
"learning_rate": 6.927651368100843e-07,
"loss": 0.0065,
"step": 1262
},
{
"clip_ratio": 0.00026420129074722354,
"epoch": 1.7896985450091125,
"grad_norm": 0.034778136759996414,
"kl": 0.00739288330078125,
"learning_rate": 6.884363426019444e-07,
"loss": 0.0065,
"step": 1263
},
{
"clip_ratio": 0.0002875854173112202,
"epoch": 1.7916106480236622,
"grad_norm": 0.035560280084609985,
"kl": 0.007449150085449219,
"learning_rate": 6.841189546851224e-07,
"loss": 0.0064,
"step": 1264
},
{
"clip_ratio": 0.00026737677507071567,
"epoch": 1.7935227510382123,
"grad_norm": 0.03407442197203636,
"kl": 0.007452964782714844,
"learning_rate": 6.79813000243528e-07,
"loss": 0.0064,
"step": 1265
},
{
"clip_ratio": 0.0,
"completion_length": 523.543550491333,
"epoch": 1.795434854052762,
"grad_norm": 0.03908964619040489,
"kl": 0.008809566497802734,
"learning_rate": 6.755185063890818e-07,
"loss": 0.0074,
"num_tokens": 304236988.0,
"reward": 0.0747767890279647,
"reward_std": 0.07865536957979202,
"rewards/pure_accuracy_reward_math": 0.07477678745635785,
"step": 1266
},
{
"clip_ratio": 0.0002752643416670253,
"epoch": 1.797346957067312,
"grad_norm": 0.0380408875644207,
"kl": 0.00884389877319336,
"learning_rate": 6.71235500161545e-07,
"loss": 0.0074,
"step": 1267
},
{
"clip_ratio": 0.0002959408872698077,
"epoch": 1.7992590600818619,
"grad_norm": 0.03713267296552658,
"kl": 0.008931636810302734,
"learning_rate": 6.669640085283479e-07,
"loss": 0.0073,
"step": 1268
},
{
"clip_ratio": 0.0003134474755484007,
"epoch": 1.8011711630964118,
"grad_norm": 0.03684492036700249,
"kl": 0.008975982666015625,
"learning_rate": 6.627040583844199e-07,
"loss": 0.0073,
"step": 1269
},
{
"clip_ratio": 0.0003336208075666036,
"epoch": 1.8030832661109617,
"grad_norm": 0.0364052951335907,
"kl": 0.009007453918457031,
"learning_rate": 6.584556765520231e-07,
"loss": 0.0072,
"step": 1270
},
{
"clip_ratio": 0.0,
"completion_length": 532.5468997955322,
"epoch": 1.8049953691255116,
"grad_norm": 0.03688374161720276,
"kl": 0.006972789764404297,
"learning_rate": 6.542188897805782e-07,
"loss": 0.0076,
"num_tokens": 307881200.0,
"reward": 0.06082589610014111,
"reward_std": 0.06925509008578956,
"rewards/pure_accuracy_reward_math": 0.06082589423749596,
"step": 1271
},
{
"clip_ratio": 0.0002535940801635661,
"epoch": 1.8069074721400615,
"grad_norm": 0.03543318435549736,
"kl": 0.006913661956787109,
"learning_rate": 6.499937247465002e-07,
"loss": 0.0076,
"step": 1272
},
{
"clip_ratio": 0.00029529011806062044,
"epoch": 1.8088195751546114,
"grad_norm": 0.034321434795856476,
"kl": 0.006764411926269531,
"learning_rate": 6.457802080530304e-07,
"loss": 0.0075,
"step": 1273
},
{
"clip_ratio": 0.00032198404306882367,
"epoch": 1.8107316781691614,
"grad_norm": 0.03342648968100548,
"kl": 0.006732940673828125,
"learning_rate": 6.415783662300662e-07,
"loss": 0.0075,
"step": 1274
},
{
"clip_ratio": 0.000381207836142039,
"epoch": 1.8126437811837113,
"grad_norm": 0.034588467329740524,
"kl": 0.006687164306640625,
"learning_rate": 6.373882257339964e-07,
"loss": 0.0074,
"step": 1275
},
{
"clip_ratio": 0.0,
"completion_length": 528.7452836036682,
"epoch": 1.8145558841982612,
"grad_norm": 0.039650533348321915,
"kl": 0.012791156768798828,
"learning_rate": 6.33209812947532e-07,
"loss": 0.0068,
"num_tokens": 311509399.0,
"reward": 0.06919643239234574,
"reward_std": 0.07131457643117756,
"rewards/pure_accuracy_reward_math": 0.06919642988941632,
"step": 1276
},
{
"clip_ratio": 0.00028128568749252736,
"epoch": 1.816467987212811,
"grad_norm": 0.039305564016103745,
"kl": 0.012639522552490234,
"learning_rate": 6.290431541795456e-07,
"loss": 0.0068,
"step": 1277
},
{
"clip_ratio": 0.00027201296376233586,
"epoch": 1.818380090227361,
"grad_norm": 0.038404785096645355,
"kl": 0.012586116790771484,
"learning_rate": 6.248882756648988e-07,
"loss": 0.0067,
"step": 1278
},
{
"clip_ratio": 0.00027703067632955936,
"epoch": 1.820292193241911,
"grad_norm": 0.037614692002534866,
"kl": 0.01236581802368164,
"learning_rate": 6.207452035642814e-07,
"loss": 0.0066,
"step": 1279
},
{
"clip_ratio": 0.000309511864088563,
"epoch": 1.8222042962564609,
"grad_norm": 0.03737355023622513,
"kl": 0.012206554412841797,
"learning_rate": 6.166139639640454e-07,
"loss": 0.0065,
"step": 1280
},
{
"clip_ratio": 0.0,
"completion_length": 526.473795413971,
"epoch": 1.8241163992710108,
"grad_norm": 0.03713076934218407,
"kl": 0.007002353668212891,
"learning_rate": 6.124945828760406e-07,
"loss": 0.0059,
"num_tokens": 315129533.0,
"reward": 0.06445312840514816,
"reward_std": 0.06921502435579896,
"rewards/pure_accuracy_reward_math": 0.0644531259604264,
"step": 1281
},
{
"clip_ratio": 0.00024346445911760384,
"epoch": 1.8260285022855607,
"grad_norm": 0.03588669002056122,
"kl": 0.006989955902099609,
"learning_rate": 6.083870862374513e-07,
"loss": 0.0059,
"step": 1282
},
{
"clip_ratio": 0.0002329723478737833,
"epoch": 1.8279406053001104,
"grad_norm": 0.03526683151721954,
"kl": 0.007010459899902344,
"learning_rate": 6.042914999106342e-07,
"loss": 0.0058,
"step": 1283
},
{
"clip_ratio": 0.00023291378442991117,
"epoch": 1.8298527083146605,
"grad_norm": 0.03384559601545334,
"kl": 0.007075786590576172,
"learning_rate": 6.002078496829514e-07,
"loss": 0.0058,
"step": 1284
},
{
"clip_ratio": 0.0002458733478647446,
"epoch": 1.8317648113292102,
"grad_norm": 0.03377237543463707,
"kl": 0.0071315765380859375,
"learning_rate": 5.961361612666139e-07,
"loss": 0.0057,
"step": 1285
},
{
"clip_ratio": 0.0,
"completion_length": 525.0859618186951,
"epoch": 1.8336769143437603,
"grad_norm": 0.0914173573255539,
"kl": 0.012554645538330078,
"learning_rate": 5.920764602985141e-07,
"loss": 0.0058,
"num_tokens": 318747025.0,
"reward": 0.06612723506987095,
"reward_std": 0.06865079142153263,
"rewards/pure_accuracy_reward_math": 0.06612723355647177,
"step": 1286
},
{
"clip_ratio": 0.00025586230526641884,
"epoch": 1.83558901735831,
"grad_norm": 0.04225718230009079,
"kl": 0.010876655578613281,
"learning_rate": 5.88028772340068e-07,
"loss": 0.0057,
"step": 1287
},
{
"clip_ratio": 0.00024814432106268214,
"epoch": 1.8375011203728602,
"grad_norm": 0.03636258468031883,
"kl": 0.010531425476074219,
"learning_rate": 5.839931228770526e-07,
"loss": 0.0057,
"step": 1288
},
{
"clip_ratio": 0.0002984523198108491,
"epoch": 1.8394132233874099,
"grad_norm": 0.03610241040587425,
"kl": 0.010416984558105469,
"learning_rate": 5.799695373194461e-07,
"loss": 0.0056,
"step": 1289
},
{
"clip_ratio": 0.00032527196299270145,
"epoch": 1.84132532640196,
"grad_norm": 0.034912850707769394,
"kl": 0.010428428649902344,
"learning_rate": 5.759580410012691e-07,
"loss": 0.0055,
"step": 1290
},
{
"clip_ratio": 0.0,
"completion_length": 520.4793767929077,
"epoch": 1.8432374294165097,
"grad_norm": 0.04220513626933098,
"kl": 0.009058475494384766,
"learning_rate": 5.719586591804222e-07,
"loss": 0.0071,
"num_tokens": 322345307.0,
"reward": 0.07366071786964312,
"reward_std": 0.07878176297526807,
"rewards/pure_accuracy_reward_math": 0.07366071542492136,
"step": 1291
},
{
"clip_ratio": 0.00030183524040694465,
"epoch": 1.8451495324310598,
"grad_norm": 0.03849344700574875,
"kl": 0.009106636047363281,
"learning_rate": 5.679714170385283e-07,
"loss": 0.0071,
"step": 1292
},
{
"clip_ratio": 0.00035880112773156725,
"epoch": 1.8470616354456095,
"grad_norm": 0.037096235901117325,
"kl": 0.009167194366455078,
"learning_rate": 5.63996339680776e-07,
"loss": 0.0071,
"step": 1293
},
{
"clip_ratio": 0.00040293739141361584,
"epoch": 1.8489737384601597,
"grad_norm": 0.03884498402476311,
"kl": 0.009192943572998047,
"learning_rate": 5.600334521357581e-07,
"loss": 0.007,
"step": 1294
},
{
"clip_ratio": 0.00038201194092835067,
"epoch": 1.8508858414747094,
"grad_norm": 0.03875093162059784,
"kl": 0.009291648864746094,
"learning_rate": 5.560827793553159e-07,
"loss": 0.0069,
"step": 1295
},
{
"clip_ratio": 0.0,
"completion_length": 518.3301024436951,
"epoch": 1.8527979444892595,
"grad_norm": 0.04254430532455444,
"kl": 0.008441925048828125,
"learning_rate": 5.52144346214383e-07,
"loss": 0.0063,
"num_tokens": 325938766.0,
"reward": 0.07840402127476409,
"reward_std": 0.08084744628285989,
"rewards/pure_accuracy_reward_math": 0.07840401929570362,
"step": 1296
},
{
"clip_ratio": 0.0002986583057804637,
"epoch": 1.8547100475038092,
"grad_norm": 0.041676584631204605,
"kl": 0.008450508117675781,
"learning_rate": 5.482181775108278e-07,
"loss": 0.0062,
"step": 1297
},
{
"clip_ratio": 0.00031948441494478175,
"epoch": 1.8566221505183593,
"grad_norm": 0.03955300524830818,
"kl": 0.008507251739501953,
"learning_rate": 5.443042979652957e-07,
"loss": 0.0062,
"step": 1298
},
{
"clip_ratio": 0.0003085145480667961,
"epoch": 1.858534253532909,
"grad_norm": 0.03848061338067055,
"kl": 0.008501052856445312,
"learning_rate": 5.404027322210556e-07,
"loss": 0.0061,
"step": 1299
},
{
"clip_ratio": 0.0003855731235944404,
"epoch": 1.8604463565474592,
"grad_norm": 0.04076399654150009,
"kl": 0.00849771499633789,
"learning_rate": 5.365135048438438e-07,
"loss": 0.006,
"step": 1300
},
{
"clip_ratio": 0.0,
"completion_length": 529.5170464515686,
"epoch": 1.8623584595620088,
"grad_norm": 0.14906181395053864,
"kl": 0.007767677307128906,
"learning_rate": 5.326366403217093e-07,
"loss": 0.0084,
"num_tokens": 329571311.0,
"reward": 0.07254464630386792,
"reward_std": 0.08418946416350082,
"rewards/pure_accuracy_reward_math": 0.07254464438301511,
"step": 1301
},
{
"clip_ratio": 0.00028383656763253384,
"epoch": 1.8642705625765588,
"grad_norm": 0.04550671949982643,
"kl": 0.008212089538574219,
"learning_rate": 5.287721630648615e-07,
"loss": 0.0083,
"step": 1302
},
{
"clip_ratio": 0.0003281467976989916,
"epoch": 1.8661826655911087,
"grad_norm": 0.05260877683758736,
"kl": 0.008829593658447266,
"learning_rate": 5.249200974055132e-07,
"loss": 0.0083,
"step": 1303
},
{
"clip_ratio": 0.00036754867960553383,
"epoch": 1.8680947686056586,
"grad_norm": 0.0511869452893734,
"kl": 0.008836746215820312,
"learning_rate": 5.210804675977299e-07,
"loss": 0.0082,
"step": 1304
},
{
"clip_ratio": 0.0004018283953541868,
"epoch": 1.8700068716202085,
"grad_norm": 0.044321924448013306,
"kl": 0.008379459381103516,
"learning_rate": 5.172532978172753e-07,
"loss": 0.0081,
"step": 1305
},
{
"clip_ratio": 0.0,
"completion_length": 512.9788198471069,
"epoch": 1.8719189746347584,
"grad_norm": 0.04202428087592125,
"kl": 0.0076198577880859375,
"learning_rate": 5.134386121614615e-07,
"loss": 0.0072,
"num_tokens": 333143795.0,
"reward": 0.07421875317231752,
"reward_std": 0.07986396714113653,
"rewards/pure_accuracy_reward_math": 0.074218751717126,
"step": 1306
},
{
"clip_ratio": 0.00027569573836672134,
"epoch": 1.8738310776493083,
"grad_norm": 0.040443304926157,
"kl": 0.007631778717041016,
"learning_rate": 5.096364346489935e-07,
"loss": 0.0072,
"step": 1307
},
{
"clip_ratio": 0.00027392168607320855,
"epoch": 1.8757431806638583,
"grad_norm": 0.040238041430711746,
"kl": 0.007664203643798828,
"learning_rate": 5.058467892198241e-07,
"loss": 0.0071,
"step": 1308
},
{
"clip_ratio": 0.0003170029604007141,
"epoch": 1.8776552836784082,
"grad_norm": 0.039109617471694946,
"kl": 0.007664203643798828,
"learning_rate": 5.02069699734995e-07,
"loss": 0.007,
"step": 1309
},
{
"clip_ratio": 0.0003183572773082233,
"epoch": 1.879567386692958,
"grad_norm": 0.03724955767393112,
"kl": 0.007700443267822266,
"learning_rate": 4.983051899764946e-07,
"loss": 0.007,
"step": 1310
},
{
"clip_ratio": 0.0,
"completion_length": 505.4592852592468,
"epoch": 1.881479489707508,
"grad_norm": 0.03964386135339737,
"kl": 0.007820606231689453,
"learning_rate": 4.945532836471026e-07,
"loss": 0.0074,
"num_tokens": 336685165.0,
"reward": 0.0848214327415917,
"reward_std": 0.07835631881607696,
"rewards/pure_accuracy_reward_math": 0.08482142965658568,
"step": 1311
},
{
"clip_ratio": 0.0002873320136700386,
"epoch": 1.883391592722058,
"grad_norm": 0.03871289640665054,
"kl": 0.007764339447021484,
"learning_rate": 4.908140043702426e-07,
"loss": 0.0074,
"step": 1312
},
{
"clip_ratio": 0.0003113469839775007,
"epoch": 1.8853036957366078,
"grad_norm": 0.03769771382212639,
"kl": 0.007766246795654297,
"learning_rate": 4.870873756898345e-07,
"loss": 0.0074,
"step": 1313
},
{
"clip_ratio": 0.00034381698696961394,
"epoch": 1.8872157987511577,
"grad_norm": 0.03724011033773422,
"kl": 0.007775783538818359,
"learning_rate": 4.833734210701435e-07,
"loss": 0.0073,
"step": 1314
},
{
"clip_ratio": 0.0003651243675335536,
"epoch": 1.8891279017657077,
"grad_norm": 0.03757576644420624,
"kl": 0.007784366607666016,
"learning_rate": 4.796721638956376e-07,
"loss": 0.0072,
"step": 1315
},
{
"clip_ratio": 0.0,
"completion_length": 527.5703339576721,
"epoch": 1.8910400047802576,
"grad_norm": 0.03592124208807945,
"kl": 0.007517337799072266,
"learning_rate": 4.7598362747083293e-07,
"loss": 0.008,
"num_tokens": 340304225.0,
"reward": 0.06501116388244554,
"reward_std": 0.0762443722342141,
"rewards/pure_accuracy_reward_math": 0.06501116219442338,
"step": 1316
},
{
"clip_ratio": 0.00026663288446115985,
"epoch": 1.8929521077948075,
"grad_norm": 0.03529619425535202,
"kl": 0.007477283477783203,
"learning_rate": 4.7230783502015346e-07,
"loss": 0.008,
"step": 1317
},
{
"clip_ratio": 0.00025462434007295087,
"epoch": 1.8948642108093574,
"grad_norm": 0.03387421742081642,
"kl": 0.007337093353271484,
"learning_rate": 4.6864480968778103e-07,
"loss": 0.008,
"step": 1318
},
{
"clip_ratio": 0.00031681645646131074,
"epoch": 1.8967763138239073,
"grad_norm": 0.033014364540576935,
"kl": 0.007318019866943359,
"learning_rate": 4.649945745375109e-07,
"loss": 0.0079,
"step": 1319
},
{
"clip_ratio": 0.00037019279989181086,
"epoch": 1.898688416838457,
"grad_norm": 0.033140987157821655,
"kl": 0.007157325744628906,
"learning_rate": 4.613571525526081e-07,
"loss": 0.0078,
"step": 1320
},
{
"clip_ratio": 0.0,
"completion_length": 523.3727917671204,
"epoch": 1.9006005198530072,
"grad_norm": 0.03997303172945976,
"kl": 0.007628440856933594,
"learning_rate": 4.577325666356586e-07,
"loss": 0.0118,
"num_tokens": 343915401.0,
"reward": 0.08816964740981348,
"reward_std": 0.08973595389397815,
"rewards/pure_accuracy_reward_math": 0.08816964426659979,
"step": 1321
},
{
"clip_ratio": 0.0003053776546835252,
"epoch": 1.9025126228675568,
"grad_norm": 0.039738208055496216,
"kl": 0.007574558258056641,
"learning_rate": 4.541208396084304e-07,
"loss": 0.0117,
"step": 1322
},
{
"clip_ratio": 0.00030029478972437573,
"epoch": 1.904424725882107,
"grad_norm": 0.038392502814531326,
"kl": 0.007514476776123047,
"learning_rate": 4.5052199421172475e-07,
"loss": 0.0117,
"step": 1323
},
{
"clip_ratio": 0.0003343055576010556,
"epoch": 1.9063368288966567,
"grad_norm": 0.037236347794532776,
"kl": 0.007477760314941406,
"learning_rate": 4.4693605310523636e-07,
"loss": 0.0116,
"step": 1324
},
{
"clip_ratio": 0.00032557199602933906,
"epoch": 1.9082489319112068,
"grad_norm": 0.03678731992840767,
"kl": 0.007478237152099609,
"learning_rate": 4.43363038867409e-07,
"loss": 0.0115,
"step": 1325
},
{
"clip_ratio": 0.0,
"completion_length": 513.3047099113464,
"epoch": 1.9101610349257565,
"grad_norm": 0.11113768815994263,
"kl": 0.013922691345214844,
"learning_rate": 4.39802973995295e-07,
"loss": 0.0093,
"num_tokens": 347490901.0,
"reward": 0.09486607549479231,
"reward_std": 0.09372853260720149,
"rewards/pure_accuracy_reward_math": 0.09486607305007055,
"step": 1326
},
{
"clip_ratio": 0.00036943193325100765,
"epoch": 1.9120731379403066,
"grad_norm": 0.055216722190380096,
"kl": 0.013732433319091797,
"learning_rate": 4.362558809044107e-07,
"loss": 0.0093,
"step": 1327
},
{
"clip_ratio": 0.0004000666916681439,
"epoch": 1.9139852409548563,
"grad_norm": 0.045698132365942,
"kl": 0.013063907623291016,
"learning_rate": 4.327217819286e-07,
"loss": 0.0092,
"step": 1328
},
{
"clip_ratio": 0.0004443397794489101,
"epoch": 1.9158973439694065,
"grad_norm": 0.04273562505841255,
"kl": 0.012539863586425781,
"learning_rate": 4.292006993198888e-07,
"loss": 0.009,
"step": 1329
},
{
"clip_ratio": 0.0004470848766686686,
"epoch": 1.9178094469839562,
"grad_norm": 0.04232070967555046,
"kl": 0.012142658233642578,
"learning_rate": 4.2569265524834756e-07,
"loss": 0.0089,
"step": 1330
},
{
"clip_ratio": 0.0,
"completion_length": 518.7550463676453,
"epoch": 1.9197215499985063,
"grad_norm": 0.03724661469459534,
"kl": 0.007449150085449219,
"learning_rate": 4.221976718019505e-07,
"loss": 0.007,
"num_tokens": 351086731.0,
"reward": 0.06919643189758062,
"reward_std": 0.07200520270271227,
"rewards/pure_accuracy_reward_math": 0.06919642974389717,
"step": 1331
},
{
"clip_ratio": 0.00027471570277270985,
"epoch": 1.921633653013056,
"grad_norm": 0.03599303960800171,
"kl": 0.007382869720458984,
"learning_rate": 4.187157709864392e-07,
"loss": 0.007,
"step": 1332
},
{
"clip_ratio": 0.0002737036326720954,
"epoch": 1.9235457560276061,
"grad_norm": 0.03614535927772522,
"kl": 0.007375240325927734,
"learning_rate": 4.152469747251794e-07,
"loss": 0.0069,
"step": 1333
},
{
"clip_ratio": 0.00030229948259830053,
"epoch": 1.9254578590421558,
"grad_norm": 0.03546711429953575,
"kl": 0.0072498321533203125,
"learning_rate": 4.117913048590283e-07,
"loss": 0.0069,
"step": 1334
},
{
"clip_ratio": 0.00030038867771509103,
"epoch": 1.927369962056706,
"grad_norm": 0.03401359170675278,
"kl": 0.007149219512939453,
"learning_rate": 4.0834878314619244e-07,
"loss": 0.0068,
"step": 1335
},
{
"clip_ratio": 0.0,
"completion_length": 526.2182154655457,
"epoch": 1.9292820650712557,
"grad_norm": 0.04080551117658615,
"kl": 0.006867885589599609,
"learning_rate": 4.049194312620927e-07,
"loss": 0.0092,
"num_tokens": 354708525.0,
"reward": 0.07756696798605844,
"reward_std": 0.08467356563778594,
"rewards/pure_accuracy_reward_math": 0.07756696530850604,
"step": 1336
},
{
"clip_ratio": 0.0002796990767137686,
"epoch": 1.9311941680858056,
"grad_norm": 0.038895782083272934,
"kl": 0.006824970245361328,
"learning_rate": 4.015032707992286e-07,
"loss": 0.0092,
"step": 1337
},
{
"clip_ratio": 0.00032694752422912643,
"epoch": 1.9331062711003555,
"grad_norm": 0.03889061138033867,
"kl": 0.006866931915283203,
"learning_rate": 3.9810032326704106e-07,
"loss": 0.0091,
"step": 1338
},
{
"clip_ratio": 0.0003511786251237936,
"epoch": 1.9350183741149054,
"grad_norm": 0.03880919888615608,
"kl": 0.006947994232177734,
"learning_rate": 3.9471061009177693e-07,
"loss": 0.009,
"step": 1339
},
{
"clip_ratio": 0.000323922223401496,
"epoch": 1.9369304771294553,
"grad_norm": 0.036964643746614456,
"kl": 0.007033824920654297,
"learning_rate": 3.91334152616355e-07,
"loss": 0.0089,
"step": 1340
},
{
"clip_ratio": 0.0,
"completion_length": 527.7076120376587,
"epoch": 1.9388425801440052,
"grad_norm": 0.04040682688355446,
"kl": 0.007448673248291016,
"learning_rate": 3.879709721002317e-07,
"loss": 0.0052,
"num_tokens": 358339045.0,
"reward": 0.07896205660654232,
"reward_std": 0.08278053888352588,
"rewards/pure_accuracy_reward_math": 0.07896205550059676,
"step": 1341
},
{
"clip_ratio": 0.00029579239503618737,
"epoch": 1.9407546831585551,
"grad_norm": 0.03910582885146141,
"kl": 0.007539272308349609,
"learning_rate": 3.8462108971926564e-07,
"loss": 0.0052,
"step": 1342
},
{
"clip_ratio": 0.0003078770084812277,
"epoch": 1.942666786173105,
"grad_norm": 0.03942732512950897,
"kl": 0.007628440856933594,
"learning_rate": 3.8128452656558623e-07,
"loss": 0.0051,
"step": 1343
},
{
"clip_ratio": 0.0003229538778555252,
"epoch": 1.944578889187655,
"grad_norm": 0.03747202083468437,
"kl": 0.007678031921386719,
"learning_rate": 3.779613036474583e-07,
"loss": 0.005,
"step": 1344
},
{
"clip_ratio": 0.000363169818285769,
"epoch": 1.946490992202205,
"grad_norm": 0.036778781563043594,
"kl": 0.0076923370361328125,
"learning_rate": 3.746514418891545e-07,
"loss": 0.0049,
"step": 1345
},
{
"clip_ratio": 0.0,
"completion_length": 532.7960658073425,
"epoch": 1.9484030952167548,
"grad_norm": 0.040943268686532974,
"kl": 0.011704444885253906,
"learning_rate": 3.713549621308174e-07,
"loss": 0.005,
"num_tokens": 361980918.0,
"reward": 0.07059152092551813,
"reward_std": 0.07973137585213408,
"rewards/pure_accuracy_reward_math": 0.07059151900466532,
"step": 1346
},
{
"clip_ratio": 0.00029914512055029263,
"epoch": 1.9503151982313047,
"grad_norm": 0.04052672162652016,
"kl": 0.0114288330078125,
"learning_rate": 3.6807188512833406e-07,
"loss": 0.005,
"step": 1347
},
{
"clip_ratio": 0.000334167169853572,
"epoch": 1.9522273012458546,
"grad_norm": 0.04054692015051842,
"kl": 0.011135578155517578,
"learning_rate": 3.648022315532007e-07,
"loss": 0.0049,
"step": 1348
},
{
"clip_ratio": 0.00035840429575273447,
"epoch": 1.9541394042604046,
"grad_norm": 0.03996079042553902,
"kl": 0.010680675506591797,
"learning_rate": 3.615460219923955e-07,
"loss": 0.0048,
"step": 1349
},
{
"clip_ratio": 0.00034668986540964397,
"epoch": 1.9560515072749545,
"grad_norm": 0.037566084414720535,
"kl": 0.010373115539550781,
"learning_rate": 3.5830327694824777e-07,
"loss": 0.0047,
"step": 1350
},
{
"clip_ratio": 0.0,
"completion_length": 534.6453948020935,
"epoch": 1.9579636102895044,
"grad_norm": 0.03812556713819504,
"kl": 0.007121086120605469,
"learning_rate": 3.5507401683830933e-07,
"loss": 0.0114,
"num_tokens": 365629991.0,
"reward": 0.07672991411527619,
"reward_std": 0.07831625349353999,
"rewards/pure_accuracy_reward_math": 0.07672991178696975,
"step": 1351
},
{
"clip_ratio": 0.0003128355612602718,
"epoch": 1.9598757133040543,
"grad_norm": 0.03631382808089256,
"kl": 0.007141590118408203,
"learning_rate": 3.518582619952257e-07,
"loss": 0.0114,
"step": 1352
},
{
"clip_ratio": 0.00033067399391484287,
"epoch": 1.9617878163186042,
"grad_norm": 0.03752359002828598,
"kl": 0.007140636444091797,
"learning_rate": 3.486560326666072e-07,
"loss": 0.0113,
"step": 1353
},
{
"clip_ratio": 0.00037038392605381887,
"epoch": 1.9636999193331541,
"grad_norm": 0.03724711388349533,
"kl": 0.007131099700927734,
"learning_rate": 3.4546734901490466e-07,
"loss": 0.0112,
"step": 1354
},
{
"clip_ratio": 0.00040464663743478013,
"epoch": 1.9656120223477038,
"grad_norm": 0.034875430166721344,
"kl": 0.007108211517333984,
"learning_rate": 3.42292231117278e-07,
"loss": 0.0112,
"step": 1355
},
{
"clip_ratio": 0.0,
"completion_length": 519.9101786613464,
"epoch": 1.967524125362254,
"grad_norm": 0.04123640060424805,
"kl": 0.007243156433105469,
"learning_rate": 3.3913069896547217e-07,
"loss": 0.0069,
"num_tokens": 369229613.0,
"reward": 0.08007812878349796,
"reward_std": 0.085311732836999,
"rewards/pure_accuracy_reward_math": 0.0800781263387762,
"step": 1356
},
{
"clip_ratio": 0.00033138683619426956,
"epoch": 1.9694362283768037,
"grad_norm": 0.04048166796565056,
"kl": 0.007332801818847656,
"learning_rate": 3.3598277246569307e-07,
"loss": 0.0069,
"step": 1357
},
{
"clip_ratio": 0.0003668193609200898,
"epoch": 1.9713483313913538,
"grad_norm": 0.042313288897275925,
"kl": 0.007485866546630859,
"learning_rate": 3.3284847143847834e-07,
"loss": 0.0068,
"step": 1358
},
{
"clip_ratio": 0.0003713441701620468,
"epoch": 1.9732604344059035,
"grad_norm": 0.04199962690472603,
"kl": 0.007598400115966797,
"learning_rate": 3.2972781561857433e-07,
"loss": 0.0067,
"step": 1359
},
{
"clip_ratio": 0.0003367169608736731,
"epoch": 1.9751725374204536,
"grad_norm": 0.03874565288424492,
"kl": 0.007636547088623047,
"learning_rate": 3.266208246548136e-07,
"loss": 0.0066,
"step": 1360
},
{
"clip_ratio": 0.0,
"completion_length": 516.4445023536682,
"epoch": 1.9770846404350033,
"grad_norm": 0.040357448160648346,
"kl": 0.007414817810058594,
"learning_rate": 3.2352751810998896e-07,
"loss": 0.0055,
"num_tokens": 372817046.0,
"reward": 0.08258928993018344,
"reward_std": 0.09080576250562444,
"rewards/pure_accuracy_reward_math": 0.08258928690338507,
"step": 1361
},
{
"clip_ratio": 0.00038423701278134104,
"epoch": 1.9789967434495535,
"grad_norm": 0.03990958258509636,
"kl": 0.007411479949951172,
"learning_rate": 3.2044791546072985e-07,
"loss": 0.0055,
"step": 1362
},
{
"clip_ratio": 0.00044172884827275993,
"epoch": 1.9809088464641031,
"grad_norm": 0.042212970554828644,
"kl": 0.007319450378417969,
"learning_rate": 3.173820360973823e-07,
"loss": 0.0054,
"step": 1363
},
{
"clip_ratio": 0.00042502668532051757,
"epoch": 1.9828209494786533,
"grad_norm": 0.03946436941623688,
"kl": 0.0072727203369140625,
"learning_rate": 3.1432989932388416e-07,
"loss": 0.0053,
"step": 1364
},
{
"clip_ratio": 0.00040032099315112646,
"epoch": 1.984733052493203,
"grad_norm": 0.03701746463775635,
"kl": 0.007288455963134766,
"learning_rate": 3.1129152435764473e-07,
"loss": 0.0052,
"step": 1365
},
{
"clip_ratio": 0.0,
"completion_length": 519.9707279205322,
"epoch": 1.9866451555077531,
"grad_norm": 0.03677362576127052,
"kl": 0.00740814208984375,
"learning_rate": 3.0826693032942586e-07,
"loss": 0.008,
"num_tokens": 376414405.0,
"reward": 0.07087053926079534,
"reward_std": 0.07741290412377566,
"rewards/pure_accuracy_reward_math": 0.07087053710711189,
"step": 1366
},
{
"clip_ratio": 0.0002998853265978596,
"epoch": 1.9885572585223028,
"grad_norm": 0.03619634732604027,
"kl": 0.0074787139892578125,
"learning_rate": 3.0525613628321656e-07,
"loss": 0.0079,
"step": 1367
},
{
"clip_ratio": 0.00031987275491474065,
"epoch": 1.990469361536853,
"grad_norm": 0.03580261766910553,
"kl": 0.007512092590332031,
"learning_rate": 3.022591611761169e-07,
"loss": 0.0079,
"step": 1368
},
{
"clip_ratio": 0.00029055258056587263,
"epoch": 1.9923814645514026,
"grad_norm": 0.03512256592512131,
"kl": 0.007531166076660156,
"learning_rate": 2.9927602387821916e-07,
"loss": 0.0078,
"step": 1369
},
{
"clip_ratio": 0.0003325358438814874,
"epoch": 1.9942935675659528,
"grad_norm": 0.03404110670089722,
"kl": 0.007470130920410156,
"learning_rate": 2.963067431724856e-07,
"loss": 0.0077,
"step": 1370
},
{
"clip_ratio": 0.0,
"completion_length": 524.95845079422,
"epoch": 2.0019121030145497,
"grad_norm": 0.03709035739302635,
"kl": 0.007386684417724609,
"learning_rate": 2.9335133775463266e-07,
"loss": 0.011,
"num_tokens": 380027444.0,
"reward": 0.07198661039001308,
"reward_std": 0.07208533387165517,
"rewards/pure_accuracy_reward_math": 0.07198660876019858,
"step": 1371
},
{
"clip_ratio": 0.0002751371110321088,
"epoch": 2.0038242060291,
"grad_norm": 0.03661485016345978,
"kl": 0.007431507110595703,
"learning_rate": 2.9040982623301264e-07,
"loss": 0.011,
"step": 1372
},
{
"clip_ratio": 0.0003175289227783651,
"epoch": 2.0057363090436495,
"grad_norm": 0.036799393594264984,
"kl": 0.007405281066894531,
"learning_rate": 2.874822271284977e-07,
"loss": 0.0109,
"step": 1373
},
{
"clip_ratio": 0.0003284543961399322,
"epoch": 2.0076484120581997,
"grad_norm": 0.036977026611566544,
"kl": 0.007386684417724609,
"learning_rate": 2.8456855887436074e-07,
"loss": 0.0108,
"step": 1374
},
{
"clip_ratio": 0.00032697250054525284,
"epoch": 2.0095605150727494,
"grad_norm": 0.03594314306974411,
"kl": 0.00739288330078125,
"learning_rate": 2.816688398161613e-07,
"loss": 0.0108,
"step": 1375
},
{
"clip_ratio": 0.0,
"completion_length": 524.5270891189575,
"epoch": 2.0114726180872995,
"grad_norm": 15.976890563964844,
"kl": 0.4394536018371582,
"learning_rate": 2.7878308821162964e-07,
"loss": 0.0259,
"num_tokens": 383639505.0,
"reward": 0.08286830733413808,
"reward_std": 0.08972975501092151,
"rewards/pure_accuracy_reward_math": 0.08286830488941632,
"step": 1376
},
{
"clip_ratio": 0.0003084787746274742,
"epoch": 2.013384721101849,
"grad_norm": 1.2859545946121216,
"kl": 0.04446220397949219,
"learning_rate": 2.759113222305512e-07,
"loss": 0.0102,
"step": 1377
},
{
"clip_ratio": 0.00034848380650487343,
"epoch": 2.0152968241163993,
"grad_norm": 0.0618804506957531,
"kl": 0.009487152099609375,
"learning_rate": 2.730535599546524e-07,
"loss": 0.0087,
"step": 1378
},
{
"clip_ratio": 0.000346398171132023,
"epoch": 2.017208927130949,
"grad_norm": 0.039353594183921814,
"kl": 0.008243560791015625,
"learning_rate": 2.702098193774891e-07,
"loss": 0.0087,
"step": 1379
},
{
"clip_ratio": 0.000389314118024231,
"epoch": 2.019121030145499,
"grad_norm": 0.03626256063580513,
"kl": 0.0083465576171875,
"learning_rate": 2.6738011840432817e-07,
"loss": 0.0086,
"step": 1380
},
{
"clip_ratio": 0.0,
"completion_length": 504.881441116333,
"epoch": 2.021033133160049,
"grad_norm": 0.03991848975419998,
"kl": 0.00807046890258789,
"learning_rate": 2.6456447485204014e-07,
"loss": 0.0078,
"num_tokens": 387180856.0,
"reward": 0.07700893218861893,
"reward_std": 0.0893906393321231,
"rewards/pure_accuracy_reward_math": 0.07700893026776612,
"step": 1381
},
{
"clip_ratio": 0.00029079897933570464,
"epoch": 2.022945236174599,
"grad_norm": 0.03955512493848801,
"kl": 0.008087635040283203,
"learning_rate": 2.617629064489838e-07,
"loss": 0.0078,
"step": 1382
},
{
"clip_ratio": 0.00034119405472665676,
"epoch": 2.0248573391891487,
"grad_norm": 0.04050750657916069,
"kl": 0.008031845092773438,
"learning_rate": 2.5897543083489544e-07,
"loss": 0.0077,
"step": 1383
},
{
"clip_ratio": 0.0003633832532159431,
"epoch": 2.026769442203699,
"grad_norm": 0.03760417178273201,
"kl": 0.007889270782470703,
"learning_rate": 2.562020655607772e-07,
"loss": 0.0076,
"step": 1384
},
{
"clip_ratio": 0.00040043183099669477,
"epoch": 2.0286815452182485,
"grad_norm": 0.036376822739839554,
"kl": 0.007742404937744141,
"learning_rate": 2.534428280887891e-07,
"loss": 0.0076,
"step": 1385
},
{
"clip_ratio": 0.0,
"completion_length": 521.2332820892334,
"epoch": 2.0305936482327986,
"grad_norm": 0.03659322112798691,
"kl": 0.0079498291015625,
"learning_rate": 2.50697735792135e-07,
"loss": 0.0074,
"num_tokens": 390784592.0,
"reward": 0.0678013424621895,
"reward_std": 0.07990403228905052,
"rewards/pure_accuracy_reward_math": 0.06780134083237499,
"step": 1386
},
{
"clip_ratio": 0.0003029348101790674,
"epoch": 2.0325057512473483,
"grad_norm": 0.03603421524167061,
"kl": 0.0077915191650390625,
"learning_rate": 2.47966805954957e-07,
"loss": 0.0073,
"step": 1387
},
{
"clip_ratio": 0.0002788126068935526,
"epoch": 2.0344178542618985,
"grad_norm": 0.035584706813097,
"kl": 0.00768280029296875,
"learning_rate": 2.4525005577222373e-07,
"loss": 0.0073,
"step": 1388
},
{
"clip_ratio": 0.00033219700696918153,
"epoch": 2.036329957276448,
"grad_norm": 0.033913753926754,
"kl": 0.007656097412109375,
"learning_rate": 2.42547502349624e-07,
"loss": 0.0072,
"step": 1389
},
{
"clip_ratio": 0.00034793876449157324,
"epoch": 2.0382420602909983,
"grad_norm": 0.033490557223558426,
"kl": 0.007609367370605469,
"learning_rate": 2.398591627034588e-07,
"loss": 0.0072,
"step": 1390
},
{
"clip_ratio": 0.0,
"completion_length": 534.8217334747314,
"epoch": 2.040154163305548,
"grad_norm": 0.04065319523215294,
"kl": 0.007349491119384766,
"learning_rate": 2.3718505376053246e-07,
"loss": 0.0094,
"num_tokens": 394433277.0,
"reward": 0.07589286056463607,
"reward_std": 0.09050671145087108,
"rewards/pure_accuracy_reward_math": 0.07589285823632963,
"step": 1391
},
{
"clip_ratio": 0.00032872594630362073,
"epoch": 2.042066266320098,
"grad_norm": 0.0390729084610939,
"kl": 0.007353305816650391,
"learning_rate": 2.345251923580491e-07,
"loss": 0.0094,
"step": 1392
},
{
"clip_ratio": 0.00038015836332760955,
"epoch": 2.043978369334648,
"grad_norm": 0.037973206490278244,
"kl": 0.007381916046142578,
"learning_rate": 2.3187959524350352e-07,
"loss": 0.0093,
"step": 1393
},
{
"clip_ratio": 0.00041672343576237836,
"epoch": 2.045890472349198,
"grad_norm": 0.037547629326581955,
"kl": 0.007441043853759766,
"learning_rate": 2.2924827907457841e-07,
"loss": 0.0092,
"step": 1394
},
{
"clip_ratio": 0.00047711057584365335,
"epoch": 2.0478025753637477,
"grad_norm": 0.037767618894577026,
"kl": 0.007452487945556641,
"learning_rate": 2.266312604190374e-07,
"loss": 0.0091,
"step": 1395
},
{
"clip_ratio": 0.0,
"completion_length": 520.9163165092468,
"epoch": 2.049714678378298,
"grad_norm": 0.039165694266557693,
"kl": 0.007717609405517578,
"learning_rate": 2.2402855575462152e-07,
"loss": 0.0071,
"num_tokens": 398030605.0,
"reward": 0.07840402194415219,
"reward_std": 0.08072105259634554,
"rewards/pure_accuracy_reward_math": 0.07840401885914616,
"step": 1396
},
{
"clip_ratio": 0.0002864374472437703,
"epoch": 2.0516267813928475,
"grad_norm": 0.03918104246258736,
"kl": 0.007798194885253906,
"learning_rate": 2.2144018146894542e-07,
"loss": 0.007,
"step": 1397
},
{
"clip_ratio": 0.00028412381868747616,
"epoch": 2.0535388844073976,
"grad_norm": 0.03787809982895851,
"kl": 0.007855415344238281,
"learning_rate": 2.1886615385939502e-07,
"loss": 0.007,
"step": 1398
},
{
"clip_ratio": 0.0002802736350417945,
"epoch": 2.0554509874219473,
"grad_norm": 0.03685666248202324,
"kl": 0.007898807525634766,
"learning_rate": 2.1630648913302354e-07,
"loss": 0.0069,
"step": 1399
},
{
"clip_ratio": 0.0003048399971703475,
"epoch": 2.0573630904364975,
"grad_norm": 0.03653446584939957,
"kl": 0.0079193115234375,
"learning_rate": 2.1376120340645014e-07,
"loss": 0.0068,
"step": 1400
},
{
"clip_ratio": 0.0,
"completion_length": 523.7120804786682,
"epoch": 2.059275193451047,
"grad_norm": 0.041400156915187836,
"kl": 0.0076904296875,
"learning_rate": 2.1123031270575827e-07,
"loss": 0.0112,
"num_tokens": 401639357.0,
"reward": 0.08398437922005542,
"reward_std": 0.08836089540272951,
"rewards/pure_accuracy_reward_math": 0.08398437665891834,
"step": 1401
},
{
"clip_ratio": 0.0003276587292475597,
"epoch": 2.0611872964655973,
"grad_norm": 0.04058953374624252,
"kl": 0.007676601409912109,
"learning_rate": 2.0871383296639487e-07,
"loss": 0.0112,
"step": 1402
},
{
"clip_ratio": 0.00033817819053183484,
"epoch": 2.063099399480147,
"grad_norm": 0.040160875767469406,
"kl": 0.007659435272216797,
"learning_rate": 2.062117800330693e-07,
"loss": 0.0112,
"step": 1403
},
{
"clip_ratio": 0.00034579052078242967,
"epoch": 2.065011502494697,
"grad_norm": 0.03876737132668495,
"kl": 0.007627964019775391,
"learning_rate": 2.0372416965965675e-07,
"loss": 0.0111,
"step": 1404
},
{
"clip_ratio": 0.00035969930786450277,
"epoch": 2.066923605509247,
"grad_norm": 0.03797266259789467,
"kl": 0.007703304290771484,
"learning_rate": 2.0125101750909315e-07,
"loss": 0.011,
"step": 1405
},
{
"clip_ratio": 0.0,
"completion_length": 514.2500252723694,
"epoch": 2.068835708523797,
"grad_norm": 0.05333253741264343,
"kl": 0.010094165802001953,
"learning_rate": 1.9879233915328312e-07,
"loss": 0.0065,
"num_tokens": 405215041.0,
"reward": 0.08231027176952921,
"reward_std": 0.08208991179708391,
"rewards/pure_accuracy_reward_math": 0.08231026903376915,
"step": 1406
},
{
"clip_ratio": 0.0002884399551135175,
"epoch": 2.0707478115383466,
"grad_norm": 0.04066501557826996,
"kl": 0.009914398193359375,
"learning_rate": 1.9634815007299634e-07,
"loss": 0.0065,
"step": 1407
},
{
"clip_ratio": 0.0003325861029566113,
"epoch": 2.0726599145528963,
"grad_norm": 0.03939688578248024,
"kl": 0.00982666015625,
"learning_rate": 1.9391846565777418e-07,
"loss": 0.0064,
"step": 1408
},
{
"clip_ratio": 0.0003743518978467364,
"epoch": 2.0745720175674465,
"grad_norm": 0.03857440873980522,
"kl": 0.009755611419677734,
"learning_rate": 1.9150330120583012e-07,
"loss": 0.0063,
"step": 1409
},
{
"clip_ratio": 0.0004666026043196325,
"epoch": 2.076484120581996,
"grad_norm": 0.03952641412615776,
"kl": 0.0096588134765625,
"learning_rate": 1.891026719239547e-07,
"loss": 0.0062,
"step": 1410
},
{
"clip_ratio": 0.0,
"completion_length": 516.8532605171204,
"epoch": 2.0783962235965463,
"grad_norm": 0.04142899066209793,
"kl": 0.008448123931884766,
"learning_rate": 1.8671659292742007e-07,
"loss": 0.0099,
"num_tokens": 408804459.0,
"reward": 0.08286830742144957,
"reward_std": 0.08260788215557113,
"rewards/pure_accuracy_reward_math": 0.08286830509314314,
"step": 1411
},
{
"clip_ratio": 0.0003487231184635675,
"epoch": 2.080308326611096,
"grad_norm": 0.040530916303396225,
"kl": 0.008367538452148438,
"learning_rate": 1.8434507923988375e-07,
"loss": 0.0099,
"step": 1412
},
{
"clip_ratio": 0.0003221970002869057,
"epoch": 2.082220429625646,
"grad_norm": 0.03941330686211586,
"kl": 0.008350849151611328,
"learning_rate": 1.8198814579329426e-07,
"loss": 0.0098,
"step": 1413
},
{
"clip_ratio": 0.00037204451541583694,
"epoch": 2.084132532640196,
"grad_norm": 0.03861032798886299,
"kl": 0.008304595947265625,
"learning_rate": 1.7964580742779847e-07,
"loss": 0.0097,
"step": 1414
},
{
"clip_ratio": 0.0003590778907209824,
"epoch": 2.086044635654746,
"grad_norm": 0.03945469483733177,
"kl": 0.008287906646728516,
"learning_rate": 1.7731807889164537e-07,
"loss": 0.0096,
"step": 1415
},
{
"clip_ratio": 0.0,
"completion_length": 529.592381477356,
"epoch": 2.0879567386692957,
"grad_norm": 0.03833872824907303,
"kl": 0.0077228546142578125,
"learning_rate": 1.7500497484109703e-07,
"loss": 0.0109,
"num_tokens": 412432506.0,
"reward": 0.07449777142028324,
"reward_std": 0.08200978167587891,
"rewards/pure_accuracy_reward_math": 0.07449776885914616,
"step": 1416
},
{
"clip_ratio": 0.0002795722035671133,
"epoch": 2.089868841683846,
"grad_norm": 0.03684116527438164,
"kl": 0.007727146148681641,
"learning_rate": 1.7270650984033245e-07,
"loss": 0.0108,
"step": 1417
},
{
"clip_ratio": 0.00033119657558700055,
"epoch": 2.0917809446983955,
"grad_norm": 0.03667665645480156,
"kl": 0.007739067077636719,
"learning_rate": 1.7042269836135882e-07,
"loss": 0.0108,
"step": 1418
},
{
"clip_ratio": 0.00036255177064958843,
"epoch": 2.0936930477129456,
"grad_norm": 0.037857044488191605,
"kl": 0.007757663726806641,
"learning_rate": 1.6815355478391886e-07,
"loss": 0.0107,
"step": 1419
},
{
"clip_ratio": 0.0003589615364489873,
"epoch": 2.0956051507274953,
"grad_norm": 0.0360855907201767,
"kl": 0.007729053497314453,
"learning_rate": 1.6589909339539968e-07,
"loss": 0.0106,
"step": 1420
},
{
"clip_ratio": 0.0,
"completion_length": 523.7469544410706,
"epoch": 2.0975172537420455,
"grad_norm": 0.041348401457071304,
"kl": 0.007639408111572266,
"learning_rate": 1.6365932839074532e-07,
"loss": 0.0099,
"num_tokens": 416048915.0,
"reward": 0.07979911076836288,
"reward_std": 0.08175079576903954,
"rewards/pure_accuracy_reward_math": 0.07979910861467943,
"step": 1421
},
{
"clip_ratio": 0.00028084742956480113,
"epoch": 2.099429356756595,
"grad_norm": 0.03983917832374573,
"kl": 0.007691860198974609,
"learning_rate": 1.6143427387236455e-07,
"loss": 0.0099,
"step": 1422
},
{
"clip_ratio": 0.00032101355429858813,
"epoch": 2.1013414597711453,
"grad_norm": 0.04035898670554161,
"kl": 0.007829666137695312,
"learning_rate": 1.592239438500434e-07,
"loss": 0.0098,
"step": 1423
},
{
"clip_ratio": 0.00036129408920260175,
"epoch": 2.103253562785695,
"grad_norm": 0.03893222287297249,
"kl": 0.0079498291015625,
"learning_rate": 1.570283522408586e-07,
"loss": 0.0097,
"step": 1424
},
{
"clip_ratio": 0.0003233651194136655,
"epoch": 2.105165665800245,
"grad_norm": 0.03798089176416397,
"kl": 0.008071422576904297,
"learning_rate": 1.5484751286908655e-07,
"loss": 0.0097,
"step": 1425
},
{
"clip_ratio": 0.0,
"completion_length": 515.3281455039978,
"epoch": 2.107077768814795,
"grad_norm": 0.04489213973283768,
"kl": 0.00823831558227539,
"learning_rate": 1.5268143946611802e-07,
"loss": 0.01,
"num_tokens": 419628171.0,
"reward": 0.07952009321888909,
"reward_std": 0.0892580482759513,
"rewards/pure_accuracy_reward_math": 0.07952009089058265,
"step": 1426
},
{
"clip_ratio": 0.0003507794546067089,
"epoch": 2.108989871829345,
"grad_norm": 0.04182901233434677,
"kl": 0.008199691772460938,
"learning_rate": 1.5053014567037171e-07,
"loss": 0.01,
"step": 1427
},
{
"clip_ratio": 0.0004634781105323782,
"epoch": 2.1109019748438946,
"grad_norm": 0.04111779108643532,
"kl": 0.008260250091552734,
"learning_rate": 1.483936450272097e-07,
"loss": 0.0099,
"step": 1428
},
{
"clip_ratio": 0.0005032591409417364,
"epoch": 2.1128140778584448,
"grad_norm": 0.04071485623717308,
"kl": 0.008274078369140625,
"learning_rate": 1.4627195098884856e-07,
"loss": 0.0098,
"step": 1429
},
{
"clip_ratio": 0.0005640338476382567,
"epoch": 2.1147261808729945,
"grad_norm": 0.041747044771909714,
"kl": 0.008271217346191406,
"learning_rate": 1.441650769142791e-07,
"loss": 0.0097,
"step": 1430
},
{
"clip_ratio": 0.0,
"completion_length": 527.8217334747314,
"epoch": 2.1166382838875446,
"grad_norm": 0.04057304188609123,
"kl": 0.00798797607421875,
"learning_rate": 1.4207303606917856e-07,
"loss": 0.0057,
"num_tokens": 423255484.0,
"reward": 0.08761161076836288,
"reward_std": 0.09866452467394993,
"rewards/pure_accuracy_reward_math": 0.08761160855647177,
"step": 1431
},
{
"clip_ratio": 0.0003497144300581567,
"epoch": 2.1185503869020943,
"grad_norm": 0.03972388803958893,
"kl": 0.007953643798828125,
"learning_rate": 1.3999584162582874e-07,
"loss": 0.0057,
"step": 1432
},
{
"clip_ratio": 0.00037741022566706306,
"epoch": 2.1204624899166444,
"grad_norm": 0.03924018144607544,
"kl": 0.00795888900756836,
"learning_rate": 1.3793350666303328e-07,
"loss": 0.0056,
"step": 1433
},
{
"clip_ratio": 0.0003785647801350933,
"epoch": 2.122374592931194,
"grad_norm": 0.03913624957203865,
"kl": 0.007895946502685547,
"learning_rate": 1.3588604416603424e-07,
"loss": 0.0055,
"step": 1434
},
{
"clip_ratio": 0.0003937934675377619,
"epoch": 2.1242866959457443,
"grad_norm": 0.03699544072151184,
"kl": 0.00783538818359375,
"learning_rate": 1.3385346702643188e-07,
"loss": 0.0054,
"step": 1435
},
{
"clip_ratio": 0.0,
"completion_length": 533.7888078689575,
"epoch": 2.126198798960294,
"grad_norm": 0.042676378041505814,
"kl": 0.010451793670654297,
"learning_rate": 1.3183578804210173e-07,
"loss": 0.0098,
"num_tokens": 426903267.0,
"reward": 0.07645089671132155,
"reward_std": 0.08488008996937424,
"rewards/pure_accuracy_reward_math": 0.07645089426659979,
"step": 1436
},
{
"clip_ratio": 0.00036263700505401175,
"epoch": 2.128110901974844,
"grad_norm": 0.03884616866707802,
"kl": 0.010242462158203125,
"learning_rate": 1.2983301991711578e-07,
"loss": 0.0098,
"step": 1437
},
{
"clip_ratio": 0.0003990789759313884,
"epoch": 2.130023004989394,
"grad_norm": 0.0399676114320755,
"kl": 0.01007843017578125,
"learning_rate": 1.278451752616608e-07,
"loss": 0.0097,
"step": 1438
},
{
"clip_ratio": 0.0004171350746560165,
"epoch": 2.131935108003944,
"grad_norm": 0.039714373648166656,
"kl": 0.010037422180175781,
"learning_rate": 1.258722665919604e-07,
"loss": 0.0097,
"step": 1439
},
{
"clip_ratio": 0.00039808801824392503,
"epoch": 2.1338472110184936,
"grad_norm": 0.03794709965586662,
"kl": 0.009942054748535156,
"learning_rate": 1.2391430633019452e-07,
"loss": 0.0096,
"step": 1440
},
{
"clip_ratio": 0.0,
"completion_length": 525.7826709747314,
"epoch": 2.1357593140330433,
"grad_norm": 0.05131447687745094,
"kl": 0.00860595703125,
"learning_rate": 1.2197130680442399e-07,
"loss": 0.0073,
"num_tokens": 430520032.0,
"reward": 0.07282366428989917,
"reward_std": 0.0797313749208115,
"rewards/pure_accuracy_reward_math": 0.07282366172876209,
"step": 1441
},
{
"clip_ratio": 0.0003007381984616586,
"epoch": 2.1376714170475934,
"grad_norm": 0.03815394267439842,
"kl": 0.008358001708984375,
"learning_rate": 1.2004328024850938e-07,
"loss": 0.0073,
"step": 1442
},
{
"clip_ratio": 0.0003256684682355626,
"epoch": 2.139583520062143,
"grad_norm": 0.03841105103492737,
"kl": 0.008275985717773438,
"learning_rate": 1.1813023880203722e-07,
"loss": 0.0072,
"step": 1443
},
{
"clip_ratio": 0.00034418403180325186,
"epoch": 2.1414956230766933,
"grad_norm": 0.041511572897434235,
"kl": 0.008276939392089844,
"learning_rate": 1.1623219451024098e-07,
"loss": 0.0071,
"step": 1444
},
{
"clip_ratio": 0.00032526867431670325,
"epoch": 2.143407726091243,
"grad_norm": 0.03922862559556961,
"kl": 0.008294105529785156,
"learning_rate": 1.1434915932392682e-07,
"loss": 0.007,
"step": 1445
},
{
"clip_ratio": 0.0,
"completion_length": 526.7310523986816,
"epoch": 2.145319829105793,
"grad_norm": 0.04134941101074219,
"kl": 0.008166313171386719,
"learning_rate": 1.1248114509939817e-07,
"loss": 0.0067,
"num_tokens": 434141592.0,
"reward": 0.08342634307336994,
"reward_std": 0.08578344061970711,
"rewards/pure_accuracy_reward_math": 0.08342634132714011,
"step": 1446
},
{
"clip_ratio": 0.00029539940015865795,
"epoch": 2.147231932120343,
"grad_norm": 0.04034848138689995,
"kl": 0.008122920989990234,
"learning_rate": 1.1062816359838024e-07,
"loss": 0.0066,
"step": 1447
},
{
"clip_ratio": 0.0003565281184592095,
"epoch": 2.149144035134893,
"grad_norm": 0.04018424078822136,
"kl": 0.00803232192993164,
"learning_rate": 1.0879022648794645e-07,
"loss": 0.0066,
"step": 1448
},
{
"clip_ratio": 0.0003515161848781645,
"epoch": 2.1510561381494426,
"grad_norm": 0.03917380049824715,
"kl": 0.007886886596679688,
"learning_rate": 1.0696734534044629e-07,
"loss": 0.0065,
"step": 1449
},
{
"clip_ratio": 0.0004228238227028669,
"epoch": 2.1529682411639928,
"grad_norm": 0.038036227226257324,
"kl": 0.00785064697265625,
"learning_rate": 1.0515953163342973e-07,
"loss": 0.0064,
"step": 1450
},
{
"clip_ratio": 0.0,
"completion_length": 544.0078330039978,
"epoch": 2.1548803441785425,
"grad_norm": 0.03814779594540596,
"kl": 0.008002758026123047,
"learning_rate": 1.0336679674957716e-07,
"loss": 0.0113,
"num_tokens": 437824108.0,
"reward": 0.07533482514554635,
"reward_std": 0.07659588241949677,
"rewards/pure_accuracy_reward_math": 0.07533482287544757,
"step": 1451
},
{
"clip_ratio": 0.0002914705042371679,
"epoch": 2.1567924471930926,
"grad_norm": 0.03763413056731224,
"kl": 0.00798654556274414,
"learning_rate": 1.0158915197662628e-07,
"loss": 0.0113,
"step": 1452
},
{
"clip_ratio": 0.0002916823746659247,
"epoch": 2.1587045502076423,
"grad_norm": 0.036225125193595886,
"kl": 0.008030414581298828,
"learning_rate": 9.982660850730269e-08,
"loss": 0.0112,
"step": 1453
},
{
"clip_ratio": 0.0002708278207137482,
"epoch": 2.1606166532221924,
"grad_norm": 0.03529945760965347,
"kl": 0.00803375244140625,
"learning_rate": 9.807917743924838e-08,
"loss": 0.0112,
"step": 1454
},
{
"clip_ratio": 0.0002930295025862506,
"epoch": 2.162528756236742,
"grad_norm": 0.03426925837993622,
"kl": 0.007987022399902344,
"learning_rate": 9.634686977495089e-08,
"loss": 0.0111,
"step": 1455
},
{
"clip_ratio": 0.0,
"completion_length": 517.6585068702698,
"epoch": 2.1644408592512923,
"grad_norm": 0.038425736129283905,
"kl": 0.008115291595458984,
"learning_rate": 9.462969642167613e-08,
"loss": 0.0052,
"num_tokens": 441407888.0,
"reward": 0.07617187869618647,
"reward_std": 0.0740246243076399,
"rewards/pure_accuracy_reward_math": 0.07617187630967237,
"step": 1456
},
{
"clip_ratio": 0.00023060813538222646,
"epoch": 2.166352962265842,
"grad_norm": 0.03851727396249771,
"kl": 0.008001327514648438,
"learning_rate": 9.292766819139847e-08,
"loss": 0.0052,
"step": 1457
},
{
"clip_ratio": 0.0002378168165932948,
"epoch": 2.168265065280392,
"grad_norm": 0.040155645459890366,
"kl": 0.007994651794433594,
"learning_rate": 9.12407958007322e-08,
"loss": 0.0051,
"step": 1458
},
{
"clip_ratio": 0.0002497726611068174,
"epoch": 2.170177168294942,
"grad_norm": 0.0425233468413353,
"kl": 0.007935047149658203,
"learning_rate": 8.956908987086538e-08,
"loss": 0.005,
"step": 1459
},
{
"clip_ratio": 0.00030142679486289126,
"epoch": 2.172089271309492,
"grad_norm": 0.03647738695144653,
"kl": 0.007966041564941406,
"learning_rate": 8.791256092749223e-08,
"loss": 0.0049,
"step": 1460
},
{
"clip_ratio": 0.0,
"completion_length": 520.2968997955322,
"epoch": 2.1740013743240416,
"grad_norm": 0.22045741975307465,
"kl": 0.022356510162353516,
"learning_rate": 8.627121940074645e-08,
"loss": 0.0122,
"num_tokens": 445010628.0,
"reward": 0.08705357578583062,
"reward_std": 0.08814817463280633,
"rewards/pure_accuracy_reward_math": 0.08705357281723991,
"step": 1461
},
{
"clip_ratio": 0.00031046926528688346,
"epoch": 2.1759134773385918,
"grad_norm": 0.06329243630170822,
"kl": 0.015823841094970703,
"learning_rate": 8.464507562513657e-08,
"loss": 0.0119,
"step": 1462
},
{
"clip_ratio": 0.0003438202776351318,
"epoch": 2.1778255803531414,
"grad_norm": 0.05041000247001648,
"kl": 0.014271736145019531,
"learning_rate": 8.303413983948017e-08,
"loss": 0.0118,
"step": 1463
},
{
"clip_ratio": 0.0003563892260558532,
"epoch": 2.1797376833676916,
"grad_norm": 0.04660080000758171,
"kl": 0.013462543487548828,
"learning_rate": 8.143842218683862e-08,
"loss": 0.0117,
"step": 1464
},
{
"clip_ratio": 0.0004125210731444895,
"epoch": 2.1816497863822413,
"grad_norm": 0.04536700248718262,
"kl": 0.012927532196044922,
"learning_rate": 7.985793271445636e-08,
"loss": 0.0116,
"step": 1465
},
{
"clip_ratio": 0.0,
"completion_length": 517.6127443313599,
"epoch": 2.1835618893967914,
"grad_norm": 0.08454474061727524,
"kl": 0.010744094848632812,
"learning_rate": 7.829268137369311e-08,
"loss": 0.0075,
"num_tokens": 448601372.0,
"reward": 0.0750558071595151,
"reward_std": 0.0813654173980467,
"rewards/pure_accuracy_reward_math": 0.07505580488941632,
"step": 1466
},
{
"clip_ratio": 0.00028517025145902153,
"epoch": 2.185473992411341,
"grad_norm": 0.04138394817709923,
"kl": 0.009669780731201172,
"learning_rate": 7.674267801996427e-08,
"loss": 0.0075,
"step": 1467
},
{
"clip_ratio": 0.00027802770790685827,
"epoch": 2.1873860954258912,
"grad_norm": 0.03745463490486145,
"kl": 0.009511947631835938,
"learning_rate": 7.52079324126792e-08,
"loss": 0.0074,
"step": 1468
},
{
"clip_ratio": 0.0003267590287805433,
"epoch": 2.189298198440441,
"grad_norm": 0.036841075867414474,
"kl": 0.00956106185913086,
"learning_rate": 7.368845421517779e-08,
"loss": 0.0073,
"step": 1469
},
{
"clip_ratio": 0.0003443693621534294,
"epoch": 2.191210301454991,
"grad_norm": 0.0362345427274704,
"kl": 0.009715557098388672,
"learning_rate": 7.21842529946698e-08,
"loss": 0.0072,
"step": 1470
},
{
"clip_ratio": 0.0,
"completion_length": 499.83763551712036,
"epoch": 2.1931224044695408,
"grad_norm": 0.0431695282459259,
"kl": 0.008378028869628906,
"learning_rate": 7.0695338222177e-08,
"loss": 0.0093,
"num_tokens": 452124382.0,
"reward": 0.07756696839351207,
"reward_std": 0.08685944566968828,
"rewards/pure_accuracy_reward_math": 0.07756696530850604,
"step": 1471
},
{
"clip_ratio": 0.0003288618632950602,
"epoch": 2.195034507484091,
"grad_norm": 0.042445823550224304,
"kl": 0.008408546447753906,
"learning_rate": 6.922171927247062e-08,
"loss": 0.0092,
"step": 1472
},
{
"clip_ratio": 0.0003429904774066017,
"epoch": 2.1969466104986406,
"grad_norm": 0.04231419414281845,
"kl": 0.008434295654296875,
"learning_rate": 6.776340542401422e-08,
"loss": 0.0092,
"step": 1473
},
{
"clip_ratio": 0.00035230960349963425,
"epoch": 2.1988587135131903,
"grad_norm": 0.04162426292896271,
"kl": 0.008434295654296875,
"learning_rate": 6.632040585890398e-08,
"loss": 0.0091,
"step": 1474
},
{
"clip_ratio": 0.000348456743722636,
"epoch": 2.2007708165277404,
"grad_norm": 0.04009128361940384,
"kl": 0.008394718170166016,
"learning_rate": 6.489272966281269e-08,
"loss": 0.009,
"step": 1475
},
{
"clip_ratio": 0.0,
"completion_length": 511.53015899658203,
"epoch": 2.2026829195422906,
"grad_norm": 0.03803718462586403,
"kl": 0.008605003356933594,
"learning_rate": 6.348038582493e-08,
"loss": 0.0064,
"num_tokens": 455697798.0,
"reward": 0.06863839633297175,
"reward_std": 0.0772402475704439,
"rewards/pure_accuracy_reward_math": 0.06863839423749596,
"step": 1476
},
{
"clip_ratio": 0.0002735381897878142,
"epoch": 2.2045950225568403,
"grad_norm": 0.036724258214235306,
"kl": 0.008575439453125,
"learning_rate": 6.208338323790891e-08,
"loss": 0.0064,
"step": 1477
},
{
"clip_ratio": 0.000271568493644736,
"epoch": 2.20650712557139,
"grad_norm": 0.03627302870154381,
"kl": 0.008494853973388672,
"learning_rate": 6.070173069780638e-08,
"loss": 0.0063,
"step": 1478
},
{
"clip_ratio": 0.0003129301562694309,
"epoch": 2.20841922858594,
"grad_norm": 0.035685960203409195,
"kl": 0.008512496948242188,
"learning_rate": 5.933543690403082e-08,
"loss": 0.0063,
"step": 1479
},
{
"clip_ratio": 0.0003575469975203305,
"epoch": 2.21033133160049,
"grad_norm": 0.03495527431368828,
"kl": 0.008492469787597656,
"learning_rate": 5.7984510459285215e-08,
"loss": 0.0062,
"step": 1480
},
{
"clip_ratio": 0.0,
"completion_length": 527.403482913971,
"epoch": 2.21224343461504,
"grad_norm": 0.041989997029304504,
"kl": 0.008183956146240234,
"learning_rate": 5.6648959869514965e-08,
"loss": 0.0075,
"num_tokens": 459321180.0,
"reward": 0.07617187898722477,
"reward_std": 0.0817908609751612,
"rewards/pure_accuracy_reward_math": 0.07617187630967237,
"step": 1481
},
{
"clip_ratio": 0.0003129412224893713,
"epoch": 2.2141555376295896,
"grad_norm": 0.04108978435397148,
"kl": 0.00823974609375,
"learning_rate": 5.532879354385234e-08,
"loss": 0.0075,
"step": 1482
},
{
"clip_ratio": 0.0003202799926498301,
"epoch": 2.2160676406441397,
"grad_norm": 0.03990933671593666,
"kl": 0.00827646255493164,
"learning_rate": 5.4024019794565176e-08,
"loss": 0.0075,
"step": 1483
},
{
"clip_ratio": 0.0003925440155398974,
"epoch": 2.2179797436586894,
"grad_norm": 0.039193831384181976,
"kl": 0.008234977722167969,
"learning_rate": 5.273464683700352e-08,
"loss": 0.0074,
"step": 1484
},
{
"clip_ratio": 0.0004001183214654702,
"epoch": 2.2198918466732396,
"grad_norm": 0.039878588169813156,
"kl": 0.00826406478881836,
"learning_rate": 5.1460682789547526e-08,
"loss": 0.0073,
"step": 1485
},
{
"clip_ratio": 0.0,
"completion_length": 531.470449924469,
"epoch": 2.2218039496877893,
"grad_norm": 0.04079683497548103,
"kl": 0.011513710021972656,
"learning_rate": 5.020213567355825e-08,
"loss": 0.0091,
"num_tokens": 462957626.0,
"reward": 0.06752232459257357,
"reward_std": 0.07320140459341928,
"rewards/pure_accuracy_reward_math": 0.0675223229045514,
"step": 1486
},
{
"clip_ratio": 0.0002717390548241383,
"epoch": 2.2237160527023394,
"grad_norm": 0.037311483174562454,
"kl": 0.011410713195800781,
"learning_rate": 4.8959013413324705e-08,
"loss": 0.009,
"step": 1487
},
{
"clip_ratio": 0.0002951391629721911,
"epoch": 2.225628155716889,
"grad_norm": 0.035728756338357925,
"kl": 0.011387348175048828,
"learning_rate": 4.773132383601664e-08,
"loss": 0.009,
"step": 1488
},
{
"clip_ratio": 0.00030970129540719427,
"epoch": 2.2275402587314392,
"grad_norm": 0.03630708530545235,
"kl": 0.011130332946777344,
"learning_rate": 4.6519074671631805e-08,
"loss": 0.0089,
"step": 1489
},
{
"clip_ratio": 0.00035198272149727927,
"epoch": 2.229452361745989,
"grad_norm": 0.035501569509506226,
"kl": 0.010982990264892578,
"learning_rate": 4.5322273552951265e-08,
"loss": 0.0088,
"step": 1490
},
{
"clip_ratio": 0.0,
"completion_length": 516.0912661552429,
"epoch": 2.231364464760539,
"grad_norm": 0.039065275341272354,
"kl": 0.008381366729736328,
"learning_rate": 4.4140928015488085e-08,
"loss": 0.0067,
"num_tokens": 466540145.0,
"reward": 0.08007812951109372,
"reward_std": 0.07346039032563567,
"rewards/pure_accuracy_reward_math": 0.08007812619325705,
"step": 1491
},
{
"clip_ratio": 0.0002747246091985289,
"epoch": 2.2332765677750888,
"grad_norm": 0.03766880929470062,
"kl": 0.008387088775634766,
"learning_rate": 4.297504549744119e-08,
"loss": 0.0067,
"step": 1492
},
{
"clip_ratio": 0.0002486348788579562,
"epoch": 2.235188670789639,
"grad_norm": 0.03599947690963745,
"kl": 0.0084991455078125,
"learning_rate": 4.182463333964909e-08,
"loss": 0.0066,
"step": 1493
},
{
"clip_ratio": 0.0002674886795261955,
"epoch": 2.2371007738041886,
"grad_norm": 0.0361332893371582,
"kl": 0.008679389953613281,
"learning_rate": 4.068969878554263e-08,
"loss": 0.0066,
"step": 1494
},
{
"clip_ratio": 0.00031218544620514876,
"epoch": 2.2390128768187387,
"grad_norm": 0.035462211817502975,
"kl": 0.008719921112060547,
"learning_rate": 3.957024898110007e-08,
"loss": 0.0065,
"step": 1495
},
{
"clip_ratio": 0.0,
"completion_length": 507.05945777893066,
"epoch": 2.2409249798332884,
"grad_norm": 0.10880274325609207,
"kl": 0.012134075164794922,
"learning_rate": 3.846629097480126e-08,
"loss": 0.0046,
"num_tokens": 470091662.0,
"reward": 0.07952009330620058,
"reward_std": 0.08660046098520979,
"rewards/pure_accuracy_reward_math": 0.0795200907450635,
"step": 1496
},
{
"clip_ratio": 0.00034633993402621854,
"epoch": 2.2428370828478386,
"grad_norm": 0.04444468766450882,
"kl": 0.010071754455566406,
"learning_rate": 3.737783171758408e-08,
"loss": 0.0045,
"step": 1497
},
{
"clip_ratio": 0.00040814166391101026,
"epoch": 2.2447491858623883,
"grad_norm": 0.050679393112659454,
"kl": 0.009745597839355469,
"learning_rate": 3.630487806280086e-08,
"loss": 0.0044,
"step": 1498
},
{
"clip_ratio": 0.00040935890626769833,
"epoch": 2.2466612888769384,
"grad_norm": 0.04249563813209534,
"kl": 0.009531974792480469,
"learning_rate": 3.524743676617426e-08,
"loss": 0.0044,
"step": 1499
},
{
"clip_ratio": 0.00041069585563491273,
"epoch": 2.248573391891488,
"grad_norm": 0.04013880342245102,
"kl": 0.009422779083251953,
"learning_rate": 3.42055144857556e-08,
"loss": 0.0042,
"step": 1500
},
{
"clip_ratio": 0.0,
"completion_length": 530.4908156394958,
"epoch": 2.250485494906038,
"grad_norm": 0.04119328781962395,
"kl": 0.00858306884765625,
"learning_rate": 3.3179117781882154e-08,
"loss": 0.0064,
"num_tokens": 473729421.0,
"reward": 0.08175223629223183,
"reward_std": 0.080375739664305,
"rewards/pure_accuracy_reward_math": 0.08175223390571773,
"step": 1501
},
{
"clip_ratio": 0.00027040669908728887,
"epoch": 2.252397597920588,
"grad_norm": 0.03726639971137047,
"kl": 0.008556365966796875,
"learning_rate": 3.216825311713689e-08,
"loss": 0.0064,
"step": 1502
},
{
"clip_ratio": 0.0003022322244419229,
"epoch": 2.254309700935138,
"grad_norm": 0.03740008547902107,
"kl": 0.008624553680419922,
"learning_rate": 3.11729268563063e-08,
"loss": 0.0063,
"step": 1503
},
{
"clip_ratio": 0.0002972338604081415,
"epoch": 2.2562218039496877,
"grad_norm": 0.036019936203956604,
"kl": 0.008683204650878906,
"learning_rate": 3.019314526634232e-08,
"loss": 0.0062,
"step": 1504
},
{
"clip_ratio": 0.0003317092545103151,
"epoch": 2.258133906964238,
"grad_norm": 0.035242002457380295,
"kl": 0.008699893951416016,
"learning_rate": 2.922891451632076e-08,
"loss": 0.0062,
"step": 1505
},
{
"clip_ratio": 0.0,
"completion_length": 516.8340096473694,
"epoch": 2.2600460099787876,
"grad_norm": 0.04786042869091034,
"kl": 0.0166015625,
"learning_rate": 2.8280240677403813e-08,
"loss": 0.0117,
"num_tokens": 477311002.0,
"reward": 0.08593750389991328,
"reward_std": 0.09509739134227857,
"rewards/pure_accuracy_reward_math": 0.08593750139698386,
"step": 1506
},
{
"clip_ratio": 0.0003771551589011324,
"epoch": 2.2619581129933373,
"grad_norm": 0.04542854428291321,
"kl": 0.016517162322998047,
"learning_rate": 2.7347129722801736e-08,
"loss": 0.0117,
"step": 1507
},
{
"clip_ratio": 0.00043879733209450933,
"epoch": 2.2638702160078874,
"grad_norm": 0.04336082562804222,
"kl": 0.016106605529785156,
"learning_rate": 2.6429587527734835e-08,
"loss": 0.0116,
"step": 1508
},
{
"clip_ratio": 0.0005006881825977416,
"epoch": 2.2657823190224375,
"grad_norm": 0.04397574067115784,
"kl": 0.015746116638183594,
"learning_rate": 2.5527619869396003e-08,
"loss": 0.0115,
"step": 1509
},
{
"clip_ratio": 0.0005348546662844456,
"epoch": 2.2676944220369872,
"grad_norm": 0.043936342000961304,
"kl": 0.015500068664550781,
"learning_rate": 2.464123242691574e-08,
"loss": 0.0114,
"step": 1510
},
{
"clip_ratio": 0.0,
"completion_length": 526.8474016189575,
"epoch": 2.269606525051537,
"grad_norm": 0.04165401682257652,
"kl": 0.008256912231445312,
"learning_rate": 2.377043078132496e-08,
"loss": 0.0079,
"num_tokens": 480935151.0,
"reward": 0.08342634345171973,
"reward_std": 0.09024772583507001,
"rewards/pure_accuracy_reward_math": 0.08342634071595967,
"step": 1511
},
{
"clip_ratio": 0.0003286536882569635,
"epoch": 2.271518628066087,
"grad_norm": 0.04013460502028465,
"kl": 0.008354663848876953,
"learning_rate": 2.291522041552141e-08,
"loss": 0.0079,
"step": 1512
},
{
"clip_ratio": 0.00034448601985559435,
"epoch": 2.273430731080637,
"grad_norm": 0.03929148614406586,
"kl": 0.008509159088134766,
"learning_rate": 2.207560671423331e-08,
"loss": 0.0078,
"step": 1513
},
{
"clip_ratio": 0.00038580430322099346,
"epoch": 2.275342834095187,
"grad_norm": 0.04108521342277527,
"kl": 0.008730888366699219,
"learning_rate": 2.1251594963986876e-08,
"loss": 0.0077,
"step": 1514
},
{
"clip_ratio": 0.00038072799372912414,
"epoch": 2.2772549371097366,
"grad_norm": 0.038887783885002136,
"kl": 0.008725643157958984,
"learning_rate": 2.0443190353072185e-08,
"loss": 0.0076,
"step": 1515
},
{
"clip_ratio": 0.0,
"completion_length": 519.4051609039307,
"epoch": 2.2791670401242867,
"grad_norm": 0.03783741220831871,
"kl": 0.008581161499023438,
"learning_rate": 1.9650397971510972e-08,
"loss": 0.0064,
"num_tokens": 484530587.0,
"reward": 0.08231027124566026,
"reward_std": 0.08037574036279693,
"rewards/pure_accuracy_reward_math": 0.08231026897556148,
"step": 1516
},
{
"clip_ratio": 0.0002746778108644321,
"epoch": 2.2810791431388364,
"grad_norm": 0.03765445947647095,
"kl": 0.008580207824707031,
"learning_rate": 1.8873222811024717e-08,
"loss": 0.0063,
"step": 1517
},
{
"clip_ratio": 0.00031986788579274616,
"epoch": 2.2829912461533866,
"grad_norm": 0.03684096038341522,
"kl": 0.008593082427978516,
"learning_rate": 1.8111669765003005e-08,
"loss": 0.0063,
"step": 1518
},
{
"clip_ratio": 0.0003354349921380617,
"epoch": 2.2849033491679362,
"grad_norm": 0.03599463030695915,
"kl": 0.008591175079345703,
"learning_rate": 1.73657436284716e-08,
"loss": 0.0062,
"step": 1519
},
{
"clip_ratio": 0.0003505910435706028,
"epoch": 2.2868154521824864,
"grad_norm": 0.035750966519117355,
"kl": 0.00874948501586914,
"learning_rate": 1.6635449098064972e-08,
"loss": 0.0061,
"step": 1520
},
{
"clip_ratio": 0.0,
"completion_length": 521.2455606460571,
"epoch": 2.288727555197036,
"grad_norm": 0.03890154883265495,
"kl": 0.008922100067138672,
"learning_rate": 1.5920790771993822e-08,
"loss": 0.0078,
"num_tokens": 488136255.0,
"reward": 0.07952009289874695,
"reward_std": 0.07556614064378664,
"rewards/pure_accuracy_reward_math": 0.07952009068685584,
"step": 1521
},
{
"clip_ratio": 0.00024827225587387147,
"epoch": 2.290639658211586,
"grad_norm": 0.037810854613780975,
"kl": 0.008934974670410156,
"learning_rate": 1.5221773150017882e-08,
"loss": 0.0078,
"step": 1522
},
{
"clip_ratio": 0.0002384709360967463,
"epoch": 2.292551761226136,
"grad_norm": 0.0364384800195694,
"kl": 0.008936882019042969,
"learning_rate": 1.4538400633417049e-08,
"loss": 0.0077,
"step": 1523
},
{
"clip_ratio": 0.0002599185108635993,
"epoch": 2.294463864240686,
"grad_norm": 0.035106074064970016,
"kl": 0.008829116821289062,
"learning_rate": 1.387067752496335e-08,
"loss": 0.0076,
"step": 1524
},
{
"clip_ratio": 0.0003290796867077006,
"epoch": 2.2963759672552357,
"grad_norm": 0.03489363566040993,
"kl": 0.0086822509765625,
"learning_rate": 1.3218608028895131e-08,
"loss": 0.0076,
"step": 1525
},
{
"clip_ratio": 0.0,
"completion_length": 517.0547122955322,
"epoch": 2.298288070269786,
"grad_norm": 0.040062014013528824,
"kl": 0.008834362030029297,
"learning_rate": 1.2582196250888745e-08,
"loss": 0.0071,
"num_tokens": 491722139.0,
"reward": 0.08621652179863304,
"reward_std": 0.08020308247068897,
"rewards/pure_accuracy_reward_math": 0.08621651906287298,
"step": 1526
},
{
"clip_ratio": 0.00031514769625573535,
"epoch": 2.3002001732843356,
"grad_norm": 0.03938477113842964,
"kl": 0.008733272552490234,
"learning_rate": 1.1961446198033855e-08,
"loss": 0.0071,
"step": 1527
},
{
"clip_ratio": 0.00030386562087869606,
"epoch": 2.3021122762988857,
"grad_norm": 0.03844742849469185,
"kl": 0.008654594421386719,
"learning_rate": 1.1356361778808167e-08,
"loss": 0.007,
"step": 1528
},
{
"clip_ratio": 0.00034510965764411594,
"epoch": 2.3040243793134354,
"grad_norm": 0.03755528852343559,
"kl": 0.00861358642578125,
"learning_rate": 1.076694680305218e-08,
"loss": 0.007,
"step": 1529
},
{
"clip_ratio": 0.00035207756366162357,
"epoch": 2.3059364823279855,
"grad_norm": 0.03696778416633606,
"kl": 0.008616447448730469,
"learning_rate": 1.0193204981946426e-08,
"loss": 0.0069,
"step": 1530
},
{
"clip_ratio": 0.0,
"completion_length": 516.7249145507812,
"epoch": 2.3078485853425352,
"grad_norm": 0.045076508074998856,
"kl": 0.014521598815917969,
"learning_rate": 9.63513992798676e-09,
"loss": 0.0065,
"num_tokens": 495305537.0,
"reward": 0.07505580713041127,
"reward_std": 0.07844264624873176,
"rewards/pure_accuracy_reward_math": 0.07505580480210483,
"step": 1531
},
{
"clip_ratio": 0.0003054732096074986,
"epoch": 2.3097606883570854,
"grad_norm": 0.041828691959381104,
"kl": 0.01419973373413086,
"learning_rate": 9.092755154961886e-09,
"loss": 0.0065,
"step": 1532
},
{
"clip_ratio": 0.00030572324658351135,
"epoch": 2.311672791371635,
"grad_norm": 0.03949357569217682,
"kl": 0.013697624206542969,
"learning_rate": 8.566054077932262e-09,
"loss": 0.0064,
"step": 1533
},
{
"clip_ratio": 0.0003279060996987937,
"epoch": 2.313584894386185,
"grad_norm": 0.038545649498701096,
"kl": 0.01345968246459961,
"learning_rate": 8.055040013207061e-09,
"loss": 0.0063,
"step": 1534
},
{
"clip_ratio": 0.00033917763732915773,
"epoch": 2.315496997400735,
"grad_norm": 0.03716408833861351,
"kl": 0.01330709457397461,
"learning_rate": 7.559716178325016e-09,
"loss": 0.0062,
"step": 1535
},
{
"clip_ratio": 0.0,
"completion_length": 519.2921552658081,
"epoch": 2.317409100415285,
"grad_norm": 0.041162386536598206,
"kl": 0.008297443389892578,
"learning_rate": 7.080085692032224e-09,
"loss": 0.0079,
"num_tokens": 498900584.0,
"reward": 0.08928571816068143,
"reward_std": 0.08428199036279693,
"rewards/pure_accuracy_reward_math": 0.08928571571595967,
"step": 1536
},
{
"clip_ratio": 0.00029752771973790004,
"epoch": 2.3193212034298347,
"grad_norm": 0.03933210298418999,
"kl": 0.008346080780029297,
"learning_rate": 6.616151574264374e-09,
"loss": 0.0079,
"step": 1537
},
{
"clip_ratio": 0.0003302163729017593,
"epoch": 2.321233306444385,
"grad_norm": 0.038146842271089554,
"kl": 0.008320331573486328,
"learning_rate": 6.1679167461262124e-09,
"loss": 0.0078,
"step": 1538
},
{
"clip_ratio": 0.0003326926421891585,
"epoch": 2.3231454094589346,
"grad_norm": 0.038072116672992706,
"kl": 0.008330345153808594,
"learning_rate": 5.735384029874336e-09,
"loss": 0.0077,
"step": 1539
},
{
"clip_ratio": 0.00038002995881925017,
"epoch": 2.3250575124734847,
"grad_norm": 0.037320397794246674,
"kl": 0.008296012878417969,
"learning_rate": 5.31855614889859e-09,
"loss": 0.0076,
"step": 1540
},
{
"clip_ratio": 0.0,
"completion_length": 520.1487407684326,
"epoch": 2.3269696154880344,
"grad_norm": 0.03688493371009827,
"kl": 0.008476734161376953,
"learning_rate": 4.917435727704867e-09,
"loss": 0.0024,
"num_tokens": 502500281.0,
"reward": 0.0811942005821038,
"reward_std": 0.0787416979437694,
"rewards/pure_accuracy_reward_math": 0.08119419842842035,
"step": 1541
},
{
"clip_ratio": 0.00028201957394458077,
"epoch": 2.3288817185025845,
"grad_norm": 0.03607385605573654,
"kl": 0.008441448211669922,
"learning_rate": 4.53202529190011e-09,
"loss": 0.0023,
"step": 1542
},
{
"clip_ratio": 0.0002742231245633775,
"epoch": 2.330793821517134,
"grad_norm": 0.03572804853320122,
"kl": 0.00852060317993164,
"learning_rate": 4.162327268173727e-09,
"loss": 0.0023,
"step": 1543
},
{
"clip_ratio": 0.0003046261713848253,
"epoch": 2.332705924531684,
"grad_norm": 0.034965962171554565,
"kl": 0.00861501693725586,
"learning_rate": 3.80834398428509e-09,
"loss": 0.0022,
"step": 1544
},
{
"clip_ratio": 0.0003226917802976459,
"epoch": 2.334618027546234,
"grad_norm": 0.034803807735443115,
"kl": 0.008724212646484375,
"learning_rate": 3.470077669046612e-09,
"loss": 0.0021,
"step": 1545
},
{
"clip_ratio": 0.0,
"completion_length": 538.0273699760437,
"epoch": 2.336530130560784,
"grad_norm": 0.034996818751096725,
"kl": 0.008575439453125,
"learning_rate": 3.147530452311809e-09,
"loss": 0.0064,
"num_tokens": 506159719.0,
"reward": 0.06891741408617236,
"reward_std": 0.07063014718005434,
"rewards/pure_accuracy_reward_math": 0.06891741210711189,
"step": 1546
},
{
"clip_ratio": 0.00023073077210256088,
"epoch": 2.338442233575334,
"grad_norm": 0.03347066789865494,
"kl": 0.008565902709960938,
"learning_rate": 2.8407043649597567e-09,
"loss": 0.0063,
"step": 1547
},
{
"clip_ratio": 0.000268154504112772,
"epoch": 2.3403543365898836,
"grad_norm": 0.03273630142211914,
"kl": 0.008545398712158203,
"learning_rate": 2.549601338883989e-09,
"loss": 0.0063,
"step": 1548
},
{
"clip_ratio": 0.00029292683666426456,
"epoch": 2.3422664396044337,
"grad_norm": 0.032376162707805634,
"kl": 0.008570671081542969,
"learning_rate": 2.2742232069794533e-09,
"loss": 0.0063,
"step": 1549
},
{
"clip_ratio": 0.0003443536306235728,
"epoch": 2.344178542618984,
"grad_norm": 0.031950000673532486,
"kl": 0.008484363555908203,
"learning_rate": 2.01457170313113e-09,
"loss": 0.0062,
"step": 1550
},
{
"clip_ratio": 0.0,
"completion_length": 520.7207255363464,
"epoch": 2.3460906456335335,
"grad_norm": 0.04171088710427284,
"kl": 0.009114742279052734,
"learning_rate": 1.7706484622034837e-09,
"loss": 0.005,
"num_tokens": 509757966.0,
"reward": 0.07672991443541832,
"reward_std": 0.08149181143380702,
"rewards/pure_accuracy_reward_math": 0.07672991228173487,
"step": 1551
},
{
"clip_ratio": 0.0003305982788788242,
"epoch": 2.3480027486480832,
"grad_norm": 0.04123101010918617,
"kl": 0.009046554565429688,
"learning_rate": 1.5424550200293653e-09,
"loss": 0.005,
"step": 1552
},
{
"clip_ratio": 0.0003486324259256435,
"epoch": 2.3499148516626334,
"grad_norm": 0.039809513837099075,
"kl": 0.008966445922851562,
"learning_rate": 1.3299928134014039e-09,
"loss": 0.0049,
"step": 1553
},
{
"clip_ratio": 0.0003954665013452541,
"epoch": 2.351826954677183,
"grad_norm": 0.0393875353038311,
"kl": 0.008915901184082031,
"learning_rate": 1.1332631800620164e-09,
"loss": 0.0049,
"step": 1554
},
{
"clip_ratio": 0.0004334128346954458,
"epoch": 2.353739057691733,
"grad_norm": 0.03990260884165764,
"kl": 0.008862972259521484,
"learning_rate": 9.522673586956355e-10,
"loss": 0.0047,
"step": 1555
},
{
"clip_ratio": 0.0,
"completion_length": 518.488025188446,
"epoch": 2.355651160706283,
"grad_norm": 0.04300679266452789,
"kl": 0.009171009063720703,
"learning_rate": 7.870064889206608e-10,
"loss": 0.0082,
"num_tokens": 513350767.0,
"reward": 0.07728794994181953,
"reward_std": 0.08290693227900192,
"rewards/pure_accuracy_reward_math": 0.07728794743889011,
"step": 1556
},
{
"clip_ratio": 0.000295089724772879,
"epoch": 2.357563263720833,
"grad_norm": 0.04144243150949478,
"kl": 0.009136676788330078,
"learning_rate": 6.374816112819648e-10,
"loss": 0.0082,
"step": 1557
},
{
"clip_ratio": 0.0003283331608940898,
"epoch": 2.3594753667353827,
"grad_norm": 0.039357006549835205,
"kl": 0.009202003479003906,
"learning_rate": 5.036936672447868e-10,
"loss": 0.0081,
"step": 1558
},
{
"clip_ratio": 0.00036647373104869985,
"epoch": 2.361387469749933,
"grad_norm": 0.03904441371560097,
"kl": 0.009307384490966797,
"learning_rate": 3.8564349918890356e-10,
"loss": 0.008,
"step": 1559
},
{
"clip_ratio": 0.0004084905730792343,
"epoch": 2.3632995727644825,
"grad_norm": 0.03901646286249161,
"kl": 0.00932168960571289,
"learning_rate": 2.833318504030791e-10,
"loss": 0.0079,
"step": 1560
},
{
"clip_ratio": 0.0,
"completion_length": 527.474356174469,
"epoch": 2.3652116757790327,
"grad_norm": 5.391517162322998,
"kl": 0.0942845344543457,
"learning_rate": 1.9675936507979056e-10,
"loss": 0.0081,
"num_tokens": 516974751.0,
"reward": 0.06975446754950099,
"reward_std": 0.06989945453824475,
"rewards/pure_accuracy_reward_math": 0.06975446597789414,
"step": 1561
},
{
"clip_ratio": 0.0002886794856635788,
"epoch": 2.3671237787935824,
"grad_norm": 0.1764528900384903,
"kl": 0.013553619384765625,
"learning_rate": 1.2592658831245274e-10,
"loss": 0.0049,
"step": 1562
},
{
"clip_ratio": 0.00028670978349509824,
"epoch": 2.3690358818081325,
"grad_norm": 0.03846847265958786,
"kl": 0.009183406829833984,
"learning_rate": 7.083396609097737e-11,
"loss": 0.0047,
"step": 1563
},
{
"clip_ratio": 0.0002776476591748178,
"epoch": 2.370947984822682,
"grad_norm": 0.035545963793992996,
"kl": 0.008979320526123047,
"learning_rate": 3.148184529927489e-11,
"loss": 0.0046,
"step": 1564
},
{
"clip_ratio": 0.00032522391097700165,
"epoch": 2.3728600878372323,
"grad_norm": 0.1538141518831253,
"kl": 0.009156227111816406,
"learning_rate": 7.870473713589288e-12,
"loss": 0.0046,
"step": 1565
},
{
"clip_ratio": 0.0,
"completion_length": 530.6135845184326,
"epoch": 2.374772190851782,
"grad_norm": 0.0368269719183445,
"kl": 0.008574485778808594,
"learning_rate": 0.0,
"loss": 0.0087,
"num_tokens": 520611370.0,
"reward": 0.07142857427243143,
"reward_std": 0.07900068280287087,
"rewards/pure_accuracy_reward_math": 0.07142857293365523,
"step": 1566
},
{
"epoch": 2.374772190851782,
"step": 1566,
"total_flos": 0.0,
"train_loss": 0.003398028112404372,
"train_runtime": 273585.6306,
"train_samples_per_second": 1.028,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 1566,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}