{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.40973636026069477,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 606.2532775402069,
"epoch": 0.001638945441042779,
"grad_norm": 0.05986390635371208,
"kl": 0.0,
"learning_rate": 1.3661202185792351e-08,
"loss": 0.0057,
"num_tokens": 3348938.0,
"reward": 0.1718750048603397,
"reward_std": 0.09577879420248792,
"rewards/pure_accuracy_reward_math": 0.17187500078580342,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.003277890882085558,
"grad_norm": 0.05986390635371208,
"kl": 0.0,
"learning_rate": 2.7322404371584703e-08,
"loss": 0.0057,
"step": 2
},
{
"clip_ratio": 0.0006339755559565674,
"epoch": 0.004916836323128337,
"grad_norm": 0.05929790809750557,
"kl": 0.0005019009113311768,
"learning_rate": 4.098360655737705e-08,
"loss": 0.0057,
"step": 3
},
{
"clip_ratio": 0.0006407226928217824,
"epoch": 0.006555781764171116,
"grad_norm": 0.059925854206085205,
"kl": 0.0005110502243041992,
"learning_rate": 5.4644808743169406e-08,
"loss": 0.0057,
"step": 4
},
{
"clip_ratio": 0.0006387700201457847,
"epoch": 0.008194727205213895,
"grad_norm": 0.05939409136772156,
"kl": 0.0005159676074981689,
"learning_rate": 6.830601092896175e-08,
"loss": 0.0057,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 612.4726753234863,
"epoch": 0.009833672646256675,
"grad_norm": 0.072689488530159,
"kl": 0.000512346625328064,
"learning_rate": 8.19672131147541e-08,
"loss": 0.0067,
"num_tokens": 6714854.0,
"reward": 0.16438802544143982,
"reward_std": 0.11541076033608988,
"rewards/pure_accuracy_reward_math": 0.16438802113407291,
"step": 6
},
{
"clip_ratio": 0.0007277115302031234,
"epoch": 0.011472618087299453,
"grad_norm": 0.07328997552394867,
"kl": 0.0005197674036026001,
"learning_rate": 9.562841530054645e-08,
"loss": 0.0068,
"step": 7
},
{
"clip_ratio": 0.0007614574305989663,
"epoch": 0.013111563528342233,
"grad_norm": 0.07325445115566254,
"kl": 0.0005202591419219971,
"learning_rate": 1.0928961748633881e-07,
"loss": 0.0068,
"step": 8
},
{
"clip_ratio": 0.0007783421593785533,
"epoch": 0.01475050896938501,
"grad_norm": 0.07128091156482697,
"kl": 0.000517427921295166,
"learning_rate": 1.2295081967213116e-07,
"loss": 0.0068,
"step": 9
},
{
"clip_ratio": 0.0007585194575767673,
"epoch": 0.01638945441042779,
"grad_norm": 0.07174714654684067,
"kl": 0.0005128979682922363,
"learning_rate": 1.366120218579235e-07,
"loss": 0.0068,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 609.4596562385559,
"epoch": 0.018028399851470568,
"grad_norm": 0.060616616159677505,
"kl": 0.0005253106355667114,
"learning_rate": 1.5027322404371585e-07,
"loss": 0.0052,
"num_tokens": 10075962.0,
"reward": 0.17447917186655104,
"reward_std": 0.09832898661261424,
"rewards/pure_accuracy_reward_math": 0.17447916814126074,
"step": 11
},
{
"clip_ratio": 0.0006354124035397035,
"epoch": 0.01966734529251335,
"grad_norm": 0.05994507297873497,
"kl": 0.0005232691764831543,
"learning_rate": 1.639344262295082e-07,
"loss": 0.0053,
"step": 12
},
{
"clip_ratio": 0.0006359500578128063,
"epoch": 0.021306290733556128,
"grad_norm": 0.060422513633966446,
"kl": 0.0005258470773696899,
"learning_rate": 1.7759562841530054e-07,
"loss": 0.0053,
"step": 13
},
{
"clip_ratio": 0.0006202999380775509,
"epoch": 0.022945236174598906,
"grad_norm": 0.06020491570234299,
"kl": 0.000526919960975647,
"learning_rate": 1.912568306010929e-07,
"loss": 0.0053,
"step": 14
},
{
"clip_ratio": 0.0006456842476154634,
"epoch": 0.024584181615641687,
"grad_norm": 0.06016543507575989,
"kl": 0.0005295425653457642,
"learning_rate": 2.0491803278688524e-07,
"loss": 0.0053,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 613.3219571113586,
"epoch": 0.026223127056684465,
"grad_norm": 0.06271925568580627,
"kl": 0.0005239248275756836,
"learning_rate": 2.1857923497267762e-07,
"loss": 0.0081,
"num_tokens": 13445671.0,
"reward": 0.1438802126212977,
"reward_std": 0.10509481013286859,
"rewards/pure_accuracy_reward_math": 0.1438802084303461,
"step": 16
},
{
"clip_ratio": 0.0007483757581212558,
"epoch": 0.027862072497727243,
"grad_norm": 0.06271728873252869,
"kl": 0.000528186559677124,
"learning_rate": 2.3224043715846998e-07,
"loss": 0.0081,
"step": 17
},
{
"clip_ratio": 0.0006768568357529148,
"epoch": 0.02950101793877002,
"grad_norm": 0.06163553521037102,
"kl": 0.0005240440368652344,
"learning_rate": 2.459016393442623e-07,
"loss": 0.0081,
"step": 18
},
{
"clip_ratio": 0.00073299726238929,
"epoch": 0.031139963379812802,
"grad_norm": 0.062258753925561905,
"kl": 0.000529751181602478,
"learning_rate": 2.595628415300547e-07,
"loss": 0.0081,
"step": 19
},
{
"clip_ratio": 0.0007049078883483162,
"epoch": 0.03277890882085558,
"grad_norm": 0.061678871512413025,
"kl": 0.0005273669958114624,
"learning_rate": 2.73224043715847e-07,
"loss": 0.0081,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 610.4453291893005,
"epoch": 0.03441785426189836,
"grad_norm": 0.06878828257322311,
"kl": 0.0005384832620620728,
"learning_rate": 2.8688524590163937e-07,
"loss": 0.0051,
"num_tokens": 16799755.0,
"reward": 0.15136719125439413,
"reward_std": 0.10323517030337825,
"rewards/pure_accuracy_reward_math": 0.15136718822759576,
"step": 21
},
{
"clip_ratio": 0.0007487565382007233,
"epoch": 0.036056799702941136,
"grad_norm": 0.06954149156808853,
"kl": 0.0005507916212081909,
"learning_rate": 3.005464480874317e-07,
"loss": 0.0051,
"step": 22
},
{
"clip_ratio": 0.0007795561222110337,
"epoch": 0.03769574514398392,
"grad_norm": 0.06806771457195282,
"kl": 0.0005584806203842163,
"learning_rate": 3.142076502732241e-07,
"loss": 0.0051,
"step": 23
},
{
"clip_ratio": 0.0007387081783463145,
"epoch": 0.0393346905850267,
"grad_norm": 0.06814352422952652,
"kl": 0.0005674809217453003,
"learning_rate": 3.278688524590164e-07,
"loss": 0.0051,
"step": 24
},
{
"clip_ratio": 0.0007619177375772779,
"epoch": 0.040973636026069474,
"grad_norm": 0.06729913502931595,
"kl": 0.0005744844675064087,
"learning_rate": 3.415300546448088e-07,
"loss": 0.0051,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 596.7343945503235,
"epoch": 0.042612581467112255,
"grad_norm": 0.06346436589956284,
"kl": 0.0006166845560073853,
"learning_rate": 3.551912568306011e-07,
"loss": 0.0038,
"num_tokens": 20112563.0,
"reward": 0.16373698358074762,
"reward_std": 0.09582553629297763,
"rewards/pure_accuracy_reward_math": 0.163736979739042,
"step": 26
},
{
"clip_ratio": 0.0006948200579017794,
"epoch": 0.04425152690815504,
"grad_norm": 0.06285525858402252,
"kl": 0.0006320923566818237,
"learning_rate": 3.6885245901639347e-07,
"loss": 0.0039,
"step": 27
},
{
"clip_ratio": 0.0006778589096256837,
"epoch": 0.04589047234919781,
"grad_norm": 0.06269308924674988,
"kl": 0.000654950737953186,
"learning_rate": 3.825136612021858e-07,
"loss": 0.0039,
"step": 28
},
{
"clip_ratio": 0.0006392495685076938,
"epoch": 0.04752941779024059,
"grad_norm": 0.06292663514614105,
"kl": 0.0006759315729141235,
"learning_rate": 3.961748633879782e-07,
"loss": 0.0039,
"step": 29
},
{
"clip_ratio": 0.000681757599068078,
"epoch": 0.049168363231283374,
"grad_norm": 0.06097942218184471,
"kl": 0.0007022321224212646,
"learning_rate": 4.0983606557377047e-07,
"loss": 0.0039,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 606.3161022663116,
"epoch": 0.05080730867232615,
"grad_norm": 0.06853298097848892,
"kl": 0.0007193088531494141,
"learning_rate": 4.2349726775956286e-07,
"loss": 0.005,
"num_tokens": 23460802.0,
"reward": 0.15071615006309003,
"reward_std": 0.10664506914326921,
"rewards/pure_accuracy_reward_math": 0.1507161462213844,
"step": 31
},
{
"clip_ratio": 0.0007206783092215119,
"epoch": 0.05244625411336893,
"grad_norm": 0.06669250130653381,
"kl": 0.0007403194904327393,
"learning_rate": 4.3715846994535524e-07,
"loss": 0.005,
"step": 32
},
{
"clip_ratio": 0.0008033858404132843,
"epoch": 0.05408519955441171,
"grad_norm": 0.06685461103916168,
"kl": 0.0007804930210113525,
"learning_rate": 4.508196721311476e-07,
"loss": 0.005,
"step": 33
},
{
"clip_ratio": 0.0007623738173379024,
"epoch": 0.055724144995454486,
"grad_norm": 0.06673412770032883,
"kl": 0.0008253157138824463,
"learning_rate": 4.6448087431693996e-07,
"loss": 0.005,
"step": 34
},
{
"clip_ratio": 0.0007461598812597003,
"epoch": 0.05736309043649727,
"grad_norm": 0.06533104181289673,
"kl": 0.0008644461631774902,
"learning_rate": 4.781420765027322e-07,
"loss": 0.005,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 597.9977385997772,
"epoch": 0.05900203587754004,
"grad_norm": 0.07363387197256088,
"kl": 0.00089988112449646,
"learning_rate": 4.918032786885246e-07,
"loss": 0.0081,
"num_tokens": 26781715.0,
"reward": 0.17936198392999358,
"reward_std": 0.1202162274857983,
"rewards/pure_accuracy_reward_math": 0.179361979739042,
"step": 36
},
{
"clip_ratio": 0.0008816556705255607,
"epoch": 0.06064098131858282,
"grad_norm": 0.06755447387695312,
"kl": 0.0009488761425018311,
"learning_rate": 5.05464480874317e-07,
"loss": 0.0081,
"step": 37
},
{
"clip_ratio": 0.0008573625917165373,
"epoch": 0.062279926759625605,
"grad_norm": 0.06729397177696228,
"kl": 0.0010100901126861572,
"learning_rate": 5.191256830601094e-07,
"loss": 0.0081,
"step": 38
},
{
"clip_ratio": 0.000872175712970602,
"epoch": 0.06391887220066839,
"grad_norm": 0.06972332298755646,
"kl": 0.0010748803615570068,
"learning_rate": 5.327868852459017e-07,
"loss": 0.0081,
"step": 39
},
{
"clip_ratio": 0.000930704369693558,
"epoch": 0.06555781764171116,
"grad_norm": 0.06739407032728195,
"kl": 0.0011384189128875732,
"learning_rate": 5.46448087431694e-07,
"loss": 0.0081,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 606.660826921463,
"epoch": 0.06719676308275394,
"grad_norm": 0.05486822873353958,
"kl": 0.001088649034500122,
"learning_rate": 5.601092896174863e-07,
"loss": 0.0058,
"num_tokens": 30130177.0,
"reward": 0.14843750451109372,
"reward_std": 0.08996616111835465,
"rewards/pure_accuracy_reward_math": 0.14843749973806553,
"step": 41
},
{
"clip_ratio": 0.000587868101206368,
"epoch": 0.06883570852379672,
"grad_norm": 0.053968992084264755,
"kl": 0.0011524856090545654,
"learning_rate": 5.737704918032787e-07,
"loss": 0.0058,
"step": 42
},
{
"clip_ratio": 0.0005904338165692025,
"epoch": 0.0704746539648395,
"grad_norm": 0.05430474132299423,
"kl": 0.0012042820453643799,
"learning_rate": 5.874316939890711e-07,
"loss": 0.0058,
"step": 43
},
{
"clip_ratio": 0.0005757618986308444,
"epoch": 0.07211359940588227,
"grad_norm": 0.05444110184907913,
"kl": 0.0012355148792266846,
"learning_rate": 6.010928961748634e-07,
"loss": 0.0058,
"step": 44
},
{
"clip_ratio": 0.0006261014578967661,
"epoch": 0.07375254484692506,
"grad_norm": 0.054937466979026794,
"kl": 0.0012827813625335693,
"learning_rate": 6.147540983606558e-07,
"loss": 0.0058,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 584.1201364994049,
"epoch": 0.07539149028796784,
"grad_norm": 0.06546950340270996,
"kl": 0.0014158189296722412,
"learning_rate": 6.284153005464482e-07,
"loss": 0.0043,
"num_tokens": 33407966.0,
"reward": 0.1715494836680591,
"reward_std": 0.11086069961311296,
"rewards/pure_accuracy_reward_math": 0.17154948017559946,
"step": 46
},
{
"clip_ratio": 0.0007843592767358132,
"epoch": 0.07703043572901061,
"grad_norm": 0.06173517182469368,
"kl": 0.0014501512050628662,
"learning_rate": 6.420765027322406e-07,
"loss": 0.0043,
"step": 47
},
{
"clip_ratio": 0.0008111927813843067,
"epoch": 0.0786693811700534,
"grad_norm": 0.06110456958413124,
"kl": 0.0014650523662567139,
"learning_rate": 6.557377049180328e-07,
"loss": 0.0043,
"step": 48
},
{
"clip_ratio": 0.0007597751833827715,
"epoch": 0.08030832661109617,
"grad_norm": 0.06199155002832413,
"kl": 0.0015124678611755371,
"learning_rate": 6.693989071038252e-07,
"loss": 0.0043,
"step": 49
},
{
"clip_ratio": 0.0007640034893938719,
"epoch": 0.08194727205213895,
"grad_norm": 0.06190052628517151,
"kl": 0.0015333890914916992,
"learning_rate": 6.830601092896176e-07,
"loss": 0.0043,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 593.0820529460907,
"epoch": 0.08358621749318174,
"grad_norm": 0.06474039703607559,
"kl": 0.0014415383338928223,
"learning_rate": 6.967213114754098e-07,
"loss": 0.0076,
"num_tokens": 36714234.0,
"reward": 0.1923828188155312,
"reward_std": 0.1178674673428759,
"rewards/pure_accuracy_reward_math": 0.1923828122962732,
"step": 51
},
{
"clip_ratio": 0.000813577574433566,
"epoch": 0.08522516293422451,
"grad_norm": 0.06284686177968979,
"kl": 0.001471877098083496,
"learning_rate": 7.103825136612022e-07,
"loss": 0.0077,
"step": 52
},
{
"clip_ratio": 0.0007952848112040556,
"epoch": 0.08686410837526728,
"grad_norm": 0.0626569464802742,
"kl": 0.0014744699001312256,
"learning_rate": 7.240437158469946e-07,
"loss": 0.0076,
"step": 53
},
{
"clip_ratio": 0.000757519129024331,
"epoch": 0.08850305381631007,
"grad_norm": 0.06075895577669144,
"kl": 0.0014587044715881348,
"learning_rate": 7.377049180327869e-07,
"loss": 0.0076,
"step": 54
},
{
"clip_ratio": 0.0008223805086799985,
"epoch": 0.09014199925735285,
"grad_norm": 0.06047751381993294,
"kl": 0.0014570355415344238,
"learning_rate": 7.513661202185793e-07,
"loss": 0.0076,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 595.8216323852539,
"epoch": 0.09178094469839562,
"grad_norm": 0.06398054957389832,
"kl": 0.00144881010055542,
"learning_rate": 7.650273224043716e-07,
"loss": 0.0086,
"num_tokens": 40026830.0,
"reward": 0.20247396413469687,
"reward_std": 0.11906883475603536,
"rewards/pure_accuracy_reward_math": 0.2024739590124227,
"step": 56
},
{
"clip_ratio": 0.00078221041519555,
"epoch": 0.09341989013943841,
"grad_norm": 0.06344633549451828,
"kl": 0.0014292001724243164,
"learning_rate": 7.78688524590164e-07,
"loss": 0.0087,
"step": 57
},
{
"clip_ratio": 0.0008090036571957171,
"epoch": 0.09505883558048119,
"grad_norm": 0.061615679413080215,
"kl": 0.0014474093914031982,
"learning_rate": 7.923497267759564e-07,
"loss": 0.0087,
"step": 58
},
{
"clip_ratio": 0.0008085054041657713,
"epoch": 0.09669778102152396,
"grad_norm": 0.06151620298624039,
"kl": 0.0014512240886688232,
"learning_rate": 8.060109289617488e-07,
"loss": 0.0086,
"step": 59
},
{
"clip_ratio": 0.000824362684852531,
"epoch": 0.09833672646256675,
"grad_norm": 0.06084871292114258,
"kl": 0.0014411509037017822,
"learning_rate": 8.196721311475409e-07,
"loss": 0.0086,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 598.2584838867188,
"epoch": 0.09997567190360952,
"grad_norm": 0.06428408622741699,
"kl": 0.0015523433685302734,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0082,
"num_tokens": 43356588.0,
"reward": 0.18912761070532724,
"reward_std": 0.11445756902685389,
"rewards/pure_accuracy_reward_math": 0.18912760430248454,
"step": 61
},
{
"clip_ratio": 0.0008156659468454563,
"epoch": 0.1016146173446523,
"grad_norm": 0.06184009462594986,
"kl": 0.001552283763885498,
"learning_rate": 8.469945355191257e-07,
"loss": 0.0082,
"step": 62
},
{
"clip_ratio": 0.0008079836062506729,
"epoch": 0.10325356278569509,
"grad_norm": 0.060980089008808136,
"kl": 0.001578688621520996,
"learning_rate": 8.606557377049181e-07,
"loss": 0.0082,
"step": 63
},
{
"clip_ratio": 0.000800917034325721,
"epoch": 0.10489250822673786,
"grad_norm": 0.061832476407289505,
"kl": 0.0016154646873474121,
"learning_rate": 8.743169398907105e-07,
"loss": 0.0082,
"step": 64
},
{
"clip_ratio": 0.0008089348676776353,
"epoch": 0.10653145366778063,
"grad_norm": 0.0595347136259079,
"kl": 0.0017150640487670898,
"learning_rate": 8.879781420765028e-07,
"loss": 0.0081,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 602.9000825881958,
"epoch": 0.10817039910882342,
"grad_norm": 0.06361431628465652,
"kl": 0.001691579818725586,
"learning_rate": 9.016393442622952e-07,
"loss": 0.005,
"num_tokens": 46690213.0,
"reward": 0.18261719372821972,
"reward_std": 0.10855145199457183,
"rewards/pure_accuracy_reward_math": 0.18261718755820766,
"step": 66
},
{
"clip_ratio": 0.0007102687751512349,
"epoch": 0.1098093445498662,
"grad_norm": 0.06422943621873856,
"kl": 0.001762300729751587,
"learning_rate": 9.153005464480875e-07,
"loss": 0.005,
"step": 67
},
{
"clip_ratio": 0.0007208503458286941,
"epoch": 0.11144828999090897,
"grad_norm": 0.062008682638406754,
"kl": 0.0017663836479187012,
"learning_rate": 9.289617486338799e-07,
"loss": 0.005,
"step": 68
},
{
"clip_ratio": 0.0007175619265353816,
"epoch": 0.11308723543195176,
"grad_norm": 0.061343614012002945,
"kl": 0.001800447702407837,
"learning_rate": 9.426229508196721e-07,
"loss": 0.0049,
"step": 69
},
{
"clip_ratio": 0.0007331656333917635,
"epoch": 0.11472618087299453,
"grad_norm": 0.05962536856532097,
"kl": 0.001809924840927124,
"learning_rate": 9.562841530054645e-07,
"loss": 0.0049,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 597.3851110935211,
"epoch": 0.11636512631403731,
"grad_norm": 0.07380504906177521,
"kl": 0.001956164836883545,
"learning_rate": 9.69945355191257e-07,
"loss": 0.0059,
"num_tokens": 50008096.0,
"reward": 0.18815104707027785,
"reward_std": 0.118169616907835,
"rewards/pure_accuracy_reward_math": 0.18815104159875773,
"step": 71
},
{
"clip_ratio": 0.0008107801862706765,
"epoch": 0.11800407175508008,
"grad_norm": 0.06983543187379837,
"kl": 0.0019207000732421875,
"learning_rate": 9.836065573770493e-07,
"loss": 0.0059,
"step": 72
},
{
"clip_ratio": 0.0008206042518850154,
"epoch": 0.11964301719612287,
"grad_norm": 0.06862860172986984,
"kl": 0.001914680004119873,
"learning_rate": 9.972677595628415e-07,
"loss": 0.0059,
"step": 73
},
{
"clip_ratio": 0.0008123442846681428,
"epoch": 0.12128196263716565,
"grad_norm": 0.06780818104743958,
"kl": 0.001929640769958496,
"learning_rate": 1.010928961748634e-06,
"loss": 0.0058,
"step": 74
},
{
"clip_ratio": 0.0008305984221124163,
"epoch": 0.12292090807820842,
"grad_norm": 0.06472048163414001,
"kl": 0.0019400715827941895,
"learning_rate": 1.0245901639344263e-06,
"loss": 0.0058,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 599.2868046760559,
"epoch": 0.12455985351925121,
"grad_norm": 0.06543949246406555,
"kl": 0.0018826127052307129,
"learning_rate": 1.0382513661202188e-06,
"loss": 0.0094,
"num_tokens": 53333081.0,
"reward": 0.19205729727400467,
"reward_std": 0.12739214790053666,
"rewards/pure_accuracy_reward_math": 0.19205729191889986,
"step": 76
},
{
"clip_ratio": 0.0008518788816900269,
"epoch": 0.12619879896029398,
"grad_norm": 0.06384909898042679,
"kl": 0.0019139647483825684,
"learning_rate": 1.051912568306011e-06,
"loss": 0.0094,
"step": 77
},
{
"clip_ratio": 0.0008921786497353423,
"epoch": 0.12783774440133677,
"grad_norm": 0.06342752277851105,
"kl": 0.001939535140991211,
"learning_rate": 1.0655737704918034e-06,
"loss": 0.0094,
"step": 78
},
{
"clip_ratio": 0.0008912816550719072,
"epoch": 0.12947668984237953,
"grad_norm": 0.06367272883653641,
"kl": 0.0019831061363220215,
"learning_rate": 1.0792349726775956e-06,
"loss": 0.0093,
"step": 79
},
{
"clip_ratio": 0.0008512400360132233,
"epoch": 0.13111563528342232,
"grad_norm": 0.062457580119371414,
"kl": 0.0020416975021362305,
"learning_rate": 1.092896174863388e-06,
"loss": 0.0093,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 600.7868010997772,
"epoch": 0.1327545807244651,
"grad_norm": 0.069603331387043,
"kl": 0.0021752119064331055,
"learning_rate": 1.1065573770491804e-06,
"loss": 0.0066,
"num_tokens": 56665038.0,
"reward": 0.19889323483221233,
"reward_std": 0.12287436821497977,
"rewards/pure_accuracy_reward_math": 0.19889323005918413,
"step": 81
},
{
"clip_ratio": 0.0009323512667833711,
"epoch": 0.13439352616550787,
"grad_norm": 0.06583772599697113,
"kl": 0.002191603183746338,
"learning_rate": 1.1202185792349727e-06,
"loss": 0.0066,
"step": 82
},
{
"clip_ratio": 0.0009406020556070871,
"epoch": 0.13603247160655066,
"grad_norm": 0.06439989805221558,
"kl": 0.00225830078125,
"learning_rate": 1.1338797814207652e-06,
"loss": 0.0066,
"step": 83
},
{
"clip_ratio": 0.0009481842756713377,
"epoch": 0.13767141704759345,
"grad_norm": 0.06453175097703934,
"kl": 0.0023380517959594727,
"learning_rate": 1.1475409836065575e-06,
"loss": 0.0065,
"step": 84
},
{
"clip_ratio": 0.00098607516224547,
"epoch": 0.1393103624886362,
"grad_norm": 0.06561443954706192,
"kl": 0.0024124979972839355,
"learning_rate": 1.16120218579235e-06,
"loss": 0.0065,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 592.1575720310211,
"epoch": 0.140949307929679,
"grad_norm": 0.07061439007520676,
"kl": 0.002596259117126465,
"learning_rate": 1.1748633879781422e-06,
"loss": 0.0094,
"num_tokens": 59968450.0,
"reward": 0.2067057350941468,
"reward_std": 0.12307580112246796,
"rewards/pure_accuracy_reward_math": 0.20670572892413475,
"step": 86
},
{
"clip_ratio": 0.0007981168380410963,
"epoch": 0.14258825337072178,
"grad_norm": 0.0682518407702446,
"kl": 0.002633213996887207,
"learning_rate": 1.1885245901639345e-06,
"loss": 0.0094,
"step": 87
},
{
"clip_ratio": 0.0008212789625190453,
"epoch": 0.14422719881176455,
"grad_norm": 0.06932378560304642,
"kl": 0.0026621222496032715,
"learning_rate": 1.2021857923497268e-06,
"loss": 0.0094,
"step": 88
},
{
"clip_ratio": 0.0008140442066633113,
"epoch": 0.14586614425280733,
"grad_norm": 0.06654822826385498,
"kl": 0.002701401710510254,
"learning_rate": 1.215846994535519e-06,
"loss": 0.0093,
"step": 89
},
{
"clip_ratio": 0.0008207391882706361,
"epoch": 0.14750508969385012,
"grad_norm": 0.06492628902196884,
"kl": 0.0028305649757385254,
"learning_rate": 1.2295081967213116e-06,
"loss": 0.0092,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 596.5735874176025,
"epoch": 0.14914403513489288,
"grad_norm": 0.06914262473583221,
"kl": 0.0026297569274902344,
"learning_rate": 1.2431693989071039e-06,
"loss": 0.0089,
"num_tokens": 63290872.0,
"reward": 0.19986979712848552,
"reward_std": 0.12688133475603536,
"rewards/pure_accuracy_reward_math": 0.19986979235545732,
"step": 91
},
{
"clip_ratio": 0.0008256697961996906,
"epoch": 0.15078298057593567,
"grad_norm": 0.06898585706949234,
"kl": 0.002676546573638916,
"learning_rate": 1.2568306010928963e-06,
"loss": 0.009,
"step": 92
},
{
"clip_ratio": 0.0008680100598894569,
"epoch": 0.15242192601697846,
"grad_norm": 0.06637588888406754,
"kl": 0.0026517510414123535,
"learning_rate": 1.2704918032786886e-06,
"loss": 0.0089,
"step": 93
},
{
"clip_ratio": 0.000874812991582985,
"epoch": 0.15406087145802122,
"grad_norm": 0.06262248754501343,
"kl": 0.0026916861534118652,
"learning_rate": 1.2841530054644811e-06,
"loss": 0.0089,
"step": 94
},
{
"clip_ratio": 0.0009557890944051906,
"epoch": 0.155699816899064,
"grad_norm": 0.0627315565943718,
"kl": 0.0027064085006713867,
"learning_rate": 1.2978142076502734e-06,
"loss": 0.0088,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 609.2135615348816,
"epoch": 0.1573387623401068,
"grad_norm": 0.06779834628105164,
"kl": 0.002728700637817383,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.0115,
"num_tokens": 66649152.0,
"reward": 0.20214844364090823,
"reward_std": 0.12382755958242342,
"rewards/pure_accuracy_reward_math": 0.2021484377037268,
"step": 96
},
{
"clip_ratio": 0.0008006048282709344,
"epoch": 0.15897770778114956,
"grad_norm": 0.06556153297424316,
"kl": 0.0027396678924560547,
"learning_rate": 1.3251366120218582e-06,
"loss": 0.0115,
"step": 97
},
{
"clip_ratio": 0.0008465125227985482,
"epoch": 0.16061665322219235,
"grad_norm": 0.06473369896411896,
"kl": 0.0027694106101989746,
"learning_rate": 1.3387978142076505e-06,
"loss": 0.0115,
"step": 98
},
{
"clip_ratio": 0.000838196794347823,
"epoch": 0.16225559866323513,
"grad_norm": 0.06346935033798218,
"kl": 0.002801954746246338,
"learning_rate": 1.352459016393443e-06,
"loss": 0.0114,
"step": 99
},
{
"clip_ratio": 0.000809030144978351,
"epoch": 0.1638945441042779,
"grad_norm": 0.061877407133579254,
"kl": 0.0028792619705200195,
"learning_rate": 1.3661202185792352e-06,
"loss": 0.0113,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 596.9437062740326,
"epoch": 0.16553348954532068,
"grad_norm": 0.06968217343091965,
"kl": 0.003116130828857422,
"learning_rate": 1.3797814207650273e-06,
"loss": 0.0085,
"num_tokens": 69972491.0,
"reward": 0.20605469375732355,
"reward_std": 0.12006876862142235,
"rewards/pure_accuracy_reward_math": 0.20605468933354132,
"step": 101
},
{
"clip_ratio": 0.0008418516855499547,
"epoch": 0.16717243498636347,
"grad_norm": 0.07103519886732101,
"kl": 0.003129124641418457,
"learning_rate": 1.3934426229508196e-06,
"loss": 0.0086,
"step": 102
},
{
"clip_ratio": 0.000812729414064961,
"epoch": 0.16881138042740623,
"grad_norm": 0.06863201409578323,
"kl": 0.0031093955039978027,
"learning_rate": 1.407103825136612e-06,
"loss": 0.0085,
"step": 103
},
{
"clip_ratio": 0.0008020297700568335,
"epoch": 0.17045032586844902,
"grad_norm": 0.06707657128572464,
"kl": 0.003110051155090332,
"learning_rate": 1.4207650273224043e-06,
"loss": 0.0084,
"step": 104
},
{
"clip_ratio": 0.0008383456649880827,
"epoch": 0.1720892713094918,
"grad_norm": 0.06547861546278,
"kl": 0.0031203627586364746,
"learning_rate": 1.4344262295081968e-06,
"loss": 0.0083,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 594.2750873565674,
"epoch": 0.17372821675053457,
"grad_norm": 0.06509453058242798,
"kl": 0.003070056438446045,
"learning_rate": 1.4480874316939891e-06,
"loss": 0.0082,
"num_tokens": 73285196.0,
"reward": 0.21744792378740385,
"reward_std": 0.12147156818537042,
"rewards/pure_accuracy_reward_math": 0.21744791680248454,
"step": 106
},
{
"clip_ratio": 0.0008183834372630372,
"epoch": 0.17536716219157736,
"grad_norm": 0.06286683678627014,
"kl": 0.003090500831604004,
"learning_rate": 1.4617486338797814e-06,
"loss": 0.0082,
"step": 107
},
{
"clip_ratio": 0.0008020995180686441,
"epoch": 0.17700610763262015,
"grad_norm": 0.061473019421100616,
"kl": 0.003094911575317383,
"learning_rate": 1.4754098360655739e-06,
"loss": 0.0082,
"step": 108
},
{
"clip_ratio": 0.0008129155939968769,
"epoch": 0.1786450530736629,
"grad_norm": 0.06097801774740219,
"kl": 0.0031610727310180664,
"learning_rate": 1.4890710382513662e-06,
"loss": 0.0081,
"step": 109
},
{
"clip_ratio": 0.0008801428618880891,
"epoch": 0.1802839985147057,
"grad_norm": 0.06094435974955559,
"kl": 0.003253757953643799,
"learning_rate": 1.5027322404371587e-06,
"loss": 0.008,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 591.4017164707184,
"epoch": 0.18192294395574848,
"grad_norm": 0.07037521153688431,
"kl": 0.0035175085067749023,
"learning_rate": 1.516393442622951e-06,
"loss": 0.0095,
"num_tokens": 76587766.0,
"reward": 0.23372396477498114,
"reward_std": 0.13153934240108356,
"rewards/pure_accuracy_reward_math": 0.2337239591870457,
"step": 111
},
{
"clip_ratio": 0.0009014522622123877,
"epoch": 0.18356188939679124,
"grad_norm": 0.06573645025491714,
"kl": 0.003545045852661133,
"learning_rate": 1.5300546448087432e-06,
"loss": 0.0096,
"step": 112
},
{
"clip_ratio": 0.0009236231287559349,
"epoch": 0.18520083483783403,
"grad_norm": 0.06465188413858414,
"kl": 0.00360715389251709,
"learning_rate": 1.5437158469945357e-06,
"loss": 0.0095,
"step": 113
},
{
"clip_ratio": 0.0009181838793210773,
"epoch": 0.18683978027887682,
"grad_norm": 0.06287030875682831,
"kl": 0.0036890506744384766,
"learning_rate": 1.557377049180328e-06,
"loss": 0.0094,
"step": 114
},
{
"clip_ratio": 0.0008825006539154856,
"epoch": 0.18847872571991958,
"grad_norm": 0.06144850701093674,
"kl": 0.003753662109375,
"learning_rate": 1.5710382513661205e-06,
"loss": 0.0093,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 594.1959838867188,
"epoch": 0.19011767116096237,
"grad_norm": 0.06793060898780823,
"kl": 0.003781437873840332,
"learning_rate": 1.5846994535519128e-06,
"loss": 0.011,
"num_tokens": 79893856.0,
"reward": 0.21386719393194653,
"reward_std": 0.12512964301276952,
"rewards/pure_accuracy_reward_math": 0.2138671872962732,
"step": 116
},
{
"clip_ratio": 0.0008201918570875932,
"epoch": 0.19175661660200516,
"grad_norm": 0.06297077238559723,
"kl": 0.0038176774978637695,
"learning_rate": 1.5983606557377053e-06,
"loss": 0.011,
"step": 117
},
{
"clip_ratio": 0.0007795650292337086,
"epoch": 0.19339556204304792,
"grad_norm": 0.061727218329906464,
"kl": 0.003816843032836914,
"learning_rate": 1.6120218579234975e-06,
"loss": 0.011,
"step": 118
},
{
"clip_ratio": 0.0008128985705297964,
"epoch": 0.1950345074840907,
"grad_norm": 0.05955222249031067,
"kl": 0.003865480422973633,
"learning_rate": 1.6256830601092896e-06,
"loss": 0.0109,
"step": 119
},
{
"clip_ratio": 0.0008892948203538253,
"epoch": 0.1966734529251335,
"grad_norm": 0.05931426212191582,
"kl": 0.003886103630065918,
"learning_rate": 1.6393442622950819e-06,
"loss": 0.0108,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 586.1058125495911,
"epoch": 0.19831239836617626,
"grad_norm": 0.07331722974777222,
"kl": 0.0038805007934570312,
"learning_rate": 1.6530054644808744e-06,
"loss": 0.0069,
"num_tokens": 83181421.0,
"reward": 0.2060546927677933,
"reward_std": 0.12338518165051937,
"rewards/pure_accuracy_reward_math": 0.20605468822759576,
"step": 121
},
{
"clip_ratio": 0.0008778467455385908,
"epoch": 0.19995134380721905,
"grad_norm": 0.06926850229501724,
"kl": 0.0038404464721679688,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0069,
"step": 122
},
{
"clip_ratio": 0.0009394907406203856,
"epoch": 0.20159028924826183,
"grad_norm": 0.06973890960216522,
"kl": 0.003817915916442871,
"learning_rate": 1.6803278688524592e-06,
"loss": 0.0069,
"step": 123
},
{
"clip_ratio": 0.000969218867339805,
"epoch": 0.2032292346893046,
"grad_norm": 0.06822917610406876,
"kl": 0.0038552284240722656,
"learning_rate": 1.6939890710382514e-06,
"loss": 0.0068,
"step": 124
},
{
"clip_ratio": 0.0009342714683953091,
"epoch": 0.20486818013034738,
"grad_norm": 0.06682004034519196,
"kl": 0.003947019577026367,
"learning_rate": 1.7076502732240437e-06,
"loss": 0.0066,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 589.0709819793701,
"epoch": 0.20650712557139017,
"grad_norm": 0.07198912650346756,
"kl": 0.003998160362243652,
"learning_rate": 1.7213114754098362e-06,
"loss": 0.0125,
"num_tokens": 86478355.0,
"reward": 0.22949219273868948,
"reward_std": 0.14352073048939928,
"rewards/pure_accuracy_reward_math": 0.22949218878056854,
"step": 126
},
{
"clip_ratio": 0.0010473157254864418,
"epoch": 0.20814607101243293,
"grad_norm": 0.07177633047103882,
"kl": 0.004043221473693848,
"learning_rate": 1.7349726775956285e-06,
"loss": 0.0125,
"step": 127
},
{
"clip_ratio": 0.0010181566002529507,
"epoch": 0.20978501645347572,
"grad_norm": 0.06755513697862625,
"kl": 0.0041484832763671875,
"learning_rate": 1.748633879781421e-06,
"loss": 0.0124,
"step": 128
},
{
"clip_ratio": 0.0010040162792392948,
"epoch": 0.2114239618945185,
"grad_norm": 0.06670001894235611,
"kl": 0.004278779029846191,
"learning_rate": 1.7622950819672133e-06,
"loss": 0.0123,
"step": 129
},
{
"clip_ratio": 0.0010213782433083907,
"epoch": 0.21306290733556127,
"grad_norm": 0.06810087710618973,
"kl": 0.004375338554382324,
"learning_rate": 1.7759562841530055e-06,
"loss": 0.0121,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 585.0755393505096,
"epoch": 0.21470185277660406,
"grad_norm": 0.07637549191713333,
"kl": 0.004368305206298828,
"learning_rate": 1.789617486338798e-06,
"loss": 0.0075,
"num_tokens": 89764315.0,
"reward": 0.22656250739237294,
"reward_std": 0.14191649784334004,
"rewards/pure_accuracy_reward_math": 0.2265624997089617,
"step": 131
},
{
"clip_ratio": 0.0010003317277096357,
"epoch": 0.21634079821764685,
"grad_norm": 0.07960700243711472,
"kl": 0.0042803287506103516,
"learning_rate": 1.8032786885245903e-06,
"loss": 0.0075,
"step": 132
},
{
"clip_ratio": 0.0010098553934767551,
"epoch": 0.2179797436586896,
"grad_norm": 0.0741487368941307,
"kl": 0.004350185394287109,
"learning_rate": 1.8169398907103828e-06,
"loss": 0.0074,
"step": 133
},
{
"clip_ratio": 0.0010473617899151577,
"epoch": 0.2196186890997324,
"grad_norm": 0.07375472038984299,
"kl": 0.004483342170715332,
"learning_rate": 1.830601092896175e-06,
"loss": 0.0073,
"step": 134
},
{
"clip_ratio": 0.0010608058864818304,
"epoch": 0.22125763454077518,
"grad_norm": 0.06948796659708023,
"kl": 0.004660606384277344,
"learning_rate": 1.8442622950819674e-06,
"loss": 0.0071,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 599.9541211128235,
"epoch": 0.22289657998181794,
"grad_norm": 0.06568682193756104,
"kl": 0.004492521286010742,
"learning_rate": 1.8579234972677599e-06,
"loss": 0.0076,
"num_tokens": 93086726.0,
"reward": 0.20638021410559304,
"reward_std": 0.11796818353468552,
"rewards/pure_accuracy_reward_math": 0.20638020828482695,
"step": 136
},
{
"clip_ratio": 0.0007427198539744495,
"epoch": 0.22453552542286073,
"grad_norm": 0.061326853930950165,
"kl": 0.004569292068481445,
"learning_rate": 1.8715846994535521e-06,
"loss": 0.0076,
"step": 137
},
{
"clip_ratio": 0.0007810102034682131,
"epoch": 0.22617447086390352,
"grad_norm": 0.06033333018422127,
"kl": 0.0046776533126831055,
"learning_rate": 1.8852459016393442e-06,
"loss": 0.0075,
"step": 138
},
{
"clip_ratio": 0.0007891726669413401,
"epoch": 0.22781341630494628,
"grad_norm": 0.057988133281469345,
"kl": 0.004709959030151367,
"learning_rate": 1.8989071038251367e-06,
"loss": 0.0074,
"step": 139
},
{
"clip_ratio": 0.00077407437288457,
"epoch": 0.22945236174598907,
"grad_norm": 0.055629778653383255,
"kl": 0.0047043561935424805,
"learning_rate": 1.912568306010929e-06,
"loss": 0.0073,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 589.5273640155792,
"epoch": 0.23109130718703186,
"grad_norm": 0.08023487031459808,
"kl": 0.004754543304443359,
"learning_rate": 1.9262295081967215e-06,
"loss": 0.0061,
"num_tokens": 96377242.0,
"reward": 0.22167969349538907,
"reward_std": 0.11916955123888329,
"rewards/pure_accuracy_reward_math": 0.2216796882566996,
"step": 141
},
{
"clip_ratio": 0.0008194281034548112,
"epoch": 0.23273025262807462,
"grad_norm": 0.06729461997747421,
"kl": 0.004743695259094238,
"learning_rate": 1.939890710382514e-06,
"loss": 0.0061,
"step": 142
},
{
"clip_ratio": 0.0008319891088035547,
"epoch": 0.2343691980691174,
"grad_norm": 0.0685749426484108,
"kl": 0.004932522773742676,
"learning_rate": 1.953551912568306e-06,
"loss": 0.006,
"step": 143
},
{
"clip_ratio": 0.000810704066566359,
"epoch": 0.23600814351016017,
"grad_norm": 0.0689912959933281,
"kl": 0.005072951316833496,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.0058,
"step": 144
},
{
"clip_ratio": 0.0008489251890750893,
"epoch": 0.23764708895120296,
"grad_norm": 0.06294326484203339,
"kl": 0.005017280578613281,
"learning_rate": 1.980874316939891e-06,
"loss": 0.0057,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 600.1722221374512,
"epoch": 0.23928603439224574,
"grad_norm": 0.06795884668827057,
"kl": 0.005151629447937012,
"learning_rate": 1.994535519125683e-06,
"loss": 0.0096,
"num_tokens": 99709727.0,
"reward": 0.2047526100941468,
"reward_std": 0.1280827015871182,
"rewards/pure_accuracy_reward_math": 0.2047526053211186,
"step": 146
},
{
"clip_ratio": 0.0008647626258380114,
"epoch": 0.2409249798332885,
"grad_norm": 0.06390897184610367,
"kl": 0.0051021575927734375,
"learning_rate": 2.0081967213114756e-06,
"loss": 0.0096,
"step": 147
},
{
"clip_ratio": 0.0009481125781576338,
"epoch": 0.2425639252743313,
"grad_norm": 0.062446512281894684,
"kl": 0.005044102668762207,
"learning_rate": 2.021857923497268e-06,
"loss": 0.0095,
"step": 148
},
{
"clip_ratio": 0.0009975744914072493,
"epoch": 0.24420287071537408,
"grad_norm": 0.06106211990118027,
"kl": 0.00504612922668457,
"learning_rate": 2.03551912568306e-06,
"loss": 0.0093,
"step": 149
},
{
"clip_ratio": 0.0009842645606568112,
"epoch": 0.24584181615641684,
"grad_norm": 0.058460384607315063,
"kl": 0.005153179168701172,
"learning_rate": 2.0491803278688526e-06,
"loss": 0.0092,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 599.0677282810211,
"epoch": 0.24748076159745963,
"grad_norm": 0.08131567388772964,
"kl": 0.005348920822143555,
"learning_rate": 2.062841530054645e-06,
"loss": 0.0096,
"num_tokens": 103039175.0,
"reward": 0.2106119856762234,
"reward_std": 0.14471486618276685,
"rewards/pure_accuracy_reward_math": 0.21061197892413475,
"step": 151
},
{
"clip_ratio": 0.0011485503518997575,
"epoch": 0.24911970703850242,
"grad_norm": 0.0808255672454834,
"kl": 0.0054149627685546875,
"learning_rate": 2.0765027322404376e-06,
"loss": 0.0096,
"step": 152
},
{
"clip_ratio": 0.0011561684764274105,
"epoch": 0.2507586524795452,
"grad_norm": 0.07708927989006042,
"kl": 0.005404829978942871,
"learning_rate": 2.0901639344262297e-06,
"loss": 0.0095,
"step": 153
},
{
"clip_ratio": 0.0011439574755058857,
"epoch": 0.25239759792058797,
"grad_norm": 0.07077940553426743,
"kl": 0.005424022674560547,
"learning_rate": 2.103825136612022e-06,
"loss": 0.0093,
"step": 154
},
{
"clip_ratio": 0.0011864712664646504,
"epoch": 0.25403654336163073,
"grad_norm": 0.07691214233636856,
"kl": 0.005586385726928711,
"learning_rate": 2.1174863387978147e-06,
"loss": 0.0091,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 600.4765803813934,
"epoch": 0.25567548880267355,
"grad_norm": 0.06585235148668289,
"kl": 0.005746960639953613,
"learning_rate": 2.1311475409836067e-06,
"loss": 0.0081,
"num_tokens": 106367483.0,
"reward": 0.2112630266638007,
"reward_std": 0.11601505969883874,
"rewards/pure_accuracy_reward_math": 0.21126302142511122,
"step": 156
},
{
"clip_ratio": 0.0008758294210338136,
"epoch": 0.2573144342437163,
"grad_norm": 0.07339663803577423,
"kl": 0.005802512168884277,
"learning_rate": 2.144808743169399e-06,
"loss": 0.0081,
"step": 157
},
{
"clip_ratio": 0.0008576641474746793,
"epoch": 0.25895337968475907,
"grad_norm": 0.06242053955793381,
"kl": 0.005854010581970215,
"learning_rate": 2.1584699453551913e-06,
"loss": 0.008,
"step": 158
},
{
"clip_ratio": 0.0008841920518989355,
"epoch": 0.2605923251258019,
"grad_norm": 0.06326813995838165,
"kl": 0.0059430599212646484,
"learning_rate": 2.1721311475409838e-06,
"loss": 0.0078,
"step": 159
},
{
"clip_ratio": 0.0009176517396554118,
"epoch": 0.26223127056684464,
"grad_norm": 0.06189825013279915,
"kl": 0.005987405776977539,
"learning_rate": 2.185792349726776e-06,
"loss": 0.0077,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 588.5446150302887,
"epoch": 0.2638702160078874,
"grad_norm": 0.06866484135389328,
"kl": 0.006237506866455078,
"learning_rate": 2.1994535519125683e-06,
"loss": 0.0078,
"num_tokens": 109663204.0,
"reward": 0.21061198558891192,
"reward_std": 0.12186720367753878,
"rewards/pure_accuracy_reward_math": 0.21061197930248454,
"step": 161
},
{
"clip_ratio": 0.0009436907939743833,
"epoch": 0.2655091614489302,
"grad_norm": 0.07263052463531494,
"kl": 0.0061321258544921875,
"learning_rate": 2.213114754098361e-06,
"loss": 0.0078,
"step": 162
},
{
"clip_ratio": 0.0009464566720680523,
"epoch": 0.267148106889973,
"grad_norm": 0.06491200625896454,
"kl": 0.006033658981323242,
"learning_rate": 2.2267759562841533e-06,
"loss": 0.0076,
"step": 163
},
{
"clip_ratio": 0.000968025926908922,
"epoch": 0.26878705233101574,
"grad_norm": 0.06358778476715088,
"kl": 0.006163120269775391,
"learning_rate": 2.2404371584699454e-06,
"loss": 0.0075,
"step": 164
},
{
"clip_ratio": 0.0009389551167942045,
"epoch": 0.27042599777205856,
"grad_norm": 0.06583644449710846,
"kl": 0.006374359130859375,
"learning_rate": 2.254098360655738e-06,
"loss": 0.0073,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 594.6982605457306,
"epoch": 0.2720649432131013,
"grad_norm": 0.07151180505752563,
"kl": 0.006270289421081543,
"learning_rate": 2.2677595628415304e-06,
"loss": 0.0071,
"num_tokens": 112983493.0,
"reward": 0.21158854870009236,
"reward_std": 0.1309350436204113,
"rewards/pure_accuracy_reward_math": 0.21158854171517305,
"step": 166
},
{
"clip_ratio": 0.0010227661889530282,
"epoch": 0.2737038886541441,
"grad_norm": 0.08615773171186447,
"kl": 0.0064040422439575195,
"learning_rate": 2.2814207650273224e-06,
"loss": 0.0071,
"step": 167
},
{
"clip_ratio": 0.0009205440467212611,
"epoch": 0.2753428340951869,
"grad_norm": 0.06637667864561081,
"kl": 0.00621640682220459,
"learning_rate": 2.295081967213115e-06,
"loss": 0.0069,
"step": 168
},
{
"clip_ratio": 0.001003933336846785,
"epoch": 0.27698177953622966,
"grad_norm": 0.076202891767025,
"kl": 0.0062408447265625,
"learning_rate": 2.3087431693989074e-06,
"loss": 0.0068,
"step": 169
},
{
"clip_ratio": 0.0009103266062311377,
"epoch": 0.2786207249772724,
"grad_norm": 0.06154695898294449,
"kl": 0.006373286247253418,
"learning_rate": 2.3224043715847e-06,
"loss": 0.0065,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 584.0752146244049,
"epoch": 0.28025967041831523,
"grad_norm": 0.06730078905820847,
"kl": 0.006638884544372559,
"learning_rate": 2.336065573770492e-06,
"loss": 0.0089,
"num_tokens": 116258180.0,
"reward": 0.220703131693881,
"reward_std": 0.12132410902995616,
"rewards/pure_accuracy_reward_math": 0.2207031263387762,
"step": 171
},
{
"clip_ratio": 0.001059339236917367,
"epoch": 0.281898615859358,
"grad_norm": 0.08054529875516891,
"kl": 0.0067511796951293945,
"learning_rate": 2.3497267759562845e-06,
"loss": 0.0089,
"step": 172
},
{
"clip_ratio": 0.0010770070745707017,
"epoch": 0.28353756130040075,
"grad_norm": 0.06891456246376038,
"kl": 0.006635904312133789,
"learning_rate": 2.363387978142077e-06,
"loss": 0.0088,
"step": 173
},
{
"clip_ratio": 0.0009533684936400277,
"epoch": 0.28517650674144357,
"grad_norm": 0.06477612257003784,
"kl": 0.006537199020385742,
"learning_rate": 2.377049180327869e-06,
"loss": 0.0086,
"step": 174
},
{
"clip_ratio": 0.0008389282212419857,
"epoch": 0.28681545218248633,
"grad_norm": 0.06404498219490051,
"kl": 0.006713271141052246,
"learning_rate": 2.390710382513661e-06,
"loss": 0.0084,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 586.0224783420563,
"epoch": 0.2884543976235291,
"grad_norm": 0.07143088430166245,
"kl": 0.006848454475402832,
"learning_rate": 2.4043715846994536e-06,
"loss": 0.0081,
"num_tokens": 119549581.0,
"reward": 0.23144531846628524,
"reward_std": 0.11726316896965727,
"rewards/pure_accuracy_reward_math": 0.23144531299476512,
"step": 176
},
{
"clip_ratio": 0.0008353526282007806,
"epoch": 0.2900933430645719,
"grad_norm": 0.07284073531627655,
"kl": 0.006837129592895508,
"learning_rate": 2.418032786885246e-06,
"loss": 0.0081,
"step": 177
},
{
"clip_ratio": 0.0008791502111762384,
"epoch": 0.29173228850561467,
"grad_norm": 0.06452663242816925,
"kl": 0.006670117378234863,
"learning_rate": 2.431693989071038e-06,
"loss": 0.008,
"step": 178
},
{
"clip_ratio": 0.0009922128726884694,
"epoch": 0.29337123394665743,
"grad_norm": 0.07056602835655212,
"kl": 0.006812095642089844,
"learning_rate": 2.4453551912568307e-06,
"loss": 0.0078,
"step": 179
},
{
"clip_ratio": 0.0009285092224899927,
"epoch": 0.29501017938770024,
"grad_norm": 0.06236054748296738,
"kl": 0.0068634748458862305,
"learning_rate": 2.459016393442623e-06,
"loss": 0.0075,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 587.9111535549164,
"epoch": 0.296649124828743,
"grad_norm": 0.07245080173015594,
"kl": 0.007324337959289551,
"learning_rate": 2.4726775956284156e-06,
"loss": 0.0063,
"num_tokens": 122841384.0,
"reward": 0.22916667381650768,
"reward_std": 0.12823739141458645,
"rewards/pure_accuracy_reward_math": 0.22916666624951176,
"step": 181
},
{
"clip_ratio": 0.0010925326015467363,
"epoch": 0.29828807026978577,
"grad_norm": 0.08096741139888763,
"kl": 0.007236003875732422,
"learning_rate": 2.4863387978142077e-06,
"loss": 0.0062,
"step": 182
},
{
"clip_ratio": 0.0010355811738236298,
"epoch": 0.2999270157108286,
"grad_norm": 0.06912072002887726,
"kl": 0.007112741470336914,
"learning_rate": 2.5e-06,
"loss": 0.0061,
"step": 183
},
{
"clip_ratio": 0.0009683458151812374,
"epoch": 0.30156596115187134,
"grad_norm": 0.07461241632699966,
"kl": 0.007212400436401367,
"learning_rate": 2.5136612021857927e-06,
"loss": 0.0058,
"step": 184
},
{
"clip_ratio": 0.0009423685739875509,
"epoch": 0.3032049065929141,
"grad_norm": 0.0647897720336914,
"kl": 0.007313847541809082,
"learning_rate": 2.5273224043715848e-06,
"loss": 0.0055,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 583.5849809646606,
"epoch": 0.3048438520339569,
"grad_norm": 0.06853717565536499,
"kl": 0.007729768753051758,
"learning_rate": 2.5409836065573773e-06,
"loss": 0.0077,
"num_tokens": 126127813.0,
"reward": 0.20898438137373887,
"reward_std": 0.11270587670151144,
"rewards/pure_accuracy_reward_math": 0.2089843761350494,
"step": 186
},
{
"clip_ratio": 0.0010363689809764765,
"epoch": 0.3064827974749997,
"grad_norm": 0.07357639819383621,
"kl": 0.007730722427368164,
"learning_rate": 2.5546448087431697e-06,
"loss": 0.0076,
"step": 187
},
{
"clip_ratio": 0.0010397096725682786,
"epoch": 0.30812174291604244,
"grad_norm": 0.06807340681552887,
"kl": 0.007578372955322266,
"learning_rate": 2.5683060109289622e-06,
"loss": 0.0075,
"step": 188
},
{
"clip_ratio": 0.0007689736390830149,
"epoch": 0.30976068835708526,
"grad_norm": 0.06024845689535141,
"kl": 0.007673501968383789,
"learning_rate": 2.5819672131147543e-06,
"loss": 0.0072,
"step": 189
},
{
"clip_ratio": 0.0007949806515625824,
"epoch": 0.311399633798128,
"grad_norm": 0.06614933907985687,
"kl": 0.007935523986816406,
"learning_rate": 2.595628415300547e-06,
"loss": 0.007,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 582.4375193119049,
"epoch": 0.3130385792391708,
"grad_norm": 0.07819633185863495,
"kl": 0.00816202163696289,
"learning_rate": 2.6092896174863393e-06,
"loss": 0.0046,
"num_tokens": 129404757.0,
"reward": 0.23046875756699592,
"reward_std": 0.12788849917706102,
"rewards/pure_accuracy_reward_math": 0.2304687495343387,
"step": 191
},
{
"clip_ratio": 0.0010027453071188575,
"epoch": 0.3146775246802136,
"grad_norm": 0.07076407223939896,
"kl": 0.007757902145385742,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.0045,
"step": 192
},
{
"clip_ratio": 0.0011502429521215163,
"epoch": 0.31631647012125635,
"grad_norm": 0.06905192136764526,
"kl": 0.007544517517089844,
"learning_rate": 2.636612021857924e-06,
"loss": 0.0044,
"step": 193
},
{
"clip_ratio": 0.001169734060454175,
"epoch": 0.3179554155622991,
"grad_norm": 0.07402996718883514,
"kl": 0.007522106170654297,
"learning_rate": 2.6502732240437163e-06,
"loss": 0.0042,
"step": 194
},
{
"clip_ratio": 0.001001289329451538,
"epoch": 0.31959436100334193,
"grad_norm": 0.0615554116666317,
"kl": 0.007868766784667969,
"learning_rate": 2.6639344262295084e-06,
"loss": 0.0039,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 598.659848690033,
"epoch": 0.3212333064443847,
"grad_norm": 0.07155327498912811,
"kl": 0.0077495574951171875,
"learning_rate": 2.677595628415301e-06,
"loss": 0.009,
"num_tokens": 132735652.0,
"reward": 0.21126302669290453,
"reward_std": 0.12602885958040133,
"rewards/pure_accuracy_reward_math": 0.21126302133779973,
"step": 196
},
{
"clip_ratio": 0.0009628182596088664,
"epoch": 0.32287225188542745,
"grad_norm": 0.07324164360761642,
"kl": 0.00766444206237793,
"learning_rate": 2.6912568306010934e-06,
"loss": 0.0089,
"step": 197
},
{
"clip_ratio": 0.001045915161398625,
"epoch": 0.32451119732647027,
"grad_norm": 0.07669375091791153,
"kl": 0.0074596405029296875,
"learning_rate": 2.704918032786886e-06,
"loss": 0.0087,
"step": 198
},
{
"clip_ratio": 0.0009246684501249547,
"epoch": 0.32615014276751303,
"grad_norm": 0.0650852844119072,
"kl": 0.0074880123138427734,
"learning_rate": 2.718579234972678e-06,
"loss": 0.0085,
"step": 199
},
{
"clip_ratio": 0.0009262548958304251,
"epoch": 0.3277890882085558,
"grad_norm": 0.0722322165966034,
"kl": 0.007855653762817383,
"learning_rate": 2.7322404371584705e-06,
"loss": 0.0082,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 591.9775581359863,
"epoch": 0.3294280336495986,
"grad_norm": 0.07280802726745605,
"kl": 0.007916688919067383,
"learning_rate": 2.745901639344263e-06,
"loss": 0.0084,
"num_tokens": 136043335.0,
"reward": 0.2236328196595423,
"reward_std": 0.12912937795044854,
"rewards/pure_accuracy_reward_math": 0.22363281302386895,
"step": 201
},
{
"clip_ratio": 0.0010444082931826415,
"epoch": 0.33106697909064137,
"grad_norm": 0.0775647759437561,
"kl": 0.007770538330078125,
"learning_rate": 2.7595628415300546e-06,
"loss": 0.0083,
"step": 202
},
{
"clip_ratio": 0.0010056693769797675,
"epoch": 0.3327059245316841,
"grad_norm": 0.06984438002109528,
"kl": 0.0076978206634521484,
"learning_rate": 2.773224043715847e-06,
"loss": 0.0081,
"step": 203
},
{
"clip_ratio": 0.0010063842889849184,
"epoch": 0.33434486997272694,
"grad_norm": 0.07507704943418503,
"kl": 0.007877111434936523,
"learning_rate": 2.786885245901639e-06,
"loss": 0.0079,
"step": 204
},
{
"clip_ratio": 0.0010283672744435535,
"epoch": 0.3359838154137697,
"grad_norm": 0.07364527881145477,
"kl": 0.00825810432434082,
"learning_rate": 2.8005464480874316e-06,
"loss": 0.0076,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 588.1670074462891,
"epoch": 0.33762276085481246,
"grad_norm": 0.06861822307109833,
"kl": 0.00839853286743164,
"learning_rate": 2.814207650273224e-06,
"loss": 0.0057,
"num_tokens": 139337308.0,
"reward": 0.2106119850941468,
"reward_std": 0.12027020112145692,
"rewards/pure_accuracy_reward_math": 0.21061198008828796,
"step": 206
},
{
"clip_ratio": 0.0010541207553558252,
"epoch": 0.3392617062958553,
"grad_norm": 0.08106576651334763,
"kl": 0.008537769317626953,
"learning_rate": 2.8278688524590166e-06,
"loss": 0.0057,
"step": 207
},
{
"clip_ratio": 0.0009489937833109252,
"epoch": 0.34090065173689804,
"grad_norm": 0.0691104531288147,
"kl": 0.008366107940673828,
"learning_rate": 2.8415300546448087e-06,
"loss": 0.0054,
"step": 208
},
{
"clip_ratio": 0.0009892520201901789,
"epoch": 0.3425395971779408,
"grad_norm": 0.06807916611433029,
"kl": 0.008470535278320312,
"learning_rate": 2.855191256830601e-06,
"loss": 0.0052,
"step": 209
},
{
"clip_ratio": 0.00096842655295859,
"epoch": 0.3441785426189836,
"grad_norm": 0.0654783844947815,
"kl": 0.008765220642089844,
"learning_rate": 2.8688524590163937e-06,
"loss": 0.0049,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 590.6634306907654,
"epoch": 0.3458174880600264,
"grad_norm": 0.0704723373055458,
"kl": 0.008867502212524414,
"learning_rate": 2.8825136612021857e-06,
"loss": 0.0091,
"num_tokens": 142633758.0,
"reward": 0.21549479861278087,
"reward_std": 0.13379461748991162,
"rewards/pure_accuracy_reward_math": 0.21549479197710752,
"step": 211
},
{
"clip_ratio": 0.0011996210827192044,
"epoch": 0.34745643350106914,
"grad_norm": 0.08370186388492584,
"kl": 0.008816242218017578,
"learning_rate": 2.8961748633879782e-06,
"loss": 0.009,
"step": 212
},
{
"clip_ratio": 0.001070254641945212,
"epoch": 0.34909537894211196,
"grad_norm": 0.06448537111282349,
"kl": 0.008533716201782227,
"learning_rate": 2.9098360655737707e-06,
"loss": 0.0088,
"step": 213
},
{
"clip_ratio": 0.0011582542088603986,
"epoch": 0.3507343243831547,
"grad_norm": 0.07735106348991394,
"kl": 0.008788824081420898,
"learning_rate": 2.923497267759563e-06,
"loss": 0.0085,
"step": 214
},
{
"clip_ratio": 0.0010283683568559354,
"epoch": 0.3523732698241975,
"grad_norm": 0.06124194711446762,
"kl": 0.008962869644165039,
"learning_rate": 2.9371584699453553e-06,
"loss": 0.0082,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 563.4902558326721,
"epoch": 0.3540122152652403,
"grad_norm": 0.07734435796737671,
"kl": 0.009832620620727539,
"learning_rate": 2.9508196721311478e-06,
"loss": 0.0061,
"num_tokens": 145848300.0,
"reward": 0.24381511058891192,
"reward_std": 0.13013654301175848,
"rewards/pure_accuracy_reward_math": 0.2438151046517305,
"step": 216
},
{
"clip_ratio": 0.0012246508512134824,
"epoch": 0.35565116070628305,
"grad_norm": 0.08686057478189468,
"kl": 0.009522438049316406,
"learning_rate": 2.9644808743169403e-06,
"loss": 0.0061,
"step": 217
},
{
"clip_ratio": 0.0011569151299681835,
"epoch": 0.3572901061473258,
"grad_norm": 0.07663314044475555,
"kl": 0.009255170822143555,
"learning_rate": 2.9781420765027323e-06,
"loss": 0.0058,
"step": 218
},
{
"clip_ratio": 0.0010811529527927632,
"epoch": 0.35892905158836863,
"grad_norm": 0.07616522163152695,
"kl": 0.009699821472167969,
"learning_rate": 2.991803278688525e-06,
"loss": 0.0055,
"step": 219
},
{
"clip_ratio": 0.0009544987469780608,
"epoch": 0.3605679970294114,
"grad_norm": 0.07570254802703857,
"kl": 0.010393381118774414,
"learning_rate": 3.0054644808743173e-06,
"loss": 0.0052,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 576.4231963157654,
"epoch": 0.36220694247045415,
"grad_norm": 0.0710562989115715,
"kl": 0.009819984436035156,
"learning_rate": 3.0191256830601094e-06,
"loss": 0.008,
"num_tokens": 149101036.0,
"reward": 0.2226562555297278,
"reward_std": 0.12332397617865354,
"rewards/pure_accuracy_reward_math": 0.22265625168802217,
"step": 221
},
{
"clip_ratio": 0.0012251571324668475,
"epoch": 0.36384588791149697,
"grad_norm": 0.08233921229839325,
"kl": 0.0095062255859375,
"learning_rate": 3.032786885245902e-06,
"loss": 0.0079,
"step": 222
},
{
"clip_ratio": 0.001105058086977806,
"epoch": 0.36548483335253973,
"grad_norm": 0.07291049510240555,
"kl": 0.009292364120483398,
"learning_rate": 3.0464480874316944e-06,
"loss": 0.0076,
"step": 223
},
{
"clip_ratio": 0.0009599913582860609,
"epoch": 0.3671237787935825,
"grad_norm": 0.07015552371740341,
"kl": 0.009765148162841797,
"learning_rate": 3.0601092896174864e-06,
"loss": 0.0073,
"step": 224
},
{
"clip_ratio": 0.0009534798105050868,
"epoch": 0.3687627242346253,
"grad_norm": 0.07405047863721848,
"kl": 0.010376691818237305,
"learning_rate": 3.073770491803279e-06,
"loss": 0.007,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 578.5188989639282,
"epoch": 0.37040166967566807,
"grad_norm": 0.0744408518075943,
"kl": 0.010227680206298828,
"learning_rate": 3.0874316939890714e-06,
"loss": 0.0094,
"num_tokens": 152364698.0,
"reward": 0.23632813163567334,
"reward_std": 0.126384983304888,
"rewards/pure_accuracy_reward_math": 0.236328125,
"step": 226
},
{
"clip_ratio": 0.0011350565871453,
"epoch": 0.3720406151167108,
"grad_norm": 0.09323269873857498,
"kl": 0.009792804718017578,
"learning_rate": 3.101092896174864e-06,
"loss": 0.0094,
"step": 227
},
{
"clip_ratio": 0.0009327100992777559,
"epoch": 0.37367956055775364,
"grad_norm": 0.07071880251169205,
"kl": 0.009824752807617188,
"learning_rate": 3.114754098360656e-06,
"loss": 0.0091,
"step": 228
},
{
"clip_ratio": 0.0010184358247897762,
"epoch": 0.3753185059987964,
"grad_norm": 0.07402479648590088,
"kl": 0.010513544082641602,
"learning_rate": 3.1284153005464485e-06,
"loss": 0.0088,
"step": 229
},
{
"clip_ratio": 0.0010424532179058588,
"epoch": 0.37695745143983916,
"grad_norm": 0.07837292551994324,
"kl": 0.010687112808227539,
"learning_rate": 3.142076502732241e-06,
"loss": 0.0085,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 583.3717656135559,
"epoch": 0.378596396880882,
"grad_norm": 0.07237120717763901,
"kl": 0.010539531707763672,
"learning_rate": 3.155737704918033e-06,
"loss": 0.0093,
"num_tokens": 155643680.0,
"reward": 0.22591146369813941,
"reward_std": 0.13715054193744436,
"rewards/pure_accuracy_reward_math": 0.22591145869228058,
"step": 231
},
{
"clip_ratio": 0.001438524371224048,
"epoch": 0.38023534232192474,
"grad_norm": 0.45248183608055115,
"kl": 0.011214733123779297,
"learning_rate": 3.1693989071038255e-06,
"loss": 0.0093,
"step": 232
},
{
"clip_ratio": 0.001912088545395818,
"epoch": 0.3818742877629675,
"grad_norm": 0.11236479133367538,
"kl": 0.009836912155151367,
"learning_rate": 3.183060109289618e-06,
"loss": 0.0094,
"step": 233
},
{
"clip_ratio": 0.0011414756233989465,
"epoch": 0.3835132332040103,
"grad_norm": 0.07030442357063293,
"kl": 0.010227203369140625,
"learning_rate": 3.1967213114754105e-06,
"loss": 0.009,
"step": 234
},
{
"clip_ratio": 0.0015166988691817096,
"epoch": 0.3851521786450531,
"grad_norm": 0.10437261313199997,
"kl": 0.011615991592407227,
"learning_rate": 3.2103825136612026e-06,
"loss": 0.0088,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 571.2672717571259,
"epoch": 0.38679112408609584,
"grad_norm": 0.0721583217382431,
"kl": 0.011221885681152344,
"learning_rate": 3.224043715846995e-06,
"loss": 0.0093,
"num_tokens": 158884749.0,
"reward": 0.21223958939663135,
"reward_std": 0.12057235097745433,
"rewards/pure_accuracy_reward_math": 0.2122395838086959,
"step": 236
},
{
"clip_ratio": 0.001226626716629653,
"epoch": 0.38843006952713865,
"grad_norm": 0.08837593346834183,
"kl": 0.010795831680297852,
"learning_rate": 3.2377049180327876e-06,
"loss": 0.0092,
"step": 237
},
{
"clip_ratio": 0.0012072520969468314,
"epoch": 0.3900690149681814,
"grad_norm": 0.08174102008342743,
"kl": 0.010251283645629883,
"learning_rate": 3.2513661202185792e-06,
"loss": 0.0089,
"step": 238
},
{
"clip_ratio": 0.0008923051470901555,
"epoch": 0.3917079604092242,
"grad_norm": 0.06714540719985962,
"kl": 0.010812044143676758,
"learning_rate": 3.2650273224043717e-06,
"loss": 0.0086,
"step": 239
},
{
"clip_ratio": 0.0008945376886231315,
"epoch": 0.393346905850267,
"grad_norm": 0.07600870728492737,
"kl": 0.011825799942016602,
"learning_rate": 3.2786885245901638e-06,
"loss": 0.0082,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 579.9231960773468,
"epoch": 0.39498585129130975,
"grad_norm": 0.0788060799241066,
"kl": 0.011470317840576172,
"learning_rate": 3.2923497267759563e-06,
"loss": 0.008,
"num_tokens": 162151041.0,
"reward": 0.2262369857635349,
"reward_std": 0.1436142157181166,
"rewards/pure_accuracy_reward_math": 0.22623697842936963,
"step": 241
},
{
"clip_ratio": 0.001235522729416516,
"epoch": 0.3966247967323525,
"grad_norm": 0.08819200098514557,
"kl": 0.011005401611328125,
"learning_rate": 3.3060109289617488e-06,
"loss": 0.0079,
"step": 242
},
{
"clip_ratio": 0.0011237937412715837,
"epoch": 0.39826374217339533,
"grad_norm": 0.07336119562387466,
"kl": 0.010800600051879883,
"learning_rate": 3.3196721311475413e-06,
"loss": 0.0075,
"step": 243
},
{
"clip_ratio": 0.0010676182721454097,
"epoch": 0.3999026876144381,
"grad_norm": 0.07694102078676224,
"kl": 0.011488199234008789,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0072,
"step": 244
},
{
"clip_ratio": 0.0011172947895374818,
"epoch": 0.40154163305548085,
"grad_norm": 0.08463244885206223,
"kl": 0.012181282043457031,
"learning_rate": 3.346994535519126e-06,
"loss": 0.0068,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 582.9391458034515,
"epoch": 0.40318057849652367,
"grad_norm": 0.0712461844086647,
"kl": 0.01132655143737793,
"learning_rate": 3.3606557377049183e-06,
"loss": 0.0082,
"num_tokens": 165429094.0,
"reward": 0.24869792442768812,
"reward_std": 0.12578068423317745,
"rewards/pure_accuracy_reward_math": 0.24869791674427688,
"step": 246
},
{
"clip_ratio": 0.0011233826196530572,
"epoch": 0.40481952393756643,
"grad_norm": 0.07659593969583511,
"kl": 0.01063847541809082,
"learning_rate": 3.3743169398907104e-06,
"loss": 0.0081,
"step": 247
},
{
"clip_ratio": 0.0012855593090534967,
"epoch": 0.4064584693786092,
"grad_norm": 0.07479391992092133,
"kl": 0.010470390319824219,
"learning_rate": 3.387978142076503e-06,
"loss": 0.0078,
"step": 248
},
{
"clip_ratio": 0.0009941341145349725,
"epoch": 0.408097414819652,
"grad_norm": 0.06663769483566284,
"kl": 0.011182785034179688,
"learning_rate": 3.4016393442622954e-06,
"loss": 0.0075,
"step": 249
},
{
"clip_ratio": 0.0009338884319731733,
"epoch": 0.40973636026069477,
"grad_norm": 0.07455974817276001,
"kl": 0.011932849884033203,
"learning_rate": 3.4153005464480874e-06,
"loss": 0.0071,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 579.4368662834167,
"epoch": 0.001638945441042779,
"grad_norm": 0.06936674565076828,
"kl": 0.011360645294189453,
"learning_rate": 3.42896174863388e-06,
"loss": 0.0077,
"num_tokens": 3266558.0,
"reward": 0.23990886102546938,
"reward_std": 0.1189681178657338,
"rewards/pure_accuracy_reward_math": 0.239908854739042,
"step": 251
},
{
"clip_ratio": 0.0010949637241992605,
"epoch": 0.003277890882085558,
"grad_norm": 0.0754990503191948,
"kl": 0.010671854019165039,
"learning_rate": 3.4426229508196724e-06,
"loss": 0.0076,
"step": 252
},
{
"clip_ratio": 0.0011387738637722578,
"epoch": 0.004916836323128337,
"grad_norm": 0.07142341136932373,
"kl": 0.010357856750488281,
"learning_rate": 3.456284153005465e-06,
"loss": 0.0074,
"step": 253
},
{
"clip_ratio": 0.0008552854768026918,
"epoch": 0.006555781764171116,
"grad_norm": 0.0586932897567749,
"kl": 0.010814428329467773,
"learning_rate": 3.469945355191257e-06,
"loss": 0.007,
"step": 254
},
{
"clip_ratio": 0.0008318971481457993,
"epoch": 0.008194727205213895,
"grad_norm": 0.07276652008295059,
"kl": 0.011636495590209961,
"learning_rate": 3.4836065573770495e-06,
"loss": 0.0067,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 589.9453327655792,
"epoch": 0.009833672646256675,
"grad_norm": 0.07384130358695984,
"kl": 0.011546134948730469,
"learning_rate": 3.497267759562842e-06,
"loss": 0.0092,
"num_tokens": 6563270.0,
"reward": 0.24088542279787362,
"reward_std": 0.13925835717236623,
"rewards/pure_accuracy_reward_math": 0.2408854168606922,
"step": 256
},
{
"clip_ratio": 0.000994754147995991,
"epoch": 0.011472618087299453,
"grad_norm": 0.07237172871828079,
"kl": 0.011071443557739258,
"learning_rate": 3.510928961748634e-06,
"loss": 0.0091,
"step": 257
},
{
"clip_ratio": 0.0009974641966437048,
"epoch": 0.013111563528342233,
"grad_norm": 0.0677863284945488,
"kl": 0.010922431945800781,
"learning_rate": 3.5245901639344265e-06,
"loss": 0.0088,
"step": 258
},
{
"clip_ratio": 0.0009937005115716602,
"epoch": 0.01475050896938501,
"grad_norm": 0.06459185481071472,
"kl": 0.01144552230834961,
"learning_rate": 3.538251366120219e-06,
"loss": 0.0084,
"step": 259
},
{
"clip_ratio": 0.0010135341441355195,
"epoch": 0.01638945441042779,
"grad_norm": 0.0639120563864708,
"kl": 0.01173710823059082,
"learning_rate": 3.551912568306011e-06,
"loss": 0.008,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 589.9293835163116,
"epoch": 0.018028399851470568,
"grad_norm": 0.0708698108792305,
"kl": 0.011642932891845703,
"learning_rate": 3.5655737704918036e-06,
"loss": 0.0082,
"num_tokens": 9864381.0,
"reward": 0.2343750073632691,
"reward_std": 0.1295394750777632,
"rewards/pure_accuracy_reward_math": 0.23437499956344254,
"step": 261
},
{
"clip_ratio": 0.0011215963202744206,
"epoch": 0.01966734529251335,
"grad_norm": 0.06814540177583694,
"kl": 0.011007308959960938,
"learning_rate": 3.579234972677596e-06,
"loss": 0.0081,
"step": 262
},
{
"clip_ratio": 0.0012566405258667146,
"epoch": 0.021306290733556128,
"grad_norm": 0.07573528587818146,
"kl": 0.010967016220092773,
"learning_rate": 3.5928961748633886e-06,
"loss": 0.0079,
"step": 263
},
{
"clip_ratio": 0.0009570858208007849,
"epoch": 0.022945236174598906,
"grad_norm": 0.05915817990899086,
"kl": 0.011373281478881836,
"learning_rate": 3.6065573770491806e-06,
"loss": 0.0075,
"step": 264
},
{
"clip_ratio": 0.000911612167669773,
"epoch": 0.024584181615641687,
"grad_norm": 0.0663297101855278,
"kl": 0.012076139450073242,
"learning_rate": 3.620218579234973e-06,
"loss": 0.0071,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 591.5680522918701,
"epoch": 0.026223127056684465,
"grad_norm": 0.11742489039897919,
"kl": 0.013937711715698242,
"learning_rate": 3.6338797814207656e-06,
"loss": 0.0079,
"num_tokens": 13167262.0,
"reward": 0.20540365105262026,
"reward_std": 0.12122339283814654,
"rewards/pure_accuracy_reward_math": 0.20540364709449932,
"step": 266
},
{
"clip_ratio": 0.0010878853252052068,
"epoch": 0.027862072497727243,
"grad_norm": 0.9664380550384521,
"kl": 0.011089324951171875,
"learning_rate": 3.6475409836065577e-06,
"loss": 0.0088,
"step": 267
},
{
"clip_ratio": 0.0013143416176717437,
"epoch": 0.02950101793877002,
"grad_norm": 0.17526276409626007,
"kl": 0.01159524917602539,
"learning_rate": 3.66120218579235e-06,
"loss": 0.0077,
"step": 268
},
{
"clip_ratio": 0.0010903547959060234,
"epoch": 0.031139963379812802,
"grad_norm": 2.172806739807129,
"kl": 0.04994964599609375,
"learning_rate": 3.6748633879781427e-06,
"loss": 0.0089,
"step": 269
},
{
"clip_ratio": 0.0011699927540576027,
"epoch": 0.03277890882085558,
"grad_norm": 3.1674540042877197,
"kl": 0.10472512245178223,
"learning_rate": 3.6885245901639347e-06,
"loss": 0.011,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 602.3287951946259,
"epoch": 0.03441785426189836,
"grad_norm": 0.06581036746501923,
"kl": 0.011269092559814453,
"learning_rate": 3.7021857923497272e-06,
"loss": 0.009,
"num_tokens": 16496412.0,
"reward": 0.2086588600068353,
"reward_std": 0.12463329132879153,
"rewards/pure_accuracy_reward_math": 0.20865885500097647,
"step": 271
},
{
"clip_ratio": 0.0010178338038713264,
"epoch": 0.036056799702941136,
"grad_norm": 0.07616181671619415,
"kl": 0.0111236572265625,
"learning_rate": 3.7158469945355197e-06,
"loss": 0.009,
"step": 272
},
{
"clip_ratio": 0.0011148875312301243,
"epoch": 0.03769574514398392,
"grad_norm": 0.07324493676424026,
"kl": 0.010937929153442383,
"learning_rate": 3.729508196721312e-06,
"loss": 0.0088,
"step": 273
},
{
"clip_ratio": 0.0009064768914868182,
"epoch": 0.0393346905850267,
"grad_norm": 0.0614241324365139,
"kl": 0.011002779006958008,
"learning_rate": 3.7431693989071043e-06,
"loss": 0.0085,
"step": 274
},
{
"clip_ratio": 0.0008406615522176253,
"epoch": 0.040973636026069474,
"grad_norm": 0.0580308772623539,
"kl": 0.011270523071289062,
"learning_rate": 3.7568306010928963e-06,
"loss": 0.0081,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 576.4791839122772,
"epoch": 0.042612581467112255,
"grad_norm": 0.07072403281927109,
"kl": 0.012294530868530273,
"learning_rate": 3.7704918032786884e-06,
"loss": 0.0069,
"num_tokens": 19746996.0,
"reward": 0.23209636090905406,
"reward_std": 0.13299611589172855,
"rewards/pure_accuracy_reward_math": 0.232096354739042,
"step": 276
},
{
"clip_ratio": 0.0008154532818025473,
"epoch": 0.04425152690815504,
"grad_norm": 0.06746868789196014,
"kl": 0.012226104736328125,
"learning_rate": 3.784153005464481e-06,
"loss": 0.0068,
"step": 277
},
{
"clip_ratio": 0.0009088635895295738,
"epoch": 0.04589047234919781,
"grad_norm": 0.062604621052742,
"kl": 0.012192249298095703,
"learning_rate": 3.7978142076502734e-06,
"loss": 0.0065,
"step": 278
},
{
"clip_ratio": 0.0009255672589461028,
"epoch": 0.04752941779024059,
"grad_norm": 0.06473197042942047,
"kl": 0.012347936630249023,
"learning_rate": 3.811475409836066e-06,
"loss": 0.0062,
"step": 279
},
{
"clip_ratio": 0.000926908637438828,
"epoch": 0.049168363231283374,
"grad_norm": 0.0617368146777153,
"kl": 0.012591838836669922,
"learning_rate": 3.825136612021858e-06,
"loss": 0.0058,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 590.2998232841492,
"epoch": 0.05080730867232615,
"grad_norm": 0.0707988291978836,
"kl": 0.012273788452148438,
"learning_rate": 3.8387978142076504e-06,
"loss": 0.0051,
"num_tokens": 23046033.0,
"reward": 0.2106119856762234,
"reward_std": 0.12753237638389692,
"rewards/pure_accuracy_reward_math": 0.21061197997187264,
"step": 281
},
{
"clip_ratio": 0.0007957301904752967,
"epoch": 0.05244625411336893,
"grad_norm": 0.07150708138942719,
"kl": 0.012214422225952148,
"learning_rate": 3.852459016393443e-06,
"loss": 0.005,
"step": 282
},
{
"clip_ratio": 0.0008087110562655653,
"epoch": 0.05408519955441171,
"grad_norm": 0.06467320770025253,
"kl": 0.012126684188842773,
"learning_rate": 3.8661202185792354e-06,
"loss": 0.0047,
"step": 283
},
{
"clip_ratio": 0.0008826969724395894,
"epoch": 0.055724144995454486,
"grad_norm": 0.06448128819465637,
"kl": 0.012229204177856445,
"learning_rate": 3.879781420765028e-06,
"loss": 0.0043,
"step": 284
},
{
"clip_ratio": 0.000837775871445956,
"epoch": 0.05736309043649727,
"grad_norm": 0.05940267816185951,
"kl": 0.012416601181030273,
"learning_rate": 3.8934426229508196e-06,
"loss": 0.0039,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 586.4531440734863,
"epoch": 0.05900203587754004,
"grad_norm": 0.09300405532121658,
"kl": 0.012730836868286133,
"learning_rate": 3.907103825136612e-06,
"loss": 0.0073,
"num_tokens": 26331481.0,
"reward": 0.24609375710133463,
"reward_std": 0.1337946176645346,
"rewards/pure_accuracy_reward_math": 0.24609375081490725,
"step": 286
},
{
"clip_ratio": 0.0007959069370144789,
"epoch": 0.06064098131858282,
"grad_norm": 0.07242298871278763,
"kl": 0.012778043746948242,
"learning_rate": 3.9207650273224046e-06,
"loss": 0.0071,
"step": 287
},
{
"clip_ratio": 0.0007729592513214811,
"epoch": 0.062279926759625605,
"grad_norm": 0.06439978629350662,
"kl": 0.012783050537109375,
"learning_rate": 3.934426229508197e-06,
"loss": 0.0068,
"step": 288
},
{
"clip_ratio": 0.0008412073416366184,
"epoch": 0.06391887220066839,
"grad_norm": 0.06673026084899902,
"kl": 0.012759208679199219,
"learning_rate": 3.9480874316939895e-06,
"loss": 0.0064,
"step": 289
},
{
"clip_ratio": 0.0008529388742317678,
"epoch": 0.06555781764171116,
"grad_norm": 0.06457261741161346,
"kl": 0.012978315353393555,
"learning_rate": 3.961748633879782e-06,
"loss": 0.006,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 593.4179866313934,
"epoch": 0.06719676308275394,
"grad_norm": 0.061959490180015564,
"kl": 0.012590646743774414,
"learning_rate": 3.975409836065574e-06,
"loss": 0.0048,
"num_tokens": 29639261.0,
"reward": 0.19140625570435077,
"reward_std": 0.1178207247867249,
"rewards/pure_accuracy_reward_math": 0.19140625011641532,
"step": 291
},
{
"clip_ratio": 0.0007923052341993753,
"epoch": 0.06883570852379672,
"grad_norm": 0.06405281275510788,
"kl": 0.012594223022460938,
"learning_rate": 3.989071038251366e-06,
"loss": 0.0047,
"step": 292
},
{
"clip_ratio": 0.0008128494168886391,
"epoch": 0.0704746539648395,
"grad_norm": 0.05796763673424721,
"kl": 0.012372016906738281,
"learning_rate": 4.002732240437159e-06,
"loss": 0.0044,
"step": 293
},
{
"clip_ratio": 0.0008259461983470828,
"epoch": 0.07211359940588227,
"grad_norm": 0.05945519357919693,
"kl": 0.012368202209472656,
"learning_rate": 4.016393442622951e-06,
"loss": 0.0041,
"step": 294
},
{
"clip_ratio": 0.0008090365032558111,
"epoch": 0.07375254484692506,
"grad_norm": 0.05839954689145088,
"kl": 0.012590169906616211,
"learning_rate": 4.030054644808744e-06,
"loss": 0.0038,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 578.3847825527191,
"epoch": 0.07539149028796784,
"grad_norm": 0.07029297947883606,
"kl": 0.013827800750732422,
"learning_rate": 4.043715846994536e-06,
"loss": 0.0079,
"num_tokens": 32899431.0,
"reward": 0.22656250576255843,
"reward_std": 0.1293920156895183,
"rewards/pure_accuracy_reward_math": 0.22656250040745363,
"step": 296
},
{
"clip_ratio": 0.0007177354492569066,
"epoch": 0.07703043572901061,
"grad_norm": 0.07095961272716522,
"kl": 0.013935565948486328,
"learning_rate": 4.057377049180329e-06,
"loss": 0.0078,
"step": 297
},
{
"clip_ratio": 0.0007291494763990158,
"epoch": 0.0786693811700534,
"grad_norm": 0.062031351029872894,
"kl": 0.01368570327758789,
"learning_rate": 4.07103825136612e-06,
"loss": 0.0075,
"step": 298
},
{
"clip_ratio": 0.0009114736896549402,
"epoch": 0.08030832661109617,
"grad_norm": 0.06610522419214249,
"kl": 0.01354837417602539,
"learning_rate": 4.084699453551913e-06,
"loss": 0.0072,
"step": 299
},
{
"clip_ratio": 0.0008185061662402404,
"epoch": 0.08194727205213895,
"grad_norm": 0.05733739957213402,
"kl": 0.013862133026123047,
"learning_rate": 4.098360655737705e-06,
"loss": 0.0068,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 581.9811391830444,
"epoch": 0.08358621749318174,
"grad_norm": 0.07516755163669586,
"kl": 0.013712882995605469,
"learning_rate": 4.112021857923498e-06,
"loss": 0.0123,
"num_tokens": 36171597.0,
"reward": 0.24316406939760782,
"reward_std": 0.1390101815923117,
"rewards/pure_accuracy_reward_math": 0.24316406299476512,
"step": 301
},
{
"clip_ratio": 0.0008417515526843999,
"epoch": 0.08522516293422451,
"grad_norm": 0.07285764813423157,
"kl": 0.013661384582519531,
"learning_rate": 4.12568306010929e-06,
"loss": 0.0122,
"step": 302
},
{
"clip_ratio": 0.0010243687736419815,
"epoch": 0.08686410837526728,
"grad_norm": 0.06916587054729462,
"kl": 0.013316631317138672,
"learning_rate": 4.139344262295083e-06,
"loss": 0.0118,
"step": 303
},
{
"clip_ratio": 0.0010284557414479423,
"epoch": 0.08850305381631007,
"grad_norm": 0.06860698759555817,
"kl": 0.01330423355102539,
"learning_rate": 4.153005464480875e-06,
"loss": 0.0115,
"step": 304
},
{
"clip_ratio": 0.0009143141991216908,
"epoch": 0.09014199925735285,
"grad_norm": 0.06032150238752365,
"kl": 0.0137176513671875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.011,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 586.3743689060211,
"epoch": 0.09178094469839562,
"grad_norm": 0.07405151426792145,
"kl": 0.013857364654541016,
"learning_rate": 4.180327868852459e-06,
"loss": 0.011,
"num_tokens": 39455171.0,
"reward": 0.2532552155898884,
"reward_std": 0.14932613197015598,
"rewards/pure_accuracy_reward_math": 0.25325520941987634,
"step": 306
},
{
"clip_ratio": 0.0008296436263890428,
"epoch": 0.09341989013943841,
"grad_norm": 0.06666728854179382,
"kl": 0.013742923736572266,
"learning_rate": 4.193989071038252e-06,
"loss": 0.0109,
"step": 307
},
{
"clip_ratio": 0.0009970029186661122,
"epoch": 0.09505883558048119,
"grad_norm": 0.0645456612110138,
"kl": 0.013346672058105469,
"learning_rate": 4.207650273224044e-06,
"loss": 0.0106,
"step": 308
},
{
"clip_ratio": 0.001063040656163139,
"epoch": 0.09669778102152396,
"grad_norm": 0.061983004212379456,
"kl": 0.013351917266845703,
"learning_rate": 4.221311475409837e-06,
"loss": 0.0102,
"step": 309
},
{
"clip_ratio": 0.000925353787010863,
"epoch": 0.09833672646256675,
"grad_norm": 0.054489802569150925,
"kl": 0.013745784759521484,
"learning_rate": 4.234972677595629e-06,
"loss": 0.0098,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 576.9381697177887,
"epoch": 0.09997567190360952,
"grad_norm": 0.07431261986494064,
"kl": 0.014829158782958984,
"learning_rate": 4.248633879781421e-06,
"loss": 0.0086,
"num_tokens": 42719433.0,
"reward": 0.23014323521056212,
"reward_std": 0.14185529336100444,
"rewards/pure_accuracy_reward_math": 0.23014322962262668,
"step": 311
},
{
"clip_ratio": 0.0008892922282370819,
"epoch": 0.1016146173446523,
"grad_norm": 0.0675373449921608,
"kl": 0.01420736312866211,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.0084,
"step": 312
},
{
"clip_ratio": 0.0011247390369817367,
"epoch": 0.10325356278569509,
"grad_norm": 0.06642100214958191,
"kl": 0.013678550720214844,
"learning_rate": 4.275956284153006e-06,
"loss": 0.0081,
"step": 313
},
{
"clip_ratio": 0.001105548773011833,
"epoch": 0.10489250822673786,
"grad_norm": 0.06353385746479034,
"kl": 0.01375722885131836,
"learning_rate": 4.289617486338798e-06,
"loss": 0.0077,
"step": 314
},
{
"clip_ratio": 0.0008872896562479582,
"epoch": 0.10653145366778063,
"grad_norm": 0.0578172467648983,
"kl": 0.014369010925292969,
"learning_rate": 4.30327868852459e-06,
"loss": 0.0073,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 581.5644719600677,
"epoch": 0.10817039910882342,
"grad_norm": 0.10828240215778351,
"kl": 0.014815330505371094,
"learning_rate": 4.316939890710383e-06,
"loss": 0.0079,
"num_tokens": 45987515.0,
"reward": 0.23828125750878826,
"reward_std": 0.1280899328412488,
"rewards/pure_accuracy_reward_math": 0.23828124959254637,
"step": 316
},
{
"clip_ratio": 0.0007055829685214121,
"epoch": 0.1098093445498662,
"grad_norm": 0.06897052377462387,
"kl": 0.014089107513427734,
"learning_rate": 4.330601092896175e-06,
"loss": 0.0077,
"step": 317
},
{
"clip_ratio": 0.0009552787060442824,
"epoch": 0.11144828999090897,
"grad_norm": 0.06946240365505219,
"kl": 0.013627052307128906,
"learning_rate": 4.3442622950819676e-06,
"loss": 0.0074,
"step": 318
},
{
"clip_ratio": 0.0009577763585184584,
"epoch": 0.11308723543195176,
"grad_norm": 0.06384962797164917,
"kl": 0.013594627380371094,
"learning_rate": 4.35792349726776e-06,
"loss": 0.007,
"step": 319
},
{
"clip_ratio": 0.0008583279737877092,
"epoch": 0.11472618087299453,
"grad_norm": 0.05853092297911644,
"kl": 0.014088630676269531,
"learning_rate": 4.371584699453552e-06,
"loss": 0.0066,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 574.9983899593353,
"epoch": 0.11636512631403731,
"grad_norm": 0.07647594809532166,
"kl": 0.01479482650756836,
"learning_rate": 4.385245901639344e-06,
"loss": 0.0092,
"num_tokens": 49236626.0,
"reward": 0.22591146448394284,
"reward_std": 0.1385065988288261,
"rewards/pure_accuracy_reward_math": 0.2259114588960074,
"step": 321
},
{
"clip_ratio": 0.000817699462913879,
"epoch": 0.11800407175508008,
"grad_norm": 0.0680047944188118,
"kl": 0.014449596405029297,
"learning_rate": 4.398907103825137e-06,
"loss": 0.009,
"step": 322
},
{
"clip_ratio": 0.00102761085952352,
"epoch": 0.11964301719612287,
"grad_norm": 0.06830534338951111,
"kl": 0.01408243179321289,
"learning_rate": 4.412568306010929e-06,
"loss": 0.0087,
"step": 323
},
{
"clip_ratio": 0.0010830692142462794,
"epoch": 0.12128196263716565,
"grad_norm": 0.06523703783750534,
"kl": 0.01411581039428711,
"learning_rate": 4.426229508196722e-06,
"loss": 0.0083,
"step": 324
},
{
"clip_ratio": 0.0009552010853894899,
"epoch": 0.12292090807820842,
"grad_norm": 0.05952048301696777,
"kl": 0.01461029052734375,
"learning_rate": 4.439890710382514e-06,
"loss": 0.0078,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 574.7955937385559,
"epoch": 0.12455985351925121,
"grad_norm": 0.07190460711717606,
"kl": 0.015045642852783203,
"learning_rate": 4.453551912568307e-06,
"loss": 0.0105,
"num_tokens": 52486374.0,
"reward": 0.2360026103851851,
"reward_std": 0.12637775152688846,
"rewards/pure_accuracy_reward_math": 0.2360026056121569,
"step": 326
},
{
"clip_ratio": 0.0006602608530101861,
"epoch": 0.12619879896029398,
"grad_norm": 0.06684302538633347,
"kl": 0.014774322509765625,
"learning_rate": 4.467213114754098e-06,
"loss": 0.0103,
"step": 327
},
{
"clip_ratio": 0.0008040992978521899,
"epoch": 0.12783774440133677,
"grad_norm": 0.06550217419862747,
"kl": 0.01435995101928711,
"learning_rate": 4.480874316939891e-06,
"loss": 0.01,
"step": 328
},
{
"clip_ratio": 0.0008306863429652367,
"epoch": 0.12947668984237953,
"grad_norm": 0.0616220086812973,
"kl": 0.014432430267333984,
"learning_rate": 4.494535519125683e-06,
"loss": 0.0096,
"step": 329
},
{
"clip_ratio": 0.0008213962388481377,
"epoch": 0.13111563528342232,
"grad_norm": 0.061259008944034576,
"kl": 0.014967918395996094,
"learning_rate": 4.508196721311476e-06,
"loss": 0.0092,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 578.8222842216492,
"epoch": 0.1327545807244651,
"grad_norm": 0.07180505990982056,
"kl": 0.014912128448486328,
"learning_rate": 4.521857923497268e-06,
"loss": 0.0054,
"num_tokens": 55750856.0,
"reward": 0.22428386053070426,
"reward_std": 0.125935374526307,
"rewards/pure_accuracy_reward_math": 0.22428385610692203,
"step": 331
},
{
"clip_ratio": 0.0008310234973123443,
"epoch": 0.13439352616550787,
"grad_norm": 0.06554125249385834,
"kl": 0.014460086822509766,
"learning_rate": 4.535519125683061e-06,
"loss": 0.0052,
"step": 332
},
{
"clip_ratio": 0.0009444170593724266,
"epoch": 0.13603247160655066,
"grad_norm": 0.0650697648525238,
"kl": 0.014264106750488281,
"learning_rate": 4.549180327868853e-06,
"loss": 0.0049,
"step": 333
},
{
"clip_ratio": 0.0009593672889991467,
"epoch": 0.13767141704759345,
"grad_norm": 0.06275759637355804,
"kl": 0.014463424682617188,
"learning_rate": 4.562841530054645e-06,
"loss": 0.0045,
"step": 334
},
{
"clip_ratio": 0.0008741978416537677,
"epoch": 0.1393103624886362,
"grad_norm": 0.06349465250968933,
"kl": 0.015105247497558594,
"learning_rate": 4.576502732240437e-06,
"loss": 0.0041,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 581.4983899593353,
"epoch": 0.140949307929679,
"grad_norm": 0.07185006141662598,
"kl": 0.015033245086669922,
"learning_rate": 4.59016393442623e-06,
"loss": 0.0092,
"num_tokens": 59021523.0,
"reward": 0.24804688125732355,
"reward_std": 0.12332397676073015,
"rewards/pure_accuracy_reward_math": 0.2480468761350494,
"step": 336
},
{
"clip_ratio": 0.0007917967001276338,
"epoch": 0.14258825337072178,
"grad_norm": 0.06418469548225403,
"kl": 0.014570236206054688,
"learning_rate": 4.603825136612022e-06,
"loss": 0.009,
"step": 337
},
{
"clip_ratio": 0.0011276908828676824,
"epoch": 0.14422719881176455,
"grad_norm": 0.06706573814153671,
"kl": 0.014203071594238281,
"learning_rate": 4.617486338797815e-06,
"loss": 0.0087,
"step": 338
},
{
"clip_ratio": 0.0010211615006028296,
"epoch": 0.14586614425280733,
"grad_norm": 0.06293198466300964,
"kl": 0.014473915100097656,
"learning_rate": 4.631147540983607e-06,
"loss": 0.0084,
"step": 339
},
{
"clip_ratio": 0.0008425270717680178,
"epoch": 0.14750508969385012,
"grad_norm": 0.058640848845243454,
"kl": 0.01514291763305664,
"learning_rate": 4.6448087431694e-06,
"loss": 0.008,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 581.9489109516144,
"epoch": 0.14914403513489288,
"grad_norm": 0.08432208746671677,
"kl": 0.015330791473388672,
"learning_rate": 4.6584699453551915e-06,
"loss": 0.0085,
"num_tokens": 62299018.0,
"reward": 0.2382812559371814,
"reward_std": 0.14492353051900864,
"rewards/pure_accuracy_reward_math": 0.2382812504656613,
"step": 341
},
{
"clip_ratio": 0.000848330670521591,
"epoch": 0.15078298057593567,
"grad_norm": 0.06801754236221313,
"kl": 0.014659404754638672,
"learning_rate": 4.672131147540984e-06,
"loss": 0.0084,
"step": 342
},
{
"clip_ratio": 0.0010827027111872667,
"epoch": 0.15242192601697846,
"grad_norm": 0.06549172848463058,
"kl": 0.014311790466308594,
"learning_rate": 4.6857923497267765e-06,
"loss": 0.0081,
"step": 343
},
{
"clip_ratio": 0.0010740470830796767,
"epoch": 0.15406087145802122,
"grad_norm": 0.06515967845916748,
"kl": 0.01453399658203125,
"learning_rate": 4.699453551912569e-06,
"loss": 0.0077,
"step": 344
},
{
"clip_ratio": 0.0009672553374002746,
"epoch": 0.155699816899064,
"grad_norm": 0.0627971738576889,
"kl": 0.015265464782714844,
"learning_rate": 4.7131147540983615e-06,
"loss": 0.0072,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 594.5420119762421,
"epoch": 0.1573387623401068,
"grad_norm": 0.0651373565196991,
"kl": 0.01506948471069336,
"learning_rate": 4.726775956284154e-06,
"loss": 0.0061,
"num_tokens": 65612227.0,
"reward": 0.22656250555883162,
"reward_std": 0.11576688283821568,
"rewards/pure_accuracy_reward_math": 0.2265625006693881,
"step": 346
},
{
"clip_ratio": 0.0006889042056741346,
"epoch": 0.15897770778114956,
"grad_norm": 0.05911775305867195,
"kl": 0.014788627624511719,
"learning_rate": 4.740437158469946e-06,
"loss": 0.006,
"step": 347
},
{
"clip_ratio": 0.0008533449156971074,
"epoch": 0.16061665322219235,
"grad_norm": 0.06107313930988312,
"kl": 0.014514446258544922,
"learning_rate": 4.754098360655738e-06,
"loss": 0.0058,
"step": 348
},
{
"clip_ratio": 0.0008506776480317058,
"epoch": 0.16225559866323513,
"grad_norm": 0.05840134993195534,
"kl": 0.014555931091308594,
"learning_rate": 4.767759562841531e-06,
"loss": 0.0054,
"step": 349
},
{
"clip_ratio": 0.0007001224406621986,
"epoch": 0.1638945441042779,
"grad_norm": 0.052340634167194366,
"kl": 0.014971256256103516,
"learning_rate": 4.781420765027322e-06,
"loss": 0.0051,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 582.0423378944397,
"epoch": 0.16553348954532068,
"grad_norm": 0.07505487650632858,
"kl": 0.015714168548583984,
"learning_rate": 4.795081967213115e-06,
"loss": 0.01,
"num_tokens": 68889789.0,
"reward": 0.22949219317524694,
"reward_std": 0.13435217371443287,
"rewards/pure_accuracy_reward_math": 0.2294921897992026,
"step": 351
},
{
"clip_ratio": 0.0008628651630715467,
"epoch": 0.16717243498636347,
"grad_norm": 0.06674539297819138,
"kl": 0.015304088592529297,
"learning_rate": 4.808743169398907e-06,
"loss": 0.0098,
"step": 352
},
{
"clip_ratio": 0.001037854259834603,
"epoch": 0.16881138042740623,
"grad_norm": 0.07000827044248581,
"kl": 0.014843463897705078,
"learning_rate": 4.8224043715847e-06,
"loss": 0.0095,
"step": 353
},
{
"clip_ratio": 0.0010051641423842739,
"epoch": 0.17045032586844902,
"grad_norm": 0.06692034751176834,
"kl": 0.01481771469116211,
"learning_rate": 4.836065573770492e-06,
"loss": 0.0091,
"step": 354
},
{
"clip_ratio": 0.0009215611881927543,
"epoch": 0.1720892713094918,
"grad_norm": 0.05842750146985054,
"kl": 0.015254974365234375,
"learning_rate": 4.849726775956285e-06,
"loss": 0.0086,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 580.9974174499512,
"epoch": 0.17372821675053457,
"grad_norm": 0.07343181222677231,
"kl": 0.015576362609863281,
"learning_rate": 4.863387978142076e-06,
"loss": 0.0104,
"num_tokens": 72161705.0,
"reward": 0.24218750657746568,
"reward_std": 0.12447860068641603,
"rewards/pure_accuracy_reward_math": 0.24218750098953024,
"step": 356
},
{
"clip_ratio": 0.000678558928370876,
"epoch": 0.17536716219157736,
"grad_norm": 0.06728224456310272,
"kl": 0.015001773834228516,
"learning_rate": 4.877049180327869e-06,
"loss": 0.0103,
"step": 357
},
{
"clip_ratio": 0.0009087121708262202,
"epoch": 0.17700610763262015,
"grad_norm": 0.06502145528793335,
"kl": 0.014545440673828125,
"learning_rate": 4.890710382513661e-06,
"loss": 0.0099,
"step": 358
},
{
"clip_ratio": 0.0008997945463420365,
"epoch": 0.1786450530736629,
"grad_norm": 0.06085266172885895,
"kl": 0.01470804214477539,
"learning_rate": 4.904371584699454e-06,
"loss": 0.0096,
"step": 359
},
{
"clip_ratio": 0.0008065323049777362,
"epoch": 0.1802839985147057,
"grad_norm": 0.05810590460896492,
"kl": 0.01529693603515625,
"learning_rate": 4.918032786885246e-06,
"loss": 0.0092,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 574.3659031391144,
"epoch": 0.18192294395574848,
"grad_norm": 0.07119102030992508,
"kl": 0.015850543975830078,
"learning_rate": 4.931693989071039e-06,
"loss": 0.0088,
"num_tokens": 75411941.0,
"reward": 0.26106771623017266,
"reward_std": 0.12748563423519954,
"rewards/pure_accuracy_reward_math": 0.26106770982732996,
"step": 361
},
{
"clip_ratio": 0.0007838922517180436,
"epoch": 0.18356188939679124,
"grad_norm": 0.0668591633439064,
"kl": 0.015510082244873047,
"learning_rate": 4.945355191256831e-06,
"loss": 0.0086,
"step": 362
},
{
"clip_ratio": 0.0009197715589834843,
"epoch": 0.18520083483783403,
"grad_norm": 0.06584400683641434,
"kl": 0.015251636505126953,
"learning_rate": 4.959016393442623e-06,
"loss": 0.0083,
"step": 363
},
{
"clip_ratio": 0.0007934075148341435,
"epoch": 0.18683978027887682,
"grad_norm": 0.06021925061941147,
"kl": 0.015304088592529297,
"learning_rate": 4.9726775956284154e-06,
"loss": 0.0079,
"step": 364
},
{
"clip_ratio": 0.0008117128969615806,
"epoch": 0.18847872571991958,
"grad_norm": 0.054787032306194305,
"kl": 0.015666484832763672,
"learning_rate": 4.986338797814208e-06,
"loss": 0.0075,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 581.3089377880096,
"epoch": 0.19011767116096237,
"grad_norm": 0.07014758884906769,
"kl": 0.01603412628173828,
"learning_rate": 5e-06,
"loss": 0.0102,
"num_tokens": 78678442.0,
"reward": 0.2386067773331888,
"reward_std": 0.1381109645590186,
"rewards/pure_accuracy_reward_math": 0.23860677116317675,
"step": 366
},
{
"clip_ratio": 0.0008581371083096201,
"epoch": 0.19175661660200516,
"grad_norm": 0.06626458466053009,
"kl": 0.015540599822998047,
"learning_rate": 4.9999942439118225e-06,
"loss": 0.01,
"step": 367
},
{
"clip_ratio": 0.0010326972058010142,
"epoch": 0.19339556204304792,
"grad_norm": 0.06585969030857086,
"kl": 0.015045166015625,
"learning_rate": 4.999976975673795e-06,
"loss": 0.0097,
"step": 368
},
{
"clip_ratio": 0.000947793353589077,
"epoch": 0.1950345074840907,
"grad_norm": 0.06269653141498566,
"kl": 0.015254497528076172,
"learning_rate": 4.999948195365436e-06,
"loss": 0.0092,
"step": 369
},
{
"clip_ratio": 0.0008362581023675375,
"epoch": 0.1966734529251335,
"grad_norm": 0.059586890041828156,
"kl": 0.01586627960205078,
"learning_rate": 4.9999079031192755e-06,
"loss": 0.0088,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 568.2786619663239,
"epoch": 0.19831239836617626,
"grad_norm": 0.07526068389415741,
"kl": 0.016698837280273438,
"learning_rate": 4.999856099120852e-06,
"loss": 0.0081,
"num_tokens": 81911242.0,
"reward": 0.24251302849734202,
"reward_std": 0.126928077545017,
"rewards/pure_accuracy_reward_math": 0.24251302162883803,
"step": 371
},
{
"clip_ratio": 0.0009173538178401941,
"epoch": 0.19995134380721905,
"grad_norm": 0.06963901966810226,
"kl": 0.016336441040039062,
"learning_rate": 4.99979278360872e-06,
"loss": 0.008,
"step": 372
},
{
"clip_ratio": 0.0011110180128071079,
"epoch": 0.20159028924826183,
"grad_norm": 0.06961624324321747,
"kl": 0.015837669372558594,
"learning_rate": 4.999717956874435e-06,
"loss": 0.0076,
"step": 373
},
{
"clip_ratio": 0.0009433047086986335,
"epoch": 0.2032292346893046,
"grad_norm": 0.06556432694196701,
"kl": 0.01593923568725586,
"learning_rate": 4.9996316192625675e-06,
"loss": 0.0072,
"step": 374
},
{
"clip_ratio": 0.0008095512553154549,
"epoch": 0.20486818013034738,
"grad_norm": 0.06139687821269035,
"kl": 0.01659393310546875,
"learning_rate": 4.99953377117069e-06,
"loss": 0.0067,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 573.026712179184,
"epoch": 0.20650712557139017,
"grad_norm": 0.07706455886363983,
"kl": 0.016510486602783203,
"learning_rate": 4.99942441304938e-06,
"loss": 0.0125,
"num_tokens": 85158888.0,
"reward": 0.2682291740202345,
"reward_std": 0.14631909935269505,
"rewards/pure_accuracy_reward_math": 0.2682291676173918,
"step": 376
},
{
"clip_ratio": 0.0009323210776983615,
"epoch": 0.20814607101243293,
"grad_norm": 0.07358861714601517,
"kl": 0.015882015228271484,
"learning_rate": 4.999303545402218e-06,
"loss": 0.0123,
"step": 377
},
{
"clip_ratio": 0.0011543770483513072,
"epoch": 0.20978501645347572,
"grad_norm": 0.06775986403226852,
"kl": 0.015225410461425781,
"learning_rate": 4.999171168785783e-06,
"loss": 0.012,
"step": 378
},
{
"clip_ratio": 0.0010423319904475647,
"epoch": 0.2114239618945185,
"grad_norm": 0.06506908684968948,
"kl": 0.01537466049194336,
"learning_rate": 4.999027283809653e-06,
"loss": 0.0116,
"step": 379
},
{
"clip_ratio": 0.0009310034193958927,
"epoch": 0.21306290733556127,
"grad_norm": 0.06132827699184418,
"kl": 0.01622772216796875,
"learning_rate": 4.9988718911364e-06,
"loss": 0.0111,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 571.3756697177887,
"epoch": 0.21470185277660406,
"grad_norm": 0.0720067173242569,
"kl": 0.017137527465820312,
"learning_rate": 4.998704991481587e-06,
"loss": 0.0108,
"num_tokens": 88402762.0,
"reward": 0.24674479925306514,
"reward_std": 0.1313918832456693,
"rewards/pure_accuracy_reward_math": 0.24674479168606922,
"step": 381
},
{
"clip_ratio": 0.0007053178948126515,
"epoch": 0.21634079821764685,
"grad_norm": 0.06856828182935715,
"kl": 0.016861915588378906,
"learning_rate": 4.998526585613763e-06,
"loss": 0.0107,
"step": 382
},
{
"clip_ratio": 0.0009198855559588992,
"epoch": 0.2179797436586896,
"grad_norm": 0.06308390200138092,
"kl": 0.01618671417236328,
"learning_rate": 4.998336674354468e-06,
"loss": 0.0103,
"step": 383
},
{
"clip_ratio": 0.0009896415027697003,
"epoch": 0.2196186890997324,
"grad_norm": 0.059695471078157425,
"kl": 0.01616191864013672,
"learning_rate": 4.9981352585782154e-06,
"loss": 0.01,
"step": 384
},
{
"clip_ratio": 0.0009851711494093252,
"epoch": 0.22125763454077518,
"grad_norm": 0.06119159981608391,
"kl": 0.016726016998291016,
"learning_rate": 4.997922339212501e-06,
"loss": 0.0095,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 582.7536017894745,
"epoch": 0.22289657998181794,
"grad_norm": 0.06853786110877991,
"kl": 0.01645803451538086,
"learning_rate": 4.997697917237789e-06,
"loss": 0.0092,
"num_tokens": 91672333.0,
"reward": 0.2298177152988501,
"reward_std": 0.1270827678963542,
"rewards/pure_accuracy_reward_math": 0.22981770866317675,
"step": 386
},
{
"clip_ratio": 0.0006475677645312317,
"epoch": 0.22453552542286073,
"grad_norm": 0.06568547338247299,
"kl": 0.016225337982177734,
"learning_rate": 4.997461993687514e-06,
"loss": 0.0091,
"step": 387
},
{
"clip_ratio": 0.000736436435545329,
"epoch": 0.22617447086390352,
"grad_norm": 0.06055685877799988,
"kl": 0.015880584716796875,
"learning_rate": 4.997214569648075e-06,
"loss": 0.0088,
"step": 388
},
{
"clip_ratio": 0.0007173336517780626,
"epoch": 0.22781341630494628,
"grad_norm": 0.054837051779031754,
"kl": 0.01586627960205078,
"learning_rate": 4.996955646258826e-06,
"loss": 0.0084,
"step": 389
},
{
"clip_ratio": 0.0007432895852730326,
"epoch": 0.22945236174598907,
"grad_norm": 0.05363443121314049,
"kl": 0.01619720458984375,
"learning_rate": 4.996685224712077e-06,
"loss": 0.008,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 569.7708532810211,
"epoch": 0.23109130718703186,
"grad_norm": 0.08375601470470428,
"kl": 0.016201019287109375,
"learning_rate": 4.9964033062530825e-06,
"loss": 0.0067,
"num_tokens": 94902157.0,
"reward": 0.2470703189901542,
"reward_std": 0.12347866676282138,
"rewards/pure_accuracy_reward_math": 0.2470703137514647,
"step": 391
},
{
"clip_ratio": 0.0006189611340801093,
"epoch": 0.23273025262807462,
"grad_norm": 0.06883595138788223,
"kl": 0.016017436981201172,
"learning_rate": 4.996109892180041e-06,
"loss": 0.0065,
"step": 392
},
{
"clip_ratio": 0.0007823226997629718,
"epoch": 0.2343691980691174,
"grad_norm": 0.08916032314300537,
"kl": 0.01654815673828125,
"learning_rate": 4.995804983844088e-06,
"loss": 0.0062,
"step": 393
},
{
"clip_ratio": 0.0008161565754676303,
"epoch": 0.23600814351016017,
"grad_norm": 0.06091364100575447,
"kl": 0.015578985214233398,
"learning_rate": 4.995488582649286e-06,
"loss": 0.0058,
"step": 394
},
{
"clip_ratio": 0.0008045715469506831,
"epoch": 0.23764708895120296,
"grad_norm": 0.0608866885304451,
"kl": 0.015944957733154297,
"learning_rate": 4.99516069005262e-06,
"loss": 0.0054,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 589.5065298080444,
"epoch": 0.23928603439224574,
"grad_norm": 0.07174283266067505,
"kl": 0.01667642593383789,
"learning_rate": 4.994821307563995e-06,
"loss": 0.0062,
"num_tokens": 98201877.0,
"reward": 0.2291666735545732,
"reward_std": 0.12557925208238885,
"rewards/pure_accuracy_reward_math": 0.22916666680248454,
"step": 396
},
{
"clip_ratio": 0.0007434075716901134,
"epoch": 0.2409249798332885,
"grad_norm": 0.06661787629127502,
"kl": 0.01587820053100586,
"learning_rate": 4.994470436746222e-06,
"loss": 0.0061,
"step": 397
},
{
"clip_ratio": 0.0009446046724406187,
"epoch": 0.2425639252743313,
"grad_norm": 0.06436329334974289,
"kl": 0.015263080596923828,
"learning_rate": 4.994108079215016e-06,
"loss": 0.0058,
"step": 398
},
{
"clip_ratio": 0.0009035653077944517,
"epoch": 0.24420287071537408,
"grad_norm": 0.05970580503344536,
"kl": 0.0152130126953125,
"learning_rate": 4.9937342366389875e-06,
"loss": 0.0054,
"step": 399
},
{
"clip_ratio": 0.000805649116728091,
"epoch": 0.24584181615641684,
"grad_norm": 0.05798059329390526,
"kl": 0.015795230865478516,
"learning_rate": 4.9933489107396324e-06,
"loss": 0.005,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 585.6373908519745,
"epoch": 0.24748076159745963,
"grad_norm": 0.07425414025783539,
"kl": 0.016396045684814453,
"learning_rate": 4.992952103291327e-06,
"loss": 0.0062,
"num_tokens": 101490067.0,
"reward": 0.2379557362291962,
"reward_std": 0.14541265065781772,
"rewards/pure_accuracy_reward_math": 0.23795572970993817,
"step": 401
},
{
"clip_ratio": 0.0006993081486825758,
"epoch": 0.24911970703850242,
"grad_norm": 0.0683765783905983,
"kl": 0.016202926635742188,
"learning_rate": 4.992543816121317e-06,
"loss": 0.006,
"step": 402
},
{
"clip_ratio": 0.0007977800468097485,
"epoch": 0.2507586524795452,
"grad_norm": 0.06357114762067795,
"kl": 0.015718460083007812,
"learning_rate": 4.992124051109714e-06,
"loss": 0.0056,
"step": 403
},
{
"clip_ratio": 0.0009015119801460969,
"epoch": 0.25239759792058797,
"grad_norm": 0.06347363442182541,
"kl": 0.015771865844726562,
"learning_rate": 4.991692810189479e-06,
"loss": 0.0051,
"step": 404
},
{
"clip_ratio": 0.0008236684840881026,
"epoch": 0.25403654336163073,
"grad_norm": 0.058691952377557755,
"kl": 0.016323566436767578,
"learning_rate": 4.991250095346423e-06,
"loss": 0.0047,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 583.5478718280792,
"epoch": 0.25567548880267355,
"grad_norm": 0.07196501642465591,
"kl": 0.01677703857421875,
"learning_rate": 4.990795908619189e-06,
"loss": 0.0083,
"num_tokens": 104766370.0,
"reward": 0.2216796949505806,
"reward_std": 0.12527710193535313,
"rewards/pure_accuracy_reward_math": 0.22167968738358468,
"step": 406
},
{
"clip_ratio": 0.0007391628066670819,
"epoch": 0.2573144342437163,
"grad_norm": 0.07041583210229874,
"kl": 0.016283512115478516,
"learning_rate": 4.990330252099249e-06,
"loss": 0.0081,
"step": 407
},
{
"clip_ratio": 0.0009407289188061441,
"epoch": 0.25895337968475907,
"grad_norm": 0.06628228724002838,
"kl": 0.015958786010742188,
"learning_rate": 4.98985312793089e-06,
"loss": 0.0078,
"step": 408
},
{
"clip_ratio": 0.0008640961274295478,
"epoch": 0.2605923251258019,
"grad_norm": 0.08439858257770538,
"kl": 0.01657581329345703,
"learning_rate": 4.989364538311209e-06,
"loss": 0.0074,
"step": 409
},
{
"clip_ratio": 0.0008074077862829654,
"epoch": 0.26223127056684464,
"grad_norm": 0.06573989987373352,
"kl": 0.016643524169921875,
"learning_rate": 4.988864485490096e-06,
"loss": 0.007,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 577.0521020889282,
"epoch": 0.2638702160078874,
"grad_norm": 0.0664341077208519,
"kl": 0.017581939697265625,
"learning_rate": 4.988352971770229e-06,
"loss": 0.008,
"num_tokens": 108026786.0,
"reward": 0.22265625622821972,
"reward_std": 0.10564513533608988,
"rewards/pure_accuracy_reward_math": 0.2226562507566996,
"step": 411
},
{
"clip_ratio": 0.0005124467848531822,
"epoch": 0.2655091614489302,
"grad_norm": 0.058590181171894073,
"kl": 0.016813278198242188,
"learning_rate": 4.987829999507065e-06,
"loss": 0.0078,
"step": 412
},
{
"clip_ratio": 0.000724739855968437,
"epoch": 0.267148106889973,
"grad_norm": 0.058657143265008926,
"kl": 0.016211986541748047,
"learning_rate": 4.9872955711088215e-06,
"loss": 0.0076,
"step": 413
},
{
"clip_ratio": 0.0007133069941573922,
"epoch": 0.26878705233101574,
"grad_norm": 0.054359566420316696,
"kl": 0.01609182357788086,
"learning_rate": 4.9867496890364734e-06,
"loss": 0.0072,
"step": 414
},
{
"clip_ratio": 0.0006473830337654363,
"epoch": 0.27042599777205856,
"grad_norm": 0.0523286908864975,
"kl": 0.016422271728515625,
"learning_rate": 4.986192355803735e-06,
"loss": 0.0069,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 587.6103718280792,
"epoch": 0.2720649432131013,
"grad_norm": 0.06779833137989044,
"kl": 0.016475200653076172,
"learning_rate": 4.985623573977056e-06,
"loss": 0.0092,
"num_tokens": 111325301.0,
"reward": 0.23014323599636555,
"reward_std": 0.14115750859491527,
"rewards/pure_accuracy_reward_math": 0.23014322959352285,
"step": 416
},
{
"clip_ratio": 0.0006248025758850417,
"epoch": 0.2737038886541441,
"grad_norm": 0.06379402428865433,
"kl": 0.016280651092529297,
"learning_rate": 4.985043346175602e-06,
"loss": 0.009,
"step": 417
},
{
"clip_ratio": 0.000747511406416379,
"epoch": 0.2753428340951869,
"grad_norm": 0.060899555683135986,
"kl": 0.015888690948486328,
"learning_rate": 4.984451675071247e-06,
"loss": 0.0086,
"step": 418
},
{
"clip_ratio": 0.0007759029947465024,
"epoch": 0.27698177953622966,
"grad_norm": 0.059268273413181305,
"kl": 0.01577615737915039,
"learning_rate": 4.983848563388559e-06,
"loss": 0.0082,
"step": 419
},
{
"clip_ratio": 0.0007596586527824911,
"epoch": 0.2786207249772724,
"grad_norm": 0.05496392399072647,
"kl": 0.016138076782226562,
"learning_rate": 4.983234013904791e-06,
"loss": 0.0078,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 575.2080256938934,
"epoch": 0.28025967041831523,
"grad_norm": 0.06906809657812119,
"kl": 0.016767501831054688,
"learning_rate": 4.9826080294498615e-06,
"loss": 0.0087,
"num_tokens": 114572748.0,
"reward": 0.2369791732635349,
"reward_std": 0.12557925086002797,
"rewards/pure_accuracy_reward_math": 0.2369791673263535,
"step": 421
},
{
"clip_ratio": 0.0006102707594664025,
"epoch": 0.281898615859358,
"grad_norm": 0.06671704351902008,
"kl": 0.01644277572631836,
"learning_rate": 4.98197061290635e-06,
"loss": 0.0085,
"step": 422
},
{
"clip_ratio": 0.0008330631824264856,
"epoch": 0.28353756130040075,
"grad_norm": 0.06136437505483627,
"kl": 0.016006946563720703,
"learning_rate": 4.981321767209477e-06,
"loss": 0.0082,
"step": 423
},
{
"clip_ratio": 0.0008570863296881726,
"epoch": 0.28517650674144357,
"grad_norm": 0.05813751742243767,
"kl": 0.015911102294921875,
"learning_rate": 4.980661495347092e-06,
"loss": 0.0078,
"step": 424
},
{
"clip_ratio": 0.0007138608199284135,
"epoch": 0.28681545218248633,
"grad_norm": 0.055987462401390076,
"kl": 0.016295433044433594,
"learning_rate": 4.979989800359661e-06,
"loss": 0.0074,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 581.0205252170563,
"epoch": 0.2884543976235291,
"grad_norm": 0.07705598324537277,
"kl": 0.016697406768798828,
"learning_rate": 4.9793066853402535e-06,
"loss": 0.0104,
"num_tokens": 117848783.0,
"reward": 0.2561849042249378,
"reward_std": 0.13690236682305112,
"rewards/pure_accuracy_reward_math": 0.2561848958430346,
"step": 426
},
{
"clip_ratio": 0.0006357220343033987,
"epoch": 0.2900933430645719,
"grad_norm": 0.06893625855445862,
"kl": 0.016409873962402344,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0102,
"step": 427
},
{
"clip_ratio": 0.0008826974139992672,
"epoch": 0.29173228850561467,
"grad_norm": 0.06487911939620972,
"kl": 0.01578235626220703,
"learning_rate": 4.977906207840708e-06,
"loss": 0.0099,
"step": 428
},
{
"clip_ratio": 0.0009138368169487876,
"epoch": 0.29337123394665743,
"grad_norm": 0.05983469635248184,
"kl": 0.015604972839355469,
"learning_rate": 4.9771888518095855e-06,
"loss": 0.0094,
"step": 429
},
{
"clip_ratio": 0.0008240164653443571,
"epoch": 0.29501017938770024,
"grad_norm": 0.05934643745422363,
"kl": 0.016060352325439453,
"learning_rate": 4.976460088644493e-06,
"loss": 0.009,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 572.7008640766144,
"epoch": 0.296649124828743,
"grad_norm": 0.10878130793571472,
"kl": 0.01746988296508789,
"learning_rate": 4.9757199217012884e-06,
"loss": 0.012,
"num_tokens": 121093860.0,
"reward": 0.25097657056176104,
"reward_std": 0.13550679641775787,
"rewards/pure_accuracy_reward_math": 0.25097656264551915,
"step": 431
},
{
"clip_ratio": 0.00068632745751529,
"epoch": 0.29828807026978577,
"grad_norm": 0.0702020674943924,
"kl": 0.017047405242919922,
"learning_rate": 4.974968354388346e-06,
"loss": 0.0118,
"step": 432
},
{
"clip_ratio": 0.0008000754407930799,
"epoch": 0.2999270157108286,
"grad_norm": 0.06406186521053314,
"kl": 0.016495227813720703,
"learning_rate": 4.974205390166535e-06,
"loss": 0.0115,
"step": 433
},
{
"clip_ratio": 0.0008013431938707072,
"epoch": 0.30156596115187134,
"grad_norm": 4.406322956085205,
"kl": 0.020737171173095703,
"learning_rate": 4.973431032549207e-06,
"loss": 0.0112,
"step": 434
},
{
"clip_ratio": 0.0010278673380526016,
"epoch": 0.3032049065929141,
"grad_norm": 0.07802355289459229,
"kl": 0.016713619232177734,
"learning_rate": 4.9726452851021804e-06,
"loss": 0.0107,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 571.912127494812,
"epoch": 0.3048438520339569,
"grad_norm": 0.07019820809364319,
"kl": 0.01775836944580078,
"learning_rate": 4.971848151443718e-06,
"loss": 0.0087,
"num_tokens": 124344430.0,
"reward": 0.234049486432923,
"reward_std": 0.11882065865211189,
"rewards/pure_accuracy_reward_math": 0.2340494789823424,
"step": 436
},
{
"clip_ratio": 0.0009248623491657781,
"epoch": 0.3064827974749997,
"grad_norm": 0.07924344390630722,
"kl": 0.017708301544189453,
"learning_rate": 4.9710396352445175e-06,
"loss": 0.0086,
"step": 437
},
{
"clip_ratio": 0.0008092627095379612,
"epoch": 0.30812174291604244,
"grad_norm": 0.06455735862255096,
"kl": 0.01685619354248047,
"learning_rate": 4.970219740227693e-06,
"loss": 0.0082,
"step": 438
},
{
"clip_ratio": 0.0008741315057250176,
"epoch": 0.30976068835708526,
"grad_norm": 0.0737844780087471,
"kl": 0.016612529754638672,
"learning_rate": 4.969388470168754e-06,
"loss": 0.0078,
"step": 439
},
{
"clip_ratio": 0.0006731455727049251,
"epoch": 0.311399633798128,
"grad_norm": 0.061306241899728775,
"kl": 0.01703643798828125,
"learning_rate": 4.96854582889559e-06,
"loss": 0.0074,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 560.4661660194397,
"epoch": 0.3130385792391708,
"grad_norm": 0.07420651614665985,
"kl": 0.018575191497802734,
"learning_rate": 4.967691820288457e-06,
"loss": 0.0089,
"num_tokens": 127553878.0,
"reward": 0.24218750634463504,
"reward_std": 0.1318487230455503,
"rewards/pure_accuracy_reward_math": 0.24218749982537702,
"step": 441
},
{
"clip_ratio": 0.0007318889370253601,
"epoch": 0.3146775246802136,
"grad_norm": 0.0815000906586647,
"kl": 0.018791675567626953,
"learning_rate": 4.9668264482799535e-06,
"loss": 0.0087,
"step": 442
},
{
"clip_ratio": 0.0007262505477001469,
"epoch": 0.31631647012125635,
"grad_norm": 0.06461174786090851,
"kl": 0.01784658432006836,
"learning_rate": 4.965949716855006e-06,
"loss": 0.0083,
"step": 443
},
{
"clip_ratio": 0.001082696728190058,
"epoch": 0.3179554155622991,
"grad_norm": 0.0798153281211853,
"kl": 0.017561912536621094,
"learning_rate": 4.965061630050848e-06,
"loss": 0.0079,
"step": 444
},
{
"clip_ratio": 0.0007142422628589884,
"epoch": 0.31959436100334193,
"grad_norm": 0.05629098415374756,
"kl": 0.018102645874023438,
"learning_rate": 4.9641621919570045e-06,
"loss": 0.0074,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 573.8131718635559,
"epoch": 0.3212333064443847,
"grad_norm": 0.07845748215913773,
"kl": 0.019147872924804688,
"learning_rate": 4.963251406715272e-06,
"loss": 0.0121,
"num_tokens": 130808444.0,
"reward": 0.23372396570630372,
"reward_std": 0.1444127168506384,
"rewards/pure_accuracy_reward_math": 0.23372395883779973,
"step": 446
},
{
"clip_ratio": 0.0008133034913271331,
"epoch": 0.32287225188542745,
"grad_norm": 0.0820281058549881,
"kl": 0.018957138061523438,
"learning_rate": 4.9623292785197e-06,
"loss": 0.012,
"step": 447
},
{
"clip_ratio": 0.0009565782518166088,
"epoch": 0.32451119732647027,
"grad_norm": 0.06929846853017807,
"kl": 0.01794910430908203,
"learning_rate": 4.961395811616567e-06,
"loss": 0.0115,
"step": 448
},
{
"clip_ratio": 0.0012002166286038118,
"epoch": 0.32615014276751303,
"grad_norm": 0.08353662490844727,
"kl": 0.017772197723388672,
"learning_rate": 4.960451010304368e-06,
"loss": 0.0111,
"step": 449
},
{
"clip_ratio": 0.0008637306386845012,
"epoch": 0.3277890882085558,
"grad_norm": 0.059568535536527634,
"kl": 0.01876544952392578,
"learning_rate": 4.959494878933792e-06,
"loss": 0.0105,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 562.3017749786377,
"epoch": 0.3294280336495986,
"grad_norm": 0.08181121200323105,
"kl": 0.020171165466308594,
"learning_rate": 4.958527421907697e-06,
"loss": 0.0075,
"num_tokens": 134024963.0,
"reward": 0.2363281317811925,
"reward_std": 0.12392827571602538,
"rewards/pure_accuracy_reward_math": 0.23632812607684173,
"step": 451
},
{
"clip_ratio": 0.0007134260189332053,
"epoch": 0.33106697909064137,
"grad_norm": 0.17417100071907043,
"kl": 0.019116878509521484,
"learning_rate": 4.957548643681102e-06,
"loss": 0.0076,
"step": 452
},
{
"clip_ratio": 0.000948693925124644,
"epoch": 0.3327059245316841,
"grad_norm": 10.566765785217285,
"kl": 0.22874164581298828,
"learning_rate": 4.95655854876115e-06,
"loss": 0.0154,
"step": 453
},
{
"clip_ratio": 0.001509593688069799,
"epoch": 0.33434486997272694,
"grad_norm": 0.37215539813041687,
"kl": 0.024587154388427734,
"learning_rate": 4.955557141707102e-06,
"loss": 0.0071,
"step": 454
},
{
"clip_ratio": 0.0016867685367287777,
"epoch": 0.3359838154137697,
"grad_norm": 0.11960741132497787,
"kl": 0.018782615661621094,
"learning_rate": 4.954544427130308e-06,
"loss": 0.0071,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 559.3082876205444,
"epoch": 0.33762276085481246,
"grad_norm": 0.07540658116340637,
"kl": 0.018846988677978516,
"learning_rate": 4.953520409694186e-06,
"loss": 0.0064,
"num_tokens": 137230282.0,
"reward": 0.23795573617098853,
"reward_std": 0.12567996798316017,
"rewards/pure_accuracy_reward_math": 0.2379557301173918,
"step": 456
},
{
"clip_ratio": 0.0007552293965318313,
"epoch": 0.3392617062958553,
"grad_norm": 0.08643142879009247,
"kl": 0.019680500030517578,
"learning_rate": 4.9524850941142045e-06,
"loss": 0.0063,
"step": 457
},
{
"clip_ratio": 0.0008370072589514166,
"epoch": 0.34090065173689804,
"grad_norm": 0.07104479521512985,
"kl": 0.01837015151977539,
"learning_rate": 4.951438485157858e-06,
"loss": 0.0059,
"step": 458
},
{
"clip_ratio": 0.0012107024614920192,
"epoch": 0.3425395971779408,
"grad_norm": 0.07779641449451447,
"kl": 0.017088890075683594,
"learning_rate": 4.950380587644645e-06,
"loss": 0.0055,
"step": 459
},
{
"clip_ratio": 0.001106619913571194,
"epoch": 0.3441785426189836,
"grad_norm": 0.07811883091926575,
"kl": 0.017291545867919922,
"learning_rate": 4.949311406446047e-06,
"loss": 0.005,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 571.5631704330444,
"epoch": 0.3458174880600264,
"grad_norm": 0.07503899931907654,
"kl": 0.018761634826660156,
"learning_rate": 4.948230946485504e-06,
"loss": 0.0099,
"num_tokens": 140468056.0,
"reward": 0.23144531933940016,
"reward_std": 0.1391576409805566,
"rewards/pure_accuracy_reward_math": 0.2314453127037268,
"step": 461
},
{
"clip_ratio": 0.0005579163533582232,
"epoch": 0.34745643350106914,
"grad_norm": 0.07867737859487534,
"kl": 0.019515037536621094,
"learning_rate": 4.947139212738395e-06,
"loss": 0.0097,
"step": 462
},
{
"clip_ratio": 0.0005302657258994259,
"epoch": 0.34909537894211196,
"grad_norm": 0.06822054833173752,
"kl": 0.018963336944580078,
"learning_rate": 4.946036210232013e-06,
"loss": 0.0093,
"step": 463
},
{
"clip_ratio": 0.0007419603928155993,
"epoch": 0.3507343243831547,
"grad_norm": 0.06452897191047668,
"kl": 0.017910480499267578,
"learning_rate": 4.9449219440455406e-06,
"loss": 0.0089,
"step": 464
},
{
"clip_ratio": 0.0009656345523580967,
"epoch": 0.3523732698241975,
"grad_norm": 0.06394355744123459,
"kl": 0.017592430114746094,
"learning_rate": 4.94379641931003e-06,
"loss": 0.0084,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 550.8063335418701,
"epoch": 0.3540122152652403,
"grad_norm": 0.07220590114593506,
"kl": 0.019676685333251953,
"learning_rate": 4.9426596412083775e-06,
"loss": 0.0073,
"num_tokens": 143643633.0,
"reward": 0.2643229244858958,
"reward_std": 0.129740908567328,
"rewards/pure_accuracy_reward_math": 0.26432291738456115,
"step": 466
},
{
"clip_ratio": 0.0004138159448530132,
"epoch": 0.35565116070628305,
"grad_norm": 0.06854696571826935,
"kl": 0.019529342651367188,
"learning_rate": 4.9415116149752975e-06,
"loss": 0.0071,
"step": 467
},
{
"clip_ratio": 0.0006204222647170354,
"epoch": 0.3572901061473258,
"grad_norm": 0.0629592314362526,
"kl": 0.01886892318725586,
"learning_rate": 4.940352345897304e-06,
"loss": 0.0068,
"step": 468
},
{
"clip_ratio": 0.0008853772396264503,
"epoch": 0.35892905158836863,
"grad_norm": 0.07459286600351334,
"kl": 0.018596172332763672,
"learning_rate": 4.93918183931268e-06,
"loss": 0.0064,
"step": 469
},
{
"clip_ratio": 0.0006601580334404389,
"epoch": 0.3605679970294114,
"grad_norm": 0.06547638773918152,
"kl": 0.019172191619873047,
"learning_rate": 4.938000100611456e-06,
"loss": 0.0059,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 563.5420064926147,
"epoch": 0.36220694247045415,
"grad_norm": 0.07862798869609833,
"kl": 0.019073963165283203,
"learning_rate": 4.936807135235389e-06,
"loss": 0.0082,
"num_tokens": 146856798.0,
"reward": 0.24414063114090823,
"reward_std": 0.13599591748788953,
"rewards/pure_accuracy_reward_math": 0.2441406262514647,
"step": 471
},
{
"clip_ratio": 0.000502235996748368,
"epoch": 0.36384588791149697,
"grad_norm": 0.07364361733198166,
"kl": 0.018817424774169922,
"learning_rate": 4.935602948677925e-06,
"loss": 0.008,
"step": 472
},
{
"clip_ratio": 0.0008022733071584298,
"epoch": 0.36548483335253973,
"grad_norm": 0.0682106539607048,
"kl": 0.018253803253173828,
"learning_rate": 4.934387546484192e-06,
"loss": 0.0076,
"step": 473
},
{
"clip_ratio": 0.0009958607009821208,
"epoch": 0.3671237787935825,
"grad_norm": 0.06967198103666306,
"kl": 0.01818084716796875,
"learning_rate": 4.933160934250957e-06,
"loss": 0.0072,
"step": 474
},
{
"clip_ratio": 0.0007515698841871199,
"epoch": 0.3687627242346253,
"grad_norm": 0.05721515789628029,
"kl": 0.018873214721679688,
"learning_rate": 4.931923117626611e-06,
"loss": 0.0067,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 563.7535984516144,
"epoch": 0.37040166967566807,
"grad_norm": 0.07566659152507782,
"kl": 0.019544124603271484,
"learning_rate": 4.93067410231114e-06,
"loss": 0.0064,
"num_tokens": 150075101.0,
"reward": 0.25325521553168073,
"reward_std": 0.13043869246030226,
"rewards/pure_accuracy_reward_math": 0.25325520912883803,
"step": 476
},
{
"clip_ratio": 0.0004966380013229355,
"epoch": 0.3720406151167108,
"grad_norm": 0.0666096955537796,
"kl": 0.019169330596923828,
"learning_rate": 4.929413894056098e-06,
"loss": 0.0062,
"step": 477
},
{
"clip_ratio": 0.0010111166515116565,
"epoch": 0.37367956055775364,
"grad_norm": 0.07089894264936447,
"kl": 0.01856708526611328,
"learning_rate": 4.928142498664579e-06,
"loss": 0.0059,
"step": 478
},
{
"clip_ratio": 0.0009684021274551924,
"epoch": 0.3753185059987964,
"grad_norm": 0.07048792392015457,
"kl": 0.0184478759765625,
"learning_rate": 4.926859921991196e-06,
"loss": 0.0054,
"step": 479
},
{
"clip_ratio": 0.0007222936578727968,
"epoch": 0.37695745143983916,
"grad_norm": 0.0751122385263443,
"kl": 0.01910114288330078,
"learning_rate": 4.925566169942048e-06,
"loss": 0.0049,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 565.988950252533,
"epoch": 0.378596396880882,
"grad_norm": 0.07866821438074112,
"kl": 0.019292354583740234,
"learning_rate": 4.924261248474696e-06,
"loss": 0.0077,
"num_tokens": 153300683.0,
"reward": 0.24186198544339277,
"reward_std": 0.14245959254913032,
"rewards/pure_accuracy_reward_math": 0.2418619791569654,
"step": 481
},
{
"clip_ratio": 0.0005219346817284531,
"epoch": 0.38023534232192474,
"grad_norm": 0.07202895730733871,
"kl": 0.018962383270263672,
"learning_rate": 4.922945163598134e-06,
"loss": 0.0074,
"step": 482
},
{
"clip_ratio": 0.0007680588737457583,
"epoch": 0.3818742877629675,
"grad_norm": 0.06937456876039505,
"kl": 0.018546104431152344,
"learning_rate": 4.921617921372764e-06,
"loss": 0.0071,
"step": 483
},
{
"clip_ratio": 0.0008267570608495589,
"epoch": 0.3835132332040103,
"grad_norm": 0.06490996479988098,
"kl": 0.018600940704345703,
"learning_rate": 4.920279527910361e-06,
"loss": 0.0066,
"step": 484
},
{
"clip_ratio": 0.0007928038041882246,
"epoch": 0.3851521786450531,
"grad_norm": 0.06093154847621918,
"kl": 0.019063949584960938,
"learning_rate": 4.918929989374057e-06,
"loss": 0.006,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 559.4954574108124,
"epoch": 0.38679112408609584,
"grad_norm": 0.07598863542079926,
"kl": 0.020077228546142578,
"learning_rate": 4.917569311978301e-06,
"loss": 0.0076,
"num_tokens": 156505589.0,
"reward": 0.23372396451304667,
"reward_std": 0.12227730004815385,
"rewards/pure_accuracy_reward_math": 0.23372395892511122,
"step": 486
},
{
"clip_ratio": 0.0005095607535281488,
"epoch": 0.38843006952713865,
"grad_norm": 0.06875687837600708,
"kl": 0.019557952880859375,
"learning_rate": 4.916197501988836e-06,
"loss": 0.0073,
"step": 487
},
{
"clip_ratio": 0.0009137496642779297,
"epoch": 0.3900690149681814,
"grad_norm": 0.0720105841755867,
"kl": 0.018970012664794922,
"learning_rate": 4.914814565722671e-06,
"loss": 0.007,
"step": 488
},
{
"clip_ratio": 0.000769516089917488,
"epoch": 0.3917079604092242,
"grad_norm": 0.06368213146924973,
"kl": 0.019055843353271484,
"learning_rate": 4.913420509548047e-06,
"loss": 0.0065,
"step": 489
},
{
"clip_ratio": 0.000659781087506417,
"epoch": 0.393346905850267,
"grad_norm": 0.06218770891427994,
"kl": 0.019764423370361328,
"learning_rate": 4.912015339884412e-06,
"loss": 0.006,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 563.5508005619049,
"epoch": 0.39498585129130975,
"grad_norm": 0.07790997624397278,
"kl": 0.019940853118896484,
"learning_rate": 4.910599063202391e-06,
"loss": 0.0037,
"num_tokens": 159721585.0,
"reward": 0.25651042349636555,
"reward_std": 0.14637307275552303,
"rewards/pure_accuracy_reward_math": 0.25651041651144624,
"step": 491
},
{
"clip_ratio": 0.00052815272999851,
"epoch": 0.3966247967323525,
"grad_norm": 0.06944292038679123,
"kl": 0.019421100616455078,
"learning_rate": 4.9091716860237545e-06,
"loss": 0.0035,
"step": 492
},
{
"clip_ratio": 0.0008616803660288497,
"epoch": 0.39826374217339533,
"grad_norm": 0.0701906755566597,
"kl": 0.018842220306396484,
"learning_rate": 4.907733214921391e-06,
"loss": 0.0031,
"step": 493
},
{
"clip_ratio": 0.0009441319247116553,
"epoch": 0.3999026876144381,
"grad_norm": 0.06845781207084656,
"kl": 0.018959999084472656,
"learning_rate": 4.906283656519271e-06,
"loss": 0.0026,
"step": 494
},
{
"clip_ratio": 0.0007721009840224724,
"epoch": 0.40154163305548085,
"grad_norm": 0.06344389170408249,
"kl": 0.019802570343017578,
"learning_rate": 4.904823017492425e-06,
"loss": 0.002,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 564.9492375850677,
"epoch": 0.40318057849652367,
"grad_norm": 0.07819291949272156,
"kl": 0.019987106323242188,
"learning_rate": 4.903351304566907e-06,
"loss": 0.0087,
"num_tokens": 162944373.0,
"reward": 0.2613932383537758,
"reward_std": 0.14731903292704374,
"rewards/pure_accuracy_reward_math": 0.2613932291569654,
"step": 496
},
{
"clip_ratio": 0.0005454795893342634,
"epoch": 0.40481952393756643,
"grad_norm": 0.0714847669005394,
"kl": 0.019627094268798828,
"learning_rate": 4.9018685245197625e-06,
"loss": 0.0084,
"step": 497
},
{
"clip_ratio": 0.0009138085124504869,
"epoch": 0.4064584693786092,
"grad_norm": 0.07778745144605637,
"kl": 0.019023895263671875,
"learning_rate": 4.900374684179005e-06,
"loss": 0.008,
"step": 498
},
{
"clip_ratio": 0.0008562761545363173,
"epoch": 0.408097414819652,
"grad_norm": 0.07164430618286133,
"kl": 0.01940298080444336,
"learning_rate": 4.898869790423573e-06,
"loss": 0.0075,
"step": 499
},
{
"clip_ratio": 0.0008455659439050578,
"epoch": 0.40973636026069477,
"grad_norm": 0.07439333200454712,
"kl": 0.01979207992553711,
"learning_rate": 4.897353850183308e-06,
"loss": 0.007,
"step": 500
},
{
"epoch": 0.40973636026069477,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 6.6841,
"train_samples_per_second": 42069.596,
"train_steps_per_second": 54.756
}
],
"logging_steps": 1,
"max_steps": 366,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}