{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.40973636026069477,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 606.2532775402069,
"epoch": 0.001638945441042779,
"grad_norm": 0.05986390635371208,
"kl": 0.0,
"learning_rate": 1.3661202185792351e-08,
"loss": 0.0057,
"num_tokens": 3348938.0,
"reward": 0.1718750048603397,
"reward_std": 0.09577879420248792,
"rewards/pure_accuracy_reward_math": 0.17187500078580342,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.003277890882085558,
"grad_norm": 0.05986390635371208,
"kl": 0.0,
"learning_rate": 2.7322404371584703e-08,
"loss": 0.0057,
"step": 2
},
{
"clip_ratio": 0.0006339755559565674,
"epoch": 0.004916836323128337,
"grad_norm": 0.05929790809750557,
"kl": 0.0005019009113311768,
"learning_rate": 4.098360655737705e-08,
"loss": 0.0057,
"step": 3
},
{
"clip_ratio": 0.0006407226928217824,
"epoch": 0.006555781764171116,
"grad_norm": 0.059925854206085205,
"kl": 0.0005110502243041992,
"learning_rate": 5.4644808743169406e-08,
"loss": 0.0057,
"step": 4
},
{
"clip_ratio": 0.0006387700201457847,
"epoch": 0.008194727205213895,
"grad_norm": 0.05939409136772156,
"kl": 0.0005159676074981689,
"learning_rate": 6.830601092896175e-08,
"loss": 0.0057,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 612.4726753234863,
"epoch": 0.009833672646256675,
"grad_norm": 0.072689488530159,
"kl": 0.000512346625328064,
"learning_rate": 8.19672131147541e-08,
"loss": 0.0067,
"num_tokens": 6714854.0,
"reward": 0.16438802544143982,
"reward_std": 0.11541076033608988,
"rewards/pure_accuracy_reward_math": 0.16438802113407291,
"step": 6
},
{
"clip_ratio": 0.0007277115302031234,
"epoch": 0.011472618087299453,
"grad_norm": 0.07328997552394867,
"kl": 0.0005197674036026001,
"learning_rate": 9.562841530054645e-08,
"loss": 0.0068,
"step": 7
},
{
"clip_ratio": 0.0007614574305989663,
"epoch": 0.013111563528342233,
"grad_norm": 0.07325445115566254,
"kl": 0.0005202591419219971,
"learning_rate": 1.0928961748633881e-07,
"loss": 0.0068,
"step": 8
},
{
"clip_ratio": 0.0007783421593785533,
"epoch": 0.01475050896938501,
"grad_norm": 0.07128091156482697,
"kl": 0.000517427921295166,
"learning_rate": 1.2295081967213116e-07,
"loss": 0.0068,
"step": 9
},
{
"clip_ratio": 0.0007585194575767673,
"epoch": 0.01638945441042779,
"grad_norm": 0.07174714654684067,
"kl": 0.0005128979682922363,
"learning_rate": 1.366120218579235e-07,
"loss": 0.0068,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 609.4596562385559,
"epoch": 0.018028399851470568,
"grad_norm": 0.060616616159677505,
"kl": 0.0005253106355667114,
"learning_rate": 1.5027322404371585e-07,
"loss": 0.0052,
"num_tokens": 10075962.0,
"reward": 0.17447917186655104,
"reward_std": 0.09832898661261424,
"rewards/pure_accuracy_reward_math": 0.17447916814126074,
"step": 11
},
{
"clip_ratio": 0.0006354124035397035,
"epoch": 0.01966734529251335,
"grad_norm": 0.05994507297873497,
"kl": 0.0005232691764831543,
"learning_rate": 1.639344262295082e-07,
"loss": 0.0053,
"step": 12
},
{
"clip_ratio": 0.0006359500578128063,
"epoch": 0.021306290733556128,
"grad_norm": 0.060422513633966446,
"kl": 0.0005258470773696899,
"learning_rate": 1.7759562841530054e-07,
"loss": 0.0053,
"step": 13
},
{
"clip_ratio": 0.0006202999380775509,
"epoch": 0.022945236174598906,
"grad_norm": 0.06020491570234299,
"kl": 0.000526919960975647,
"learning_rate": 1.912568306010929e-07,
"loss": 0.0053,
"step": 14
},
{
"clip_ratio": 0.0006456842476154634,
"epoch": 0.024584181615641687,
"grad_norm": 0.06016543507575989,
"kl": 0.0005295425653457642,
"learning_rate": 2.0491803278688524e-07,
"loss": 0.0053,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 613.3219571113586,
"epoch": 0.026223127056684465,
"grad_norm": 0.06271925568580627,
"kl": 0.0005239248275756836,
"learning_rate": 2.1857923497267762e-07,
"loss": 0.0081,
"num_tokens": 13445671.0,
"reward": 0.1438802126212977,
"reward_std": 0.10509481013286859,
"rewards/pure_accuracy_reward_math": 0.1438802084303461,
"step": 16
},
{
"clip_ratio": 0.0007483757581212558,
"epoch": 0.027862072497727243,
"grad_norm": 0.06271728873252869,
"kl": 0.000528186559677124,
"learning_rate": 2.3224043715846998e-07,
"loss": 0.0081,
"step": 17
},
{
"clip_ratio": 0.0006768568357529148,
"epoch": 0.02950101793877002,
"grad_norm": 0.06163553521037102,
"kl": 0.0005240440368652344,
"learning_rate": 2.459016393442623e-07,
"loss": 0.0081,
"step": 18
},
{
"clip_ratio": 0.00073299726238929,
"epoch": 0.031139963379812802,
"grad_norm": 0.062258753925561905,
"kl": 0.000529751181602478,
"learning_rate": 2.595628415300547e-07,
"loss": 0.0081,
"step": 19
},
{
"clip_ratio": 0.0007049078883483162,
"epoch": 0.03277890882085558,
"grad_norm": 0.061678871512413025,
"kl": 0.0005273669958114624,
"learning_rate": 2.73224043715847e-07,
"loss": 0.0081,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 610.4453291893005,
"epoch": 0.03441785426189836,
"grad_norm": 0.06878828257322311,
"kl": 0.0005384832620620728,
"learning_rate": 2.8688524590163937e-07,
"loss": 0.0051,
"num_tokens": 16799755.0,
"reward": 0.15136719125439413,
"reward_std": 0.10323517030337825,
"rewards/pure_accuracy_reward_math": 0.15136718822759576,
"step": 21
},
{
"clip_ratio": 0.0007487565382007233,
"epoch": 0.036056799702941136,
"grad_norm": 0.06954149156808853,
"kl": 0.0005507916212081909,
"learning_rate": 3.005464480874317e-07,
"loss": 0.0051,
"step": 22
},
{
"clip_ratio": 0.0007795561222110337,
"epoch": 0.03769574514398392,
"grad_norm": 0.06806771457195282,
"kl": 0.0005584806203842163,
"learning_rate": 3.142076502732241e-07,
"loss": 0.0051,
"step": 23
},
{
"clip_ratio": 0.0007387081783463145,
"epoch": 0.0393346905850267,
"grad_norm": 0.06814352422952652,
"kl": 0.0005674809217453003,
"learning_rate": 3.278688524590164e-07,
"loss": 0.0051,
"step": 24
},
{
"clip_ratio": 0.0007619177375772779,
"epoch": 0.040973636026069474,
"grad_norm": 0.06729913502931595,
"kl": 0.0005744844675064087,
"learning_rate": 3.415300546448088e-07,
"loss": 0.0051,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 596.7343945503235,
"epoch": 0.042612581467112255,
"grad_norm": 0.06346436589956284,
"kl": 0.0006166845560073853,
"learning_rate": 3.551912568306011e-07,
"loss": 0.0038,
"num_tokens": 20112563.0,
"reward": 0.16373698358074762,
"reward_std": 0.09582553629297763,
"rewards/pure_accuracy_reward_math": 0.163736979739042,
"step": 26
},
{
"clip_ratio": 0.0006948200579017794,
"epoch": 0.04425152690815504,
"grad_norm": 0.06285525858402252,
"kl": 0.0006320923566818237,
"learning_rate": 3.6885245901639347e-07,
"loss": 0.0039,
"step": 27
},
{
"clip_ratio": 0.0006778589096256837,
"epoch": 0.04589047234919781,
"grad_norm": 0.06269308924674988,
"kl": 0.000654950737953186,
"learning_rate": 3.825136612021858e-07,
"loss": 0.0039,
"step": 28
},
{
"clip_ratio": 0.0006392495685076938,
"epoch": 0.04752941779024059,
"grad_norm": 0.06292663514614105,
"kl": 0.0006759315729141235,
"learning_rate": 3.961748633879782e-07,
"loss": 0.0039,
"step": 29
},
{
"clip_ratio": 0.000681757599068078,
"epoch": 0.049168363231283374,
"grad_norm": 0.06097942218184471,
"kl": 0.0007022321224212646,
"learning_rate": 4.0983606557377047e-07,
"loss": 0.0039,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 606.3161022663116,
"epoch": 0.05080730867232615,
"grad_norm": 0.06853298097848892,
"kl": 0.0007193088531494141,
"learning_rate": 4.2349726775956286e-07,
"loss": 0.005,
"num_tokens": 23460802.0,
"reward": 0.15071615006309003,
"reward_std": 0.10664506914326921,
"rewards/pure_accuracy_reward_math": 0.1507161462213844,
"step": 31
},
{
"clip_ratio": 0.0007206783092215119,
"epoch": 0.05244625411336893,
"grad_norm": 0.06669250130653381,
"kl": 0.0007403194904327393,
"learning_rate": 4.3715846994535524e-07,
"loss": 0.005,
"step": 32
},
{
"clip_ratio": 0.0008033858404132843,
"epoch": 0.05408519955441171,
"grad_norm": 0.06685461103916168,
"kl": 0.0007804930210113525,
"learning_rate": 4.508196721311476e-07,
"loss": 0.005,
"step": 33
},
{
"clip_ratio": 0.0007623738173379024,
"epoch": 0.055724144995454486,
"grad_norm": 0.06673412770032883,
"kl": 0.0008253157138824463,
"learning_rate": 4.6448087431693996e-07,
"loss": 0.005,
"step": 34
},
{
"clip_ratio": 0.0007461598812597003,
"epoch": 0.05736309043649727,
"grad_norm": 0.06533104181289673,
"kl": 0.0008644461631774902,
"learning_rate": 4.781420765027322e-07,
"loss": 0.005,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 597.9977385997772,
"epoch": 0.05900203587754004,
"grad_norm": 0.07363387197256088,
"kl": 0.00089988112449646,
"learning_rate": 4.918032786885246e-07,
"loss": 0.0081,
"num_tokens": 26781715.0,
"reward": 0.17936198392999358,
"reward_std": 0.1202162274857983,
"rewards/pure_accuracy_reward_math": 0.179361979739042,
"step": 36
},
{
"clip_ratio": 0.0008816556705255607,
"epoch": 0.06064098131858282,
"grad_norm": 0.06755447387695312,
"kl": 0.0009488761425018311,
"learning_rate": 5.05464480874317e-07,
"loss": 0.0081,
"step": 37
},
{
"clip_ratio": 0.0008573625917165373,
"epoch": 0.062279926759625605,
"grad_norm": 0.06729397177696228,
"kl": 0.0010100901126861572,
"learning_rate": 5.191256830601094e-07,
"loss": 0.0081,
"step": 38
},
{
"clip_ratio": 0.000872175712970602,
"epoch": 0.06391887220066839,
"grad_norm": 0.06972332298755646,
"kl": 0.0010748803615570068,
"learning_rate": 5.327868852459017e-07,
"loss": 0.0081,
"step": 39
},
{
"clip_ratio": 0.000930704369693558,
"epoch": 0.06555781764171116,
"grad_norm": 0.06739407032728195,
"kl": 0.0011384189128875732,
"learning_rate": 5.46448087431694e-07,
"loss": 0.0081,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 606.660826921463,
"epoch": 0.06719676308275394,
"grad_norm": 0.05486822873353958,
"kl": 0.001088649034500122,
"learning_rate": 5.601092896174863e-07,
"loss": 0.0058,
"num_tokens": 30130177.0,
"reward": 0.14843750451109372,
"reward_std": 0.08996616111835465,
"rewards/pure_accuracy_reward_math": 0.14843749973806553,
"step": 41
},
{
"clip_ratio": 0.000587868101206368,
"epoch": 0.06883570852379672,
"grad_norm": 0.053968992084264755,
"kl": 0.0011524856090545654,
"learning_rate": 5.737704918032787e-07,
"loss": 0.0058,
"step": 42
},
{
"clip_ratio": 0.0005904338165692025,
"epoch": 0.0704746539648395,
"grad_norm": 0.05430474132299423,
"kl": 0.0012042820453643799,
"learning_rate": 5.874316939890711e-07,
"loss": 0.0058,
"step": 43
},
{
"clip_ratio": 0.0005757618986308444,
"epoch": 0.07211359940588227,
"grad_norm": 0.05444110184907913,
"kl": 0.0012355148792266846,
"learning_rate": 6.010928961748634e-07,
"loss": 0.0058,
"step": 44
},
{
"clip_ratio": 0.0006261014578967661,
"epoch": 0.07375254484692506,
"grad_norm": 0.054937466979026794,
"kl": 0.0012827813625335693,
"learning_rate": 6.147540983606558e-07,
"loss": 0.0058,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 584.1201364994049,
"epoch": 0.07539149028796784,
"grad_norm": 0.06546950340270996,
"kl": 0.0014158189296722412,
"learning_rate": 6.284153005464482e-07,
"loss": 0.0043,
"num_tokens": 33407966.0,
"reward": 0.1715494836680591,
"reward_std": 0.11086069961311296,
"rewards/pure_accuracy_reward_math": 0.17154948017559946,
"step": 46
},
{
"clip_ratio": 0.0007843592767358132,
"epoch": 0.07703043572901061,
"grad_norm": 0.06173517182469368,
"kl": 0.0014501512050628662,
"learning_rate": 6.420765027322406e-07,
"loss": 0.0043,
"step": 47
},
{
"clip_ratio": 0.0008111927813843067,
"epoch": 0.0786693811700534,
"grad_norm": 0.06110456958413124,
"kl": 0.0014650523662567139,
"learning_rate": 6.557377049180328e-07,
"loss": 0.0043,
"step": 48
},
{
"clip_ratio": 0.0007597751833827715,
"epoch": 0.08030832661109617,
"grad_norm": 0.06199155002832413,
"kl": 0.0015124678611755371,
"learning_rate": 6.693989071038252e-07,
"loss": 0.0043,
"step": 49
},
{
"clip_ratio": 0.0007640034893938719,
"epoch": 0.08194727205213895,
"grad_norm": 0.06190052628517151,
"kl": 0.0015333890914916992,
"learning_rate": 6.830601092896176e-07,
"loss": 0.0043,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 593.0820529460907,
"epoch": 0.08358621749318174,
"grad_norm": 0.06474039703607559,
"kl": 0.0014415383338928223,
"learning_rate": 6.967213114754098e-07,
"loss": 0.0076,
"num_tokens": 36714234.0,
"reward": 0.1923828188155312,
"reward_std": 0.1178674673428759,
"rewards/pure_accuracy_reward_math": 0.1923828122962732,
"step": 51
},
{
"clip_ratio": 0.000813577574433566,
"epoch": 0.08522516293422451,
"grad_norm": 0.06284686177968979,
"kl": 0.001471877098083496,
"learning_rate": 7.103825136612022e-07,
"loss": 0.0077,
"step": 52
},
{
"clip_ratio": 0.0007952848112040556,
"epoch": 0.08686410837526728,
"grad_norm": 0.0626569464802742,
"kl": 0.0014744699001312256,
"learning_rate": 7.240437158469946e-07,
"loss": 0.0076,
"step": 53
},
{
"clip_ratio": 0.000757519129024331,
"epoch": 0.08850305381631007,
"grad_norm": 0.06075895577669144,
"kl": 0.0014587044715881348,
"learning_rate": 7.377049180327869e-07,
"loss": 0.0076,
"step": 54
},
{
"clip_ratio": 0.0008223805086799985,
"epoch": 0.09014199925735285,
"grad_norm": 0.06047751381993294,
"kl": 0.0014570355415344238,
"learning_rate": 7.513661202185793e-07,
"loss": 0.0076,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 595.8216323852539,
"epoch": 0.09178094469839562,
"grad_norm": 0.06398054957389832,
"kl": 0.00144881010055542,
"learning_rate": 7.650273224043716e-07,
"loss": 0.0086,
"num_tokens": 40026830.0,
"reward": 0.20247396413469687,
"reward_std": 0.11906883475603536,
"rewards/pure_accuracy_reward_math": 0.2024739590124227,
"step": 56
},
{
"clip_ratio": 0.00078221041519555,
"epoch": 0.09341989013943841,
"grad_norm": 0.06344633549451828,
"kl": 0.0014292001724243164,
"learning_rate": 7.78688524590164e-07,
"loss": 0.0087,
"step": 57
},
{
"clip_ratio": 0.0008090036571957171,
"epoch": 0.09505883558048119,
"grad_norm": 0.061615679413080215,
"kl": 0.0014474093914031982,
"learning_rate": 7.923497267759564e-07,
"loss": 0.0087,
"step": 58
},
{
"clip_ratio": 0.0008085054041657713,
"epoch": 0.09669778102152396,
"grad_norm": 0.06151620298624039,
"kl": 0.0014512240886688232,
"learning_rate": 8.060109289617488e-07,
"loss": 0.0086,
"step": 59
},
{
"clip_ratio": 0.000824362684852531,
"epoch": 0.09833672646256675,
"grad_norm": 0.06084871292114258,
"kl": 0.0014411509037017822,
"learning_rate": 8.196721311475409e-07,
"loss": 0.0086,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 598.2584838867188,
"epoch": 0.09997567190360952,
"grad_norm": 0.06428408622741699,
"kl": 0.0015523433685302734,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0082,
"num_tokens": 43356588.0,
"reward": 0.18912761070532724,
"reward_std": 0.11445756902685389,
"rewards/pure_accuracy_reward_math": 0.18912760430248454,
"step": 61
},
{
"clip_ratio": 0.0008156659468454563,
"epoch": 0.1016146173446523,
"grad_norm": 0.06184009462594986,
"kl": 0.001552283763885498,
"learning_rate": 8.469945355191257e-07,
"loss": 0.0082,
"step": 62
},
{
"clip_ratio": 0.0008079836062506729,
"epoch": 0.10325356278569509,
"grad_norm": 0.060980089008808136,
"kl": 0.001578688621520996,
"learning_rate": 8.606557377049181e-07,
"loss": 0.0082,
"step": 63
},
{
"clip_ratio": 0.000800917034325721,
"epoch": 0.10489250822673786,
"grad_norm": 0.061832476407289505,
"kl": 0.0016154646873474121,
"learning_rate": 8.743169398907105e-07,
"loss": 0.0082,
"step": 64
},
{
"clip_ratio": 0.0008089348676776353,
"epoch": 0.10653145366778063,
"grad_norm": 0.0595347136259079,
"kl": 0.0017150640487670898,
"learning_rate": 8.879781420765028e-07,
"loss": 0.0081,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 602.9000825881958,
"epoch": 0.10817039910882342,
"grad_norm": 0.06361431628465652,
"kl": 0.001691579818725586,
"learning_rate": 9.016393442622952e-07,
"loss": 0.005,
"num_tokens": 46690213.0,
"reward": 0.18261719372821972,
"reward_std": 0.10855145199457183,
"rewards/pure_accuracy_reward_math": 0.18261718755820766,
"step": 66
},
{
"clip_ratio": 0.0007102687751512349,
"epoch": 0.1098093445498662,
"grad_norm": 0.06422943621873856,
"kl": 0.001762300729751587,
"learning_rate": 9.153005464480875e-07,
"loss": 0.005,
"step": 67
},
{
"clip_ratio": 0.0007208503458286941,
"epoch": 0.11144828999090897,
"grad_norm": 0.062008682638406754,
"kl": 0.0017663836479187012,
"learning_rate": 9.289617486338799e-07,
"loss": 0.005,
"step": 68
},
{
"clip_ratio": 0.0007175619265353816,
"epoch": 0.11308723543195176,
"grad_norm": 0.061343614012002945,
"kl": 0.001800447702407837,
"learning_rate": 9.426229508196721e-07,
"loss": 0.0049,
"step": 69
},
{
"clip_ratio": 0.0007331656333917635,
"epoch": 0.11472618087299453,
"grad_norm": 0.05962536856532097,
"kl": 0.001809924840927124,
"learning_rate": 9.562841530054645e-07,
"loss": 0.0049,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 597.3851110935211,
"epoch": 0.11636512631403731,
"grad_norm": 0.07380504906177521,
"kl": 0.001956164836883545,
"learning_rate": 9.69945355191257e-07,
"loss": 0.0059,
"num_tokens": 50008096.0,
"reward": 0.18815104707027785,
"reward_std": 0.118169616907835,
"rewards/pure_accuracy_reward_math": 0.18815104159875773,
"step": 71
},
{
"clip_ratio": 0.0008107801862706765,
"epoch": 0.11800407175508008,
"grad_norm": 0.06983543187379837,
"kl": 0.0019207000732421875,
"learning_rate": 9.836065573770493e-07,
"loss": 0.0059,
"step": 72
},
{
"clip_ratio": 0.0008206042518850154,
"epoch": 0.11964301719612287,
"grad_norm": 0.06862860172986984,
"kl": 0.001914680004119873,
"learning_rate": 9.972677595628415e-07,
"loss": 0.0059,
"step": 73
},
{
"clip_ratio": 0.0008123442846681428,
"epoch": 0.12128196263716565,
"grad_norm": 0.06780818104743958,
"kl": 0.001929640769958496,
"learning_rate": 1.010928961748634e-06,
"loss": 0.0058,
"step": 74
},
{
"clip_ratio": 0.0008305984221124163,
"epoch": 0.12292090807820842,
"grad_norm": 0.06472048163414001,
"kl": 0.0019400715827941895,
"learning_rate": 1.0245901639344263e-06,
"loss": 0.0058,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 599.2868046760559,
"epoch": 0.12455985351925121,
"grad_norm": 0.06543949246406555,
"kl": 0.0018826127052307129,
"learning_rate": 1.0382513661202188e-06,
"loss": 0.0094,
"num_tokens": 53333081.0,
"reward": 0.19205729727400467,
"reward_std": 0.12739214790053666,
"rewards/pure_accuracy_reward_math": 0.19205729191889986,
"step": 76
},
{
"clip_ratio": 0.0008518788816900269,
"epoch": 0.12619879896029398,
"grad_norm": 0.06384909898042679,
"kl": 0.0019139647483825684,
"learning_rate": 1.051912568306011e-06,
"loss": 0.0094,
"step": 77
},
{
"clip_ratio": 0.0008921786497353423,
"epoch": 0.12783774440133677,
"grad_norm": 0.06342752277851105,
"kl": 0.001939535140991211,
"learning_rate": 1.0655737704918034e-06,
"loss": 0.0094,
"step": 78
},
{
"clip_ratio": 0.0008912816550719072,
"epoch": 0.12947668984237953,
"grad_norm": 0.06367272883653641,
"kl": 0.0019831061363220215,
"learning_rate": 1.0792349726775956e-06,
"loss": 0.0093,
"step": 79
},
{
"clip_ratio": 0.0008512400360132233,
"epoch": 0.13111563528342232,
"grad_norm": 0.062457580119371414,
"kl": 0.0020416975021362305,
"learning_rate": 1.092896174863388e-06,
"loss": 0.0093,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 600.7868010997772,
"epoch": 0.1327545807244651,
"grad_norm": 0.069603331387043,
"kl": 0.0021752119064331055,
"learning_rate": 1.1065573770491804e-06,
"loss": 0.0066,
"num_tokens": 56665038.0,
"reward": 0.19889323483221233,
"reward_std": 0.12287436821497977,
"rewards/pure_accuracy_reward_math": 0.19889323005918413,
"step": 81
},
{
"clip_ratio": 0.0009323512667833711,
"epoch": 0.13439352616550787,
"grad_norm": 0.06583772599697113,
"kl": 0.002191603183746338,
"learning_rate": 1.1202185792349727e-06,
"loss": 0.0066,
"step": 82
},
{
"clip_ratio": 0.0009406020556070871,
"epoch": 0.13603247160655066,
"grad_norm": 0.06439989805221558,
"kl": 0.00225830078125,
"learning_rate": 1.1338797814207652e-06,
"loss": 0.0066,
"step": 83
},
{
"clip_ratio": 0.0009481842756713377,
"epoch": 0.13767141704759345,
"grad_norm": 0.06453175097703934,
"kl": 0.0023380517959594727,
"learning_rate": 1.1475409836065575e-06,
"loss": 0.0065,
"step": 84
},
{
"clip_ratio": 0.00098607516224547,
"epoch": 0.1393103624886362,
"grad_norm": 0.06561443954706192,
"kl": 0.0024124979972839355,
"learning_rate": 1.16120218579235e-06,
"loss": 0.0065,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 592.1575720310211,
"epoch": 0.140949307929679,
"grad_norm": 0.07061439007520676,
"kl": 0.002596259117126465,
"learning_rate": 1.1748633879781422e-06,
"loss": 0.0094,
"num_tokens": 59968450.0,
"reward": 0.2067057350941468,
"reward_std": 0.12307580112246796,
"rewards/pure_accuracy_reward_math": 0.20670572892413475,
"step": 86
},
{
"clip_ratio": 0.0007981168380410963,
"epoch": 0.14258825337072178,
"grad_norm": 0.0682518407702446,
"kl": 0.002633213996887207,
"learning_rate": 1.1885245901639345e-06,
"loss": 0.0094,
"step": 87
},
{
"clip_ratio": 0.0008212789625190453,
"epoch": 0.14422719881176455,
"grad_norm": 0.06932378560304642,
"kl": 0.0026621222496032715,
"learning_rate": 1.2021857923497268e-06,
"loss": 0.0094,
"step": 88
},
{
"clip_ratio": 0.0008140442066633113,
"epoch": 0.14586614425280733,
"grad_norm": 0.06654822826385498,
"kl": 0.002701401710510254,
"learning_rate": 1.215846994535519e-06,
"loss": 0.0093,
"step": 89
},
{
"clip_ratio": 0.0008207391882706361,
"epoch": 0.14750508969385012,
"grad_norm": 0.06492628902196884,
"kl": 0.0028305649757385254,
"learning_rate": 1.2295081967213116e-06,
"loss": 0.0092,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 596.5735874176025,
"epoch": 0.14914403513489288,
"grad_norm": 0.06914262473583221,
"kl": 0.0026297569274902344,
"learning_rate": 1.2431693989071039e-06,
"loss": 0.0089,
"num_tokens": 63290872.0,
"reward": 0.19986979712848552,
"reward_std": 0.12688133475603536,
"rewards/pure_accuracy_reward_math": 0.19986979235545732,
"step": 91
},
{
"clip_ratio": 0.0008256697961996906,
"epoch": 0.15078298057593567,
"grad_norm": 0.06898585706949234,
"kl": 0.002676546573638916,
"learning_rate": 1.2568306010928963e-06,
"loss": 0.009,
"step": 92
},
{
"clip_ratio": 0.0008680100598894569,
"epoch": 0.15242192601697846,
"grad_norm": 0.06637588888406754,
"kl": 0.0026517510414123535,
"learning_rate": 1.2704918032786886e-06,
"loss": 0.0089,
"step": 93
},
{
"clip_ratio": 0.000874812991582985,
"epoch": 0.15406087145802122,
"grad_norm": 0.06262248754501343,
"kl": 0.0026916861534118652,
"learning_rate": 1.2841530054644811e-06,
"loss": 0.0089,
"step": 94
},
{
"clip_ratio": 0.0009557890944051906,
"epoch": 0.155699816899064,
"grad_norm": 0.0627315565943718,
"kl": 0.0027064085006713867,
"learning_rate": 1.2978142076502734e-06,
"loss": 0.0088,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 609.2135615348816,
"epoch": 0.1573387623401068,
"grad_norm": 0.06779834628105164,
"kl": 0.002728700637817383,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.0115,
"num_tokens": 66649152.0,
"reward": 0.20214844364090823,
"reward_std": 0.12382755958242342,
"rewards/pure_accuracy_reward_math": 0.2021484377037268,
"step": 96
},
{
"clip_ratio": 0.0008006048282709344,
"epoch": 0.15897770778114956,
"grad_norm": 0.06556153297424316,
"kl": 0.0027396678924560547,
"learning_rate": 1.3251366120218582e-06,
"loss": 0.0115,
"step": 97
},
{
"clip_ratio": 0.0008465125227985482,
"epoch": 0.16061665322219235,
"grad_norm": 0.06473369896411896,
"kl": 0.0027694106101989746,
"learning_rate": 1.3387978142076505e-06,
"loss": 0.0115,
"step": 98
},
{
"clip_ratio": 0.000838196794347823,
"epoch": 0.16225559866323513,
"grad_norm": 0.06346935033798218,
"kl": 0.002801954746246338,
"learning_rate": 1.352459016393443e-06,
"loss": 0.0114,
"step": 99
},
{
"clip_ratio": 0.000809030144978351,
"epoch": 0.1638945441042779,
"grad_norm": 0.061877407133579254,
"kl": 0.0028792619705200195,
"learning_rate": 1.3661202185792352e-06,
"loss": 0.0113,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 596.9437062740326,
"epoch": 0.16553348954532068,
"grad_norm": 0.06968217343091965,
"kl": 0.003116130828857422,
"learning_rate": 1.3797814207650273e-06,
"loss": 0.0085,
"num_tokens": 69972491.0,
"reward": 0.20605469375732355,
"reward_std": 0.12006876862142235,
"rewards/pure_accuracy_reward_math": 0.20605468933354132,
"step": 101
},
{
"clip_ratio": 0.0008418516855499547,
"epoch": 0.16717243498636347,
"grad_norm": 0.07103519886732101,
"kl": 0.003129124641418457,
"learning_rate": 1.3934426229508196e-06,
"loss": 0.0086,
"step": 102
},
{
"clip_ratio": 0.000812729414064961,
"epoch": 0.16881138042740623,
"grad_norm": 0.06863201409578323,
"kl": 0.0031093955039978027,
"learning_rate": 1.407103825136612e-06,
"loss": 0.0085,
"step": 103
},
{
"clip_ratio": 0.0008020297700568335,
"epoch": 0.17045032586844902,
"grad_norm": 0.06707657128572464,
"kl": 0.003110051155090332,
"learning_rate": 1.4207650273224043e-06,
"loss": 0.0084,
"step": 104
},
{
"clip_ratio": 0.0008383456649880827,
"epoch": 0.1720892713094918,
"grad_norm": 0.06547861546278,
"kl": 0.0031203627586364746,
"learning_rate": 1.4344262295081968e-06,
"loss": 0.0083,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 594.2750873565674,
"epoch": 0.17372821675053457,
"grad_norm": 0.06509453058242798,
"kl": 0.003070056438446045,
"learning_rate": 1.4480874316939891e-06,
"loss": 0.0082,
"num_tokens": 73285196.0,
"reward": 0.21744792378740385,
"reward_std": 0.12147156818537042,
"rewards/pure_accuracy_reward_math": 0.21744791680248454,
"step": 106
},
{
"clip_ratio": 0.0008183834372630372,
"epoch": 0.17536716219157736,
"grad_norm": 0.06286683678627014,
"kl": 0.003090500831604004,
"learning_rate": 1.4617486338797814e-06,
"loss": 0.0082,
"step": 107
},
{
"clip_ratio": 0.0008020995180686441,
"epoch": 0.17700610763262015,
"grad_norm": 0.061473019421100616,
"kl": 0.003094911575317383,
"learning_rate": 1.4754098360655739e-06,
"loss": 0.0082,
"step": 108
},
{
"clip_ratio": 0.0008129155939968769,
"epoch": 0.1786450530736629,
"grad_norm": 0.06097801774740219,
"kl": 0.0031610727310180664,
"learning_rate": 1.4890710382513662e-06,
"loss": 0.0081,
"step": 109
},
{
"clip_ratio": 0.0008801428618880891,
"epoch": 0.1802839985147057,
"grad_norm": 0.06094435974955559,
"kl": 0.003253757953643799,
"learning_rate": 1.5027322404371587e-06,
"loss": 0.008,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 591.4017164707184,
"epoch": 0.18192294395574848,
"grad_norm": 0.07037521153688431,
"kl": 0.0035175085067749023,
"learning_rate": 1.516393442622951e-06,
"loss": 0.0095,
"num_tokens": 76587766.0,
"reward": 0.23372396477498114,
"reward_std": 0.13153934240108356,
"rewards/pure_accuracy_reward_math": 0.2337239591870457,
"step": 111
},
{
"clip_ratio": 0.0009014522622123877,
"epoch": 0.18356188939679124,
"grad_norm": 0.06573645025491714,
"kl": 0.003545045852661133,
"learning_rate": 1.5300546448087432e-06,
"loss": 0.0096,
"step": 112
},
{
"clip_ratio": 0.0009236231287559349,
"epoch": 0.18520083483783403,
"grad_norm": 0.06465188413858414,
"kl": 0.00360715389251709,
"learning_rate": 1.5437158469945357e-06,
"loss": 0.0095,
"step": 113
},
{
"clip_ratio": 0.0009181838793210773,
"epoch": 0.18683978027887682,
"grad_norm": 0.06287030875682831,
"kl": 0.0036890506744384766,
"learning_rate": 1.557377049180328e-06,
"loss": 0.0094,
"step": 114
},
{
"clip_ratio": 0.0008825006539154856,
"epoch": 0.18847872571991958,
"grad_norm": 0.06144850701093674,
"kl": 0.003753662109375,
"learning_rate": 1.5710382513661205e-06,
"loss": 0.0093,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 594.1959838867188,
"epoch": 0.19011767116096237,
"grad_norm": 0.06793060898780823,
"kl": 0.003781437873840332,
"learning_rate": 1.5846994535519128e-06,
"loss": 0.011,
"num_tokens": 79893856.0,
"reward": 0.21386719393194653,
"reward_std": 0.12512964301276952,
"rewards/pure_accuracy_reward_math": 0.2138671872962732,
"step": 116
},
{
"clip_ratio": 0.0008201918570875932,
"epoch": 0.19175661660200516,
"grad_norm": 0.06297077238559723,
"kl": 0.0038176774978637695,
"learning_rate": 1.5983606557377053e-06,
"loss": 0.011,
"step": 117
},
{
"clip_ratio": 0.0007795650292337086,
"epoch": 0.19339556204304792,
"grad_norm": 0.061727218329906464,
"kl": 0.003816843032836914,
"learning_rate": 1.6120218579234975e-06,
"loss": 0.011,
"step": 118
},
{
"clip_ratio": 0.0008128985705297964,
"epoch": 0.1950345074840907,
"grad_norm": 0.05955222249031067,
"kl": 0.003865480422973633,
"learning_rate": 1.6256830601092896e-06,
"loss": 0.0109,
"step": 119
},
{
"clip_ratio": 0.0008892948203538253,
"epoch": 0.1966734529251335,
"grad_norm": 0.05931426212191582,
"kl": 0.003886103630065918,
"learning_rate": 1.6393442622950819e-06,
"loss": 0.0108,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 586.1058125495911,
"epoch": 0.19831239836617626,
"grad_norm": 0.07331722974777222,
"kl": 0.0038805007934570312,
"learning_rate": 1.6530054644808744e-06,
"loss": 0.0069,
"num_tokens": 83181421.0,
"reward": 0.2060546927677933,
"reward_std": 0.12338518165051937,
"rewards/pure_accuracy_reward_math": 0.20605468822759576,
"step": 121
},
{
"clip_ratio": 0.0008778467455385908,
"epoch": 0.19995134380721905,
"grad_norm": 0.06926850229501724,
"kl": 0.0038404464721679688,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0069,
"step": 122
},
{
"clip_ratio": 0.0009394907406203856,
"epoch": 0.20159028924826183,
"grad_norm": 0.06973890960216522,
"kl": 0.003817915916442871,
"learning_rate": 1.6803278688524592e-06,
"loss": 0.0069,
"step": 123
},
{
"clip_ratio": 0.000969218867339805,
"epoch": 0.2032292346893046,
"grad_norm": 0.06822917610406876,
"kl": 0.0038552284240722656,
"learning_rate": 1.6939890710382514e-06,
"loss": 0.0068,
"step": 124
},
{
"clip_ratio": 0.0009342714683953091,
"epoch": 0.20486818013034738,
"grad_norm": 0.06682004034519196,
"kl": 0.003947019577026367,
"learning_rate": 1.7076502732240437e-06,
"loss": 0.0066,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 589.0709819793701,
"epoch": 0.20650712557139017,
"grad_norm": 0.07198912650346756,
"kl": 0.003998160362243652,
"learning_rate": 1.7213114754098362e-06,
"loss": 0.0125,
"num_tokens": 86478355.0,
"reward": 0.22949219273868948,
"reward_std": 0.14352073048939928,
"rewards/pure_accuracy_reward_math": 0.22949218878056854,
"step": 126
},
{
"clip_ratio": 0.0010473157254864418,
"epoch": 0.20814607101243293,
"grad_norm": 0.07177633047103882,
"kl": 0.004043221473693848,
"learning_rate": 1.7349726775956285e-06,
"loss": 0.0125,
"step": 127
},
{
"clip_ratio": 0.0010181566002529507,
"epoch": 0.20978501645347572,
"grad_norm": 0.06755513697862625,
"kl": 0.0041484832763671875,
"learning_rate": 1.748633879781421e-06,
"loss": 0.0124,
"step": 128
},
{
"clip_ratio": 0.0010040162792392948,
"epoch": 0.2114239618945185,
"grad_norm": 0.06670001894235611,
"kl": 0.004278779029846191,
"learning_rate": 1.7622950819672133e-06,
"loss": 0.0123,
"step": 129
},
{
"clip_ratio": 0.0010213782433083907,
"epoch": 0.21306290733556127,
"grad_norm": 0.06810087710618973,
"kl": 0.004375338554382324,
"learning_rate": 1.7759562841530055e-06,
"loss": 0.0121,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 585.0755393505096,
"epoch": 0.21470185277660406,
"grad_norm": 0.07637549191713333,
"kl": 0.004368305206298828,
"learning_rate": 1.789617486338798e-06,
"loss": 0.0075,
"num_tokens": 89764315.0,
"reward": 0.22656250739237294,
"reward_std": 0.14191649784334004,
"rewards/pure_accuracy_reward_math": 0.2265624997089617,
"step": 131
},
{
"clip_ratio": 0.0010003317277096357,
"epoch": 0.21634079821764685,
"grad_norm": 0.07960700243711472,
"kl": 0.0042803287506103516,
"learning_rate": 1.8032786885245903e-06,
"loss": 0.0075,
"step": 132
},
{
"clip_ratio": 0.0010098553934767551,
"epoch": 0.2179797436586896,
"grad_norm": 0.0741487368941307,
"kl": 0.004350185394287109,
"learning_rate": 1.8169398907103828e-06,
"loss": 0.0074,
"step": 133
},
{
"clip_ratio": 0.0010473617899151577,
"epoch": 0.2196186890997324,
"grad_norm": 0.07375472038984299,
"kl": 0.004483342170715332,
"learning_rate": 1.830601092896175e-06,
"loss": 0.0073,
"step": 134
},
{
"clip_ratio": 0.0010608058864818304,
"epoch": 0.22125763454077518,
"grad_norm": 0.06948796659708023,
"kl": 0.004660606384277344,
"learning_rate": 1.8442622950819674e-06,
"loss": 0.0071,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 599.9541211128235,
"epoch": 0.22289657998181794,
"grad_norm": 0.06568682193756104,
"kl": 0.004492521286010742,
"learning_rate": 1.8579234972677599e-06,
"loss": 0.0076,
"num_tokens": 93086726.0,
"reward": 0.20638021410559304,
"reward_std": 0.11796818353468552,
"rewards/pure_accuracy_reward_math": 0.20638020828482695,
"step": 136
},
{
"clip_ratio": 0.0007427198539744495,
"epoch": 0.22453552542286073,
"grad_norm": 0.061326853930950165,
"kl": 0.004569292068481445,
"learning_rate": 1.8715846994535521e-06,
"loss": 0.0076,
"step": 137
},
{
"clip_ratio": 0.0007810102034682131,
"epoch": 0.22617447086390352,
"grad_norm": 0.06033333018422127,
"kl": 0.0046776533126831055,
"learning_rate": 1.8852459016393442e-06,
"loss": 0.0075,
"step": 138
},
{
"clip_ratio": 0.0007891726669413401,
"epoch": 0.22781341630494628,
"grad_norm": 0.057988133281469345,
"kl": 0.004709959030151367,
"learning_rate": 1.8989071038251367e-06,
"loss": 0.0074,
"step": 139
},
{
"clip_ratio": 0.00077407437288457,
"epoch": 0.22945236174598907,
"grad_norm": 0.055629778653383255,
"kl": 0.0047043561935424805,
"learning_rate": 1.912568306010929e-06,
"loss": 0.0073,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 589.5273640155792,
"epoch": 0.23109130718703186,
"grad_norm": 0.08023487031459808,
"kl": 0.004754543304443359,
"learning_rate": 1.9262295081967215e-06,
"loss": 0.0061,
"num_tokens": 96377242.0,
"reward": 0.22167969349538907,
"reward_std": 0.11916955123888329,
"rewards/pure_accuracy_reward_math": 0.2216796882566996,
"step": 141
},
{
"clip_ratio": 0.0008194281034548112,
"epoch": 0.23273025262807462,
"grad_norm": 0.06729461997747421,
"kl": 0.004743695259094238,
"learning_rate": 1.939890710382514e-06,
"loss": 0.0061,
"step": 142
},
{
"clip_ratio": 0.0008319891088035547,
"epoch": 0.2343691980691174,
"grad_norm": 0.0685749426484108,
"kl": 0.004932522773742676,
"learning_rate": 1.953551912568306e-06,
"loss": 0.006,
"step": 143
},
{
"clip_ratio": 0.000810704066566359,
"epoch": 0.23600814351016017,
"grad_norm": 0.0689912959933281,
"kl": 0.005072951316833496,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.0058,
"step": 144
},
{
"clip_ratio": 0.0008489251890750893,
"epoch": 0.23764708895120296,
"grad_norm": 0.06294326484203339,
"kl": 0.005017280578613281,
"learning_rate": 1.980874316939891e-06,
"loss": 0.0057,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 600.1722221374512,
"epoch": 0.23928603439224574,
"grad_norm": 0.06795884668827057,
"kl": 0.005151629447937012,
"learning_rate": 1.994535519125683e-06,
"loss": 0.0096,
"num_tokens": 99709727.0,
"reward": 0.2047526100941468,
"reward_std": 0.1280827015871182,
"rewards/pure_accuracy_reward_math": 0.2047526053211186,
"step": 146
},
{
"clip_ratio": 0.0008647626258380114,
"epoch": 0.2409249798332885,
"grad_norm": 0.06390897184610367,
"kl": 0.0051021575927734375,
"learning_rate": 2.0081967213114756e-06,
"loss": 0.0096,
"step": 147
},
{
"clip_ratio": 0.0009481125781576338,
"epoch": 0.2425639252743313,
"grad_norm": 0.062446512281894684,
"kl": 0.005044102668762207,
"learning_rate": 2.021857923497268e-06,
"loss": 0.0095,
"step": 148
},
{
"clip_ratio": 0.0009975744914072493,
"epoch": 0.24420287071537408,
"grad_norm": 0.06106211990118027,
"kl": 0.00504612922668457,
"learning_rate": 2.03551912568306e-06,
"loss": 0.0093,
"step": 149
},
{
"clip_ratio": 0.0009842645606568112,
"epoch": 0.24584181615641684,
"grad_norm": 0.058460384607315063,
"kl": 0.005153179168701172,
"learning_rate": 2.0491803278688526e-06,
"loss": 0.0092,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 599.0677282810211,
"epoch": 0.24748076159745963,
"grad_norm": 0.08131567388772964,
"kl": 0.005348920822143555,
"learning_rate": 2.062841530054645e-06,
"loss": 0.0096,
"num_tokens": 103039175.0,
"reward": 0.2106119856762234,
"reward_std": 0.14471486618276685,
"rewards/pure_accuracy_reward_math": 0.21061197892413475,
"step": 151
},
{
"clip_ratio": 0.0011485503518997575,
"epoch": 0.24911970703850242,
"grad_norm": 0.0808255672454834,
"kl": 0.0054149627685546875,
"learning_rate": 2.0765027322404376e-06,
"loss": 0.0096,
"step": 152
},
{
"clip_ratio": 0.0011561684764274105,
"epoch": 0.2507586524795452,
"grad_norm": 0.07708927989006042,
"kl": 0.005404829978942871,
"learning_rate": 2.0901639344262297e-06,
"loss": 0.0095,
"step": 153
},
{
"clip_ratio": 0.0011439574755058857,
"epoch": 0.25239759792058797,
"grad_norm": 0.07077940553426743,
"kl": 0.005424022674560547,
"learning_rate": 2.103825136612022e-06,
"loss": 0.0093,
"step": 154
},
{
"clip_ratio": 0.0011864712664646504,
"epoch": 0.25403654336163073,
"grad_norm": 0.07691214233636856,
"kl": 0.005586385726928711,
"learning_rate": 2.1174863387978147e-06,
"loss": 0.0091,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 600.4765803813934,
"epoch": 0.25567548880267355,
"grad_norm": 0.06585235148668289,
"kl": 0.005746960639953613,
"learning_rate": 2.1311475409836067e-06,
"loss": 0.0081,
"num_tokens": 106367483.0,
"reward": 0.2112630266638007,
"reward_std": 0.11601505969883874,
"rewards/pure_accuracy_reward_math": 0.21126302142511122,
"step": 156
},
{
"clip_ratio": 0.0008758294210338136,
"epoch": 0.2573144342437163,
"grad_norm": 0.07339663803577423,
"kl": 0.005802512168884277,
"learning_rate": 2.144808743169399e-06,
"loss": 0.0081,
"step": 157
},
{
"clip_ratio": 0.0008576641474746793,
"epoch": 0.25895337968475907,
"grad_norm": 0.06242053955793381,
"kl": 0.005854010581970215,
"learning_rate": 2.1584699453551913e-06,
"loss": 0.008,
"step": 158
},
{
"clip_ratio": 0.0008841920518989355,
"epoch": 0.2605923251258019,
"grad_norm": 0.06326813995838165,
"kl": 0.0059430599212646484,
"learning_rate": 2.1721311475409838e-06,
"loss": 0.0078,
"step": 159
},
{
"clip_ratio": 0.0009176517396554118,
"epoch": 0.26223127056684464,
"grad_norm": 0.06189825013279915,
"kl": 0.005987405776977539,
"learning_rate": 2.185792349726776e-06,
"loss": 0.0077,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 588.5446150302887,
"epoch": 0.2638702160078874,
"grad_norm": 0.06866484135389328,
"kl": 0.006237506866455078,
"learning_rate": 2.1994535519125683e-06,
"loss": 0.0078,
"num_tokens": 109663204.0,
"reward": 0.21061198558891192,
"reward_std": 0.12186720367753878,
"rewards/pure_accuracy_reward_math": 0.21061197930248454,
"step": 161
},
{
"clip_ratio": 0.0009436907939743833,
"epoch": 0.2655091614489302,
"grad_norm": 0.07263052463531494,
"kl": 0.0061321258544921875,
"learning_rate": 2.213114754098361e-06,
"loss": 0.0078,
"step": 162
},
{
"clip_ratio": 0.0009464566720680523,
"epoch": 0.267148106889973,
"grad_norm": 0.06491200625896454,
"kl": 0.006033658981323242,
"learning_rate": 2.2267759562841533e-06,
"loss": 0.0076,
"step": 163
},
{
"clip_ratio": 0.000968025926908922,
"epoch": 0.26878705233101574,
"grad_norm": 0.06358778476715088,
"kl": 0.006163120269775391,
"learning_rate": 2.2404371584699454e-06,
"loss": 0.0075,
"step": 164
},
{
"clip_ratio": 0.0009389551167942045,
"epoch": 0.27042599777205856,
"grad_norm": 0.06583644449710846,
"kl": 0.006374359130859375,
"learning_rate": 2.254098360655738e-06,
"loss": 0.0073,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 594.6982605457306,
"epoch": 0.2720649432131013,
"grad_norm": 0.07151180505752563,
"kl": 0.006270289421081543,
"learning_rate": 2.2677595628415304e-06,
"loss": 0.0071,
"num_tokens": 112983493.0,
"reward": 0.21158854870009236,
"reward_std": 0.1309350436204113,
"rewards/pure_accuracy_reward_math": 0.21158854171517305,
"step": 166
},
{
"clip_ratio": 0.0010227661889530282,
"epoch": 0.2737038886541441,
"grad_norm": 0.08615773171186447,
"kl": 0.0064040422439575195,
"learning_rate": 2.2814207650273224e-06,
"loss": 0.0071,
"step": 167
},
{
"clip_ratio": 0.0009205440467212611,
"epoch": 0.2753428340951869,
"grad_norm": 0.06637667864561081,
"kl": 0.00621640682220459,
"learning_rate": 2.295081967213115e-06,
"loss": 0.0069,
"step": 168
},
{
"clip_ratio": 0.001003933336846785,
"epoch": 0.27698177953622966,
"grad_norm": 0.076202891767025,
"kl": 0.0062408447265625,
"learning_rate": 2.3087431693989074e-06,
"loss": 0.0068,
"step": 169
},
{
"clip_ratio": 0.0009103266062311377,
"epoch": 0.2786207249772724,
"grad_norm": 0.06154695898294449,
"kl": 0.006373286247253418,
"learning_rate": 2.3224043715847e-06,
"loss": 0.0065,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 584.0752146244049,
"epoch": 0.28025967041831523,
"grad_norm": 0.06730078905820847,
"kl": 0.006638884544372559,
"learning_rate": 2.336065573770492e-06,
"loss": 0.0089,
"num_tokens": 116258180.0,
"reward": 0.220703131693881,
"reward_std": 0.12132410902995616,
"rewards/pure_accuracy_reward_math": 0.2207031263387762,
"step": 171
},
{
"clip_ratio": 0.001059339236917367,
"epoch": 0.281898615859358,
"grad_norm": 0.08054529875516891,
"kl": 0.0067511796951293945,
"learning_rate": 2.3497267759562845e-06,
"loss": 0.0089,
"step": 172
},
{
"clip_ratio": 0.0010770070745707017,
"epoch": 0.28353756130040075,
"grad_norm": 0.06891456246376038,
"kl": 0.006635904312133789,
"learning_rate": 2.363387978142077e-06,
"loss": 0.0088,
"step": 173
},
{
"clip_ratio": 0.0009533684936400277,
"epoch": 0.28517650674144357,
"grad_norm": 0.06477612257003784,
"kl": 0.006537199020385742,
"learning_rate": 2.377049180327869e-06,
"loss": 0.0086,
"step": 174
},
{
"clip_ratio": 0.0008389282212419857,
"epoch": 0.28681545218248633,
"grad_norm": 0.06404498219490051,
"kl": 0.006713271141052246,
"learning_rate": 2.390710382513661e-06,
"loss": 0.0084,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 586.0224783420563,
"epoch": 0.2884543976235291,
"grad_norm": 0.07143088430166245,
"kl": 0.006848454475402832,
"learning_rate": 2.4043715846994536e-06,
"loss": 0.0081,
"num_tokens": 119549581.0,
"reward": 0.23144531846628524,
"reward_std": 0.11726316896965727,
"rewards/pure_accuracy_reward_math": 0.23144531299476512,
"step": 176
},
{
"clip_ratio": 0.0008353526282007806,
"epoch": 0.2900933430645719,
"grad_norm": 0.07284073531627655,
"kl": 0.006837129592895508,
"learning_rate": 2.418032786885246e-06,
"loss": 0.0081,
"step": 177
},
{
"clip_ratio": 0.0008791502111762384,
"epoch": 0.29173228850561467,
"grad_norm": 0.06452663242816925,
"kl": 0.006670117378234863,
"learning_rate": 2.431693989071038e-06,
"loss": 0.008,
"step": 178
},
{
"clip_ratio": 0.0009922128726884694,
"epoch": 0.29337123394665743,
"grad_norm": 0.07056602835655212,
"kl": 0.006812095642089844,
"learning_rate": 2.4453551912568307e-06,
"loss": 0.0078,
"step": 179
},
{
"clip_ratio": 0.0009285092224899927,
"epoch": 0.29501017938770024,
"grad_norm": 0.06236054748296738,
"kl": 0.0068634748458862305,
"learning_rate": 2.459016393442623e-06,
"loss": 0.0075,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 587.9111535549164,
"epoch": 0.296649124828743,
"grad_norm": 0.07245080173015594,
"kl": 0.007324337959289551,
"learning_rate": 2.4726775956284156e-06,
"loss": 0.0063,
"num_tokens": 122841384.0,
"reward": 0.22916667381650768,
"reward_std": 0.12823739141458645,
"rewards/pure_accuracy_reward_math": 0.22916666624951176,
"step": 181
},
{
"clip_ratio": 0.0010925326015467363,
"epoch": 0.29828807026978577,
"grad_norm": 0.08096741139888763,
"kl": 0.007236003875732422,
"learning_rate": 2.4863387978142077e-06,
"loss": 0.0062,
"step": 182
},
{
"clip_ratio": 0.0010355811738236298,
"epoch": 0.2999270157108286,
"grad_norm": 0.06912072002887726,
"kl": 0.007112741470336914,
"learning_rate": 2.5e-06,
"loss": 0.0061,
"step": 183
},
{
"clip_ratio": 0.0009683458151812374,
"epoch": 0.30156596115187134,
"grad_norm": 0.07461241632699966,
"kl": 0.007212400436401367,
"learning_rate": 2.5136612021857927e-06,
"loss": 0.0058,
"step": 184
},
{
"clip_ratio": 0.0009423685739875509,
"epoch": 0.3032049065929141,
"grad_norm": 0.0647897720336914,
"kl": 0.007313847541809082,
"learning_rate": 2.5273224043715848e-06,
"loss": 0.0055,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 583.5849809646606,
"epoch": 0.3048438520339569,
"grad_norm": 0.06853717565536499,
"kl": 0.007729768753051758,
"learning_rate": 2.5409836065573773e-06,
"loss": 0.0077,
"num_tokens": 126127813.0,
"reward": 0.20898438137373887,
"reward_std": 0.11270587670151144,
"rewards/pure_accuracy_reward_math": 0.2089843761350494,
"step": 186
},
{
"clip_ratio": 0.0010363689809764765,
"epoch": 0.3064827974749997,
"grad_norm": 0.07357639819383621,
"kl": 0.007730722427368164,
"learning_rate": 2.5546448087431697e-06,
"loss": 0.0076,
"step": 187
},
{
"clip_ratio": 0.0010397096725682786,
"epoch": 0.30812174291604244,
"grad_norm": 0.06807340681552887,
"kl": 0.007578372955322266,
"learning_rate": 2.5683060109289622e-06,
"loss": 0.0075,
"step": 188
},
{
"clip_ratio": 0.0007689736390830149,
"epoch": 0.30976068835708526,
"grad_norm": 0.06024845689535141,
"kl": 0.007673501968383789,
"learning_rate": 2.5819672131147543e-06,
"loss": 0.0072,
"step": 189
},
{
"clip_ratio": 0.0007949806515625824,
"epoch": 0.311399633798128,
"grad_norm": 0.06614933907985687,
"kl": 0.007935523986816406,
"learning_rate": 2.595628415300547e-06,
"loss": 0.007,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 582.4375193119049,
"epoch": 0.3130385792391708,
"grad_norm": 0.07819633185863495,
"kl": 0.00816202163696289,
"learning_rate": 2.6092896174863393e-06,
"loss": 0.0046,
"num_tokens": 129404757.0,
"reward": 0.23046875756699592,
"reward_std": 0.12788849917706102,
"rewards/pure_accuracy_reward_math": 0.2304687495343387,
"step": 191
},
{
"clip_ratio": 0.0010027453071188575,
"epoch": 0.3146775246802136,
"grad_norm": 0.07076407223939896,
"kl": 0.007757902145385742,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.0045,
"step": 192
},
{
"clip_ratio": 0.0011502429521215163,
"epoch": 0.31631647012125635,
"grad_norm": 0.06905192136764526,
"kl": 0.007544517517089844,
"learning_rate": 2.636612021857924e-06,
"loss": 0.0044,
"step": 193
},
{
"clip_ratio": 0.001169734060454175,
"epoch": 0.3179554155622991,
"grad_norm": 0.07402996718883514,
"kl": 0.007522106170654297,
"learning_rate": 2.6502732240437163e-06,
"loss": 0.0042,
"step": 194
},
{
"clip_ratio": 0.001001289329451538,
"epoch": 0.31959436100334193,
"grad_norm": 0.0615554116666317,
"kl": 0.007868766784667969,
"learning_rate": 2.6639344262295084e-06,
"loss": 0.0039,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 598.659848690033,
"epoch": 0.3212333064443847,
"grad_norm": 0.07155327498912811,
"kl": 0.0077495574951171875,
"learning_rate": 2.677595628415301e-06,
"loss": 0.009,
"num_tokens": 132735652.0,
"reward": 0.21126302669290453,
"reward_std": 0.12602885958040133,
"rewards/pure_accuracy_reward_math": 0.21126302133779973,
"step": 196
},
{
"clip_ratio": 0.0009628182596088664,
"epoch": 0.32287225188542745,
"grad_norm": 0.07324164360761642,
"kl": 0.00766444206237793,
"learning_rate": 2.6912568306010934e-06,
"loss": 0.0089,
"step": 197
},
{
"clip_ratio": 0.001045915161398625,
"epoch": 0.32451119732647027,
"grad_norm": 0.07669375091791153,
"kl": 0.0074596405029296875,
"learning_rate": 2.704918032786886e-06,
"loss": 0.0087,
"step": 198
},
{
"clip_ratio": 0.0009246684501249547,
"epoch": 0.32615014276751303,
"grad_norm": 0.0650852844119072,
"kl": 0.0074880123138427734,
"learning_rate": 2.718579234972678e-06,
"loss": 0.0085,
"step": 199
},
{
"clip_ratio": 0.0009262548958304251,
"epoch": 0.3277890882085558,
"grad_norm": 0.0722322165966034,
"kl": 0.007855653762817383,
"learning_rate": 2.7322404371584705e-06,
"loss": 0.0082,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 591.9775581359863,
"epoch": 0.3294280336495986,
"grad_norm": 0.07280802726745605,
"kl": 0.007916688919067383,
"learning_rate": 2.745901639344263e-06,
"loss": 0.0084,
"num_tokens": 136043335.0,
"reward": 0.2236328196595423,
"reward_std": 0.12912937795044854,
"rewards/pure_accuracy_reward_math": 0.22363281302386895,
"step": 201
},
{
"clip_ratio": 0.0010444082931826415,
"epoch": 0.33106697909064137,
"grad_norm": 0.0775647759437561,
"kl": 0.007770538330078125,
"learning_rate": 2.7595628415300546e-06,
"loss": 0.0083,
"step": 202
},
{
"clip_ratio": 0.0010056693769797675,
"epoch": 0.3327059245316841,
"grad_norm": 0.06984438002109528,
"kl": 0.0076978206634521484,
"learning_rate": 2.773224043715847e-06,
"loss": 0.0081,
"step": 203
},
{
"clip_ratio": 0.0010063842889849184,
"epoch": 0.33434486997272694,
"grad_norm": 0.07507704943418503,
"kl": 0.007877111434936523,
"learning_rate": 2.786885245901639e-06,
"loss": 0.0079,
"step": 204
},
{
"clip_ratio": 0.0010283672744435535,
"epoch": 0.3359838154137697,
"grad_norm": 0.07364527881145477,
"kl": 0.00825810432434082,
"learning_rate": 2.8005464480874316e-06,
"loss": 0.0076,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 588.1670074462891,
"epoch": 0.33762276085481246,
"grad_norm": 0.06861822307109833,
"kl": 0.00839853286743164,
"learning_rate": 2.814207650273224e-06,
"loss": 0.0057,
"num_tokens": 139337308.0,
"reward": 0.2106119850941468,
"reward_std": 0.12027020112145692,
"rewards/pure_accuracy_reward_math": 0.21061198008828796,
"step": 206
},
{
"clip_ratio": 0.0010541207553558252,
"epoch": 0.3392617062958553,
"grad_norm": 0.08106576651334763,
"kl": 0.008537769317626953,
"learning_rate": 2.8278688524590166e-06,
"loss": 0.0057,
"step": 207
},
{
"clip_ratio": 0.0009489937833109252,
"epoch": 0.34090065173689804,
"grad_norm": 0.0691104531288147,
"kl": 0.008366107940673828,
"learning_rate": 2.8415300546448087e-06,
"loss": 0.0054,
"step": 208
},
{
"clip_ratio": 0.0009892520201901789,
"epoch": 0.3425395971779408,
"grad_norm": 0.06807916611433029,
"kl": 0.008470535278320312,
"learning_rate": 2.855191256830601e-06,
"loss": 0.0052,
"step": 209
},
{
"clip_ratio": 0.00096842655295859,
"epoch": 0.3441785426189836,
"grad_norm": 0.0654783844947815,
"kl": 0.008765220642089844,
"learning_rate": 2.8688524590163937e-06,
"loss": 0.0049,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 590.6634306907654,
"epoch": 0.3458174880600264,
"grad_norm": 0.0704723373055458,
"kl": 0.008867502212524414,
"learning_rate": 2.8825136612021857e-06,
"loss": 0.0091,
"num_tokens": 142633758.0,
"reward": 0.21549479861278087,
"reward_std": 0.13379461748991162,
"rewards/pure_accuracy_reward_math": 0.21549479197710752,
"step": 211
},
{
"clip_ratio": 0.0011996210827192044,
"epoch": 0.34745643350106914,
"grad_norm": 0.08370186388492584,
"kl": 0.008816242218017578,
"learning_rate": 2.8961748633879782e-06,
"loss": 0.009,
"step": 212
},
{
"clip_ratio": 0.001070254641945212,
"epoch": 0.34909537894211196,
"grad_norm": 0.06448537111282349,
"kl": 0.008533716201782227,
"learning_rate": 2.9098360655737707e-06,
"loss": 0.0088,
"step": 213
},
{
"clip_ratio": 0.0011582542088603986,
"epoch": 0.3507343243831547,
"grad_norm": 0.07735106348991394,
"kl": 0.008788824081420898,
"learning_rate": 2.923497267759563e-06,
"loss": 0.0085,
"step": 214
},
{
"clip_ratio": 0.0010283683568559354,
"epoch": 0.3523732698241975,
"grad_norm": 0.06124194711446762,
"kl": 0.008962869644165039,
"learning_rate": 2.9371584699453553e-06,
"loss": 0.0082,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 563.4902558326721,
"epoch": 0.3540122152652403,
"grad_norm": 0.07734435796737671,
"kl": 0.009832620620727539,
"learning_rate": 2.9508196721311478e-06,
"loss": 0.0061,
"num_tokens": 145848300.0,
"reward": 0.24381511058891192,
"reward_std": 0.13013654301175848,
"rewards/pure_accuracy_reward_math": 0.2438151046517305,
"step": 216
},
{
"clip_ratio": 0.0012246508512134824,
"epoch": 0.35565116070628305,
"grad_norm": 0.08686057478189468,
"kl": 0.009522438049316406,
"learning_rate": 2.9644808743169403e-06,
"loss": 0.0061,
"step": 217
},
{
"clip_ratio": 0.0011569151299681835,
"epoch": 0.3572901061473258,
"grad_norm": 0.07663314044475555,
"kl": 0.009255170822143555,
"learning_rate": 2.9781420765027323e-06,
"loss": 0.0058,
"step": 218
},
{
"clip_ratio": 0.0010811529527927632,
"epoch": 0.35892905158836863,
"grad_norm": 0.07616522163152695,
"kl": 0.009699821472167969,
"learning_rate": 2.991803278688525e-06,
"loss": 0.0055,
"step": 219
},
{
"clip_ratio": 0.0009544987469780608,
"epoch": 0.3605679970294114,
"grad_norm": 0.07570254802703857,
"kl": 0.010393381118774414,
"learning_rate": 3.0054644808743173e-06,
"loss": 0.0052,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 576.4231963157654,
"epoch": 0.36220694247045415,
"grad_norm": 0.0710562989115715,
"kl": 0.009819984436035156,
"learning_rate": 3.0191256830601094e-06,
"loss": 0.008,
"num_tokens": 149101036.0,
"reward": 0.2226562555297278,
"reward_std": 0.12332397617865354,
"rewards/pure_accuracy_reward_math": 0.22265625168802217,
"step": 221
},
{
"clip_ratio": 0.0012251571324668475,
"epoch": 0.36384588791149697,
"grad_norm": 0.08233921229839325,
"kl": 0.0095062255859375,
"learning_rate": 3.032786885245902e-06,
"loss": 0.0079,
"step": 222
},
{
"clip_ratio": 0.001105058086977806,
"epoch": 0.36548483335253973,
"grad_norm": 0.07291049510240555,
"kl": 0.009292364120483398,
"learning_rate": 3.0464480874316944e-06,
"loss": 0.0076,
"step": 223
},
{
"clip_ratio": 0.0009599913582860609,
"epoch": 0.3671237787935825,
"grad_norm": 0.07015552371740341,
"kl": 0.009765148162841797,
"learning_rate": 3.0601092896174864e-06,
"loss": 0.0073,
"step": 224
},
{
"clip_ratio": 0.0009534798105050868,
"epoch": 0.3687627242346253,
"grad_norm": 0.07405047863721848,
"kl": 0.010376691818237305,
"learning_rate": 3.073770491803279e-06,
"loss": 0.007,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 578.5188989639282,
"epoch": 0.37040166967566807,
"grad_norm": 0.0744408518075943,
"kl": 0.010227680206298828,
"learning_rate": 3.0874316939890714e-06,
"loss": 0.0094,
"num_tokens": 152364698.0,
"reward": 0.23632813163567334,
"reward_std": 0.126384983304888,
"rewards/pure_accuracy_reward_math": 0.236328125,
"step": 226
},
{
"clip_ratio": 0.0011350565871453,
"epoch": 0.3720406151167108,
"grad_norm": 0.09323269873857498,
"kl": 0.009792804718017578,
"learning_rate": 3.101092896174864e-06,
"loss": 0.0094,
"step": 227
},
{
"clip_ratio": 0.0009327100992777559,
"epoch": 0.37367956055775364,
"grad_norm": 0.07071880251169205,
"kl": 0.009824752807617188,
"learning_rate": 3.114754098360656e-06,
"loss": 0.0091,
"step": 228
},
{
"clip_ratio": 0.0010184358247897762,
"epoch": 0.3753185059987964,
"grad_norm": 0.07402479648590088,
"kl": 0.010513544082641602,
"learning_rate": 3.1284153005464485e-06,
"loss": 0.0088,
"step": 229
},
{
"clip_ratio": 0.0010424532179058588,
"epoch": 0.37695745143983916,
"grad_norm": 0.07837292551994324,
"kl": 0.010687112808227539,
"learning_rate": 3.142076502732241e-06,
"loss": 0.0085,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 583.3717656135559,
"epoch": 0.378596396880882,
"grad_norm": 0.07237120717763901,
"kl": 0.010539531707763672,
"learning_rate": 3.155737704918033e-06,
"loss": 0.0093,
"num_tokens": 155643680.0,
"reward": 0.22591146369813941,
"reward_std": 0.13715054193744436,
"rewards/pure_accuracy_reward_math": 0.22591145869228058,
"step": 231
},
{
"clip_ratio": 0.001438524371224048,
"epoch": 0.38023534232192474,
"grad_norm": 0.45248183608055115,
"kl": 0.011214733123779297,
"learning_rate": 3.1693989071038255e-06,
"loss": 0.0093,
"step": 232
},
{
"clip_ratio": 0.001912088545395818,
"epoch": 0.3818742877629675,
"grad_norm": 0.11236479133367538,
"kl": 0.009836912155151367,
"learning_rate": 3.183060109289618e-06,
"loss": 0.0094,
"step": 233
},
{
"clip_ratio": 0.0011414756233989465,
"epoch": 0.3835132332040103,
"grad_norm": 0.07030442357063293,
"kl": 0.010227203369140625,
"learning_rate": 3.1967213114754105e-06,
"loss": 0.009,
"step": 234
},
{
"clip_ratio": 0.0015166988691817096,
"epoch": 0.3851521786450531,
"grad_norm": 0.10437261313199997,
"kl": 0.011615991592407227,
"learning_rate": 3.2103825136612026e-06,
"loss": 0.0088,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 571.2672717571259,
"epoch": 0.38679112408609584,
"grad_norm": 0.0721583217382431,
"kl": 0.011221885681152344,
"learning_rate": 3.224043715846995e-06,
"loss": 0.0093,
"num_tokens": 158884749.0,
"reward": 0.21223958939663135,
"reward_std": 0.12057235097745433,
"rewards/pure_accuracy_reward_math": 0.2122395838086959,
"step": 236
},
{
"clip_ratio": 0.001226626716629653,
"epoch": 0.38843006952713865,
"grad_norm": 0.08837593346834183,
"kl": 0.010795831680297852,
"learning_rate": 3.2377049180327876e-06,
"loss": 0.0092,
"step": 237
},
{
"clip_ratio": 0.0012072520969468314,
"epoch": 0.3900690149681814,
"grad_norm": 0.08174102008342743,
"kl": 0.010251283645629883,
"learning_rate": 3.2513661202185792e-06,
"loss": 0.0089,
"step": 238
},
{
"clip_ratio": 0.0008923051470901555,
"epoch": 0.3917079604092242,
"grad_norm": 0.06714540719985962,
"kl": 0.010812044143676758,
"learning_rate": 3.2650273224043717e-06,
"loss": 0.0086,
"step": 239
},
{
"clip_ratio": 0.0008945376886231315,
"epoch": 0.393346905850267,
"grad_norm": 0.07600870728492737,
"kl": 0.011825799942016602,
"learning_rate": 3.2786885245901638e-06,
"loss": 0.0082,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 579.9231960773468,
"epoch": 0.39498585129130975,
"grad_norm": 0.0788060799241066,
"kl": 0.011470317840576172,
"learning_rate": 3.2923497267759563e-06,
"loss": 0.008,
"num_tokens": 162151041.0,
"reward": 0.2262369857635349,
"reward_std": 0.1436142157181166,
"rewards/pure_accuracy_reward_math": 0.22623697842936963,
"step": 241
},
{
"clip_ratio": 0.001235522729416516,
"epoch": 0.3966247967323525,
"grad_norm": 0.08819200098514557,
"kl": 0.011005401611328125,
"learning_rate": 3.3060109289617488e-06,
"loss": 0.0079,
"step": 242
},
{
"clip_ratio": 0.0011237937412715837,
"epoch": 0.39826374217339533,
"grad_norm": 0.07336119562387466,
"kl": 0.010800600051879883,
"learning_rate": 3.3196721311475413e-06,
"loss": 0.0075,
"step": 243
},
{
"clip_ratio": 0.0010676182721454097,
"epoch": 0.3999026876144381,
"grad_norm": 0.07694102078676224,
"kl": 0.011488199234008789,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0072,
"step": 244
},
{
"clip_ratio": 0.0011172947895374818,
"epoch": 0.40154163305548085,
"grad_norm": 0.08463244885206223,
"kl": 0.012181282043457031,
"learning_rate": 3.346994535519126e-06,
"loss": 0.0068,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 582.9391458034515,
"epoch": 0.40318057849652367,
"grad_norm": 0.0712461844086647,
"kl": 0.01132655143737793,
"learning_rate": 3.3606557377049183e-06,
"loss": 0.0082,
"num_tokens": 165429094.0,
"reward": 0.24869792442768812,
"reward_std": 0.12578068423317745,
"rewards/pure_accuracy_reward_math": 0.24869791674427688,
"step": 246
},
{
"clip_ratio": 0.0011233826196530572,
"epoch": 0.40481952393756643,
"grad_norm": 0.07659593969583511,
"kl": 0.01063847541809082,
"learning_rate": 3.3743169398907104e-06,
"loss": 0.0081,
"step": 247
},
{
"clip_ratio": 0.0012855593090534967,
"epoch": 0.4064584693786092,
"grad_norm": 0.07479391992092133,
"kl": 0.010470390319824219,
"learning_rate": 3.387978142076503e-06,
"loss": 0.0078,
"step": 248
},
{
"clip_ratio": 0.0009941341145349725,
"epoch": 0.408097414819652,
"grad_norm": 0.06663769483566284,
"kl": 0.011182785034179688,
"learning_rate": 3.4016393442622954e-06,
"loss": 0.0075,
"step": 249
},
{
"clip_ratio": 0.0009338884319731733,
"epoch": 0.40973636026069477,
"grad_norm": 0.07455974817276001,
"kl": 0.011932849884033203,
"learning_rate": 3.4153005464480874e-06,
"loss": 0.0071,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 579.4368662834167,
"epoch": 0.001638945441042779,
"grad_norm": 0.06936674565076828,
"kl": 0.011360645294189453,
"learning_rate": 3.42896174863388e-06,
"loss": 0.0077,
"num_tokens": 3266558.0,
"reward": 0.23990886102546938,
"reward_std": 0.1189681178657338,
"rewards/pure_accuracy_reward_math": 0.239908854739042,
"step": 251
},
{
"clip_ratio": 0.0010949637241992605,
"epoch": 0.003277890882085558,
"grad_norm": 0.0754990503191948,
"kl": 0.010671854019165039,
"learning_rate": 3.4426229508196724e-06,
"loss": 0.0076,
"step": 252
},
{
"clip_ratio": 0.0011387738637722578,
"epoch": 0.004916836323128337,
"grad_norm": 0.07142341136932373,
"kl": 0.010357856750488281,
"learning_rate": 3.456284153005465e-06,
"loss": 0.0074,
"step": 253
},
{
"clip_ratio": 0.0008552854768026918,
"epoch": 0.006555781764171116,
"grad_norm": 0.0586932897567749,
"kl": 0.010814428329467773,
"learning_rate": 3.469945355191257e-06,
"loss": 0.007,
"step": 254
},
{
"clip_ratio": 0.0008318971481457993,
"epoch": 0.008194727205213895,
"grad_norm": 0.07276652008295059,
"kl": 0.011636495590209961,
"learning_rate": 3.4836065573770495e-06,
"loss": 0.0067,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 589.9453327655792,
"epoch": 0.009833672646256675,
"grad_norm": 0.07384130358695984,
"kl": 0.011546134948730469,
"learning_rate": 3.497267759562842e-06,
"loss": 0.0092,
"num_tokens": 6563270.0,
"reward": 0.24088542279787362,
"reward_std": 0.13925835717236623,
"rewards/pure_accuracy_reward_math": 0.2408854168606922,
"step": 256
},
{
"clip_ratio": 0.000994754147995991,
"epoch": 0.011472618087299453,
"grad_norm": 0.07237172871828079,
"kl": 0.011071443557739258,
"learning_rate": 3.510928961748634e-06,
"loss": 0.0091,
"step": 257
},
{
"clip_ratio": 0.0009974641966437048,
"epoch": 0.013111563528342233,
"grad_norm": 0.0677863284945488,
"kl": 0.010922431945800781,
"learning_rate": 3.5245901639344265e-06,
"loss": 0.0088,
"step": 258
},
{
"clip_ratio": 0.0009937005115716602,
"epoch": 0.01475050896938501,
"grad_norm": 0.06459185481071472,
"kl": 0.01144552230834961,
"learning_rate": 3.538251366120219e-06,
"loss": 0.0084,
"step": 259
},
{
"clip_ratio": 0.0010135341441355195,
"epoch": 0.01638945441042779,
"grad_norm": 0.0639120563864708,
"kl": 0.01173710823059082,
"learning_rate": 3.551912568306011e-06,
"loss": 0.008,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 589.9293835163116,
"epoch": 0.018028399851470568,
"grad_norm": 0.0708698108792305,
"kl": 0.011642932891845703,
"learning_rate": 3.5655737704918036e-06,
"loss": 0.0082,
"num_tokens": 9864381.0,
"reward": 0.2343750073632691,
"reward_std": 0.1295394750777632,
"rewards/pure_accuracy_reward_math": 0.23437499956344254,
"step": 261
},
{
"clip_ratio": 0.0011215963202744206,
"epoch": 0.01966734529251335,
"grad_norm": 0.06814540177583694,
"kl": 0.011007308959960938,
"learning_rate": 3.579234972677596e-06,
"loss": 0.0081,
"step": 262
},
{
"clip_ratio": 0.0012566405258667146,
"epoch": 0.021306290733556128,
"grad_norm": 0.07573528587818146,
"kl": 0.010967016220092773,
"learning_rate": 3.5928961748633886e-06,
"loss": 0.0079,
"step": 263
},
{
"clip_ratio": 0.0009570858208007849,
"epoch": 0.022945236174598906,
"grad_norm": 0.05915817990899086,
"kl": 0.011373281478881836,
"learning_rate": 3.6065573770491806e-06,
"loss": 0.0075,
"step": 264
},
{
"clip_ratio": 0.000911612167669773,
"epoch": 0.024584181615641687,
"grad_norm": 0.0663297101855278,
"kl": 0.012076139450073242,
"learning_rate": 3.620218579234973e-06,
"loss": 0.0071,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 591.5680522918701,
"epoch": 0.026223127056684465,
"grad_norm": 0.11742489039897919,
"kl": 0.013937711715698242,
"learning_rate": 3.6338797814207656e-06,
"loss": 0.0079,
"num_tokens": 13167262.0,
"reward": 0.20540365105262026,
"reward_std": 0.12122339283814654,
"rewards/pure_accuracy_reward_math": 0.20540364709449932,
"step": 266
},
{
"clip_ratio": 0.0010878853252052068,
"epoch": 0.027862072497727243,
"grad_norm": 0.9664380550384521,
"kl": 0.011089324951171875,
"learning_rate": 3.6475409836065577e-06,
"loss": 0.0088,
"step": 267
},
{
"clip_ratio": 0.0013143416176717437,
"epoch": 0.02950101793877002,
"grad_norm": 0.17526276409626007,
"kl": 0.01159524917602539,
"learning_rate": 3.66120218579235e-06,
"loss": 0.0077,
"step": 268
},
{
"clip_ratio": 0.0010903547959060234,
"epoch": 0.031139963379812802,
"grad_norm": 2.172806739807129,
"kl": 0.04994964599609375,
"learning_rate": 3.6748633879781427e-06,
"loss": 0.0089,
"step": 269
},
{
"clip_ratio": 0.0011699927540576027,
"epoch": 0.03277890882085558,
"grad_norm": 3.1674540042877197,
"kl": 0.10472512245178223,
"learning_rate": 3.6885245901639347e-06,
"loss": 0.011,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 602.3287951946259,
"epoch": 0.03441785426189836,
"grad_norm": 0.06581036746501923,
"kl": 0.011269092559814453,
"learning_rate": 3.7021857923497272e-06,
"loss": 0.009,
"num_tokens": 16496412.0,
"reward": 0.2086588600068353,
"reward_std": 0.12463329132879153,
"rewards/pure_accuracy_reward_math": 0.20865885500097647,
"step": 271
},
{
"clip_ratio": 0.0010178338038713264,
"epoch": 0.036056799702941136,
"grad_norm": 0.07616181671619415,
"kl": 0.0111236572265625,
"learning_rate": 3.7158469945355197e-06,
"loss": 0.009,
"step": 272
},
{
"clip_ratio": 0.0011148875312301243,
"epoch": 0.03769574514398392,
"grad_norm": 0.07324493676424026,
"kl": 0.010937929153442383,
"learning_rate": 3.729508196721312e-06,
"loss": 0.0088,
"step": 273
},
{
"clip_ratio": 0.0009064768914868182,
"epoch": 0.0393346905850267,
"grad_norm": 0.0614241324365139,
"kl": 0.011002779006958008,
"learning_rate": 3.7431693989071043e-06,
"loss": 0.0085,
"step": 274
},
{
"clip_ratio": 0.0008406615522176253,
"epoch": 0.040973636026069474,
"grad_norm": 0.0580308772623539,
"kl": 0.011270523071289062,
"learning_rate": 3.7568306010928963e-06,
"loss": 0.0081,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 576.4791839122772,
"epoch": 0.042612581467112255,
"grad_norm": 0.07072403281927109,
"kl": 0.012294530868530273,
"learning_rate": 3.7704918032786884e-06,
"loss": 0.0069,
"num_tokens": 19746996.0,
"reward": 0.23209636090905406,
"reward_std": 0.13299611589172855,
"rewards/pure_accuracy_reward_math": 0.232096354739042,
"step": 276
},
{
"clip_ratio": 0.0008154532818025473,
"epoch": 0.04425152690815504,
"grad_norm": 0.06746868789196014,
"kl": 0.012226104736328125,
"learning_rate": 3.784153005464481e-06,
"loss": 0.0068,
"step": 277
},
{
"clip_ratio": 0.0009088635895295738,
"epoch": 0.04589047234919781,
"grad_norm": 0.062604621052742,
"kl": 0.012192249298095703,
"learning_rate": 3.7978142076502734e-06,
"loss": 0.0065,
"step": 278
},
{
"clip_ratio": 0.0009255672589461028,
"epoch": 0.04752941779024059,
"grad_norm": 0.06473197042942047,
"kl": 0.012347936630249023,
"learning_rate": 3.811475409836066e-06,
"loss": 0.0062,
"step": 279
},
{
"clip_ratio": 0.000926908637438828,
"epoch": 0.049168363231283374,
"grad_norm": 0.0617368146777153,
"kl": 0.012591838836669922,
"learning_rate": 3.825136612021858e-06,
"loss": 0.0058,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 590.2998232841492,
"epoch": 0.05080730867232615,
"grad_norm": 0.0707988291978836,
"kl": 0.012273788452148438,
"learning_rate": 3.8387978142076504e-06,
"loss": 0.0051,
"num_tokens": 23046033.0,
"reward": 0.2106119856762234,
"reward_std": 0.12753237638389692,
"rewards/pure_accuracy_reward_math": 0.21061197997187264,
"step": 281
},
{
"clip_ratio": 0.0007957301904752967,
"epoch": 0.05244625411336893,
"grad_norm": 0.07150708138942719,
"kl": 0.012214422225952148,
"learning_rate": 3.852459016393443e-06,
"loss": 0.005,
"step": 282
},
{
"clip_ratio": 0.0008087110562655653,
"epoch": 0.05408519955441171,
"grad_norm": 0.06467320770025253,
"kl": 0.012126684188842773,
"learning_rate": 3.8661202185792354e-06,
"loss": 0.0047,
"step": 283
},
{
"clip_ratio": 0.0008826969724395894,
"epoch": 0.055724144995454486,
"grad_norm": 0.06448128819465637,
"kl": 0.012229204177856445,
"learning_rate": 3.879781420765028e-06,
"loss": 0.0043,
"step": 284
},
{
"clip_ratio": 0.000837775871445956,
"epoch": 0.05736309043649727,
"grad_norm": 0.05940267816185951,
"kl": 0.012416601181030273,
"learning_rate": 3.8934426229508196e-06,
"loss": 0.0039,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 586.4531440734863,
"epoch": 0.05900203587754004,
"grad_norm": 0.09300405532121658,
"kl": 0.012730836868286133,
"learning_rate": 3.907103825136612e-06,
"loss": 0.0073,
"num_tokens": 26331481.0,
"reward": 0.24609375710133463,
"reward_std": 0.1337946176645346,
"rewards/pure_accuracy_reward_math": 0.24609375081490725,
"step": 286
},
{
"clip_ratio": 0.0007959069370144789,
"epoch": 0.06064098131858282,
"grad_norm": 0.07242298871278763,
"kl": 0.012778043746948242,
"learning_rate": 3.9207650273224046e-06,
"loss": 0.0071,
"step": 287
},
{
"clip_ratio": 0.0007729592513214811,
"epoch": 0.062279926759625605,
"grad_norm": 0.06439978629350662,
"kl": 0.012783050537109375,
"learning_rate": 3.934426229508197e-06,
"loss": 0.0068,
"step": 288
},
{
"clip_ratio": 0.0008412073416366184,
"epoch": 0.06391887220066839,
"grad_norm": 0.06673026084899902,
"kl": 0.012759208679199219,
"learning_rate": 3.9480874316939895e-06,
"loss": 0.0064,
"step": 289
},
{
"clip_ratio": 0.0008529388742317678,
"epoch": 0.06555781764171116,
"grad_norm": 0.06457261741161346,
"kl": 0.012978315353393555,
"learning_rate": 3.961748633879782e-06,
"loss": 0.006,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 593.4179866313934,
"epoch": 0.06719676308275394,
"grad_norm": 0.061959490180015564,
"kl": 0.012590646743774414,
"learning_rate": 3.975409836065574e-06,
"loss": 0.0048,
"num_tokens": 29639261.0,
"reward": 0.19140625570435077,
"reward_std": 0.1178207247867249,
"rewards/pure_accuracy_reward_math": 0.19140625011641532,
"step": 291
},
{
"clip_ratio": 0.0007923052341993753,
"epoch": 0.06883570852379672,
"grad_norm": 0.06405281275510788,
"kl": 0.012594223022460938,
"learning_rate": 3.989071038251366e-06,
"loss": 0.0047,
"step": 292
},
{
"clip_ratio": 0.0008128494168886391,
"epoch": 0.0704746539648395,
"grad_norm": 0.05796763673424721,
"kl": 0.012372016906738281,
"learning_rate": 4.002732240437159e-06,
"loss": 0.0044,
"step": 293
},
{
"clip_ratio": 0.0008259461983470828,
"epoch": 0.07211359940588227,
"grad_norm": 0.05945519357919693,
"kl": 0.012368202209472656,
"learning_rate": 4.016393442622951e-06,
"loss": 0.0041,
"step": 294
},
{
"clip_ratio": 0.0008090365032558111,
"epoch": 0.07375254484692506,
"grad_norm": 0.05839954689145088,
"kl": 0.012590169906616211,
"learning_rate": 4.030054644808744e-06,
"loss": 0.0038,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 578.3847825527191,
"epoch": 0.07539149028796784,
"grad_norm": 0.07029297947883606,
"kl": 0.013827800750732422,
"learning_rate": 4.043715846994536e-06,
"loss": 0.0079,
"num_tokens": 32899431.0,
"reward": 0.22656250576255843,
"reward_std": 0.1293920156895183,
"rewards/pure_accuracy_reward_math": 0.22656250040745363,
"step": 296
},
{
"clip_ratio": 0.0007177354492569066,
"epoch": 0.07703043572901061,
"grad_norm": 0.07095961272716522,
"kl": 0.013935565948486328,
"learning_rate": 4.057377049180329e-06,
"loss": 0.0078,
"step": 297
},
{
"clip_ratio": 0.0007291494763990158,
"epoch": 0.0786693811700534,
"grad_norm": 0.062031351029872894,
"kl": 0.01368570327758789,
"learning_rate": 4.07103825136612e-06,
"loss": 0.0075,
"step": 298
},
{
"clip_ratio": 0.0009114736896549402,
"epoch": 0.08030832661109617,
"grad_norm": 0.06610522419214249,
"kl": 0.01354837417602539,
"learning_rate": 4.084699453551913e-06,
"loss": 0.0072,
"step": 299
},
{
"clip_ratio": 0.0008185061662402404,
"epoch": 0.08194727205213895,
"grad_norm": 0.05733739957213402,
"kl": 0.013862133026123047,
"learning_rate": 4.098360655737705e-06,
"loss": 0.0068,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 581.9811391830444,
"epoch": 0.08358621749318174,
"grad_norm": 0.07516755163669586,
"kl": 0.013712882995605469,
"learning_rate": 4.112021857923498e-06,
"loss": 0.0123,
"num_tokens": 36171597.0,
"reward": 0.24316406939760782,
"reward_std": 0.1390101815923117,
"rewards/pure_accuracy_reward_math": 0.24316406299476512,
"step": 301
},
{
"clip_ratio": 0.0008417515526843999,
"epoch": 0.08522516293422451,
"grad_norm": 0.07285764813423157,
"kl": 0.013661384582519531,
"learning_rate": 4.12568306010929e-06,
"loss": 0.0122,
"step": 302
},
{
"clip_ratio": 0.0010243687736419815,
"epoch": 0.08686410837526728,
"grad_norm": 0.06916587054729462,
"kl": 0.013316631317138672,
"learning_rate": 4.139344262295083e-06,
"loss": 0.0118,
"step": 303
},
{
"clip_ratio": 0.0010284557414479423,
"epoch": 0.08850305381631007,
"grad_norm": 0.06860698759555817,
"kl": 0.01330423355102539,
"learning_rate": 4.153005464480875e-06,
"loss": 0.0115,
"step": 304
},
{
"clip_ratio": 0.0009143141991216908,
"epoch": 0.09014199925735285,
"grad_norm": 0.06032150238752365,
"kl": 0.0137176513671875,
"learning_rate": 4.166666666666667e-06,
"loss": 0.011,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 586.3743689060211,
"epoch": 0.09178094469839562,
"grad_norm": 0.07405151426792145,
"kl": 0.013857364654541016,
"learning_rate": 4.180327868852459e-06,
"loss": 0.011,
"num_tokens": 39455171.0,
"reward": 0.2532552155898884,
"reward_std": 0.14932613197015598,
"rewards/pure_accuracy_reward_math": 0.25325520941987634,
"step": 306
},
{
"clip_ratio": 0.0008296436263890428,
"epoch": 0.09341989013943841,
"grad_norm": 0.06666728854179382,
"kl": 0.013742923736572266,
"learning_rate": 4.193989071038252e-06,
"loss": 0.0109,
"step": 307
},
{
"clip_ratio": 0.0009970029186661122,
"epoch": 0.09505883558048119,
"grad_norm": 0.0645456612110138,
"kl": 0.013346672058105469,
"learning_rate": 4.207650273224044e-06,
"loss": 0.0106,
"step": 308
},
{
"clip_ratio": 0.001063040656163139,
"epoch": 0.09669778102152396,
"grad_norm": 0.061983004212379456,
"kl": 0.013351917266845703,
"learning_rate": 4.221311475409837e-06,
"loss": 0.0102,
"step": 309
},
{
"clip_ratio": 0.000925353787010863,
"epoch": 0.09833672646256675,
"grad_norm": 0.054489802569150925,
"kl": 0.013745784759521484,
"learning_rate": 4.234972677595629e-06,
"loss": 0.0098,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 576.9381697177887,
"epoch": 0.09997567190360952,
"grad_norm": 0.07431261986494064,
"kl": 0.014829158782958984,
"learning_rate": 4.248633879781421e-06,
"loss": 0.0086,
"num_tokens": 42719433.0,
"reward": 0.23014323521056212,
"reward_std": 0.14185529336100444,
"rewards/pure_accuracy_reward_math": 0.23014322962262668,
"step": 311
},
{
"clip_ratio": 0.0008892922282370819,
"epoch": 0.1016146173446523,
"grad_norm": 0.0675373449921608,
"kl": 0.01420736312866211,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.0084,
"step": 312
},
{
"clip_ratio": 0.0011247390369817367,
"epoch": 0.10325356278569509,
"grad_norm": 0.06642100214958191,
"kl": 0.013678550720214844,
"learning_rate": 4.275956284153006e-06,
"loss": 0.0081,
"step": 313
},
{
"clip_ratio": 0.001105548773011833,
"epoch": 0.10489250822673786,
"grad_norm": 0.06353385746479034,
"kl": 0.01375722885131836,
"learning_rate": 4.289617486338798e-06,
"loss": 0.0077,
"step": 314
},
{
"clip_ratio": 0.0008872896562479582,
"epoch": 0.10653145366778063,
"grad_norm": 0.0578172467648983,
"kl": 0.014369010925292969,
"learning_rate": 4.30327868852459e-06,
"loss": 0.0073,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 581.5644719600677,
"epoch": 0.10817039910882342,
"grad_norm": 0.10828240215778351,
"kl": 0.014815330505371094,
"learning_rate": 4.316939890710383e-06,
"loss": 0.0079,
"num_tokens": 45987515.0,
"reward": 0.23828125750878826,
"reward_std": 0.1280899328412488,
"rewards/pure_accuracy_reward_math": 0.23828124959254637,
"step": 316
},
{
"clip_ratio": 0.0007055829685214121,
"epoch": 0.1098093445498662,
"grad_norm": 0.06897052377462387,
"kl": 0.014089107513427734,
"learning_rate": 4.330601092896175e-06,
"loss": 0.0077,
"step": 317
},
{
"clip_ratio": 0.0009552787060442824,
"epoch": 0.11144828999090897,
"grad_norm": 0.06946240365505219,
"kl": 0.013627052307128906,
"learning_rate": 4.3442622950819676e-06,
"loss": 0.0074,
"step": 318
},
{
"clip_ratio": 0.0009577763585184584,
"epoch": 0.11308723543195176,
"grad_norm": 0.06384962797164917,
"kl": 0.013594627380371094,
"learning_rate": 4.35792349726776e-06,
"loss": 0.007,
"step": 319
},
{
"clip_ratio": 0.0008583279737877092,
"epoch": 0.11472618087299453,
"grad_norm": 0.05853092297911644,
"kl": 0.014088630676269531,
"learning_rate": 4.371584699453552e-06,
"loss": 0.0066,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 574.9983899593353,
"epoch": 0.11636512631403731,
"grad_norm": 0.07647594809532166,
"kl": 0.01479482650756836,
"learning_rate": 4.385245901639344e-06,
"loss": 0.0092,
"num_tokens": 49236626.0,
"reward": 0.22591146448394284,
"reward_std": 0.1385065988288261,
"rewards/pure_accuracy_reward_math": 0.2259114588960074,
"step": 321
},
{
"clip_ratio": 0.000817699462913879,
"epoch": 0.11800407175508008,
"grad_norm": 0.0680047944188118,
"kl": 0.014449596405029297,
"learning_rate": 4.398907103825137e-06,
"loss": 0.009,
"step": 322
},
{
"clip_ratio": 0.00102761085952352,
"epoch": 0.11964301719612287,
"grad_norm": 0.06830534338951111,
"kl": 0.01408243179321289,
"learning_rate": 4.412568306010929e-06,
"loss": 0.0087,
"step": 323
},
{
"clip_ratio": 0.0010830692142462794,
"epoch": 0.12128196263716565,
"grad_norm": 0.06523703783750534,
"kl": 0.01411581039428711,
"learning_rate": 4.426229508196722e-06,
"loss": 0.0083,
"step": 324
},
{
"clip_ratio": 0.0009552010853894899,
"epoch": 0.12292090807820842,
"grad_norm": 0.05952048301696777,
"kl": 0.01461029052734375,
"learning_rate": 4.439890710382514e-06,
"loss": 0.0078,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 574.7955937385559,
"epoch": 0.12455985351925121,
"grad_norm": 0.07190460711717606,
"kl": 0.015045642852783203,
"learning_rate": 4.453551912568307e-06,
"loss": 0.0105,
"num_tokens": 52486374.0,
"reward": 0.2360026103851851,
"reward_std": 0.12637775152688846,
"rewards/pure_accuracy_reward_math": 0.2360026056121569,
"step": 326
},
{
"clip_ratio": 0.0006602608530101861,
"epoch": 0.12619879896029398,
"grad_norm": 0.06684302538633347,
"kl": 0.014774322509765625,
"learning_rate": 4.467213114754098e-06,
"loss": 0.0103,
"step": 327
},
{
"clip_ratio": 0.0008040992978521899,
"epoch": 0.12783774440133677,
"grad_norm": 0.06550217419862747,
"kl": 0.01435995101928711,
"learning_rate": 4.480874316939891e-06,
"loss": 0.01,
"step": 328
},
{
"clip_ratio": 0.0008306863429652367,
"epoch": 0.12947668984237953,
"grad_norm": 0.0616220086812973,
"kl": 0.014432430267333984,
"learning_rate": 4.494535519125683e-06,
"loss": 0.0096,
"step": 329
},
{
"clip_ratio": 0.0008213962388481377,
"epoch": 0.13111563528342232,
"grad_norm": 0.061259008944034576,
"kl": 0.014967918395996094,
"learning_rate": 4.508196721311476e-06,
"loss": 0.0092,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 578.8222842216492,
"epoch": 0.1327545807244651,
"grad_norm": 0.07180505990982056,
"kl": 0.014912128448486328,
"learning_rate": 4.521857923497268e-06,
"loss": 0.0054,
"num_tokens": 55750856.0,
"reward": 0.22428386053070426,
"reward_std": 0.125935374526307,
"rewards/pure_accuracy_reward_math": 0.22428385610692203,
"step": 331
},
{
"clip_ratio": 0.0008310234973123443,
"epoch": 0.13439352616550787,
"grad_norm": 0.06554125249385834,
"kl": 0.014460086822509766,
"learning_rate": 4.535519125683061e-06,
"loss": 0.0052,
"step": 332
},
{
"clip_ratio": 0.0009444170593724266,
"epoch": 0.13603247160655066,
"grad_norm": 0.0650697648525238,
"kl": 0.014264106750488281,
"learning_rate": 4.549180327868853e-06,
"loss": 0.0049,
"step": 333
},
{
"clip_ratio": 0.0009593672889991467,
"epoch": 0.13767141704759345,
"grad_norm": 0.06275759637355804,
"kl": 0.014463424682617188,
"learning_rate": 4.562841530054645e-06,
"loss": 0.0045,
"step": 334
},
{
"clip_ratio": 0.0008741978416537677,
"epoch": 0.1393103624886362,
"grad_norm": 0.06349465250968933,
"kl": 0.015105247497558594,
"learning_rate": 4.576502732240437e-06,
"loss": 0.0041,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 581.4983899593353,
"epoch": 0.140949307929679,
"grad_norm": 0.07185006141662598,
"kl": 0.015033245086669922,
"learning_rate": 4.59016393442623e-06,
"loss": 0.0092,
"num_tokens": 59021523.0,
"reward": 0.24804688125732355,
"reward_std": 0.12332397676073015,
"rewards/pure_accuracy_reward_math": 0.2480468761350494,
"step": 336
},
{
"clip_ratio": 0.0007917967001276338,
"epoch": 0.14258825337072178,
"grad_norm": 0.06418469548225403,
"kl": 0.014570236206054688,
"learning_rate": 4.603825136612022e-06,
"loss": 0.009,
"step": 337
},
{
"clip_ratio": 0.0011276908828676824,
"epoch": 0.14422719881176455,
"grad_norm": 0.06706573814153671,
"kl": 0.014203071594238281,
"learning_rate": 4.617486338797815e-06,
"loss": 0.0087,
"step": 338
},
{
"clip_ratio": 0.0010211615006028296,
"epoch": 0.14586614425280733,
"grad_norm": 0.06293198466300964,
"kl": 0.014473915100097656,
"learning_rate": 4.631147540983607e-06,
"loss": 0.0084,
"step": 339
},
{
"clip_ratio": 0.0008425270717680178,
"epoch": 0.14750508969385012,
"grad_norm": 0.058640848845243454,
"kl": 0.01514291763305664,
"learning_rate": 4.6448087431694e-06,
"loss": 0.008,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 581.9489109516144,
"epoch": 0.14914403513489288,
"grad_norm": 0.08432208746671677,
"kl": 0.015330791473388672,
"learning_rate": 4.6584699453551915e-06,
"loss": 0.0085,
"num_tokens": 62299018.0,
"reward": 0.2382812559371814,
"reward_std": 0.14492353051900864,
"rewards/pure_accuracy_reward_math": 0.2382812504656613,
"step": 341
},
{
"clip_ratio": 0.000848330670521591,
"epoch": 0.15078298057593567,
"grad_norm": 0.06801754236221313,
"kl": 0.014659404754638672,
"learning_rate": 4.672131147540984e-06,
"loss": 0.0084,
"step": 342
},
{
"clip_ratio": 0.0010827027111872667,
"epoch": 0.15242192601697846,
"grad_norm": 0.06549172848463058,
"kl": 0.014311790466308594,
"learning_rate": 4.6857923497267765e-06,
"loss": 0.0081,
"step": 343
},
{
"clip_ratio": 0.0010740470830796767,
"epoch": 0.15406087145802122,
"grad_norm": 0.06515967845916748,
"kl": 0.01453399658203125,
"learning_rate": 4.699453551912569e-06,
"loss": 0.0077,
"step": 344
},
{
"clip_ratio": 0.0009672553374002746,
"epoch": 0.155699816899064,
"grad_norm": 0.0627971738576889,
"kl": 0.015265464782714844,
"learning_rate": 4.7131147540983615e-06,
"loss": 0.0072,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 594.5420119762421,
"epoch": 0.1573387623401068,
"grad_norm": 0.0651373565196991,
"kl": 0.01506948471069336,
"learning_rate": 4.726775956284154e-06,
"loss": 0.0061,
"num_tokens": 65612227.0,
"reward": 0.22656250555883162,
"reward_std": 0.11576688283821568,
"rewards/pure_accuracy_reward_math": 0.2265625006693881,
"step": 346
},
{
"clip_ratio": 0.0006889042056741346,
"epoch": 0.15897770778114956,
"grad_norm": 0.05911775305867195,
"kl": 0.014788627624511719,
"learning_rate": 4.740437158469946e-06,
"loss": 0.006,
"step": 347
},
{
"clip_ratio": 0.0008533449156971074,
"epoch": 0.16061665322219235,
"grad_norm": 0.06107313930988312,
"kl": 0.014514446258544922,
"learning_rate": 4.754098360655738e-06,
"loss": 0.0058,
"step": 348
},
{
"clip_ratio": 0.0008506776480317058,
"epoch": 0.16225559866323513,
"grad_norm": 0.05840134993195534,
"kl": 0.014555931091308594,
"learning_rate": 4.767759562841531e-06,
"loss": 0.0054,
"step": 349
},
{
"clip_ratio": 0.0007001224406621986,
"epoch": 0.1638945441042779,
"grad_norm": 0.052340634167194366,
"kl": 0.014971256256103516,
"learning_rate": 4.781420765027322e-06,
"loss": 0.0051,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 582.0423378944397,
"epoch": 0.16553348954532068,
"grad_norm": 0.07505487650632858,
"kl": 0.015714168548583984,
"learning_rate": 4.795081967213115e-06,
"loss": 0.01,
"num_tokens": 68889789.0,
"reward": 0.22949219317524694,
"reward_std": 0.13435217371443287,
"rewards/pure_accuracy_reward_math": 0.2294921897992026,
"step": 351
},
{
"clip_ratio": 0.0008628651630715467,
"epoch": 0.16717243498636347,
"grad_norm": 0.06674539297819138,
"kl": 0.015304088592529297,
"learning_rate": 4.808743169398907e-06,
"loss": 0.0098,
"step": 352
},
{
"clip_ratio": 0.001037854259834603,
"epoch": 0.16881138042740623,
"grad_norm": 0.07000827044248581,
"kl": 0.014843463897705078,
"learning_rate": 4.8224043715847e-06,
"loss": 0.0095,
"step": 353
},
{
"clip_ratio": 0.0010051641423842739,
"epoch": 0.17045032586844902,
"grad_norm": 0.06692034751176834,
"kl": 0.01481771469116211,
"learning_rate": 4.836065573770492e-06,
"loss": 0.0091,
"step": 354
},
{
"clip_ratio": 0.0009215611881927543,
"epoch": 0.1720892713094918,
"grad_norm": 0.05842750146985054,
"kl": 0.015254974365234375,
"learning_rate": 4.849726775956285e-06,
"loss": 0.0086,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 580.9974174499512,
"epoch": 0.17372821675053457,
"grad_norm": 0.07343181222677231,
"kl": 0.015576362609863281,
"learning_rate": 4.863387978142076e-06,
"loss": 0.0104,
"num_tokens": 72161705.0,
"reward": 0.24218750657746568,
"reward_std": 0.12447860068641603,
"rewards/pure_accuracy_reward_math": 0.24218750098953024,
"step": 356
},
{
"clip_ratio": 0.000678558928370876,
"epoch": 0.17536716219157736,
"grad_norm": 0.06728224456310272,
"kl": 0.015001773834228516,
"learning_rate": 4.877049180327869e-06,
"loss": 0.0103,
"step": 357
},
{
"clip_ratio": 0.0009087121708262202,
"epoch": 0.17700610763262015,
"grad_norm": 0.06502145528793335,
"kl": 0.014545440673828125,
"learning_rate": 4.890710382513661e-06,
"loss": 0.0099,
"step": 358
},
{
"clip_ratio": 0.0008997945463420365,
"epoch": 0.1786450530736629,
"grad_norm": 0.06085266172885895,
"kl": 0.01470804214477539,
"learning_rate": 4.904371584699454e-06,
"loss": 0.0096,
"step": 359
},
{
"clip_ratio": 0.0008065323049777362,
"epoch": 0.1802839985147057,
"grad_norm": 0.05810590460896492,
"kl": 0.01529693603515625,
"learning_rate": 4.918032786885246e-06,
"loss": 0.0092,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 574.3659031391144,
"epoch": 0.18192294395574848,
"grad_norm": 0.07119102030992508,
"kl": 0.015850543975830078,
"learning_rate": 4.931693989071039e-06,
"loss": 0.0088,
"num_tokens": 75411941.0,
"reward": 0.26106771623017266,
"reward_std": 0.12748563423519954,
"rewards/pure_accuracy_reward_math": 0.26106770982732996,
"step": 361
},
{
"clip_ratio": 0.0007838922517180436,
"epoch": 0.18356188939679124,
"grad_norm": 0.0668591633439064,
"kl": 0.015510082244873047,
"learning_rate": 4.945355191256831e-06,
"loss": 0.0086,
"step": 362
},
{
"clip_ratio": 0.0009197715589834843,
"epoch": 0.18520083483783403,
"grad_norm": 0.06584400683641434,
"kl": 0.015251636505126953,
"learning_rate": 4.959016393442623e-06,
"loss": 0.0083,
"step": 363
},
{
"clip_ratio": 0.0007934075148341435,
"epoch": 0.18683978027887682,
"grad_norm": 0.06021925061941147,
"kl": 0.015304088592529297,
"learning_rate": 4.9726775956284154e-06,
"loss": 0.0079,
"step": 364
},
{
"clip_ratio": 0.0008117128969615806,
"epoch": 0.18847872571991958,
"grad_norm": 0.054787032306194305,
"kl": 0.015666484832763672,
"learning_rate": 4.986338797814208e-06,
"loss": 0.0075,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 581.3089377880096,
"epoch": 0.19011767116096237,
"grad_norm": 0.07014758884906769,
"kl": 0.01603412628173828,
"learning_rate": 5e-06,
"loss": 0.0102,
"num_tokens": 78678442.0,
"reward": 0.2386067773331888,
"reward_std": 0.1381109645590186,
"rewards/pure_accuracy_reward_math": 0.23860677116317675,
"step": 366
},
{
"clip_ratio": 0.0008581371083096201,
"epoch": 0.19175661660200516,
"grad_norm": 0.06626458466053009,
"kl": 0.015540599822998047,
"learning_rate": 4.9999942439118225e-06,
"loss": 0.01,
"step": 367
},
{
"clip_ratio": 0.0010326972058010142,
"epoch": 0.19339556204304792,
"grad_norm": 0.06585969030857086,
"kl": 0.015045166015625,
"learning_rate": 4.999976975673795e-06,
"loss": 0.0097,
"step": 368
},
{
"clip_ratio": 0.000947793353589077,
"epoch": 0.1950345074840907,
"grad_norm": 0.06269653141498566,
"kl": 0.015254497528076172,
"learning_rate": 4.999948195365436e-06,
"loss": 0.0092,
"step": 369
},
{
"clip_ratio": 0.0008362581023675375,
"epoch": 0.1966734529251335,
"grad_norm": 0.059586890041828156,
"kl": 0.01586627960205078,
"learning_rate": 4.9999079031192755e-06,
"loss": 0.0088,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 568.2786619663239,
"epoch": 0.19831239836617626,
"grad_norm": 0.07526068389415741,
"kl": 0.016698837280273438,
"learning_rate": 4.999856099120852e-06,
"loss": 0.0081,
"num_tokens": 81911242.0,
"reward": 0.24251302849734202,
"reward_std": 0.126928077545017,
"rewards/pure_accuracy_reward_math": 0.24251302162883803,
"step": 371
},
{
"clip_ratio": 0.0009173538178401941,
"epoch": 0.19995134380721905,
"grad_norm": 0.06963901966810226,
"kl": 0.016336441040039062,
"learning_rate": 4.99979278360872e-06,
"loss": 0.008,
"step": 372
},
{
"clip_ratio": 0.0011110180128071079,
"epoch": 0.20159028924826183,
"grad_norm": 0.06961624324321747,
"kl": 0.015837669372558594,
"learning_rate": 4.999717956874435e-06,
"loss": 0.0076,
"step": 373
},
{
"clip_ratio": 0.0009433047086986335,
"epoch": 0.2032292346893046,
"grad_norm": 0.06556432694196701,
"kl": 0.01593923568725586,
"learning_rate": 4.9996316192625675e-06,
"loss": 0.0072,
"step": 374
},
{
"clip_ratio": 0.0008095512553154549,
"epoch": 0.20486818013034738,
"grad_norm": 0.06139687821269035,
"kl": 0.01659393310546875,
"learning_rate": 4.99953377117069e-06,
"loss": 0.0067,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 573.026712179184,
"epoch": 0.20650712557139017,
"grad_norm": 0.07706455886363983,
"kl": 0.016510486602783203,
"learning_rate": 4.99942441304938e-06,
"loss": 0.0125,
"num_tokens": 85158888.0,
"reward": 0.2682291740202345,
"reward_std": 0.14631909935269505,
"rewards/pure_accuracy_reward_math": 0.2682291676173918,
"step": 376
},
{
"clip_ratio": 0.0009323210776983615,
"epoch": 0.20814607101243293,
"grad_norm": 0.07358861714601517,
"kl": 0.015882015228271484,
"learning_rate": 4.999303545402218e-06,
"loss": 0.0123,
"step": 377
},
{
"clip_ratio": 0.0011543770483513072,
"epoch": 0.20978501645347572,
"grad_norm": 0.06775986403226852,
"kl": 0.015225410461425781,
"learning_rate": 4.999171168785783e-06,
"loss": 0.012,
"step": 378
},
{
"clip_ratio": 0.0010423319904475647,
"epoch": 0.2114239618945185,
"grad_norm": 0.06506908684968948,
"kl": 0.01537466049194336,
"learning_rate": 4.999027283809653e-06,
"loss": 0.0116,
"step": 379
},
{
"clip_ratio": 0.0009310034193958927,
"epoch": 0.21306290733556127,
"grad_norm": 0.06132827699184418,
"kl": 0.01622772216796875,
"learning_rate": 4.9988718911364e-06,
"loss": 0.0111,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 571.3756697177887,
"epoch": 0.21470185277660406,
"grad_norm": 0.0720067173242569,
"kl": 0.017137527465820312,
"learning_rate": 4.998704991481587e-06,
"loss": 0.0108,
"num_tokens": 88402762.0,
"reward": 0.24674479925306514,
"reward_std": 0.1313918832456693,
"rewards/pure_accuracy_reward_math": 0.24674479168606922,
"step": 381
},
{
"clip_ratio": 0.0007053178948126515,
"epoch": 0.21634079821764685,
"grad_norm": 0.06856828182935715,
"kl": 0.016861915588378906,
"learning_rate": 4.998526585613763e-06,
"loss": 0.0107,
"step": 382
},
{
"clip_ratio": 0.0009198855559588992,
"epoch": 0.2179797436586896,
"grad_norm": 0.06308390200138092,
"kl": 0.01618671417236328,
"learning_rate": 4.998336674354468e-06,
"loss": 0.0103,
"step": 383
},
{
"clip_ratio": 0.0009896415027697003,
"epoch": 0.2196186890997324,
"grad_norm": 0.059695471078157425,
"kl": 0.01616191864013672,
"learning_rate": 4.9981352585782154e-06,
"loss": 0.01,
"step": 384
},
{
"clip_ratio": 0.0009851711494093252,
"epoch": 0.22125763454077518,
"grad_norm": 0.06119159981608391,
"kl": 0.016726016998291016,
"learning_rate": 4.997922339212501e-06,
"loss": 0.0095,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 582.7536017894745,
"epoch": 0.22289657998181794,
"grad_norm": 0.06853786110877991,
"kl": 0.01645803451538086,
"learning_rate": 4.997697917237789e-06,
"loss": 0.0092,
"num_tokens": 91672333.0,
"reward": 0.2298177152988501,
"reward_std": 0.1270827678963542,
"rewards/pure_accuracy_reward_math": 0.22981770866317675,
"step": 386
},
{
"clip_ratio": 0.0006475677645312317,
"epoch": 0.22453552542286073,
"grad_norm": 0.06568547338247299,
"kl": 0.016225337982177734,
"learning_rate": 4.997461993687514e-06,
"loss": 0.0091,
"step": 387
},
{
"clip_ratio": 0.000736436435545329,
"epoch": 0.22617447086390352,
"grad_norm": 0.06055685877799988,
"kl": 0.015880584716796875,
"learning_rate": 4.997214569648075e-06,
"loss": 0.0088,
"step": 388
},
{
"clip_ratio": 0.0007173336517780626,
"epoch": 0.22781341630494628,
"grad_norm": 0.054837051779031754,
"kl": 0.01586627960205078,
"learning_rate": 4.996955646258826e-06,
"loss": 0.0084,
"step": 389
},
{
"clip_ratio": 0.0007432895852730326,
"epoch": 0.22945236174598907,
"grad_norm": 0.05363443121314049,
"kl": 0.01619720458984375,
"learning_rate": 4.996685224712077e-06,
"loss": 0.008,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 569.7708532810211,
"epoch": 0.23109130718703186,
"grad_norm": 0.08375601470470428,
"kl": 0.016201019287109375,
"learning_rate": 4.9964033062530825e-06,
"loss": 0.0067,
"num_tokens": 94902157.0,
"reward": 0.2470703189901542,
"reward_std": 0.12347866676282138,
"rewards/pure_accuracy_reward_math": 0.2470703137514647,
"step": 391
},
{
"clip_ratio": 0.0006189611340801093,
"epoch": 0.23273025262807462,
"grad_norm": 0.06883595138788223,
"kl": 0.016017436981201172,
"learning_rate": 4.996109892180041e-06,
"loss": 0.0065,
"step": 392
},
{
"clip_ratio": 0.0007823226997629718,
"epoch": 0.2343691980691174,
"grad_norm": 0.08916032314300537,
"kl": 0.01654815673828125,
"learning_rate": 4.995804983844088e-06,
"loss": 0.0062,
"step": 393
},
{
"clip_ratio": 0.0008161565754676303,
"epoch": 0.23600814351016017,
"grad_norm": 0.06091364100575447,
"kl": 0.015578985214233398,
"learning_rate": 4.995488582649286e-06,
"loss": 0.0058,
"step": 394
},
{
"clip_ratio": 0.0008045715469506831,
"epoch": 0.23764708895120296,
"grad_norm": 0.0608866885304451,
"kl": 0.015944957733154297,
"learning_rate": 4.99516069005262e-06,
"loss": 0.0054,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 589.5065298080444,
"epoch": 0.23928603439224574,
"grad_norm": 0.07174283266067505,
"kl": 0.01667642593383789,
"learning_rate": 4.994821307563995e-06,
"loss": 0.0062,
"num_tokens": 98201877.0,
"reward": 0.2291666735545732,
"reward_std": 0.12557925208238885,
"rewards/pure_accuracy_reward_math": 0.22916666680248454,
"step": 396
},
{
"clip_ratio": 0.0007434075716901134,
"epoch": 0.2409249798332885,
"grad_norm": 0.06661787629127502,
"kl": 0.01587820053100586,
"learning_rate": 4.994470436746222e-06,
"loss": 0.0061,
"step": 397
},
{
"clip_ratio": 0.0009446046724406187,
"epoch": 0.2425639252743313,
"grad_norm": 0.06436329334974289,
"kl": 0.015263080596923828,
"learning_rate": 4.994108079215016e-06,
"loss": 0.0058,
"step": 398
},
{
"clip_ratio": 0.0009035653077944517,
"epoch": 0.24420287071537408,
"grad_norm": 0.05970580503344536,
"kl": 0.0152130126953125,
"learning_rate": 4.9937342366389875e-06,
"loss": 0.0054,
"step": 399
},
{
"clip_ratio": 0.000805649116728091,
"epoch": 0.24584181615641684,
"grad_norm": 0.05798059329390526,
"kl": 0.015795230865478516,
"learning_rate": 4.9933489107396324e-06,
"loss": 0.005,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 585.6373908519745,
"epoch": 0.24748076159745963,
"grad_norm": 0.07425414025783539,
"kl": 0.016396045684814453,
"learning_rate": 4.992952103291327e-06,
"loss": 0.0062,
"num_tokens": 101490067.0,
"reward": 0.2379557362291962,
"reward_std": 0.14541265065781772,
"rewards/pure_accuracy_reward_math": 0.23795572970993817,
"step": 401
},
{
"clip_ratio": 0.0006993081486825758,
"epoch": 0.24911970703850242,
"grad_norm": 0.0683765783905983,
"kl": 0.016202926635742188,
"learning_rate": 4.992543816121317e-06,
"loss": 0.006,
"step": 402
},
{
"clip_ratio": 0.0007977800468097485,
"epoch": 0.2507586524795452,
"grad_norm": 0.06357114762067795,
"kl": 0.015718460083007812,
"learning_rate": 4.992124051109714e-06,
"loss": 0.0056,
"step": 403
},
{
"clip_ratio": 0.0009015119801460969,
"epoch": 0.25239759792058797,
"grad_norm": 0.06347363442182541,
"kl": 0.015771865844726562,
"learning_rate": 4.991692810189479e-06,
"loss": 0.0051,
"step": 404
},
{
"clip_ratio": 0.0008236684840881026,
"epoch": 0.25403654336163073,
"grad_norm": 0.058691952377557755,
"kl": 0.016323566436767578,
"learning_rate": 4.991250095346423e-06,
"loss": 0.0047,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 583.5478718280792,
"epoch": 0.25567548880267355,
"grad_norm": 0.07196501642465591,
"kl": 0.01677703857421875,
"learning_rate": 4.990795908619189e-06,
"loss": 0.0083,
"num_tokens": 104766370.0,
"reward": 0.2216796949505806,
"reward_std": 0.12527710193535313,
"rewards/pure_accuracy_reward_math": 0.22167968738358468,
"step": 406
},
{
"clip_ratio": 0.0007391628066670819,
"epoch": 0.2573144342437163,
"grad_norm": 0.07041583210229874,
"kl": 0.016283512115478516,
"learning_rate": 4.990330252099249e-06,
"loss": 0.0081,
"step": 407
},
{
"clip_ratio": 0.0009407289188061441,
"epoch": 0.25895337968475907,
"grad_norm": 0.06628228724002838,
"kl": 0.015958786010742188,
"learning_rate": 4.98985312793089e-06,
"loss": 0.0078,
"step": 408
},
{
"clip_ratio": 0.0008640961274295478,
"epoch": 0.2605923251258019,
"grad_norm": 0.08439858257770538,
"kl": 0.01657581329345703,
"learning_rate": 4.989364538311209e-06,
"loss": 0.0074,
"step": 409
},
{
"clip_ratio": 0.0008074077862829654,
"epoch": 0.26223127056684464,
"grad_norm": 0.06573989987373352,
"kl": 0.016643524169921875,
"learning_rate": 4.988864485490096e-06,
"loss": 0.007,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 577.0521020889282,
"epoch": 0.2638702160078874,
"grad_norm": 0.0664341077208519,
"kl": 0.017581939697265625,
"learning_rate": 4.988352971770229e-06,
"loss": 0.008,
"num_tokens": 108026786.0,
"reward": 0.22265625622821972,
"reward_std": 0.10564513533608988,
"rewards/pure_accuracy_reward_math": 0.2226562507566996,
"step": 411
},
{
"clip_ratio": 0.0005124467848531822,
"epoch": 0.2655091614489302,
"grad_norm": 0.058590181171894073,
"kl": 0.016813278198242188,
"learning_rate": 4.987829999507065e-06,
"loss": 0.0078,
"step": 412
},
{
"clip_ratio": 0.000724739855968437,
"epoch": 0.267148106889973,
"grad_norm": 0.058657143265008926,
"kl": 0.016211986541748047,
"learning_rate": 4.9872955711088215e-06,
"loss": 0.0076,
"step": 413
},
{
"clip_ratio": 0.0007133069941573922,
"epoch": 0.26878705233101574,
"grad_norm": 0.054359566420316696,
"kl": 0.01609182357788086,
"learning_rate": 4.9867496890364734e-06,
"loss": 0.0072,
"step": 414
},
{
"clip_ratio": 0.0006473830337654363,
"epoch": 0.27042599777205856,
"grad_norm": 0.0523286908864975,
"kl": 0.016422271728515625,
"learning_rate": 4.986192355803735e-06,
"loss": 0.0069,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 587.6103718280792,
"epoch": 0.2720649432131013,
"grad_norm": 0.06779833137989044,
"kl": 0.016475200653076172,
"learning_rate": 4.985623573977056e-06,
"loss": 0.0092,
"num_tokens": 111325301.0,
"reward": 0.23014323599636555,
"reward_std": 0.14115750859491527,
"rewards/pure_accuracy_reward_math": 0.23014322959352285,
"step": 416
},
{
"clip_ratio": 0.0006248025758850417,
"epoch": 0.2737038886541441,
"grad_norm": 0.06379402428865433,
"kl": 0.016280651092529297,
"learning_rate": 4.985043346175602e-06,
"loss": 0.009,
"step": 417
},
{
"clip_ratio": 0.000747511406416379,
"epoch": 0.2753428340951869,
"grad_norm": 0.060899555683135986,
"kl": 0.015888690948486328,
"learning_rate": 4.984451675071247e-06,
"loss": 0.0086,
"step": 418
},
{
"clip_ratio": 0.0007759029947465024,
"epoch": 0.27698177953622966,
"grad_norm": 0.059268273413181305,
"kl": 0.01577615737915039,
"learning_rate": 4.983848563388559e-06,
"loss": 0.0082,
"step": 419
},
{
"clip_ratio": 0.0007596586527824911,
"epoch": 0.2786207249772724,
"grad_norm": 0.05496392399072647,
"kl": 0.016138076782226562,
"learning_rate": 4.983234013904791e-06,
"loss": 0.0078,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 575.2080256938934,
"epoch": 0.28025967041831523,
"grad_norm": 0.06906809657812119,
"kl": 0.016767501831054688,
"learning_rate": 4.9826080294498615e-06,
"loss": 0.0087,
"num_tokens": 114572748.0,
"reward": 0.2369791732635349,
"reward_std": 0.12557925086002797,
"rewards/pure_accuracy_reward_math": 0.2369791673263535,
"step": 421
},
{
"clip_ratio": 0.0006102707594664025,
"epoch": 0.281898615859358,
"grad_norm": 0.06671704351902008,
"kl": 0.01644277572631836,
"learning_rate": 4.98197061290635e-06,
"loss": 0.0085,
"step": 422
},
{
"clip_ratio": 0.0008330631824264856,
"epoch": 0.28353756130040075,
"grad_norm": 0.06136437505483627,
"kl": 0.016006946563720703,
"learning_rate": 4.981321767209477e-06,
"loss": 0.0082,
"step": 423
},
{
"clip_ratio": 0.0008570863296881726,
"epoch": 0.28517650674144357,
"grad_norm": 0.05813751742243767,
"kl": 0.015911102294921875,
"learning_rate": 4.980661495347092e-06,
"loss": 0.0078,
"step": 424
},
{
"clip_ratio": 0.0007138608199284135,
"epoch": 0.28681545218248633,
"grad_norm": 0.055987462401390076,
"kl": 0.016295433044433594,
"learning_rate": 4.979989800359661e-06,
"loss": 0.0074,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 581.0205252170563,
"epoch": 0.2884543976235291,
"grad_norm": 0.07705598324537277,
"kl": 0.016697406768798828,
"learning_rate": 4.9793066853402535e-06,
"loss": 0.0104,
"num_tokens": 117848783.0,
"reward": 0.2561849042249378,
"reward_std": 0.13690236682305112,
"rewards/pure_accuracy_reward_math": 0.2561848958430346,
"step": 426
},
{
"clip_ratio": 0.0006357220343033987,
"epoch": 0.2900933430645719,
"grad_norm": 0.06893625855445862,
"kl": 0.016409873962402344,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0102,
"step": 427
},
{
"clip_ratio": 0.0008826974139992672,
"epoch": 0.29173228850561467,
"grad_norm": 0.06487911939620972,
"kl": 0.01578235626220703,
"learning_rate": 4.977906207840708e-06,
"loss": 0.0099,
"step": 428
},
{
"clip_ratio": 0.0009138368169487876,
"epoch": 0.29337123394665743,
"grad_norm": 0.05983469635248184,
"kl": 0.015604972839355469,
"learning_rate": 4.9771888518095855e-06,
"loss": 0.0094,
"step": 429
},
{
"clip_ratio": 0.0008240164653443571,
"epoch": 0.29501017938770024,
"grad_norm": 0.05934643745422363,
"kl": 0.016060352325439453,
"learning_rate": 4.976460088644493e-06,
"loss": 0.009,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 572.7008640766144,
"epoch": 0.296649124828743,
"grad_norm": 0.10878130793571472,
"kl": 0.01746988296508789,
"learning_rate": 4.9757199217012884e-06,
"loss": 0.012,
"num_tokens": 121093860.0,
"reward": 0.25097657056176104,
"reward_std": 0.13550679641775787,
"rewards/pure_accuracy_reward_math": 0.25097656264551915,
"step": 431
},
{
"clip_ratio": 0.00068632745751529,
"epoch": 0.29828807026978577,
"grad_norm": 0.0702020674943924,
"kl": 0.017047405242919922,
"learning_rate": 4.974968354388346e-06,
"loss": 0.0118,
"step": 432
},
{
"clip_ratio": 0.0008000754407930799,
"epoch": 0.2999270157108286,
"grad_norm": 0.06406186521053314,
"kl": 0.016495227813720703,
"learning_rate": 4.974205390166535e-06,
"loss": 0.0115,
"step": 433
},
{
"clip_ratio": 0.0008013431938707072,
"epoch": 0.30156596115187134,
"grad_norm": 4.406322956085205,
"kl": 0.020737171173095703,
"learning_rate": 4.973431032549207e-06,
"loss": 0.0112,
"step": 434
},
{
"clip_ratio": 0.0010278673380526016,
"epoch": 0.3032049065929141,
"grad_norm": 0.07802355289459229,
"kl": 0.016713619232177734,
"learning_rate": 4.9726452851021804e-06,
"loss": 0.0107,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 571.912127494812,
"epoch": 0.3048438520339569,
"grad_norm": 0.07019820809364319,
"kl": 0.01775836944580078,
"learning_rate": 4.971848151443718e-06,
"loss": 0.0087,
"num_tokens": 124344430.0,
"reward": 0.234049486432923,
"reward_std": 0.11882065865211189,
"rewards/pure_accuracy_reward_math": 0.2340494789823424,
"step": 436
},
{
"clip_ratio": 0.0009248623491657781,
"epoch": 0.3064827974749997,
"grad_norm": 0.07924344390630722,
"kl": 0.017708301544189453,
"learning_rate": 4.9710396352445175e-06,
"loss": 0.0086,
"step": 437
},
{
"clip_ratio": 0.0008092627095379612,
"epoch": 0.30812174291604244,
"grad_norm": 0.06455735862255096,
"kl": 0.01685619354248047,
"learning_rate": 4.970219740227693e-06,
"loss": 0.0082,
"step": 438
},
{
"clip_ratio": 0.0008741315057250176,
"epoch": 0.30976068835708526,
"grad_norm": 0.0737844780087471,
"kl": 0.016612529754638672,
"learning_rate": 4.969388470168754e-06,
"loss": 0.0078,
"step": 439
},
{
"clip_ratio": 0.0006731455727049251,
"epoch": 0.311399633798128,
"grad_norm": 0.061306241899728775,
"kl": 0.01703643798828125,
"learning_rate": 4.96854582889559e-06,
"loss": 0.0074,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 560.4661660194397,
"epoch": 0.3130385792391708,
"grad_norm": 0.07420651614665985,
"kl": 0.018575191497802734,
"learning_rate": 4.967691820288457e-06,
"loss": 0.0089,
"num_tokens": 127553878.0,
"reward": 0.24218750634463504,
"reward_std": 0.1318487230455503,
"rewards/pure_accuracy_reward_math": 0.24218749982537702,
"step": 441
},
{
"clip_ratio": 0.0007318889370253601,
"epoch": 0.3146775246802136,
"grad_norm": 0.0815000906586647,
"kl": 0.018791675567626953,
"learning_rate": 4.9668264482799535e-06,
"loss": 0.0087,
"step": 442
},
{
"clip_ratio": 0.0007262505477001469,
"epoch": 0.31631647012125635,
"grad_norm": 0.06461174786090851,
"kl": 0.01784658432006836,
"learning_rate": 4.965949716855006e-06,
"loss": 0.0083,
"step": 443
},
{
"clip_ratio": 0.001082696728190058,
"epoch": 0.3179554155622991,
"grad_norm": 0.0798153281211853,
"kl": 0.017561912536621094,
"learning_rate": 4.965061630050848e-06,
"loss": 0.0079,
"step": 444
},
{
"clip_ratio": 0.0007142422628589884,
"epoch": 0.31959436100334193,
"grad_norm": 0.05629098415374756,
"kl": 0.018102645874023438,
"learning_rate": 4.9641621919570045e-06,
"loss": 0.0074,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 573.8131718635559,
"epoch": 0.3212333064443847,
"grad_norm": 0.07845748215913773,
"kl": 0.019147872924804688,
"learning_rate": 4.963251406715272e-06,
"loss": 0.0121,
"num_tokens": 130808444.0,
"reward": 0.23372396570630372,
"reward_std": 0.1444127168506384,
"rewards/pure_accuracy_reward_math": 0.23372395883779973,
"step": 446
},
{
"clip_ratio": 0.0008133034913271331,
"epoch": 0.32287225188542745,
"grad_norm": 0.0820281058549881,
"kl": 0.018957138061523438,
"learning_rate": 4.9623292785197e-06,
"loss": 0.012,
"step": 447
},
{
"clip_ratio": 0.0009565782518166088,
"epoch": 0.32451119732647027,
"grad_norm": 0.06929846853017807,
"kl": 0.01794910430908203,
"learning_rate": 4.961395811616567e-06,
"loss": 0.0115,
"step": 448
},
{
"clip_ratio": 0.0012002166286038118,
"epoch": 0.32615014276751303,
"grad_norm": 0.08353662490844727,
"kl": 0.017772197723388672,
"learning_rate": 4.960451010304368e-06,
"loss": 0.0111,
"step": 449
},
{
"clip_ratio": 0.0008637306386845012,
"epoch": 0.3277890882085558,
"grad_norm": 0.059568535536527634,
"kl": 0.01876544952392578,
"learning_rate": 4.959494878933792e-06,
"loss": 0.0105,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 562.3017749786377,
"epoch": 0.3294280336495986,
"grad_norm": 0.08181121200323105,
"kl": 0.020171165466308594,
"learning_rate": 4.958527421907697e-06,
"loss": 0.0075,
"num_tokens": 134024963.0,
"reward": 0.2363281317811925,
"reward_std": 0.12392827571602538,
"rewards/pure_accuracy_reward_math": 0.23632812607684173,
"step": 451
},
{
"clip_ratio": 0.0007134260189332053,
"epoch": 0.33106697909064137,
"grad_norm": 0.17417100071907043,
"kl": 0.019116878509521484,
"learning_rate": 4.957548643681102e-06,
"loss": 0.0076,
"step": 452
},
{
"clip_ratio": 0.000948693925124644,
"epoch": 0.3327059245316841,
"grad_norm": 10.566765785217285,
"kl": 0.22874164581298828,
"learning_rate": 4.95655854876115e-06,
"loss": 0.0154,
"step": 453
},
{
"clip_ratio": 0.001509593688069799,
"epoch": 0.33434486997272694,
"grad_norm": 0.37215539813041687,
"kl": 0.024587154388427734,
"learning_rate": 4.955557141707102e-06,
"loss": 0.0071,
"step": 454
},
{
"clip_ratio": 0.0016867685367287777,
"epoch": 0.3359838154137697,
"grad_norm": 0.11960741132497787,
"kl": 0.018782615661621094,
"learning_rate": 4.954544427130308e-06,
"loss": 0.0071,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 559.3082876205444,
"epoch": 0.33762276085481246,
"grad_norm": 0.07540658116340637,
"kl": 0.018846988677978516,
"learning_rate": 4.953520409694186e-06,
"loss": 0.0064,
"num_tokens": 137230282.0,
"reward": 0.23795573617098853,
"reward_std": 0.12567996798316017,
"rewards/pure_accuracy_reward_math": 0.2379557301173918,
"step": 456
},
{
"clip_ratio": 0.0007552293965318313,
"epoch": 0.3392617062958553,
"grad_norm": 0.08643142879009247,
"kl": 0.019680500030517578,
"learning_rate": 4.9524850941142045e-06,
"loss": 0.0063,
"step": 457
},
{
"clip_ratio": 0.0008370072589514166,
"epoch": 0.34090065173689804,
"grad_norm": 0.07104479521512985,
"kl": 0.01837015151977539,
"learning_rate": 4.951438485157858e-06,
"loss": 0.0059,
"step": 458
},
{
"clip_ratio": 0.0012107024614920192,
"epoch": 0.3425395971779408,
"grad_norm": 0.07779641449451447,
"kl": 0.017088890075683594,
"learning_rate": 4.950380587644645e-06,
"loss": 0.0055,
"step": 459
},
{
"clip_ratio": 0.001106619913571194,
"epoch": 0.3441785426189836,
"grad_norm": 0.07811883091926575,
"kl": 0.017291545867919922,
"learning_rate": 4.949311406446047e-06,
"loss": 0.005,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 571.5631704330444,
"epoch": 0.3458174880600264,
"grad_norm": 0.07503899931907654,
"kl": 0.018761634826660156,
"learning_rate": 4.948230946485504e-06,
"loss": 0.0099,
"num_tokens": 140468056.0,
"reward": 0.23144531933940016,
"reward_std": 0.1391576409805566,
"rewards/pure_accuracy_reward_math": 0.2314453127037268,
"step": 461
},
{
"clip_ratio": 0.0005579163533582232,
"epoch": 0.34745643350106914,
"grad_norm": 0.07867737859487534,
"kl": 0.019515037536621094,
"learning_rate": 4.947139212738395e-06,
"loss": 0.0097,
"step": 462
},
{
"clip_ratio": 0.0005302657258994259,
"epoch": 0.34909537894211196,
"grad_norm": 0.06822054833173752,
"kl": 0.018963336944580078,
"learning_rate": 4.946036210232013e-06,
"loss": 0.0093,
"step": 463
},
{
"clip_ratio": 0.0007419603928155993,
"epoch": 0.3507343243831547,
"grad_norm": 0.06452897191047668,
"kl": 0.017910480499267578,
"learning_rate": 4.9449219440455406e-06,
"loss": 0.0089,
"step": 464
},
{
"clip_ratio": 0.0009656345523580967,
"epoch": 0.3523732698241975,
"grad_norm": 0.06394355744123459,
"kl": 0.017592430114746094,
"learning_rate": 4.94379641931003e-06,
"loss": 0.0084,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 550.8063335418701,
"epoch": 0.3540122152652403,
"grad_norm": 0.07220590114593506,
"kl": 0.019676685333251953,
"learning_rate": 4.9426596412083775e-06,
"loss": 0.0073,
"num_tokens": 143643633.0,
"reward": 0.2643229244858958,
"reward_std": 0.129740908567328,
"rewards/pure_accuracy_reward_math": 0.26432291738456115,
"step": 466
},
{
"clip_ratio": 0.0004138159448530132,
"epoch": 0.35565116070628305,
"grad_norm": 0.06854696571826935,
"kl": 0.019529342651367188,
"learning_rate": 4.9415116149752975e-06,
"loss": 0.0071,
"step": 467
},
{
"clip_ratio": 0.0006204222647170354,
"epoch": 0.3572901061473258,
"grad_norm": 0.0629592314362526,
"kl": 0.01886892318725586,
"learning_rate": 4.940352345897304e-06,
"loss": 0.0068,
"step": 468
},
{
"clip_ratio": 0.0008853772396264503,
"epoch": 0.35892905158836863,
"grad_norm": 0.07459286600351334,
"kl": 0.018596172332763672,
"learning_rate": 4.93918183931268e-06,
"loss": 0.0064,
"step": 469
},
{
"clip_ratio": 0.0006601580334404389,
"epoch": 0.3605679970294114,
"grad_norm": 0.06547638773918152,
"kl": 0.019172191619873047,
"learning_rate": 4.938000100611456e-06,
"loss": 0.0059,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 563.5420064926147,
"epoch": 0.36220694247045415,
"grad_norm": 0.07862798869609833,
"kl": 0.019073963165283203,
"learning_rate": 4.936807135235389e-06,
"loss": 0.0082,
"num_tokens": 146856798.0,
"reward": 0.24414063114090823,
"reward_std": 0.13599591748788953,
"rewards/pure_accuracy_reward_math": 0.2441406262514647,
"step": 471
},
{
"clip_ratio": 0.000502235996748368,
"epoch": 0.36384588791149697,
"grad_norm": 0.07364361733198166,
"kl": 0.018817424774169922,
"learning_rate": 4.935602948677925e-06,
"loss": 0.008,
"step": 472
},
{
"clip_ratio": 0.0008022733071584298,
"epoch": 0.36548483335253973,
"grad_norm": 0.0682106539607048,
"kl": 0.018253803253173828,
"learning_rate": 4.934387546484192e-06,
"loss": 0.0076,
"step": 473
},
{
"clip_ratio": 0.0009958607009821208,
"epoch": 0.3671237787935825,
"grad_norm": 0.06967198103666306,
"kl": 0.01818084716796875,
"learning_rate": 4.933160934250957e-06,
"loss": 0.0072,
"step": 474
},
{
"clip_ratio": 0.0007515698841871199,
"epoch": 0.3687627242346253,
"grad_norm": 0.05721515789628029,
"kl": 0.018873214721679688,
"learning_rate": 4.931923117626611e-06,
"loss": 0.0067,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 563.7535984516144,
"epoch": 0.37040166967566807,
"grad_norm": 0.07566659152507782,
"kl": 0.019544124603271484,
"learning_rate": 4.93067410231114e-06,
"loss": 0.0064,
"num_tokens": 150075101.0,
"reward": 0.25325521553168073,
"reward_std": 0.13043869246030226,
"rewards/pure_accuracy_reward_math": 0.25325520912883803,
"step": 476
},
{
"clip_ratio": 0.0004966380013229355,
"epoch": 0.3720406151167108,
"grad_norm": 0.0666096955537796,
"kl": 0.019169330596923828,
"learning_rate": 4.929413894056098e-06,
"loss": 0.0062,
"step": 477
},
{
"clip_ratio": 0.0010111166515116565,
"epoch": 0.37367956055775364,
"grad_norm": 0.07089894264936447,
"kl": 0.01856708526611328,
"learning_rate": 4.928142498664579e-06,
"loss": 0.0059,
"step": 478
},
{
"clip_ratio": 0.0009684021274551924,
"epoch": 0.3753185059987964,
"grad_norm": 0.07048792392015457,
"kl": 0.0184478759765625,
"learning_rate": 4.926859921991196e-06,
"loss": 0.0054,
"step": 479
},
{
"clip_ratio": 0.0007222936578727968,
"epoch": 0.37695745143983916,
"grad_norm": 0.0751122385263443,
"kl": 0.01910114288330078,
"learning_rate": 4.925566169942048e-06,
"loss": 0.0049,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 565.988950252533,
"epoch": 0.378596396880882,
"grad_norm": 0.07866821438074112,
"kl": 0.019292354583740234,
"learning_rate": 4.924261248474696e-06,
"loss": 0.0077,
"num_tokens": 153300683.0,
"reward": 0.24186198544339277,
"reward_std": 0.14245959254913032,
"rewards/pure_accuracy_reward_math": 0.2418619791569654,
"step": 481
},
{
"clip_ratio": 0.0005219346817284531,
"epoch": 0.38023534232192474,
"grad_norm": 0.07202895730733871,
"kl": 0.018962383270263672,
"learning_rate": 4.922945163598134e-06,
"loss": 0.0074,
"step": 482
},
{
"clip_ratio": 0.0007680588737457583,
"epoch": 0.3818742877629675,
"grad_norm": 0.06937456876039505,
"kl": 0.018546104431152344,
"learning_rate": 4.921617921372764e-06,
"loss": 0.0071,
"step": 483
},
{
"clip_ratio": 0.0008267570608495589,
"epoch": 0.3835132332040103,
"grad_norm": 0.06490996479988098,
"kl": 0.018600940704345703,
"learning_rate": 4.920279527910361e-06,
"loss": 0.0066,
"step": 484
},
{
"clip_ratio": 0.0007928038041882246,
"epoch": 0.3851521786450531,
"grad_norm": 0.06093154847621918,
"kl": 0.019063949584960938,
"learning_rate": 4.918929989374057e-06,
"loss": 0.006,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 559.4954574108124,
"epoch": 0.38679112408609584,
"grad_norm": 0.07598863542079926,
"kl": 0.020077228546142578,
"learning_rate": 4.917569311978301e-06,
"loss": 0.0076,
"num_tokens": 156505589.0,
"reward": 0.23372396451304667,
"reward_std": 0.12227730004815385,
"rewards/pure_accuracy_reward_math": 0.23372395892511122,
"step": 486
},
{
"clip_ratio": 0.0005095607535281488,
"epoch": 0.38843006952713865,
"grad_norm": 0.06875687837600708,
"kl": 0.019557952880859375,
"learning_rate": 4.916197501988836e-06,
"loss": 0.0073,
"step": 487
},
{
"clip_ratio": 0.0009137496642779297,
"epoch": 0.3900690149681814,
"grad_norm": 0.0720105841755867,
"kl": 0.018970012664794922,
"learning_rate": 4.914814565722671e-06,
"loss": 0.007,
"step": 488
},
{
"clip_ratio": 0.000769516089917488,
"epoch": 0.3917079604092242,
"grad_norm": 0.06368213146924973,
"kl": 0.019055843353271484,
"learning_rate": 4.913420509548047e-06,
"loss": 0.0065,
"step": 489
},
{
"clip_ratio": 0.000659781087506417,
"epoch": 0.393346905850267,
"grad_norm": 0.06218770891427994,
"kl": 0.019764423370361328,
"learning_rate": 4.912015339884412e-06,
"loss": 0.006,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 563.5508005619049,
"epoch": 0.39498585129130975,
"grad_norm": 0.07790997624397278,
"kl": 0.019940853118896484,
"learning_rate": 4.910599063202391e-06,
"loss": 0.0037,
"num_tokens": 159721585.0,
"reward": 0.25651042349636555,
"reward_std": 0.14637307275552303,
"rewards/pure_accuracy_reward_math": 0.25651041651144624,
"step": 491
},
{
"clip_ratio": 0.00052815272999851,
"epoch": 0.3966247967323525,
"grad_norm": 0.06944292038679123,
"kl": 0.019421100616455078,
"learning_rate": 4.9091716860237545e-06,
"loss": 0.0035,
"step": 492
},
{
"clip_ratio": 0.0008616803660288497,
"epoch": 0.39826374217339533,
"grad_norm": 0.0701906755566597,
"kl": 0.018842220306396484,
"learning_rate": 4.907733214921391e-06,
"loss": 0.0031,
"step": 493
},
{
"clip_ratio": 0.0009441319247116553,
"epoch": 0.3999026876144381,
"grad_norm": 0.06845781207084656,
"kl": 0.018959999084472656,
"learning_rate": 4.906283656519271e-06,
"loss": 0.0026,
"step": 494
},
{
"clip_ratio": 0.0007721009840224724,
"epoch": 0.40154163305548085,
"grad_norm": 0.06344389170408249,
"kl": 0.019802570343017578,
"learning_rate": 4.904823017492425e-06,
"loss": 0.002,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 564.9492375850677,
"epoch": 0.40318057849652367,
"grad_norm": 0.07819291949272156,
"kl": 0.019987106323242188,
"learning_rate": 4.903351304566907e-06,
"loss": 0.0087,
"num_tokens": 162944373.0,
"reward": 0.2613932383537758,
"reward_std": 0.14731903292704374,
"rewards/pure_accuracy_reward_math": 0.2613932291569654,
"step": 496
},
{
"clip_ratio": 0.0005454795893342634,
"epoch": 0.40481952393756643,
"grad_norm": 0.0714847669005394,
"kl": 0.019627094268798828,
"learning_rate": 4.9018685245197625e-06,
"loss": 0.0084,
"step": 497
},
{
"clip_ratio": 0.0009138085124504869,
"epoch": 0.4064584693786092,
"grad_norm": 0.07778745144605637,
"kl": 0.019023895263671875,
"learning_rate": 4.900374684179005e-06,
"loss": 0.008,
"step": 498
},
{
"clip_ratio": 0.0008562761545363173,
"epoch": 0.408097414819652,
"grad_norm": 0.07164430618286133,
"kl": 0.01940298080444336,
"learning_rate": 4.898869790423573e-06,
"loss": 0.0075,
"step": 499
},
{
"clip_ratio": 0.0008455659439050578,
"epoch": 0.40973636026069477,
"grad_norm": 0.07439333200454712,
"kl": 0.01979207992553711,
"learning_rate": 4.897353850183308e-06,
"loss": 0.007,
"step": 500
},
{
"epoch": 0.40973636026069477,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 6.6841,
"train_samples_per_second": 42069.596,
"train_steps_per_second": 54.756
}
],
"logging_steps": 1,
"max_steps": 366,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}