OpenRS-GRPO / trainer_state.json
sonicdog00's picture
Model save
d6c9c50 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.856898029134533,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 673.8611145019531,
"epoch": 0.001713796058269066,
"grad_norm": 0.6894496083259583,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0307,
"reward": 0.5902777910232544,
"reward_std": 0.3424043729901314,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.2569444514811039,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 663.3889007568359,
"epoch": 0.003427592116538132,
"grad_norm": 0.6292832493782043,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": -0.0039,
"reward": 0.4583333358168602,
"reward_std": 0.45602361112833023,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.2638888917863369,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 843.0972442626953,
"epoch": 0.005141388174807198,
"grad_norm": 0.5272438526153564,
"kl": 0.00014257431030273438,
"learning_rate": 6e-08,
"loss": 0.0375,
"reward": 0.5972222238779068,
"reward_std": 0.4886699207127094,
"rewards/accuracy_reward": 0.16666667256504297,
"rewards/format_reward": 0.2638888955116272,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 708.4166717529297,
"epoch": 0.006855184233076264,
"grad_norm": 0.59702068567276,
"kl": 0.0001093149185180664,
"learning_rate": 8e-08,
"loss": 0.0373,
"reward": 0.5069444477558136,
"reward_std": 0.563178788870573,
"rewards/accuracy_reward": 0.13888888992369175,
"rewards/format_reward": 0.2291666716337204,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 663.9722290039062,
"epoch": 0.00856898029134533,
"grad_norm": 0.8420807123184204,
"kl": 0.00011909008026123047,
"learning_rate": 1e-07,
"loss": 0.0547,
"reward": 0.666666679084301,
"reward_std": 0.5673302263021469,
"rewards/accuracy_reward": 0.22222222480922937,
"rewards/format_reward": 0.2222222238779068,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 630.0,
"epoch": 0.010282776349614395,
"grad_norm": 0.8917471170425415,
"kl": 0.00018858909606933594,
"learning_rate": 1.2e-07,
"loss": 0.0357,
"reward": 0.5833333358168602,
"reward_std": 0.4646867737174034,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.2500000037252903,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 650.5416870117188,
"epoch": 0.011996572407883462,
"grad_norm": 0.7703569531440735,
"kl": 8.26716423034668e-05,
"learning_rate": 1.4e-07,
"loss": -0.0085,
"reward": 0.4305555634200573,
"reward_std": 0.31282100826501846,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.3194444514811039,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 775.2500152587891,
"epoch": 0.013710368466152529,
"grad_norm": 0.7724463939666748,
"kl": 0.00013768672943115234,
"learning_rate": 1.6e-07,
"loss": 0.0749,
"reward": 0.3611111082136631,
"reward_std": 0.41373568773269653,
"rewards/accuracy_reward": 0.06944444589316845,
"rewards/format_reward": 0.2222222276031971,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 634.8472213745117,
"epoch": 0.015424164524421594,
"grad_norm": 0.5836902856826782,
"kl": 0.0001068115234375,
"learning_rate": 1.8e-07,
"loss": 0.0627,
"reward": 0.4861111156642437,
"reward_std": 0.46435344591736794,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.2361111119389534,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 768.4305572509766,
"epoch": 0.01713796058269066,
"grad_norm": 0.8060944676399231,
"kl": 0.0001360177993774414,
"learning_rate": 2e-07,
"loss": 0.0023,
"reward": 0.7847222089767456,
"reward_std": 0.46876538544893265,
"rewards/accuracy_reward": 0.291666672565043,
"rewards/format_reward": 0.2013888955116272,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 661.7638854980469,
"epoch": 0.018851756640959727,
"grad_norm": 0.8618626594543457,
"kl": 9.322166442871094e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0605,
"reward": 0.5763889029622078,
"reward_std": 0.5653917491436005,
"rewards/accuracy_reward": 0.15277778264135122,
"rewards/format_reward": 0.2708333358168602,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 847.4583435058594,
"epoch": 0.02056555269922879,
"grad_norm": 0.5977460741996765,
"kl": 0.0001245737075805664,
"learning_rate": 2.4e-07,
"loss": 0.0545,
"reward": 0.4583333395421505,
"reward_std": 0.4860685095191002,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.2638888917863369,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 687.4305725097656,
"epoch": 0.022279348757497857,
"grad_norm": 0.7761760354042053,
"kl": 0.00014710426330566406,
"learning_rate": 2.6e-07,
"loss": 0.0447,
"reward": 0.3888888955116272,
"reward_std": 0.4280589930713177,
"rewards/accuracy_reward": 0.06944444589316845,
"rewards/format_reward": 0.25,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 776.2916641235352,
"epoch": 0.023993144815766924,
"grad_norm": 1.0175061225891113,
"kl": 0.00014960765838623047,
"learning_rate": 2.8e-07,
"loss": 0.0222,
"reward": 0.611111119389534,
"reward_std": 0.48110663890838623,
"rewards/accuracy_reward": 0.1944444477558136,
"rewards/format_reward": 0.2222222238779068,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 808.4722290039062,
"epoch": 0.02570694087403599,
"grad_norm": 0.5965744853019714,
"kl": 0.0001302957534790039,
"learning_rate": 3e-07,
"loss": 0.0488,
"reward": 0.43055555410683155,
"reward_std": 0.3586147967725992,
"rewards/accuracy_reward": 0.1111111119389534,
"rewards/format_reward": 0.20833333767950535,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 577.4027862548828,
"epoch": 0.027420736932305057,
"grad_norm": 0.8511067032814026,
"kl": 0.00014138221740722656,
"learning_rate": 3.2e-07,
"loss": 0.0287,
"reward": 0.7916666716337204,
"reward_std": 0.7402771413326263,
"rewards/accuracy_reward": 0.2638888992369175,
"rewards/format_reward": 0.2638888922519982,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 693.2777786254883,
"epoch": 0.02913453299057412,
"grad_norm": 1.0029399394989014,
"kl": 0.00013744831085205078,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0047,
"reward": 0.7430555745959282,
"reward_std": 0.5347021222114563,
"rewards/accuracy_reward": 0.2777777863666415,
"rewards/format_reward": 0.18750000093132257,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 694.6666870117188,
"epoch": 0.030848329048843187,
"grad_norm": 0.929906964302063,
"kl": 0.0001380443572998047,
"learning_rate": 3.6e-07,
"loss": 0.1027,
"reward": 0.3194444477558136,
"reward_std": 0.3645694628357887,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.23611111752688885,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 837.4583587646484,
"epoch": 0.032562125107112254,
"grad_norm": 0.5381235480308533,
"kl": 0.00016641616821289062,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0715,
"reward": 0.5347222164273262,
"reward_std": 0.5603309497237206,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.2569444477558136,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 865.6805419921875,
"epoch": 0.03427592116538132,
"grad_norm": 0.6289723515510559,
"kl": 0.0001741647720336914,
"learning_rate": 4e-07,
"loss": -0.0123,
"reward": 0.3333333320915699,
"reward_std": 0.3452591709792614,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.1666666716337204,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 664.694450378418,
"epoch": 0.03598971722365039,
"grad_norm": 0.8022226691246033,
"kl": 0.00022649765014648438,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0523,
"reward": 0.5833333358168602,
"reward_std": 0.3963186591863632,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.3333333320915699,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 874.8055725097656,
"epoch": 0.037703513281919454,
"grad_norm": 0.6999794244766235,
"kl": 0.00022101402282714844,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0188,
"reward": 0.319444440305233,
"reward_std": 0.36139946803450584,
"rewards/accuracy_reward": 0.055555556900799274,
"rewards/format_reward": 0.2083333358168602,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 873.7361145019531,
"epoch": 0.03941730934018852,
"grad_norm": 0.5962035059928894,
"kl": 0.00021910667419433594,
"learning_rate": 4.6e-07,
"loss": 0.0391,
"reward": 0.611111119389534,
"reward_std": 0.31651007384061813,
"rewards/accuracy_reward": 0.15277778450399637,
"rewards/format_reward": 0.3055555522441864,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 691.0277862548828,
"epoch": 0.04113110539845758,
"grad_norm": 0.6845191121101379,
"kl": 0.0003848075866699219,
"learning_rate": 4.8e-07,
"loss": 0.0395,
"reward": 0.6388888955116272,
"reward_std": 0.44496994838118553,
"rewards/accuracy_reward": 0.18055556062608957,
"rewards/format_reward": 0.2777777835726738,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 606.1527862548828,
"epoch": 0.04284490145672665,
"grad_norm": 0.7037742733955383,
"kl": 0.0006976127624511719,
"learning_rate": 5e-07,
"loss": 0.0193,
"reward": 0.6875000149011612,
"reward_std": 0.49456192925572395,
"rewards/accuracy_reward": 0.16666666883975267,
"rewards/format_reward": 0.3541666716337204,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 615.1250152587891,
"epoch": 0.044558697514995714,
"grad_norm": 0.8331264853477478,
"kl": 0.0008344650268554688,
"learning_rate": 5.2e-07,
"loss": 0.0633,
"reward": 0.7152777686715126,
"reward_std": 0.2888181023299694,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.3263888880610466,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 814.9166717529297,
"epoch": 0.04627249357326478,
"grad_norm": 0.9271811246871948,
"kl": 0.0009531974792480469,
"learning_rate": 5.4e-07,
"loss": 0.056,
"reward": 0.3472222238779068,
"reward_std": 0.30904670804739,
"rewards/accuracy_reward": 0.02777777798473835,
"rewards/format_reward": 0.2916666716337204,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 509.0,
"epoch": 0.04798628963153385,
"grad_norm": 0.755454957485199,
"kl": 0.00131988525390625,
"learning_rate": 5.6e-07,
"loss": -0.0102,
"reward": 1.0277777835726738,
"reward_std": 0.6238088309764862,
"rewards/accuracy_reward": 0.31944445241242647,
"rewards/format_reward": 0.3888888955116272,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 634.9027786254883,
"epoch": 0.049700085689802914,
"grad_norm": 0.8711504936218262,
"kl": 0.0018901824951171875,
"learning_rate": 5.8e-07,
"loss": 0.039,
"reward": 0.5763888880610466,
"reward_std": 0.5907959416508675,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.2986111119389534,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 588.5833358764648,
"epoch": 0.05141388174807198,
"grad_norm": 0.6898062825202942,
"kl": 0.002155303955078125,
"learning_rate": 6e-07,
"loss": -0.014,
"reward": 0.493055559694767,
"reward_std": 0.41614027321338654,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.3541666641831398,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 582.9166717529297,
"epoch": 0.05312767780634105,
"grad_norm": 0.9486229419708252,
"kl": 0.002727508544921875,
"learning_rate": 6.2e-07,
"loss": 0.0461,
"reward": 0.6319444626569748,
"reward_std": 0.5628423318266869,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.381944440305233,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 674.3472290039062,
"epoch": 0.054841473864610114,
"grad_norm": 0.5769440531730652,
"kl": 0.00347137451171875,
"learning_rate": 6.4e-07,
"loss": 0.0924,
"reward": 0.6597222238779068,
"reward_std": 0.572759248316288,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.3819444477558136,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 606.5972213745117,
"epoch": 0.056555269922879174,
"grad_norm": 0.8302273154258728,
"kl": 0.004093170166015625,
"learning_rate": 6.6e-07,
"loss": -0.0183,
"reward": 0.6944444477558136,
"reward_std": 0.5897158365696669,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.3611111156642437,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 750.2639007568359,
"epoch": 0.05826906598114824,
"grad_norm": 0.8029855489730835,
"kl": 0.00673675537109375,
"learning_rate": 6.800000000000001e-07,
"loss": 0.1226,
"reward": 0.8194444477558136,
"reward_std": 0.5727398172020912,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.4305555522441864,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 748.4305572509766,
"epoch": 0.05998286203941731,
"grad_norm": 0.751978874206543,
"kl": 0.00594329833984375,
"learning_rate": 7e-07,
"loss": 0.1349,
"reward": 0.5000000074505806,
"reward_std": 0.3002382256090641,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.3888888880610466,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 589.0416793823242,
"epoch": 0.061696658097686374,
"grad_norm": 0.7630824446678162,
"kl": 0.0087432861328125,
"learning_rate": 7.2e-07,
"loss": 0.0179,
"reward": 0.6041666865348816,
"reward_std": 0.33818795159459114,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.3541666716337204,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 692.2500152587891,
"epoch": 0.06341045415595545,
"grad_norm": 0.5925531387329102,
"kl": 0.0101165771484375,
"learning_rate": 7.4e-07,
"loss": 0.0299,
"reward": 0.5138888955116272,
"reward_std": 0.3357668612152338,
"rewards/accuracy_reward": 0.055555556900799274,
"rewards/format_reward": 0.4027777835726738,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 792.1805725097656,
"epoch": 0.06512425021422451,
"grad_norm": 1.0705430507659912,
"kl": 0.01534271240234375,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0466,
"reward": 0.5000000074505806,
"reward_std": 0.41504133865237236,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.3888888880610466,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 699.8888854980469,
"epoch": 0.06683804627249357,
"grad_norm": 0.9300626516342163,
"kl": 0.01090240478515625,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0644,
"reward": 0.7847222238779068,
"reward_std": 0.4304216764867306,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.3680555522441864,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 575.0833206176758,
"epoch": 0.06855184233076264,
"grad_norm": 1.078953742980957,
"kl": 0.016937255859375,
"learning_rate": 8e-07,
"loss": 0.0414,
"reward": 0.9791666939854622,
"reward_std": 0.4737792070955038,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.4513888955116272,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 626.5694580078125,
"epoch": 0.0702656383890317,
"grad_norm": 0.6470286846160889,
"kl": 0.0107269287109375,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0883,
"reward": 0.6597222313284874,
"reward_std": 0.3953079264611006,
"rewards/accuracy_reward": 0.11111111380159855,
"rewards/format_reward": 0.4375,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 585.4166717529297,
"epoch": 0.07197943444730077,
"grad_norm": 2.4607300758361816,
"kl": 0.0234375,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0947,
"reward": 0.8263889104127884,
"reward_std": 0.5376365929841995,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.4375000074505806,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 705.0555572509766,
"epoch": 0.07369323050556983,
"grad_norm": 0.5106288194656372,
"kl": 0.0121002197265625,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0658,
"reward": 0.6736111342906952,
"reward_std": 0.4738336503505707,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.4236111119389534,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 754.8611145019531,
"epoch": 0.07540702656383891,
"grad_norm": 1.0860414505004883,
"kl": 0.01665496826171875,
"learning_rate": 8.799999999999999e-07,
"loss": -0.0223,
"reward": 0.7500000149011612,
"reward_std": 0.4002586603164673,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.4444444477558136,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 764.9583358764648,
"epoch": 0.07712082262210797,
"grad_norm": 0.4417635202407837,
"kl": 0.010406494140625,
"learning_rate": 9e-07,
"loss": 0.0694,
"reward": 0.8125000223517418,
"reward_std": 0.3787213396281004,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.4513888955116272,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 524.1666717529297,
"epoch": 0.07883461868037704,
"grad_norm": 0.9410390853881836,
"kl": 0.013397216796875,
"learning_rate": 9.2e-07,
"loss": 0.008,
"reward": 0.6875,
"reward_std": 0.4043467417359352,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4375000074505806,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 608.4305725097656,
"epoch": 0.0805484147386461,
"grad_norm": 0.915087878704071,
"kl": 0.014129638671875,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0755,
"reward": 0.6527777910232544,
"reward_std": 0.49068866297602654,
"rewards/accuracy_reward": 0.11111111380159855,
"rewards/format_reward": 0.4305555671453476,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 592.1250076293945,
"epoch": 0.08226221079691516,
"grad_norm": 0.7676656246185303,
"kl": 0.0137786865234375,
"learning_rate": 9.6e-07,
"loss": 0.0019,
"reward": 0.7291666641831398,
"reward_std": 0.3599853590130806,
"rewards/accuracy_reward": 0.1527777798473835,
"rewards/format_reward": 0.4236111119389534,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 472.7777862548828,
"epoch": 0.08397600685518423,
"grad_norm": 0.8259297609329224,
"kl": 0.01544189453125,
"learning_rate": 9.8e-07,
"loss": 0.0269,
"reward": 0.8958333358168602,
"reward_std": 0.5085582789033651,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.479166679084301,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 522.7639007568359,
"epoch": 0.0856898029134533,
"grad_norm": 7704.1875,
"kl": 2.003662109375,
"learning_rate": 1e-06,
"loss": 0.0621,
"reward": 0.5069444477558136,
"reward_std": 0.11907241865992546,
"rewards/accuracy_reward": 0.013888888992369175,
"rewards/format_reward": 0.4791666716337204,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 569.9583435058594,
"epoch": 0.08740359897172237,
"grad_norm": 0.9847874045372009,
"kl": 0.01806640625,
"learning_rate": 9.999890338174275e-07,
"loss": -0.0313,
"reward": 1.0069444626569748,
"reward_std": 0.6746698617935181,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/format_reward": 0.4236111268401146,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 851.2639007568359,
"epoch": 0.08911739502999143,
"grad_norm": 0.4385327994823456,
"kl": 0.0133514404296875,
"learning_rate": 9.999561358041868e-07,
"loss": -0.007,
"reward": 0.7708333507180214,
"reward_std": 0.3091294076293707,
"rewards/accuracy_reward": 0.1527777835726738,
"rewards/format_reward": 0.4652777910232544,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 672.9305572509766,
"epoch": 0.0908311910882605,
"grad_norm": 0.6572834849357605,
"kl": 0.0176544189453125,
"learning_rate": 9.999013075636804e-07,
"loss": -0.0003,
"reward": 0.673611119389534,
"reward_std": 0.3502005450427532,
"rewards/accuracy_reward": 0.1111111156642437,
"rewards/format_reward": 0.4513888880610466,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 668.2361221313477,
"epoch": 0.09254498714652956,
"grad_norm": 0.5134143233299255,
"kl": 0.0155029296875,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0325,
"reward": 0.6944444552063942,
"reward_std": 0.36250423453748226,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.4722222313284874,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 767.4722137451172,
"epoch": 0.09425878320479864,
"grad_norm": 0.5993247628211975,
"kl": 0.018310546875,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0402,
"reward": 0.7222222238779068,
"reward_std": 0.4854987859725952,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.4722222238779068,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 582.5555572509766,
"epoch": 0.0959725792630677,
"grad_norm": 0.6086099147796631,
"kl": 0.019317626953125,
"learning_rate": 9.996052735444862e-07,
"loss": -0.0089,
"reward": 0.8958333432674408,
"reward_std": 0.5041182190179825,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.4791666716337204,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 666.3611297607422,
"epoch": 0.09768637532133675,
"grad_norm": 0.3199848532676697,
"kl": 0.0146942138671875,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0068,
"reward": 0.6180555671453476,
"reward_std": 0.10077410563826561,
"rewards/accuracy_reward": 0.0694444477558136,
"rewards/format_reward": 0.479166679084301,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 535.6944351196289,
"epoch": 0.09940017137960583,
"grad_norm": 1.065861701965332,
"kl": 0.02264404296875,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0328,
"reward": 0.5972222164273262,
"reward_std": 0.3061862140893936,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.486111119389534,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 645.0833435058594,
"epoch": 0.10111396743787489,
"grad_norm": 0.8223654627799988,
"kl": 0.017425537109375,
"learning_rate": 9.991120277927223e-07,
"loss": -0.0678,
"reward": 0.9791666716337204,
"reward_std": 0.7214617803692818,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.4791666716337204,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 758.6388854980469,
"epoch": 0.10282776349614396,
"grad_norm": 0.559778094291687,
"kl": 0.0177154541015625,
"learning_rate": 9.989038226169207e-07,
"loss": -0.0276,
"reward": 0.7222222238779068,
"reward_std": 0.30821535736322403,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.5,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 653.9860992431641,
"epoch": 0.10454155955441302,
"grad_norm": 0.4272245168685913,
"kl": 0.02130126953125,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0077,
"reward": 0.6319444477558136,
"reward_std": 0.17111802101135254,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.493055559694767,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 746.0694580078125,
"epoch": 0.1062553556126821,
"grad_norm": 0.7891212105751038,
"kl": 0.0184478759765625,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0491,
"reward": 0.8263888955116272,
"reward_std": 0.3267286717891693,
"rewards/accuracy_reward": 0.18055556062608957,
"rewards/format_reward": 0.4652777761220932,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 677.9722137451172,
"epoch": 0.10796915167095116,
"grad_norm": 0.4481966495513916,
"kl": 0.016571044921875,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0352,
"reward": 0.6250000074505806,
"reward_std": 0.2613905593752861,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.4861111119389534,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 650.25,
"epoch": 0.10968294772922023,
"grad_norm": 0.484955370426178,
"kl": 0.0130157470703125,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0549,
"reward": 0.6875000149011612,
"reward_std": 0.21065950952470303,
"rewards/accuracy_reward": 0.1111111119389534,
"rewards/format_reward": 0.4652777835726738,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 718.8333435058594,
"epoch": 0.11139674378748929,
"grad_norm": 0.6917220950126648,
"kl": 0.0135955810546875,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0278,
"reward": 0.7083333432674408,
"reward_std": 0.5276868715882301,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4583333358168602,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 724.4861221313477,
"epoch": 0.11311053984575835,
"grad_norm": 0.6181950569152832,
"kl": 0.009735107421875,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0776,
"reward": 0.7916666865348816,
"reward_std": 0.36798322945833206,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4583333358168602,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 645.2639007568359,
"epoch": 0.11482433590402742,
"grad_norm": 0.47965365648269653,
"kl": 0.0118865966796875,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0211,
"reward": 0.6805555522441864,
"reward_std": 0.29340869560837746,
"rewards/accuracy_reward": 0.1111111156642437,
"rewards/format_reward": 0.4583333358168602,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 833.1944427490234,
"epoch": 0.11653813196229648,
"grad_norm": 0.5236086249351501,
"kl": 0.01050567626953125,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0772,
"reward": 0.8611111268401146,
"reward_std": 0.3695474322885275,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.4444444552063942,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 614.4027709960938,
"epoch": 0.11825192802056556,
"grad_norm": 2.3860788345336914,
"kl": 0.0256195068359375,
"learning_rate": 9.960469931131936e-07,
"loss": 0.0186,
"reward": 0.7222222462296486,
"reward_std": 0.4142109379172325,
"rewards/accuracy_reward": 0.13888888992369175,
"rewards/format_reward": 0.4444444552063942,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 731.9722290039062,
"epoch": 0.11996572407883462,
"grad_norm": 0.7797493934631348,
"kl": 0.01611328125,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0554,
"reward": 0.8125000074505806,
"reward_std": 0.4372703805565834,
"rewards/accuracy_reward": 0.1944444477558136,
"rewards/format_reward": 0.4236111119389534,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 663.3055725097656,
"epoch": 0.12167952013710369,
"grad_norm": 0.6972033977508545,
"kl": 0.011383056640625,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0371,
"reward": 0.9583333432674408,
"reward_std": 0.5939657315611839,
"rewards/accuracy_reward": 0.23611112032085657,
"rewards/format_reward": 0.4861111119389534,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 533.8472290039062,
"epoch": 0.12339331619537275,
"grad_norm": 0.628699541091919,
"kl": 0.0160675048828125,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0075,
"reward": 0.798611119389534,
"reward_std": 0.34056369215250015,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.4652777761220932,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 725.944450378418,
"epoch": 0.12510711225364182,
"grad_norm": 0.5667747855186462,
"kl": 0.0106201171875,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0505,
"reward": 0.8611111342906952,
"reward_std": 0.3817775323987007,
"rewards/accuracy_reward": 0.1944444477558136,
"rewards/format_reward": 0.4722222313284874,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 596.125,
"epoch": 0.1268209083119109,
"grad_norm": 0.8628817796707153,
"kl": 0.0158233642578125,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0773,
"reward": 0.923611119389534,
"reward_std": 0.4358247146010399,
"rewards/accuracy_reward": 0.22222222574055195,
"rewards/format_reward": 0.4791666716337204,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 812.0555572509766,
"epoch": 0.12853470437017994,
"grad_norm": 0.41737478971481323,
"kl": 0.0092620849609375,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0076,
"reward": 0.7500000149011612,
"reward_std": 0.2785004451870918,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.4722222238779068,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 508.01390075683594,
"epoch": 0.13024850042844902,
"grad_norm": 0.772993803024292,
"kl": 0.0087738037109375,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0171,
"reward": 1.0069444477558136,
"reward_std": 0.6597441658377647,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.4791666716337204,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 746.2500152587891,
"epoch": 0.1319622964867181,
"grad_norm": 0.4772380292415619,
"kl": 0.00911712646484375,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0168,
"reward": 0.9166666567325592,
"reward_std": 0.6034458577632904,
"rewards/accuracy_reward": 0.22222222667187452,
"rewards/format_reward": 0.4722222313284874,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 952.3888854980469,
"epoch": 0.13367609254498714,
"grad_norm": 0.49596795439720154,
"kl": 0.0111846923828125,
"learning_rate": 9.91429819907136e-07,
"loss": 0.033,
"reward": 0.6180555745959282,
"reward_std": 0.34004957228899,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.423611119389534,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 533.5555725097656,
"epoch": 0.1353898886032562,
"grad_norm": 0.5074121952056885,
"kl": 0.011474609375,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0007,
"reward": 0.5902777835726738,
"reward_std": 0.2606759797781706,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.4791666716337204,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 704.5833282470703,
"epoch": 0.13710368466152528,
"grad_norm": 0.45615482330322266,
"kl": 0.00933074951171875,
"learning_rate": 9.901664203302124e-07,
"loss": 0.1073,
"reward": 0.6111111119389534,
"reward_std": 0.3000170197337866,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.4444444552063942,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 709.5694580078125,
"epoch": 0.13881748071979436,
"grad_norm": 0.440580815076828,
"kl": 0.0095977783203125,
"learning_rate": 9.895025252503755e-07,
"loss": -0.0114,
"reward": 1.0347222536802292,
"reward_std": 0.26498175598680973,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.4513888955116272,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 652.4166717529297,
"epoch": 0.1405312767780634,
"grad_norm": 0.4756879210472107,
"kl": 0.00921630859375,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0445,
"reward": 0.7638888955116272,
"reward_std": 0.37851114571094513,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.4583333358168602,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 585.6250076293945,
"epoch": 0.14224507283633248,
"grad_norm": 0.9698956608772278,
"kl": 0.0133819580078125,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0097,
"reward": 0.652777798473835,
"reward_std": 0.37932526133954525,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.4583333358168602,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 683.9166564941406,
"epoch": 0.14395886889460155,
"grad_norm": 0.4502439796924591,
"kl": 0.0113525390625,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0209,
"reward": 0.8125000074505806,
"reward_std": 0.421932702884078,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.4791666716337204,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 719.1388854980469,
"epoch": 0.1456726649528706,
"grad_norm": 0.6199227571487427,
"kl": 0.01055908203125,
"learning_rate": 9.866330768241983e-07,
"loss": -0.0255,
"reward": 0.5694444626569748,
"reward_std": 0.33752935379743576,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.4583333358168602,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 678.3611221313477,
"epoch": 0.14738646101113967,
"grad_norm": 0.8055247068405151,
"kl": 0.0143890380859375,
"learning_rate": 9.85862422507884e-07,
"loss": -0.0329,
"reward": 0.972222238779068,
"reward_std": 0.5578342527151108,
"rewards/accuracy_reward": 0.2500000046566129,
"rewards/format_reward": 0.4722222238779068,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 870.9166870117188,
"epoch": 0.14910025706940874,
"grad_norm": 0.7032153010368347,
"kl": 0.015716552734375,
"learning_rate": 9.850705248720068e-07,
"loss": 0.1143,
"reward": 0.7083333432674408,
"reward_std": 0.29449621587991714,
"rewards/accuracy_reward": 0.11111111473292112,
"rewards/format_reward": 0.4861111119389534,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 809.2361297607422,
"epoch": 0.15081405312767782,
"grad_norm": 0.6152629256248474,
"kl": 0.0126953125,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0084,
"reward": 0.777777798473835,
"reward_std": 0.45732562988996506,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.4722222313284874,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 596.125,
"epoch": 0.15252784918594686,
"grad_norm": 0.4119075536727905,
"kl": 0.0131683349609375,
"learning_rate": 9.83423155058946e-07,
"loss": 0.006,
"reward": 0.7569444552063942,
"reward_std": 0.37087361328303814,
"rewards/accuracy_reward": 0.13888889364898205,
"rewards/format_reward": 0.479166679084301,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 719.9861221313477,
"epoch": 0.15424164524421594,
"grad_norm": 0.6751371622085571,
"kl": 0.01473236083984375,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0284,
"reward": 0.6805555671453476,
"reward_std": 0.2974403705447912,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.486111119389534,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 676.0000152587891,
"epoch": 0.155955441302485,
"grad_norm": 0.4955655634403229,
"kl": 0.00868988037109375,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0648,
"reward": 0.8750000149011612,
"reward_std": 0.38056252896785736,
"rewards/accuracy_reward": 0.19444445054978132,
"rewards/format_reward": 0.486111119389534,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 685.569450378418,
"epoch": 0.15766923736075408,
"grad_norm": 0.6032452583312988,
"kl": 0.0105133056640625,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0959,
"reward": 1.0208333432674408,
"reward_std": 0.41651279479265213,
"rewards/accuracy_reward": 0.2638888992369175,
"rewards/format_reward": 0.493055559694767,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 552.9722290039062,
"epoch": 0.15938303341902313,
"grad_norm": 0.5042760968208313,
"kl": 0.010833740234375,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0277,
"reward": 0.5972222089767456,
"reward_std": 0.25616975128650665,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.486111119389534,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 613.8750076293945,
"epoch": 0.1610968294772922,
"grad_norm": 1.230424165725708,
"kl": 0.01520538330078125,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0236,
"reward": 0.7291666716337204,
"reward_std": 0.3527771979570389,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.4791666716337204,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 667.2500152587891,
"epoch": 0.16281062553556128,
"grad_norm": 0.874912440776825,
"kl": 0.014312744140625,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0369,
"reward": 0.8194444477558136,
"reward_std": 0.24447975307703018,
"rewards/accuracy_reward": 0.16666666883975267,
"rewards/format_reward": 0.486111119389534,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 696.3333435058594,
"epoch": 0.16452442159383032,
"grad_norm": 0.4739597737789154,
"kl": 0.00701904296875,
"learning_rate": 9.769942052400235e-07,
"loss": -0.0386,
"reward": 0.7152777761220932,
"reward_std": 0.4169478937983513,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4652777761220932,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 700.5833435058594,
"epoch": 0.1662382176520994,
"grad_norm": 0.5326427817344666,
"kl": 0.0096435546875,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0463,
"reward": 0.6458333507180214,
"reward_std": 0.29642581194639206,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.479166679084301,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 687.944450378418,
"epoch": 0.16795201371036847,
"grad_norm": 0.41333743929862976,
"kl": 0.0089111328125,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0199,
"reward": 0.7916666716337204,
"reward_std": 0.32597118616104126,
"rewards/accuracy_reward": 0.15277778450399637,
"rewards/format_reward": 0.4861111119389534,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 647.2638854980469,
"epoch": 0.16966580976863754,
"grad_norm": 0.4722951054573059,
"kl": 0.0088348388671875,
"learning_rate": 9.739258537542835e-07,
"loss": 0.036,
"reward": 0.7291666567325592,
"reward_std": 0.43501005321741104,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4791666641831398,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 610.4722213745117,
"epoch": 0.1713796058269066,
"grad_norm": 0.4890177547931671,
"kl": 0.0101318359375,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0005,
"reward": 0.8333333283662796,
"reward_std": 0.4303314909338951,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.5,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 803.6388702392578,
"epoch": 0.17309340188517566,
"grad_norm": 0.5259881019592285,
"kl": 0.0117645263671875,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0197,
"reward": 0.791666679084301,
"reward_std": 0.39858745597302914,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/format_reward": 0.4583333432674408,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 725.625,
"epoch": 0.17480719794344474,
"grad_norm": 0.565196692943573,
"kl": 0.0084381103515625,
"learning_rate": 9.706715543782064e-07,
"loss": -0.0314,
"reward": 0.7430555671453476,
"reward_std": 0.4483235850930214,
"rewards/accuracy_reward": 0.13888888992369175,
"rewards/format_reward": 0.4652777835726738,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 685.8611145019531,
"epoch": 0.17652099400171378,
"grad_norm": 0.30295926332473755,
"kl": 0.010772705078125,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0335,
"reward": 1.0208333283662796,
"reward_std": 0.31875650584697723,
"rewards/accuracy_reward": 0.2638888917863369,
"rewards/format_reward": 0.493055559694767,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 763.1389007568359,
"epoch": 0.17823479005998286,
"grad_norm": 0.45252788066864014,
"kl": 0.01111602783203125,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0369,
"reward": 0.5069444477558136,
"reward_std": 0.11907241307199001,
"rewards/accuracy_reward": 0.013888888992369175,
"rewards/format_reward": 0.479166679084301,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 673.9444580078125,
"epoch": 0.17994858611825193,
"grad_norm": 0.800912618637085,
"kl": 0.0160675048828125,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0112,
"reward": 0.7986111044883728,
"reward_std": 0.4165128022432327,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.493055559694767,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 660.4027862548828,
"epoch": 0.181662382176521,
"grad_norm": 0.5626226663589478,
"kl": 0.0115509033203125,
"learning_rate": 9.66045715125541e-07,
"loss": 0.003,
"reward": 0.7152777761220932,
"reward_std": 0.46130844950675964,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.493055559694767,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 648.5833435058594,
"epoch": 0.18337617823479005,
"grad_norm": 0.566942572593689,
"kl": 0.0107421875,
"learning_rate": 9.648384182148252e-07,
"loss": -0.0022,
"reward": 1.0833333432674408,
"reward_std": 0.41200654953718185,
"rewards/accuracy_reward": 0.3055555671453476,
"rewards/format_reward": 0.4722222238779068,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 681.1666717529297,
"epoch": 0.18508997429305912,
"grad_norm": 0.46719685196876526,
"kl": 0.0086517333984375,
"learning_rate": 9.636109026648554e-07,
"loss": 0.008,
"reward": 0.5277777835726738,
"reward_std": 0.19162002205848694,
"rewards/accuracy_reward": 0.02777777798473835,
"rewards/format_reward": 0.4722222238779068,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 628.8194580078125,
"epoch": 0.1868037703513282,
"grad_norm": 0.5135090351104736,
"kl": 0.011474609375,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0236,
"reward": 0.722222238779068,
"reward_std": 0.19795495830476284,
"rewards/accuracy_reward": 0.12500000279396772,
"rewards/format_reward": 0.4722222238779068,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 732.9305725097656,
"epoch": 0.18851756640959727,
"grad_norm": 0.6838110089302063,
"kl": 0.00803375244140625,
"learning_rate": 9.610954559391704e-07,
"loss": -0.0496,
"reward": 0.6666666567325592,
"reward_std": 0.44429811835289,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.4722222313284874,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 688.0555725097656,
"epoch": 0.19023136246786632,
"grad_norm": 0.47608956694602966,
"kl": 0.009918212890625,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0029,
"reward": 0.6874999850988388,
"reward_std": 0.2559359297156334,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.493055559694767,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 724.0139007568359,
"epoch": 0.1919451585261354,
"grad_norm": 0.34519946575164795,
"kl": 0.00824737548828125,
"learning_rate": 9.58499865339809e-07,
"loss": 0.011,
"reward": 0.6875,
"reward_std": 0.31875649094581604,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.493055559694767,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 731.2083435058594,
"epoch": 0.19365895458440446,
"grad_norm": 0.5883038640022278,
"kl": 0.00905609130859375,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0809,
"reward": 0.9236111044883728,
"reward_std": 0.5362608954310417,
"rewards/accuracy_reward": 0.22222222667187452,
"rewards/format_reward": 0.4791666716337204,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 585.2639007568359,
"epoch": 0.1953727506426735,
"grad_norm": 0.6275684237480164,
"kl": 0.0088043212890625,
"learning_rate": 9.55824636882301e-07,
"loss": -0.0049,
"reward": 0.6319444477558136,
"reward_std": 0.25718430429697037,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.493055559694767,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 509.3611145019531,
"epoch": 0.19708654670094258,
"grad_norm": 0.5656007528305054,
"kl": 0.0132293701171875,
"learning_rate": 9.54457320834625e-07,
"loss": -0.0101,
"reward": 0.7152777761220932,
"reward_std": 0.3016466051340103,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.493055559694767,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 882.1666564941406,
"epoch": 0.19880034275921166,
"grad_norm": 0.34537801146507263,
"kl": 0.0095062255859375,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0496,
"reward": 0.770833320915699,
"reward_std": 0.32522569596767426,
"rewards/accuracy_reward": 0.1388888917863369,
"rewards/format_reward": 0.493055559694767,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 612.0138931274414,
"epoch": 0.20051413881748073,
"grad_norm": 0.36981117725372314,
"kl": 0.0112152099609375,
"learning_rate": 9.516636183034564e-07,
"loss": -0.002,
"reward": 0.9097222089767456,
"reward_std": 0.31875649094581604,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.493055559694767,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 612.069450378418,
"epoch": 0.20222793487574978,
"grad_norm": 0.5904315710067749,
"kl": 0.0120697021484375,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0093,
"reward": 0.7361111044883728,
"reward_std": 0.34745684266090393,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.486111119389534,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 580.7361221313477,
"epoch": 0.20394173093401885,
"grad_norm": 0.6722186803817749,
"kl": 0.0094146728515625,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0115,
"reward": 1.0000000149011612,
"reward_std": 0.4072999134659767,
"rewards/accuracy_reward": 0.26388889644294977,
"rewards/format_reward": 0.4722222238779068,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 658.5277862548828,
"epoch": 0.20565552699228792,
"grad_norm": 0.5796108245849609,
"kl": 0.009918212890625,
"learning_rate": 9.473264167865171e-07,
"loss": -0.0049,
"reward": 0.798611119389534,
"reward_std": 0.5176598504185677,
"rewards/accuracy_reward": 0.15277778264135122,
"rewards/format_reward": 0.493055559694767,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 715.8750076293945,
"epoch": 0.207369323050557,
"grad_norm": 0.42695531249046326,
"kl": 0.0109405517578125,
"learning_rate": 9.458418577899774e-07,
"loss": -0.0034,
"reward": 0.9166666865348816,
"reward_std": 0.3762567415833473,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.5,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 681.5833435058594,
"epoch": 0.20908311910882604,
"grad_norm": 0.3627341389656067,
"kl": 0.0103912353515625,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0006,
"reward": 0.75,
"reward_std": 0.3134361356496811,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.5,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 732.1249923706055,
"epoch": 0.21079691516709512,
"grad_norm": 0.4604179263114929,
"kl": 0.0093231201171875,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0102,
"reward": 1.0208333432674408,
"reward_std": 0.3769379239529371,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.493055559694767,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 731.8472290039062,
"epoch": 0.2125107112253642,
"grad_norm": 0.48651084303855896,
"kl": 0.008941650390625,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0067,
"reward": 0.6875,
"reward_std": 0.3304464891552925,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.493055559694767,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 543.0972213745117,
"epoch": 0.21422450728363324,
"grad_norm": 0.9337839484214783,
"kl": 0.0222625732421875,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0038,
"reward": 0.9999999850988388,
"reward_std": 0.48787199705839157,
"rewards/accuracy_reward": 0.2638888908550143,
"rewards/format_reward": 0.4722222238779068,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 683.7639007568359,
"epoch": 0.2159383033419023,
"grad_norm": 0.5181577801704407,
"kl": 0.0086822509765625,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0582,
"reward": 0.7638889029622078,
"reward_std": 0.4154982175678015,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.486111119389534,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 701.3888854980469,
"epoch": 0.21765209940017138,
"grad_norm": 0.4001893401145935,
"kl": 0.01253509521484375,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0222,
"reward": 1.0,
"reward_std": 0.30821534991264343,
"rewards/accuracy_reward": 0.2500000046566129,
"rewards/format_reward": 0.5,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 776.9444351196289,
"epoch": 0.21936589545844046,
"grad_norm": 0.4341527819633484,
"kl": 0.01218414306640625,
"learning_rate": 9.34913917072228e-07,
"loss": -0.0215,
"reward": 0.9375,
"reward_std": 0.33678142726421356,
"rewards/accuracy_reward": 0.22222222574055195,
"rewards/format_reward": 0.493055559694767,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 656.3055725097656,
"epoch": 0.2210796915167095,
"grad_norm": 1.436949610710144,
"kl": 0.02315521240234375,
"learning_rate": 9.332771203643714e-07,
"loss": -0.0426,
"reward": 0.923611119389534,
"reward_std": 0.435094453394413,
"rewards/accuracy_reward": 0.22222222294658422,
"rewards/format_reward": 0.4791666716337204,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 562.3333511352539,
"epoch": 0.22279348757497858,
"grad_norm": 0.7487984895706177,
"kl": 0.0170440673828125,
"learning_rate": 9.316216432703916e-07,
"loss": -0.0057,
"reward": 1.3680555522441864,
"reward_std": 0.4477668162435293,
"rewards/accuracy_reward": 0.4444444477558136,
"rewards/format_reward": 0.479166679084301,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 862.7222442626953,
"epoch": 0.22450728363324765,
"grad_norm": 0.38078296184539795,
"kl": 0.01318359375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0882,
"reward": 0.7083333283662796,
"reward_std": 0.3526776432991028,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.486111119389534,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 602.3888854980469,
"epoch": 0.2262210796915167,
"grad_norm": 0.5312814712524414,
"kl": 0.015228271484375,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0104,
"reward": 0.5902777686715126,
"reward_std": 0.26067597232759,
"rewards/accuracy_reward": 0.055555556900799274,
"rewards/format_reward": 0.4791666716337204,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 644.4722290039062,
"epoch": 0.22793487574978577,
"grad_norm": 1.1302504539489746,
"kl": 0.022735595703125,
"learning_rate": 9.265439410565328e-07,
"loss": 0.062,
"reward": 0.8263889029622078,
"reward_std": 0.23915939591825008,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/format_reward": 0.493055559694767,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 684.6527709960938,
"epoch": 0.22964867180805484,
"grad_norm": 0.5044618844985962,
"kl": 0.0118408203125,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0375,
"reward": 0.9930555671453476,
"reward_std": 0.5024402439594269,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 604.6944427490234,
"epoch": 0.23136246786632392,
"grad_norm": 0.4981021285057068,
"kl": 0.0177459716796875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0255,
"reward": 0.965277761220932,
"reward_std": 0.3830488696694374,
"rewards/accuracy_reward": 0.23611111659556627,
"rewards/format_reward": 0.493055559694767,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 517.2777824401855,
"epoch": 0.23307626392459296,
"grad_norm": 1.7722994089126587,
"kl": 0.03118896484375,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0292,
"reward": 1.145833358168602,
"reward_std": 0.46534085273742676,
"rewards/accuracy_reward": 0.3333333320915699,
"rewards/format_reward": 0.4791666716337204,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 779.9444580078125,
"epoch": 0.23479005998286204,
"grad_norm": 0.5337279438972473,
"kl": 0.01290130615234375,
"learning_rate": 9.195171441101668e-07,
"loss": -0.0092,
"reward": 0.9583333507180214,
"reward_std": 0.39858742617070675,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.4583333358168602,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 704.3889007568359,
"epoch": 0.2365038560411311,
"grad_norm": 0.47012683749198914,
"kl": 0.01031494140625,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0208,
"reward": 0.729166679084301,
"reward_std": 0.32953148148953915,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.479166679084301,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 647.3472366333008,
"epoch": 0.23821765209940018,
"grad_norm": 0.5425974726676941,
"kl": 0.01230621337890625,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0068,
"reward": 0.6319444477558136,
"reward_std": 0.17111803591251373,
"rewards/accuracy_reward": 0.06944444589316845,
"rewards/format_reward": 0.493055559694767,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 559.319465637207,
"epoch": 0.23993144815766923,
"grad_norm": 0.4788627028465271,
"kl": 0.012969970703125,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0135,
"reward": 1.0694444477558136,
"reward_std": 0.3177530914545059,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/format_reward": 0.4861111119389534,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 711.7222290039062,
"epoch": 0.2416452442159383,
"grad_norm": 0.39043956995010376,
"kl": 0.0106658935546875,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0037,
"reward": 0.8472222238779068,
"reward_std": 0.4586464837193489,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.4583333432674408,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 674.2222290039062,
"epoch": 0.24335904027420738,
"grad_norm": 0.1789097785949707,
"kl": 0.01094818115234375,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0154,
"reward": 0.6944444328546524,
"reward_std": 0.06804138422012329,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.5,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 771.5555572509766,
"epoch": 0.24507283633247642,
"grad_norm": 0.4400465488433838,
"kl": 0.0099334716796875,
"learning_rate": 9.084384631108882e-07,
"loss": 0.155,
"reward": 0.7500000223517418,
"reward_std": 0.4042903557419777,
"rewards/accuracy_reward": 0.13888889364898205,
"rewards/format_reward": 0.4722222313284874,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 851.3611145019531,
"epoch": 0.2467866323907455,
"grad_norm": 0.6633160710334778,
"kl": 0.01108551025390625,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0257,
"reward": 0.7708333432674408,
"reward_std": 0.5133540704846382,
"rewards/accuracy_reward": 0.15277778264135122,
"rewards/format_reward": 0.4652777835726738,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 711.4722290039062,
"epoch": 0.24850042844901457,
"grad_norm": 0.46885278820991516,
"kl": 0.0117034912109375,
"learning_rate": 9.046048391230247e-07,
"loss": -0.0231,
"reward": 0.7777777910232544,
"reward_std": 0.2221490517258644,
"rewards/accuracy_reward": 0.1388888955116272,
"rewards/format_reward": 0.5,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 760.0555572509766,
"epoch": 0.25021422450728364,
"grad_norm": 0.49422281980514526,
"kl": 0.010040283203125,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0101,
"reward": 0.8750000298023224,
"reward_std": 0.43057897686958313,
"rewards/accuracy_reward": 0.19444445054978132,
"rewards/format_reward": 0.486111119389534,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 614.7639007568359,
"epoch": 0.2519280205655527,
"grad_norm": 0.4453893005847931,
"kl": 0.0105438232421875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0151,
"reward": 0.7291666641831398,
"reward_std": 0.41478364542126656,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4791666641831398,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 682.7500152587891,
"epoch": 0.2536418166238218,
"grad_norm": 0.4301265776157379,
"kl": 0.0096435546875,
"learning_rate": 8.987250199168808e-07,
"loss": -0.0056,
"reward": 0.7152777761220932,
"reward_std": 0.33566733449697495,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.493055559694767,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 757.3611221313477,
"epoch": 0.25535561268209084,
"grad_norm": 0.4108283221721649,
"kl": 0.0125579833984375,
"learning_rate": 8.967309592491052e-07,
"loss": 0.007,
"reward": 0.5555555522441864,
"reward_std": 0.20964494906365871,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.4722222238779068,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 684.3611297607422,
"epoch": 0.2570694087403599,
"grad_norm": 0.3349432647228241,
"kl": 0.0099334716796875,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0087,
"reward": 1.111111119389534,
"reward_std": 0.3082153648138046,
"rewards/accuracy_reward": 0.3055555550381541,
"rewards/format_reward": 0.5,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 507.1805648803711,
"epoch": 0.258783204798629,
"grad_norm": 0.4955935776233673,
"kl": 0.01495361328125,
"learning_rate": 8.926922383915315e-07,
"loss": -0.0025,
"reward": 0.6597222238779068,
"reward_std": 0.3041820004582405,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.493055559694767,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 778.1389007568359,
"epoch": 0.26049700085689803,
"grad_norm": 0.46859902143478394,
"kl": 0.01029205322265625,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0761,
"reward": 0.8125,
"reward_std": 0.38506873697042465,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.4791666716337204,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 646.8194427490234,
"epoch": 0.2622107969151671,
"grad_norm": 0.5361355543136597,
"kl": 0.0163421630859375,
"learning_rate": 8.88586709003076e-07,
"loss": 0.1016,
"reward": 0.9861111268401146,
"reward_std": 0.4283023551106453,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/format_reward": 0.4861111119389534,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 603.5000076293945,
"epoch": 0.2639245929734362,
"grad_norm": 0.6667534708976746,
"kl": 0.015716552734375,
"learning_rate": 8.865091407243394e-07,
"loss": -0.0517,
"reward": 0.888888880610466,
"reward_std": 0.4442981034517288,
"rewards/accuracy_reward": 0.19444444496184587,
"rewards/format_reward": 0.5,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 730.5138854980469,
"epoch": 0.2656383890317052,
"grad_norm": 0.5823290944099426,
"kl": 0.0100250244140625,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0032,
"reward": 0.9652777761220932,
"reward_std": 0.3304465189576149,
"rewards/accuracy_reward": 0.23611111752688885,
"rewards/format_reward": 0.493055559694767,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 699.7500152587891,
"epoch": 0.26735218508997427,
"grad_norm": 0.3316850960254669,
"kl": 0.00982666015625,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0064,
"reward": 0.798611119389534,
"reward_std": 0.19436374306678772,
"rewards/accuracy_reward": 0.1527777798473835,
"rewards/format_reward": 0.493055559694767,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 678.5833358764648,
"epoch": 0.26906598114824337,
"grad_norm": 0.43551284074783325,
"kl": 0.0104827880859375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0163,
"reward": 1.013888880610466,
"reward_std": 0.34745684266090393,
"rewards/accuracy_reward": 0.2638888927176595,
"rewards/format_reward": 0.486111119389534,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 657.9027862548828,
"epoch": 0.2707797772065124,
"grad_norm": 0.3160940110683441,
"kl": 0.00923919677734375,
"learning_rate": 8.780358823396352e-07,
"loss": -0.0074,
"reward": 0.604166679084301,
"reward_std": 0.17633883468806744,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.493055559694767,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 762.0138854980469,
"epoch": 0.27249357326478146,
"grad_norm": 0.8683849573135376,
"kl": 0.02294921875,
"learning_rate": 8.758773376468604e-07,
"loss": 0.0224,
"reward": 0.597222238779068,
"reward_std": 0.25616976991295815,
"rewards/accuracy_reward": 0.06944444589316845,
"rewards/format_reward": 0.4583333432674408,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 648.625,
"epoch": 0.27420736932305056,
"grad_norm": 0.44514918327331543,
"kl": 0.00907135009765625,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0111,
"reward": 1.0138888955116272,
"reward_std": 0.32083219289779663,
"rewards/accuracy_reward": 0.2638888927176595,
"rewards/format_reward": 0.486111119389534,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 669.7778015136719,
"epoch": 0.2759211653813196,
"grad_norm": 0.5343595743179321,
"kl": 0.0099029541015625,
"learning_rate": 8.715127058347614e-07,
"loss": -0.0242,
"reward": 0.7777777910232544,
"reward_std": 0.4600205048918724,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.5000000074505806,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 619.6527938842773,
"epoch": 0.2776349614395887,
"grad_norm": 0.24237516522407532,
"kl": 0.00861358642578125,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0059,
"reward": 0.777777761220932,
"reward_std": 0.17213259637355804,
"rewards/accuracy_reward": 0.1388888917863369,
"rewards/format_reward": 0.5,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 686.5416717529297,
"epoch": 0.27934875749785776,
"grad_norm": 0.35996633768081665,
"kl": 0.0109405517578125,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0226,
"reward": 0.652777798473835,
"reward_std": 0.2794154789298773,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.486111119389534,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 563.8611221313477,
"epoch": 0.2810625535561268,
"grad_norm": 0.41780418157577515,
"kl": 0.011383056640625,
"learning_rate": 8.648485032310144e-07,
"loss": -0.0036,
"reward": 0.8611111342906952,
"reward_std": 0.3134361729025841,
"rewards/accuracy_reward": 0.18055555690079927,
"rewards/format_reward": 0.5,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 503.6388931274414,
"epoch": 0.2827763496143959,
"grad_norm": 0.34387895464897156,
"kl": 0.011260986328125,
"learning_rate": 8.625962667065487e-07,
"loss": -0.011,
"reward": 0.6111111044883728,
"reward_std": 0.15932847559452057,
"rewards/accuracy_reward": 0.055555556900799274,
"rewards/format_reward": 0.5,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 784.4444580078125,
"epoch": 0.28449014567266495,
"grad_norm": 0.47405433654785156,
"kl": 0.00818634033203125,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0147,
"reward": 0.8055555671453476,
"reward_std": 0.5355970486998558,
"rewards/accuracy_reward": 0.16666666883975267,
"rewards/format_reward": 0.4722222313284874,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 674.5833282470703,
"epoch": 0.286203941730934,
"grad_norm": 0.43140318989753723,
"kl": 0.008544921875,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0226,
"reward": 0.7638888955116272,
"reward_std": 0.4538358449935913,
"rewards/accuracy_reward": 0.13888889364898205,
"rewards/format_reward": 0.4861111119389534,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 789.8472290039062,
"epoch": 0.2879177377892031,
"grad_norm": 0.7944321632385254,
"kl": 0.01107025146484375,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0802,
"reward": 0.680555559694767,
"reward_std": 0.28722215443849564,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.486111119389534,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 594.7222366333008,
"epoch": 0.28963153384747214,
"grad_norm": 0.47057613730430603,
"kl": 0.01092529296875,
"learning_rate": 8.534360744126753e-07,
"loss": -0.0071,
"reward": 0.8263888880610466,
"reward_std": 0.2624051198363304,
"rewards/accuracy_reward": 0.16666666697710752,
"rewards/format_reward": 0.493055559694767,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 575.5000076293945,
"epoch": 0.2913453299057412,
"grad_norm": 0.48645710945129395,
"kl": 0.0149688720703125,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0024,
"reward": 0.965277761220932,
"reward_std": 0.37693794071674347,
"rewards/accuracy_reward": 0.23611111380159855,
"rewards/format_reward": 0.493055559694767,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 738.7916717529297,
"epoch": 0.2930591259640103,
"grad_norm": 0.4424607753753662,
"kl": 0.00809478759765625,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0969,
"reward": 0.7291666865348816,
"reward_std": 0.2601025812327862,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.479166679084301,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 730.7361297607422,
"epoch": 0.29477292202227934,
"grad_norm": 0.4389359652996063,
"kl": 0.00835418701171875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0355,
"reward": 0.7430555447936058,
"reward_std": 0.24438021332025528,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 682.6111068725586,
"epoch": 0.29648671808054844,
"grad_norm": 0.6182007193565369,
"kl": 0.013397216796875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0092,
"reward": 0.7777777761220932,
"reward_std": 0.4936107471585274,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.4722222238779068,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 690.2222290039062,
"epoch": 0.2982005141388175,
"grad_norm": 0.30457955598831177,
"kl": 0.0081939697265625,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0026,
"reward": 0.7708333432674408,
"reward_std": 0.23915940523147583,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.493055559694767,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 696.6527938842773,
"epoch": 0.29991431019708653,
"grad_norm": 0.3551907241344452,
"kl": 0.010650634765625,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0106,
"reward": 0.5833333283662796,
"reward_std": 0.15410767495632172,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.5,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 606.7916717529297,
"epoch": 0.30162810625535563,
"grad_norm": 0.4258103370666504,
"kl": 0.0097503662109375,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0108,
"reward": 0.7152777761220932,
"reward_std": 0.27087756246328354,
"rewards/accuracy_reward": 0.11111111473292112,
"rewards/format_reward": 0.493055559694767,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 702.7916564941406,
"epoch": 0.3033419023136247,
"grad_norm": 0.38482344150543213,
"kl": 0.009063720703125,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0025,
"reward": 0.7986111268401146,
"reward_std": 0.416512792930007,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.493055559694767,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 673.5555572509766,
"epoch": 0.3050556983718937,
"grad_norm": 0.4185694456100464,
"kl": 0.00726318359375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0183,
"reward": 0.7430555522441864,
"reward_std": 0.3815770819783211,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.493055559694767,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 740.5277709960938,
"epoch": 0.3067694944301628,
"grad_norm": 0.4638974070549011,
"kl": 0.00782012939453125,
"learning_rate": 8.295165011252396e-07,
"loss": -0.0087,
"reward": 0.7083333507180214,
"reward_std": 0.4054961260408163,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4583333432674408,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 577.0416641235352,
"epoch": 0.30848329048843187,
"grad_norm": 0.5336411595344543,
"kl": 0.0118560791015625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0215,
"reward": 0.8333333432674408,
"reward_std": 0.358231820166111,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.5,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 692.4027862548828,
"epoch": 0.3101970865467009,
"grad_norm": 0.5019733905792236,
"kl": 0.0130157470703125,
"learning_rate": 8.245653237555705e-07,
"loss": -0.0101,
"reward": 0.75,
"reward_std": 0.33668188750743866,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.5,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 664.3889007568359,
"epoch": 0.31191088260497,
"grad_norm": 0.36695027351379395,
"kl": 0.0102386474609375,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0083,
"reward": 0.7291666716337204,
"reward_std": 0.30504853278398514,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.4791666716337204,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 741.8055648803711,
"epoch": 0.31362467866323906,
"grad_norm": 0.5278235077857971,
"kl": 0.0111541748046875,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0004,
"reward": 0.9166666716337204,
"reward_std": 0.46232304722070694,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/format_reward": 0.5,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 648.9861221313477,
"epoch": 0.31533847472150817,
"grad_norm": 0.48935696482658386,
"kl": 0.0121612548828125,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0721,
"reward": 0.7847222089767456,
"reward_std": 0.3912379518151283,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.4791666641831398,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 652.9166793823242,
"epoch": 0.3170522707797772,
"grad_norm": 0.5268736481666565,
"kl": 0.0102691650390625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0095,
"reward": 1.2916667014360428,
"reward_std": 0.7127073556184769,
"rewards/accuracy_reward": 0.4027777947485447,
"rewards/format_reward": 0.486111119389534,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 650.0972290039062,
"epoch": 0.31876606683804626,
"grad_norm": 0.3435427248477936,
"kl": 0.0112152099609375,
"learning_rate": 8.119553365707802e-07,
"loss": -0.0147,
"reward": 0.7222222238779068,
"reward_std": 0.3314610570669174,
"rewards/accuracy_reward": 0.11111111473292112,
"rewards/format_reward": 0.5,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 668.9722290039062,
"epoch": 0.32047986289631536,
"grad_norm": 0.5294600129127502,
"kl": 0.01114654541015625,
"learning_rate": 8.093945422764069e-07,
"loss": -0.0126,
"reward": 1.0277777910232544,
"reward_std": 0.6216515377163887,
"rewards/accuracy_reward": 0.26388888992369175,
"rewards/format_reward": 0.5,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 735.2639007568359,
"epoch": 0.3221936589545844,
"grad_norm": 0.5934704542160034,
"kl": 0.015716552734375,
"learning_rate": 8.068211054579943e-07,
"loss": -0.0676,
"reward": 0.736111119389534,
"reward_std": 0.3602609410881996,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.486111119389534,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 708.2638854980469,
"epoch": 0.32390745501285345,
"grad_norm": 0.29051750898361206,
"kl": 0.012054443359375,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0144,
"reward": 0.6666666567325592,
"reward_std": 0.2453947737812996,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.5,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 791.9305572509766,
"epoch": 0.32562125107112255,
"grad_norm": 0.6270461082458496,
"kl": 0.00994110107421875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.1261,
"reward": 0.6319444552063942,
"reward_std": 0.306252408772707,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4652777835726738,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 759.0139007568359,
"epoch": 0.3273350471293916,
"grad_norm": 0.4964490532875061,
"kl": 0.0179290771484375,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0119,
"reward": 0.9027777910232544,
"reward_std": 0.5300310179591179,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.486111119389534,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 513.6249923706055,
"epoch": 0.32904884318766064,
"grad_norm": 0.5976383090019226,
"kl": 0.0132598876953125,
"learning_rate": 7.964034505716476e-07,
"loss": -0.0007,
"reward": 1.0555555820465088,
"reward_std": 0.530364416539669,
"rewards/accuracy_reward": 0.2777777798473835,
"rewards/format_reward": 0.5,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 626.6250152587891,
"epoch": 0.33076263924592975,
"grad_norm": 0.6004844903945923,
"kl": 0.0198211669921875,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0022,
"reward": 0.9375000149011612,
"reward_std": 0.32522570341825485,
"rewards/accuracy_reward": 0.22222222667187452,
"rewards/format_reward": 0.493055559694767,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 613.7361068725586,
"epoch": 0.3324764353041988,
"grad_norm": 0.5730006098747253,
"kl": 0.021759033203125,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0095,
"reward": 0.7777777910232544,
"reward_std": 0.29541123658418655,
"rewards/accuracy_reward": 0.13888888992369175,
"rewards/format_reward": 0.5,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 774.7083435058594,
"epoch": 0.3341902313624679,
"grad_norm": 0.42059004306793213,
"kl": 0.01324462890625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0033,
"reward": 0.6666666567325592,
"reward_std": 0.29541125148534775,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.5,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 683.8888854980469,
"epoch": 0.33590402742073694,
"grad_norm": 0.3196467459201813,
"kl": 0.0126495361328125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0156,
"reward": 0.8819444626569748,
"reward_std": 0.25071514397859573,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.493055559694767,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 793.1388854980469,
"epoch": 0.337617823479006,
"grad_norm": 0.6156473755836487,
"kl": 0.01934814453125,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0594,
"reward": 0.8819444552063942,
"reward_std": 0.4707336239516735,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/format_reward": 0.4652777835726738,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 601.7777862548828,
"epoch": 0.3393316195372751,
"grad_norm": 0.5810103416442871,
"kl": 0.0160675048828125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0897,
"reward": 0.8750000149011612,
"reward_std": 0.4387439265847206,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.486111119389534,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 739.5694580078125,
"epoch": 0.34104541559554413,
"grad_norm": 0.4028480350971222,
"kl": 0.0103607177734375,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0106,
"reward": 0.7222222238779068,
"reward_std": 0.30821534991264343,
"rewards/accuracy_reward": 0.11111111473292112,
"rewards/format_reward": 0.5,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 765.2083587646484,
"epoch": 0.3427592116538132,
"grad_norm": 0.5337042808532715,
"kl": 0.0111236572265625,
"learning_rate": 7.75e-07,
"loss": 0.0084,
"reward": 0.7083333432674408,
"reward_std": 0.37828588485717773,
"rewards/accuracy_reward": 0.1111111119389534,
"rewards/format_reward": 0.4861111119389534,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 755.25,
"epoch": 0.3444730077120823,
"grad_norm": 0.5041696429252625,
"kl": 0.00969696044921875,
"learning_rate": 7.72273839962904e-07,
"loss": -0.0162,
"reward": 0.8541666567325592,
"reward_std": 0.4137297794222832,
"rewards/accuracy_reward": 0.19444444682449102,
"rewards/format_reward": 0.4652777761220932,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 645.0277862548828,
"epoch": 0.3461868037703513,
"grad_norm": 0.4631847143173218,
"kl": 0.0138092041015625,
"learning_rate": 7.695368466124296e-07,
"loss": 0.023,
"reward": 0.8541666716337204,
"reward_std": 0.41651278734207153,
"rewards/accuracy_reward": 0.18055556062608957,
"rewards/format_reward": 0.493055559694767,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 741.7361145019531,
"epoch": 0.34790059982862037,
"grad_norm": 0.5321578979492188,
"kl": 0.01479339599609375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.106,
"reward": 1.1458333730697632,
"reward_std": 0.6571117714047432,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.479166679084301,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 689.3194427490234,
"epoch": 0.3496143958868895,
"grad_norm": 0.3890639841556549,
"kl": 0.012969970703125,
"learning_rate": 7.640308940816239e-07,
"loss": -0.0073,
"reward": 0.8541666716337204,
"reward_std": 0.2211344763636589,
"rewards/accuracy_reward": 0.18055556155741215,
"rewards/format_reward": 0.493055559694767,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 654.2638931274414,
"epoch": 0.3513281919451585,
"grad_norm": 0.5081583857536316,
"kl": 0.01397705078125,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0108,
"reward": 0.9166666865348816,
"reward_std": 0.32624027878046036,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.5,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 681.0694580078125,
"epoch": 0.35304198800342756,
"grad_norm": 0.48619431257247925,
"kl": 0.0098114013671875,
"learning_rate": 7.584832158039378e-07,
"loss": -0.005,
"reward": 0.7361111044883728,
"reward_std": 0.32421112060546875,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.486111119389534,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 603.4305572509766,
"epoch": 0.35475578406169667,
"grad_norm": 0.4051726162433624,
"kl": 0.01464080810546875,
"learning_rate": 7.556940671764124e-07,
"loss": -0.0038,
"reward": 1.1944444626569748,
"reward_std": 0.3134361505508423,
"rewards/accuracy_reward": 0.3472222350537777,
"rewards/format_reward": 0.5,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 680.7916793823242,
"epoch": 0.3564695801199657,
"grad_norm": 0.5050489902496338,
"kl": 0.0126953125,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0505,
"reward": 0.7916666567325592,
"reward_std": 0.32421112060546875,
"rewards/accuracy_reward": 0.15277777798473835,
"rewards/format_reward": 0.486111119389534,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 656.4722442626953,
"epoch": 0.3581833761782348,
"grad_norm": 0.29651615023612976,
"kl": 0.00934600830078125,
"learning_rate": 7.500858306332172e-07,
"loss": -0.0032,
"reward": 0.6666666567325592,
"reward_std": 0.25819889456033707,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.5,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 515.0,
"epoch": 0.35989717223650386,
"grad_norm": 0.8540976643562317,
"kl": 0.028778076171875,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0138,
"reward": 0.7708333283662796,
"reward_std": 0.46130845695734024,
"rewards/accuracy_reward": 0.1388888917863369,
"rewards/format_reward": 0.493055559694767,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 965.8055725097656,
"epoch": 0.3616109682947729,
"grad_norm": 0.3300262689590454,
"kl": 0.0074920654296875,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0311,
"reward": 0.6527777835726738,
"reward_std": 0.17010344564914703,
"rewards/accuracy_reward": 0.08333333674818277,
"rewards/format_reward": 0.486111119389534,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 625.5555572509766,
"epoch": 0.363324764353042,
"grad_norm": 0.5068047642707825,
"kl": 0.012664794921875,
"learning_rate": 7.416006812042827e-07,
"loss": -0.0061,
"reward": 0.9375,
"reward_std": 0.4217336028814316,
"rewards/accuracy_reward": 0.22222222480922937,
"rewards/format_reward": 0.493055559694767,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 909.013916015625,
"epoch": 0.36503856041131105,
"grad_norm": 0.3107520341873169,
"kl": 0.00826263427734375,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0477,
"reward": 0.7361111044883728,
"reward_std": 0.34775684773921967,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.4861111119389534,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 708.1805572509766,
"epoch": 0.3667523564695801,
"grad_norm": 0.5627197027206421,
"kl": 0.00879669189453125,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0545,
"reward": 0.958333358168602,
"reward_std": 0.5707200393080711,
"rewards/accuracy_reward": 0.23611111659556627,
"rewards/format_reward": 0.486111119389534,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 702.1388854980469,
"epoch": 0.3684661525278492,
"grad_norm": 0.5161833763122559,
"kl": 0.01049041748046875,
"learning_rate": 7.330314893841101e-07,
"loss": -0.0212,
"reward": 1.1388889104127884,
"reward_std": 0.5355852097272873,
"rewards/accuracy_reward": 0.3194444486871362,
"rewards/format_reward": 0.5,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 695.9722290039062,
"epoch": 0.37017994858611825,
"grad_norm": 0.4373010993003845,
"kl": 0.00847625732421875,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0195,
"reward": 0.7430555447936058,
"reward_std": 0.30720078758895397,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 636.6944580078125,
"epoch": 0.3718937446443873,
"grad_norm": 0.49228960275650024,
"kl": 0.009765625,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0262,
"reward": 0.8958333432674408,
"reward_std": 0.46008094400167465,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.4791666716337204,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 717.819450378418,
"epoch": 0.3736075407026564,
"grad_norm": 0.43165305256843567,
"kl": 0.0099334716796875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0048,
"reward": 0.8124999925494194,
"reward_std": 0.29642581194639206,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.4791666716337204,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 838.875,
"epoch": 0.37532133676092544,
"grad_norm": 0.4618090093135834,
"kl": 0.00933837890625,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0164,
"reward": 0.7083333432674408,
"reward_std": 0.466628834605217,
"rewards/accuracy_reward": 0.11111111380159855,
"rewards/format_reward": 0.486111119389534,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 847.2222290039062,
"epoch": 0.37703513281919454,
"grad_norm": 0.12107283622026443,
"kl": 0.00963592529296875,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0002,
"reward": 0.6944444328546524,
"reward_std": 0.0680413767695427,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.5,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 738.3472290039062,
"epoch": 0.3787489288774636,
"grad_norm": 0.15903827548027039,
"kl": 0.00983428955078125,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0021,
"reward": 0.5277777761220932,
"reward_std": 0.0680413767695427,
"rewards/accuracy_reward": 0.013888888992369175,
"rewards/format_reward": 0.5,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 745.5694427490234,
"epoch": 0.38046272493573263,
"grad_norm": 0.331328809261322,
"kl": 0.01165771484375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.012,
"reward": 0.7083333432674408,
"reward_std": 0.2549325004220009,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.4861111119389534,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 747.9027862548828,
"epoch": 0.38217652099400173,
"grad_norm": 0.43389493227005005,
"kl": 0.0093536376953125,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0764,
"reward": 0.6250000074505806,
"reward_std": 0.2613905519247055,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.4861111119389534,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 655.5694580078125,
"epoch": 0.3838903170522708,
"grad_norm": 0.2923060655593872,
"kl": 0.0098876953125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0202,
"reward": 0.5138888955116272,
"reward_std": 0.10206206515431404,
"rewards/accuracy_reward": 0.013888888992369175,
"rewards/format_reward": 0.486111119389534,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 595.1666793823242,
"epoch": 0.3856041131105398,
"grad_norm": 0.3962916433811188,
"kl": 0.01021575927734375,
"learning_rate": 7.039090644965509e-07,
"loss": -0.0231,
"reward": 0.75,
"reward_std": 0.2901904508471489,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.5,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 801.6944580078125,
"epoch": 0.3873179091688089,
"grad_norm": 0.42240843176841736,
"kl": 0.00876617431640625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0223,
"reward": 0.7708333507180214,
"reward_std": 0.5345706399530172,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.493055559694767,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 698.6527938842773,
"epoch": 0.389031705227078,
"grad_norm": 0.5248401165008545,
"kl": 0.0102081298828125,
"learning_rate": 6.979899910323624e-07,
"loss": -0.0301,
"reward": 1.0763889104127884,
"reward_std": 0.39326707273721695,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/format_reward": 0.493055559694767,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 682.1111297607422,
"epoch": 0.390745501285347,
"grad_norm": 0.6135608553886414,
"kl": 0.01019287109375,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0276,
"reward": 1.0694444626569748,
"reward_std": 0.496343731880188,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.486111119389534,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 743.4027862548828,
"epoch": 0.3924592973436161,
"grad_norm": 0.2736571133136749,
"kl": 0.01031494140625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0257,
"reward": 0.8472222238779068,
"reward_std": 0.18812836706638336,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.486111119389534,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 773.6527862548828,
"epoch": 0.39417309340188517,
"grad_norm": 0.4546229839324951,
"kl": 0.0096588134765625,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0489,
"reward": 1.0277778059244156,
"reward_std": 0.3637526258826256,
"rewards/accuracy_reward": 0.2777777835726738,
"rewards/format_reward": 0.4722222238779068,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 651.8611145019531,
"epoch": 0.39588688946015427,
"grad_norm": 0.40317994356155396,
"kl": 0.01104736328125,
"learning_rate": 6.860664508377001e-07,
"loss": -0.0009,
"reward": 1.0555555522441864,
"reward_std": 0.2721655070781708,
"rewards/accuracy_reward": 0.27777778543531895,
"rewards/format_reward": 0.5,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 591.1805725097656,
"epoch": 0.3976006855184233,
"grad_norm": 0.4714021384716034,
"kl": 0.0143280029296875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0116,
"reward": 1.0,
"reward_std": 0.30821534991264343,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/format_reward": 0.5,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 549.2777786254883,
"epoch": 0.39931448157669236,
"grad_norm": 0.6059587001800537,
"kl": 0.01348876953125,
"learning_rate": 6.800643086250121e-07,
"loss": -0.0337,
"reward": 1.0138889104127884,
"reward_std": 0.5345955863595009,
"rewards/accuracy_reward": 0.26388889364898205,
"rewards/format_reward": 0.486111119389534,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 809.9444580078125,
"epoch": 0.40102827763496146,
"grad_norm": 0.45386579632759094,
"kl": 0.0084228515625,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0403,
"reward": 0.9305555671453476,
"reward_std": 0.4666288197040558,
"rewards/accuracy_reward": 0.22222222574055195,
"rewards/format_reward": 0.4861111119389534,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 696.6666717529297,
"epoch": 0.4027420736932305,
"grad_norm": 0.1798969805240631,
"kl": 0.0090789794921875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0142,
"reward": 0.8611111044883728,
"reward_std": 0.06804138422012329,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.5,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 741.5277862548828,
"epoch": 0.40445586975149955,
"grad_norm": 0.37443897128105164,
"kl": 0.0080413818359375,
"learning_rate": 6.710139192768694e-07,
"loss": -0.0028,
"reward": 0.7986111268401146,
"reward_std": 0.24438021332025528,
"rewards/accuracy_reward": 0.15277778450399637,
"rewards/format_reward": 0.493055559694767,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 569.6111221313477,
"epoch": 0.40616966580976865,
"grad_norm": 0.3133380711078644,
"kl": 0.013458251953125,
"learning_rate": 6.679851303883891e-07,
"loss": 0.007,
"reward": 0.8888888657093048,
"reward_std": 0.13608276844024658,
"rewards/accuracy_reward": 0.19444444961845875,
"rewards/format_reward": 0.5,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 639.3194427490234,
"epoch": 0.4078834618680377,
"grad_norm": 0.3414314091205597,
"kl": 0.0097808837890625,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0132,
"reward": 0.7916666567325592,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.1527777835726738,
"rewards/format_reward": 0.4861111119389534,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 698.0000152587891,
"epoch": 0.40959725792630675,
"grad_norm": 0.47577425837516785,
"kl": 0.0128326416015625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0208,
"reward": 0.9375000149011612,
"reward_std": 0.4217335730791092,
"rewards/accuracy_reward": 0.22222223225980997,
"rewards/format_reward": 0.493055559694767,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 496.9305648803711,
"epoch": 0.41131105398457585,
"grad_norm": 0.5120421051979065,
"kl": 0.017822265625,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0194,
"reward": 0.9027777761220932,
"reward_std": 0.34775684028863907,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.4861111119389534,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 568.9166946411133,
"epoch": 0.4130248500428449,
"grad_norm": 0.4153745472431183,
"kl": 0.00946044921875,
"learning_rate": 6.558139508961654e-07,
"loss": -0.0283,
"reward": 1.1111111044883728,
"reward_std": 0.3314610719680786,
"rewards/accuracy_reward": 0.30555556435137987,
"rewards/format_reward": 0.5,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 679.4027862548828,
"epoch": 0.414738646101114,
"grad_norm": 0.2539161145687103,
"kl": 0.0107269287109375,
"learning_rate": 6.527578915497951e-07,
"loss": -0.0036,
"reward": 0.7222222089767456,
"reward_std": 0.26864049583673477,
"rewards/accuracy_reward": 0.11111111380159855,
"rewards/format_reward": 0.5,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 663.6388778686523,
"epoch": 0.41645244215938304,
"grad_norm": 0.4045405089855194,
"kl": 0.01134490966796875,
"learning_rate": 6.496968239287603e-07,
"loss": -0.0349,
"reward": 0.8263889029622078,
"reward_std": 0.3752421587705612,
"rewards/accuracy_reward": 0.16666667349636555,
"rewards/format_reward": 0.493055559694767,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 684.1250152587891,
"epoch": 0.4181662382176521,
"grad_norm": 0.33096015453338623,
"kl": 0.00829315185546875,
"learning_rate": 6.466308972251785e-07,
"loss": -0.0064,
"reward": 0.5833333283662796,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.5,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 503.8194580078125,
"epoch": 0.4198800342759212,
"grad_norm": 0.6509292721748352,
"kl": 0.0207977294921875,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0065,
"reward": 0.9166666567325592,
"reward_std": 0.4262731820344925,
"rewards/accuracy_reward": 0.2083333320915699,
"rewards/format_reward": 0.5,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 753.3333282470703,
"epoch": 0.42159383033419023,
"grad_norm": 0.23314639925956726,
"kl": 0.0078582763671875,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0163,
"reward": 1.2430555522441864,
"reward_std": 0.28890247642993927,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.493055559694767,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 731.4444580078125,
"epoch": 0.4233076263924593,
"grad_norm": 0.2738264203071594,
"kl": 0.01085662841796875,
"learning_rate": 6.374054580489873e-07,
"loss": 0.004,
"reward": 0.6805555522441864,
"reward_std": 0.10206207446753979,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.486111119389534,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 645.1805801391602,
"epoch": 0.4250214224507284,
"grad_norm": 0.41325727105140686,
"kl": 0.0088348388671875,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0097,
"reward": 0.8888888955116272,
"reward_std": 0.3547067791223526,
"rewards/accuracy_reward": 0.1944444477558136,
"rewards/format_reward": 0.5,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 734.1944427490234,
"epoch": 0.4267352185089974,
"grad_norm": 0.39983558654785156,
"kl": 0.009674072265625,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0139,
"reward": 0.861111119389534,
"reward_std": 0.4262731969356537,
"rewards/accuracy_reward": 0.18055556248873472,
"rewards/format_reward": 0.5,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 676.3472290039062,
"epoch": 0.4284490145672665,
"grad_norm": 0.3825208246707916,
"kl": 0.00958251953125,
"learning_rate": 6.281416799501187e-07,
"loss": -0.0015,
"reward": 0.6944444328546524,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.5,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 725.7222290039062,
"epoch": 0.4301628106255356,
"grad_norm": 0.247074156999588,
"kl": 0.011627197265625,
"learning_rate": 6.25045936022246e-07,
"loss": -0.0156,
"reward": 0.9444444328546524,
"reward_std": 0.2453947737812996,
"rewards/accuracy_reward": 0.2222222276031971,
"rewards/format_reward": 0.5,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 644.7083435058594,
"epoch": 0.4318766066838046,
"grad_norm": 0.4444674849510193,
"kl": 0.009063720703125,
"learning_rate": 6.219465344613258e-07,
"loss": 0.009,
"reward": 0.8333333432674408,
"reward_std": 0.40472324192523956,
"rewards/accuracy_reward": 0.16666666883975267,
"rewards/format_reward": 0.5,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 747.0277862548828,
"epoch": 0.43359040274207367,
"grad_norm": 0.5097996592521667,
"kl": 0.00933837890625,
"learning_rate": 6.188436263278172e-07,
"loss": -0.0255,
"reward": 0.7569444552063942,
"reward_std": 0.5057707708328962,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.479166679084301,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 728.7083358764648,
"epoch": 0.43530419880034277,
"grad_norm": 0.35494205355644226,
"kl": 0.0090484619140625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0083,
"reward": 0.8541666567325592,
"reward_std": 0.34325060993433,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.493055559694767,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 681.2361145019531,
"epoch": 0.4370179948586118,
"grad_norm": 0.3796377182006836,
"kl": 0.0113677978515625,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0005,
"reward": 0.7777777910232544,
"reward_std": 0.29541125893592834,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.5,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 747.6666717529297,
"epoch": 0.4387317909168809,
"grad_norm": 0.400722861289978,
"kl": 0.00862884521484375,
"learning_rate": 6.095153756157051e-07,
"loss": -0.0062,
"reward": 0.6249999925494194,
"reward_std": 0.24970055185258389,
"rewards/accuracy_reward": 0.06944444589316845,
"rewards/format_reward": 0.486111119389534,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 794.6527862548828,
"epoch": 0.44044558697514996,
"grad_norm": 0.4615187644958496,
"kl": 0.00820159912109375,
"learning_rate": 6.06399955103937e-07,
"loss": -0.0081,
"reward": 0.6805555522441864,
"reward_std": 0.311707004904747,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.4861111119389534,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 754.5277709960938,
"epoch": 0.442159383033419,
"grad_norm": 0.3552338480949402,
"kl": 0.0101165771484375,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0125,
"reward": 0.6875000074505806,
"reward_std": 0.2571843173354864,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.493055559694767,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 634.4166717529297,
"epoch": 0.4438731790916881,
"grad_norm": 0.6070718765258789,
"kl": 0.00984954833984375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.006,
"reward": 1.0555555522441864,
"reward_std": 0.5303644090890884,
"rewards/accuracy_reward": 0.27777778543531895,
"rewards/format_reward": 0.5,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 624.7361145019531,
"epoch": 0.44558697514995715,
"grad_norm": 0.5230724215507507,
"kl": 0.00922393798828125,
"learning_rate": 5.97037808470444e-07,
"loss": -0.0235,
"reward": 0.7777777761220932,
"reward_std": 0.3582318127155304,
"rewards/accuracy_reward": 0.13888889085501432,
"rewards/format_reward": 0.5,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 691.1666870117188,
"epoch": 0.4473007712082262,
"grad_norm": 0.39952459931373596,
"kl": 0.00872802734375,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0159,
"reward": 0.8333333283662796,
"reward_std": 0.2221490666270256,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.5,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 800.1250152587891,
"epoch": 0.4490145672664953,
"grad_norm": 0.41582420468330383,
"kl": 0.008209228515625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0729,
"reward": 1.2500000149011612,
"reward_std": 0.5078206732869148,
"rewards/accuracy_reward": 0.3888888992369175,
"rewards/format_reward": 0.4722222238779068,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 708.9583282470703,
"epoch": 0.45072836332476435,
"grad_norm": 0.5363173484802246,
"kl": 0.00957489013671875,
"learning_rate": 5.87655029499542e-07,
"loss": -0.022,
"reward": 0.6527777761220932,
"reward_std": 0.25616974383592606,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.486111119389534,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 608.2083511352539,
"epoch": 0.4524421593830334,
"grad_norm": 0.4334163963794708,
"kl": 0.00849151611328125,
"learning_rate": 5.845235626570683e-07,
"loss": 0.011,
"reward": 0.8333333432674408,
"reward_std": 0.3082153648138046,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.5,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 678.3750152587891,
"epoch": 0.4541559554413025,
"grad_norm": 0.38257157802581787,
"kl": 0.0098876953125,
"learning_rate": 5.813904131848564e-07,
"loss": -0.0024,
"reward": 1.1944444328546524,
"reward_std": 0.38669832795858383,
"rewards/accuracy_reward": 0.34722223225980997,
"rewards/format_reward": 0.5,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 795.8889007568359,
"epoch": 0.45586975149957154,
"grad_norm": 0.36699798703193665,
"kl": 0.0082855224609375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0402,
"reward": 0.8333333283662796,
"reward_std": 0.30821535736322403,
"rewards/accuracy_reward": 0.16666666697710752,
"rewards/format_reward": 0.5,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 679.8611221313477,
"epoch": 0.45758354755784064,
"grad_norm": 7.172515869140625,
"kl": 0.10137939453125,
"learning_rate": 5.751196772469237e-07,
"loss": -0.0182,
"reward": 0.625,
"reward_std": 0.16182994842529297,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.4861111119389534,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 669.3194580078125,
"epoch": 0.4592973436161097,
"grad_norm": 0.4328586459159851,
"kl": 0.01244354248046875,
"learning_rate": 5.71982396408026e-07,
"loss": 0.0242,
"reward": 0.7430555671453476,
"reward_std": 0.25718431919813156,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 706.1805725097656,
"epoch": 0.46101113967437873,
"grad_norm": 0.15796984732151031,
"kl": 0.00635528564453125,
"learning_rate": 5.688440441781398e-07,
"loss": -0.0015,
"reward": 0.7777777761220932,
"reward_std": 0.08606629818677902,
"rewards/accuracy_reward": 0.1388888917863369,
"rewards/format_reward": 0.5,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 763.4861145019531,
"epoch": 0.46272493573264784,
"grad_norm": 0.42503196001052856,
"kl": 0.010040283203125,
"learning_rate": 5.657047735161255e-07,
"loss": -0.0154,
"reward": 1.0138888955116272,
"reward_std": 0.3125211223959923,
"rewards/accuracy_reward": 0.26388889644294977,
"rewards/format_reward": 0.4861111119389534,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 671.2639007568359,
"epoch": 0.4644387317909169,
"grad_norm": 0.442844957113266,
"kl": 0.0098724365234375,
"learning_rate": 5.625647374256061e-07,
"loss": -0.0138,
"reward": 0.8541666567325592,
"reward_std": 0.3072007820010185,
"rewards/accuracy_reward": 0.18055556155741215,
"rewards/format_reward": 0.493055559694767,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 584.1666793823242,
"epoch": 0.4661525278491859,
"grad_norm": 0.6295328140258789,
"kl": 0.0174713134765625,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0023,
"reward": 0.8750000149011612,
"reward_std": 0.46228964626789093,
"rewards/accuracy_reward": 0.19444444961845875,
"rewards/format_reward": 0.4861111119389534,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 709.3611145019531,
"epoch": 0.46786632390745503,
"grad_norm": 0.34427711367607117,
"kl": 0.0111846923828125,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0205,
"reward": 0.8541666865348816,
"reward_std": 0.25718431919813156,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.493055559694767,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 725.625,
"epoch": 0.4695801199657241,
"grad_norm": 0.3454584777355194,
"kl": 0.00966644287109375,
"learning_rate": 5.531415671340826e-07,
"loss": -0.0354,
"reward": 0.5972222238779068,
"reward_std": 0.28170324862003326,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.4861111119389534,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 674.6111145019531,
"epoch": 0.4712939160239931,
"grad_norm": 0.5319638252258301,
"kl": 0.0121307373046875,
"learning_rate": 5.5e-07,
"loss": -0.0055,
"reward": 0.8055555522441864,
"reward_std": 0.3995024487376213,
"rewards/accuracy_reward": 0.1527777835726738,
"rewards/format_reward": 0.5,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 655.1527862548828,
"epoch": 0.4730077120822622,
"grad_norm": 0.4211221933364868,
"kl": 0.0104217529296875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0072,
"reward": 0.8055555671453476,
"reward_std": 0.3995024487376213,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.5,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 731.9305725097656,
"epoch": 0.47472150814053127,
"grad_norm": 0.4488551914691925,
"kl": 0.011688232421875,
"learning_rate": 5.437170188473847e-07,
"loss": -0.0014,
"reward": 1.1041666716337204,
"reward_std": 0.34847141802310944,
"rewards/accuracy_reward": 0.3055555634200573,
"rewards/format_reward": 0.493055559694767,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 681.75,
"epoch": 0.47643530419880037,
"grad_norm": 0.3861945867538452,
"kl": 0.011749267578125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.02,
"reward": 0.8749999925494194,
"reward_std": 0.34098767302930355,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.486111119389534,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 790.6389007568359,
"epoch": 0.4781491002570694,
"grad_norm": 0.6166090369224548,
"kl": 0.0152740478515625,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0135,
"reward": 0.7152777910232544,
"reward_std": 0.3984878733754158,
"rewards/accuracy_reward": 0.11111111473292112,
"rewards/format_reward": 0.493055559694767,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 613.5277862548828,
"epoch": 0.47986289631533846,
"grad_norm": 0.3968028426170349,
"kl": 0.0106964111328125,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0027,
"reward": 0.8611111044883728,
"reward_std": 0.24017397314310074,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.5,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 728.4861145019531,
"epoch": 0.48157669237360756,
"grad_norm": 0.4651610851287842,
"kl": 0.00959014892578125,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0424,
"reward": 1.1250000149011612,
"reward_std": 0.40491778403520584,
"rewards/accuracy_reward": 0.31944445613771677,
"rewards/format_reward": 0.486111119389534,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 719.4027862548828,
"epoch": 0.4832904884318766,
"grad_norm": 0.4324786365032196,
"kl": 0.0084686279296875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0187,
"reward": 0.7986111268401146,
"reward_std": 0.24438020400702953,
"rewards/accuracy_reward": 0.1527777798473835,
"rewards/format_reward": 0.493055559694767,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 743.3889007568359,
"epoch": 0.48500428449014565,
"grad_norm": 0.317564994096756,
"kl": 0.009735107421875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0143,
"reward": 0.729166679084301,
"reward_std": 0.25436214357614517,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.479166679084301,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 799.6666870117188,
"epoch": 0.48671808054841476,
"grad_norm": 0.3215982913970947,
"kl": 0.0088348388671875,
"learning_rate": 5.21744266211809e-07,
"loss": -0.0185,
"reward": 0.9652777910232544,
"reward_std": 0.3315606266260147,
"rewards/accuracy_reward": 0.23611111845821142,
"rewards/format_reward": 0.493055559694767,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 815.6944580078125,
"epoch": 0.4884318766066838,
"grad_norm": 0.28981852531433105,
"kl": 0.00812530517578125,
"learning_rate": 5.186095868151436e-07,
"loss": -0.002,
"reward": 0.805555559694767,
"reward_std": 0.03983211889863014,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4722222238779068,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 738.4305725097656,
"epoch": 0.49014567266495285,
"grad_norm": 0.437377393245697,
"kl": 0.00920867919921875,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0761,
"reward": 0.7638889029622078,
"reward_std": 0.2561697345227003,
"rewards/accuracy_reward": 0.13888889085501432,
"rewards/format_reward": 0.486111119389534,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 681.2222290039062,
"epoch": 0.49185946872322195,
"grad_norm": 0.44190195202827454,
"kl": 0.0103912353515625,
"learning_rate": 5.123449705004581e-07,
"loss": -0.0106,
"reward": 0.972222238779068,
"reward_std": 0.46232303231954575,
"rewards/accuracy_reward": 0.236111119389534,
"rewards/format_reward": 0.5,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 695.5277709960938,
"epoch": 0.493573264781491,
"grad_norm": 0.496455579996109,
"kl": 0.01013946533203125,
"learning_rate": 5.09215338910999e-07,
"loss": -0.0123,
"reward": 0.6250000149011612,
"reward_std": 0.2721321564167738,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4583333432674408,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 728.0972290039062,
"epoch": 0.4952870608397601,
"grad_norm": 0.6901673078536987,
"kl": 0.01247406005859375,
"learning_rate": 5.060876951083828e-07,
"loss": -0.0307,
"reward": 0.8958333283662796,
"reward_std": 0.4455699250102043,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.4791666641831398,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 736.1666717529297,
"epoch": 0.49700085689802914,
"grad_norm": 0.4506691098213196,
"kl": 0.00757598876953125,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0253,
"reward": 0.8958333283662796,
"reward_std": 0.48350031673908234,
"rewards/accuracy_reward": 0.20833333488553762,
"rewards/format_reward": 0.479166679084301,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 784.4305572509766,
"epoch": 0.4987146529562982,
"grad_norm": 0.39211300015449524,
"kl": 0.01044464111328125,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0139,
"reward": 0.9305555671453476,
"reward_std": 0.38056251406669617,
"rewards/accuracy_reward": 0.22222223225980997,
"rewards/format_reward": 0.4861111119389534,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 630.5416793823242,
"epoch": 0.5004284490145673,
"grad_norm": 0.6736369729042053,
"kl": 0.011474609375,
"learning_rate": 4.967182142620745e-07,
"loss": -0.0179,
"reward": 1.0555555820465088,
"reward_std": 0.553610123693943,
"rewards/accuracy_reward": 0.2777777835726738,
"rewards/format_reward": 0.5,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 599.7500076293945,
"epoch": 0.5021422450728363,
"grad_norm": 0.35509321093559265,
"kl": 0.0101470947265625,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0294,
"reward": 0.8333333432674408,
"reward_std": 0.331461064517498,
"rewards/accuracy_reward": 0.16666666883975267,
"rewards/format_reward": 0.5,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 651.8194580078125,
"epoch": 0.5038560411311054,
"grad_norm": 0.35510072112083435,
"kl": 0.0114593505859375,
"learning_rate": 4.904846243842949e-07,
"loss": -0.0076,
"reward": 0.9166666716337204,
"reward_std": 0.2634196802973747,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.5,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 702.9583435058594,
"epoch": 0.5055698371893744,
"grad_norm": 0.37312352657318115,
"kl": 0.01067352294921875,
"learning_rate": 4.873721045679706e-07,
"loss": -0.0084,
"reward": 0.7916666865348816,
"reward_std": 0.46386218070983887,
"rewards/accuracy_reward": 0.15277778171002865,
"rewards/format_reward": 0.486111119389534,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 737.9583511352539,
"epoch": 0.5072836332476436,
"grad_norm": 0.5573227405548096,
"kl": 0.0097503662109375,
"learning_rate": 4.842626371469149e-07,
"loss": -0.0192,
"reward": 1.034722238779068,
"reward_std": 0.7432259321212769,
"rewards/accuracy_reward": 0.27777778171002865,
"rewards/format_reward": 0.4791666716337204,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 685.2639007568359,
"epoch": 0.5089974293059126,
"grad_norm": 0.5244731307029724,
"kl": 0.00927734375,
"learning_rate": 4.811563736721829e-07,
"loss": -0.0399,
"reward": 0.9305555671453476,
"reward_std": 0.5026786401867867,
"rewards/accuracy_reward": 0.2222222276031971,
"rewards/format_reward": 0.486111119389534,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 633.9722290039062,
"epoch": 0.5107112253641817,
"grad_norm": 0.5318572521209717,
"kl": 0.010101318359375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0125,
"reward": 0.8402777761220932,
"reward_std": 0.2809867858886719,
"rewards/accuracy_reward": 0.18055555876344442,
"rewards/format_reward": 0.4791666716337204,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 738.1944580078125,
"epoch": 0.5124250214224507,
"grad_norm": 0.4404042065143585,
"kl": 0.0084686279296875,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0031,
"reward": 1.1250000149011612,
"reward_std": 0.4589441120624542,
"rewards/accuracy_reward": 0.31944445334374905,
"rewards/format_reward": 0.486111119389534,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 715.8611145019531,
"epoch": 0.5141388174807198,
"grad_norm": 0.36858052015304565,
"kl": 0.010406494140625,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.0088,
"reward": 0.7499999850988388,
"reward_std": 0.33668187260627747,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.5,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 658.6111297607422,
"epoch": 0.5158526135389888,
"grad_norm": 0.44451257586479187,
"kl": 0.00970458984375,
"learning_rate": 4.68766384637248e-07,
"loss": -0.0124,
"reward": 0.8055555522441864,
"reward_std": 0.2901904284954071,
"rewards/accuracy_reward": 0.1527777798473835,
"rewards/format_reward": 0.5,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 743.5555725097656,
"epoch": 0.517566409597258,
"grad_norm": 0.394045889377594,
"kl": 0.00940704345703125,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0071,
"reward": 0.861111119389534,
"reward_std": 0.3762567415833473,
"rewards/accuracy_reward": 0.18055555783212185,
"rewards/format_reward": 0.5,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 686.5833435058594,
"epoch": 0.519280205655527,
"grad_norm": 0.41484999656677246,
"kl": 0.01177978515625,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0097,
"reward": 0.6388888955116272,
"reward_std": 0.22736987471580505,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.5,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 663.4027862548828,
"epoch": 0.5209940017137961,
"grad_norm": 0.6104622483253479,
"kl": 0.011749267578125,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0155,
"reward": 0.826388880610466,
"reward_std": 0.4845541790127754,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.493055559694767,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 673.0277862548828,
"epoch": 0.5227077977720651,
"grad_norm": 0.412063330411911,
"kl": 0.010986328125,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0303,
"reward": 0.9513889029622078,
"reward_std": 0.3880129065364599,
"rewards/accuracy_reward": 0.2361111119389534,
"rewards/format_reward": 0.4791666716337204,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 701.0416717529297,
"epoch": 0.5244215938303342,
"grad_norm": 0.5593081116676331,
"kl": 0.0098724365234375,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0378,
"reward": 0.7986111044883728,
"reward_std": 0.39326707273721695,
"rewards/accuracy_reward": 0.1527777835726738,
"rewards/format_reward": 0.493055559694767,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 751.9722290039062,
"epoch": 0.5261353898886033,
"grad_norm": 0.48807039856910706,
"kl": 0.0091400146484375,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0218,
"reward": 1.1875000149011612,
"reward_std": 0.6258577555418015,
"rewards/accuracy_reward": 0.34722222946584225,
"rewards/format_reward": 0.493055559694767,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 835.2916717529297,
"epoch": 0.5278491859468724,
"grad_norm": 0.4123370349407196,
"kl": 0.010833740234375,
"learning_rate": 4.4724210845020494e-07,
"loss": -0.0059,
"reward": 0.7777777910232544,
"reward_std": 0.205099418759346,
"rewards/accuracy_reward": 0.15277778077870607,
"rewards/format_reward": 0.4722222313284874,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 705.0555572509766,
"epoch": 0.5295629820051414,
"grad_norm": 0.466584712266922,
"kl": 0.00885009765625,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0187,
"reward": 0.8750000074505806,
"reward_std": 0.3422360420227051,
"rewards/accuracy_reward": 0.19444444496184587,
"rewards/format_reward": 0.486111119389534,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 760.2639007568359,
"epoch": 0.5312767780634104,
"grad_norm": 0.33340954780578613,
"kl": 0.00978851318359375,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0057,
"reward": 0.7708333358168602,
"reward_std": 0.17633881978690624,
"rewards/accuracy_reward": 0.1388888917863369,
"rewards/format_reward": 0.493055559694767,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 770.3194427490234,
"epoch": 0.5329905741216795,
"grad_norm": 0.34279051423072815,
"kl": 0.0094146728515625,
"learning_rate": 4.3808955077581546e-07,
"loss": -0.0054,
"reward": 0.6875000074505806,
"reward_std": 0.25718431919813156,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.493055559694767,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 807.4861297607422,
"epoch": 0.5347043701799485,
"grad_norm": 0.42101454734802246,
"kl": 0.007659912109375,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0135,
"reward": 0.9930555671453476,
"reward_std": 0.49956031143665314,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.493055559694767,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 615.5277709960938,
"epoch": 0.5364181662382177,
"grad_norm": 0.5372198820114136,
"kl": 0.0162506103515625,
"learning_rate": 4.3201486961161093e-07,
"loss": -0.0168,
"reward": 1.1875000149011612,
"reward_std": 0.4131338596343994,
"rewards/accuracy_reward": 0.3472222276031971,
"rewards/format_reward": 0.493055559694767,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 725.8750152587891,
"epoch": 0.5381319622964867,
"grad_norm": 0.4827311038970947,
"kl": 0.00830078125,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0221,
"reward": 1.0000000149011612,
"reward_std": 0.5675767734646797,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.5,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 631.4444427490234,
"epoch": 0.5398457583547558,
"grad_norm": 0.32586589455604553,
"kl": 0.0107269287109375,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.011,
"reward": 0.861111119389534,
"reward_std": 0.22736985981464386,
"rewards/accuracy_reward": 0.1805555634200573,
"rewards/format_reward": 0.5,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 760.6944427490234,
"epoch": 0.5415595544130248,
"grad_norm": 0.36847296357154846,
"kl": 0.0100860595703125,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0058,
"reward": 0.8819444477558136,
"reward_std": 0.3717171251773834,
"rewards/accuracy_reward": 0.1944444477558136,
"rewards/format_reward": 0.493055559694767,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 860.0138854980469,
"epoch": 0.5432733504712939,
"grad_norm": 0.33621668815612793,
"kl": 0.0082855224609375,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0228,
"reward": 0.8958333656191826,
"reward_std": 0.3794733416289091,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.4791666716337204,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 591.1944580078125,
"epoch": 0.5449871465295629,
"grad_norm": 0.5173898935317993,
"kl": 0.01264190673828125,
"learning_rate": 4.1693137748017915e-07,
"loss": -0.0109,
"reward": 1.083333358168602,
"reward_std": 0.4227481558918953,
"rewards/accuracy_reward": 0.2916666753590107,
"rewards/format_reward": 0.5,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 781.0694427490234,
"epoch": 0.5467009425878321,
"grad_norm": 0.4726315438747406,
"kl": 0.01230621337890625,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0288,
"reward": 0.6666666939854622,
"reward_std": 0.27821177802979946,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.472222238779068,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 706.5139007568359,
"epoch": 0.5484147386461011,
"grad_norm": 0.4803582727909088,
"kl": 0.0104217529296875,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0114,
"reward": 1.1666666865348816,
"reward_std": 0.5012311488389969,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 652.2222137451172,
"epoch": 0.5501285347043702,
"grad_norm": 0.4137377440929413,
"kl": 0.00899505615234375,
"learning_rate": 4.079579333738039e-07,
"loss": -0.0062,
"reward": 0.680555559694767,
"reward_std": 0.27419466339051723,
"rewards/accuracy_reward": 0.09722222294658422,
"rewards/format_reward": 0.486111119389534,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 661.7500152587891,
"epoch": 0.5518423307626392,
"grad_norm": 0.7754374146461487,
"kl": 0.01078033447265625,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0696,
"reward": 0.7083333283662796,
"reward_std": 0.4283023327589035,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.486111119389534,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 747.7777709960938,
"epoch": 0.5535561268209083,
"grad_norm": 0.23274828493595123,
"kl": 0.01306915283203125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0218,
"reward": 0.8541666641831398,
"reward_std": 0.08505172841250896,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.493055559694767,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 794.263916015625,
"epoch": 0.5552699228791774,
"grad_norm": 0.5167786478996277,
"kl": 0.0105438232421875,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0041,
"reward": 1.0347222089767456,
"reward_std": 0.5008072182536125,
"rewards/accuracy_reward": 0.2777777835726738,
"rewards/format_reward": 0.4791666641831398,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 756.1250305175781,
"epoch": 0.5569837189374465,
"grad_norm": 0.42761853337287903,
"kl": 0.01197052001953125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0332,
"reward": 0.7152777761220932,
"reward_std": 0.4112919941544533,
"rewards/accuracy_reward": 0.11111111380159855,
"rewards/format_reward": 0.493055559694767,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 678.4583358764648,
"epoch": 0.5586975149957155,
"grad_norm": 0.6914120316505432,
"kl": 0.01470947265625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0291,
"reward": 1.0763888955116272,
"reward_std": 0.43159355968236923,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/format_reward": 0.493055559694767,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 575.4722290039062,
"epoch": 0.5604113110539846,
"grad_norm": 0.608383059501648,
"kl": 0.0129547119140625,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0089,
"reward": 1.0486111044883728,
"reward_std": 0.46130846440792084,
"rewards/accuracy_reward": 0.2777777807787061,
"rewards/format_reward": 0.493055559694767,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 621.3055725097656,
"epoch": 0.5621251071122536,
"grad_norm": 0.9387192130088806,
"kl": 0.0234832763671875,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0096,
"reward": 0.8263888955116272,
"reward_std": 0.32522570341825485,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.493055559694767,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 718.8333587646484,
"epoch": 0.5638389031705227,
"grad_norm": 0.36304396390914917,
"kl": 0.00838470458984375,
"learning_rate": 3.843439512918949e-07,
"loss": -0.0051,
"reward": 0.75,
"reward_std": 0.33668185770511627,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.5,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 838.6250152587891,
"epoch": 0.5655526992287918,
"grad_norm": 0.4335583746433258,
"kl": 0.0093231201171875,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0504,
"reward": 0.8958333358168602,
"reward_std": 0.2658967934548855,
"rewards/accuracy_reward": 0.2083333320915699,
"rewards/format_reward": 0.4791666641831398,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 664.0694427490234,
"epoch": 0.5672664952870609,
"grad_norm": 0.44489485025405884,
"kl": 0.0103607177734375,
"learning_rate": 3.785183306423767e-07,
"loss": -0.0029,
"reward": 1.1944444328546524,
"reward_std": 0.3762567266821861,
"rewards/accuracy_reward": 0.3472222276031971,
"rewards/format_reward": 0.5,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 668.3055725097656,
"epoch": 0.5689802913453299,
"grad_norm": 0.22135576605796814,
"kl": 0.01212310791015625,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0198,
"reward": 0.8541666641831398,
"reward_std": 0.19436372630298138,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.493055559694767,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 756.5416717529297,
"epoch": 0.570694087403599,
"grad_norm": 0.3432075083255768,
"kl": 0.0111541748046875,
"learning_rate": 3.72726140684072e-07,
"loss": -0.0046,
"reward": 0.6597222164273262,
"reward_std": 0.017010344192385674,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.493055559694767,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 798.2083435058594,
"epoch": 0.572407883461868,
"grad_norm": 0.3384978473186493,
"kl": 0.00952911376953125,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0133,
"reward": 0.8750000298023224,
"reward_std": 0.4249234274029732,
"rewards/accuracy_reward": 0.19444444589316845,
"rewards/format_reward": 0.486111119389534,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 641.7222290039062,
"epoch": 0.5741216795201372,
"grad_norm": 0.41000911593437195,
"kl": 0.0142669677734375,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0868,
"reward": 0.9583333283662796,
"reward_std": 0.3291585296392441,
"rewards/accuracy_reward": 0.23611110914498568,
"rewards/format_reward": 0.486111119389534,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 671.9583358764648,
"epoch": 0.5758354755784062,
"grad_norm": 0.39112672209739685,
"kl": 0.009521484375,
"learning_rate": 3.641030065789562e-07,
"loss": -0.0072,
"reward": 0.7152777835726738,
"reward_std": 0.26240511797368526,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.493055559694767,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 706.9722366333008,
"epoch": 0.5775492716366752,
"grad_norm": 0.48448535799980164,
"kl": 0.00994873046875,
"learning_rate": 3.612465628992203e-07,
"loss": -0.0094,
"reward": 0.9652777910232544,
"reward_std": 0.332209013402462,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.4652777835726738,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 729.3333282470703,
"epoch": 0.5792630676949443,
"grad_norm": 0.5280615091323853,
"kl": 0.012420654296875,
"learning_rate": 3.5839931879571725e-07,
"loss": -0.0209,
"reward": 0.965277798473835,
"reward_std": 0.5258247926831245,
"rewards/accuracy_reward": 0.23611112125217915,
"rewards/format_reward": 0.493055559694767,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 779.4166717529297,
"epoch": 0.5809768637532133,
"grad_norm": 0.32812824845314026,
"kl": 0.0185089111328125,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0169,
"reward": 0.6666666716337204,
"reward_std": 0.25819889456033707,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.5,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 616.9444351196289,
"epoch": 0.5826906598114824,
"grad_norm": 0.5442182421684265,
"kl": 0.014892578125,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.0563,
"reward": 0.8055555522441864,
"reward_std": 0.44179464131593704,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.4722222238779068,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 625.2639007568359,
"epoch": 0.5844044558697515,
"grad_norm": 0.08322001248598099,
"kl": 0.01381683349609375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0005,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 662.7639007568359,
"epoch": 0.5861182519280206,
"grad_norm": 0.42785269021987915,
"kl": 0.00940704345703125,
"learning_rate": 3.471051066897562e-07,
"loss": -0.0295,
"reward": 0.861111119389534,
"reward_std": 0.47276464104652405,
"rewards/accuracy_reward": 0.18055556062608957,
"rewards/format_reward": 0.5,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 588.5138854980469,
"epoch": 0.5878320479862896,
"grad_norm": 0.28376224637031555,
"kl": 0.0150909423828125,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0035,
"reward": 1.2430555373430252,
"reward_std": 0.24696609377861023,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 748.7083282470703,
"epoch": 0.5895458440445587,
"grad_norm": 0.47153374552726746,
"kl": 0.0116729736328125,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0016,
"reward": 0.6527777686715126,
"reward_std": 0.33054604940116405,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.486111119389534,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 730.5972290039062,
"epoch": 0.5912596401028277,
"grad_norm": 0.45074042677879333,
"kl": 0.0081787109375,
"learning_rate": 3.387377967463493e-07,
"loss": -0.0155,
"reward": 1.3472222238779068,
"reward_std": 0.6584814712405205,
"rewards/accuracy_reward": 0.4305555634200573,
"rewards/format_reward": 0.4861111119389534,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 765.8611297607422,
"epoch": 0.5929734361610969,
"grad_norm": 0.38273346424102783,
"kl": 0.00989532470703125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0365,
"reward": 0.9166666716337204,
"reward_std": 0.3995024487376213,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.5,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 656.5416717529297,
"epoch": 0.5946872322193659,
"grad_norm": 0.42742764949798584,
"kl": 0.01055908203125,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0121,
"reward": 0.9930555671453476,
"reward_std": 0.34847141802310944,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.493055559694767,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 760.1111297607422,
"epoch": 0.596401028277635,
"grad_norm": 0.2555226981639862,
"kl": 0.0147552490234375,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0076,
"reward": 0.6111111044883728,
"reward_std": 0.15932847559452057,
"rewards/accuracy_reward": 0.055555556900799274,
"rewards/format_reward": 0.5,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 742.7777862548828,
"epoch": 0.598114824335904,
"grad_norm": 0.3386673033237457,
"kl": 0.01026153564453125,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.01,
"reward": 0.9930555820465088,
"reward_std": 0.32522569596767426,
"rewards/accuracy_reward": 0.2500000027939677,
"rewards/format_reward": 0.493055559694767,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 694.3333435058594,
"epoch": 0.5998286203941731,
"grad_norm": 0.4300551116466522,
"kl": 0.01024627685546875,
"learning_rate": 3.250000000000001e-07,
"loss": -0.0217,
"reward": 1.048611119389534,
"reward_std": 0.4845541790127754,
"rewards/accuracy_reward": 0.27777778171002865,
"rewards/format_reward": 0.493055559694767,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 710.8611145019531,
"epoch": 0.6015424164524421,
"grad_norm": 0.4361790716648102,
"kl": 0.01013946533203125,
"learning_rate": 3.222848061454764e-07,
"loss": -0.0161,
"reward": 0.7430555671453476,
"reward_std": 0.3072007894515991,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.493055559694767,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 867.2222137451172,
"epoch": 0.6032562125107113,
"grad_norm": 0.41655802726745605,
"kl": 0.0092926025390625,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0058,
"reward": 0.7777777910232544,
"reward_std": 0.4547397345304489,
"rewards/accuracy_reward": 0.13888888992369175,
"rewards/format_reward": 0.5,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 808.7639007568359,
"epoch": 0.6049700085689803,
"grad_norm": 0.32335948944091797,
"kl": 0.009429931640625,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0072,
"reward": 1.0277777761220932,
"reward_std": 0.33668188750743866,
"rewards/accuracy_reward": 0.26388889364898205,
"rewards/format_reward": 0.5,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 568.3610992431641,
"epoch": 0.6066838046272494,
"grad_norm": 0.747093915939331,
"kl": 0.011749267578125,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0159,
"reward": 1.4097222536802292,
"reward_std": 0.6619075667113066,
"rewards/accuracy_reward": 0.4583333395421505,
"rewards/format_reward": 0.493055559694767,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 693.5972290039062,
"epoch": 0.6083976006855184,
"grad_norm": 0.44648948311805725,
"kl": 0.01004791259765625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0044,
"reward": 0.9027778059244156,
"reward_std": 0.24970055185258389,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.486111119389534,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 603.2638931274414,
"epoch": 0.6101113967437874,
"grad_norm": 0.484390527009964,
"kl": 0.012542724609375,
"learning_rate": 3.0887794225945143e-07,
"loss": -0.0051,
"reward": 0.6249999925494194,
"reward_std": 0.3125211279839277,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.486111119389534,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 628.0138931274414,
"epoch": 0.6118251928020566,
"grad_norm": 0.5177603363990784,
"kl": 0.0098114013671875,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0262,
"reward": 1.0833333283662796,
"reward_std": 0.4495188891887665,
"rewards/accuracy_reward": 0.2916666669771075,
"rewards/format_reward": 0.5,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 677.9444580078125,
"epoch": 0.6135389888603257,
"grad_norm": 0.5876026153564453,
"kl": 0.0117034912109375,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0255,
"reward": 0.722222238779068,
"reward_std": 0.45503970980644226,
"rewards/accuracy_reward": 0.12500000279396772,
"rewards/format_reward": 0.4722222313284874,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 610.0972290039062,
"epoch": 0.6152527849185947,
"grad_norm": 0.44811731576919556,
"kl": 0.0104827880859375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0011,
"reward": 0.9027777761220932,
"reward_std": 0.2613905444741249,
"rewards/accuracy_reward": 0.20833333302289248,
"rewards/format_reward": 0.4861111119389534,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 682.4583435058594,
"epoch": 0.6169665809768637,
"grad_norm": 0.5113480687141418,
"kl": 0.0124053955078125,
"learning_rate": 2.9836319343816397e-07,
"loss": -0.0284,
"reward": 0.826388880610466,
"reward_std": 0.44850434362888336,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.493055559694767,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 613.2639007568359,
"epoch": 0.6186803770351328,
"grad_norm": 0.6223089098930359,
"kl": 0.0121612548828125,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0238,
"reward": 1.0277777910232544,
"reward_std": 0.6448972225189209,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.5,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 640.7083282470703,
"epoch": 0.6203941730934018,
"grad_norm": 0.5173635482788086,
"kl": 0.00988006591796875,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0152,
"reward": 0.7708333283662796,
"reward_std": 0.4405169114470482,
"rewards/accuracy_reward": 0.13888889364898205,
"rewards/format_reward": 0.493055559694767,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 723.5555725097656,
"epoch": 0.622107969151671,
"grad_norm": 0.49564608931541443,
"kl": 0.0084381103515625,
"learning_rate": 2.9060545772359305e-07,
"loss": -0.0065,
"reward": 1.0138888955116272,
"reward_std": 0.3242111261934042,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.486111119389534,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 653.4722290039062,
"epoch": 0.62382176520994,
"grad_norm": 0.39930129051208496,
"kl": 0.01010894775390625,
"learning_rate": 2.8804466342921987e-07,
"loss": -0.0025,
"reward": 0.9444444328546524,
"reward_std": 0.38147754967212677,
"rewards/accuracy_reward": 0.22222222946584225,
"rewards/format_reward": 0.5,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 837.6944732666016,
"epoch": 0.6255355612682091,
"grad_norm": 0.3432636260986328,
"kl": 0.00820159912109375,
"learning_rate": 2.854966364683872e-07,
"loss": 0.1109,
"reward": 0.7986111268401146,
"reward_std": 0.36156320944428444,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.4652777835726738,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 699.8750152587891,
"epoch": 0.6272493573264781,
"grad_norm": 0.3920920789241791,
"kl": 0.00817108154296875,
"learning_rate": 2.829615010283344e-07,
"loss": -0.0084,
"reward": 0.8263888955116272,
"reward_std": 0.153093121945858,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.493055559694767,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 733.0416717529297,
"epoch": 0.6289631533847472,
"grad_norm": 0.6963837146759033,
"kl": 0.01009368896484375,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.1349,
"reward": 1.2152777910232544,
"reward_std": 0.6726825386285782,
"rewards/accuracy_reward": 0.3611111156642437,
"rewards/format_reward": 0.493055559694767,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 789.3610992431641,
"epoch": 0.6306769494430163,
"grad_norm": 0.48074987530708313,
"kl": 0.00942230224609375,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.1273,
"reward": 0.7777777835726738,
"reward_std": 0.36897341534495354,
"rewards/accuracy_reward": 0.15277778450399637,
"rewards/format_reward": 0.4722222238779068,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 684.4861297607422,
"epoch": 0.6323907455012854,
"grad_norm": 0.5865346789360046,
"kl": 0.0122222900390625,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0003,
"reward": 1.0277777910232544,
"reward_std": 0.3762567415833473,
"rewards/accuracy_reward": 0.26388888992369175,
"rewards/format_reward": 0.5,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 657.6528015136719,
"epoch": 0.6341045415595544,
"grad_norm": 0.5562211871147156,
"kl": 0.0120849609375,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0188,
"reward": 1.1388888955116272,
"reward_std": 0.3075363263487816,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.4722222238779068,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 580.1111297607422,
"epoch": 0.6358183376178235,
"grad_norm": 0.35350364446640015,
"kl": 0.0099639892578125,
"learning_rate": 2.7048349887476037e-07,
"loss": -0.007,
"reward": 1.0902777761220932,
"reward_std": 0.3765922859311104,
"rewards/accuracy_reward": 0.3055555550381541,
"rewards/format_reward": 0.4791666641831398,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 718.4861297607422,
"epoch": 0.6375321336760925,
"grad_norm": 0.3661295771598816,
"kl": 0.00765228271484375,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0188,
"reward": 0.826388880610466,
"reward_std": 0.4112919941544533,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.493055559694767,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 756.375,
"epoch": 0.6392459297343616,
"grad_norm": 0.3112243115901947,
"kl": 0.01010894775390625,
"learning_rate": 2.655868138008171e-07,
"loss": -0.0013,
"reward": 0.8611111044883728,
"reward_std": 0.24017397314310074,
"rewards/accuracy_reward": 0.18055555876344442,
"rewards/format_reward": 0.5,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 813.3194427490234,
"epoch": 0.6409597257926307,
"grad_norm": 0.3174319863319397,
"kl": 0.007965087890625,
"learning_rate": 2.631592046130896e-07,
"loss": -0.0123,
"reward": 1.0833333432674408,
"reward_std": 0.2901904284954071,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.5,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 705.0972290039062,
"epoch": 0.6426735218508998,
"grad_norm": 0.46651625633239746,
"kl": 0.0082550048828125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0328,
"reward": 1.0486111044883728,
"reward_std": 0.6196977943181992,
"rewards/accuracy_reward": 0.27777778171002865,
"rewards/format_reward": 0.493055559694767,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 642.0972290039062,
"epoch": 0.6443873179091688,
"grad_norm": 0.5062349438667297,
"kl": 0.0101165771484375,
"learning_rate": 2.583460445215911e-07,
"loss": -0.0277,
"reward": 0.9861111342906952,
"reward_std": 0.3804878890514374,
"rewards/accuracy_reward": 0.25000000931322575,
"rewards/format_reward": 0.4861111119389534,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 701.2361145019531,
"epoch": 0.6461011139674379,
"grad_norm": 0.4397139549255371,
"kl": 0.008392333984375,
"learning_rate": 2.5596072820445254e-07,
"loss": -0.0174,
"reward": 1.0277777910232544,
"reward_std": 0.47276463359594345,
"rewards/accuracy_reward": 0.26388890016824007,
"rewards/format_reward": 0.5,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 680.9861145019531,
"epoch": 0.6478149100257069,
"grad_norm": 0.20873984694480896,
"kl": 0.0089569091796875,
"learning_rate": 2.5358974294659373e-07,
"loss": -0.0031,
"reward": 0.5555555522441864,
"reward_std": 0.13608276098966599,
"rewards/accuracy_reward": 0.02777777798473835,
"rewards/format_reward": 0.5,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 640.6666717529297,
"epoch": 0.6495287060839761,
"grad_norm": 0.4596330225467682,
"kl": 0.0101165771484375,
"learning_rate": 2.512332043064913e-07,
"loss": -0.009,
"reward": 0.7708333432674408,
"reward_std": 0.31242159754037857,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.493055559694767,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 522.8194427490234,
"epoch": 0.6512425021422451,
"grad_norm": 0.3971538543701172,
"kl": 0.011444091796875,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0003,
"reward": 0.972222238779068,
"reward_std": 0.4123065695166588,
"rewards/accuracy_reward": 0.23611111380159855,
"rewards/format_reward": 0.5,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 829.1250152587891,
"epoch": 0.6529562982005142,
"grad_norm": 0.2756218910217285,
"kl": 0.0085296630859375,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0465,
"reward": 0.5624999925494194,
"reward_std": 0.23096106760203838,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.4791666716337204,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 635.0833435058594,
"epoch": 0.6546700942587832,
"grad_norm": 0.42576533555984497,
"kl": 0.00997161865234375,
"learning_rate": 2.4425141308231765e-07,
"loss": -0.0305,
"reward": 1.0000000149011612,
"reward_std": 0.39428164809942245,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.5,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 768.0277862548828,
"epoch": 0.6563838903170522,
"grad_norm": 0.4228789508342743,
"kl": 0.010772705078125,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0427,
"reward": 0.7361110970377922,
"reward_std": 0.36172348074615,
"rewards/accuracy_reward": 0.12500000279396772,
"rewards/format_reward": 0.4861111119389534,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 730.5555572509766,
"epoch": 0.6580976863753213,
"grad_norm": 0.25273385643959045,
"kl": 0.0084991455078125,
"learning_rate": 2.3967120531894857e-07,
"loss": -0.0127,
"reward": 0.6388888955116272,
"reward_std": 0.15410767495632172,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.5,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 778.0833587646484,
"epoch": 0.6598114824335904,
"grad_norm": 0.39399486780166626,
"kl": 0.01232147216796875,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0025,
"reward": 0.7083333432674408,
"reward_std": 0.26772547513246536,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.486111119389534,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 661.8750152587891,
"epoch": 0.6615252784918595,
"grad_norm": 0.35121214389801025,
"kl": 0.01195526123046875,
"learning_rate": 2.3515149676898552e-07,
"loss": -0.0003,
"reward": 0.6111111044883728,
"reward_std": 0.222149059176445,
"rewards/accuracy_reward": 0.0555555559694767,
"rewards/format_reward": 0.5,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 646.2222366333008,
"epoch": 0.6632390745501285,
"grad_norm": 0.3854869604110718,
"kl": 0.01076507568359375,
"learning_rate": 2.3291460551638237e-07,
"loss": -0.016,
"reward": 0.8611110895872116,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.18055556062608957,
"rewards/format_reward": 0.5,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 707.0277786254883,
"epoch": 0.6649528706083976,
"grad_norm": 0.6442806720733643,
"kl": 0.017303466796875,
"learning_rate": 2.306931685585657e-07,
"loss": -0.0025,
"reward": 0.8611111044883728,
"reward_std": 0.5538673847913742,
"rewards/accuracy_reward": 0.19444445054978132,
"rewards/format_reward": 0.4722222238779068,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 647.8611145019531,
"epoch": 0.6666666666666666,
"grad_norm": 0.5503394603729248,
"kl": 0.0092926025390625,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0202,
"reward": 0.9861111119389534,
"reward_std": 0.5015645399689674,
"rewards/accuracy_reward": 0.25000000186264515,
"rewards/format_reward": 0.4861111119389534,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 731.2638854980469,
"epoch": 0.6683804627249358,
"grad_norm": 0.44946956634521484,
"kl": 0.00981903076171875,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.1489,
"reward": 0.9583333432674408,
"reward_std": 0.35901258140802383,
"rewards/accuracy_reward": 0.236111119389534,
"rewards/format_reward": 0.486111119389534,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 808.2083435058594,
"epoch": 0.6700942587832048,
"grad_norm": 0.4584062695503235,
"kl": 0.00930023193359375,
"learning_rate": 2.2412266235313973e-07,
"loss": -0.0126,
"reward": 1.020833358168602,
"reward_std": 0.5443559736013412,
"rewards/accuracy_reward": 0.2638888917863369,
"rewards/format_reward": 0.493055559694767,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 662.8055572509766,
"epoch": 0.6718080548414739,
"grad_norm": 0.5026288628578186,
"kl": 0.01027679443359375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0043,
"reward": 0.7222222238779068,
"reward_std": 0.25819889456033707,
"rewards/accuracy_reward": 0.1111111119389534,
"rewards/format_reward": 0.5,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 827.4861145019531,
"epoch": 0.6735218508997429,
"grad_norm": 0.4576079845428467,
"kl": 0.01000213623046875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0398,
"reward": 0.7708333134651184,
"reward_std": 0.31242159754037857,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.493055559694767,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 632.625,
"epoch": 0.675235646958012,
"grad_norm": 0.46405985951423645,
"kl": 0.010711669921875,
"learning_rate": 2.1769509671835223e-07,
"loss": -0.0166,
"reward": 0.7569444477558136,
"reward_std": 0.36649633944034576,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.5069444477558136,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 740.8055572509766,
"epoch": 0.676949443016281,
"grad_norm": 0.33941328525543213,
"kl": 0.0098724365234375,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0659,
"reward": 0.743055559694767,
"reward_std": 0.24438020400702953,
"rewards/accuracy_reward": 0.12500000465661287,
"rewards/format_reward": 0.493055559694767,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 810.3055725097656,
"epoch": 0.6786632390745502,
"grad_norm": 0.3110896348953247,
"kl": 0.00872802734375,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0229,
"reward": 0.819444440305233,
"reward_std": 0.17010344192385674,
"rewards/accuracy_reward": 0.16666667256504297,
"rewards/format_reward": 0.4861111119389534,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 735.9027862548828,
"epoch": 0.6803770351328192,
"grad_norm": 0.8936165571212769,
"kl": 0.01611328125,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0202,
"reward": 0.7013889029622078,
"reward_std": 0.26149011217057705,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.479166679084301,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 689.8472290039062,
"epoch": 0.6820908311910883,
"grad_norm": 0.23036593198776245,
"kl": 0.00927734375,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0021,
"reward": 0.8055555671453476,
"reward_std": 0.1773533970117569,
"rewards/accuracy_reward": 0.1527777798473835,
"rewards/format_reward": 0.5,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 606.8750076293945,
"epoch": 0.6838046272493573,
"grad_norm": 0.5848769545555115,
"kl": 0.009002685546875,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0063,
"reward": 1.0208333432674408,
"reward_std": 0.5025790855288506,
"rewards/accuracy_reward": 0.2638888955116272,
"rewards/format_reward": 0.493055559694767,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 822.8333435058594,
"epoch": 0.6855184233076264,
"grad_norm": 0.5785830020904541,
"kl": 0.00983428955078125,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0642,
"reward": 0.8750000298023224,
"reward_std": 0.6504513919353485,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.486111119389534,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 748.5138854980469,
"epoch": 0.6872322193658955,
"grad_norm": 0.3859395980834961,
"kl": 0.009033203125,
"learning_rate": 2.032690407508949e-07,
"loss": -0.0078,
"reward": 0.8750000149011612,
"reward_std": 0.3910152539610863,
"rewards/accuracy_reward": 0.19444444589316845,
"rewards/format_reward": 0.4861111119389534,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 705.9583435058594,
"epoch": 0.6889460154241646,
"grad_norm": 0.46305710077285767,
"kl": 0.009429931640625,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0405,
"reward": 0.659722238779068,
"reward_std": 0.3002174627035856,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.4652777835726738,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 689.0972290039062,
"epoch": 0.6906598114824336,
"grad_norm": 0.30027008056640625,
"kl": 0.01239013671875,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0017,
"reward": 0.986111119389534,
"reward_std": 0.3444380611181259,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.486111119389534,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 593.8333282470703,
"epoch": 0.6923736075407027,
"grad_norm": 0.0383455790579319,
"kl": 0.00807952880859375,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0003,
"reward": 0.6666666567325592,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 667.8194427490234,
"epoch": 0.6940874035989717,
"grad_norm": 0.36720189452171326,
"kl": 0.00794219970703125,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0015,
"reward": 1.1944444626569748,
"reward_std": 0.3762567341327667,
"rewards/accuracy_reward": 0.3472222276031971,
"rewards/format_reward": 0.5,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 609.4305725097656,
"epoch": 0.6958011996572407,
"grad_norm": 0.5361925959587097,
"kl": 0.00843048095703125,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0433,
"reward": 0.9166666716337204,
"reward_std": 0.3995024636387825,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/format_reward": 0.5,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 680.8055725097656,
"epoch": 0.6975149957155099,
"grad_norm": 0.3511025905609131,
"kl": 0.010101318359375,
"learning_rate": 1.915615368891117e-07,
"loss": -0.0073,
"reward": 1.0138888955116272,
"reward_std": 0.40500660240650177,
"rewards/accuracy_reward": 0.2638888917863369,
"rewards/format_reward": 0.486111119389534,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 488.3472213745117,
"epoch": 0.699228791773779,
"grad_norm": 0.56728196144104,
"kl": 0.0164031982421875,
"learning_rate": 1.8967088307307e-07,
"loss": -0.0141,
"reward": 1.3819444477558136,
"reward_std": 0.46130847185850143,
"rewards/accuracy_reward": 0.44444444961845875,
"rewards/format_reward": 0.493055559694767,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 706.125,
"epoch": 0.700942587832048,
"grad_norm": 0.5061081647872925,
"kl": 0.0098876953125,
"learning_rate": 1.8779779118983867e-07,
"loss": -0.0202,
"reward": 0.8958333358168602,
"reward_std": 0.524095680564642,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.4791666641831398,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 616.6944580078125,
"epoch": 0.702656383890317,
"grad_norm": 0.4515543282032013,
"kl": 0.0114288330078125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0007,
"reward": 0.7430555745959282,
"reward_std": 0.33044650219380856,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.493055559694767,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 776.875,
"epoch": 0.7043701799485861,
"grad_norm": 0.4085945785045624,
"kl": 0.0094146728515625,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0977,
"reward": 0.8402777835726738,
"reward_std": 0.2934070285409689,
"rewards/accuracy_reward": 0.18055555690079927,
"rewards/format_reward": 0.479166679084301,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 741.5000152587891,
"epoch": 0.7060839760068551,
"grad_norm": 0.4242144823074341,
"kl": 0.010284423828125,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0346,
"reward": 0.5624999925494194,
"reward_std": 0.25515517219901085,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.4791666716337204,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 657.5694580078125,
"epoch": 0.7077977720651243,
"grad_norm": 0.48833325505256653,
"kl": 0.0101318359375,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0055,
"reward": 0.7777777761220932,
"reward_std": 0.40472327172756195,
"rewards/accuracy_reward": 0.13888889085501432,
"rewards/format_reward": 0.5,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 751.5833587646484,
"epoch": 0.7095115681233933,
"grad_norm": 0.49302709102630615,
"kl": 0.00957489013671875,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.069,
"reward": 0.6458333432674408,
"reward_std": 0.28473581932485104,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.479166679084301,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 776.7916870117188,
"epoch": 0.7112253641816624,
"grad_norm": 0.568556010723114,
"kl": 0.0110321044921875,
"learning_rate": 1.7693309235023127e-07,
"loss": -0.042,
"reward": 0.6875,
"reward_std": 0.2086303625255823,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.4652777835726738,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 628.6527862548828,
"epoch": 0.7129391602399314,
"grad_norm": 0.33500367403030396,
"kl": 0.0119171142578125,
"learning_rate": 1.7518544168045524e-07,
"loss": -0.0129,
"reward": 0.9444444626569748,
"reward_std": 0.2453947812318802,
"rewards/accuracy_reward": 0.2222222276031971,
"rewards/format_reward": 0.5,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 682.6250076293945,
"epoch": 0.7146529562982005,
"grad_norm": 0.42712926864624023,
"kl": 0.010406494140625,
"learning_rate": 1.7345605894346726e-07,
"loss": -0.0008,
"reward": 0.7152777910232544,
"reward_std": 0.4368143603205681,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.493055559694767,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 570.2222366333008,
"epoch": 0.7163667523564696,
"grad_norm": 0.47915688157081604,
"kl": 0.0113983154296875,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0123,
"reward": 0.5694444477558136,
"reward_std": 0.12530778720974922,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.486111119389534,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 574.3750152587891,
"epoch": 0.7180805484147387,
"grad_norm": 0.43631237745285034,
"kl": 0.01261138916015625,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0184,
"reward": 0.8263888955116272,
"reward_std": 0.35694384574890137,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.493055559694767,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 711.5833358764648,
"epoch": 0.7197943444730077,
"grad_norm": 0.3256857693195343,
"kl": 0.00872039794921875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0162,
"reward": 0.7361111268401146,
"reward_std": 0.26139055751264095,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.486111119389534,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 578.9722137451172,
"epoch": 0.7215081405312768,
"grad_norm": 0.5653530359268188,
"kl": 0.0140533447265625,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0693,
"reward": 1.0208333432674408,
"reward_std": 0.4548392668366432,
"rewards/accuracy_reward": 0.2638888927176595,
"rewards/format_reward": 0.493055559694767,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 655.0555572509766,
"epoch": 0.7232219365895458,
"grad_norm": 0.47506195306777954,
"kl": 0.00907135009765625,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0037,
"reward": 0.6597222238779068,
"reward_std": 0.23915940523147583,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.493055559694767,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 802.1111297607422,
"epoch": 0.7249357326478149,
"grad_norm": 0.41230887174606323,
"kl": 0.01018524169921875,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0106,
"reward": 0.9652777761220932,
"reward_std": 0.41896694898605347,
"rewards/accuracy_reward": 0.23611111752688885,
"rewards/format_reward": 0.493055559694767,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 806.0416717529297,
"epoch": 0.726649528706084,
"grad_norm": 0.41215652227401733,
"kl": 0.0081787109375,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0068,
"reward": 0.784722238779068,
"reward_std": 0.359107568860054,
"rewards/accuracy_reward": 0.15277778077870607,
"rewards/format_reward": 0.4791666716337204,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 700.6111221313477,
"epoch": 0.7283633247643531,
"grad_norm": 0.6025084257125854,
"kl": 0.0133056640625,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0072,
"reward": 1.0972222238779068,
"reward_std": 0.6032442003488541,
"rewards/accuracy_reward": 0.3055555634200573,
"rewards/format_reward": 0.486111119389534,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 503.37500762939453,
"epoch": 0.7300771208226221,
"grad_norm": 0.6861910223960876,
"kl": 0.0125579833984375,
"learning_rate": 1.5872728172265146e-07,
"loss": -0.0144,
"reward": 1.0555555671453476,
"reward_std": 0.5035936608910561,
"rewards/accuracy_reward": 0.2777777798473835,
"rewards/format_reward": 0.5,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 672.6805572509766,
"epoch": 0.7317909168808912,
"grad_norm": 0.5222618579864502,
"kl": 0.01123046875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0373,
"reward": 0.7291666716337204,
"reward_std": 0.34122148901224136,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.479166679084301,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 678.2916793823242,
"epoch": 0.7335047129391602,
"grad_norm": 0.510343074798584,
"kl": 0.01007843017578125,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0694,
"reward": 1.0277777761220932,
"reward_std": 0.6949137225747108,
"rewards/accuracy_reward": 0.26388889644294977,
"rewards/format_reward": 0.5,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 694.0277709960938,
"epoch": 0.7352185089974294,
"grad_norm": 0.5112901329994202,
"kl": 0.010711669921875,
"learning_rate": 1.5415814221002265e-07,
"loss": -0.0032,
"reward": 0.770833320915699,
"reward_std": 0.4112919941544533,
"rewards/accuracy_reward": 0.13888889271765947,
"rewards/format_reward": 0.493055559694767,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 779.0555572509766,
"epoch": 0.7369323050556984,
"grad_norm": 0.40487441420555115,
"kl": 0.00792694091796875,
"learning_rate": 1.5267358321348285e-07,
"loss": -0.007,
"reward": 0.8541666716337204,
"reward_std": 0.3304464966058731,
"rewards/accuracy_reward": 0.18055555690079927,
"rewards/format_reward": 0.493055559694767,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 679.5278015136719,
"epoch": 0.7386461011139674,
"grad_norm": 0.5640285015106201,
"kl": 0.0105438232421875,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.016,
"reward": 0.8750000149011612,
"reward_std": 0.31113363057374954,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.486111119389534,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 642.7639007568359,
"epoch": 0.7403598971722365,
"grad_norm": 0.5031344890594482,
"kl": 0.011260986328125,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0137,
"reward": 0.986111119389534,
"reward_std": 0.4052800089120865,
"rewards/accuracy_reward": 0.2500000046566129,
"rewards/format_reward": 0.486111119389534,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 764.3055572509766,
"epoch": 0.7420736932305055,
"grad_norm": 0.30649498105049133,
"kl": 0.00821685791015625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0368,
"reward": 0.9027778059244156,
"reward_std": 0.28722215443849564,
"rewards/accuracy_reward": 0.20833333767950535,
"rewards/format_reward": 0.486111119389534,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 690.9027709960938,
"epoch": 0.7437874892887746,
"grad_norm": 0.3682223856449127,
"kl": 0.00897216796875,
"learning_rate": 1.469297078922642e-07,
"loss": -0.0162,
"reward": 0.6874999925494194,
"reward_std": 0.26762592047452927,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.493055559694767,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 688.2777862548828,
"epoch": 0.7455012853470437,
"grad_norm": 0.5453760027885437,
"kl": 0.0087127685546875,
"learning_rate": 1.4554267916537495e-07,
"loss": -0.002,
"reward": 0.9097222164273262,
"reward_std": 0.416512792930007,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.493055559694767,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 720.7500152587891,
"epoch": 0.7472150814053128,
"grad_norm": 0.4208390712738037,
"kl": 0.00977325439453125,
"learning_rate": 1.4417536311769885e-07,
"loss": -0.0004,
"reward": 1.0486111342906952,
"reward_std": 0.4112920016050339,
"rewards/accuracy_reward": 0.27777778822928667,
"rewards/format_reward": 0.493055559694767,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 646.1666717529297,
"epoch": 0.7489288774635818,
"grad_norm": 0.38405895233154297,
"kl": 0.00957489013671875,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0228,
"reward": 0.861111119389534,
"reward_std": 0.2901904284954071,
"rewards/accuracy_reward": 0.18055555783212185,
"rewards/format_reward": 0.5,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 586.0555572509766,
"epoch": 0.7506426735218509,
"grad_norm": 0.2572639584541321,
"kl": 0.0116729736328125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0052,
"reward": 0.9861111044883728,
"reward_std": 0.12496887892484665,
"rewards/accuracy_reward": 0.2500000009313226,
"rewards/format_reward": 0.4861111119389534,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 721.8333435058594,
"epoch": 0.7523564695801199,
"grad_norm": 0.3360980451107025,
"kl": 0.00910186767578125,
"learning_rate": 1.4019235263722034e-07,
"loss": -0.0078,
"reward": 0.909722238779068,
"reward_std": 0.25718431919813156,
"rewards/accuracy_reward": 0.20833334047347307,
"rewards/format_reward": 0.493055559694767,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 659.875,
"epoch": 0.7540702656383891,
"grad_norm": 0.4243048131465912,
"kl": 0.0103912353515625,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0002,
"reward": 0.9375,
"reward_std": 0.3150074779987335,
"rewards/accuracy_reward": 0.22222222108393908,
"rewards/format_reward": 0.493055559694767,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 776.7083282470703,
"epoch": 0.7557840616966581,
"grad_norm": 0.25701966881752014,
"kl": 0.00785064697265625,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0297,
"reward": 0.8750000149011612,
"reward_std": 0.1584134679287672,
"rewards/accuracy_reward": 0.19444444961845875,
"rewards/format_reward": 0.486111119389534,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 634.2500076293945,
"epoch": 0.7574978577549272,
"grad_norm": 0.42774713039398193,
"kl": 0.00933837890625,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0248,
"reward": 0.9583333432674408,
"reward_std": 0.3544755354523659,
"rewards/accuracy_reward": 0.236111119389534,
"rewards/format_reward": 0.486111119389534,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 714.0833282470703,
"epoch": 0.7592116538131962,
"grad_norm": 0.5273145437240601,
"kl": 0.01111602783203125,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0447,
"reward": 1.0486111342906952,
"reward_std": 0.421733595430851,
"rewards/accuracy_reward": 0.2777777835726738,
"rewards/format_reward": 0.493055559694767,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 805.0972442626953,
"epoch": 0.7609254498714653,
"grad_norm": 0.28000712394714355,
"kl": 0.00943756103515625,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0138,
"reward": 0.7708333432674408,
"reward_std": 0.40107376128435135,
"rewards/accuracy_reward": 0.13888889364898205,
"rewards/format_reward": 0.493055559694767,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 711.0416870117188,
"epoch": 0.7626392459297343,
"grad_norm": 0.48471271991729736,
"kl": 0.0093841552734375,
"learning_rate": 1.3276726544494571e-07,
"loss": -0.0408,
"reward": 0.7708333283662796,
"reward_std": 0.399602010846138,
"rewards/accuracy_reward": 0.13888889085501432,
"rewards/format_reward": 0.493055559694767,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 650.1944427490234,
"epoch": 0.7643530419880035,
"grad_norm": 0.2905956208705902,
"kl": 0.0093841552734375,
"learning_rate": 1.316005813502869e-07,
"loss": -0.0041,
"reward": 0.8541666865348816,
"reward_std": 0.2211344838142395,
"rewards/accuracy_reward": 0.18055556155741215,
"rewards/format_reward": 0.493055559694767,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 744.8194580078125,
"epoch": 0.7660668380462725,
"grad_norm": 0.6048784255981445,
"kl": 0.0104522705078125,
"learning_rate": 1.3045428945301953e-07,
"loss": -0.0198,
"reward": 1.0555555820465088,
"reward_std": 0.4907895475625992,
"rewards/accuracy_reward": 0.2777777835726738,
"rewards/format_reward": 0.5,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 734.8194580078125,
"epoch": 0.7677806341045416,
"grad_norm": 0.3953067362308502,
"kl": 0.00873565673828125,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0348,
"reward": 1.0694444477558136,
"reward_std": 0.43352314084768295,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.486111119389534,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 751.0555877685547,
"epoch": 0.7694944301628106,
"grad_norm": 0.3929743766784668,
"kl": 0.0112152099609375,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0141,
"reward": 0.798611119389534,
"reward_std": 0.15942803025245667,
"rewards/accuracy_reward": 0.15277778450399637,
"rewards/format_reward": 0.493055559694767,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 678.0,
"epoch": 0.7712082262210797,
"grad_norm": 0.38888290524482727,
"kl": 0.0111541748046875,
"learning_rate": 1.2713832064634125e-07,
"loss": -0.0292,
"reward": 0.888888880610466,
"reward_std": 0.4547397494316101,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.5,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 732.7083587646484,
"epoch": 0.7729220222793488,
"grad_norm": 0.4212208688259125,
"kl": 0.0086212158203125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0172,
"reward": 0.6597222164273262,
"reward_std": 0.2624051198363304,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.493055559694767,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 701.0416870117188,
"epoch": 0.7746358183376179,
"grad_norm": 0.4472486078739166,
"kl": 0.0110626220703125,
"learning_rate": 1.2503063339313356e-07,
"loss": -0.0389,
"reward": 0.9027777761220932,
"reward_std": 0.44951891899108887,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.4861111119389534,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 750.6527862548828,
"epoch": 0.7763496143958869,
"grad_norm": 0.31978559494018555,
"kl": 0.00965118408203125,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0029,
"reward": 0.6527777761220932,
"reward_std": 0.24447975307703018,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.486111119389534,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 627.0555725097656,
"epoch": 0.778063410454156,
"grad_norm": 0.5572839975357056,
"kl": 0.0136260986328125,
"learning_rate": 1.2300579475997657e-07,
"loss": -0.0167,
"reward": 1.1875000149011612,
"reward_std": 0.4453127048909664,
"rewards/accuracy_reward": 0.3472222248092294,
"rewards/format_reward": 0.493055559694767,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 705.1110992431641,
"epoch": 0.779777206512425,
"grad_norm": 0.3036157190799713,
"kl": 0.0082244873046875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0153,
"reward": 0.6944444328546524,
"reward_std": 0.2634196951985359,
"rewards/accuracy_reward": 0.0972222238779068,
"rewards/format_reward": 0.5,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 696.0000152587891,
"epoch": 0.781491002570694,
"grad_norm": 0.304671049118042,
"kl": 0.0107574462890625,
"learning_rate": 1.2106419949317388e-07,
"loss": -0.0076,
"reward": 0.8333333283662796,
"reward_std": 0.2221490517258644,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.5,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 688.875,
"epoch": 0.7832047986289632,
"grad_norm": 0.5803426504135132,
"kl": 0.010406494140625,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0092,
"reward": 0.9861111417412758,
"reward_std": 0.3805625271052122,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.486111119389534,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 601.5277862548828,
"epoch": 0.7849185946872322,
"grad_norm": 0.7371811270713806,
"kl": 0.0137481689453125,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0169,
"reward": 0.8333333283662796,
"reward_std": 0.43149399757385254,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.5,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 628.7500152587891,
"epoch": 0.7866323907455013,
"grad_norm": 0.5148778557777405,
"kl": 0.011505126953125,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0304,
"reward": 1.0763888955116272,
"reward_std": 0.2211344838142395,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.493055559694767,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 757.3889007568359,
"epoch": 0.7883461868037703,
"grad_norm": 0.4826764464378357,
"kl": 0.0094757080078125,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0019,
"reward": 0.7152777761220932,
"reward_std": 0.384667344391346,
"rewards/accuracy_reward": 0.1111111119389534,
"rewards/format_reward": 0.493055559694767,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 667.8055572509766,
"epoch": 0.7900599828620394,
"grad_norm": 0.5709467530250549,
"kl": 0.01129150390625,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0253,
"reward": 0.9027778059244156,
"reward_std": 0.5824100151658058,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/format_reward": 0.486111119389534,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 782.0694274902344,
"epoch": 0.7917737789203085,
"grad_norm": 0.359210729598999,
"kl": 0.01105499267578125,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0259,
"reward": 0.8194444477558136,
"reward_std": 0.27941547334194183,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4861111119389534,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 722.0555572509766,
"epoch": 0.7934875749785776,
"grad_norm": 0.47103169560432434,
"kl": 0.01102447509765625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0043,
"reward": 0.8124999925494194,
"reward_std": 0.43280857615172863,
"rewards/accuracy_reward": 0.16666667256504297,
"rewards/format_reward": 0.4791666716337204,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 708.9722290039062,
"epoch": 0.7952013710368466,
"grad_norm": 0.34840986132621765,
"kl": 0.0095672607421875,
"learning_rate": 1.1413757749211602e-07,
"loss": -0.0008,
"reward": 0.888888880610466,
"reward_std": 0.13608276098966599,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.5,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 709.2083282470703,
"epoch": 0.7969151670951157,
"grad_norm": 0.34385910630226135,
"kl": 0.0106964111328125,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0174,
"reward": 0.8263888955116272,
"reward_std": 0.3135357052087784,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.493055559694767,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 738.0416717529297,
"epoch": 0.7986289631533847,
"grad_norm": 0.39174365997314453,
"kl": 0.0086212158203125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0083,
"reward": 1.0833333730697632,
"reward_std": 0.4855687543749809,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.5,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 783.3611145019531,
"epoch": 0.8003427592116538,
"grad_norm": 0.21360760927200317,
"kl": 0.0073394775390625,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0061,
"reward": 0.7222222238779068,
"reward_std": 0.13608276098966599,
"rewards/accuracy_reward": 0.11111111287027597,
"rewards/format_reward": 0.5,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 591.0416564941406,
"epoch": 0.8020565552699229,
"grad_norm": 0.44213274121284485,
"kl": 0.0121307373046875,
"learning_rate": 1.1118279056249653e-07,
"loss": -0.0334,
"reward": 0.9375000074505806,
"reward_std": 0.176338829100132,
"rewards/accuracy_reward": 0.2222222238779068,
"rewards/format_reward": 0.493055559694767,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 908.9305419921875,
"epoch": 0.803770351328192,
"grad_norm": 2.4339747428894043,
"kl": 0.04512786865234375,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0283,
"reward": 0.8472222164273262,
"reward_std": 0.4613333996385336,
"rewards/accuracy_reward": 0.18055556155741215,
"rewards/format_reward": 0.486111119389534,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 672.7222290039062,
"epoch": 0.805484147386461,
"grad_norm": 0.3775832951068878,
"kl": 0.00971221923828125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0084,
"reward": 0.826388880610466,
"reward_std": 0.2337997630238533,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.493055559694767,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 837.1944580078125,
"epoch": 0.8071979434447301,
"grad_norm": 0.3247397840023041,
"kl": 0.0085601806640625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0263,
"reward": 0.8750000149011612,
"reward_std": 0.40380824357271194,
"rewards/accuracy_reward": 0.19444444682449102,
"rewards/format_reward": 0.486111119389534,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 636.1527862548828,
"epoch": 0.8089117395029991,
"grad_norm": 0.30879315733909607,
"kl": 0.0115203857421875,
"learning_rate": 1.0857018009286381e-07,
"loss": -0.0045,
"reward": 0.7708333432674408,
"reward_std": 0.22086109220981598,
"rewards/accuracy_reward": 0.1388888955116272,
"rewards/format_reward": 0.493055559694767,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 697.8194427490234,
"epoch": 0.8106255355612683,
"grad_norm": 0.5799990296363831,
"kl": 0.0116119384765625,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0063,
"reward": 0.8333333656191826,
"reward_std": 0.2221490480005741,
"rewards/accuracy_reward": 0.18055556155741215,
"rewards/format_reward": 0.472222238779068,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 597.6944580078125,
"epoch": 0.8123393316195373,
"grad_norm": 0.673372209072113,
"kl": 0.012725830078125,
"learning_rate": 1.0739283813397639e-07,
"loss": -0.0041,
"reward": 1.1527777910232544,
"reward_std": 0.6239039897918701,
"rewards/accuracy_reward": 0.33333334140479565,
"rewards/format_reward": 0.486111119389534,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 647.6944427490234,
"epoch": 0.8140531276778064,
"grad_norm": 0.47942498326301575,
"kl": 0.0100555419921875,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0141,
"reward": 0.8888888955116272,
"reward_std": 0.4675438329577446,
"rewards/accuracy_reward": 0.1944444514811039,
"rewards/format_reward": 0.5,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 785.625,
"epoch": 0.8157669237360754,
"grad_norm": 0.2584112882614136,
"kl": 0.00859832763671875,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0052,
"reward": 0.6944444328546524,
"reward_std": 0.0680413767695427,
"rewards/accuracy_reward": 0.09722222480922937,
"rewards/format_reward": 0.5,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 718.2083282470703,
"epoch": 0.8174807197943444,
"grad_norm": 0.33669352531433105,
"kl": 0.01165771484375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0043,
"reward": 0.9861111342906952,
"reward_std": 0.32973192632198334,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.4861111119389534,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 617.8611145019531,
"epoch": 0.8191945158526135,
"grad_norm": 0.8135344982147217,
"kl": 0.01825714111328125,
"learning_rate": 1.0529722834905125e-07,
"loss": -0.0315,
"reward": 0.9722222238779068,
"reward_std": 0.4262731894850731,
"rewards/accuracy_reward": 0.23611111473292112,
"rewards/format_reward": 0.5,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 593.4305648803711,
"epoch": 0.8209083119108826,
"grad_norm": 0.4805351197719574,
"kl": 0.0129547119140625,
"learning_rate": 1.0482745016665526e-07,
"loss": -0.0022,
"reward": 1.194444477558136,
"reward_std": 0.5820766538381577,
"rewards/accuracy_reward": 0.3472222238779068,
"rewards/format_reward": 0.5,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 634.7777862548828,
"epoch": 0.8226221079691517,
"grad_norm": 0.37174245715141296,
"kl": 0.01177215576171875,
"learning_rate": 1.0437936906629334e-07,
"loss": -0.0185,
"reward": 1.4375,
"reward_std": 0.4637626111507416,
"rewards/accuracy_reward": 0.4722222313284874,
"rewards/format_reward": 0.493055559694767,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 670.3750152587891,
"epoch": 0.8243359040274207,
"grad_norm": 0.5397735834121704,
"kl": 0.0081024169921875,
"learning_rate": 1.0395300688680625e-07,
"loss": -0.0052,
"reward": 1.166666641831398,
"reward_std": 0.41752735525369644,
"rewards/accuracy_reward": 0.3333333367481828,
"rewards/format_reward": 0.5,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 691.2777862548828,
"epoch": 0.8260497000856898,
"grad_norm": 0.3484320342540741,
"kl": 0.011199951171875,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0018,
"reward": 0.7430555671453476,
"reward_std": 0.33303238451480865,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.493055559694767,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 664.263916015625,
"epoch": 0.8277634961439588,
"grad_norm": 0.43313318490982056,
"kl": 0.014739990234375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0133,
"reward": 0.6527777761220932,
"reward_std": 0.25279081612825394,
"rewards/accuracy_reward": 0.08333333488553762,
"rewards/format_reward": 0.486111119389534,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 688.8888854980469,
"epoch": 0.829477292202228,
"grad_norm": 0.5132657289505005,
"kl": 0.009918212890625,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.01,
"reward": 1.0277778059244156,
"reward_std": 0.49601035565137863,
"rewards/accuracy_reward": 0.2638888917863369,
"rewards/format_reward": 0.5,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 637.1388854980469,
"epoch": 0.831191088260497,
"grad_norm": 0.470441997051239,
"kl": 0.01073455810546875,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0112,
"reward": 0.888888880610466,
"reward_std": 0.4803479462862015,
"rewards/accuracy_reward": 0.19444444868713617,
"rewards/format_reward": 0.5,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 705.2777862548828,
"epoch": 0.8329048843187661,
"grad_norm": 0.574053943157196,
"kl": 0.0115509033203125,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0604,
"reward": 1.1041666567325592,
"reward_std": 0.4360002353787422,
"rewards/accuracy_reward": 0.3194444486871362,
"rewards/format_reward": 0.4652777835726738,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 708.1666793823242,
"epoch": 0.8346186803770351,
"grad_norm": 0.3975141942501068,
"kl": 0.01026153564453125,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.011,
"reward": 0.6250000074505806,
"reward_std": 0.24970055185258389,
"rewards/accuracy_reward": 0.06944444496184587,
"rewards/format_reward": 0.486111119389534,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 667.0555572509766,
"epoch": 0.8363324764353042,
"grad_norm": 0.4273344576358795,
"kl": 0.0111236572265625,
"learning_rate": 1.0157821333772304e-07,
"loss": -0.0084,
"reward": 1.2708333283662796,
"reward_std": 0.32522569596767426,
"rewards/accuracy_reward": 0.38888888619840145,
"rewards/format_reward": 0.493055559694767,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 709.2083435058594,
"epoch": 0.8380462724935732,
"grad_norm": 0.3746323883533478,
"kl": 0.008514404296875,
"learning_rate": 1.013262614978859e-07,
"loss": -0.0182,
"reward": 0.6944444477558136,
"reward_std": 0.2901904359459877,
"rewards/accuracy_reward": 0.09722222574055195,
"rewards/format_reward": 0.5,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 678.0555572509766,
"epoch": 0.8397600685518424,
"grad_norm": 0.44212105870246887,
"kl": 0.0089874267578125,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0007,
"reward": 1.1666666567325592,
"reward_std": 0.30821534991264343,
"rewards/accuracy_reward": 0.33333333767950535,
"rewards/format_reward": 0.5,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 675.7777938842773,
"epoch": 0.8414738646101114,
"grad_norm": 0.45065274834632874,
"kl": 0.0085906982421875,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0092,
"reward": 0.8055555522441864,
"reward_std": 0.3995024636387825,
"rewards/accuracy_reward": 0.15277778077870607,
"rewards/format_reward": 0.5,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 724.4444580078125,
"epoch": 0.8431876606683805,
"grad_norm": 0.2776910364627838,
"kl": 0.008880615234375,
"learning_rate": 1.0070165611810855e-07,
"loss": -0.019,
"reward": 0.7430555745959282,
"reward_std": 0.22113448940217495,
"rewards/accuracy_reward": 0.12500000093132257,
"rewards/format_reward": 0.493055559694767,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 734.7500152587891,
"epoch": 0.8449014567266495,
"grad_norm": 0.273231565952301,
"kl": 0.01227569580078125,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0169,
"reward": 0.75,
"reward_std": 0.15410767495632172,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.5,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 817.6666717529297,
"epoch": 0.8466152527849186,
"grad_norm": 0.3405468463897705,
"kl": 0.0100250244140625,
"learning_rate": 1.0039472645551372e-07,
"loss": -0.0042,
"reward": 0.9097222238779068,
"reward_std": 0.3072007745504379,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.493055559694767,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 620.8611068725586,
"epoch": 0.8483290488431876,
"grad_norm": 0.8697577714920044,
"kl": 0.0175018310546875,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0113,
"reward": 1.0625000149011612,
"reward_std": 0.4665292650461197,
"rewards/accuracy_reward": 0.29166667349636555,
"rewards/format_reward": 0.479166679084301,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 636.8472290039062,
"epoch": 0.8500428449014568,
"grad_norm": 0.5277899503707886,
"kl": 0.0098876953125,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0341,
"reward": 1.2222222238779068,
"reward_std": 0.686167873442173,
"rewards/accuracy_reward": 0.3611111231148243,
"rewards/format_reward": 0.5,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 800.9861145019531,
"epoch": 0.8517566409597258,
"grad_norm": 0.44679829478263855,
"kl": 0.012115478515625,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0143,
"reward": 0.8819444552063942,
"reward_std": 0.34861752949655056,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.4652777910232544,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 856.0694580078125,
"epoch": 0.8534704370179949,
"grad_norm": 0.22854942083358765,
"kl": 0.01018524169921875,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0058,
"reward": 0.6597222238779068,
"reward_std": 0.13479479402303696,
"rewards/accuracy_reward": 0.08333333674818277,
"rewards/format_reward": 0.493055559694767,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 564.8888854980469,
"epoch": 0.8551842330762639,
"grad_norm": 0.7382153272628784,
"kl": 0.0165252685546875,
"learning_rate": 1.0001096618257236e-07,
"loss": -0.0162,
"reward": 1.0277777761220932,
"reward_std": 0.2901904284954071,
"rewards/accuracy_reward": 0.26388889644294977,
"rewards/format_reward": 0.5,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 571.8194351196289,
"epoch": 0.856898029134533,
"grad_norm": 0.5685946941375732,
"kl": 0.0124969482421875,
"learning_rate": 1e-07,
"loss": -0.0212,
"reward": 1.1597222089767456,
"reward_std": 0.4405168890953064,
"rewards/accuracy_reward": 0.3333333330228925,
"rewards/format_reward": 0.493055559694767,
"step": 500
},
{
"epoch": 0.856898029134533,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.01467308583567501,
"train_runtime": 25494.6867,
"train_samples_per_second": 1.412,
"train_steps_per_second": 0.02
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}