{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.856898029134533, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 673.8611145019531, "epoch": 0.001713796058269066, "grad_norm": 0.6894496083259583, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0307, "reward": 0.5902777910232544, "reward_std": 0.3424043729901314, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.2569444514811039, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 663.3889007568359, "epoch": 0.003427592116538132, "grad_norm": 0.6292832493782043, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.0039, "reward": 0.4583333358168602, "reward_std": 0.45602361112833023, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.2638888917863369, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 843.0972442626953, "epoch": 0.005141388174807198, "grad_norm": 0.5272438526153564, "kl": 0.00014257431030273438, "learning_rate": 6e-08, "loss": 0.0375, "reward": 0.5972222238779068, "reward_std": 0.4886699207127094, "rewards/accuracy_reward": 0.16666667256504297, "rewards/format_reward": 0.2638888955116272, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 708.4166717529297, "epoch": 0.006855184233076264, "grad_norm": 0.59702068567276, "kl": 0.0001093149185180664, "learning_rate": 8e-08, "loss": 0.0373, "reward": 0.5069444477558136, "reward_std": 0.563178788870573, "rewards/accuracy_reward": 0.13888888992369175, "rewards/format_reward": 0.2291666716337204, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 663.9722290039062, "epoch": 0.00856898029134533, "grad_norm": 0.8420807123184204, "kl": 0.00011909008026123047, "learning_rate": 1e-07, "loss": 0.0547, "reward": 0.666666679084301, "reward_std": 0.5673302263021469, "rewards/accuracy_reward": 0.22222222480922937, "rewards/format_reward": 0.2222222238779068, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 630.0, "epoch": 0.010282776349614395, "grad_norm": 0.8917471170425415, "kl": 0.00018858909606933594, "learning_rate": 1.2e-07, "loss": 0.0357, "reward": 0.5833333358168602, "reward_std": 0.4646867737174034, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.2500000037252903, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 650.5416870117188, "epoch": 0.011996572407883462, "grad_norm": 0.7703569531440735, "kl": 8.26716423034668e-05, "learning_rate": 1.4e-07, "loss": -0.0085, "reward": 0.4305555634200573, "reward_std": 0.31282100826501846, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.3194444514811039, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 775.2500152587891, "epoch": 0.013710368466152529, "grad_norm": 0.7724463939666748, "kl": 0.00013768672943115234, "learning_rate": 1.6e-07, "loss": 0.0749, "reward": 0.3611111082136631, "reward_std": 0.41373568773269653, "rewards/accuracy_reward": 0.06944444589316845, "rewards/format_reward": 0.2222222276031971, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 634.8472213745117, "epoch": 0.015424164524421594, "grad_norm": 0.5836902856826782, "kl": 0.0001068115234375, "learning_rate": 1.8e-07, "loss": 0.0627, "reward": 0.4861111156642437, "reward_std": 0.46435344591736794, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.2361111119389534, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 768.4305572509766, "epoch": 0.01713796058269066, "grad_norm": 0.8060944676399231, "kl": 0.0001360177993774414, "learning_rate": 2e-07, "loss": 0.0023, "reward": 0.7847222089767456, "reward_std": 0.46876538544893265, "rewards/accuracy_reward": 0.291666672565043, "rewards/format_reward": 0.2013888955116272, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 661.7638854980469, "epoch": 0.018851756640959727, "grad_norm": 0.8618626594543457, "kl": 9.322166442871094e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0605, "reward": 0.5763889029622078, "reward_std": 0.5653917491436005, "rewards/accuracy_reward": 0.15277778264135122, "rewards/format_reward": 0.2708333358168602, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 847.4583435058594, "epoch": 0.02056555269922879, "grad_norm": 0.5977460741996765, "kl": 0.0001245737075805664, "learning_rate": 2.4e-07, "loss": 0.0545, "reward": 0.4583333395421505, "reward_std": 0.4860685095191002, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.2638888917863369, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 687.4305725097656, "epoch": 0.022279348757497857, "grad_norm": 0.7761760354042053, "kl": 0.00014710426330566406, "learning_rate": 2.6e-07, "loss": 0.0447, "reward": 0.3888888955116272, "reward_std": 0.4280589930713177, "rewards/accuracy_reward": 0.06944444589316845, "rewards/format_reward": 0.25, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 776.2916641235352, "epoch": 0.023993144815766924, "grad_norm": 1.0175061225891113, "kl": 0.00014960765838623047, "learning_rate": 2.8e-07, "loss": 0.0222, "reward": 0.611111119389534, "reward_std": 0.48110663890838623, "rewards/accuracy_reward": 0.1944444477558136, "rewards/format_reward": 0.2222222238779068, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 808.4722290039062, "epoch": 0.02570694087403599, "grad_norm": 0.5965744853019714, "kl": 0.0001302957534790039, "learning_rate": 3e-07, "loss": 0.0488, "reward": 0.43055555410683155, "reward_std": 0.3586147967725992, "rewards/accuracy_reward": 0.1111111119389534, "rewards/format_reward": 0.20833333767950535, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 577.4027862548828, "epoch": 0.027420736932305057, "grad_norm": 0.8511067032814026, "kl": 0.00014138221740722656, "learning_rate": 3.2e-07, "loss": 0.0287, "reward": 0.7916666716337204, "reward_std": 0.7402771413326263, "rewards/accuracy_reward": 0.2638888992369175, "rewards/format_reward": 0.2638888922519982, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 693.2777786254883, "epoch": 0.02913453299057412, "grad_norm": 1.0029399394989014, "kl": 0.00013744831085205078, "learning_rate": 3.4000000000000003e-07, "loss": -0.0047, "reward": 0.7430555745959282, "reward_std": 0.5347021222114563, "rewards/accuracy_reward": 0.2777777863666415, "rewards/format_reward": 0.18750000093132257, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 694.6666870117188, "epoch": 0.030848329048843187, "grad_norm": 0.929906964302063, "kl": 0.0001380443572998047, "learning_rate": 3.6e-07, "loss": 0.1027, "reward": 0.3194444477558136, "reward_std": 0.3645694628357887, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.23611111752688885, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 837.4583587646484, "epoch": 0.032562125107112254, "grad_norm": 0.5381235480308533, "kl": 0.00016641616821289062, "learning_rate": 3.7999999999999996e-07, "loss": 0.0715, "reward": 0.5347222164273262, "reward_std": 0.5603309497237206, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.2569444477558136, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 865.6805419921875, "epoch": 0.03427592116538132, "grad_norm": 0.6289723515510559, "kl": 0.0001741647720336914, "learning_rate": 4e-07, "loss": -0.0123, "reward": 0.3333333320915699, "reward_std": 0.3452591709792614, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.1666666716337204, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 664.694450378418, "epoch": 0.03598971722365039, "grad_norm": 0.8022226691246033, "kl": 0.00022649765014648438, "learning_rate": 4.1999999999999995e-07, "loss": 0.0523, "reward": 0.5833333358168602, "reward_std": 0.3963186591863632, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.3333333320915699, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 874.8055725097656, "epoch": 0.037703513281919454, "grad_norm": 0.6999794244766235, "kl": 0.00022101402282714844, "learning_rate": 4.3999999999999997e-07, "loss": 0.0188, "reward": 0.319444440305233, "reward_std": 0.36139946803450584, "rewards/accuracy_reward": 0.055555556900799274, "rewards/format_reward": 0.2083333358168602, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 873.7361145019531, "epoch": 0.03941730934018852, "grad_norm": 0.5962035059928894, "kl": 0.00021910667419433594, "learning_rate": 4.6e-07, "loss": 0.0391, "reward": 0.611111119389534, "reward_std": 0.31651007384061813, "rewards/accuracy_reward": 0.15277778450399637, "rewards/format_reward": 0.3055555522441864, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 691.0277862548828, "epoch": 0.04113110539845758, "grad_norm": 0.6845191121101379, "kl": 0.0003848075866699219, "learning_rate": 4.8e-07, "loss": 0.0395, "reward": 0.6388888955116272, "reward_std": 0.44496994838118553, "rewards/accuracy_reward": 0.18055556062608957, "rewards/format_reward": 0.2777777835726738, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 606.1527862548828, "epoch": 0.04284490145672665, "grad_norm": 0.7037742733955383, "kl": 0.0006976127624511719, "learning_rate": 5e-07, "loss": 0.0193, "reward": 0.6875000149011612, "reward_std": 0.49456192925572395, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.3541666716337204, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 615.1250152587891, "epoch": 0.044558697514995714, "grad_norm": 0.8331264853477478, "kl": 0.0008344650268554688, "learning_rate": 5.2e-07, "loss": 0.0633, "reward": 0.7152777686715126, "reward_std": 0.2888181023299694, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.3263888880610466, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 814.9166717529297, "epoch": 0.04627249357326478, "grad_norm": 0.9271811246871948, "kl": 0.0009531974792480469, "learning_rate": 5.4e-07, "loss": 0.056, "reward": 0.3472222238779068, "reward_std": 0.30904670804739, "rewards/accuracy_reward": 0.02777777798473835, "rewards/format_reward": 0.2916666716337204, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 509.0, "epoch": 0.04798628963153385, "grad_norm": 0.755454957485199, "kl": 0.00131988525390625, "learning_rate": 5.6e-07, "loss": -0.0102, "reward": 1.0277777835726738, "reward_std": 0.6238088309764862, "rewards/accuracy_reward": 0.31944445241242647, "rewards/format_reward": 0.3888888955116272, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 634.9027786254883, "epoch": 0.049700085689802914, "grad_norm": 0.8711504936218262, "kl": 0.0018901824951171875, "learning_rate": 5.8e-07, "loss": 0.039, "reward": 0.5763888880610466, "reward_std": 0.5907959416508675, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.2986111119389534, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 588.5833358764648, "epoch": 0.05141388174807198, "grad_norm": 0.6898062825202942, "kl": 0.002155303955078125, "learning_rate": 6e-07, "loss": -0.014, "reward": 0.493055559694767, "reward_std": 0.41614027321338654, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.3541666641831398, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 582.9166717529297, "epoch": 0.05312767780634105, "grad_norm": 0.9486229419708252, "kl": 0.002727508544921875, "learning_rate": 6.2e-07, "loss": 0.0461, "reward": 0.6319444626569748, "reward_std": 0.5628423318266869, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.381944440305233, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 674.3472290039062, "epoch": 0.054841473864610114, "grad_norm": 0.5769440531730652, "kl": 0.00347137451171875, "learning_rate": 6.4e-07, "loss": 0.0924, "reward": 0.6597222238779068, "reward_std": 0.572759248316288, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.3819444477558136, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 606.5972213745117, "epoch": 0.056555269922879174, "grad_norm": 0.8302273154258728, "kl": 0.004093170166015625, "learning_rate": 6.6e-07, "loss": -0.0183, "reward": 0.6944444477558136, "reward_std": 0.5897158365696669, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.3611111156642437, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 750.2639007568359, "epoch": 0.05826906598114824, "grad_norm": 0.8029855489730835, "kl": 0.00673675537109375, "learning_rate": 6.800000000000001e-07, "loss": 0.1226, "reward": 0.8194444477558136, "reward_std": 0.5727398172020912, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.4305555522441864, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 748.4305572509766, "epoch": 0.05998286203941731, "grad_norm": 0.751978874206543, "kl": 0.00594329833984375, "learning_rate": 7e-07, "loss": 0.1349, "reward": 0.5000000074505806, "reward_std": 0.3002382256090641, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.3888888880610466, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 589.0416793823242, "epoch": 0.061696658097686374, "grad_norm": 0.7630824446678162, "kl": 0.0087432861328125, "learning_rate": 7.2e-07, "loss": 0.0179, "reward": 0.6041666865348816, "reward_std": 0.33818795159459114, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.3541666716337204, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 692.2500152587891, "epoch": 0.06341045415595545, "grad_norm": 0.5925531387329102, "kl": 0.0101165771484375, "learning_rate": 7.4e-07, "loss": 0.0299, "reward": 0.5138888955116272, "reward_std": 0.3357668612152338, "rewards/accuracy_reward": 0.055555556900799274, "rewards/format_reward": 0.4027777835726738, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 792.1805725097656, "epoch": 0.06512425021422451, "grad_norm": 1.0705430507659912, "kl": 0.01534271240234375, "learning_rate": 7.599999999999999e-07, "loss": 0.0466, "reward": 0.5000000074505806, "reward_std": 0.41504133865237236, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.3888888880610466, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 699.8888854980469, "epoch": 0.06683804627249357, "grad_norm": 0.9300626516342163, "kl": 0.01090240478515625, "learning_rate": 7.799999999999999e-07, "loss": 0.0644, "reward": 0.7847222238779068, "reward_std": 0.4304216764867306, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.3680555522441864, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 575.0833206176758, "epoch": 0.06855184233076264, "grad_norm": 1.078953742980957, "kl": 0.016937255859375, "learning_rate": 8e-07, "loss": 0.0414, "reward": 0.9791666939854622, "reward_std": 0.4737792070955038, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.4513888955116272, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 626.5694580078125, "epoch": 0.0702656383890317, "grad_norm": 0.6470286846160889, "kl": 0.0107269287109375, "learning_rate": 8.199999999999999e-07, "loss": 0.0883, "reward": 0.6597222313284874, "reward_std": 0.3953079264611006, "rewards/accuracy_reward": 0.11111111380159855, "rewards/format_reward": 0.4375, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 585.4166717529297, "epoch": 0.07197943444730077, "grad_norm": 2.4607300758361816, "kl": 0.0234375, "learning_rate": 8.399999999999999e-07, "loss": 0.0947, "reward": 0.8263889104127884, "reward_std": 0.5376365929841995, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.4375000074505806, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 705.0555572509766, "epoch": 0.07369323050556983, "grad_norm": 0.5106288194656372, "kl": 0.0121002197265625, "learning_rate": 8.599999999999999e-07, "loss": 0.0658, "reward": 0.6736111342906952, "reward_std": 0.4738336503505707, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.4236111119389534, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 754.8611145019531, "epoch": 0.07540702656383891, "grad_norm": 1.0860414505004883, "kl": 0.01665496826171875, "learning_rate": 8.799999999999999e-07, "loss": -0.0223, "reward": 0.7500000149011612, "reward_std": 0.4002586603164673, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.4444444477558136, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 764.9583358764648, "epoch": 0.07712082262210797, "grad_norm": 0.4417635202407837, "kl": 0.010406494140625, "learning_rate": 9e-07, "loss": 0.0694, "reward": 0.8125000223517418, "reward_std": 0.3787213396281004, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.4513888955116272, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 524.1666717529297, "epoch": 0.07883461868037704, "grad_norm": 0.9410390853881836, "kl": 0.013397216796875, "learning_rate": 9.2e-07, "loss": 0.008, "reward": 0.6875, "reward_std": 0.4043467417359352, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4375000074505806, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 608.4305725097656, "epoch": 0.0805484147386461, "grad_norm": 0.915087878704071, "kl": 0.014129638671875, "learning_rate": 9.399999999999999e-07, "loss": 0.0755, "reward": 0.6527777910232544, "reward_std": 0.49068866297602654, "rewards/accuracy_reward": 0.11111111380159855, "rewards/format_reward": 0.4305555671453476, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 592.1250076293945, "epoch": 0.08226221079691516, "grad_norm": 0.7676656246185303, "kl": 0.0137786865234375, "learning_rate": 9.6e-07, "loss": 0.0019, "reward": 0.7291666641831398, "reward_std": 0.3599853590130806, "rewards/accuracy_reward": 0.1527777798473835, "rewards/format_reward": 0.4236111119389534, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 472.7777862548828, "epoch": 0.08397600685518423, "grad_norm": 0.8259297609329224, "kl": 0.01544189453125, "learning_rate": 9.8e-07, "loss": 0.0269, "reward": 0.8958333358168602, "reward_std": 0.5085582789033651, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.479166679084301, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 522.7639007568359, "epoch": 0.0856898029134533, "grad_norm": 7704.1875, "kl": 2.003662109375, "learning_rate": 1e-06, "loss": 0.0621, "reward": 0.5069444477558136, "reward_std": 0.11907241865992546, "rewards/accuracy_reward": 0.013888888992369175, "rewards/format_reward": 0.4791666716337204, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 569.9583435058594, "epoch": 0.08740359897172237, "grad_norm": 0.9847874045372009, "kl": 0.01806640625, "learning_rate": 9.999890338174275e-07, "loss": -0.0313, "reward": 1.0069444626569748, "reward_std": 0.6746698617935181, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.4236111268401146, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 851.2639007568359, "epoch": 0.08911739502999143, "grad_norm": 0.4385327994823456, "kl": 0.0133514404296875, "learning_rate": 9.999561358041868e-07, "loss": -0.007, "reward": 0.7708333507180214, "reward_std": 0.3091294076293707, "rewards/accuracy_reward": 0.1527777835726738, "rewards/format_reward": 0.4652777910232544, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 672.9305572509766, "epoch": 0.0908311910882605, "grad_norm": 0.6572834849357605, "kl": 0.0176544189453125, "learning_rate": 9.999013075636804e-07, "loss": -0.0003, "reward": 0.673611119389534, "reward_std": 0.3502005450427532, "rewards/accuracy_reward": 0.1111111156642437, "rewards/format_reward": 0.4513888880610466, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 668.2361221313477, "epoch": 0.09254498714652956, "grad_norm": 0.5134143233299255, "kl": 0.0155029296875, "learning_rate": 9.998245517681593e-07, "loss": 0.0325, "reward": 0.6944444552063942, "reward_std": 0.36250423453748226, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.4722222313284874, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 767.4722137451172, "epoch": 0.09425878320479864, "grad_norm": 0.5993247628211975, "kl": 0.018310546875, "learning_rate": 9.997258721585931e-07, "loss": 0.0402, "reward": 0.7222222238779068, "reward_std": 0.4854987859725952, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.4722222238779068, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 582.5555572509766, "epoch": 0.0959725792630677, "grad_norm": 0.6086099147796631, "kl": 0.019317626953125, "learning_rate": 9.996052735444862e-07, "loss": -0.0089, "reward": 0.8958333432674408, "reward_std": 0.5041182190179825, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.4791666716337204, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 666.3611297607422, "epoch": 0.09768637532133675, "grad_norm": 0.3199848532676697, "kl": 0.0146942138671875, "learning_rate": 9.994627618036452e-07, "loss": 0.0068, "reward": 0.6180555671453476, "reward_std": 0.10077410563826561, "rewards/accuracy_reward": 0.0694444477558136, "rewards/format_reward": 0.479166679084301, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 535.6944351196289, "epoch": 0.09940017137960583, "grad_norm": 1.065861701965332, "kl": 0.02264404296875, "learning_rate": 9.992983438818915e-07, "loss": 0.0328, "reward": 0.5972222164273262, "reward_std": 0.3061862140893936, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.486111119389534, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 645.0833435058594, "epoch": 0.10111396743787489, "grad_norm": 0.8223654627799988, "kl": 0.017425537109375, "learning_rate": 9.991120277927223e-07, "loss": -0.0678, "reward": 0.9791666716337204, "reward_std": 0.7214617803692818, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.4791666716337204, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 758.6388854980469, "epoch": 0.10282776349614396, "grad_norm": 0.559778094291687, "kl": 0.0177154541015625, "learning_rate": 9.989038226169207e-07, "loss": -0.0276, "reward": 0.7222222238779068, "reward_std": 0.30821535736322403, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.5, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 653.9860992431641, "epoch": 0.10454155955441302, "grad_norm": 0.4272245168685913, "kl": 0.02130126953125, "learning_rate": 9.98673738502114e-07, "loss": 0.0077, "reward": 0.6319444477558136, "reward_std": 0.17111802101135254, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.493055559694767, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 746.0694580078125, "epoch": 0.1062553556126821, "grad_norm": 0.7891212105751038, "kl": 0.0184478759765625, "learning_rate": 9.98421786662277e-07, "loss": 0.0491, "reward": 0.8263888955116272, "reward_std": 0.3267286717891693, "rewards/accuracy_reward": 0.18055556062608957, "rewards/format_reward": 0.4652777761220932, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 677.9722137451172, "epoch": 0.10796915167095116, "grad_norm": 0.4481966495513916, "kl": 0.016571044921875, "learning_rate": 9.981479793771866e-07, "loss": 0.0352, "reward": 0.6250000074505806, "reward_std": 0.2613905593752861, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.4861111119389534, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 650.25, "epoch": 0.10968294772922023, "grad_norm": 0.484955370426178, "kl": 0.0130157470703125, "learning_rate": 9.97852329991824e-07, "loss": 0.0549, "reward": 0.6875000149011612, "reward_std": 0.21065950952470303, "rewards/accuracy_reward": 0.1111111119389534, "rewards/format_reward": 0.4652777835726738, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 718.8333435058594, "epoch": 0.11139674378748929, "grad_norm": 0.6917220950126648, "kl": 0.0135955810546875, "learning_rate": 9.975348529157229e-07, "loss": 0.0278, "reward": 0.7083333432674408, "reward_std": 0.5276868715882301, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4583333358168602, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 724.4861221313477, "epoch": 0.11311053984575835, "grad_norm": 0.6181950569152832, "kl": 0.009735107421875, "learning_rate": 9.971955636222684e-07, "loss": 0.0776, "reward": 0.7916666865348816, "reward_std": 0.36798322945833206, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4583333358168602, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 645.2639007568359, "epoch": 0.11482433590402742, "grad_norm": 0.47965365648269653, "kl": 0.0118865966796875, "learning_rate": 9.968344786479415e-07, "loss": 0.0211, "reward": 0.6805555522441864, "reward_std": 0.29340869560837746, "rewards/accuracy_reward": 0.1111111156642437, "rewards/format_reward": 0.4583333358168602, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 833.1944427490234, "epoch": 0.11653813196229648, "grad_norm": 0.5236086249351501, "kl": 0.01050567626953125, "learning_rate": 9.964516155915151e-07, "loss": 0.0772, "reward": 0.8611111268401146, "reward_std": 0.3695474322885275, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.4444444552063942, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 614.4027709960938, "epoch": 0.11825192802056556, "grad_norm": 2.3860788345336914, "kl": 0.0256195068359375, "learning_rate": 9.960469931131936e-07, "loss": 0.0186, "reward": 0.7222222462296486, "reward_std": 0.4142109379172325, "rewards/accuracy_reward": 0.13888888992369175, "rewards/format_reward": 0.4444444552063942, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 731.9722290039062, "epoch": 0.11996572407883462, "grad_norm": 0.7797493934631348, "kl": 0.01611328125, "learning_rate": 9.956206309337066e-07, "loss": 0.0554, "reward": 0.8125000074505806, "reward_std": 0.4372703805565834, "rewards/accuracy_reward": 0.1944444477558136, "rewards/format_reward": 0.4236111119389534, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 663.3055725097656, "epoch": 0.12167952013710369, "grad_norm": 0.6972033977508545, "kl": 0.011383056640625, "learning_rate": 9.951725498333448e-07, "loss": 0.0371, "reward": 0.9583333432674408, "reward_std": 0.5939657315611839, "rewards/accuracy_reward": 0.23611112032085657, "rewards/format_reward": 0.4861111119389534, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 533.8472290039062, "epoch": 0.12339331619537275, "grad_norm": 0.628699541091919, "kl": 0.0160675048828125, "learning_rate": 9.947027716509488e-07, "loss": 0.0075, "reward": 0.798611119389534, "reward_std": 0.34056369215250015, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.4652777761220932, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 725.944450378418, "epoch": 0.12510711225364182, "grad_norm": 0.5667747855186462, "kl": 0.0106201171875, "learning_rate": 9.942113192828444e-07, "loss": 0.0505, "reward": 0.8611111342906952, "reward_std": 0.3817775323987007, "rewards/accuracy_reward": 0.1944444477558136, "rewards/format_reward": 0.4722222313284874, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 596.125, "epoch": 0.1268209083119109, "grad_norm": 0.8628817796707153, "kl": 0.0158233642578125, "learning_rate": 9.93698216681727e-07, "loss": 0.0773, "reward": 0.923611119389534, "reward_std": 0.4358247146010399, "rewards/accuracy_reward": 0.22222222574055195, "rewards/format_reward": 0.4791666716337204, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 812.0555572509766, "epoch": 0.12853470437017994, "grad_norm": 0.41737478971481323, "kl": 0.0092620849609375, "learning_rate": 9.931634888554935e-07, "loss": -0.0076, "reward": 0.7500000149011612, "reward_std": 0.2785004451870918, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.4722222238779068, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 508.01390075683594, "epoch": 0.13024850042844902, "grad_norm": 0.772993803024292, "kl": 0.0087738037109375, "learning_rate": 9.926071618660237e-07, "loss": 0.0171, "reward": 1.0069444477558136, "reward_std": 0.6597441658377647, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.4791666716337204, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 746.2500152587891, "epoch": 0.1319622964867181, "grad_norm": 0.4772380292415619, "kl": 0.00911712646484375, "learning_rate": 9.9202926282791e-07, "loss": 0.0168, "reward": 0.9166666567325592, "reward_std": 0.6034458577632904, "rewards/accuracy_reward": 0.22222222667187452, "rewards/format_reward": 0.4722222313284874, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 952.3888854980469, "epoch": 0.13367609254498714, "grad_norm": 0.49596795439720154, "kl": 0.0111846923828125, "learning_rate": 9.91429819907136e-07, "loss": 0.033, "reward": 0.6180555745959282, "reward_std": 0.34004957228899, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.423611119389534, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 533.5555725097656, "epoch": 0.1353898886032562, "grad_norm": 0.5074121952056885, "kl": 0.011474609375, "learning_rate": 9.908088623197048e-07, "loss": -0.0007, "reward": 0.5902777835726738, "reward_std": 0.2606759797781706, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.4791666716337204, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 704.5833282470703, "epoch": 0.13710368466152528, "grad_norm": 0.45615482330322266, "kl": 0.00933074951171875, "learning_rate": 9.901664203302124e-07, "loss": 0.1073, "reward": 0.6111111119389534, "reward_std": 0.3000170197337866, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.4444444552063942, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 709.5694580078125, "epoch": 0.13881748071979436, "grad_norm": 0.440580815076828, "kl": 0.0095977783203125, "learning_rate": 9.895025252503755e-07, "loss": -0.0114, "reward": 1.0347222536802292, "reward_std": 0.26498175598680973, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.4513888955116272, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 652.4166717529297, "epoch": 0.1405312767780634, "grad_norm": 0.4756879210472107, "kl": 0.00921630859375, "learning_rate": 9.888172094375033e-07, "loss": 0.0445, "reward": 0.7638888955116272, "reward_std": 0.37851114571094513, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.4583333358168602, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 585.6250076293945, "epoch": 0.14224507283633248, "grad_norm": 0.9698956608772278, "kl": 0.0133819580078125, "learning_rate": 9.881105062929221e-07, "loss": 0.0097, "reward": 0.652777798473835, "reward_std": 0.37932526133954525, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.4583333358168602, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 683.9166564941406, "epoch": 0.14395886889460155, "grad_norm": 0.4502439796924591, "kl": 0.0113525390625, "learning_rate": 9.873824502603459e-07, "loss": 0.0209, "reward": 0.8125000074505806, "reward_std": 0.421932702884078, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.4791666716337204, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 719.1388854980469, "epoch": 0.1456726649528706, "grad_norm": 0.6199227571487427, "kl": 0.01055908203125, "learning_rate": 9.866330768241983e-07, "loss": -0.0255, "reward": 0.5694444626569748, "reward_std": 0.33752935379743576, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.4583333358168602, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 678.3611221313477, "epoch": 0.14738646101113967, "grad_norm": 0.8055247068405151, "kl": 0.0143890380859375, "learning_rate": 9.85862422507884e-07, "loss": -0.0329, "reward": 0.972222238779068, "reward_std": 0.5578342527151108, "rewards/accuracy_reward": 0.2500000046566129, "rewards/format_reward": 0.4722222238779068, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 870.9166870117188, "epoch": 0.14910025706940874, "grad_norm": 0.7032153010368347, "kl": 0.015716552734375, "learning_rate": 9.850705248720068e-07, "loss": 0.1143, "reward": 0.7083333432674408, "reward_std": 0.29449621587991714, "rewards/accuracy_reward": 0.11111111473292112, "rewards/format_reward": 0.4861111119389534, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 809.2361297607422, "epoch": 0.15081405312767782, "grad_norm": 0.6152629256248474, "kl": 0.0126953125, "learning_rate": 9.8425742251254e-07, "loss": 0.0084, "reward": 0.777777798473835, "reward_std": 0.45732562988996506, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.4722222313284874, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 596.125, "epoch": 0.15252784918594686, "grad_norm": 0.4119075536727905, "kl": 0.0131683349609375, "learning_rate": 9.83423155058946e-07, "loss": 0.006, "reward": 0.7569444552063942, "reward_std": 0.37087361328303814, "rewards/accuracy_reward": 0.13888889364898205, "rewards/format_reward": 0.479166679084301, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 719.9861221313477, "epoch": 0.15424164524421594, "grad_norm": 0.6751371622085571, "kl": 0.01473236083984375, "learning_rate": 9.825677631722435e-07, "loss": 0.0284, "reward": 0.6805555671453476, "reward_std": 0.2974403705447912, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.486111119389534, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 676.0000152587891, "epoch": 0.155955441302485, "grad_norm": 0.4955655634403229, "kl": 0.00868988037109375, "learning_rate": 9.816912885430258e-07, "loss": 0.0648, "reward": 0.8750000149011612, "reward_std": 0.38056252896785736, "rewards/accuracy_reward": 0.19444445054978132, "rewards/format_reward": 0.486111119389534, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 685.569450378418, "epoch": 0.15766923736075408, "grad_norm": 0.6032452583312988, "kl": 0.0105133056640625, "learning_rate": 9.807937738894303e-07, "loss": 0.0959, "reward": 1.0208333432674408, "reward_std": 0.41651279479265213, "rewards/accuracy_reward": 0.2638888992369175, "rewards/format_reward": 0.493055559694767, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 552.9722290039062, "epoch": 0.15938303341902313, "grad_norm": 0.5042760968208313, "kl": 0.010833740234375, "learning_rate": 9.798752629550546e-07, "loss": 0.0277, "reward": 0.5972222089767456, "reward_std": 0.25616975128650665, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.486111119389534, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 613.8750076293945, "epoch": 0.1610968294772922, "grad_norm": 1.230424165725708, "kl": 0.01520538330078125, "learning_rate": 9.78935800506826e-07, "loss": 0.0236, "reward": 0.7291666716337204, "reward_std": 0.3527771979570389, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.4791666716337204, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 667.2500152587891, "epoch": 0.16281062553556128, "grad_norm": 0.874912440776825, "kl": 0.014312744140625, "learning_rate": 9.779754323328192e-07, "loss": -0.0369, "reward": 0.8194444477558136, "reward_std": 0.24447975307703018, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.486111119389534, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 696.3333435058594, "epoch": 0.16452442159383032, "grad_norm": 0.4739597737789154, "kl": 0.00701904296875, "learning_rate": 9.769942052400235e-07, "loss": -0.0386, "reward": 0.7152777761220932, "reward_std": 0.4169478937983513, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4652777761220932, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 700.5833435058594, "epoch": 0.1662382176520994, "grad_norm": 0.5326427817344666, "kl": 0.0096435546875, "learning_rate": 9.759921670520634e-07, "loss": 0.0463, "reward": 0.6458333507180214, "reward_std": 0.29642581194639206, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.479166679084301, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 687.944450378418, "epoch": 0.16795201371036847, "grad_norm": 0.41333743929862976, "kl": 0.0089111328125, "learning_rate": 9.749693666068663e-07, "loss": 0.0199, "reward": 0.7916666716337204, "reward_std": 0.32597118616104126, "rewards/accuracy_reward": 0.15277778450399637, "rewards/format_reward": 0.4861111119389534, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 647.2638854980469, "epoch": 0.16966580976863754, "grad_norm": 0.4722951054573059, "kl": 0.0088348388671875, "learning_rate": 9.739258537542835e-07, "loss": 0.036, "reward": 0.7291666567325592, "reward_std": 0.43501005321741104, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4791666641831398, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 610.4722213745117, "epoch": 0.1713796058269066, "grad_norm": 0.4890177547931671, "kl": 0.0101318359375, "learning_rate": 9.728616793536587e-07, "loss": -0.0005, "reward": 0.8333333283662796, "reward_std": 0.4303314909338951, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.5, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 803.6388702392578, "epoch": 0.17309340188517566, "grad_norm": 0.5259881019592285, "kl": 0.0117645263671875, "learning_rate": 9.717768952713511e-07, "loss": 0.0197, "reward": 0.791666679084301, "reward_std": 0.39858745597302914, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.4583333432674408, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 725.625, "epoch": 0.17480719794344474, "grad_norm": 0.565196692943573, "kl": 0.0084381103515625, "learning_rate": 9.706715543782064e-07, "loss": -0.0314, "reward": 0.7430555671453476, "reward_std": 0.4483235850930214, "rewards/accuracy_reward": 0.13888888992369175, "rewards/format_reward": 0.4652777835726738, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 685.8611145019531, "epoch": 0.17652099400171378, "grad_norm": 0.30295926332473755, "kl": 0.010772705078125, "learning_rate": 9.695457105469804e-07, "loss": 0.0335, "reward": 1.0208333283662796, "reward_std": 0.31875650584697723, "rewards/accuracy_reward": 0.2638888917863369, "rewards/format_reward": 0.493055559694767, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 763.1389007568359, "epoch": 0.17823479005998286, "grad_norm": 0.45252788066864014, "kl": 0.01111602783203125, "learning_rate": 9.683994186497132e-07, "loss": 0.0369, "reward": 0.5069444477558136, "reward_std": 0.11907241307199001, "rewards/accuracy_reward": 0.013888888992369175, "rewards/format_reward": 0.479166679084301, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 673.9444580078125, "epoch": 0.17994858611825193, "grad_norm": 0.800912618637085, "kl": 0.0160675048828125, "learning_rate": 9.672327345550543e-07, "loss": 0.0112, "reward": 0.7986111044883728, "reward_std": 0.4165128022432327, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.493055559694767, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 660.4027862548828, "epoch": 0.181662382176521, "grad_norm": 0.5626226663589478, "kl": 0.0115509033203125, "learning_rate": 9.66045715125541e-07, "loss": 0.003, "reward": 0.7152777761220932, "reward_std": 0.46130844950675964, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.493055559694767, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 648.5833435058594, "epoch": 0.18337617823479005, "grad_norm": 0.566942572593689, "kl": 0.0107421875, "learning_rate": 9.648384182148252e-07, "loss": -0.0022, "reward": 1.0833333432674408, "reward_std": 0.41200654953718185, "rewards/accuracy_reward": 0.3055555671453476, "rewards/format_reward": 0.4722222238779068, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 681.1666717529297, "epoch": 0.18508997429305912, "grad_norm": 0.46719685196876526, "kl": 0.0086517333984375, "learning_rate": 9.636109026648554e-07, "loss": 0.008, "reward": 0.5277777835726738, "reward_std": 0.19162002205848694, "rewards/accuracy_reward": 0.02777777798473835, "rewards/format_reward": 0.4722222238779068, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 628.8194580078125, "epoch": 0.1868037703513282, "grad_norm": 0.5135090351104736, "kl": 0.011474609375, "learning_rate": 9.623632283030077e-07, "loss": 0.0236, "reward": 0.722222238779068, "reward_std": 0.19795495830476284, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.4722222238779068, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 732.9305725097656, "epoch": 0.18851756640959727, "grad_norm": 0.6838110089302063, "kl": 0.00803375244140625, "learning_rate": 9.610954559391704e-07, "loss": -0.0496, "reward": 0.6666666567325592, "reward_std": 0.44429811835289, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.4722222313284874, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 688.0555725097656, "epoch": 0.19023136246786632, "grad_norm": 0.47608956694602966, "kl": 0.009918212890625, "learning_rate": 9.598076473627796e-07, "loss": 0.0029, "reward": 0.6874999850988388, "reward_std": 0.2559359297156334, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.493055559694767, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 724.0139007568359, "epoch": 0.1919451585261354, "grad_norm": 0.34519946575164795, "kl": 0.00824737548828125, "learning_rate": 9.58499865339809e-07, "loss": 0.011, "reward": 0.6875, "reward_std": 0.31875649094581604, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.493055559694767, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 731.2083435058594, "epoch": 0.19365895458440446, "grad_norm": 0.5883038640022278, "kl": 0.00905609130859375, "learning_rate": 9.571721736097088e-07, "loss": 0.0809, "reward": 0.9236111044883728, "reward_std": 0.5362608954310417, "rewards/accuracy_reward": 0.22222222667187452, "rewards/format_reward": 0.4791666716337204, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 585.2639007568359, "epoch": 0.1953727506426735, "grad_norm": 0.6275684237480164, "kl": 0.0088043212890625, "learning_rate": 9.55824636882301e-07, "loss": -0.0049, "reward": 0.6319444477558136, "reward_std": 0.25718430429697037, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.493055559694767, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 509.3611145019531, "epoch": 0.19708654670094258, "grad_norm": 0.5656007528305054, "kl": 0.0132293701171875, "learning_rate": 9.54457320834625e-07, "loss": -0.0101, "reward": 0.7152777761220932, "reward_std": 0.3016466051340103, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.493055559694767, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 882.1666564941406, "epoch": 0.19880034275921166, "grad_norm": 0.34537801146507263, "kl": 0.0095062255859375, "learning_rate": 9.530702921077358e-07, "loss": 0.0496, "reward": 0.770833320915699, "reward_std": 0.32522569596767426, "rewards/accuracy_reward": 0.1388888917863369, "rewards/format_reward": 0.493055559694767, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 612.0138931274414, "epoch": 0.20051413881748073, "grad_norm": 0.36981117725372314, "kl": 0.0112152099609375, "learning_rate": 9.516636183034564e-07, "loss": -0.002, "reward": 0.9097222089767456, "reward_std": 0.31875649094581604, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.493055559694767, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 612.069450378418, "epoch": 0.20222793487574978, "grad_norm": 0.5904315710067749, "kl": 0.0120697021484375, "learning_rate": 9.502373679810839e-07, "loss": 0.0093, "reward": 0.7361111044883728, "reward_std": 0.34745684266090393, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.486111119389534, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 580.7361221313477, "epoch": 0.20394173093401885, "grad_norm": 0.6722186803817749, "kl": 0.0094146728515625, "learning_rate": 9.487916106540465e-07, "loss": 0.0115, "reward": 1.0000000149011612, "reward_std": 0.4072999134659767, "rewards/accuracy_reward": 0.26388889644294977, "rewards/format_reward": 0.4722222238779068, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 658.5277862548828, "epoch": 0.20565552699228792, "grad_norm": 0.5796108245849609, "kl": 0.009918212890625, "learning_rate": 9.473264167865171e-07, "loss": -0.0049, "reward": 0.798611119389534, "reward_std": 0.5176598504185677, "rewards/accuracy_reward": 0.15277778264135122, "rewards/format_reward": 0.493055559694767, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 715.8750076293945, "epoch": 0.207369323050557, "grad_norm": 0.42695531249046326, "kl": 0.0109405517578125, "learning_rate": 9.458418577899774e-07, "loss": -0.0034, "reward": 0.9166666865348816, "reward_std": 0.3762567415833473, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.5, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 681.5833435058594, "epoch": 0.20908311910882604, "grad_norm": 0.3627341389656067, "kl": 0.0103912353515625, "learning_rate": 9.443380060197385e-07, "loss": 0.0006, "reward": 0.75, "reward_std": 0.3134361356496811, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.5, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 732.1249923706055, "epoch": 0.21079691516709512, "grad_norm": 0.4604179263114929, "kl": 0.0093231201171875, "learning_rate": 9.428149347714143e-07, "loss": 0.0102, "reward": 1.0208333432674408, "reward_std": 0.3769379239529371, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.493055559694767, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 731.8472290039062, "epoch": 0.2125107112253642, "grad_norm": 0.48651084303855896, "kl": 0.008941650390625, "learning_rate": 9.412727182773486e-07, "loss": 0.0067, "reward": 0.6875, "reward_std": 0.3304464891552925, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.493055559694767, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 543.0972213745117, "epoch": 0.21422450728363324, "grad_norm": 0.9337839484214783, "kl": 0.0222625732421875, "learning_rate": 9.397114317029974e-07, "loss": 0.0038, "reward": 0.9999999850988388, "reward_std": 0.48787199705839157, "rewards/accuracy_reward": 0.2638888908550143, "rewards/format_reward": 0.4722222238779068, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 683.7639007568359, "epoch": 0.2159383033419023, "grad_norm": 0.5181577801704407, "kl": 0.0086822509765625, "learning_rate": 9.381311511432658e-07, "loss": 0.0582, "reward": 0.7638889029622078, "reward_std": 0.4154982175678015, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.486111119389534, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 701.3888854980469, "epoch": 0.21765209940017138, "grad_norm": 0.4001893401145935, "kl": 0.01253509521484375, "learning_rate": 9.36531953618799e-07, "loss": 0.0222, "reward": 1.0, "reward_std": 0.30821534991264343, "rewards/accuracy_reward": 0.2500000046566129, "rewards/format_reward": 0.5, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 776.9444351196289, "epoch": 0.21936589545844046, "grad_norm": 0.4341527819633484, "kl": 0.01218414306640625, "learning_rate": 9.34913917072228e-07, "loss": -0.0215, "reward": 0.9375, "reward_std": 0.33678142726421356, "rewards/accuracy_reward": 0.22222222574055195, "rewards/format_reward": 0.493055559694767, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 656.3055725097656, "epoch": 0.2210796915167095, "grad_norm": 1.436949610710144, "kl": 0.02315521240234375, "learning_rate": 9.332771203643714e-07, "loss": -0.0426, "reward": 0.923611119389534, "reward_std": 0.435094453394413, "rewards/accuracy_reward": 0.22222222294658422, "rewards/format_reward": 0.4791666716337204, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 562.3333511352539, "epoch": 0.22279348757497858, "grad_norm": 0.7487984895706177, "kl": 0.0170440673828125, "learning_rate": 9.316216432703916e-07, "loss": -0.0057, "reward": 1.3680555522441864, "reward_std": 0.4477668162435293, "rewards/accuracy_reward": 0.4444444477558136, "rewards/format_reward": 0.479166679084301, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 862.7222442626953, "epoch": 0.22450728363324765, "grad_norm": 0.38078296184539795, "kl": 0.01318359375, "learning_rate": 9.299475664759068e-07, "loss": 0.0882, "reward": 0.7083333283662796, "reward_std": 0.3526776432991028, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.486111119389534, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 602.3888854980469, "epoch": 0.2262210796915167, "grad_norm": 0.5312814712524414, "kl": 0.015228271484375, "learning_rate": 9.282549715730579e-07, "loss": 0.0104, "reward": 0.5902777686715126, "reward_std": 0.26067597232759, "rewards/accuracy_reward": 0.055555556900799274, "rewards/format_reward": 0.4791666716337204, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 644.4722290039062, "epoch": 0.22793487574978577, "grad_norm": 1.1302504539489746, "kl": 0.022735595703125, "learning_rate": 9.265439410565328e-07, "loss": 0.062, "reward": 0.8263889029622078, "reward_std": 0.23915939591825008, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.493055559694767, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 684.6527709960938, "epoch": 0.22964867180805484, "grad_norm": 0.5044618844985962, "kl": 0.0118408203125, "learning_rate": 9.248145583195447e-07, "loss": 0.0375, "reward": 0.9930555671453476, "reward_std": 0.5024402439594269, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.493055559694767, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 604.6944427490234, "epoch": 0.23136246786632392, "grad_norm": 0.4981021285057068, "kl": 0.0177459716796875, "learning_rate": 9.230669076497687e-07, "loss": 0.0255, "reward": 0.965277761220932, "reward_std": 0.3830488696694374, "rewards/accuracy_reward": 0.23611111659556627, "rewards/format_reward": 0.493055559694767, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 517.2777824401855, "epoch": 0.23307626392459296, "grad_norm": 1.7722994089126587, "kl": 0.03118896484375, "learning_rate": 9.213010742252327e-07, "loss": 0.0292, "reward": 1.145833358168602, "reward_std": 0.46534085273742676, "rewards/accuracy_reward": 0.3333333320915699, "rewards/format_reward": 0.4791666716337204, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 779.9444580078125, "epoch": 0.23479005998286204, "grad_norm": 0.5337279438972473, "kl": 0.01290130615234375, "learning_rate": 9.195171441101668e-07, "loss": -0.0092, "reward": 0.9583333507180214, "reward_std": 0.39858742617070675, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.4583333358168602, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 704.3889007568359, "epoch": 0.2365038560411311, "grad_norm": 0.47012683749198914, "kl": 0.01031494140625, "learning_rate": 9.177152042508077e-07, "loss": 0.0208, "reward": 0.729166679084301, "reward_std": 0.32953148148953915, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.479166679084301, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 647.3472366333008, "epoch": 0.23821765209940018, "grad_norm": 0.5425974726676941, "kl": 0.01230621337890625, "learning_rate": 9.158953424711624e-07, "loss": 0.0068, "reward": 0.6319444477558136, "reward_std": 0.17111803591251373, "rewards/accuracy_reward": 0.06944444589316845, "rewards/format_reward": 0.493055559694767, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 559.319465637207, "epoch": 0.23993144815766923, "grad_norm": 0.4788627028465271, "kl": 0.012969970703125, "learning_rate": 9.140576474687263e-07, "loss": 0.0135, "reward": 1.0694444477558136, "reward_std": 0.3177530914545059, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.4861111119389534, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 711.7222290039062, "epoch": 0.2416452442159383, "grad_norm": 0.39043956995010376, "kl": 0.0106658935546875, "learning_rate": 9.122022088101613e-07, "loss": 0.0037, "reward": 0.8472222238779068, "reward_std": 0.4586464837193489, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.4583333432674408, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 674.2222290039062, "epoch": 0.24335904027420738, "grad_norm": 0.1789097785949707, "kl": 0.01094818115234375, "learning_rate": 9.103291169269299e-07, "loss": 0.0154, "reward": 0.6944444328546524, "reward_std": 0.06804138422012329, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.5, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 771.5555572509766, "epoch": 0.24507283633247642, "grad_norm": 0.4400465488433838, "kl": 0.0099334716796875, "learning_rate": 9.084384631108882e-07, "loss": 0.155, "reward": 0.7500000223517418, "reward_std": 0.4042903557419777, "rewards/accuracy_reward": 0.13888889364898205, "rewards/format_reward": 0.4722222313284874, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 851.3611145019531, "epoch": 0.2467866323907455, "grad_norm": 0.6633160710334778, "kl": 0.01108551025390625, "learning_rate": 9.065303395098358e-07, "loss": 0.0257, "reward": 0.7708333432674408, "reward_std": 0.5133540704846382, "rewards/accuracy_reward": 0.15277778264135122, "rewards/format_reward": 0.4652777835726738, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 711.4722290039062, "epoch": 0.24850042844901457, "grad_norm": 0.46885278820991516, "kl": 0.0117034912109375, "learning_rate": 9.046048391230247e-07, "loss": -0.0231, "reward": 0.7777777910232544, "reward_std": 0.2221490517258644, "rewards/accuracy_reward": 0.1388888955116272, "rewards/format_reward": 0.5, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 760.0555572509766, "epoch": 0.25021422450728364, "grad_norm": 0.49422281980514526, "kl": 0.010040283203125, "learning_rate": 9.026620557966279e-07, "loss": 0.0101, "reward": 0.8750000298023224, "reward_std": 0.43057897686958313, "rewards/accuracy_reward": 0.19444445054978132, "rewards/format_reward": 0.486111119389534, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 614.7639007568359, "epoch": 0.2519280205655527, "grad_norm": 0.4453893005847931, "kl": 0.0105438232421875, "learning_rate": 9.007020842191634e-07, "loss": 0.0151, "reward": 0.7291666641831398, "reward_std": 0.41478364542126656, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4791666641831398, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 682.7500152587891, "epoch": 0.2536418166238218, "grad_norm": 0.4301265776157379, "kl": 0.0096435546875, "learning_rate": 8.987250199168808e-07, "loss": -0.0056, "reward": 0.7152777761220932, "reward_std": 0.33566733449697495, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.493055559694767, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 757.3611221313477, "epoch": 0.25535561268209084, "grad_norm": 0.4108283221721649, "kl": 0.0125579833984375, "learning_rate": 8.967309592491052e-07, "loss": 0.007, "reward": 0.5555555522441864, "reward_std": 0.20964494906365871, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.4722222238779068, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 684.3611297607422, "epoch": 0.2570694087403599, "grad_norm": 0.3349432647228241, "kl": 0.0099334716796875, "learning_rate": 8.9471999940354e-07, "loss": 0.0087, "reward": 1.111111119389534, "reward_std": 0.3082153648138046, "rewards/accuracy_reward": 0.3055555550381541, "rewards/format_reward": 0.5, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 507.1805648803711, "epoch": 0.258783204798629, "grad_norm": 0.4955935776233673, "kl": 0.01495361328125, "learning_rate": 8.926922383915315e-07, "loss": -0.0025, "reward": 0.6597222238779068, "reward_std": 0.3041820004582405, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.493055559694767, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 778.1389007568359, "epoch": 0.26049700085689803, "grad_norm": 0.46859902143478394, "kl": 0.01029205322265625, "learning_rate": 8.906477750432903e-07, "loss": 0.0761, "reward": 0.8125, "reward_std": 0.38506873697042465, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.4791666716337204, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 646.8194427490234, "epoch": 0.2622107969151671, "grad_norm": 0.5361355543136597, "kl": 0.0163421630859375, "learning_rate": 8.88586709003076e-07, "loss": 0.1016, "reward": 0.9861111268401146, "reward_std": 0.4283023551106453, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.4861111119389534, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 603.5000076293945, "epoch": 0.2639245929734362, "grad_norm": 0.6667534708976746, "kl": 0.015716552734375, "learning_rate": 8.865091407243394e-07, "loss": -0.0517, "reward": 0.888888880610466, "reward_std": 0.4442981034517288, "rewards/accuracy_reward": 0.19444444496184587, "rewards/format_reward": 0.5, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 730.5138854980469, "epoch": 0.2656383890317052, "grad_norm": 0.5823290944099426, "kl": 0.0100250244140625, "learning_rate": 8.844151714648274e-07, "loss": 0.0032, "reward": 0.9652777761220932, "reward_std": 0.3304465189576149, "rewards/accuracy_reward": 0.23611111752688885, "rewards/format_reward": 0.493055559694767, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 699.7500152587891, "epoch": 0.26735218508997427, "grad_norm": 0.3316850960254669, "kl": 0.00982666015625, "learning_rate": 8.823049032816478e-07, "loss": 0.0064, "reward": 0.798611119389534, "reward_std": 0.19436374306678772, "rewards/accuracy_reward": 0.1527777798473835, "rewards/format_reward": 0.493055559694767, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 678.5833358764648, "epoch": 0.26906598114824337, "grad_norm": 0.43551284074783325, "kl": 0.0104827880859375, "learning_rate": 8.801784390262943e-07, "loss": 0.0163, "reward": 1.013888880610466, "reward_std": 0.34745684266090393, "rewards/accuracy_reward": 0.2638888927176595, "rewards/format_reward": 0.486111119389534, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 657.9027862548828, "epoch": 0.2707797772065124, "grad_norm": 0.3160940110683441, "kl": 0.00923919677734375, "learning_rate": 8.780358823396352e-07, "loss": -0.0074, "reward": 0.604166679084301, "reward_std": 0.17633883468806744, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.493055559694767, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 762.0138854980469, "epoch": 0.27249357326478146, "grad_norm": 0.8683849573135376, "kl": 0.02294921875, "learning_rate": 8.758773376468604e-07, "loss": 0.0224, "reward": 0.597222238779068, "reward_std": 0.25616976991295815, "rewards/accuracy_reward": 0.06944444589316845, "rewards/format_reward": 0.4583333432674408, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 648.625, "epoch": 0.27420736932305056, "grad_norm": 0.44514918327331543, "kl": 0.00907135009765625, "learning_rate": 8.737029101523929e-07, "loss": 0.0111, "reward": 1.0138888955116272, "reward_std": 0.32083219289779663, "rewards/accuracy_reward": 0.2638888927176595, "rewards/format_reward": 0.486111119389534, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 669.7778015136719, "epoch": 0.2759211653813196, "grad_norm": 0.5343595743179321, "kl": 0.0099029541015625, "learning_rate": 8.715127058347614e-07, "loss": -0.0242, "reward": 0.7777777910232544, "reward_std": 0.4600205048918724, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.5000000074505806, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 619.6527938842773, "epoch": 0.2776349614395887, "grad_norm": 0.24237516522407532, "kl": 0.00861358642578125, "learning_rate": 8.693068314414344e-07, "loss": 0.0059, "reward": 0.777777761220932, "reward_std": 0.17213259637355804, "rewards/accuracy_reward": 0.1388888917863369, "rewards/format_reward": 0.5, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 686.5416717529297, "epoch": 0.27934875749785776, "grad_norm": 0.35996633768081665, "kl": 0.0109405517578125, "learning_rate": 8.670853944836176e-07, "loss": 0.0226, "reward": 0.652777798473835, "reward_std": 0.2794154789298773, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.486111119389534, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 563.8611221313477, "epoch": 0.2810625535561268, "grad_norm": 0.41780418157577515, "kl": 0.011383056640625, "learning_rate": 8.648485032310144e-07, "loss": -0.0036, "reward": 0.8611111342906952, "reward_std": 0.3134361729025841, "rewards/accuracy_reward": 0.18055555690079927, "rewards/format_reward": 0.5, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 503.6388931274414, "epoch": 0.2827763496143959, "grad_norm": 0.34387895464897156, "kl": 0.011260986328125, "learning_rate": 8.625962667065487e-07, "loss": -0.011, "reward": 0.6111111044883728, "reward_std": 0.15932847559452057, "rewards/accuracy_reward": 0.055555556900799274, "rewards/format_reward": 0.5, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 784.4444580078125, "epoch": 0.28449014567266495, "grad_norm": 0.47405433654785156, "kl": 0.00818634033203125, "learning_rate": 8.603287946810513e-07, "loss": 0.0147, "reward": 0.8055555671453476, "reward_std": 0.5355970486998558, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.4722222313284874, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 674.5833282470703, "epoch": 0.286203941730934, "grad_norm": 0.43140318989753723, "kl": 0.008544921875, "learning_rate": 8.580461976679099e-07, "loss": 0.0226, "reward": 0.7638888955116272, "reward_std": 0.4538358449935913, "rewards/accuracy_reward": 0.13888889364898205, "rewards/format_reward": 0.4861111119389534, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 789.8472290039062, "epoch": 0.2879177377892031, "grad_norm": 0.7944321632385254, "kl": 0.01107025146484375, "learning_rate": 8.557485869176825e-07, "loss": 0.0802, "reward": 0.680555559694767, "reward_std": 0.28722215443849564, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.486111119389534, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 594.7222366333008, "epoch": 0.28963153384747214, "grad_norm": 0.47057613730430603, "kl": 0.01092529296875, "learning_rate": 8.534360744126753e-07, "loss": -0.0071, "reward": 0.8263888880610466, "reward_std": 0.2624051198363304, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.493055559694767, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 575.5000076293945, "epoch": 0.2913453299057412, "grad_norm": 0.48645710945129395, "kl": 0.0149688720703125, "learning_rate": 8.511087728614862e-07, "loss": 0.0024, "reward": 0.965277761220932, "reward_std": 0.37693794071674347, "rewards/accuracy_reward": 0.23611111380159855, "rewards/format_reward": 0.493055559694767, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 738.7916717529297, "epoch": 0.2930591259640103, "grad_norm": 0.4424607753753662, "kl": 0.00809478759765625, "learning_rate": 8.487667956935087e-07, "loss": 0.0969, "reward": 0.7291666865348816, "reward_std": 0.2601025812327862, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.479166679084301, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 730.7361297607422, "epoch": 0.29477292202227934, "grad_norm": 0.4389359652996063, "kl": 0.00835418701171875, "learning_rate": 8.464102570534061e-07, "loss": 0.0355, "reward": 0.7430555447936058, "reward_std": 0.24438021332025528, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.493055559694767, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 682.6111068725586, "epoch": 0.29648671808054844, "grad_norm": 0.6182007193565369, "kl": 0.013397216796875, "learning_rate": 8.440392717955475e-07, "loss": 0.0092, "reward": 0.7777777761220932, "reward_std": 0.4936107471585274, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.4722222238779068, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 690.2222290039062, "epoch": 0.2982005141388175, "grad_norm": 0.30457955598831177, "kl": 0.0081939697265625, "learning_rate": 8.416539554784089e-07, "loss": 0.0026, "reward": 0.7708333432674408, "reward_std": 0.23915940523147583, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.493055559694767, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 696.6527938842773, "epoch": 0.29991431019708653, "grad_norm": 0.3551907241344452, "kl": 0.010650634765625, "learning_rate": 8.392544243589427e-07, "loss": 0.0106, "reward": 0.5833333283662796, "reward_std": 0.15410767495632172, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.5, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 606.7916717529297, "epoch": 0.30162810625535563, "grad_norm": 0.4258103370666504, "kl": 0.0097503662109375, "learning_rate": 8.368407953869103e-07, "loss": 0.0108, "reward": 0.7152777761220932, "reward_std": 0.27087756246328354, "rewards/accuracy_reward": 0.11111111473292112, "rewards/format_reward": 0.493055559694767, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 702.7916564941406, "epoch": 0.3033419023136247, "grad_norm": 0.38482344150543213, "kl": 0.009063720703125, "learning_rate": 8.344131861991828e-07, "loss": 0.0025, "reward": 0.7986111268401146, "reward_std": 0.416512792930007, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.493055559694767, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 673.5555572509766, "epoch": 0.3050556983718937, "grad_norm": 0.4185694456100464, "kl": 0.00726318359375, "learning_rate": 8.319717151140072e-07, "loss": 0.0183, "reward": 0.7430555522441864, "reward_std": 0.3815770819783211, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.493055559694767, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 740.5277709960938, "epoch": 0.3067694944301628, "grad_norm": 0.4638974070549011, "kl": 0.00782012939453125, "learning_rate": 8.295165011252396e-07, "loss": -0.0087, "reward": 0.7083333507180214, "reward_std": 0.4054961260408163, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.4583333432674408, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 577.0416641235352, "epoch": 0.30848329048843187, "grad_norm": 0.5336411595344543, "kl": 0.0118560791015625, "learning_rate": 8.270476638965461e-07, "loss": 0.0215, "reward": 0.8333333432674408, "reward_std": 0.358231820166111, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.5, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 692.4027862548828, "epoch": 0.3101970865467009, "grad_norm": 0.5019733905792236, "kl": 0.0130157470703125, "learning_rate": 8.245653237555705e-07, "loss": -0.0101, "reward": 0.75, "reward_std": 0.33668188750743866, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.5, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 664.3889007568359, "epoch": 0.31191088260497, "grad_norm": 0.36695027351379395, "kl": 0.0102386474609375, "learning_rate": 8.220696016880687e-07, "loss": 0.0083, "reward": 0.7291666716337204, "reward_std": 0.30504853278398514, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.4791666716337204, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 741.8055648803711, "epoch": 0.31362467866323906, "grad_norm": 0.5278235077857971, "kl": 0.0111541748046875, "learning_rate": 8.195606193320136e-07, "loss": 0.0004, "reward": 0.9166666716337204, "reward_std": 0.46232304722070694, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.5, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 648.9861221313477, "epoch": 0.31533847472150817, "grad_norm": 0.48935696482658386, "kl": 0.0121612548828125, "learning_rate": 8.170384989716657e-07, "loss": 0.0721, "reward": 0.7847222089767456, "reward_std": 0.3912379518151283, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.4791666641831398, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 652.9166793823242, "epoch": 0.3170522707797772, "grad_norm": 0.5268736481666565, "kl": 0.0102691650390625, "learning_rate": 8.145033635316128e-07, "loss": 0.0095, "reward": 1.2916667014360428, "reward_std": 0.7127073556184769, "rewards/accuracy_reward": 0.4027777947485447, "rewards/format_reward": 0.486111119389534, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 650.0972290039062, "epoch": 0.31876606683804626, "grad_norm": 0.3435427248477936, "kl": 0.0112152099609375, "learning_rate": 8.119553365707802e-07, "loss": -0.0147, "reward": 0.7222222238779068, "reward_std": 0.3314610570669174, "rewards/accuracy_reward": 0.11111111473292112, "rewards/format_reward": 0.5, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 668.9722290039062, "epoch": 0.32047986289631536, "grad_norm": 0.5294600129127502, "kl": 0.01114654541015625, "learning_rate": 8.093945422764069e-07, "loss": -0.0126, "reward": 1.0277777910232544, "reward_std": 0.6216515377163887, "rewards/accuracy_reward": 0.26388888992369175, "rewards/format_reward": 0.5, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 735.2639007568359, "epoch": 0.3221936589545844, "grad_norm": 0.5934704542160034, "kl": 0.015716552734375, "learning_rate": 8.068211054579943e-07, "loss": -0.0676, "reward": 0.736111119389534, "reward_std": 0.3602609410881996, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.486111119389534, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 708.2638854980469, "epoch": 0.32390745501285345, "grad_norm": 0.29051750898361206, "kl": 0.012054443359375, "learning_rate": 8.04235151541222e-07, "loss": 0.0144, "reward": 0.6666666567325592, "reward_std": 0.2453947737812996, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.5, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 791.9305572509766, "epoch": 0.32562125107112255, "grad_norm": 0.6270461082458496, "kl": 0.00994110107421875, "learning_rate": 8.01636806561836e-07, "loss": 0.1261, "reward": 0.6319444552063942, "reward_std": 0.306252408772707, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.4652777835726738, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 759.0139007568359, "epoch": 0.3273350471293916, "grad_norm": 0.4964490532875061, "kl": 0.0179290771484375, "learning_rate": 7.990261971595048e-07, "loss": 0.0119, "reward": 0.9027777910232544, "reward_std": 0.5300310179591179, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.486111119389534, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 513.6249923706055, "epoch": 0.32904884318766064, "grad_norm": 0.5976383090019226, "kl": 0.0132598876953125, "learning_rate": 7.964034505716476e-07, "loss": -0.0007, "reward": 1.0555555820465088, "reward_std": 0.530364416539669, "rewards/accuracy_reward": 0.2777777798473835, "rewards/format_reward": 0.5, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 626.6250152587891, "epoch": 0.33076263924592975, "grad_norm": 0.6004844903945923, "kl": 0.0198211669921875, "learning_rate": 7.93768694627233e-07, "loss": 0.0022, "reward": 0.9375000149011612, "reward_std": 0.32522570341825485, "rewards/accuracy_reward": 0.22222222667187452, "rewards/format_reward": 0.493055559694767, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 613.7361068725586, "epoch": 0.3324764353041988, "grad_norm": 0.5730006098747253, "kl": 0.021759033203125, "learning_rate": 7.911220577405484e-07, "loss": 0.0095, "reward": 0.7777777910232544, "reward_std": 0.29541123658418655, "rewards/accuracy_reward": 0.13888888992369175, "rewards/format_reward": 0.5, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 774.7083435058594, "epoch": 0.3341902313624679, "grad_norm": 0.42059004306793213, "kl": 0.01324462890625, "learning_rate": 7.884636689049422e-07, "loss": 0.0033, "reward": 0.6666666567325592, "reward_std": 0.29541125148534775, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.5, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 683.8888854980469, "epoch": 0.33590402742073694, "grad_norm": 0.3196467459201813, "kl": 0.0126495361328125, "learning_rate": 7.857936576865356e-07, "loss": 0.0156, "reward": 0.8819444626569748, "reward_std": 0.25071514397859573, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.493055559694767, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 793.1388854980469, "epoch": 0.337617823479006, "grad_norm": 0.6156473755836487, "kl": 0.01934814453125, "learning_rate": 7.831121542179086e-07, "loss": 0.0594, "reward": 0.8819444552063942, "reward_std": 0.4707336239516735, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.4652777835726738, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 601.7777862548828, "epoch": 0.3393316195372751, "grad_norm": 0.5810103416442871, "kl": 0.0160675048828125, "learning_rate": 7.804192891917571e-07, "loss": 0.0897, "reward": 0.8750000149011612, "reward_std": 0.4387439265847206, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.486111119389534, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 739.5694580078125, "epoch": 0.34104541559554413, "grad_norm": 0.4028480350971222, "kl": 0.0103607177734375, "learning_rate": 7.777151938545235e-07, "loss": 0.0106, "reward": 0.7222222238779068, "reward_std": 0.30821534991264343, "rewards/accuracy_reward": 0.11111111473292112, "rewards/format_reward": 0.5, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 765.2083587646484, "epoch": 0.3427592116538132, "grad_norm": 0.5337042808532715, "kl": 0.0111236572265625, "learning_rate": 7.75e-07, "loss": 0.0084, "reward": 0.7083333432674408, "reward_std": 0.37828588485717773, "rewards/accuracy_reward": 0.1111111119389534, "rewards/format_reward": 0.4861111119389534, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 755.25, "epoch": 0.3444730077120823, "grad_norm": 0.5041696429252625, "kl": 0.00969696044921875, "learning_rate": 7.72273839962904e-07, "loss": -0.0162, "reward": 0.8541666567325592, "reward_std": 0.4137297794222832, "rewards/accuracy_reward": 0.19444444682449102, "rewards/format_reward": 0.4652777761220932, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 645.0277862548828, "epoch": 0.3461868037703513, "grad_norm": 0.4631847143173218, "kl": 0.0138092041015625, "learning_rate": 7.695368466124296e-07, "loss": 0.023, "reward": 0.8541666716337204, "reward_std": 0.41651278734207153, "rewards/accuracy_reward": 0.18055556062608957, "rewards/format_reward": 0.493055559694767, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 741.7361145019531, "epoch": 0.34790059982862037, "grad_norm": 0.5321578979492188, "kl": 0.01479339599609375, "learning_rate": 7.667891533457718e-07, "loss": 0.106, "reward": 1.1458333730697632, "reward_std": 0.6571117714047432, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.479166679084301, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 689.3194427490234, "epoch": 0.3496143958868895, "grad_norm": 0.3890639841556549, "kl": 0.012969970703125, "learning_rate": 7.640308940816239e-07, "loss": -0.0073, "reward": 0.8541666716337204, "reward_std": 0.2211344763636589, "rewards/accuracy_reward": 0.18055556155741215, "rewards/format_reward": 0.493055559694767, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 654.2638931274414, "epoch": 0.3513281919451585, "grad_norm": 0.5081583857536316, "kl": 0.01397705078125, "learning_rate": 7.612622032536507e-07, "loss": 0.0108, "reward": 0.9166666865348816, "reward_std": 0.32624027878046036, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.5, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 681.0694580078125, "epoch": 0.35304198800342756, "grad_norm": 0.48619431257247925, "kl": 0.0098114013671875, "learning_rate": 7.584832158039378e-07, "loss": -0.005, "reward": 0.7361111044883728, "reward_std": 0.32421112060546875, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.486111119389534, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 603.4305572509766, "epoch": 0.35475578406169667, "grad_norm": 0.4051726162433624, "kl": 0.01464080810546875, "learning_rate": 7.556940671764124e-07, "loss": -0.0038, "reward": 1.1944444626569748, "reward_std": 0.3134361505508423, "rewards/accuracy_reward": 0.3472222350537777, "rewards/format_reward": 0.5, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 680.7916793823242, "epoch": 0.3564695801199657, "grad_norm": 0.5050489902496338, "kl": 0.0126953125, "learning_rate": 7.528948933102438e-07, "loss": 0.0505, "reward": 0.7916666567325592, "reward_std": 0.32421112060546875, "rewards/accuracy_reward": 0.15277777798473835, "rewards/format_reward": 0.486111119389534, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 656.4722442626953, "epoch": 0.3581833761782348, "grad_norm": 0.29651615023612976, "kl": 0.00934600830078125, "learning_rate": 7.500858306332172e-07, "loss": -0.0032, "reward": 0.6666666567325592, "reward_std": 0.25819889456033707, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.5, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 515.0, "epoch": 0.35989717223650386, "grad_norm": 0.8540976643562317, "kl": 0.028778076171875, "learning_rate": 7.472670160550848e-07, "loss": 0.0138, "reward": 0.7708333283662796, "reward_std": 0.46130845695734024, "rewards/accuracy_reward": 0.1388888917863369, "rewards/format_reward": 0.493055559694767, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 965.8055725097656, "epoch": 0.3616109682947729, "grad_norm": 0.3300262689590454, "kl": 0.0074920654296875, "learning_rate": 7.444385869608921e-07, "loss": 0.0311, "reward": 0.6527777835726738, "reward_std": 0.17010344564914703, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.486111119389534, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 625.5555572509766, "epoch": 0.363324764353042, "grad_norm": 0.5068047642707825, "kl": 0.012664794921875, "learning_rate": 7.416006812042827e-07, "loss": -0.0061, "reward": 0.9375, "reward_std": 0.4217336028814316, "rewards/accuracy_reward": 0.22222222480922937, "rewards/format_reward": 0.493055559694767, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 909.013916015625, "epoch": 0.36503856041131105, "grad_norm": 0.3107520341873169, "kl": 0.00826263427734375, "learning_rate": 7.387534371007797e-07, "loss": 0.0477, "reward": 0.7361111044883728, "reward_std": 0.34775684773921967, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.4861111119389534, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 708.1805572509766, "epoch": 0.3667523564695801, "grad_norm": 0.5627197027206421, "kl": 0.00879669189453125, "learning_rate": 7.358969934210438e-07, "loss": 0.0545, "reward": 0.958333358168602, "reward_std": 0.5707200393080711, "rewards/accuracy_reward": 0.23611111659556627, "rewards/format_reward": 0.486111119389534, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 702.1388854980469, "epoch": 0.3684661525278492, "grad_norm": 0.5161833763122559, "kl": 0.01049041748046875, "learning_rate": 7.330314893841101e-07, "loss": -0.0212, "reward": 1.1388889104127884, "reward_std": 0.5355852097272873, "rewards/accuracy_reward": 0.3194444486871362, "rewards/format_reward": 0.5, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 695.9722290039062, "epoch": 0.37017994858611825, "grad_norm": 0.4373010993003845, "kl": 0.00847625732421875, "learning_rate": 7.301570646506027e-07, "loss": 0.0195, "reward": 0.7430555447936058, "reward_std": 0.30720078758895397, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.493055559694767, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 636.6944580078125, "epoch": 0.3718937446443873, "grad_norm": 0.49228960275650024, "kl": 0.009765625, "learning_rate": 7.27273859315928e-07, "loss": 0.0262, "reward": 0.8958333432674408, "reward_std": 0.46008094400167465, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.4791666716337204, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 717.819450378418, "epoch": 0.3736075407026564, "grad_norm": 0.43165305256843567, "kl": 0.0099334716796875, "learning_rate": 7.243820139034464e-07, "loss": 0.0048, "reward": 0.8124999925494194, "reward_std": 0.29642581194639206, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.4791666716337204, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 838.875, "epoch": 0.37532133676092544, "grad_norm": 0.4618090093135834, "kl": 0.00933837890625, "learning_rate": 7.214816693576234e-07, "loss": 0.0164, "reward": 0.7083333432674408, "reward_std": 0.466628834605217, "rewards/accuracy_reward": 0.11111111380159855, "rewards/format_reward": 0.486111119389534, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 847.2222290039062, "epoch": 0.37703513281919454, "grad_norm": 0.12107283622026443, "kl": 0.00963592529296875, "learning_rate": 7.185729670371604e-07, "loss": -0.0002, "reward": 0.6944444328546524, "reward_std": 0.0680413767695427, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.5, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 738.3472290039062, "epoch": 0.3787489288774636, "grad_norm": 0.15903827548027039, "kl": 0.00983428955078125, "learning_rate": 7.156560487081051e-07, "loss": 0.0021, "reward": 0.5277777761220932, "reward_std": 0.0680413767695427, "rewards/accuracy_reward": 0.013888888992369175, "rewards/format_reward": 0.5, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 745.5694427490234, "epoch": 0.38046272493573263, "grad_norm": 0.331328809261322, "kl": 0.01165771484375, "learning_rate": 7.127310565369415e-07, "loss": 0.012, "reward": 0.7083333432674408, "reward_std": 0.2549325004220009, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.4861111119389534, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 747.9027862548828, "epoch": 0.38217652099400173, "grad_norm": 0.43389493227005005, "kl": 0.0093536376953125, "learning_rate": 7.097981330836616e-07, "loss": 0.0764, "reward": 0.6250000074505806, "reward_std": 0.2613905519247055, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.4861111119389534, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 655.5694580078125, "epoch": 0.3838903170522708, "grad_norm": 0.2923060655593872, "kl": 0.0098876953125, "learning_rate": 7.068574212948169e-07, "loss": 0.0202, "reward": 0.5138888955116272, "reward_std": 0.10206206515431404, "rewards/accuracy_reward": 0.013888888992369175, "rewards/format_reward": 0.486111119389534, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 595.1666793823242, "epoch": 0.3856041131105398, "grad_norm": 0.3962916433811188, "kl": 0.01021575927734375, "learning_rate": 7.039090644965509e-07, "loss": -0.0231, "reward": 0.75, "reward_std": 0.2901904508471489, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.5, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 801.6944580078125, "epoch": 0.3873179091688089, "grad_norm": 0.42240843176841736, "kl": 0.00876617431640625, "learning_rate": 7.009532063876148e-07, "loss": 0.0223, "reward": 0.7708333507180214, "reward_std": 0.5345706399530172, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.493055559694767, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 698.6527938842773, "epoch": 0.389031705227078, "grad_norm": 0.5248401165008545, "kl": 0.0102081298828125, "learning_rate": 6.979899910323624e-07, "loss": -0.0301, "reward": 1.0763889104127884, "reward_std": 0.39326707273721695, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.493055559694767, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 682.1111297607422, "epoch": 0.390745501285347, "grad_norm": 0.6135608553886414, "kl": 0.01019287109375, "learning_rate": 6.950195628537299e-07, "loss": 0.0276, "reward": 1.0694444626569748, "reward_std": 0.496343731880188, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.486111119389534, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 743.4027862548828, "epoch": 0.3924592973436161, "grad_norm": 0.2736571133136749, "kl": 0.01031494140625, "learning_rate": 6.920420666261961e-07, "loss": 0.0257, "reward": 0.8472222238779068, "reward_std": 0.18812836706638336, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.486111119389534, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 773.6527862548828, "epoch": 0.39417309340188517, "grad_norm": 0.4546229839324951, "kl": 0.0096588134765625, "learning_rate": 6.890576474687263e-07, "loss": 0.0489, "reward": 1.0277778059244156, "reward_std": 0.3637526258826256, "rewards/accuracy_reward": 0.2777777835726738, "rewards/format_reward": 0.4722222238779068, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 651.8611145019531, "epoch": 0.39588688946015427, "grad_norm": 0.40317994356155396, "kl": 0.01104736328125, "learning_rate": 6.860664508377001e-07, "loss": -0.0009, "reward": 1.0555555522441864, "reward_std": 0.2721655070781708, "rewards/accuracy_reward": 0.27777778543531895, "rewards/format_reward": 0.5, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 591.1805725097656, "epoch": 0.3976006855184233, "grad_norm": 0.4714021384716034, "kl": 0.0143280029296875, "learning_rate": 6.83068622519821e-07, "loss": 0.0116, "reward": 1.0, "reward_std": 0.30821534991264343, "rewards/accuracy_reward": 0.25000000186264515, "rewards/format_reward": 0.5, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 549.2777786254883, "epoch": 0.39931448157669236, "grad_norm": 0.6059587001800537, "kl": 0.01348876953125, "learning_rate": 6.800643086250121e-07, "loss": -0.0337, "reward": 1.0138889104127884, "reward_std": 0.5345955863595009, "rewards/accuracy_reward": 0.26388889364898205, "rewards/format_reward": 0.486111119389534, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 809.9444580078125, "epoch": 0.40102827763496146, "grad_norm": 0.45386579632759094, "kl": 0.0084228515625, "learning_rate": 6.770536555792944e-07, "loss": 0.0403, "reward": 0.9305555671453476, "reward_std": 0.4666288197040558, "rewards/accuracy_reward": 0.22222222574055195, "rewards/format_reward": 0.4861111119389534, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 696.6666717529297, "epoch": 0.4027420736932305, "grad_norm": 0.1798969805240631, "kl": 0.0090789794921875, "learning_rate": 6.740368101176495e-07, "loss": 0.0142, "reward": 0.8611111044883728, "reward_std": 0.06804138422012329, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.5, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 741.5277862548828, "epoch": 0.40445586975149955, "grad_norm": 0.37443897128105164, "kl": 0.0080413818359375, "learning_rate": 6.710139192768694e-07, "loss": -0.0028, "reward": 0.7986111268401146, "reward_std": 0.24438021332025528, "rewards/accuracy_reward": 0.15277778450399637, "rewards/format_reward": 0.493055559694767, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 569.6111221313477, "epoch": 0.40616966580976865, "grad_norm": 0.3133380711078644, "kl": 0.013458251953125, "learning_rate": 6.679851303883891e-07, "loss": 0.007, "reward": 0.8888888657093048, "reward_std": 0.13608276844024658, "rewards/accuracy_reward": 0.19444444961845875, "rewards/format_reward": 0.5, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 639.3194427490234, "epoch": 0.4078834618680377, "grad_norm": 0.3414314091205597, "kl": 0.0097808837890625, "learning_rate": 6.649505910711058e-07, "loss": 0.0132, "reward": 0.7916666567325592, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.1527777835726738, "rewards/format_reward": 0.4861111119389534, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 698.0000152587891, "epoch": 0.40959725792630675, "grad_norm": 0.47577425837516785, "kl": 0.0128326416015625, "learning_rate": 6.619104492241847e-07, "loss": 0.0208, "reward": 0.9375000149011612, "reward_std": 0.4217335730791092, "rewards/accuracy_reward": 0.22222223225980997, "rewards/format_reward": 0.493055559694767, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 496.9305648803711, "epoch": 0.41131105398457585, "grad_norm": 0.5120421051979065, "kl": 0.017822265625, "learning_rate": 6.588648530198504e-07, "loss": -0.0194, "reward": 0.9027777761220932, "reward_std": 0.34775684028863907, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.4861111119389534, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 568.9166946411133, "epoch": 0.4130248500428449, "grad_norm": 0.4153745472431183, "kl": 0.00946044921875, "learning_rate": 6.558139508961654e-07, "loss": -0.0283, "reward": 1.1111111044883728, "reward_std": 0.3314610719680786, "rewards/accuracy_reward": 0.30555556435137987, "rewards/format_reward": 0.5, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 679.4027862548828, "epoch": 0.414738646101114, "grad_norm": 0.2539161145687103, "kl": 0.0107269287109375, "learning_rate": 6.527578915497951e-07, "loss": -0.0036, "reward": 0.7222222089767456, "reward_std": 0.26864049583673477, "rewards/accuracy_reward": 0.11111111380159855, "rewards/format_reward": 0.5, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 663.6388778686523, "epoch": 0.41645244215938304, "grad_norm": 0.4045405089855194, "kl": 0.01134490966796875, "learning_rate": 6.496968239287603e-07, "loss": -0.0349, "reward": 0.8263889029622078, "reward_std": 0.3752421587705612, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.493055559694767, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 684.1250152587891, "epoch": 0.4181662382176521, "grad_norm": 0.33096015453338623, "kl": 0.00829315185546875, "learning_rate": 6.466308972251785e-07, "loss": -0.0064, "reward": 0.5833333283662796, "reward_std": 0.20412413775920868, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.5, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 503.8194580078125, "epoch": 0.4198800342759212, "grad_norm": 0.6509292721748352, "kl": 0.0207977294921875, "learning_rate": 6.435602608679916e-07, "loss": -0.0065, "reward": 0.9166666567325592, "reward_std": 0.4262731820344925, "rewards/accuracy_reward": 0.2083333320915699, "rewards/format_reward": 0.5, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 753.3333282470703, "epoch": 0.42159383033419023, "grad_norm": 0.23314639925956726, "kl": 0.0078582763671875, "learning_rate": 6.404850645156841e-07, "loss": 0.0163, "reward": 1.2430555522441864, "reward_std": 0.28890247642993927, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.493055559694767, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 731.4444580078125, "epoch": 0.4233076263924593, "grad_norm": 0.2738264203071594, "kl": 0.01085662841796875, "learning_rate": 6.374054580489873e-07, "loss": 0.004, "reward": 0.6805555522441864, "reward_std": 0.10206207446753979, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.486111119389534, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 645.1805801391602, "epoch": 0.4250214224507284, "grad_norm": 0.41325727105140686, "kl": 0.0088348388671875, "learning_rate": 6.343215915635761e-07, "loss": 0.0097, "reward": 0.8888888955116272, "reward_std": 0.3547067791223526, "rewards/accuracy_reward": 0.1944444477558136, "rewards/format_reward": 0.5, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 734.1944427490234, "epoch": 0.4267352185089974, "grad_norm": 0.39983558654785156, "kl": 0.009674072265625, "learning_rate": 6.31233615362752e-07, "loss": 0.0139, "reward": 0.861111119389534, "reward_std": 0.4262731969356537, "rewards/accuracy_reward": 0.18055556248873472, "rewards/format_reward": 0.5, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 676.3472290039062, "epoch": 0.4284490145672665, "grad_norm": 0.3825208246707916, "kl": 0.00958251953125, "learning_rate": 6.281416799501187e-07, "loss": -0.0015, "reward": 0.6944444328546524, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.5, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 725.7222290039062, "epoch": 0.4301628106255356, "grad_norm": 0.247074156999588, "kl": 0.011627197265625, "learning_rate": 6.25045936022246e-07, "loss": -0.0156, "reward": 0.9444444328546524, "reward_std": 0.2453947737812996, "rewards/accuracy_reward": 0.2222222276031971, "rewards/format_reward": 0.5, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 644.7083435058594, "epoch": 0.4318766066838046, "grad_norm": 0.4444674849510193, "kl": 0.009063720703125, "learning_rate": 6.219465344613258e-07, "loss": 0.009, "reward": 0.8333333432674408, "reward_std": 0.40472324192523956, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.5, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 747.0277862548828, "epoch": 0.43359040274207367, "grad_norm": 0.5097996592521667, "kl": 0.00933837890625, "learning_rate": 6.188436263278172e-07, "loss": -0.0255, "reward": 0.7569444552063942, "reward_std": 0.5057707708328962, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.479166679084301, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 728.7083358764648, "epoch": 0.43530419880034277, "grad_norm": 0.35494205355644226, "kl": 0.0090484619140625, "learning_rate": 6.157373628530852e-07, "loss": 0.0083, "reward": 0.8541666567325592, "reward_std": 0.34325060993433, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.493055559694767, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 681.2361145019531, "epoch": 0.4370179948586118, "grad_norm": 0.3796377182006836, "kl": 0.0113677978515625, "learning_rate": 6.126278954320294e-07, "loss": 0.0005, "reward": 0.7777777910232544, "reward_std": 0.29541125893592834, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.5, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 747.6666717529297, "epoch": 0.4387317909168809, "grad_norm": 0.400722861289978, "kl": 0.00862884521484375, "learning_rate": 6.095153756157051e-07, "loss": -0.0062, "reward": 0.6249999925494194, "reward_std": 0.24970055185258389, "rewards/accuracy_reward": 0.06944444589316845, "rewards/format_reward": 0.486111119389534, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 794.6527862548828, "epoch": 0.44044558697514996, "grad_norm": 0.4615187644958496, "kl": 0.00820159912109375, "learning_rate": 6.06399955103937e-07, "loss": -0.0081, "reward": 0.6805555522441864, "reward_std": 0.311707004904747, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.4861111119389534, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 754.5277709960938, "epoch": 0.442159383033419, "grad_norm": 0.3552338480949402, "kl": 0.0101165771484375, "learning_rate": 6.032817857379256e-07, "loss": 0.0125, "reward": 0.6875000074505806, "reward_std": 0.2571843173354864, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.493055559694767, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 634.4166717529297, "epoch": 0.4438731790916881, "grad_norm": 0.6070718765258789, "kl": 0.00984954833984375, "learning_rate": 6.001610194928464e-07, "loss": 0.006, "reward": 1.0555555522441864, "reward_std": 0.5303644090890884, "rewards/accuracy_reward": 0.27777778543531895, "rewards/format_reward": 0.5, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 624.7361145019531, "epoch": 0.44558697514995715, "grad_norm": 0.5230724215507507, "kl": 0.00922393798828125, "learning_rate": 5.97037808470444e-07, "loss": -0.0235, "reward": 0.7777777761220932, "reward_std": 0.3582318127155304, "rewards/accuracy_reward": 0.13888889085501432, "rewards/format_reward": 0.5, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 691.1666870117188, "epoch": 0.4473007712082262, "grad_norm": 0.39952459931373596, "kl": 0.00872802734375, "learning_rate": 5.939123048916173e-07, "loss": 0.0159, "reward": 0.8333333283662796, "reward_std": 0.2221490666270256, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.5, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 800.1250152587891, "epoch": 0.4490145672664953, "grad_norm": 0.41582420468330383, "kl": 0.008209228515625, "learning_rate": 5.907846610890011e-07, "loss": 0.0729, "reward": 1.2500000149011612, "reward_std": 0.5078206732869148, "rewards/accuracy_reward": 0.3888888992369175, "rewards/format_reward": 0.4722222238779068, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 708.9583282470703, "epoch": 0.45072836332476435, "grad_norm": 0.5363173484802246, "kl": 0.00957489013671875, "learning_rate": 5.87655029499542e-07, "loss": -0.022, "reward": 0.6527777761220932, "reward_std": 0.25616974383592606, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.486111119389534, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 608.2083511352539, "epoch": 0.4524421593830334, "grad_norm": 0.4334163963794708, "kl": 0.00849151611328125, "learning_rate": 5.845235626570683e-07, "loss": 0.011, "reward": 0.8333333432674408, "reward_std": 0.3082153648138046, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.5, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 678.3750152587891, "epoch": 0.4541559554413025, "grad_norm": 0.38257157802581787, "kl": 0.0098876953125, "learning_rate": 5.813904131848564e-07, "loss": -0.0024, "reward": 1.1944444328546524, "reward_std": 0.38669832795858383, "rewards/accuracy_reward": 0.34722223225980997, "rewards/format_reward": 0.5, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 795.8889007568359, "epoch": 0.45586975149957154, "grad_norm": 0.36699798703193665, "kl": 0.0082855224609375, "learning_rate": 5.78255733788191e-07, "loss": 0.0402, "reward": 0.8333333283662796, "reward_std": 0.30821535736322403, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.5, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 679.8611221313477, "epoch": 0.45758354755784064, "grad_norm": 7.172515869140625, "kl": 0.10137939453125, "learning_rate": 5.751196772469237e-07, "loss": -0.0182, "reward": 0.625, "reward_std": 0.16182994842529297, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.4861111119389534, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 669.3194580078125, "epoch": 0.4592973436161097, "grad_norm": 0.4328586459159851, "kl": 0.01244354248046875, "learning_rate": 5.71982396408026e-07, "loss": 0.0242, "reward": 0.7430555671453476, "reward_std": 0.25718431919813156, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.493055559694767, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 706.1805725097656, "epoch": 0.46101113967437873, "grad_norm": 0.15796984732151031, "kl": 0.00635528564453125, "learning_rate": 5.688440441781398e-07, "loss": -0.0015, "reward": 0.7777777761220932, "reward_std": 0.08606629818677902, "rewards/accuracy_reward": 0.1388888917863369, "rewards/format_reward": 0.5, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 763.4861145019531, "epoch": 0.46272493573264784, "grad_norm": 0.42503196001052856, "kl": 0.010040283203125, "learning_rate": 5.657047735161255e-07, "loss": -0.0154, "reward": 1.0138888955116272, "reward_std": 0.3125211223959923, "rewards/accuracy_reward": 0.26388889644294977, "rewards/format_reward": 0.4861111119389534, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 671.2639007568359, "epoch": 0.4644387317909169, "grad_norm": 0.442844957113266, "kl": 0.0098724365234375, "learning_rate": 5.625647374256061e-07, "loss": -0.0138, "reward": 0.8541666567325592, "reward_std": 0.3072007820010185, "rewards/accuracy_reward": 0.18055556155741215, "rewards/format_reward": 0.493055559694767, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 584.1666793823242, "epoch": 0.4661525278491859, "grad_norm": 0.6295328140258789, "kl": 0.0174713134765625, "learning_rate": 5.594240889475106e-07, "loss": 0.0023, "reward": 0.8750000149011612, "reward_std": 0.46228964626789093, "rewards/accuracy_reward": 0.19444444961845875, "rewards/format_reward": 0.4861111119389534, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 709.3611145019531, "epoch": 0.46786632390745503, "grad_norm": 0.34427711367607117, "kl": 0.0111846923828125, "learning_rate": 5.562829811526154e-07, "loss": 0.0205, "reward": 0.8541666865348816, "reward_std": 0.25718431919813156, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.493055559694767, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 725.625, "epoch": 0.4695801199657241, "grad_norm": 0.3454584777355194, "kl": 0.00966644287109375, "learning_rate": 5.531415671340826e-07, "loss": -0.0354, "reward": 0.5972222238779068, "reward_std": 0.28170324862003326, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.4861111119389534, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 674.6111145019531, "epoch": 0.4712939160239931, "grad_norm": 0.5319638252258301, "kl": 0.0121307373046875, "learning_rate": 5.5e-07, "loss": -0.0055, "reward": 0.8055555522441864, "reward_std": 0.3995024487376213, "rewards/accuracy_reward": 0.1527777835726738, "rewards/format_reward": 0.5, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 655.1527862548828, "epoch": 0.4730077120822622, "grad_norm": 0.4211221933364868, "kl": 0.0104217529296875, "learning_rate": 5.468584328659172e-07, "loss": 0.0072, "reward": 0.8055555671453476, "reward_std": 0.3995024487376213, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.5, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 731.9305725097656, "epoch": 0.47472150814053127, "grad_norm": 0.4488551914691925, "kl": 0.011688232421875, "learning_rate": 5.437170188473847e-07, "loss": -0.0014, "reward": 1.1041666716337204, "reward_std": 0.34847141802310944, "rewards/accuracy_reward": 0.3055555634200573, "rewards/format_reward": 0.493055559694767, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 681.75, "epoch": 0.47643530419880037, "grad_norm": 0.3861945867538452, "kl": 0.011749267578125, "learning_rate": 5.405759110524894e-07, "loss": 0.02, "reward": 0.8749999925494194, "reward_std": 0.34098767302930355, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.486111119389534, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 790.6389007568359, "epoch": 0.4781491002570694, "grad_norm": 0.6166090369224548, "kl": 0.0152740478515625, "learning_rate": 5.37435262574394e-07, "loss": 0.0135, "reward": 0.7152777910232544, "reward_std": 0.3984878733754158, "rewards/accuracy_reward": 0.11111111473292112, "rewards/format_reward": 0.493055559694767, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 613.5277862548828, "epoch": 0.47986289631533846, "grad_norm": 0.3968028426170349, "kl": 0.0106964111328125, "learning_rate": 5.342952264838747e-07, "loss": 0.0027, "reward": 0.8611111044883728, "reward_std": 0.24017397314310074, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.5, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 728.4861145019531, "epoch": 0.48157669237360756, "grad_norm": 0.4651610851287842, "kl": 0.00959014892578125, "learning_rate": 5.311559558218603e-07, "loss": 0.0424, "reward": 1.1250000149011612, "reward_std": 0.40491778403520584, "rewards/accuracy_reward": 0.31944445613771677, "rewards/format_reward": 0.486111119389534, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 719.4027862548828, "epoch": 0.4832904884318766, "grad_norm": 0.4324786365032196, "kl": 0.0084686279296875, "learning_rate": 5.28017603591974e-07, "loss": 0.0187, "reward": 0.7986111268401146, "reward_std": 0.24438020400702953, "rewards/accuracy_reward": 0.1527777798473835, "rewards/format_reward": 0.493055559694767, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 743.3889007568359, "epoch": 0.48500428449014565, "grad_norm": 0.317564994096756, "kl": 0.009735107421875, "learning_rate": 5.248803227530763e-07, "loss": 0.0143, "reward": 0.729166679084301, "reward_std": 0.25436214357614517, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.479166679084301, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 799.6666870117188, "epoch": 0.48671808054841476, "grad_norm": 0.3215982913970947, "kl": 0.0088348388671875, "learning_rate": 5.21744266211809e-07, "loss": -0.0185, "reward": 0.9652777910232544, "reward_std": 0.3315606266260147, "rewards/accuracy_reward": 0.23611111845821142, "rewards/format_reward": 0.493055559694767, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 815.6944580078125, "epoch": 0.4884318766066838, "grad_norm": 0.28981852531433105, "kl": 0.00812530517578125, "learning_rate": 5.186095868151436e-07, "loss": -0.002, "reward": 0.805555559694767, "reward_std": 0.03983211889863014, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4722222238779068, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 738.4305725097656, "epoch": 0.49014567266495285, "grad_norm": 0.437377393245697, "kl": 0.00920867919921875, "learning_rate": 5.154764373429315e-07, "loss": 0.0761, "reward": 0.7638889029622078, "reward_std": 0.2561697345227003, "rewards/accuracy_reward": 0.13888889085501432, "rewards/format_reward": 0.486111119389534, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 681.2222290039062, "epoch": 0.49185946872322195, "grad_norm": 0.44190195202827454, "kl": 0.0103912353515625, "learning_rate": 5.123449705004581e-07, "loss": -0.0106, "reward": 0.972222238779068, "reward_std": 0.46232303231954575, "rewards/accuracy_reward": 0.236111119389534, "rewards/format_reward": 0.5, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 695.5277709960938, "epoch": 0.493573264781491, "grad_norm": 0.496455579996109, "kl": 0.01013946533203125, "learning_rate": 5.09215338910999e-07, "loss": -0.0123, "reward": 0.6250000149011612, "reward_std": 0.2721321564167738, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.4583333432674408, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 728.0972290039062, "epoch": 0.4952870608397601, "grad_norm": 0.6901673078536987, "kl": 0.01247406005859375, "learning_rate": 5.060876951083828e-07, "loss": -0.0307, "reward": 0.8958333283662796, "reward_std": 0.4455699250102043, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.4791666641831398, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 736.1666717529297, "epoch": 0.49700085689802914, "grad_norm": 0.4506691098213196, "kl": 0.00757598876953125, "learning_rate": 5.02962191529556e-07, "loss": 0.0253, "reward": 0.8958333283662796, "reward_std": 0.48350031673908234, "rewards/accuracy_reward": 0.20833333488553762, "rewards/format_reward": 0.479166679084301, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 784.4305572509766, "epoch": 0.4987146529562982, "grad_norm": 0.39211300015449524, "kl": 0.01044464111328125, "learning_rate": 4.998389805071536e-07, "loss": 0.0139, "reward": 0.9305555671453476, "reward_std": 0.38056251406669617, "rewards/accuracy_reward": 0.22222223225980997, "rewards/format_reward": 0.4861111119389534, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 630.5416793823242, "epoch": 0.5004284490145673, "grad_norm": 0.6736369729042053, "kl": 0.011474609375, "learning_rate": 4.967182142620745e-07, "loss": -0.0179, "reward": 1.0555555820465088, "reward_std": 0.553610123693943, "rewards/accuracy_reward": 0.2777777835726738, "rewards/format_reward": 0.5, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 599.7500076293945, "epoch": 0.5021422450728363, "grad_norm": 0.35509321093559265, "kl": 0.0101470947265625, "learning_rate": 4.93600044896063e-07, "loss": 0.0294, "reward": 0.8333333432674408, "reward_std": 0.331461064517498, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.5, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 651.8194580078125, "epoch": 0.5038560411311054, "grad_norm": 0.35510072112083435, "kl": 0.0114593505859375, "learning_rate": 4.904846243842949e-07, "loss": -0.0076, "reward": 0.9166666716337204, "reward_std": 0.2634196802973747, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.5, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 702.9583435058594, "epoch": 0.5055698371893744, "grad_norm": 0.37312352657318115, "kl": 0.01067352294921875, "learning_rate": 4.873721045679706e-07, "loss": -0.0084, "reward": 0.7916666865348816, "reward_std": 0.46386218070983887, "rewards/accuracy_reward": 0.15277778171002865, "rewards/format_reward": 0.486111119389534, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 737.9583511352539, "epoch": 0.5072836332476436, "grad_norm": 0.5573227405548096, "kl": 0.0097503662109375, "learning_rate": 4.842626371469149e-07, "loss": -0.0192, "reward": 1.034722238779068, "reward_std": 0.7432259321212769, "rewards/accuracy_reward": 0.27777778171002865, "rewards/format_reward": 0.4791666716337204, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 685.2639007568359, "epoch": 0.5089974293059126, "grad_norm": 0.5244731307029724, "kl": 0.00927734375, "learning_rate": 4.811563736721829e-07, "loss": -0.0399, "reward": 0.9305555671453476, "reward_std": 0.5026786401867867, "rewards/accuracy_reward": 0.2222222276031971, "rewards/format_reward": 0.486111119389534, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 633.9722290039062, "epoch": 0.5107112253641817, "grad_norm": 0.5318572521209717, "kl": 0.010101318359375, "learning_rate": 4.780534655386743e-07, "loss": 0.0125, "reward": 0.8402777761220932, "reward_std": 0.2809867858886719, "rewards/accuracy_reward": 0.18055555876344442, "rewards/format_reward": 0.4791666716337204, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 738.1944580078125, "epoch": 0.5124250214224507, "grad_norm": 0.4404042065143585, "kl": 0.0084686279296875, "learning_rate": 4.749540639777539e-07, "loss": 0.0031, "reward": 1.1250000149011612, "reward_std": 0.4589441120624542, "rewards/accuracy_reward": 0.31944445334374905, "rewards/format_reward": 0.486111119389534, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 715.8611145019531, "epoch": 0.5141388174807198, "grad_norm": 0.36858052015304565, "kl": 0.010406494140625, "learning_rate": 4.7185832004988133e-07, "loss": -0.0088, "reward": 0.7499999850988388, "reward_std": 0.33668187260627747, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.5, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 658.6111297607422, "epoch": 0.5158526135389888, "grad_norm": 0.44451257586479187, "kl": 0.00970458984375, "learning_rate": 4.68766384637248e-07, "loss": -0.0124, "reward": 0.8055555522441864, "reward_std": 0.2901904284954071, "rewards/accuracy_reward": 0.1527777798473835, "rewards/format_reward": 0.5, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 743.5555725097656, "epoch": 0.517566409597258, "grad_norm": 0.394045889377594, "kl": 0.00940704345703125, "learning_rate": 4.656784084364238e-07, "loss": 0.0071, "reward": 0.861111119389534, "reward_std": 0.3762567415833473, "rewards/accuracy_reward": 0.18055555783212185, "rewards/format_reward": 0.5, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 686.5833435058594, "epoch": 0.519280205655527, "grad_norm": 0.41484999656677246, "kl": 0.01177978515625, "learning_rate": 4.6259454195101267e-07, "loss": 0.0097, "reward": 0.6388888955116272, "reward_std": 0.22736987471580505, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.5, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 663.4027862548828, "epoch": 0.5209940017137961, "grad_norm": 0.6104622483253479, "kl": 0.011749267578125, "learning_rate": 4.59514935484316e-07, "loss": 0.0155, "reward": 0.826388880610466, "reward_std": 0.4845541790127754, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.493055559694767, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 673.0277862548828, "epoch": 0.5227077977720651, "grad_norm": 0.412063330411911, "kl": 0.010986328125, "learning_rate": 4.5643973913200837e-07, "loss": 0.0303, "reward": 0.9513889029622078, "reward_std": 0.3880129065364599, "rewards/accuracy_reward": 0.2361111119389534, "rewards/format_reward": 0.4791666716337204, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 701.0416717529297, "epoch": 0.5244215938303342, "grad_norm": 0.5593081116676331, "kl": 0.0098724365234375, "learning_rate": 4.5336910277482155e-07, "loss": 0.0378, "reward": 0.7986111044883728, "reward_std": 0.39326707273721695, "rewards/accuracy_reward": 0.1527777835726738, "rewards/format_reward": 0.493055559694767, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 751.9722290039062, "epoch": 0.5261353898886033, "grad_norm": 0.48807039856910706, "kl": 0.0091400146484375, "learning_rate": 4.503031760712397e-07, "loss": 0.0218, "reward": 1.1875000149011612, "reward_std": 0.6258577555418015, "rewards/accuracy_reward": 0.34722222946584225, "rewards/format_reward": 0.493055559694767, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 835.2916717529297, "epoch": 0.5278491859468724, "grad_norm": 0.4123370349407196, "kl": 0.010833740234375, "learning_rate": 4.4724210845020494e-07, "loss": -0.0059, "reward": 0.7777777910232544, "reward_std": 0.205099418759346, "rewards/accuracy_reward": 0.15277778077870607, "rewards/format_reward": 0.4722222313284874, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 705.0555572509766, "epoch": 0.5295629820051414, "grad_norm": 0.466584712266922, "kl": 0.00885009765625, "learning_rate": 4.441860491038345e-07, "loss": 0.0187, "reward": 0.8750000074505806, "reward_std": 0.3422360420227051, "rewards/accuracy_reward": 0.19444444496184587, "rewards/format_reward": 0.486111119389534, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 760.2639007568359, "epoch": 0.5312767780634104, "grad_norm": 0.33340954780578613, "kl": 0.00978851318359375, "learning_rate": 4.4113514698014953e-07, "loss": 0.0057, "reward": 0.7708333358168602, "reward_std": 0.17633881978690624, "rewards/accuracy_reward": 0.1388888917863369, "rewards/format_reward": 0.493055559694767, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 770.3194427490234, "epoch": 0.5329905741216795, "grad_norm": 0.34279051423072815, "kl": 0.0094146728515625, "learning_rate": 4.3808955077581546e-07, "loss": -0.0054, "reward": 0.6875000074505806, "reward_std": 0.25718431919813156, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.493055559694767, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 807.4861297607422, "epoch": 0.5347043701799485, "grad_norm": 0.42101454734802246, "kl": 0.007659912109375, "learning_rate": 4.350494089288943e-07, "loss": 0.0135, "reward": 0.9930555671453476, "reward_std": 0.49956031143665314, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.493055559694767, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 615.5277709960938, "epoch": 0.5364181662382177, "grad_norm": 0.5372198820114136, "kl": 0.0162506103515625, "learning_rate": 4.3201486961161093e-07, "loss": -0.0168, "reward": 1.1875000149011612, "reward_std": 0.4131338596343994, "rewards/accuracy_reward": 0.3472222276031971, "rewards/format_reward": 0.493055559694767, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 725.8750152587891, "epoch": 0.5381319622964867, "grad_norm": 0.4827311038970947, "kl": 0.00830078125, "learning_rate": 4.2898608072313045e-07, "loss": 0.0221, "reward": 1.0000000149011612, "reward_std": 0.5675767734646797, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.5, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 631.4444427490234, "epoch": 0.5398457583547558, "grad_norm": 0.32586589455604553, "kl": 0.0107269287109375, "learning_rate": 4.2596318988235037e-07, "loss": -0.011, "reward": 0.861111119389534, "reward_std": 0.22736985981464386, "rewards/accuracy_reward": 0.1805555634200573, "rewards/format_reward": 0.5, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 760.6944427490234, "epoch": 0.5415595544130248, "grad_norm": 0.36847296357154846, "kl": 0.0100860595703125, "learning_rate": 4.2294634442070553e-07, "loss": 0.0058, "reward": 0.8819444477558136, "reward_std": 0.3717171251773834, "rewards/accuracy_reward": 0.1944444477558136, "rewards/format_reward": 0.493055559694767, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 860.0138854980469, "epoch": 0.5432733504712939, "grad_norm": 0.33621668815612793, "kl": 0.0082855224609375, "learning_rate": 4.1993569137498776e-07, "loss": 0.0228, "reward": 0.8958333656191826, "reward_std": 0.3794733416289091, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.4791666716337204, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 591.1944580078125, "epoch": 0.5449871465295629, "grad_norm": 0.5173898935317993, "kl": 0.01264190673828125, "learning_rate": 4.1693137748017915e-07, "loss": -0.0109, "reward": 1.083333358168602, "reward_std": 0.4227481558918953, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.5, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 781.0694427490234, "epoch": 0.5467009425878321, "grad_norm": 0.4726315438747406, "kl": 0.01230621337890625, "learning_rate": 4.1393354916230005e-07, "loss": 0.0288, "reward": 0.6666666939854622, "reward_std": 0.27821177802979946, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.472222238779068, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 706.5139007568359, "epoch": 0.5484147386461011, "grad_norm": 0.4803582727909088, "kl": 0.0104217529296875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0114, "reward": 1.1666666865348816, "reward_std": 0.5012311488389969, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.5, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 652.2222137451172, "epoch": 0.5501285347043702, "grad_norm": 0.4137377440929413, "kl": 0.00899505615234375, "learning_rate": 4.079579333738039e-07, "loss": -0.0062, "reward": 0.680555559694767, "reward_std": 0.27419466339051723, "rewards/accuracy_reward": 0.09722222294658422, "rewards/format_reward": 0.486111119389534, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 661.7500152587891, "epoch": 0.5518423307626392, "grad_norm": 0.7754374146461487, "kl": 0.01078033447265625, "learning_rate": 4.0498043714627006e-07, "loss": 0.0696, "reward": 0.7083333283662796, "reward_std": 0.4283023327589035, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.486111119389534, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 747.7777709960938, "epoch": 0.5535561268209083, "grad_norm": 0.23274828493595123, "kl": 0.01306915283203125, "learning_rate": 4.020100089676376e-07, "loss": 0.0218, "reward": 0.8541666641831398, "reward_std": 0.08505172841250896, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.493055559694767, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 794.263916015625, "epoch": 0.5552699228791774, "grad_norm": 0.5167786478996277, "kl": 0.0105438232421875, "learning_rate": 3.9904679361238526e-07, "loss": 0.0041, "reward": 1.0347222089767456, "reward_std": 0.5008072182536125, "rewards/accuracy_reward": 0.2777777835726738, "rewards/format_reward": 0.4791666641831398, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 756.1250305175781, "epoch": 0.5569837189374465, "grad_norm": 0.42761853337287903, "kl": 0.01197052001953125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0332, "reward": 0.7152777761220932, "reward_std": 0.4112919941544533, "rewards/accuracy_reward": 0.11111111380159855, "rewards/format_reward": 0.493055559694767, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 678.4583358764648, "epoch": 0.5586975149957155, "grad_norm": 0.6914120316505432, "kl": 0.01470947265625, "learning_rate": 3.931425787051832e-07, "loss": 0.0291, "reward": 1.0763888955116272, "reward_std": 0.43159355968236923, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.493055559694767, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 575.4722290039062, "epoch": 0.5604113110539846, "grad_norm": 0.608383059501648, "kl": 0.0129547119140625, "learning_rate": 3.902018669163384e-07, "loss": 0.0089, "reward": 1.0486111044883728, "reward_std": 0.46130846440792084, "rewards/accuracy_reward": 0.2777777807787061, "rewards/format_reward": 0.493055559694767, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 621.3055725097656, "epoch": 0.5621251071122536, "grad_norm": 0.9387192130088806, "kl": 0.0234832763671875, "learning_rate": 3.872689434630585e-07, "loss": 0.0096, "reward": 0.8263888955116272, "reward_std": 0.32522570341825485, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.493055559694767, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 718.8333587646484, "epoch": 0.5638389031705227, "grad_norm": 0.36304396390914917, "kl": 0.00838470458984375, "learning_rate": 3.843439512918949e-07, "loss": -0.0051, "reward": 0.75, "reward_std": 0.33668185770511627, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.5, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 838.6250152587891, "epoch": 0.5655526992287918, "grad_norm": 0.4335583746433258, "kl": 0.0093231201171875, "learning_rate": 3.8142703296283953e-07, "loss": 0.0504, "reward": 0.8958333358168602, "reward_std": 0.2658967934548855, "rewards/accuracy_reward": 0.2083333320915699, "rewards/format_reward": 0.4791666641831398, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 664.0694427490234, "epoch": 0.5672664952870609, "grad_norm": 0.44489485025405884, "kl": 0.0103607177734375, "learning_rate": 3.785183306423767e-07, "loss": -0.0029, "reward": 1.1944444328546524, "reward_std": 0.3762567266821861, "rewards/accuracy_reward": 0.3472222276031971, "rewards/format_reward": 0.5, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 668.3055725097656, "epoch": 0.5689802913453299, "grad_norm": 0.22135576605796814, "kl": 0.01212310791015625, "learning_rate": 3.7561798609655373e-07, "loss": 0.0198, "reward": 0.8541666641831398, "reward_std": 0.19436372630298138, "rewards/accuracy_reward": 0.180555559694767, "rewards/format_reward": 0.493055559694767, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 756.5416717529297, "epoch": 0.570694087403599, "grad_norm": 0.3432075083255768, "kl": 0.0111541748046875, "learning_rate": 3.72726140684072e-07, "loss": -0.0046, "reward": 0.6597222164273262, "reward_std": 0.017010344192385674, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.493055559694767, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 798.2083435058594, "epoch": 0.572407883461868, "grad_norm": 0.3384978473186493, "kl": 0.00952911376953125, "learning_rate": 3.6984293534939737e-07, "loss": 0.0133, "reward": 0.8750000298023224, "reward_std": 0.4249234274029732, "rewards/accuracy_reward": 0.19444444589316845, "rewards/format_reward": 0.486111119389534, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 641.7222290039062, "epoch": 0.5741216795201372, "grad_norm": 0.41000911593437195, "kl": 0.0142669677734375, "learning_rate": 3.6696851061588994e-07, "loss": 0.0868, "reward": 0.9583333283662796, "reward_std": 0.3291585296392441, "rewards/accuracy_reward": 0.23611110914498568, "rewards/format_reward": 0.486111119389534, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 671.9583358764648, "epoch": 0.5758354755784062, "grad_norm": 0.39112672209739685, "kl": 0.009521484375, "learning_rate": 3.641030065789562e-07, "loss": -0.0072, "reward": 0.7152777835726738, "reward_std": 0.26240511797368526, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.493055559694767, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 706.9722366333008, "epoch": 0.5775492716366752, "grad_norm": 0.48448535799980164, "kl": 0.00994873046875, "learning_rate": 3.612465628992203e-07, "loss": -0.0094, "reward": 0.9652777910232544, "reward_std": 0.332209013402462, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.4652777835726738, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 729.3333282470703, "epoch": 0.5792630676949443, "grad_norm": 0.5280615091323853, "kl": 0.012420654296875, "learning_rate": 3.5839931879571725e-07, "loss": -0.0209, "reward": 0.965277798473835, "reward_std": 0.5258247926831245, "rewards/accuracy_reward": 0.23611112125217915, "rewards/format_reward": 0.493055559694767, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 779.4166717529297, "epoch": 0.5809768637532133, "grad_norm": 0.32812824845314026, "kl": 0.0185089111328125, "learning_rate": 3.555614130391079e-07, "loss": 0.0169, "reward": 0.6666666716337204, "reward_std": 0.25819889456033707, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.5, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 616.9444351196289, "epoch": 0.5826906598114824, "grad_norm": 0.5442182421684265, "kl": 0.014892578125, "learning_rate": 3.5273298394491515e-07, "loss": -0.0563, "reward": 0.8055555522441864, "reward_std": 0.44179464131593704, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.4722222238779068, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 625.2639007568359, "epoch": 0.5844044558697515, "grad_norm": 0.08322001248598099, "kl": 0.01381683349609375, "learning_rate": 3.4991416936678276e-07, "loss": 0.0005, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.5, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 662.7639007568359, "epoch": 0.5861182519280206, "grad_norm": 0.42785269021987915, "kl": 0.00940704345703125, "learning_rate": 3.471051066897562e-07, "loss": -0.0295, "reward": 0.861111119389534, "reward_std": 0.47276464104652405, "rewards/accuracy_reward": 0.18055556062608957, "rewards/format_reward": 0.5, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 588.5138854980469, "epoch": 0.5878320479862896, "grad_norm": 0.28376224637031555, "kl": 0.0150909423828125, "learning_rate": 3.4430593282358777e-07, "loss": 0.0035, "reward": 1.2430555373430252, "reward_std": 0.24696609377861023, "rewards/accuracy_reward": 0.3750000037252903, "rewards/format_reward": 0.493055559694767, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 748.7083282470703, "epoch": 0.5895458440445587, "grad_norm": 0.47153374552726746, "kl": 0.0116729736328125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0016, "reward": 0.6527777686715126, "reward_std": 0.33054604940116405, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.486111119389534, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 730.5972290039062, "epoch": 0.5912596401028277, "grad_norm": 0.45074042677879333, "kl": 0.0081787109375, "learning_rate": 3.387377967463493e-07, "loss": -0.0155, "reward": 1.3472222238779068, "reward_std": 0.6584814712405205, "rewards/accuracy_reward": 0.4305555634200573, "rewards/format_reward": 0.4861111119389534, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 765.8611297607422, "epoch": 0.5929734361610969, "grad_norm": 0.38273346424102783, "kl": 0.00989532470703125, "learning_rate": 3.359691059183761e-07, "loss": 0.0365, "reward": 0.9166666716337204, "reward_std": 0.3995024487376213, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.5, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 656.5416717529297, "epoch": 0.5946872322193659, "grad_norm": 0.42742764949798584, "kl": 0.01055908203125, "learning_rate": 3.3321084665422803e-07, "loss": 0.0121, "reward": 0.9930555671453476, "reward_std": 0.34847141802310944, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.493055559694767, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 760.1111297607422, "epoch": 0.596401028277635, "grad_norm": 0.2555226981639862, "kl": 0.0147552490234375, "learning_rate": 3.3046315338757026e-07, "loss": 0.0076, "reward": 0.6111111044883728, "reward_std": 0.15932847559452057, "rewards/accuracy_reward": 0.055555556900799274, "rewards/format_reward": 0.5, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 742.7777862548828, "epoch": 0.598114824335904, "grad_norm": 0.3386673033237457, "kl": 0.01026153564453125, "learning_rate": 3.2772616003709616e-07, "loss": 0.01, "reward": 0.9930555820465088, "reward_std": 0.32522569596767426, "rewards/accuracy_reward": 0.2500000027939677, "rewards/format_reward": 0.493055559694767, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 694.3333435058594, "epoch": 0.5998286203941731, "grad_norm": 0.4300551116466522, "kl": 0.01024627685546875, "learning_rate": 3.250000000000001e-07, "loss": -0.0217, "reward": 1.048611119389534, "reward_std": 0.4845541790127754, "rewards/accuracy_reward": 0.27777778171002865, "rewards/format_reward": 0.493055559694767, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 710.8611145019531, "epoch": 0.6015424164524421, "grad_norm": 0.4361790716648102, "kl": 0.01013946533203125, "learning_rate": 3.222848061454764e-07, "loss": -0.0161, "reward": 0.7430555671453476, "reward_std": 0.3072007894515991, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.493055559694767, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 867.2222137451172, "epoch": 0.6032562125107113, "grad_norm": 0.41655802726745605, "kl": 0.0092926025390625, "learning_rate": 3.195807108082429e-07, "loss": 0.0058, "reward": 0.7777777910232544, "reward_std": 0.4547397345304489, "rewards/accuracy_reward": 0.13888888992369175, "rewards/format_reward": 0.5, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 808.7639007568359, "epoch": 0.6049700085689803, "grad_norm": 0.32335948944091797, "kl": 0.009429931640625, "learning_rate": 3.168878457820915e-07, "loss": 0.0072, "reward": 1.0277777761220932, "reward_std": 0.33668188750743866, "rewards/accuracy_reward": 0.26388889364898205, "rewards/format_reward": 0.5, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 568.3610992431641, "epoch": 0.6066838046272494, "grad_norm": 0.747093915939331, "kl": 0.011749267578125, "learning_rate": 3.142063423134644e-07, "loss": 0.0159, "reward": 1.4097222536802292, "reward_std": 0.6619075667113066, "rewards/accuracy_reward": 0.4583333395421505, "rewards/format_reward": 0.493055559694767, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 693.5972290039062, "epoch": 0.6083976006855184, "grad_norm": 0.44648948311805725, "kl": 0.01004791259765625, "learning_rate": 3.115363310950578e-07, "loss": 0.0044, "reward": 0.9027778059244156, "reward_std": 0.24970055185258389, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.486111119389534, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 603.2638931274414, "epoch": 0.6101113967437874, "grad_norm": 0.484390527009964, "kl": 0.012542724609375, "learning_rate": 3.0887794225945143e-07, "loss": -0.0051, "reward": 0.6249999925494194, "reward_std": 0.3125211279839277, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.486111119389534, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 628.0138931274414, "epoch": 0.6118251928020566, "grad_norm": 0.5177603363990784, "kl": 0.0098114013671875, "learning_rate": 3.062313053727671e-07, "loss": 0.0262, "reward": 1.0833333283662796, "reward_std": 0.4495188891887665, "rewards/accuracy_reward": 0.2916666669771075, "rewards/format_reward": 0.5, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 677.9444580078125, "epoch": 0.6135389888603257, "grad_norm": 0.5876026153564453, "kl": 0.0117034912109375, "learning_rate": 3.0359654942835247e-07, "loss": 0.0255, "reward": 0.722222238779068, "reward_std": 0.45503970980644226, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.4722222313284874, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 610.0972290039062, "epoch": 0.6152527849185947, "grad_norm": 0.44811731576919556, "kl": 0.0104827880859375, "learning_rate": 3.0097380284049523e-07, "loss": 0.0011, "reward": 0.9027777761220932, "reward_std": 0.2613905444741249, "rewards/accuracy_reward": 0.20833333302289248, "rewards/format_reward": 0.4861111119389534, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 682.4583435058594, "epoch": 0.6169665809768637, "grad_norm": 0.5113480687141418, "kl": 0.0124053955078125, "learning_rate": 2.9836319343816397e-07, "loss": -0.0284, "reward": 0.826388880610466, "reward_std": 0.44850434362888336, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.493055559694767, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 613.2639007568359, "epoch": 0.6186803770351328, "grad_norm": 0.6223089098930359, "kl": 0.0121612548828125, "learning_rate": 2.9576484845877793e-07, "loss": 0.0238, "reward": 1.0277777910232544, "reward_std": 0.6448972225189209, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.5, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 640.7083282470703, "epoch": 0.6203941730934018, "grad_norm": 0.5173635482788086, "kl": 0.00988006591796875, "learning_rate": 2.931788945420058e-07, "loss": 0.0152, "reward": 0.7708333283662796, "reward_std": 0.4405169114470482, "rewards/accuracy_reward": 0.13888889364898205, "rewards/format_reward": 0.493055559694767, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 723.5555725097656, "epoch": 0.622107969151671, "grad_norm": 0.49564608931541443, "kl": 0.0084381103515625, "learning_rate": 2.9060545772359305e-07, "loss": -0.0065, "reward": 1.0138888955116272, "reward_std": 0.3242111261934042, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.486111119389534, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 653.4722290039062, "epoch": 0.62382176520994, "grad_norm": 0.39930129051208496, "kl": 0.01010894775390625, "learning_rate": 2.8804466342921987e-07, "loss": -0.0025, "reward": 0.9444444328546524, "reward_std": 0.38147754967212677, "rewards/accuracy_reward": 0.22222222946584225, "rewards/format_reward": 0.5, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 837.6944732666016, "epoch": 0.6255355612682091, "grad_norm": 0.3432636260986328, "kl": 0.00820159912109375, "learning_rate": 2.854966364683872e-07, "loss": 0.1109, "reward": 0.7986111268401146, "reward_std": 0.36156320944428444, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.4652777835726738, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 699.8750152587891, "epoch": 0.6272493573264781, "grad_norm": 0.3920920789241791, "kl": 0.00817108154296875, "learning_rate": 2.829615010283344e-07, "loss": -0.0084, "reward": 0.8263888955116272, "reward_std": 0.153093121945858, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.493055559694767, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 733.0416717529297, "epoch": 0.6289631533847472, "grad_norm": 0.6963837146759033, "kl": 0.01009368896484375, "learning_rate": 2.8043938066798645e-07, "loss": 0.1349, "reward": 1.2152777910232544, "reward_std": 0.6726825386285782, "rewards/accuracy_reward": 0.3611111156642437, "rewards/format_reward": 0.493055559694767, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 789.3610992431641, "epoch": 0.6306769494430163, "grad_norm": 0.48074987530708313, "kl": 0.00942230224609375, "learning_rate": 2.7793039831193133e-07, "loss": 0.1273, "reward": 0.7777777835726738, "reward_std": 0.36897341534495354, "rewards/accuracy_reward": 0.15277778450399637, "rewards/format_reward": 0.4722222238779068, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 684.4861297607422, "epoch": 0.6323907455012854, "grad_norm": 0.5865346789360046, "kl": 0.0122222900390625, "learning_rate": 2.7543467624442956e-07, "loss": 0.0003, "reward": 1.0277777910232544, "reward_std": 0.3762567415833473, "rewards/accuracy_reward": 0.26388888992369175, "rewards/format_reward": 0.5, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 657.6528015136719, "epoch": 0.6341045415595544, "grad_norm": 0.5562211871147156, "kl": 0.0120849609375, "learning_rate": 2.729523361034538e-07, "loss": 0.0188, "reward": 1.1388888955116272, "reward_std": 0.3075363263487816, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.4722222238779068, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 580.1111297607422, "epoch": 0.6358183376178235, "grad_norm": 0.35350364446640015, "kl": 0.0099639892578125, "learning_rate": 2.7048349887476037e-07, "loss": -0.007, "reward": 1.0902777761220932, "reward_std": 0.3765922859311104, "rewards/accuracy_reward": 0.3055555550381541, "rewards/format_reward": 0.4791666641831398, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 718.4861297607422, "epoch": 0.6375321336760925, "grad_norm": 0.3661295771598816, "kl": 0.00765228271484375, "learning_rate": 2.6802828488599294e-07, "loss": 0.0188, "reward": 0.826388880610466, "reward_std": 0.4112919941544533, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.493055559694767, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 756.375, "epoch": 0.6392459297343616, "grad_norm": 0.3112243115901947, "kl": 0.01010894775390625, "learning_rate": 2.655868138008171e-07, "loss": -0.0013, "reward": 0.8611111044883728, "reward_std": 0.24017397314310074, "rewards/accuracy_reward": 0.18055555876344442, "rewards/format_reward": 0.5, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 813.3194427490234, "epoch": 0.6409597257926307, "grad_norm": 0.3174319863319397, "kl": 0.007965087890625, "learning_rate": 2.631592046130896e-07, "loss": -0.0123, "reward": 1.0833333432674408, "reward_std": 0.2901904284954071, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.5, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 705.0972290039062, "epoch": 0.6426735218508998, "grad_norm": 0.46651625633239746, "kl": 0.0082550048828125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0328, "reward": 1.0486111044883728, "reward_std": 0.6196977943181992, "rewards/accuracy_reward": 0.27777778171002865, "rewards/format_reward": 0.493055559694767, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 642.0972290039062, "epoch": 0.6443873179091688, "grad_norm": 0.5062349438667297, "kl": 0.0101165771484375, "learning_rate": 2.583460445215911e-07, "loss": -0.0277, "reward": 0.9861111342906952, "reward_std": 0.3804878890514374, "rewards/accuracy_reward": 0.25000000931322575, "rewards/format_reward": 0.4861111119389534, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 701.2361145019531, "epoch": 0.6461011139674379, "grad_norm": 0.4397139549255371, "kl": 0.008392333984375, "learning_rate": 2.5596072820445254e-07, "loss": -0.0174, "reward": 1.0277777910232544, "reward_std": 0.47276463359594345, "rewards/accuracy_reward": 0.26388890016824007, "rewards/format_reward": 0.5, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 680.9861145019531, "epoch": 0.6478149100257069, "grad_norm": 0.20873984694480896, "kl": 0.0089569091796875, "learning_rate": 2.5358974294659373e-07, "loss": -0.0031, "reward": 0.5555555522441864, "reward_std": 0.13608276098966599, "rewards/accuracy_reward": 0.02777777798473835, "rewards/format_reward": 0.5, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 640.6666717529297, "epoch": 0.6495287060839761, "grad_norm": 0.4596330225467682, "kl": 0.0101165771484375, "learning_rate": 2.512332043064913e-07, "loss": -0.009, "reward": 0.7708333432674408, "reward_std": 0.31242159754037857, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.493055559694767, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 522.8194427490234, "epoch": 0.6512425021422451, "grad_norm": 0.3971538543701172, "kl": 0.011444091796875, "learning_rate": 2.488912271385139e-07, "loss": 0.0003, "reward": 0.972222238779068, "reward_std": 0.4123065695166588, "rewards/accuracy_reward": 0.23611111380159855, "rewards/format_reward": 0.5, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 829.1250152587891, "epoch": 0.6529562982005142, "grad_norm": 0.2756218910217285, "kl": 0.0085296630859375, "learning_rate": 2.465639255873246e-07, "loss": 0.0465, "reward": 0.5624999925494194, "reward_std": 0.23096106760203838, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.4791666716337204, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 635.0833435058594, "epoch": 0.6546700942587832, "grad_norm": 0.42576533555984497, "kl": 0.00997161865234375, "learning_rate": 2.4425141308231765e-07, "loss": -0.0305, "reward": 1.0000000149011612, "reward_std": 0.39428164809942245, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.5, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 768.0277862548828, "epoch": 0.6563838903170522, "grad_norm": 0.4228789508342743, "kl": 0.010772705078125, "learning_rate": 2.4195380233209006e-07, "loss": 0.0427, "reward": 0.7361110970377922, "reward_std": 0.36172348074615, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.4861111119389534, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 730.5555572509766, "epoch": 0.6580976863753213, "grad_norm": 0.25273385643959045, "kl": 0.0084991455078125, "learning_rate": 2.3967120531894857e-07, "loss": -0.0127, "reward": 0.6388888955116272, "reward_std": 0.15410767495632172, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.5, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 778.0833587646484, "epoch": 0.6598114824335904, "grad_norm": 0.39399486780166626, "kl": 0.01232147216796875, "learning_rate": 2.374037332934512e-07, "loss": 0.0025, "reward": 0.7083333432674408, "reward_std": 0.26772547513246536, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.486111119389534, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 661.8750152587891, "epoch": 0.6615252784918595, "grad_norm": 0.35121214389801025, "kl": 0.01195526123046875, "learning_rate": 2.3515149676898552e-07, "loss": -0.0003, "reward": 0.6111111044883728, "reward_std": 0.222149059176445, "rewards/accuracy_reward": 0.0555555559694767, "rewards/format_reward": 0.5, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 646.2222366333008, "epoch": 0.6632390745501285, "grad_norm": 0.3854869604110718, "kl": 0.01076507568359375, "learning_rate": 2.3291460551638237e-07, "loss": -0.016, "reward": 0.8611110895872116, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.18055556062608957, "rewards/format_reward": 0.5, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 707.0277786254883, "epoch": 0.6649528706083976, "grad_norm": 0.6442806720733643, "kl": 0.017303466796875, "learning_rate": 2.306931685585657e-07, "loss": -0.0025, "reward": 0.8611111044883728, "reward_std": 0.5538673847913742, "rewards/accuracy_reward": 0.19444445054978132, "rewards/format_reward": 0.4722222238779068, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 647.8611145019531, "epoch": 0.6666666666666666, "grad_norm": 0.5503394603729248, "kl": 0.0092926025390625, "learning_rate": 2.2848729416523859e-07, "loss": 0.0202, "reward": 0.9861111119389534, "reward_std": 0.5015645399689674, "rewards/accuracy_reward": 0.25000000186264515, "rewards/format_reward": 0.4861111119389534, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 731.2638854980469, "epoch": 0.6683804627249358, "grad_norm": 0.44946956634521484, "kl": 0.00981903076171875, "learning_rate": 2.2629708984760706e-07, "loss": 0.1489, "reward": 0.9583333432674408, "reward_std": 0.35901258140802383, "rewards/accuracy_reward": 0.236111119389534, "rewards/format_reward": 0.486111119389534, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 808.2083435058594, "epoch": 0.6700942587832048, "grad_norm": 0.4584062695503235, "kl": 0.00930023193359375, "learning_rate": 2.2412266235313973e-07, "loss": -0.0126, "reward": 1.020833358168602, "reward_std": 0.5443559736013412, "rewards/accuracy_reward": 0.2638888917863369, "rewards/format_reward": 0.493055559694767, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 662.8055572509766, "epoch": 0.6718080548414739, "grad_norm": 0.5026288628578186, "kl": 0.01027679443359375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0043, "reward": 0.7222222238779068, "reward_std": 0.25819889456033707, "rewards/accuracy_reward": 0.1111111119389534, "rewards/format_reward": 0.5, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 827.4861145019531, "epoch": 0.6735218508997429, "grad_norm": 0.4576079845428467, "kl": 0.01000213623046875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0398, "reward": 0.7708333134651184, "reward_std": 0.31242159754037857, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.493055559694767, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 632.625, "epoch": 0.675235646958012, "grad_norm": 0.46405985951423645, "kl": 0.010711669921875, "learning_rate": 2.1769509671835223e-07, "loss": -0.0166, "reward": 0.7569444477558136, "reward_std": 0.36649633944034576, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.5069444477558136, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 740.8055572509766, "epoch": 0.676949443016281, "grad_norm": 0.33941328525543213, "kl": 0.0098724365234375, "learning_rate": 2.1558482853517253e-07, "loss": 0.0659, "reward": 0.743055559694767, "reward_std": 0.24438020400702953, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.493055559694767, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 810.3055725097656, "epoch": 0.6786632390745502, "grad_norm": 0.3110896348953247, "kl": 0.00872802734375, "learning_rate": 2.134908592756607e-07, "loss": 0.0229, "reward": 0.819444440305233, "reward_std": 0.17010344192385674, "rewards/accuracy_reward": 0.16666667256504297, "rewards/format_reward": 0.4861111119389534, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 735.9027862548828, "epoch": 0.6803770351328192, "grad_norm": 0.8936165571212769, "kl": 0.01611328125, "learning_rate": 2.1141329099692406e-07, "loss": -0.0202, "reward": 0.7013889029622078, "reward_std": 0.26149011217057705, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.479166679084301, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 689.8472290039062, "epoch": 0.6820908311910883, "grad_norm": 0.23036593198776245, "kl": 0.00927734375, "learning_rate": 2.0935222495670968e-07, "loss": 0.0021, "reward": 0.8055555671453476, "reward_std": 0.1773533970117569, "rewards/accuracy_reward": 0.1527777798473835, "rewards/format_reward": 0.5, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 606.8750076293945, "epoch": 0.6838046272493573, "grad_norm": 0.5848769545555115, "kl": 0.009002685546875, "learning_rate": 2.0730776160846853e-07, "loss": 0.0063, "reward": 1.0208333432674408, "reward_std": 0.5025790855288506, "rewards/accuracy_reward": 0.2638888955116272, "rewards/format_reward": 0.493055559694767, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 822.8333435058594, "epoch": 0.6855184233076264, "grad_norm": 0.5785830020904541, "kl": 0.00983428955078125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0642, "reward": 0.8750000298023224, "reward_std": 0.6504513919353485, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.486111119389534, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 748.5138854980469, "epoch": 0.6872322193658955, "grad_norm": 0.3859395980834961, "kl": 0.009033203125, "learning_rate": 2.032690407508949e-07, "loss": -0.0078, "reward": 0.8750000149011612, "reward_std": 0.3910152539610863, "rewards/accuracy_reward": 0.19444444589316845, "rewards/format_reward": 0.4861111119389534, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 705.9583435058594, "epoch": 0.6889460154241646, "grad_norm": 0.46305710077285767, "kl": 0.009429931640625, "learning_rate": 2.0127498008311922e-07, "loss": 0.0405, "reward": 0.659722238779068, "reward_std": 0.3002174627035856, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.4652777835726738, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 689.0972290039062, "epoch": 0.6906598114824336, "grad_norm": 0.30027008056640625, "kl": 0.01239013671875, "learning_rate": 1.9929791578083655e-07, "loss": 0.0017, "reward": 0.986111119389534, "reward_std": 0.3444380611181259, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.486111119389534, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 593.8333282470703, "epoch": 0.6923736075407027, "grad_norm": 0.0383455790579319, "kl": 0.00807952880859375, "learning_rate": 1.9733794420337213e-07, "loss": 0.0003, "reward": 0.6666666567325592, "reward_std": 0.0, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.5, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 667.8194427490234, "epoch": 0.6940874035989717, "grad_norm": 0.36720189452171326, "kl": 0.00794219970703125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0015, "reward": 1.1944444626569748, "reward_std": 0.3762567341327667, "rewards/accuracy_reward": 0.3472222276031971, "rewards/format_reward": 0.5, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 609.4305725097656, "epoch": 0.6958011996572407, "grad_norm": 0.5361925959587097, "kl": 0.00843048095703125, "learning_rate": 1.934696604901642e-07, "loss": 0.0433, "reward": 0.9166666716337204, "reward_std": 0.3995024636387825, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.5, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 680.8055725097656, "epoch": 0.6975149957155099, "grad_norm": 0.3511025905609131, "kl": 0.010101318359375, "learning_rate": 1.915615368891117e-07, "loss": -0.0073, "reward": 1.0138888955116272, "reward_std": 0.40500660240650177, "rewards/accuracy_reward": 0.2638888917863369, "rewards/format_reward": 0.486111119389534, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 488.3472213745117, "epoch": 0.699228791773779, "grad_norm": 0.56728196144104, "kl": 0.0164031982421875, "learning_rate": 1.8967088307307e-07, "loss": -0.0141, "reward": 1.3819444477558136, "reward_std": 0.46130847185850143, "rewards/accuracy_reward": 0.44444444961845875, "rewards/format_reward": 0.493055559694767, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 706.125, "epoch": 0.700942587832048, "grad_norm": 0.5061081647872925, "kl": 0.0098876953125, "learning_rate": 1.8779779118983867e-07, "loss": -0.0202, "reward": 0.8958333358168602, "reward_std": 0.524095680564642, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.4791666641831398, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 616.6944580078125, "epoch": 0.702656383890317, "grad_norm": 0.4515543282032013, "kl": 0.0114288330078125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0007, "reward": 0.7430555745959282, "reward_std": 0.33044650219380856, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.493055559694767, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 776.875, "epoch": 0.7043701799485861, "grad_norm": 0.4085945785045624, "kl": 0.0094146728515625, "learning_rate": 1.8410465752883758e-07, "loss": 0.0977, "reward": 0.8402777835726738, "reward_std": 0.2934070285409689, "rewards/accuracy_reward": 0.18055555690079927, "rewards/format_reward": 0.479166679084301, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 741.5000152587891, "epoch": 0.7060839760068551, "grad_norm": 0.4242144823074341, "kl": 0.010284423828125, "learning_rate": 1.822847957491922e-07, "loss": 0.0346, "reward": 0.5624999925494194, "reward_std": 0.25515517219901085, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.4791666716337204, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 657.5694580078125, "epoch": 0.7077977720651243, "grad_norm": 0.48833325505256653, "kl": 0.0101318359375, "learning_rate": 1.804828558898332e-07, "loss": 0.0055, "reward": 0.7777777761220932, "reward_std": 0.40472327172756195, "rewards/accuracy_reward": 0.13888889085501432, "rewards/format_reward": 0.5, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 751.5833587646484, "epoch": 0.7095115681233933, "grad_norm": 0.49302709102630615, "kl": 0.00957489013671875, "learning_rate": 1.7869892577476722e-07, "loss": 0.069, "reward": 0.6458333432674408, "reward_std": 0.28473581932485104, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.479166679084301, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 776.7916870117188, "epoch": 0.7112253641816624, "grad_norm": 0.568556010723114, "kl": 0.0110321044921875, "learning_rate": 1.7693309235023127e-07, "loss": -0.042, "reward": 0.6875, "reward_std": 0.2086303625255823, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.4652777835726738, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 628.6527862548828, "epoch": 0.7129391602399314, "grad_norm": 0.33500367403030396, "kl": 0.0119171142578125, "learning_rate": 1.7518544168045524e-07, "loss": -0.0129, "reward": 0.9444444626569748, "reward_std": 0.2453947812318802, "rewards/accuracy_reward": 0.2222222276031971, "rewards/format_reward": 0.5, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 682.6250076293945, "epoch": 0.7146529562982005, "grad_norm": 0.42712926864624023, "kl": 0.010406494140625, "learning_rate": 1.7345605894346726e-07, "loss": -0.0008, "reward": 0.7152777910232544, "reward_std": 0.4368143603205681, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.493055559694767, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 570.2222366333008, "epoch": 0.7163667523564696, "grad_norm": 0.47915688157081604, "kl": 0.0113983154296875, "learning_rate": 1.7174502842694212e-07, "loss": 0.0123, "reward": 0.5694444477558136, "reward_std": 0.12530778720974922, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.486111119389534, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 574.3750152587891, "epoch": 0.7180805484147387, "grad_norm": 0.43631237745285034, "kl": 0.01261138916015625, "learning_rate": 1.7005243352409333e-07, "loss": 0.0184, "reward": 0.8263888955116272, "reward_std": 0.35694384574890137, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.493055559694767, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 711.5833358764648, "epoch": 0.7197943444730077, "grad_norm": 0.3256857693195343, "kl": 0.00872039794921875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0162, "reward": 0.7361111268401146, "reward_std": 0.26139055751264095, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.486111119389534, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 578.9722137451172, "epoch": 0.7215081405312768, "grad_norm": 0.5653530359268188, "kl": 0.0140533447265625, "learning_rate": 1.6672287963562852e-07, "loss": 0.0693, "reward": 1.0208333432674408, "reward_std": 0.4548392668366432, "rewards/accuracy_reward": 0.2638888927176595, "rewards/format_reward": 0.493055559694767, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 655.0555572509766, "epoch": 0.7232219365895458, "grad_norm": 0.47506195306777954, "kl": 0.00907135009765625, "learning_rate": 1.6508608292777203e-07, "loss": 0.0037, "reward": 0.6597222238779068, "reward_std": 0.23915940523147583, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.493055559694767, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 802.1111297607422, "epoch": 0.7249357326478149, "grad_norm": 0.41230887174606323, "kl": 0.01018524169921875, "learning_rate": 1.6346804638120098e-07, "loss": 0.0106, "reward": 0.9652777761220932, "reward_std": 0.41896694898605347, "rewards/accuracy_reward": 0.23611111752688885, "rewards/format_reward": 0.493055559694767, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 806.0416717529297, "epoch": 0.726649528706084, "grad_norm": 0.41215652227401733, "kl": 0.0081787109375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0068, "reward": 0.784722238779068, "reward_std": 0.359107568860054, "rewards/accuracy_reward": 0.15277778077870607, "rewards/format_reward": 0.4791666716337204, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 700.6111221313477, "epoch": 0.7283633247643531, "grad_norm": 0.6025084257125854, "kl": 0.0133056640625, "learning_rate": 1.6028856829700258e-07, "loss": -0.0072, "reward": 1.0972222238779068, "reward_std": 0.6032442003488541, "rewards/accuracy_reward": 0.3055555634200573, "rewards/format_reward": 0.486111119389534, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 503.37500762939453, "epoch": 0.7300771208226221, "grad_norm": 0.6861910223960876, "kl": 0.0125579833984375, "learning_rate": 1.5872728172265146e-07, "loss": -0.0144, "reward": 1.0555555671453476, "reward_std": 0.5035936608910561, "rewards/accuracy_reward": 0.2777777798473835, "rewards/format_reward": 0.5, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 672.6805572509766, "epoch": 0.7317909168808912, "grad_norm": 0.5222618579864502, "kl": 0.01123046875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0373, "reward": 0.7291666716337204, "reward_std": 0.34122148901224136, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.479166679084301, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 678.2916793823242, "epoch": 0.7335047129391602, "grad_norm": 0.510343074798584, "kl": 0.01007843017578125, "learning_rate": 1.5566199398026147e-07, "loss": 0.0694, "reward": 1.0277777761220932, "reward_std": 0.6949137225747108, "rewards/accuracy_reward": 0.26388889644294977, "rewards/format_reward": 0.5, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 694.0277709960938, "epoch": 0.7352185089974294, "grad_norm": 0.5112901329994202, "kl": 0.010711669921875, "learning_rate": 1.5415814221002265e-07, "loss": -0.0032, "reward": 0.770833320915699, "reward_std": 0.4112919941544533, "rewards/accuracy_reward": 0.13888889271765947, "rewards/format_reward": 0.493055559694767, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 779.0555572509766, "epoch": 0.7369323050556984, "grad_norm": 0.40487441420555115, "kl": 0.00792694091796875, "learning_rate": 1.5267358321348285e-07, "loss": -0.007, "reward": 0.8541666716337204, "reward_std": 0.3304464966058731, "rewards/accuracy_reward": 0.18055555690079927, "rewards/format_reward": 0.493055559694767, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 679.5278015136719, "epoch": 0.7386461011139674, "grad_norm": 0.5640285015106201, "kl": 0.0105438232421875, "learning_rate": 1.5120838934595337e-07, "loss": 0.016, "reward": 0.8750000149011612, "reward_std": 0.31113363057374954, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.486111119389534, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 642.7639007568359, "epoch": 0.7403598971722365, "grad_norm": 0.5031344890594482, "kl": 0.011260986328125, "learning_rate": 1.4976263201891613e-07, "loss": 0.0137, "reward": 0.986111119389534, "reward_std": 0.4052800089120865, "rewards/accuracy_reward": 0.2500000046566129, "rewards/format_reward": 0.486111119389534, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 764.3055572509766, "epoch": 0.7420736932305055, "grad_norm": 0.30649498105049133, "kl": 0.00821685791015625, "learning_rate": 1.483363816965435e-07, "loss": 0.0368, "reward": 0.9027778059244156, "reward_std": 0.28722215443849564, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.486111119389534, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 690.9027709960938, "epoch": 0.7437874892887746, "grad_norm": 0.3682223856449127, "kl": 0.00897216796875, "learning_rate": 1.469297078922642e-07, "loss": -0.0162, "reward": 0.6874999925494194, "reward_std": 0.26762592047452927, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.493055559694767, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 688.2777862548828, "epoch": 0.7455012853470437, "grad_norm": 0.5453760027885437, "kl": 0.0087127685546875, "learning_rate": 1.4554267916537495e-07, "loss": -0.002, "reward": 0.9097222164273262, "reward_std": 0.416512792930007, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.493055559694767, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 720.7500152587891, "epoch": 0.7472150814053128, "grad_norm": 0.4208390712738037, "kl": 0.00977325439453125, "learning_rate": 1.4417536311769885e-07, "loss": -0.0004, "reward": 1.0486111342906952, "reward_std": 0.4112920016050339, "rewards/accuracy_reward": 0.27777778822928667, "rewards/format_reward": 0.493055559694767, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 646.1666717529297, "epoch": 0.7489288774635818, "grad_norm": 0.38405895233154297, "kl": 0.00957489013671875, "learning_rate": 1.4282782639029128e-07, "loss": 0.0228, "reward": 0.861111119389534, "reward_std": 0.2901904284954071, "rewards/accuracy_reward": 0.18055555783212185, "rewards/format_reward": 0.5, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 586.0555572509766, "epoch": 0.7506426735218509, "grad_norm": 0.2572639584541321, "kl": 0.0116729736328125, "learning_rate": 1.4150013466019114e-07, "loss": 0.0052, "reward": 0.9861111044883728, "reward_std": 0.12496887892484665, "rewards/accuracy_reward": 0.2500000009313226, "rewards/format_reward": 0.4861111119389534, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 721.8333435058594, "epoch": 0.7523564695801199, "grad_norm": 0.3360980451107025, "kl": 0.00910186767578125, "learning_rate": 1.4019235263722034e-07, "loss": -0.0078, "reward": 0.909722238779068, "reward_std": 0.25718431919813156, "rewards/accuracy_reward": 0.20833334047347307, "rewards/format_reward": 0.493055559694767, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 659.875, "epoch": 0.7540702656383891, "grad_norm": 0.4243048131465912, "kl": 0.0103912353515625, "learning_rate": 1.3890454406082956e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3150074779987335, "rewards/accuracy_reward": 0.22222222108393908, "rewards/format_reward": 0.493055559694767, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 776.7083282470703, "epoch": 0.7557840616966581, "grad_norm": 0.25701966881752014, "kl": 0.00785064697265625, "learning_rate": 1.3763677169699217e-07, "loss": 0.0297, "reward": 0.8750000149011612, "reward_std": 0.1584134679287672, "rewards/accuracy_reward": 0.19444444961845875, "rewards/format_reward": 0.486111119389534, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 634.2500076293945, "epoch": 0.7574978577549272, "grad_norm": 0.42774713039398193, "kl": 0.00933837890625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0248, "reward": 0.9583333432674408, "reward_std": 0.3544755354523659, "rewards/accuracy_reward": 0.236111119389534, "rewards/format_reward": 0.486111119389534, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 714.0833282470703, "epoch": 0.7592116538131962, "grad_norm": 0.5273145437240601, "kl": 0.01111602783203125, "learning_rate": 1.351615817851748e-07, "loss": 0.0447, "reward": 1.0486111342906952, "reward_std": 0.421733595430851, "rewards/accuracy_reward": 0.2777777835726738, "rewards/format_reward": 0.493055559694767, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 805.0972442626953, "epoch": 0.7609254498714653, "grad_norm": 0.28000712394714355, "kl": 0.00943756103515625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0138, "reward": 0.7708333432674408, "reward_std": 0.40107376128435135, "rewards/accuracy_reward": 0.13888889364898205, "rewards/format_reward": 0.493055559694767, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 711.0416870117188, "epoch": 0.7626392459297343, "grad_norm": 0.48471271991729736, "kl": 0.0093841552734375, "learning_rate": 1.3276726544494571e-07, "loss": -0.0408, "reward": 0.7708333283662796, "reward_std": 0.399602010846138, "rewards/accuracy_reward": 0.13888889085501432, "rewards/format_reward": 0.493055559694767, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 650.1944427490234, "epoch": 0.7643530419880035, "grad_norm": 0.2905956208705902, "kl": 0.0093841552734375, "learning_rate": 1.316005813502869e-07, "loss": -0.0041, "reward": 0.8541666865348816, "reward_std": 0.2211344838142395, "rewards/accuracy_reward": 0.18055556155741215, "rewards/format_reward": 0.493055559694767, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 744.8194580078125, "epoch": 0.7660668380462725, "grad_norm": 0.6048784255981445, "kl": 0.0104522705078125, "learning_rate": 1.3045428945301953e-07, "loss": -0.0198, "reward": 1.0555555820465088, "reward_std": 0.4907895475625992, "rewards/accuracy_reward": 0.2777777835726738, "rewards/format_reward": 0.5, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 734.8194580078125, "epoch": 0.7677806341045416, "grad_norm": 0.3953067362308502, "kl": 0.00873565673828125, "learning_rate": 1.2932844562179352e-07, "loss": 0.0348, "reward": 1.0694444477558136, "reward_std": 0.43352314084768295, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.486111119389534, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 751.0555877685547, "epoch": 0.7694944301628106, "grad_norm": 0.3929743766784668, "kl": 0.0112152099609375, "learning_rate": 1.2822310472864885e-07, "loss": 0.0141, "reward": 0.798611119389534, "reward_std": 0.15942803025245667, "rewards/accuracy_reward": 0.15277778450399637, "rewards/format_reward": 0.493055559694767, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 678.0, "epoch": 0.7712082262210797, "grad_norm": 0.38888290524482727, "kl": 0.0111541748046875, "learning_rate": 1.2713832064634125e-07, "loss": -0.0292, "reward": 0.888888880610466, "reward_std": 0.4547397494316101, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.5, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 732.7083587646484, "epoch": 0.7729220222793488, "grad_norm": 0.4212208688259125, "kl": 0.0086212158203125, "learning_rate": 1.260741462457165e-07, "loss": 0.0172, "reward": 0.6597222164273262, "reward_std": 0.2624051198363304, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.493055559694767, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 701.0416870117188, "epoch": 0.7746358183376179, "grad_norm": 0.4472486078739166, "kl": 0.0110626220703125, "learning_rate": 1.2503063339313356e-07, "loss": -0.0389, "reward": 0.9027777761220932, "reward_std": 0.44951891899108887, "rewards/accuracy_reward": 0.20833333674818277, "rewards/format_reward": 0.4861111119389534, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 750.6527862548828, "epoch": 0.7763496143958869, "grad_norm": 0.31978559494018555, "kl": 0.00965118408203125, "learning_rate": 1.2400783294793668e-07, "loss": 0.0029, "reward": 0.6527777761220932, "reward_std": 0.24447975307703018, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.486111119389534, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 627.0555725097656, "epoch": 0.778063410454156, "grad_norm": 0.5572839975357056, "kl": 0.0136260986328125, "learning_rate": 1.2300579475997657e-07, "loss": -0.0167, "reward": 1.1875000149011612, "reward_std": 0.4453127048909664, "rewards/accuracy_reward": 0.3472222248092294, "rewards/format_reward": 0.493055559694767, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 705.1110992431641, "epoch": 0.779777206512425, "grad_norm": 0.3036157190799713, "kl": 0.0082244873046875, "learning_rate": 1.220245676671809e-07, "loss": 0.0153, "reward": 0.6944444328546524, "reward_std": 0.2634196951985359, "rewards/accuracy_reward": 0.0972222238779068, "rewards/format_reward": 0.5, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 696.0000152587891, "epoch": 0.781491002570694, "grad_norm": 0.304671049118042, "kl": 0.0107574462890625, "learning_rate": 1.2106419949317388e-07, "loss": -0.0076, "reward": 0.8333333283662796, "reward_std": 0.2221490517258644, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.5, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 688.875, "epoch": 0.7832047986289632, "grad_norm": 0.5803426504135132, "kl": 0.010406494140625, "learning_rate": 1.2012473704494537e-07, "loss": 0.0092, "reward": 0.9861111417412758, "reward_std": 0.3805625271052122, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.486111119389534, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 601.5277862548828, "epoch": 0.7849185946872322, "grad_norm": 0.7371811270713806, "kl": 0.0137481689453125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0169, "reward": 0.8333333283662796, "reward_std": 0.43149399757385254, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.5, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 628.7500152587891, "epoch": 0.7866323907455013, "grad_norm": 0.5148778557777405, "kl": 0.011505126953125, "learning_rate": 1.1830871145697412e-07, "loss": 0.0304, "reward": 1.0763888955116272, "reward_std": 0.2211344838142395, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.493055559694767, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 757.3889007568359, "epoch": 0.7883461868037703, "grad_norm": 0.4826764464378357, "kl": 0.0094757080078125, "learning_rate": 1.1743223682775649e-07, "loss": -0.0019, "reward": 0.7152777761220932, "reward_std": 0.384667344391346, "rewards/accuracy_reward": 0.1111111119389534, "rewards/format_reward": 0.493055559694767, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 667.8055572509766, "epoch": 0.7900599828620394, "grad_norm": 0.5709467530250549, "kl": 0.01129150390625, "learning_rate": 1.1657684494105386e-07, "loss": 0.0253, "reward": 0.9027778059244156, "reward_std": 0.5824100151658058, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.486111119389534, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 782.0694274902344, "epoch": 0.7917737789203085, "grad_norm": 0.359210729598999, "kl": 0.01105499267578125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0259, "reward": 0.8194444477558136, "reward_std": 0.27941547334194183, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4861111119389534, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 722.0555572509766, "epoch": 0.7934875749785776, "grad_norm": 0.47103169560432434, "kl": 0.01102447509765625, "learning_rate": 1.1492947512799328e-07, "loss": 0.0043, "reward": 0.8124999925494194, "reward_std": 0.43280857615172863, "rewards/accuracy_reward": 0.16666667256504297, "rewards/format_reward": 0.4791666716337204, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 708.9722290039062, "epoch": 0.7952013710368466, "grad_norm": 0.34840986132621765, "kl": 0.0095672607421875, "learning_rate": 1.1413757749211602e-07, "loss": -0.0008, "reward": 0.888888880610466, "reward_std": 0.13608276098966599, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.5, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 709.2083282470703, "epoch": 0.7969151670951157, "grad_norm": 0.34385910630226135, "kl": 0.0106964111328125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0174, "reward": 0.8263888955116272, "reward_std": 0.3135357052087784, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.493055559694767, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 738.0416717529297, "epoch": 0.7986289631533847, "grad_norm": 0.39174365997314453, "kl": 0.0086212158203125, "learning_rate": 1.1261754973965422e-07, "loss": 0.0083, "reward": 1.0833333730697632, "reward_std": 0.4855687543749809, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.5, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 783.3611145019531, "epoch": 0.8003427592116538, "grad_norm": 0.21360760927200317, "kl": 0.0073394775390625, "learning_rate": 1.1188949370707787e-07, "loss": 0.0061, "reward": 0.7222222238779068, "reward_std": 0.13608276098966599, "rewards/accuracy_reward": 0.11111111287027597, "rewards/format_reward": 0.5, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 591.0416564941406, "epoch": 0.8020565552699229, "grad_norm": 0.44213274121284485, "kl": 0.0121307373046875, "learning_rate": 1.1118279056249653e-07, "loss": -0.0334, "reward": 0.9375000074505806, "reward_std": 0.176338829100132, "rewards/accuracy_reward": 0.2222222238779068, "rewards/format_reward": 0.493055559694767, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 908.9305419921875, "epoch": 0.803770351328192, "grad_norm": 2.4339747428894043, "kl": 0.04512786865234375, "learning_rate": 1.1049747474962444e-07, "loss": 0.0283, "reward": 0.8472222164273262, "reward_std": 0.4613333996385336, "rewards/accuracy_reward": 0.18055556155741215, "rewards/format_reward": 0.486111119389534, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 672.7222290039062, "epoch": 0.805484147386461, "grad_norm": 0.3775832951068878, "kl": 0.00971221923828125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0084, "reward": 0.826388880610466, "reward_std": 0.2337997630238533, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.493055559694767, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 837.1944580078125, "epoch": 0.8071979434447301, "grad_norm": 0.3247397840023041, "kl": 0.0085601806640625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0263, "reward": 0.8750000149011612, "reward_std": 0.40380824357271194, "rewards/accuracy_reward": 0.19444444682449102, "rewards/format_reward": 0.486111119389534, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 636.1527862548828, "epoch": 0.8089117395029991, "grad_norm": 0.30879315733909607, "kl": 0.0115203857421875, "learning_rate": 1.0857018009286381e-07, "loss": -0.0045, "reward": 0.7708333432674408, "reward_std": 0.22086109220981598, "rewards/accuracy_reward": 0.1388888955116272, "rewards/format_reward": 0.493055559694767, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 697.8194427490234, "epoch": 0.8106255355612683, "grad_norm": 0.5799990296363831, "kl": 0.0116119384765625, "learning_rate": 1.0797073717209013e-07, "loss": 0.0063, "reward": 0.8333333656191826, "reward_std": 0.2221490480005741, "rewards/accuracy_reward": 0.18055556155741215, "rewards/format_reward": 0.472222238779068, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 597.6944580078125, "epoch": 0.8123393316195373, "grad_norm": 0.673372209072113, "kl": 0.012725830078125, "learning_rate": 1.0739283813397639e-07, "loss": -0.0041, "reward": 1.1527777910232544, "reward_std": 0.6239039897918701, "rewards/accuracy_reward": 0.33333334140479565, "rewards/format_reward": 0.486111119389534, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 647.6944427490234, "epoch": 0.8140531276778064, "grad_norm": 0.47942498326301575, "kl": 0.0100555419921875, "learning_rate": 1.068365111445064e-07, "loss": -0.0141, "reward": 0.8888888955116272, "reward_std": 0.4675438329577446, "rewards/accuracy_reward": 0.1944444514811039, "rewards/format_reward": 0.5, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 785.625, "epoch": 0.8157669237360754, "grad_norm": 0.2584112882614136, "kl": 0.00859832763671875, "learning_rate": 1.063017833182728e-07, "loss": 0.0052, "reward": 0.6944444328546524, "reward_std": 0.0680413767695427, "rewards/accuracy_reward": 0.09722222480922937, "rewards/format_reward": 0.5, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 718.2083282470703, "epoch": 0.8174807197943444, "grad_norm": 0.33669352531433105, "kl": 0.01165771484375, "learning_rate": 1.0578868071715544e-07, "loss": 0.0043, "reward": 0.9861111342906952, "reward_std": 0.32973192632198334, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.4861111119389534, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 617.8611145019531, "epoch": 0.8191945158526135, "grad_norm": 0.8135344982147217, "kl": 0.01825714111328125, "learning_rate": 1.0529722834905125e-07, "loss": -0.0315, "reward": 0.9722222238779068, "reward_std": 0.4262731894850731, "rewards/accuracy_reward": 0.23611111473292112, "rewards/format_reward": 0.5, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 593.4305648803711, "epoch": 0.8209083119108826, "grad_norm": 0.4805351197719574, "kl": 0.0129547119140625, "learning_rate": 1.0482745016665526e-07, "loss": -0.0022, "reward": 1.194444477558136, "reward_std": 0.5820766538381577, "rewards/accuracy_reward": 0.3472222238779068, "rewards/format_reward": 0.5, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 634.7777862548828, "epoch": 0.8226221079691517, "grad_norm": 0.37174245715141296, "kl": 0.01177215576171875, "learning_rate": 1.0437936906629334e-07, "loss": -0.0185, "reward": 1.4375, "reward_std": 0.4637626111507416, "rewards/accuracy_reward": 0.4722222313284874, "rewards/format_reward": 0.493055559694767, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 670.3750152587891, "epoch": 0.8243359040274207, "grad_norm": 0.5397735834121704, "kl": 0.0081024169921875, "learning_rate": 1.0395300688680625e-07, "loss": -0.0052, "reward": 1.166666641831398, "reward_std": 0.41752735525369644, "rewards/accuracy_reward": 0.3333333367481828, "rewards/format_reward": 0.5, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 691.2777862548828, "epoch": 0.8260497000856898, "grad_norm": 0.3484320342540741, "kl": 0.011199951171875, "learning_rate": 1.0354838440848501e-07, "loss": 0.0018, "reward": 0.7430555671453476, "reward_std": 0.33303238451480865, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.493055559694767, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 664.263916015625, "epoch": 0.8277634961439588, "grad_norm": 0.43313318490982056, "kl": 0.014739990234375, "learning_rate": 1.0316552135205837e-07, "loss": 0.0133, "reward": 0.6527777761220932, "reward_std": 0.25279081612825394, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.486111119389534, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 688.8888854980469, "epoch": 0.829477292202228, "grad_norm": 0.5132657289505005, "kl": 0.009918212890625, "learning_rate": 1.0280443637773163e-07, "loss": 0.01, "reward": 1.0277778059244156, "reward_std": 0.49601035565137863, "rewards/accuracy_reward": 0.2638888917863369, "rewards/format_reward": 0.5, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 637.1388854980469, "epoch": 0.831191088260497, "grad_norm": 0.470441997051239, "kl": 0.01073455810546875, "learning_rate": 1.0246514708427701e-07, "loss": -0.0112, "reward": 0.888888880610466, "reward_std": 0.4803479462862015, "rewards/accuracy_reward": 0.19444444868713617, "rewards/format_reward": 0.5, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 705.2777862548828, "epoch": 0.8329048843187661, "grad_norm": 0.574053943157196, "kl": 0.0115509033203125, "learning_rate": 1.0214767000817596e-07, "loss": 0.0604, "reward": 1.1041666567325592, "reward_std": 0.4360002353787422, "rewards/accuracy_reward": 0.3194444486871362, "rewards/format_reward": 0.4652777835726738, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 708.1666793823242, "epoch": 0.8346186803770351, "grad_norm": 0.3975141942501068, "kl": 0.01026153564453125, "learning_rate": 1.0185202062281336e-07, "loss": 0.011, "reward": 0.6250000074505806, "reward_std": 0.24970055185258389, "rewards/accuracy_reward": 0.06944444496184587, "rewards/format_reward": 0.486111119389534, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 667.0555572509766, "epoch": 0.8363324764353042, "grad_norm": 0.4273344576358795, "kl": 0.0111236572265625, "learning_rate": 1.0157821333772304e-07, "loss": -0.0084, "reward": 1.2708333283662796, "reward_std": 0.32522569596767426, "rewards/accuracy_reward": 0.38888888619840145, "rewards/format_reward": 0.493055559694767, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 709.2083435058594, "epoch": 0.8380462724935732, "grad_norm": 0.3746323883533478, "kl": 0.008514404296875, "learning_rate": 1.013262614978859e-07, "loss": -0.0182, "reward": 0.6944444477558136, "reward_std": 0.2901904359459877, "rewards/accuracy_reward": 0.09722222574055195, "rewards/format_reward": 0.5, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 678.0555572509766, "epoch": 0.8397600685518424, "grad_norm": 0.44212105870246887, "kl": 0.0089874267578125, "learning_rate": 1.0109617738307911e-07, "loss": 0.0007, "reward": 1.1666666567325592, "reward_std": 0.30821534991264343, "rewards/accuracy_reward": 0.33333333767950535, "rewards/format_reward": 0.5, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 675.7777938842773, "epoch": 0.8414738646101114, "grad_norm": 0.45065274834632874, "kl": 0.0085906982421875, "learning_rate": 1.0088797220727779e-07, "loss": 0.0092, "reward": 0.8055555522441864, "reward_std": 0.3995024636387825, "rewards/accuracy_reward": 0.15277778077870607, "rewards/format_reward": 0.5, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 724.4444580078125, "epoch": 0.8431876606683805, "grad_norm": 0.2776910364627838, "kl": 0.008880615234375, "learning_rate": 1.0070165611810855e-07, "loss": -0.019, "reward": 0.7430555745959282, "reward_std": 0.22113448940217495, "rewards/accuracy_reward": 0.12500000093132257, "rewards/format_reward": 0.493055559694767, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 734.7500152587891, "epoch": 0.8449014567266495, "grad_norm": 0.273231565952301, "kl": 0.01227569580078125, "learning_rate": 1.005372381963547e-07, "loss": 0.0169, "reward": 0.75, "reward_std": 0.15410767495632172, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.5, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 817.6666717529297, "epoch": 0.8466152527849186, "grad_norm": 0.3405468463897705, "kl": 0.0100250244140625, "learning_rate": 1.0039472645551372e-07, "loss": -0.0042, "reward": 0.9097222238779068, "reward_std": 0.3072007745504379, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.493055559694767, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 620.8611068725586, "epoch": 0.8483290488431876, "grad_norm": 0.8697577714920044, "kl": 0.0175018310546875, "learning_rate": 1.002741278414069e-07, "loss": -0.0113, "reward": 1.0625000149011612, "reward_std": 0.4665292650461197, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.479166679084301, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 636.8472290039062, "epoch": 0.8500428449014568, "grad_norm": 0.5277899503707886, "kl": 0.0098876953125, "learning_rate": 1.0017544823184055e-07, "loss": 0.0341, "reward": 1.2222222238779068, "reward_std": 0.686167873442173, "rewards/accuracy_reward": 0.3611111231148243, "rewards/format_reward": 0.5, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 800.9861145019531, "epoch": 0.8517566409597258, "grad_norm": 0.44679829478263855, "kl": 0.012115478515625, "learning_rate": 1.0009869243631952e-07, "loss": 0.0143, "reward": 0.8819444552063942, "reward_std": 0.34861752949655056, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.4652777910232544, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 856.0694580078125, "epoch": 0.8534704370179949, "grad_norm": 0.22854942083358765, "kl": 0.01018524169921875, "learning_rate": 1.000438641958131e-07, "loss": 0.0058, "reward": 0.6597222238779068, "reward_std": 0.13479479402303696, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.493055559694767, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 564.8888854980469, "epoch": 0.8551842330762639, "grad_norm": 0.7382153272628784, "kl": 0.0165252685546875, "learning_rate": 1.0001096618257236e-07, "loss": -0.0162, "reward": 1.0277777761220932, "reward_std": 0.2901904284954071, "rewards/accuracy_reward": 0.26388889644294977, "rewards/format_reward": 0.5, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 571.8194351196289, "epoch": 0.856898029134533, "grad_norm": 0.5685946941375732, "kl": 0.0124969482421875, "learning_rate": 1e-07, "loss": -0.0212, "reward": 1.1597222089767456, "reward_std": 0.4405168890953064, "rewards/accuracy_reward": 0.3333333330228925, "rewards/format_reward": 0.493055559694767, "step": 500 }, { "epoch": 0.856898029134533, "step": 500, "total_flos": 0.0, "train_loss": 0.01467308583567501, "train_runtime": 25494.6867, "train_samples_per_second": 1.412, "train_steps_per_second": 0.02 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }