| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.856898029134533, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.8611145019531, |
| "epoch": 0.001713796058269066, |
| "grad_norm": 0.6894496083259583, |
| "kl": 0.0, |
| "learning_rate": 2e-08, |
| "loss": 0.0307, |
| "reward": 0.5902777910232544, |
| "reward_std": 0.3424043729901314, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.2569444514811039, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.3889007568359, |
| "epoch": 0.003427592116538132, |
| "grad_norm": 0.6292832493782043, |
| "kl": 0.0, |
| "learning_rate": 4e-08, |
| "loss": -0.0039, |
| "reward": 0.4583333358168602, |
| "reward_std": 0.45602361112833023, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.2638888917863369, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 843.0972442626953, |
| "epoch": 0.005141388174807198, |
| "grad_norm": 0.5272438526153564, |
| "kl": 0.00014257431030273438, |
| "learning_rate": 6e-08, |
| "loss": 0.0375, |
| "reward": 0.5972222238779068, |
| "reward_std": 0.4886699207127094, |
| "rewards/accuracy_reward": 0.16666667256504297, |
| "rewards/format_reward": 0.2638888955116272, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.4166717529297, |
| "epoch": 0.006855184233076264, |
| "grad_norm": 0.59702068567276, |
| "kl": 0.0001093149185180664, |
| "learning_rate": 8e-08, |
| "loss": 0.0373, |
| "reward": 0.5069444477558136, |
| "reward_std": 0.563178788870573, |
| "rewards/accuracy_reward": 0.13888888992369175, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.9722290039062, |
| "epoch": 0.00856898029134533, |
| "grad_norm": 0.8420807123184204, |
| "kl": 0.00011909008026123047, |
| "learning_rate": 1e-07, |
| "loss": 0.0547, |
| "reward": 0.666666679084301, |
| "reward_std": 0.5673302263021469, |
| "rewards/accuracy_reward": 0.22222222480922937, |
| "rewards/format_reward": 0.2222222238779068, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.0, |
| "epoch": 0.010282776349614395, |
| "grad_norm": 0.8917471170425415, |
| "kl": 0.00018858909606933594, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0357, |
| "reward": 0.5833333358168602, |
| "reward_std": 0.4646867737174034, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.2500000037252903, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.5416870117188, |
| "epoch": 0.011996572407883462, |
| "grad_norm": 0.7703569531440735, |
| "kl": 8.26716423034668e-05, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0085, |
| "reward": 0.4305555634200573, |
| "reward_std": 0.31282100826501846, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.3194444514811039, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 775.2500152587891, |
| "epoch": 0.013710368466152529, |
| "grad_norm": 0.7724463939666748, |
| "kl": 0.00013768672943115234, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0749, |
| "reward": 0.3611111082136631, |
| "reward_std": 0.41373568773269653, |
| "rewards/accuracy_reward": 0.06944444589316845, |
| "rewards/format_reward": 0.2222222276031971, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.8472213745117, |
| "epoch": 0.015424164524421594, |
| "grad_norm": 0.5836902856826782, |
| "kl": 0.0001068115234375, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0627, |
| "reward": 0.4861111156642437, |
| "reward_std": 0.46435344591736794, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.2361111119389534, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 768.4305572509766, |
| "epoch": 0.01713796058269066, |
| "grad_norm": 0.8060944676399231, |
| "kl": 0.0001360177993774414, |
| "learning_rate": 2e-07, |
| "loss": 0.0023, |
| "reward": 0.7847222089767456, |
| "reward_std": 0.46876538544893265, |
| "rewards/accuracy_reward": 0.291666672565043, |
| "rewards/format_reward": 0.2013888955116272, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.7638854980469, |
| "epoch": 0.018851756640959727, |
| "grad_norm": 0.8618626594543457, |
| "kl": 9.322166442871094e-05, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0605, |
| "reward": 0.5763889029622078, |
| "reward_std": 0.5653917491436005, |
| "rewards/accuracy_reward": 0.15277778264135122, |
| "rewards/format_reward": 0.2708333358168602, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 847.4583435058594, |
| "epoch": 0.02056555269922879, |
| "grad_norm": 0.5977460741996765, |
| "kl": 0.0001245737075805664, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0545, |
| "reward": 0.4583333395421505, |
| "reward_std": 0.4860685095191002, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.2638888917863369, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 687.4305725097656, |
| "epoch": 0.022279348757497857, |
| "grad_norm": 0.7761760354042053, |
| "kl": 0.00014710426330566406, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0447, |
| "reward": 0.3888888955116272, |
| "reward_std": 0.4280589930713177, |
| "rewards/accuracy_reward": 0.06944444589316845, |
| "rewards/format_reward": 0.25, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.2916641235352, |
| "epoch": 0.023993144815766924, |
| "grad_norm": 1.0175061225891113, |
| "kl": 0.00014960765838623047, |
| "learning_rate": 2.8e-07, |
| "loss": 0.0222, |
| "reward": 0.611111119389534, |
| "reward_std": 0.48110663890838623, |
| "rewards/accuracy_reward": 0.1944444477558136, |
| "rewards/format_reward": 0.2222222238779068, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 808.4722290039062, |
| "epoch": 0.02570694087403599, |
| "grad_norm": 0.5965744853019714, |
| "kl": 0.0001302957534790039, |
| "learning_rate": 3e-07, |
| "loss": 0.0488, |
| "reward": 0.43055555410683155, |
| "reward_std": 0.3586147967725992, |
| "rewards/accuracy_reward": 0.1111111119389534, |
| "rewards/format_reward": 0.20833333767950535, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.4027862548828, |
| "epoch": 0.027420736932305057, |
| "grad_norm": 0.8511067032814026, |
| "kl": 0.00014138221740722656, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0287, |
| "reward": 0.7916666716337204, |
| "reward_std": 0.7402771413326263, |
| "rewards/accuracy_reward": 0.2638888992369175, |
| "rewards/format_reward": 0.2638888922519982, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.2777786254883, |
| "epoch": 0.02913453299057412, |
| "grad_norm": 1.0029399394989014, |
| "kl": 0.00013744831085205078, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": -0.0047, |
| "reward": 0.7430555745959282, |
| "reward_std": 0.5347021222114563, |
| "rewards/accuracy_reward": 0.2777777863666415, |
| "rewards/format_reward": 0.18750000093132257, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.6666870117188, |
| "epoch": 0.030848329048843187, |
| "grad_norm": 0.929906964302063, |
| "kl": 0.0001380443572998047, |
| "learning_rate": 3.6e-07, |
| "loss": 0.1027, |
| "reward": 0.3194444477558136, |
| "reward_std": 0.3645694628357887, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.23611111752688885, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 837.4583587646484, |
| "epoch": 0.032562125107112254, |
| "grad_norm": 0.5381235480308533, |
| "kl": 0.00016641616821289062, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0715, |
| "reward": 0.5347222164273262, |
| "reward_std": 0.5603309497237206, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.2569444477558136, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 865.6805419921875, |
| "epoch": 0.03427592116538132, |
| "grad_norm": 0.6289723515510559, |
| "kl": 0.0001741647720336914, |
| "learning_rate": 4e-07, |
| "loss": -0.0123, |
| "reward": 0.3333333320915699, |
| "reward_std": 0.3452591709792614, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.1666666716337204, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.694450378418, |
| "epoch": 0.03598971722365039, |
| "grad_norm": 0.8022226691246033, |
| "kl": 0.00022649765014648438, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0523, |
| "reward": 0.5833333358168602, |
| "reward_std": 0.3963186591863632, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.3333333320915699, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 874.8055725097656, |
| "epoch": 0.037703513281919454, |
| "grad_norm": 0.6999794244766235, |
| "kl": 0.00022101402282714844, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0188, |
| "reward": 0.319444440305233, |
| "reward_std": 0.36139946803450584, |
| "rewards/accuracy_reward": 0.055555556900799274, |
| "rewards/format_reward": 0.2083333358168602, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 873.7361145019531, |
| "epoch": 0.03941730934018852, |
| "grad_norm": 0.5962035059928894, |
| "kl": 0.00021910667419433594, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0391, |
| "reward": 0.611111119389534, |
| "reward_std": 0.31651007384061813, |
| "rewards/accuracy_reward": 0.15277778450399637, |
| "rewards/format_reward": 0.3055555522441864, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.0277862548828, |
| "epoch": 0.04113110539845758, |
| "grad_norm": 0.6845191121101379, |
| "kl": 0.0003848075866699219, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0395, |
| "reward": 0.6388888955116272, |
| "reward_std": 0.44496994838118553, |
| "rewards/accuracy_reward": 0.18055556062608957, |
| "rewards/format_reward": 0.2777777835726738, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.1527862548828, |
| "epoch": 0.04284490145672665, |
| "grad_norm": 0.7037742733955383, |
| "kl": 0.0006976127624511719, |
| "learning_rate": 5e-07, |
| "loss": 0.0193, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.49456192925572395, |
| "rewards/accuracy_reward": 0.16666666883975267, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.1250152587891, |
| "epoch": 0.044558697514995714, |
| "grad_norm": 0.8331264853477478, |
| "kl": 0.0008344650268554688, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0633, |
| "reward": 0.7152777686715126, |
| "reward_std": 0.2888181023299694, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.3263888880610466, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 814.9166717529297, |
| "epoch": 0.04627249357326478, |
| "grad_norm": 0.9271811246871948, |
| "kl": 0.0009531974792480469, |
| "learning_rate": 5.4e-07, |
| "loss": 0.056, |
| "reward": 0.3472222238779068, |
| "reward_std": 0.30904670804739, |
| "rewards/accuracy_reward": 0.02777777798473835, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.0, |
| "epoch": 0.04798628963153385, |
| "grad_norm": 0.755454957485199, |
| "kl": 0.00131988525390625, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0102, |
| "reward": 1.0277777835726738, |
| "reward_std": 0.6238088309764862, |
| "rewards/accuracy_reward": 0.31944445241242647, |
| "rewards/format_reward": 0.3888888955116272, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.9027786254883, |
| "epoch": 0.049700085689802914, |
| "grad_norm": 0.8711504936218262, |
| "kl": 0.0018901824951171875, |
| "learning_rate": 5.8e-07, |
| "loss": 0.039, |
| "reward": 0.5763888880610466, |
| "reward_std": 0.5907959416508675, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.2986111119389534, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.5833358764648, |
| "epoch": 0.05141388174807198, |
| "grad_norm": 0.6898062825202942, |
| "kl": 0.002155303955078125, |
| "learning_rate": 6e-07, |
| "loss": -0.014, |
| "reward": 0.493055559694767, |
| "reward_std": 0.41614027321338654, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.3541666641831398, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.9166717529297, |
| "epoch": 0.05312767780634105, |
| "grad_norm": 0.9486229419708252, |
| "kl": 0.002727508544921875, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0461, |
| "reward": 0.6319444626569748, |
| "reward_std": 0.5628423318266869, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.381944440305233, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.3472290039062, |
| "epoch": 0.054841473864610114, |
| "grad_norm": 0.5769440531730652, |
| "kl": 0.00347137451171875, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0924, |
| "reward": 0.6597222238779068, |
| "reward_std": 0.572759248316288, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.3819444477558136, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.5972213745117, |
| "epoch": 0.056555269922879174, |
| "grad_norm": 0.8302273154258728, |
| "kl": 0.004093170166015625, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0183, |
| "reward": 0.6944444477558136, |
| "reward_std": 0.5897158365696669, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.3611111156642437, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.2639007568359, |
| "epoch": 0.05826906598114824, |
| "grad_norm": 0.8029855489730835, |
| "kl": 0.00673675537109375, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.1226, |
| "reward": 0.8194444477558136, |
| "reward_std": 0.5727398172020912, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.4305555522441864, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.4305572509766, |
| "epoch": 0.05998286203941731, |
| "grad_norm": 0.751978874206543, |
| "kl": 0.00594329833984375, |
| "learning_rate": 7e-07, |
| "loss": 0.1349, |
| "reward": 0.5000000074505806, |
| "reward_std": 0.3002382256090641, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.3888888880610466, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.0416793823242, |
| "epoch": 0.061696658097686374, |
| "grad_norm": 0.7630824446678162, |
| "kl": 0.0087432861328125, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0179, |
| "reward": 0.6041666865348816, |
| "reward_std": 0.33818795159459114, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.2500152587891, |
| "epoch": 0.06341045415595545, |
| "grad_norm": 0.5925531387329102, |
| "kl": 0.0101165771484375, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0299, |
| "reward": 0.5138888955116272, |
| "reward_std": 0.3357668612152338, |
| "rewards/accuracy_reward": 0.055555556900799274, |
| "rewards/format_reward": 0.4027777835726738, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 792.1805725097656, |
| "epoch": 0.06512425021422451, |
| "grad_norm": 1.0705430507659912, |
| "kl": 0.01534271240234375, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0466, |
| "reward": 0.5000000074505806, |
| "reward_std": 0.41504133865237236, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.3888888880610466, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 699.8888854980469, |
| "epoch": 0.06683804627249357, |
| "grad_norm": 0.9300626516342163, |
| "kl": 0.01090240478515625, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0644, |
| "reward": 0.7847222238779068, |
| "reward_std": 0.4304216764867306, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/format_reward": 0.3680555522441864, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.0833206176758, |
| "epoch": 0.06855184233076264, |
| "grad_norm": 1.078953742980957, |
| "kl": 0.016937255859375, |
| "learning_rate": 8e-07, |
| "loss": 0.0414, |
| "reward": 0.9791666939854622, |
| "reward_std": 0.4737792070955038, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.4513888955116272, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.5694580078125, |
| "epoch": 0.0702656383890317, |
| "grad_norm": 0.6470286846160889, |
| "kl": 0.0107269287109375, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0883, |
| "reward": 0.6597222313284874, |
| "reward_std": 0.3953079264611006, |
| "rewards/accuracy_reward": 0.11111111380159855, |
| "rewards/format_reward": 0.4375, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.4166717529297, |
| "epoch": 0.07197943444730077, |
| "grad_norm": 2.4607300758361816, |
| "kl": 0.0234375, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0947, |
| "reward": 0.8263889104127884, |
| "reward_std": 0.5376365929841995, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.0555572509766, |
| "epoch": 0.07369323050556983, |
| "grad_norm": 0.5106288194656372, |
| "kl": 0.0121002197265625, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0658, |
| "reward": 0.6736111342906952, |
| "reward_std": 0.4738336503505707, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.4236111119389534, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 754.8611145019531, |
| "epoch": 0.07540702656383891, |
| "grad_norm": 1.0860414505004883, |
| "kl": 0.01665496826171875, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": -0.0223, |
| "reward": 0.7500000149011612, |
| "reward_std": 0.4002586603164673, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.4444444477558136, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.9583358764648, |
| "epoch": 0.07712082262210797, |
| "grad_norm": 0.4417635202407837, |
| "kl": 0.010406494140625, |
| "learning_rate": 9e-07, |
| "loss": 0.0694, |
| "reward": 0.8125000223517418, |
| "reward_std": 0.3787213396281004, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.4513888955116272, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.1666717529297, |
| "epoch": 0.07883461868037704, |
| "grad_norm": 0.9410390853881836, |
| "kl": 0.013397216796875, |
| "learning_rate": 9.2e-07, |
| "loss": 0.008, |
| "reward": 0.6875, |
| "reward_std": 0.4043467417359352, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.4305725097656, |
| "epoch": 0.0805484147386461, |
| "grad_norm": 0.915087878704071, |
| "kl": 0.014129638671875, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0755, |
| "reward": 0.6527777910232544, |
| "reward_std": 0.49068866297602654, |
| "rewards/accuracy_reward": 0.11111111380159855, |
| "rewards/format_reward": 0.4305555671453476, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.1250076293945, |
| "epoch": 0.08226221079691516, |
| "grad_norm": 0.7676656246185303, |
| "kl": 0.0137786865234375, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0019, |
| "reward": 0.7291666641831398, |
| "reward_std": 0.3599853590130806, |
| "rewards/accuracy_reward": 0.1527777798473835, |
| "rewards/format_reward": 0.4236111119389534, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.7777862548828, |
| "epoch": 0.08397600685518423, |
| "grad_norm": 0.8259297609329224, |
| "kl": 0.01544189453125, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0269, |
| "reward": 0.8958333358168602, |
| "reward_std": 0.5085582789033651, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.7639007568359, |
| "epoch": 0.0856898029134533, |
| "grad_norm": 7704.1875, |
| "kl": 2.003662109375, |
| "learning_rate": 1e-06, |
| "loss": 0.0621, |
| "reward": 0.5069444477558136, |
| "reward_std": 0.11907241865992546, |
| "rewards/accuracy_reward": 0.013888888992369175, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.9583435058594, |
| "epoch": 0.08740359897172237, |
| "grad_norm": 0.9847874045372009, |
| "kl": 0.01806640625, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": -0.0313, |
| "reward": 1.0069444626569748, |
| "reward_std": 0.6746698617935181, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/format_reward": 0.4236111268401146, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 851.2639007568359, |
| "epoch": 0.08911739502999143, |
| "grad_norm": 0.4385327994823456, |
| "kl": 0.0133514404296875, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": -0.007, |
| "reward": 0.7708333507180214, |
| "reward_std": 0.3091294076293707, |
| "rewards/accuracy_reward": 0.1527777835726738, |
| "rewards/format_reward": 0.4652777910232544, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.9305572509766, |
| "epoch": 0.0908311910882605, |
| "grad_norm": 0.6572834849357605, |
| "kl": 0.0176544189453125, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": -0.0003, |
| "reward": 0.673611119389534, |
| "reward_std": 0.3502005450427532, |
| "rewards/accuracy_reward": 0.1111111156642437, |
| "rewards/format_reward": 0.4513888880610466, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.2361221313477, |
| "epoch": 0.09254498714652956, |
| "grad_norm": 0.5134143233299255, |
| "kl": 0.0155029296875, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": 0.0325, |
| "reward": 0.6944444552063942, |
| "reward_std": 0.36250423453748226, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 767.4722137451172, |
| "epoch": 0.09425878320479864, |
| "grad_norm": 0.5993247628211975, |
| "kl": 0.018310546875, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0402, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.4854987859725952, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.5555572509766, |
| "epoch": 0.0959725792630677, |
| "grad_norm": 0.6086099147796631, |
| "kl": 0.019317626953125, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": -0.0089, |
| "reward": 0.8958333432674408, |
| "reward_std": 0.5041182190179825, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.3611297607422, |
| "epoch": 0.09768637532133675, |
| "grad_norm": 0.3199848532676697, |
| "kl": 0.0146942138671875, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": 0.0068, |
| "reward": 0.6180555671453476, |
| "reward_std": 0.10077410563826561, |
| "rewards/accuracy_reward": 0.0694444477558136, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.6944351196289, |
| "epoch": 0.09940017137960583, |
| "grad_norm": 1.065861701965332, |
| "kl": 0.02264404296875, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": 0.0328, |
| "reward": 0.5972222164273262, |
| "reward_std": 0.3061862140893936, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.0833435058594, |
| "epoch": 0.10111396743787489, |
| "grad_norm": 0.8223654627799988, |
| "kl": 0.017425537109375, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": -0.0678, |
| "reward": 0.9791666716337204, |
| "reward_std": 0.7214617803692818, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 758.6388854980469, |
| "epoch": 0.10282776349614396, |
| "grad_norm": 0.559778094291687, |
| "kl": 0.0177154541015625, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0276, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.30821535736322403, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.5, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.9860992431641, |
| "epoch": 0.10454155955441302, |
| "grad_norm": 0.4272245168685913, |
| "kl": 0.02130126953125, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0077, |
| "reward": 0.6319444477558136, |
| "reward_std": 0.17111802101135254, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.0694580078125, |
| "epoch": 0.1062553556126821, |
| "grad_norm": 0.7891212105751038, |
| "kl": 0.0184478759765625, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": 0.0491, |
| "reward": 0.8263888955116272, |
| "reward_std": 0.3267286717891693, |
| "rewards/accuracy_reward": 0.18055556062608957, |
| "rewards/format_reward": 0.4652777761220932, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.9722137451172, |
| "epoch": 0.10796915167095116, |
| "grad_norm": 0.4481966495513916, |
| "kl": 0.016571044921875, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0352, |
| "reward": 0.6250000074505806, |
| "reward_std": 0.2613905593752861, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.25, |
| "epoch": 0.10968294772922023, |
| "grad_norm": 0.484955370426178, |
| "kl": 0.0130157470703125, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": 0.0549, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.21065950952470303, |
| "rewards/accuracy_reward": 0.1111111119389534, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.8333435058594, |
| "epoch": 0.11139674378748929, |
| "grad_norm": 0.6917220950126648, |
| "kl": 0.0135955810546875, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0278, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.5276868715882301, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.4861221313477, |
| "epoch": 0.11311053984575835, |
| "grad_norm": 0.6181950569152832, |
| "kl": 0.009735107421875, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0776, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.36798322945833206, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.2639007568359, |
| "epoch": 0.11482433590402742, |
| "grad_norm": 0.47965365648269653, |
| "kl": 0.0118865966796875, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": 0.0211, |
| "reward": 0.6805555522441864, |
| "reward_std": 0.29340869560837746, |
| "rewards/accuracy_reward": 0.1111111156642437, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 833.1944427490234, |
| "epoch": 0.11653813196229648, |
| "grad_norm": 0.5236086249351501, |
| "kl": 0.01050567626953125, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0772, |
| "reward": 0.8611111268401146, |
| "reward_std": 0.3695474322885275, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/format_reward": 0.4444444552063942, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.4027709960938, |
| "epoch": 0.11825192802056556, |
| "grad_norm": 2.3860788345336914, |
| "kl": 0.0256195068359375, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": 0.0186, |
| "reward": 0.7222222462296486, |
| "reward_std": 0.4142109379172325, |
| "rewards/accuracy_reward": 0.13888888992369175, |
| "rewards/format_reward": 0.4444444552063942, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.9722290039062, |
| "epoch": 0.11996572407883462, |
| "grad_norm": 0.7797493934631348, |
| "kl": 0.01611328125, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0554, |
| "reward": 0.8125000074505806, |
| "reward_std": 0.4372703805565834, |
| "rewards/accuracy_reward": 0.1944444477558136, |
| "rewards/format_reward": 0.4236111119389534, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.3055725097656, |
| "epoch": 0.12167952013710369, |
| "grad_norm": 0.6972033977508545, |
| "kl": 0.011383056640625, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": 0.0371, |
| "reward": 0.9583333432674408, |
| "reward_std": 0.5939657315611839, |
| "rewards/accuracy_reward": 0.23611112032085657, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.8472290039062, |
| "epoch": 0.12339331619537275, |
| "grad_norm": 0.628699541091919, |
| "kl": 0.0160675048828125, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0075, |
| "reward": 0.798611119389534, |
| "reward_std": 0.34056369215250015, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.4652777761220932, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.944450378418, |
| "epoch": 0.12510711225364182, |
| "grad_norm": 0.5667747855186462, |
| "kl": 0.0106201171875, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.0505, |
| "reward": 0.8611111342906952, |
| "reward_std": 0.3817775323987007, |
| "rewards/accuracy_reward": 0.1944444477558136, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.125, |
| "epoch": 0.1268209083119109, |
| "grad_norm": 0.8628817796707153, |
| "kl": 0.0158233642578125, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": 0.0773, |
| "reward": 0.923611119389534, |
| "reward_std": 0.4358247146010399, |
| "rewards/accuracy_reward": 0.22222222574055195, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 812.0555572509766, |
| "epoch": 0.12853470437017994, |
| "grad_norm": 0.41737478971481323, |
| "kl": 0.0092620849609375, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0076, |
| "reward": 0.7500000149011612, |
| "reward_std": 0.2785004451870918, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.01390075683594, |
| "epoch": 0.13024850042844902, |
| "grad_norm": 0.772993803024292, |
| "kl": 0.0087738037109375, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": 0.0171, |
| "reward": 1.0069444477558136, |
| "reward_std": 0.6597441658377647, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 746.2500152587891, |
| "epoch": 0.1319622964867181, |
| "grad_norm": 0.4772380292415619, |
| "kl": 0.00911712646484375, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0168, |
| "reward": 0.9166666567325592, |
| "reward_std": 0.6034458577632904, |
| "rewards/accuracy_reward": 0.22222222667187452, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 952.3888854980469, |
| "epoch": 0.13367609254498714, |
| "grad_norm": 0.49596795439720154, |
| "kl": 0.0111846923828125, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.033, |
| "reward": 0.6180555745959282, |
| "reward_std": 0.34004957228899, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.423611119389534, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.5555725097656, |
| "epoch": 0.1353898886032562, |
| "grad_norm": 0.5074121952056885, |
| "kl": 0.011474609375, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": -0.0007, |
| "reward": 0.5902777835726738, |
| "reward_std": 0.2606759797781706, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 704.5833282470703, |
| "epoch": 0.13710368466152528, |
| "grad_norm": 0.45615482330322266, |
| "kl": 0.00933074951171875, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.1073, |
| "reward": 0.6111111119389534, |
| "reward_std": 0.3000170197337866, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.4444444552063942, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.5694580078125, |
| "epoch": 0.13881748071979436, |
| "grad_norm": 0.440580815076828, |
| "kl": 0.0095977783203125, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": -0.0114, |
| "reward": 1.0347222536802292, |
| "reward_std": 0.26498175598680973, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/format_reward": 0.4513888955116272, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.4166717529297, |
| "epoch": 0.1405312767780634, |
| "grad_norm": 0.4756879210472107, |
| "kl": 0.00921630859375, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": 0.0445, |
| "reward": 0.7638888955116272, |
| "reward_std": 0.37851114571094513, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.6250076293945, |
| "epoch": 0.14224507283633248, |
| "grad_norm": 0.9698956608772278, |
| "kl": 0.0133819580078125, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0097, |
| "reward": 0.652777798473835, |
| "reward_std": 0.37932526133954525, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 683.9166564941406, |
| "epoch": 0.14395886889460155, |
| "grad_norm": 0.4502439796924591, |
| "kl": 0.0113525390625, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": 0.0209, |
| "reward": 0.8125000074505806, |
| "reward_std": 0.421932702884078, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.1388854980469, |
| "epoch": 0.1456726649528706, |
| "grad_norm": 0.6199227571487427, |
| "kl": 0.01055908203125, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": -0.0255, |
| "reward": 0.5694444626569748, |
| "reward_std": 0.33752935379743576, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.3611221313477, |
| "epoch": 0.14738646101113967, |
| "grad_norm": 0.8055247068405151, |
| "kl": 0.0143890380859375, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": -0.0329, |
| "reward": 0.972222238779068, |
| "reward_std": 0.5578342527151108, |
| "rewards/accuracy_reward": 0.2500000046566129, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 870.9166870117188, |
| "epoch": 0.14910025706940874, |
| "grad_norm": 0.7032153010368347, |
| "kl": 0.015716552734375, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.1143, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.29449621587991714, |
| "rewards/accuracy_reward": 0.11111111473292112, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 809.2361297607422, |
| "epoch": 0.15081405312767782, |
| "grad_norm": 0.6152629256248474, |
| "kl": 0.0126953125, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": 0.0084, |
| "reward": 0.777777798473835, |
| "reward_std": 0.45732562988996506, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.125, |
| "epoch": 0.15252784918594686, |
| "grad_norm": 0.4119075536727905, |
| "kl": 0.0131683349609375, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": 0.006, |
| "reward": 0.7569444552063942, |
| "reward_std": 0.37087361328303814, |
| "rewards/accuracy_reward": 0.13888889364898205, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.9861221313477, |
| "epoch": 0.15424164524421594, |
| "grad_norm": 0.6751371622085571, |
| "kl": 0.01473236083984375, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0284, |
| "reward": 0.6805555671453476, |
| "reward_std": 0.2974403705447912, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 676.0000152587891, |
| "epoch": 0.155955441302485, |
| "grad_norm": 0.4955655634403229, |
| "kl": 0.00868988037109375, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": 0.0648, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.38056252896785736, |
| "rewards/accuracy_reward": 0.19444445054978132, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.569450378418, |
| "epoch": 0.15766923736075408, |
| "grad_norm": 0.6032452583312988, |
| "kl": 0.0105133056640625, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": 0.0959, |
| "reward": 1.0208333432674408, |
| "reward_std": 0.41651279479265213, |
| "rewards/accuracy_reward": 0.2638888992369175, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.9722290039062, |
| "epoch": 0.15938303341902313, |
| "grad_norm": 0.5042760968208313, |
| "kl": 0.010833740234375, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": 0.0277, |
| "reward": 0.5972222089767456, |
| "reward_std": 0.25616975128650665, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.8750076293945, |
| "epoch": 0.1610968294772922, |
| "grad_norm": 1.230424165725708, |
| "kl": 0.01520538330078125, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": 0.0236, |
| "reward": 0.7291666716337204, |
| "reward_std": 0.3527771979570389, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.2500152587891, |
| "epoch": 0.16281062553556128, |
| "grad_norm": 0.874912440776825, |
| "kl": 0.014312744140625, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": -0.0369, |
| "reward": 0.8194444477558136, |
| "reward_std": 0.24447975307703018, |
| "rewards/accuracy_reward": 0.16666666883975267, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.3333435058594, |
| "epoch": 0.16452442159383032, |
| "grad_norm": 0.4739597737789154, |
| "kl": 0.00701904296875, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": -0.0386, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.4169478937983513, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4652777761220932, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.5833435058594, |
| "epoch": 0.1662382176520994, |
| "grad_norm": 0.5326427817344666, |
| "kl": 0.0096435546875, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0463, |
| "reward": 0.6458333507180214, |
| "reward_std": 0.29642581194639206, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 687.944450378418, |
| "epoch": 0.16795201371036847, |
| "grad_norm": 0.41333743929862976, |
| "kl": 0.0089111328125, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": 0.0199, |
| "reward": 0.7916666716337204, |
| "reward_std": 0.32597118616104126, |
| "rewards/accuracy_reward": 0.15277778450399637, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.2638854980469, |
| "epoch": 0.16966580976863754, |
| "grad_norm": 0.4722951054573059, |
| "kl": 0.0088348388671875, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": 0.036, |
| "reward": 0.7291666567325592, |
| "reward_std": 0.43501005321741104, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.4722213745117, |
| "epoch": 0.1713796058269066, |
| "grad_norm": 0.4890177547931671, |
| "kl": 0.0101318359375, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": -0.0005, |
| "reward": 0.8333333283662796, |
| "reward_std": 0.4303314909338951, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 803.6388702392578, |
| "epoch": 0.17309340188517566, |
| "grad_norm": 0.5259881019592285, |
| "kl": 0.0117645263671875, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": 0.0197, |
| "reward": 0.791666679084301, |
| "reward_std": 0.39858745597302914, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.625, |
| "epoch": 0.17480719794344474, |
| "grad_norm": 0.565196692943573, |
| "kl": 0.0084381103515625, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": -0.0314, |
| "reward": 0.7430555671453476, |
| "reward_std": 0.4483235850930214, |
| "rewards/accuracy_reward": 0.13888888992369175, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.8611145019531, |
| "epoch": 0.17652099400171378, |
| "grad_norm": 0.30295926332473755, |
| "kl": 0.010772705078125, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": 0.0335, |
| "reward": 1.0208333283662796, |
| "reward_std": 0.31875650584697723, |
| "rewards/accuracy_reward": 0.2638888917863369, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.1389007568359, |
| "epoch": 0.17823479005998286, |
| "grad_norm": 0.45252788066864014, |
| "kl": 0.01111602783203125, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": 0.0369, |
| "reward": 0.5069444477558136, |
| "reward_std": 0.11907241307199001, |
| "rewards/accuracy_reward": 0.013888888992369175, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.9444580078125, |
| "epoch": 0.17994858611825193, |
| "grad_norm": 0.800912618637085, |
| "kl": 0.0160675048828125, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0112, |
| "reward": 0.7986111044883728, |
| "reward_std": 0.4165128022432327, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.4027862548828, |
| "epoch": 0.181662382176521, |
| "grad_norm": 0.5626226663589478, |
| "kl": 0.0115509033203125, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.003, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.46130844950675964, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.5833435058594, |
| "epoch": 0.18337617823479005, |
| "grad_norm": 0.566942572593689, |
| "kl": 0.0107421875, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": -0.0022, |
| "reward": 1.0833333432674408, |
| "reward_std": 0.41200654953718185, |
| "rewards/accuracy_reward": 0.3055555671453476, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.1666717529297, |
| "epoch": 0.18508997429305912, |
| "grad_norm": 0.46719685196876526, |
| "kl": 0.0086517333984375, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.008, |
| "reward": 0.5277777835726738, |
| "reward_std": 0.19162002205848694, |
| "rewards/accuracy_reward": 0.02777777798473835, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.8194580078125, |
| "epoch": 0.1868037703513282, |
| "grad_norm": 0.5135090351104736, |
| "kl": 0.011474609375, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": 0.0236, |
| "reward": 0.722222238779068, |
| "reward_std": 0.19795495830476284, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.9305725097656, |
| "epoch": 0.18851756640959727, |
| "grad_norm": 0.6838110089302063, |
| "kl": 0.00803375244140625, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": -0.0496, |
| "reward": 0.6666666567325592, |
| "reward_std": 0.44429811835289, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.0555725097656, |
| "epoch": 0.19023136246786632, |
| "grad_norm": 0.47608956694602966, |
| "kl": 0.009918212890625, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": 0.0029, |
| "reward": 0.6874999850988388, |
| "reward_std": 0.2559359297156334, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.0139007568359, |
| "epoch": 0.1919451585261354, |
| "grad_norm": 0.34519946575164795, |
| "kl": 0.00824737548828125, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.011, |
| "reward": 0.6875, |
| "reward_std": 0.31875649094581604, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.2083435058594, |
| "epoch": 0.19365895458440446, |
| "grad_norm": 0.5883038640022278, |
| "kl": 0.00905609130859375, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0809, |
| "reward": 0.9236111044883728, |
| "reward_std": 0.5362608954310417, |
| "rewards/accuracy_reward": 0.22222222667187452, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.2639007568359, |
| "epoch": 0.1953727506426735, |
| "grad_norm": 0.6275684237480164, |
| "kl": 0.0088043212890625, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": -0.0049, |
| "reward": 0.6319444477558136, |
| "reward_std": 0.25718430429697037, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.3611145019531, |
| "epoch": 0.19708654670094258, |
| "grad_norm": 0.5656007528305054, |
| "kl": 0.0132293701171875, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": -0.0101, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.3016466051340103, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 882.1666564941406, |
| "epoch": 0.19880034275921166, |
| "grad_norm": 0.34537801146507263, |
| "kl": 0.0095062255859375, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": 0.0496, |
| "reward": 0.770833320915699, |
| "reward_std": 0.32522569596767426, |
| "rewards/accuracy_reward": 0.1388888917863369, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.0138931274414, |
| "epoch": 0.20051413881748073, |
| "grad_norm": 0.36981117725372314, |
| "kl": 0.0112152099609375, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": -0.002, |
| "reward": 0.9097222089767456, |
| "reward_std": 0.31875649094581604, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.069450378418, |
| "epoch": 0.20222793487574978, |
| "grad_norm": 0.5904315710067749, |
| "kl": 0.0120697021484375, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": 0.0093, |
| "reward": 0.7361111044883728, |
| "reward_std": 0.34745684266090393, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.7361221313477, |
| "epoch": 0.20394173093401885, |
| "grad_norm": 0.6722186803817749, |
| "kl": 0.0094146728515625, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": 0.0115, |
| "reward": 1.0000000149011612, |
| "reward_std": 0.4072999134659767, |
| "rewards/accuracy_reward": 0.26388889644294977, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.5277862548828, |
| "epoch": 0.20565552699228792, |
| "grad_norm": 0.5796108245849609, |
| "kl": 0.009918212890625, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": -0.0049, |
| "reward": 0.798611119389534, |
| "reward_std": 0.5176598504185677, |
| "rewards/accuracy_reward": 0.15277778264135122, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.8750076293945, |
| "epoch": 0.207369323050557, |
| "grad_norm": 0.42695531249046326, |
| "kl": 0.0109405517578125, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": -0.0034, |
| "reward": 0.9166666865348816, |
| "reward_std": 0.3762567415833473, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.5, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.5833435058594, |
| "epoch": 0.20908311910882604, |
| "grad_norm": 0.3627341389656067, |
| "kl": 0.0103912353515625, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.0006, |
| "reward": 0.75, |
| "reward_std": 0.3134361356496811, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.5, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.1249923706055, |
| "epoch": 0.21079691516709512, |
| "grad_norm": 0.4604179263114929, |
| "kl": 0.0093231201171875, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": 0.0102, |
| "reward": 1.0208333432674408, |
| "reward_std": 0.3769379239529371, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.8472290039062, |
| "epoch": 0.2125107112253642, |
| "grad_norm": 0.48651084303855896, |
| "kl": 0.008941650390625, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": 0.0067, |
| "reward": 0.6875, |
| "reward_std": 0.3304464891552925, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.0972213745117, |
| "epoch": 0.21422450728363324, |
| "grad_norm": 0.9337839484214783, |
| "kl": 0.0222625732421875, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0038, |
| "reward": 0.9999999850988388, |
| "reward_std": 0.48787199705839157, |
| "rewards/accuracy_reward": 0.2638888908550143, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 683.7639007568359, |
| "epoch": 0.2159383033419023, |
| "grad_norm": 0.5181577801704407, |
| "kl": 0.0086822509765625, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": 0.0582, |
| "reward": 0.7638889029622078, |
| "reward_std": 0.4154982175678015, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.3888854980469, |
| "epoch": 0.21765209940017138, |
| "grad_norm": 0.4001893401145935, |
| "kl": 0.01253509521484375, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": 0.0222, |
| "reward": 1.0, |
| "reward_std": 0.30821534991264343, |
| "rewards/accuracy_reward": 0.2500000046566129, |
| "rewards/format_reward": 0.5, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.9444351196289, |
| "epoch": 0.21936589545844046, |
| "grad_norm": 0.4341527819633484, |
| "kl": 0.01218414306640625, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": -0.0215, |
| "reward": 0.9375, |
| "reward_std": 0.33678142726421356, |
| "rewards/accuracy_reward": 0.22222222574055195, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.3055725097656, |
| "epoch": 0.2210796915167095, |
| "grad_norm": 1.436949610710144, |
| "kl": 0.02315521240234375, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": -0.0426, |
| "reward": 0.923611119389534, |
| "reward_std": 0.435094453394413, |
| "rewards/accuracy_reward": 0.22222222294658422, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.3333511352539, |
| "epoch": 0.22279348757497858, |
| "grad_norm": 0.7487984895706177, |
| "kl": 0.0170440673828125, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": -0.0057, |
| "reward": 1.3680555522441864, |
| "reward_std": 0.4477668162435293, |
| "rewards/accuracy_reward": 0.4444444477558136, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 862.7222442626953, |
| "epoch": 0.22450728363324765, |
| "grad_norm": 0.38078296184539795, |
| "kl": 0.01318359375, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": 0.0882, |
| "reward": 0.7083333283662796, |
| "reward_std": 0.3526776432991028, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.3888854980469, |
| "epoch": 0.2262210796915167, |
| "grad_norm": 0.5312814712524414, |
| "kl": 0.015228271484375, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0104, |
| "reward": 0.5902777686715126, |
| "reward_std": 0.26067597232759, |
| "rewards/accuracy_reward": 0.055555556900799274, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.4722290039062, |
| "epoch": 0.22793487574978577, |
| "grad_norm": 1.1302504539489746, |
| "kl": 0.022735595703125, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": 0.062, |
| "reward": 0.8263889029622078, |
| "reward_std": 0.23915939591825008, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.6527709960938, |
| "epoch": 0.22964867180805484, |
| "grad_norm": 0.5044618844985962, |
| "kl": 0.0118408203125, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": 0.0375, |
| "reward": 0.9930555671453476, |
| "reward_std": 0.5024402439594269, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 604.6944427490234, |
| "epoch": 0.23136246786632392, |
| "grad_norm": 0.4981021285057068, |
| "kl": 0.0177459716796875, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0255, |
| "reward": 0.965277761220932, |
| "reward_std": 0.3830488696694374, |
| "rewards/accuracy_reward": 0.23611111659556627, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.2777824401855, |
| "epoch": 0.23307626392459296, |
| "grad_norm": 1.7722994089126587, |
| "kl": 0.03118896484375, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0292, |
| "reward": 1.145833358168602, |
| "reward_std": 0.46534085273742676, |
| "rewards/accuracy_reward": 0.3333333320915699, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 779.9444580078125, |
| "epoch": 0.23479005998286204, |
| "grad_norm": 0.5337279438972473, |
| "kl": 0.01290130615234375, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": -0.0092, |
| "reward": 0.9583333507180214, |
| "reward_std": 0.39858742617070675, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 704.3889007568359, |
| "epoch": 0.2365038560411311, |
| "grad_norm": 0.47012683749198914, |
| "kl": 0.01031494140625, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": 0.0208, |
| "reward": 0.729166679084301, |
| "reward_std": 0.32953148148953915, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.3472366333008, |
| "epoch": 0.23821765209940018, |
| "grad_norm": 0.5425974726676941, |
| "kl": 0.01230621337890625, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": 0.0068, |
| "reward": 0.6319444477558136, |
| "reward_std": 0.17111803591251373, |
| "rewards/accuracy_reward": 0.06944444589316845, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.319465637207, |
| "epoch": 0.23993144815766923, |
| "grad_norm": 0.4788627028465271, |
| "kl": 0.012969970703125, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0135, |
| "reward": 1.0694444477558136, |
| "reward_std": 0.3177530914545059, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.7222290039062, |
| "epoch": 0.2416452442159383, |
| "grad_norm": 0.39043956995010376, |
| "kl": 0.0106658935546875, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": 0.0037, |
| "reward": 0.8472222238779068, |
| "reward_std": 0.4586464837193489, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.2222290039062, |
| "epoch": 0.24335904027420738, |
| "grad_norm": 0.1789097785949707, |
| "kl": 0.01094818115234375, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": 0.0154, |
| "reward": 0.6944444328546524, |
| "reward_std": 0.06804138422012329, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.5, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 771.5555572509766, |
| "epoch": 0.24507283633247642, |
| "grad_norm": 0.4400465488433838, |
| "kl": 0.0099334716796875, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": 0.155, |
| "reward": 0.7500000223517418, |
| "reward_std": 0.4042903557419777, |
| "rewards/accuracy_reward": 0.13888889364898205, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 851.3611145019531, |
| "epoch": 0.2467866323907455, |
| "grad_norm": 0.6633160710334778, |
| "kl": 0.01108551025390625, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": 0.0257, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.5133540704846382, |
| "rewards/accuracy_reward": 0.15277778264135122, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.4722290039062, |
| "epoch": 0.24850042844901457, |
| "grad_norm": 0.46885278820991516, |
| "kl": 0.0117034912109375, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": -0.0231, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.2221490517258644, |
| "rewards/accuracy_reward": 0.1388888955116272, |
| "rewards/format_reward": 0.5, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.0555572509766, |
| "epoch": 0.25021422450728364, |
| "grad_norm": 0.49422281980514526, |
| "kl": 0.010040283203125, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": 0.0101, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.43057897686958313, |
| "rewards/accuracy_reward": 0.19444445054978132, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.7639007568359, |
| "epoch": 0.2519280205655527, |
| "grad_norm": 0.4453893005847931, |
| "kl": 0.0105438232421875, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": 0.0151, |
| "reward": 0.7291666641831398, |
| "reward_std": 0.41478364542126656, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.7500152587891, |
| "epoch": 0.2536418166238218, |
| "grad_norm": 0.4301265776157379, |
| "kl": 0.0096435546875, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": -0.0056, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.33566733449697495, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 757.3611221313477, |
| "epoch": 0.25535561268209084, |
| "grad_norm": 0.4108283221721649, |
| "kl": 0.0125579833984375, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.007, |
| "reward": 0.5555555522441864, |
| "reward_std": 0.20964494906365871, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.3611297607422, |
| "epoch": 0.2570694087403599, |
| "grad_norm": 0.3349432647228241, |
| "kl": 0.0099334716796875, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0087, |
| "reward": 1.111111119389534, |
| "reward_std": 0.3082153648138046, |
| "rewards/accuracy_reward": 0.3055555550381541, |
| "rewards/format_reward": 0.5, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.1805648803711, |
| "epoch": 0.258783204798629, |
| "grad_norm": 0.4955935776233673, |
| "kl": 0.01495361328125, |
| "learning_rate": 8.926922383915315e-07, |
| "loss": -0.0025, |
| "reward": 0.6597222238779068, |
| "reward_std": 0.3041820004582405, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 778.1389007568359, |
| "epoch": 0.26049700085689803, |
| "grad_norm": 0.46859902143478394, |
| "kl": 0.01029205322265625, |
| "learning_rate": 8.906477750432903e-07, |
| "loss": 0.0761, |
| "reward": 0.8125, |
| "reward_std": 0.38506873697042465, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.8194427490234, |
| "epoch": 0.2622107969151671, |
| "grad_norm": 0.5361355543136597, |
| "kl": 0.0163421630859375, |
| "learning_rate": 8.88586709003076e-07, |
| "loss": 0.1016, |
| "reward": 0.9861111268401146, |
| "reward_std": 0.4283023551106453, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.5000076293945, |
| "epoch": 0.2639245929734362, |
| "grad_norm": 0.6667534708976746, |
| "kl": 0.015716552734375, |
| "learning_rate": 8.865091407243394e-07, |
| "loss": -0.0517, |
| "reward": 0.888888880610466, |
| "reward_std": 0.4442981034517288, |
| "rewards/accuracy_reward": 0.19444444496184587, |
| "rewards/format_reward": 0.5, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.5138854980469, |
| "epoch": 0.2656383890317052, |
| "grad_norm": 0.5823290944099426, |
| "kl": 0.0100250244140625, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0032, |
| "reward": 0.9652777761220932, |
| "reward_std": 0.3304465189576149, |
| "rewards/accuracy_reward": 0.23611111752688885, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 699.7500152587891, |
| "epoch": 0.26735218508997427, |
| "grad_norm": 0.3316850960254669, |
| "kl": 0.00982666015625, |
| "learning_rate": 8.823049032816478e-07, |
| "loss": 0.0064, |
| "reward": 0.798611119389534, |
| "reward_std": 0.19436374306678772, |
| "rewards/accuracy_reward": 0.1527777798473835, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.5833358764648, |
| "epoch": 0.26906598114824337, |
| "grad_norm": 0.43551284074783325, |
| "kl": 0.0104827880859375, |
| "learning_rate": 8.801784390262943e-07, |
| "loss": 0.0163, |
| "reward": 1.013888880610466, |
| "reward_std": 0.34745684266090393, |
| "rewards/accuracy_reward": 0.2638888927176595, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.9027862548828, |
| "epoch": 0.2707797772065124, |
| "grad_norm": 0.3160940110683441, |
| "kl": 0.00923919677734375, |
| "learning_rate": 8.780358823396352e-07, |
| "loss": -0.0074, |
| "reward": 0.604166679084301, |
| "reward_std": 0.17633883468806744, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 762.0138854980469, |
| "epoch": 0.27249357326478146, |
| "grad_norm": 0.8683849573135376, |
| "kl": 0.02294921875, |
| "learning_rate": 8.758773376468604e-07, |
| "loss": 0.0224, |
| "reward": 0.597222238779068, |
| "reward_std": 0.25616976991295815, |
| "rewards/accuracy_reward": 0.06944444589316845, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.625, |
| "epoch": 0.27420736932305056, |
| "grad_norm": 0.44514918327331543, |
| "kl": 0.00907135009765625, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0111, |
| "reward": 1.0138888955116272, |
| "reward_std": 0.32083219289779663, |
| "rewards/accuracy_reward": 0.2638888927176595, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.7778015136719, |
| "epoch": 0.2759211653813196, |
| "grad_norm": 0.5343595743179321, |
| "kl": 0.0099029541015625, |
| "learning_rate": 8.715127058347614e-07, |
| "loss": -0.0242, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.4600205048918724, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.6527938842773, |
| "epoch": 0.2776349614395887, |
| "grad_norm": 0.24237516522407532, |
| "kl": 0.00861358642578125, |
| "learning_rate": 8.693068314414344e-07, |
| "loss": 0.0059, |
| "reward": 0.777777761220932, |
| "reward_std": 0.17213259637355804, |
| "rewards/accuracy_reward": 0.1388888917863369, |
| "rewards/format_reward": 0.5, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.5416717529297, |
| "epoch": 0.27934875749785776, |
| "grad_norm": 0.35996633768081665, |
| "kl": 0.0109405517578125, |
| "learning_rate": 8.670853944836176e-07, |
| "loss": 0.0226, |
| "reward": 0.652777798473835, |
| "reward_std": 0.2794154789298773, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 563.8611221313477, |
| "epoch": 0.2810625535561268, |
| "grad_norm": 0.41780418157577515, |
| "kl": 0.011383056640625, |
| "learning_rate": 8.648485032310144e-07, |
| "loss": -0.0036, |
| "reward": 0.8611111342906952, |
| "reward_std": 0.3134361729025841, |
| "rewards/accuracy_reward": 0.18055555690079927, |
| "rewards/format_reward": 0.5, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.6388931274414, |
| "epoch": 0.2827763496143959, |
| "grad_norm": 0.34387895464897156, |
| "kl": 0.011260986328125, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": -0.011, |
| "reward": 0.6111111044883728, |
| "reward_std": 0.15932847559452057, |
| "rewards/accuracy_reward": 0.055555556900799274, |
| "rewards/format_reward": 0.5, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 784.4444580078125, |
| "epoch": 0.28449014567266495, |
| "grad_norm": 0.47405433654785156, |
| "kl": 0.00818634033203125, |
| "learning_rate": 8.603287946810513e-07, |
| "loss": 0.0147, |
| "reward": 0.8055555671453476, |
| "reward_std": 0.5355970486998558, |
| "rewards/accuracy_reward": 0.16666666883975267, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.5833282470703, |
| "epoch": 0.286203941730934, |
| "grad_norm": 0.43140318989753723, |
| "kl": 0.008544921875, |
| "learning_rate": 8.580461976679099e-07, |
| "loss": 0.0226, |
| "reward": 0.7638888955116272, |
| "reward_std": 0.4538358449935913, |
| "rewards/accuracy_reward": 0.13888889364898205, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 789.8472290039062, |
| "epoch": 0.2879177377892031, |
| "grad_norm": 0.7944321632385254, |
| "kl": 0.01107025146484375, |
| "learning_rate": 8.557485869176825e-07, |
| "loss": 0.0802, |
| "reward": 0.680555559694767, |
| "reward_std": 0.28722215443849564, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 594.7222366333008, |
| "epoch": 0.28963153384747214, |
| "grad_norm": 0.47057613730430603, |
| "kl": 0.01092529296875, |
| "learning_rate": 8.534360744126753e-07, |
| "loss": -0.0071, |
| "reward": 0.8263888880610466, |
| "reward_std": 0.2624051198363304, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.5000076293945, |
| "epoch": 0.2913453299057412, |
| "grad_norm": 0.48645710945129395, |
| "kl": 0.0149688720703125, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0024, |
| "reward": 0.965277761220932, |
| "reward_std": 0.37693794071674347, |
| "rewards/accuracy_reward": 0.23611111380159855, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.7916717529297, |
| "epoch": 0.2930591259640103, |
| "grad_norm": 0.4424607753753662, |
| "kl": 0.00809478759765625, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": 0.0969, |
| "reward": 0.7291666865348816, |
| "reward_std": 0.2601025812327862, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.7361297607422, |
| "epoch": 0.29477292202227934, |
| "grad_norm": 0.4389359652996063, |
| "kl": 0.00835418701171875, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": 0.0355, |
| "reward": 0.7430555447936058, |
| "reward_std": 0.24438021332025528, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.6111068725586, |
| "epoch": 0.29648671808054844, |
| "grad_norm": 0.6182007193565369, |
| "kl": 0.013397216796875, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": 0.0092, |
| "reward": 0.7777777761220932, |
| "reward_std": 0.4936107471585274, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 690.2222290039062, |
| "epoch": 0.2982005141388175, |
| "grad_norm": 0.30457955598831177, |
| "kl": 0.0081939697265625, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": 0.0026, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.23915940523147583, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.6527938842773, |
| "epoch": 0.29991431019708653, |
| "grad_norm": 0.3551907241344452, |
| "kl": 0.010650634765625, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0106, |
| "reward": 0.5833333283662796, |
| "reward_std": 0.15410767495632172, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.7916717529297, |
| "epoch": 0.30162810625535563, |
| "grad_norm": 0.4258103370666504, |
| "kl": 0.0097503662109375, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.0108, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.27087756246328354, |
| "rewards/accuracy_reward": 0.11111111473292112, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.7916564941406, |
| "epoch": 0.3033419023136247, |
| "grad_norm": 0.38482344150543213, |
| "kl": 0.009063720703125, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0025, |
| "reward": 0.7986111268401146, |
| "reward_std": 0.416512792930007, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.5555572509766, |
| "epoch": 0.3050556983718937, |
| "grad_norm": 0.4185694456100464, |
| "kl": 0.00726318359375, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": 0.0183, |
| "reward": 0.7430555522441864, |
| "reward_std": 0.3815770819783211, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 740.5277709960938, |
| "epoch": 0.3067694944301628, |
| "grad_norm": 0.4638974070549011, |
| "kl": 0.00782012939453125, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": -0.0087, |
| "reward": 0.7083333507180214, |
| "reward_std": 0.4054961260408163, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.0416641235352, |
| "epoch": 0.30848329048843187, |
| "grad_norm": 0.5336411595344543, |
| "kl": 0.0118560791015625, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0215, |
| "reward": 0.8333333432674408, |
| "reward_std": 0.358231820166111, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.5, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.4027862548828, |
| "epoch": 0.3101970865467009, |
| "grad_norm": 0.5019733905792236, |
| "kl": 0.0130157470703125, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": -0.0101, |
| "reward": 0.75, |
| "reward_std": 0.33668188750743866, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.5, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.3889007568359, |
| "epoch": 0.31191088260497, |
| "grad_norm": 0.36695027351379395, |
| "kl": 0.0102386474609375, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": 0.0083, |
| "reward": 0.7291666716337204, |
| "reward_std": 0.30504853278398514, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.8055648803711, |
| "epoch": 0.31362467866323906, |
| "grad_norm": 0.5278235077857971, |
| "kl": 0.0111541748046875, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.0004, |
| "reward": 0.9166666716337204, |
| "reward_std": 0.46232304722070694, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/format_reward": 0.5, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.9861221313477, |
| "epoch": 0.31533847472150817, |
| "grad_norm": 0.48935696482658386, |
| "kl": 0.0121612548828125, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.0721, |
| "reward": 0.7847222089767456, |
| "reward_std": 0.3912379518151283, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.9166793823242, |
| "epoch": 0.3170522707797772, |
| "grad_norm": 0.5268736481666565, |
| "kl": 0.0102691650390625, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": 0.0095, |
| "reward": 1.2916667014360428, |
| "reward_std": 0.7127073556184769, |
| "rewards/accuracy_reward": 0.4027777947485447, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.0972290039062, |
| "epoch": 0.31876606683804626, |
| "grad_norm": 0.3435427248477936, |
| "kl": 0.0112152099609375, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": -0.0147, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.3314610570669174, |
| "rewards/accuracy_reward": 0.11111111473292112, |
| "rewards/format_reward": 0.5, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.9722290039062, |
| "epoch": 0.32047986289631536, |
| "grad_norm": 0.5294600129127502, |
| "kl": 0.01114654541015625, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": -0.0126, |
| "reward": 1.0277777910232544, |
| "reward_std": 0.6216515377163887, |
| "rewards/accuracy_reward": 0.26388888992369175, |
| "rewards/format_reward": 0.5, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.2639007568359, |
| "epoch": 0.3221936589545844, |
| "grad_norm": 0.5934704542160034, |
| "kl": 0.015716552734375, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": -0.0676, |
| "reward": 0.736111119389534, |
| "reward_std": 0.3602609410881996, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.2638854980469, |
| "epoch": 0.32390745501285345, |
| "grad_norm": 0.29051750898361206, |
| "kl": 0.012054443359375, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": 0.0144, |
| "reward": 0.6666666567325592, |
| "reward_std": 0.2453947737812996, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.5, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 791.9305572509766, |
| "epoch": 0.32562125107112255, |
| "grad_norm": 0.6270461082458496, |
| "kl": 0.00994110107421875, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.1261, |
| "reward": 0.6319444552063942, |
| "reward_std": 0.306252408772707, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 759.0139007568359, |
| "epoch": 0.3273350471293916, |
| "grad_norm": 0.4964490532875061, |
| "kl": 0.0179290771484375, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": 0.0119, |
| "reward": 0.9027777910232544, |
| "reward_std": 0.5300310179591179, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 513.6249923706055, |
| "epoch": 0.32904884318766064, |
| "grad_norm": 0.5976383090019226, |
| "kl": 0.0132598876953125, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": -0.0007, |
| "reward": 1.0555555820465088, |
| "reward_std": 0.530364416539669, |
| "rewards/accuracy_reward": 0.2777777798473835, |
| "rewards/format_reward": 0.5, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.6250152587891, |
| "epoch": 0.33076263924592975, |
| "grad_norm": 0.6004844903945923, |
| "kl": 0.0198211669921875, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": 0.0022, |
| "reward": 0.9375000149011612, |
| "reward_std": 0.32522570341825485, |
| "rewards/accuracy_reward": 0.22222222667187452, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.7361068725586, |
| "epoch": 0.3324764353041988, |
| "grad_norm": 0.5730006098747253, |
| "kl": 0.021759033203125, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": 0.0095, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.29541123658418655, |
| "rewards/accuracy_reward": 0.13888888992369175, |
| "rewards/format_reward": 0.5, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 774.7083435058594, |
| "epoch": 0.3341902313624679, |
| "grad_norm": 0.42059004306793213, |
| "kl": 0.01324462890625, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0033, |
| "reward": 0.6666666567325592, |
| "reward_std": 0.29541125148534775, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.5, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 683.8888854980469, |
| "epoch": 0.33590402742073694, |
| "grad_norm": 0.3196467459201813, |
| "kl": 0.0126495361328125, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": 0.0156, |
| "reward": 0.8819444626569748, |
| "reward_std": 0.25071514397859573, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 793.1388854980469, |
| "epoch": 0.337617823479006, |
| "grad_norm": 0.6156473755836487, |
| "kl": 0.01934814453125, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": 0.0594, |
| "reward": 0.8819444552063942, |
| "reward_std": 0.4707336239516735, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.7777862548828, |
| "epoch": 0.3393316195372751, |
| "grad_norm": 0.5810103416442871, |
| "kl": 0.0160675048828125, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.0897, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.4387439265847206, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 739.5694580078125, |
| "epoch": 0.34104541559554413, |
| "grad_norm": 0.4028480350971222, |
| "kl": 0.0103607177734375, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": 0.0106, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.30821534991264343, |
| "rewards/accuracy_reward": 0.11111111473292112, |
| "rewards/format_reward": 0.5, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.2083587646484, |
| "epoch": 0.3427592116538132, |
| "grad_norm": 0.5337042808532715, |
| "kl": 0.0111236572265625, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0084, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.37828588485717773, |
| "rewards/accuracy_reward": 0.1111111119389534, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 755.25, |
| "epoch": 0.3444730077120823, |
| "grad_norm": 0.5041696429252625, |
| "kl": 0.00969696044921875, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": -0.0162, |
| "reward": 0.8541666567325592, |
| "reward_std": 0.4137297794222832, |
| "rewards/accuracy_reward": 0.19444444682449102, |
| "rewards/format_reward": 0.4652777761220932, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.0277862548828, |
| "epoch": 0.3461868037703513, |
| "grad_norm": 0.4631847143173218, |
| "kl": 0.0138092041015625, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.023, |
| "reward": 0.8541666716337204, |
| "reward_std": 0.41651278734207153, |
| "rewards/accuracy_reward": 0.18055556062608957, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.7361145019531, |
| "epoch": 0.34790059982862037, |
| "grad_norm": 0.5321578979492188, |
| "kl": 0.01479339599609375, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.106, |
| "reward": 1.1458333730697632, |
| "reward_std": 0.6571117714047432, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.3194427490234, |
| "epoch": 0.3496143958868895, |
| "grad_norm": 0.3890639841556549, |
| "kl": 0.012969970703125, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": -0.0073, |
| "reward": 0.8541666716337204, |
| "reward_std": 0.2211344763636589, |
| "rewards/accuracy_reward": 0.18055556155741215, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.2638931274414, |
| "epoch": 0.3513281919451585, |
| "grad_norm": 0.5081583857536316, |
| "kl": 0.01397705078125, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0108, |
| "reward": 0.9166666865348816, |
| "reward_std": 0.32624027878046036, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.5, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.0694580078125, |
| "epoch": 0.35304198800342756, |
| "grad_norm": 0.48619431257247925, |
| "kl": 0.0098114013671875, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": -0.005, |
| "reward": 0.7361111044883728, |
| "reward_std": 0.32421112060546875, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.4305572509766, |
| "epoch": 0.35475578406169667, |
| "grad_norm": 0.4051726162433624, |
| "kl": 0.01464080810546875, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": -0.0038, |
| "reward": 1.1944444626569748, |
| "reward_std": 0.3134361505508423, |
| "rewards/accuracy_reward": 0.3472222350537777, |
| "rewards/format_reward": 0.5, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 680.7916793823242, |
| "epoch": 0.3564695801199657, |
| "grad_norm": 0.5050489902496338, |
| "kl": 0.0126953125, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": 0.0505, |
| "reward": 0.7916666567325592, |
| "reward_std": 0.32421112060546875, |
| "rewards/accuracy_reward": 0.15277777798473835, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.4722442626953, |
| "epoch": 0.3581833761782348, |
| "grad_norm": 0.29651615023612976, |
| "kl": 0.00934600830078125, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": -0.0032, |
| "reward": 0.6666666567325592, |
| "reward_std": 0.25819889456033707, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.5, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.0, |
| "epoch": 0.35989717223650386, |
| "grad_norm": 0.8540976643562317, |
| "kl": 0.028778076171875, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0138, |
| "reward": 0.7708333283662796, |
| "reward_std": 0.46130845695734024, |
| "rewards/accuracy_reward": 0.1388888917863369, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 965.8055725097656, |
| "epoch": 0.3616109682947729, |
| "grad_norm": 0.3300262689590454, |
| "kl": 0.0074920654296875, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": 0.0311, |
| "reward": 0.6527777835726738, |
| "reward_std": 0.17010344564914703, |
| "rewards/accuracy_reward": 0.08333333674818277, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.5555572509766, |
| "epoch": 0.363324764353042, |
| "grad_norm": 0.5068047642707825, |
| "kl": 0.012664794921875, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": -0.0061, |
| "reward": 0.9375, |
| "reward_std": 0.4217336028814316, |
| "rewards/accuracy_reward": 0.22222222480922937, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 909.013916015625, |
| "epoch": 0.36503856041131105, |
| "grad_norm": 0.3107520341873169, |
| "kl": 0.00826263427734375, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": 0.0477, |
| "reward": 0.7361111044883728, |
| "reward_std": 0.34775684773921967, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.1805572509766, |
| "epoch": 0.3667523564695801, |
| "grad_norm": 0.5627197027206421, |
| "kl": 0.00879669189453125, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": 0.0545, |
| "reward": 0.958333358168602, |
| "reward_std": 0.5707200393080711, |
| "rewards/accuracy_reward": 0.23611111659556627, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.1388854980469, |
| "epoch": 0.3684661525278492, |
| "grad_norm": 0.5161833763122559, |
| "kl": 0.01049041748046875, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": -0.0212, |
| "reward": 1.1388889104127884, |
| "reward_std": 0.5355852097272873, |
| "rewards/accuracy_reward": 0.3194444486871362, |
| "rewards/format_reward": 0.5, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.9722290039062, |
| "epoch": 0.37017994858611825, |
| "grad_norm": 0.4373010993003845, |
| "kl": 0.00847625732421875, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.0195, |
| "reward": 0.7430555447936058, |
| "reward_std": 0.30720078758895397, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.6944580078125, |
| "epoch": 0.3718937446443873, |
| "grad_norm": 0.49228960275650024, |
| "kl": 0.009765625, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": 0.0262, |
| "reward": 0.8958333432674408, |
| "reward_std": 0.46008094400167465, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 717.819450378418, |
| "epoch": 0.3736075407026564, |
| "grad_norm": 0.43165305256843567, |
| "kl": 0.0099334716796875, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": 0.0048, |
| "reward": 0.8124999925494194, |
| "reward_std": 0.29642581194639206, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 838.875, |
| "epoch": 0.37532133676092544, |
| "grad_norm": 0.4618090093135834, |
| "kl": 0.00933837890625, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": 0.0164, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.466628834605217, |
| "rewards/accuracy_reward": 0.11111111380159855, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 847.2222290039062, |
| "epoch": 0.37703513281919454, |
| "grad_norm": 0.12107283622026443, |
| "kl": 0.00963592529296875, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": -0.0002, |
| "reward": 0.6944444328546524, |
| "reward_std": 0.0680413767695427, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.5, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.3472290039062, |
| "epoch": 0.3787489288774636, |
| "grad_norm": 0.15903827548027039, |
| "kl": 0.00983428955078125, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": 0.0021, |
| "reward": 0.5277777761220932, |
| "reward_std": 0.0680413767695427, |
| "rewards/accuracy_reward": 0.013888888992369175, |
| "rewards/format_reward": 0.5, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 745.5694427490234, |
| "epoch": 0.38046272493573263, |
| "grad_norm": 0.331328809261322, |
| "kl": 0.01165771484375, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.012, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.2549325004220009, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.9027862548828, |
| "epoch": 0.38217652099400173, |
| "grad_norm": 0.43389493227005005, |
| "kl": 0.0093536376953125, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.0764, |
| "reward": 0.6250000074505806, |
| "reward_std": 0.2613905519247055, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.5694580078125, |
| "epoch": 0.3838903170522708, |
| "grad_norm": 0.2923060655593872, |
| "kl": 0.0098876953125, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0202, |
| "reward": 0.5138888955116272, |
| "reward_std": 0.10206206515431404, |
| "rewards/accuracy_reward": 0.013888888992369175, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.1666793823242, |
| "epoch": 0.3856041131105398, |
| "grad_norm": 0.3962916433811188, |
| "kl": 0.01021575927734375, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": -0.0231, |
| "reward": 0.75, |
| "reward_std": 0.2901904508471489, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.5, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 801.6944580078125, |
| "epoch": 0.3873179091688089, |
| "grad_norm": 0.42240843176841736, |
| "kl": 0.00876617431640625, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": 0.0223, |
| "reward": 0.7708333507180214, |
| "reward_std": 0.5345706399530172, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 698.6527938842773, |
| "epoch": 0.389031705227078, |
| "grad_norm": 0.5248401165008545, |
| "kl": 0.0102081298828125, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": -0.0301, |
| "reward": 1.0763889104127884, |
| "reward_std": 0.39326707273721695, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.1111297607422, |
| "epoch": 0.390745501285347, |
| "grad_norm": 0.6135608553886414, |
| "kl": 0.01019287109375, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": 0.0276, |
| "reward": 1.0694444626569748, |
| "reward_std": 0.496343731880188, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.4027862548828, |
| "epoch": 0.3924592973436161, |
| "grad_norm": 0.2736571133136749, |
| "kl": 0.01031494140625, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.0257, |
| "reward": 0.8472222238779068, |
| "reward_std": 0.18812836706638336, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 773.6527862548828, |
| "epoch": 0.39417309340188517, |
| "grad_norm": 0.4546229839324951, |
| "kl": 0.0096588134765625, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0489, |
| "reward": 1.0277778059244156, |
| "reward_std": 0.3637526258826256, |
| "rewards/accuracy_reward": 0.2777777835726738, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.8611145019531, |
| "epoch": 0.39588688946015427, |
| "grad_norm": 0.40317994356155396, |
| "kl": 0.01104736328125, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": -0.0009, |
| "reward": 1.0555555522441864, |
| "reward_std": 0.2721655070781708, |
| "rewards/accuracy_reward": 0.27777778543531895, |
| "rewards/format_reward": 0.5, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.1805725097656, |
| "epoch": 0.3976006855184233, |
| "grad_norm": 0.4714021384716034, |
| "kl": 0.0143280029296875, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": 0.0116, |
| "reward": 1.0, |
| "reward_std": 0.30821534991264343, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/format_reward": 0.5, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.2777786254883, |
| "epoch": 0.39931448157669236, |
| "grad_norm": 0.6059587001800537, |
| "kl": 0.01348876953125, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": -0.0337, |
| "reward": 1.0138889104127884, |
| "reward_std": 0.5345955863595009, |
| "rewards/accuracy_reward": 0.26388889364898205, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 809.9444580078125, |
| "epoch": 0.40102827763496146, |
| "grad_norm": 0.45386579632759094, |
| "kl": 0.0084228515625, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": 0.0403, |
| "reward": 0.9305555671453476, |
| "reward_std": 0.4666288197040558, |
| "rewards/accuracy_reward": 0.22222222574055195, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.6666717529297, |
| "epoch": 0.4027420736932305, |
| "grad_norm": 0.1798969805240631, |
| "kl": 0.0090789794921875, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0142, |
| "reward": 0.8611111044883728, |
| "reward_std": 0.06804138422012329, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.5, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.5277862548828, |
| "epoch": 0.40445586975149955, |
| "grad_norm": 0.37443897128105164, |
| "kl": 0.0080413818359375, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": -0.0028, |
| "reward": 0.7986111268401146, |
| "reward_std": 0.24438021332025528, |
| "rewards/accuracy_reward": 0.15277778450399637, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.6111221313477, |
| "epoch": 0.40616966580976865, |
| "grad_norm": 0.3133380711078644, |
| "kl": 0.013458251953125, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.007, |
| "reward": 0.8888888657093048, |
| "reward_std": 0.13608276844024658, |
| "rewards/accuracy_reward": 0.19444444961845875, |
| "rewards/format_reward": 0.5, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.3194427490234, |
| "epoch": 0.4078834618680377, |
| "grad_norm": 0.3414314091205597, |
| "kl": 0.0097808837890625, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0132, |
| "reward": 0.7916666567325592, |
| "reward_std": 0.10206206887960434, |
| "rewards/accuracy_reward": 0.1527777835726738, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 698.0000152587891, |
| "epoch": 0.40959725792630675, |
| "grad_norm": 0.47577425837516785, |
| "kl": 0.0128326416015625, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0208, |
| "reward": 0.9375000149011612, |
| "reward_std": 0.4217335730791092, |
| "rewards/accuracy_reward": 0.22222223225980997, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.9305648803711, |
| "epoch": 0.41131105398457585, |
| "grad_norm": 0.5120421051979065, |
| "kl": 0.017822265625, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": -0.0194, |
| "reward": 0.9027777761220932, |
| "reward_std": 0.34775684028863907, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.9166946411133, |
| "epoch": 0.4130248500428449, |
| "grad_norm": 0.4153745472431183, |
| "kl": 0.00946044921875, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": -0.0283, |
| "reward": 1.1111111044883728, |
| "reward_std": 0.3314610719680786, |
| "rewards/accuracy_reward": 0.30555556435137987, |
| "rewards/format_reward": 0.5, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.4027862548828, |
| "epoch": 0.414738646101114, |
| "grad_norm": 0.2539161145687103, |
| "kl": 0.0107269287109375, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": -0.0036, |
| "reward": 0.7222222089767456, |
| "reward_std": 0.26864049583673477, |
| "rewards/accuracy_reward": 0.11111111380159855, |
| "rewards/format_reward": 0.5, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.6388778686523, |
| "epoch": 0.41645244215938304, |
| "grad_norm": 0.4045405089855194, |
| "kl": 0.01134490966796875, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": -0.0349, |
| "reward": 0.8263889029622078, |
| "reward_std": 0.3752421587705612, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.1250152587891, |
| "epoch": 0.4181662382176521, |
| "grad_norm": 0.33096015453338623, |
| "kl": 0.00829315185546875, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": -0.0064, |
| "reward": 0.5833333283662796, |
| "reward_std": 0.20412413775920868, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.8194580078125, |
| "epoch": 0.4198800342759212, |
| "grad_norm": 0.6509292721748352, |
| "kl": 0.0207977294921875, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": -0.0065, |
| "reward": 0.9166666567325592, |
| "reward_std": 0.4262731820344925, |
| "rewards/accuracy_reward": 0.2083333320915699, |
| "rewards/format_reward": 0.5, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 753.3333282470703, |
| "epoch": 0.42159383033419023, |
| "grad_norm": 0.23314639925956726, |
| "kl": 0.0078582763671875, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0163, |
| "reward": 1.2430555522441864, |
| "reward_std": 0.28890247642993927, |
| "rewards/accuracy_reward": 0.3750000149011612, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.4444580078125, |
| "epoch": 0.4233076263924593, |
| "grad_norm": 0.2738264203071594, |
| "kl": 0.01085662841796875, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": 0.004, |
| "reward": 0.6805555522441864, |
| "reward_std": 0.10206207446753979, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.1805801391602, |
| "epoch": 0.4250214224507284, |
| "grad_norm": 0.41325727105140686, |
| "kl": 0.0088348388671875, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": 0.0097, |
| "reward": 0.8888888955116272, |
| "reward_std": 0.3547067791223526, |
| "rewards/accuracy_reward": 0.1944444477558136, |
| "rewards/format_reward": 0.5, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.1944427490234, |
| "epoch": 0.4267352185089974, |
| "grad_norm": 0.39983558654785156, |
| "kl": 0.009674072265625, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": 0.0139, |
| "reward": 0.861111119389534, |
| "reward_std": 0.4262731969356537, |
| "rewards/accuracy_reward": 0.18055556248873472, |
| "rewards/format_reward": 0.5, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 676.3472290039062, |
| "epoch": 0.4284490145672665, |
| "grad_norm": 0.3825208246707916, |
| "kl": 0.00958251953125, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0015, |
| "reward": 0.6944444328546524, |
| "reward_std": 0.20412414520978928, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.5, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.7222290039062, |
| "epoch": 0.4301628106255356, |
| "grad_norm": 0.247074156999588, |
| "kl": 0.011627197265625, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": -0.0156, |
| "reward": 0.9444444328546524, |
| "reward_std": 0.2453947737812996, |
| "rewards/accuracy_reward": 0.2222222276031971, |
| "rewards/format_reward": 0.5, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.7083435058594, |
| "epoch": 0.4318766066838046, |
| "grad_norm": 0.4444674849510193, |
| "kl": 0.009063720703125, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": 0.009, |
| "reward": 0.8333333432674408, |
| "reward_std": 0.40472324192523956, |
| "rewards/accuracy_reward": 0.16666666883975267, |
| "rewards/format_reward": 0.5, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.0277862548828, |
| "epoch": 0.43359040274207367, |
| "grad_norm": 0.5097996592521667, |
| "kl": 0.00933837890625, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": -0.0255, |
| "reward": 0.7569444552063942, |
| "reward_std": 0.5057707708328962, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.7083358764648, |
| "epoch": 0.43530419880034277, |
| "grad_norm": 0.35494205355644226, |
| "kl": 0.0090484619140625, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": 0.0083, |
| "reward": 0.8541666567325592, |
| "reward_std": 0.34325060993433, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.2361145019531, |
| "epoch": 0.4370179948586118, |
| "grad_norm": 0.3796377182006836, |
| "kl": 0.0113677978515625, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.0005, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.29541125893592834, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.5, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.6666717529297, |
| "epoch": 0.4387317909168809, |
| "grad_norm": 0.400722861289978, |
| "kl": 0.00862884521484375, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": -0.0062, |
| "reward": 0.6249999925494194, |
| "reward_std": 0.24970055185258389, |
| "rewards/accuracy_reward": 0.06944444589316845, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 794.6527862548828, |
| "epoch": 0.44044558697514996, |
| "grad_norm": 0.4615187644958496, |
| "kl": 0.00820159912109375, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": -0.0081, |
| "reward": 0.6805555522441864, |
| "reward_std": 0.311707004904747, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 754.5277709960938, |
| "epoch": 0.442159383033419, |
| "grad_norm": 0.3552338480949402, |
| "kl": 0.0101165771484375, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": 0.0125, |
| "reward": 0.6875000074505806, |
| "reward_std": 0.2571843173354864, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.4166717529297, |
| "epoch": 0.4438731790916881, |
| "grad_norm": 0.6070718765258789, |
| "kl": 0.00984954833984375, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": 0.006, |
| "reward": 1.0555555522441864, |
| "reward_std": 0.5303644090890884, |
| "rewards/accuracy_reward": 0.27777778543531895, |
| "rewards/format_reward": 0.5, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.7361145019531, |
| "epoch": 0.44558697514995715, |
| "grad_norm": 0.5230724215507507, |
| "kl": 0.00922393798828125, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": -0.0235, |
| "reward": 0.7777777761220932, |
| "reward_std": 0.3582318127155304, |
| "rewards/accuracy_reward": 0.13888889085501432, |
| "rewards/format_reward": 0.5, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.1666870117188, |
| "epoch": 0.4473007712082262, |
| "grad_norm": 0.39952459931373596, |
| "kl": 0.00872802734375, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": 0.0159, |
| "reward": 0.8333333283662796, |
| "reward_std": 0.2221490666270256, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 800.1250152587891, |
| "epoch": 0.4490145672664953, |
| "grad_norm": 0.41582420468330383, |
| "kl": 0.008209228515625, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": 0.0729, |
| "reward": 1.2500000149011612, |
| "reward_std": 0.5078206732869148, |
| "rewards/accuracy_reward": 0.3888888992369175, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.9583282470703, |
| "epoch": 0.45072836332476435, |
| "grad_norm": 0.5363173484802246, |
| "kl": 0.00957489013671875, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": -0.022, |
| "reward": 0.6527777761220932, |
| "reward_std": 0.25616974383592606, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.2083511352539, |
| "epoch": 0.4524421593830334, |
| "grad_norm": 0.4334163963794708, |
| "kl": 0.00849151611328125, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.011, |
| "reward": 0.8333333432674408, |
| "reward_std": 0.3082153648138046, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.5, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.3750152587891, |
| "epoch": 0.4541559554413025, |
| "grad_norm": 0.38257157802581787, |
| "kl": 0.0098876953125, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0024, |
| "reward": 1.1944444328546524, |
| "reward_std": 0.38669832795858383, |
| "rewards/accuracy_reward": 0.34722223225980997, |
| "rewards/format_reward": 0.5, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 795.8889007568359, |
| "epoch": 0.45586975149957154, |
| "grad_norm": 0.36699798703193665, |
| "kl": 0.0082855224609375, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": 0.0402, |
| "reward": 0.8333333283662796, |
| "reward_std": 0.30821535736322403, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.5, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.8611221313477, |
| "epoch": 0.45758354755784064, |
| "grad_norm": 7.172515869140625, |
| "kl": 0.10137939453125, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": -0.0182, |
| "reward": 0.625, |
| "reward_std": 0.16182994842529297, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.3194580078125, |
| "epoch": 0.4592973436161097, |
| "grad_norm": 0.4328586459159851, |
| "kl": 0.01244354248046875, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": 0.0242, |
| "reward": 0.7430555671453476, |
| "reward_std": 0.25718431919813156, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.1805725097656, |
| "epoch": 0.46101113967437873, |
| "grad_norm": 0.15796984732151031, |
| "kl": 0.00635528564453125, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": -0.0015, |
| "reward": 0.7777777761220932, |
| "reward_std": 0.08606629818677902, |
| "rewards/accuracy_reward": 0.1388888917863369, |
| "rewards/format_reward": 0.5, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 763.4861145019531, |
| "epoch": 0.46272493573264784, |
| "grad_norm": 0.42503196001052856, |
| "kl": 0.010040283203125, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0154, |
| "reward": 1.0138888955116272, |
| "reward_std": 0.3125211223959923, |
| "rewards/accuracy_reward": 0.26388889644294977, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.2639007568359, |
| "epoch": 0.4644387317909169, |
| "grad_norm": 0.442844957113266, |
| "kl": 0.0098724365234375, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": -0.0138, |
| "reward": 0.8541666567325592, |
| "reward_std": 0.3072007820010185, |
| "rewards/accuracy_reward": 0.18055556155741215, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.1666793823242, |
| "epoch": 0.4661525278491859, |
| "grad_norm": 0.6295328140258789, |
| "kl": 0.0174713134765625, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": 0.0023, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.46228964626789093, |
| "rewards/accuracy_reward": 0.19444444961845875, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.3611145019531, |
| "epoch": 0.46786632390745503, |
| "grad_norm": 0.34427711367607117, |
| "kl": 0.0111846923828125, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": 0.0205, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.25718431919813156, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.625, |
| "epoch": 0.4695801199657241, |
| "grad_norm": 0.3454584777355194, |
| "kl": 0.00966644287109375, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": -0.0354, |
| "reward": 0.5972222238779068, |
| "reward_std": 0.28170324862003326, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.6111145019531, |
| "epoch": 0.4712939160239931, |
| "grad_norm": 0.5319638252258301, |
| "kl": 0.0121307373046875, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0055, |
| "reward": 0.8055555522441864, |
| "reward_std": 0.3995024487376213, |
| "rewards/accuracy_reward": 0.1527777835726738, |
| "rewards/format_reward": 0.5, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.1527862548828, |
| "epoch": 0.4730077120822622, |
| "grad_norm": 0.4211221933364868, |
| "kl": 0.0104217529296875, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0072, |
| "reward": 0.8055555671453476, |
| "reward_std": 0.3995024487376213, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.5, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.9305725097656, |
| "epoch": 0.47472150814053127, |
| "grad_norm": 0.4488551914691925, |
| "kl": 0.011688232421875, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": -0.0014, |
| "reward": 1.1041666716337204, |
| "reward_std": 0.34847141802310944, |
| "rewards/accuracy_reward": 0.3055555634200573, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.75, |
| "epoch": 0.47643530419880037, |
| "grad_norm": 0.3861945867538452, |
| "kl": 0.011749267578125, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": 0.02, |
| "reward": 0.8749999925494194, |
| "reward_std": 0.34098767302930355, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 790.6389007568359, |
| "epoch": 0.4781491002570694, |
| "grad_norm": 0.6166090369224548, |
| "kl": 0.0152740478515625, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": 0.0135, |
| "reward": 0.7152777910232544, |
| "reward_std": 0.3984878733754158, |
| "rewards/accuracy_reward": 0.11111111473292112, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.5277862548828, |
| "epoch": 0.47986289631533846, |
| "grad_norm": 0.3968028426170349, |
| "kl": 0.0106964111328125, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0027, |
| "reward": 0.8611111044883728, |
| "reward_std": 0.24017397314310074, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.5, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.4861145019531, |
| "epoch": 0.48157669237360756, |
| "grad_norm": 0.4651610851287842, |
| "kl": 0.00959014892578125, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": 0.0424, |
| "reward": 1.1250000149011612, |
| "reward_std": 0.40491778403520584, |
| "rewards/accuracy_reward": 0.31944445613771677, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 719.4027862548828, |
| "epoch": 0.4832904884318766, |
| "grad_norm": 0.4324786365032196, |
| "kl": 0.0084686279296875, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": 0.0187, |
| "reward": 0.7986111268401146, |
| "reward_std": 0.24438020400702953, |
| "rewards/accuracy_reward": 0.1527777798473835, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.3889007568359, |
| "epoch": 0.48500428449014565, |
| "grad_norm": 0.317564994096756, |
| "kl": 0.009735107421875, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0143, |
| "reward": 0.729166679084301, |
| "reward_std": 0.25436214357614517, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 799.6666870117188, |
| "epoch": 0.48671808054841476, |
| "grad_norm": 0.3215982913970947, |
| "kl": 0.0088348388671875, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": -0.0185, |
| "reward": 0.9652777910232544, |
| "reward_std": 0.3315606266260147, |
| "rewards/accuracy_reward": 0.23611111845821142, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 815.6944580078125, |
| "epoch": 0.4884318766066838, |
| "grad_norm": 0.28981852531433105, |
| "kl": 0.00812530517578125, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": -0.002, |
| "reward": 0.805555559694767, |
| "reward_std": 0.03983211889863014, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.4305725097656, |
| "epoch": 0.49014567266495285, |
| "grad_norm": 0.437377393245697, |
| "kl": 0.00920867919921875, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": 0.0761, |
| "reward": 0.7638889029622078, |
| "reward_std": 0.2561697345227003, |
| "rewards/accuracy_reward": 0.13888889085501432, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 681.2222290039062, |
| "epoch": 0.49185946872322195, |
| "grad_norm": 0.44190195202827454, |
| "kl": 0.0103912353515625, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": -0.0106, |
| "reward": 0.972222238779068, |
| "reward_std": 0.46232303231954575, |
| "rewards/accuracy_reward": 0.236111119389534, |
| "rewards/format_reward": 0.5, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.5277709960938, |
| "epoch": 0.493573264781491, |
| "grad_norm": 0.496455579996109, |
| "kl": 0.01013946533203125, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": -0.0123, |
| "reward": 0.6250000149011612, |
| "reward_std": 0.2721321564167738, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 728.0972290039062, |
| "epoch": 0.4952870608397601, |
| "grad_norm": 0.6901673078536987, |
| "kl": 0.01247406005859375, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": -0.0307, |
| "reward": 0.8958333283662796, |
| "reward_std": 0.4455699250102043, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 736.1666717529297, |
| "epoch": 0.49700085689802914, |
| "grad_norm": 0.4506691098213196, |
| "kl": 0.00757598876953125, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0253, |
| "reward": 0.8958333283662796, |
| "reward_std": 0.48350031673908234, |
| "rewards/accuracy_reward": 0.20833333488553762, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 784.4305572509766, |
| "epoch": 0.4987146529562982, |
| "grad_norm": 0.39211300015449524, |
| "kl": 0.01044464111328125, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.0139, |
| "reward": 0.9305555671453476, |
| "reward_std": 0.38056251406669617, |
| "rewards/accuracy_reward": 0.22222223225980997, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.5416793823242, |
| "epoch": 0.5004284490145673, |
| "grad_norm": 0.6736369729042053, |
| "kl": 0.011474609375, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": -0.0179, |
| "reward": 1.0555555820465088, |
| "reward_std": 0.553610123693943, |
| "rewards/accuracy_reward": 0.2777777835726738, |
| "rewards/format_reward": 0.5, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.7500076293945, |
| "epoch": 0.5021422450728363, |
| "grad_norm": 0.35509321093559265, |
| "kl": 0.0101470947265625, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": 0.0294, |
| "reward": 0.8333333432674408, |
| "reward_std": 0.331461064517498, |
| "rewards/accuracy_reward": 0.16666666883975267, |
| "rewards/format_reward": 0.5, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.8194580078125, |
| "epoch": 0.5038560411311054, |
| "grad_norm": 0.35510072112083435, |
| "kl": 0.0114593505859375, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": -0.0076, |
| "reward": 0.9166666716337204, |
| "reward_std": 0.2634196802973747, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/format_reward": 0.5, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 702.9583435058594, |
| "epoch": 0.5055698371893744, |
| "grad_norm": 0.37312352657318115, |
| "kl": 0.01067352294921875, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": -0.0084, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.46386218070983887, |
| "rewards/accuracy_reward": 0.15277778171002865, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 737.9583511352539, |
| "epoch": 0.5072836332476436, |
| "grad_norm": 0.5573227405548096, |
| "kl": 0.0097503662109375, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": -0.0192, |
| "reward": 1.034722238779068, |
| "reward_std": 0.7432259321212769, |
| "rewards/accuracy_reward": 0.27777778171002865, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.2639007568359, |
| "epoch": 0.5089974293059126, |
| "grad_norm": 0.5244731307029724, |
| "kl": 0.00927734375, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": -0.0399, |
| "reward": 0.9305555671453476, |
| "reward_std": 0.5026786401867867, |
| "rewards/accuracy_reward": 0.2222222276031971, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.9722290039062, |
| "epoch": 0.5107112253641817, |
| "grad_norm": 0.5318572521209717, |
| "kl": 0.010101318359375, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": 0.0125, |
| "reward": 0.8402777761220932, |
| "reward_std": 0.2809867858886719, |
| "rewards/accuracy_reward": 0.18055555876344442, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.1944580078125, |
| "epoch": 0.5124250214224507, |
| "grad_norm": 0.4404042065143585, |
| "kl": 0.0084686279296875, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.0031, |
| "reward": 1.1250000149011612, |
| "reward_std": 0.4589441120624542, |
| "rewards/accuracy_reward": 0.31944445334374905, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 715.8611145019531, |
| "epoch": 0.5141388174807198, |
| "grad_norm": 0.36858052015304565, |
| "kl": 0.010406494140625, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": -0.0088, |
| "reward": 0.7499999850988388, |
| "reward_std": 0.33668187260627747, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.5, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.6111297607422, |
| "epoch": 0.5158526135389888, |
| "grad_norm": 0.44451257586479187, |
| "kl": 0.00970458984375, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": -0.0124, |
| "reward": 0.8055555522441864, |
| "reward_std": 0.2901904284954071, |
| "rewards/accuracy_reward": 0.1527777798473835, |
| "rewards/format_reward": 0.5, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 743.5555725097656, |
| "epoch": 0.517566409597258, |
| "grad_norm": 0.394045889377594, |
| "kl": 0.00940704345703125, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": 0.0071, |
| "reward": 0.861111119389534, |
| "reward_std": 0.3762567415833473, |
| "rewards/accuracy_reward": 0.18055555783212185, |
| "rewards/format_reward": 0.5, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.5833435058594, |
| "epoch": 0.519280205655527, |
| "grad_norm": 0.41484999656677246, |
| "kl": 0.01177978515625, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": 0.0097, |
| "reward": 0.6388888955116272, |
| "reward_std": 0.22736987471580505, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.5, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.4027862548828, |
| "epoch": 0.5209940017137961, |
| "grad_norm": 0.6104622483253479, |
| "kl": 0.011749267578125, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": 0.0155, |
| "reward": 0.826388880610466, |
| "reward_std": 0.4845541790127754, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.0277862548828, |
| "epoch": 0.5227077977720651, |
| "grad_norm": 0.412063330411911, |
| "kl": 0.010986328125, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0303, |
| "reward": 0.9513889029622078, |
| "reward_std": 0.3880129065364599, |
| "rewards/accuracy_reward": 0.2361111119389534, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.0416717529297, |
| "epoch": 0.5244215938303342, |
| "grad_norm": 0.5593081116676331, |
| "kl": 0.0098724365234375, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.0378, |
| "reward": 0.7986111044883728, |
| "reward_std": 0.39326707273721695, |
| "rewards/accuracy_reward": 0.1527777835726738, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.9722290039062, |
| "epoch": 0.5261353898886033, |
| "grad_norm": 0.48807039856910706, |
| "kl": 0.0091400146484375, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": 0.0218, |
| "reward": 1.1875000149011612, |
| "reward_std": 0.6258577555418015, |
| "rewards/accuracy_reward": 0.34722222946584225, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 835.2916717529297, |
| "epoch": 0.5278491859468724, |
| "grad_norm": 0.4123370349407196, |
| "kl": 0.010833740234375, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": -0.0059, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.205099418759346, |
| "rewards/accuracy_reward": 0.15277778077870607, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.0555572509766, |
| "epoch": 0.5295629820051414, |
| "grad_norm": 0.466584712266922, |
| "kl": 0.00885009765625, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": 0.0187, |
| "reward": 0.8750000074505806, |
| "reward_std": 0.3422360420227051, |
| "rewards/accuracy_reward": 0.19444444496184587, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.2639007568359, |
| "epoch": 0.5312767780634104, |
| "grad_norm": 0.33340954780578613, |
| "kl": 0.00978851318359375, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0057, |
| "reward": 0.7708333358168602, |
| "reward_std": 0.17633881978690624, |
| "rewards/accuracy_reward": 0.1388888917863369, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 770.3194427490234, |
| "epoch": 0.5329905741216795, |
| "grad_norm": 0.34279051423072815, |
| "kl": 0.0094146728515625, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": -0.0054, |
| "reward": 0.6875000074505806, |
| "reward_std": 0.25718431919813156, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 807.4861297607422, |
| "epoch": 0.5347043701799485, |
| "grad_norm": 0.42101454734802246, |
| "kl": 0.007659912109375, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": 0.0135, |
| "reward": 0.9930555671453476, |
| "reward_std": 0.49956031143665314, |
| "rewards/accuracy_reward": 0.25000000838190317, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.5277709960938, |
| "epoch": 0.5364181662382177, |
| "grad_norm": 0.5372198820114136, |
| "kl": 0.0162506103515625, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": -0.0168, |
| "reward": 1.1875000149011612, |
| "reward_std": 0.4131338596343994, |
| "rewards/accuracy_reward": 0.3472222276031971, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.8750152587891, |
| "epoch": 0.5381319622964867, |
| "grad_norm": 0.4827311038970947, |
| "kl": 0.00830078125, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0221, |
| "reward": 1.0000000149011612, |
| "reward_std": 0.5675767734646797, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.5, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.4444427490234, |
| "epoch": 0.5398457583547558, |
| "grad_norm": 0.32586589455604553, |
| "kl": 0.0107269287109375, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": -0.011, |
| "reward": 0.861111119389534, |
| "reward_std": 0.22736985981464386, |
| "rewards/accuracy_reward": 0.1805555634200573, |
| "rewards/format_reward": 0.5, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.6944427490234, |
| "epoch": 0.5415595544130248, |
| "grad_norm": 0.36847296357154846, |
| "kl": 0.0100860595703125, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0058, |
| "reward": 0.8819444477558136, |
| "reward_std": 0.3717171251773834, |
| "rewards/accuracy_reward": 0.1944444477558136, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 860.0138854980469, |
| "epoch": 0.5432733504712939, |
| "grad_norm": 0.33621668815612793, |
| "kl": 0.0082855224609375, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": 0.0228, |
| "reward": 0.8958333656191826, |
| "reward_std": 0.3794733416289091, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.1944580078125, |
| "epoch": 0.5449871465295629, |
| "grad_norm": 0.5173898935317993, |
| "kl": 0.01264190673828125, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": -0.0109, |
| "reward": 1.083333358168602, |
| "reward_std": 0.4227481558918953, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/format_reward": 0.5, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 781.0694427490234, |
| "epoch": 0.5467009425878321, |
| "grad_norm": 0.4726315438747406, |
| "kl": 0.01230621337890625, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0288, |
| "reward": 0.6666666939854622, |
| "reward_std": 0.27821177802979946, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.472222238779068, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.5139007568359, |
| "epoch": 0.5484147386461011, |
| "grad_norm": 0.4803582727909088, |
| "kl": 0.0104217529296875, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0114, |
| "reward": 1.1666666865348816, |
| "reward_std": 0.5012311488389969, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/format_reward": 0.5, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.2222137451172, |
| "epoch": 0.5501285347043702, |
| "grad_norm": 0.4137377440929413, |
| "kl": 0.00899505615234375, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": -0.0062, |
| "reward": 0.680555559694767, |
| "reward_std": 0.27419466339051723, |
| "rewards/accuracy_reward": 0.09722222294658422, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.7500152587891, |
| "epoch": 0.5518423307626392, |
| "grad_norm": 0.7754374146461487, |
| "kl": 0.01078033447265625, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": 0.0696, |
| "reward": 0.7083333283662796, |
| "reward_std": 0.4283023327589035, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 747.7777709960938, |
| "epoch": 0.5535561268209083, |
| "grad_norm": 0.23274828493595123, |
| "kl": 0.01306915283203125, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": 0.0218, |
| "reward": 0.8541666641831398, |
| "reward_std": 0.08505172841250896, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 794.263916015625, |
| "epoch": 0.5552699228791774, |
| "grad_norm": 0.5167786478996277, |
| "kl": 0.0105438232421875, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0041, |
| "reward": 1.0347222089767456, |
| "reward_std": 0.5008072182536125, |
| "rewards/accuracy_reward": 0.2777777835726738, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.1250305175781, |
| "epoch": 0.5569837189374465, |
| "grad_norm": 0.42761853337287903, |
| "kl": 0.01197052001953125, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0332, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.4112919941544533, |
| "rewards/accuracy_reward": 0.11111111380159855, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.4583358764648, |
| "epoch": 0.5586975149957155, |
| "grad_norm": 0.6914120316505432, |
| "kl": 0.01470947265625, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": 0.0291, |
| "reward": 1.0763888955116272, |
| "reward_std": 0.43159355968236923, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.4722290039062, |
| "epoch": 0.5604113110539846, |
| "grad_norm": 0.608383059501648, |
| "kl": 0.0129547119140625, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0089, |
| "reward": 1.0486111044883728, |
| "reward_std": 0.46130846440792084, |
| "rewards/accuracy_reward": 0.2777777807787061, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.3055725097656, |
| "epoch": 0.5621251071122536, |
| "grad_norm": 0.9387192130088806, |
| "kl": 0.0234832763671875, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": 0.0096, |
| "reward": 0.8263888955116272, |
| "reward_std": 0.32522570341825485, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.8333587646484, |
| "epoch": 0.5638389031705227, |
| "grad_norm": 0.36304396390914917, |
| "kl": 0.00838470458984375, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": -0.0051, |
| "reward": 0.75, |
| "reward_std": 0.33668185770511627, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.5, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 838.6250152587891, |
| "epoch": 0.5655526992287918, |
| "grad_norm": 0.4335583746433258, |
| "kl": 0.0093231201171875, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0504, |
| "reward": 0.8958333358168602, |
| "reward_std": 0.2658967934548855, |
| "rewards/accuracy_reward": 0.2083333320915699, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.0694427490234, |
| "epoch": 0.5672664952870609, |
| "grad_norm": 0.44489485025405884, |
| "kl": 0.0103607177734375, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": -0.0029, |
| "reward": 1.1944444328546524, |
| "reward_std": 0.3762567266821861, |
| "rewards/accuracy_reward": 0.3472222276031971, |
| "rewards/format_reward": 0.5, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.3055725097656, |
| "epoch": 0.5689802913453299, |
| "grad_norm": 0.22135576605796814, |
| "kl": 0.01212310791015625, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.0198, |
| "reward": 0.8541666641831398, |
| "reward_std": 0.19436372630298138, |
| "rewards/accuracy_reward": 0.180555559694767, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.5416717529297, |
| "epoch": 0.570694087403599, |
| "grad_norm": 0.3432075083255768, |
| "kl": 0.0111541748046875, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": -0.0046, |
| "reward": 0.6597222164273262, |
| "reward_std": 0.017010344192385674, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 798.2083435058594, |
| "epoch": 0.572407883461868, |
| "grad_norm": 0.3384978473186493, |
| "kl": 0.00952911376953125, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": 0.0133, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.4249234274029732, |
| "rewards/accuracy_reward": 0.19444444589316845, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.7222290039062, |
| "epoch": 0.5741216795201372, |
| "grad_norm": 0.41000911593437195, |
| "kl": 0.0142669677734375, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0868, |
| "reward": 0.9583333283662796, |
| "reward_std": 0.3291585296392441, |
| "rewards/accuracy_reward": 0.23611110914498568, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.9583358764648, |
| "epoch": 0.5758354755784062, |
| "grad_norm": 0.39112672209739685, |
| "kl": 0.009521484375, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": -0.0072, |
| "reward": 0.7152777835726738, |
| "reward_std": 0.26240511797368526, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.9722366333008, |
| "epoch": 0.5775492716366752, |
| "grad_norm": 0.48448535799980164, |
| "kl": 0.00994873046875, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": -0.0094, |
| "reward": 0.9652777910232544, |
| "reward_std": 0.332209013402462, |
| "rewards/accuracy_reward": 0.25000000838190317, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.3333282470703, |
| "epoch": 0.5792630676949443, |
| "grad_norm": 0.5280615091323853, |
| "kl": 0.012420654296875, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": -0.0209, |
| "reward": 0.965277798473835, |
| "reward_std": 0.5258247926831245, |
| "rewards/accuracy_reward": 0.23611112125217915, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 779.4166717529297, |
| "epoch": 0.5809768637532133, |
| "grad_norm": 0.32812824845314026, |
| "kl": 0.0185089111328125, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": 0.0169, |
| "reward": 0.6666666716337204, |
| "reward_std": 0.25819889456033707, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.5, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.9444351196289, |
| "epoch": 0.5826906598114824, |
| "grad_norm": 0.5442182421684265, |
| "kl": 0.014892578125, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": -0.0563, |
| "reward": 0.8055555522441864, |
| "reward_std": 0.44179464131593704, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.2639007568359, |
| "epoch": 0.5844044558697515, |
| "grad_norm": 0.08322001248598099, |
| "kl": 0.01381683349609375, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": 0.0005, |
| "reward": 0.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.5, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.7639007568359, |
| "epoch": 0.5861182519280206, |
| "grad_norm": 0.42785269021987915, |
| "kl": 0.00940704345703125, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": -0.0295, |
| "reward": 0.861111119389534, |
| "reward_std": 0.47276464104652405, |
| "rewards/accuracy_reward": 0.18055556062608957, |
| "rewards/format_reward": 0.5, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.5138854980469, |
| "epoch": 0.5878320479862896, |
| "grad_norm": 0.28376224637031555, |
| "kl": 0.0150909423828125, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": 0.0035, |
| "reward": 1.2430555373430252, |
| "reward_std": 0.24696609377861023, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.7083282470703, |
| "epoch": 0.5895458440445587, |
| "grad_norm": 0.47153374552726746, |
| "kl": 0.0116729736328125, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": 0.0016, |
| "reward": 0.6527777686715126, |
| "reward_std": 0.33054604940116405, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.5972290039062, |
| "epoch": 0.5912596401028277, |
| "grad_norm": 0.45074042677879333, |
| "kl": 0.0081787109375, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": -0.0155, |
| "reward": 1.3472222238779068, |
| "reward_std": 0.6584814712405205, |
| "rewards/accuracy_reward": 0.4305555634200573, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 765.8611297607422, |
| "epoch": 0.5929734361610969, |
| "grad_norm": 0.38273346424102783, |
| "kl": 0.00989532470703125, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.0365, |
| "reward": 0.9166666716337204, |
| "reward_std": 0.3995024487376213, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/format_reward": 0.5, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.5416717529297, |
| "epoch": 0.5946872322193659, |
| "grad_norm": 0.42742764949798584, |
| "kl": 0.01055908203125, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": 0.0121, |
| "reward": 0.9930555671453476, |
| "reward_std": 0.34847141802310944, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 760.1111297607422, |
| "epoch": 0.596401028277635, |
| "grad_norm": 0.2555226981639862, |
| "kl": 0.0147552490234375, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0076, |
| "reward": 0.6111111044883728, |
| "reward_std": 0.15932847559452057, |
| "rewards/accuracy_reward": 0.055555556900799274, |
| "rewards/format_reward": 0.5, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 742.7777862548828, |
| "epoch": 0.598114824335904, |
| "grad_norm": 0.3386673033237457, |
| "kl": 0.01026153564453125, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": 0.01, |
| "reward": 0.9930555820465088, |
| "reward_std": 0.32522569596767426, |
| "rewards/accuracy_reward": 0.2500000027939677, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.3333435058594, |
| "epoch": 0.5998286203941731, |
| "grad_norm": 0.4300551116466522, |
| "kl": 0.01024627685546875, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": -0.0217, |
| "reward": 1.048611119389534, |
| "reward_std": 0.4845541790127754, |
| "rewards/accuracy_reward": 0.27777778171002865, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 710.8611145019531, |
| "epoch": 0.6015424164524421, |
| "grad_norm": 0.4361790716648102, |
| "kl": 0.01013946533203125, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": -0.0161, |
| "reward": 0.7430555671453476, |
| "reward_std": 0.3072007894515991, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 867.2222137451172, |
| "epoch": 0.6032562125107113, |
| "grad_norm": 0.41655802726745605, |
| "kl": 0.0092926025390625, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": 0.0058, |
| "reward": 0.7777777910232544, |
| "reward_std": 0.4547397345304489, |
| "rewards/accuracy_reward": 0.13888888992369175, |
| "rewards/format_reward": 0.5, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 808.7639007568359, |
| "epoch": 0.6049700085689803, |
| "grad_norm": 0.32335948944091797, |
| "kl": 0.009429931640625, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": 0.0072, |
| "reward": 1.0277777761220932, |
| "reward_std": 0.33668188750743866, |
| "rewards/accuracy_reward": 0.26388889364898205, |
| "rewards/format_reward": 0.5, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.3610992431641, |
| "epoch": 0.6066838046272494, |
| "grad_norm": 0.747093915939331, |
| "kl": 0.011749267578125, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": 0.0159, |
| "reward": 1.4097222536802292, |
| "reward_std": 0.6619075667113066, |
| "rewards/accuracy_reward": 0.4583333395421505, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.5972290039062, |
| "epoch": 0.6083976006855184, |
| "grad_norm": 0.44648948311805725, |
| "kl": 0.01004791259765625, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0044, |
| "reward": 0.9027778059244156, |
| "reward_std": 0.24970055185258389, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.2638931274414, |
| "epoch": 0.6101113967437874, |
| "grad_norm": 0.484390527009964, |
| "kl": 0.012542724609375, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": -0.0051, |
| "reward": 0.6249999925494194, |
| "reward_std": 0.3125211279839277, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.0138931274414, |
| "epoch": 0.6118251928020566, |
| "grad_norm": 0.5177603363990784, |
| "kl": 0.0098114013671875, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": 0.0262, |
| "reward": 1.0833333283662796, |
| "reward_std": 0.4495188891887665, |
| "rewards/accuracy_reward": 0.2916666669771075, |
| "rewards/format_reward": 0.5, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.9444580078125, |
| "epoch": 0.6135389888603257, |
| "grad_norm": 0.5876026153564453, |
| "kl": 0.0117034912109375, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": 0.0255, |
| "reward": 0.722222238779068, |
| "reward_std": 0.45503970980644226, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.4722222313284874, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.0972290039062, |
| "epoch": 0.6152527849185947, |
| "grad_norm": 0.44811731576919556, |
| "kl": 0.0104827880859375, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": 0.0011, |
| "reward": 0.9027777761220932, |
| "reward_std": 0.2613905444741249, |
| "rewards/accuracy_reward": 0.20833333302289248, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.4583435058594, |
| "epoch": 0.6169665809768637, |
| "grad_norm": 0.5113480687141418, |
| "kl": 0.0124053955078125, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": -0.0284, |
| "reward": 0.826388880610466, |
| "reward_std": 0.44850434362888336, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.2639007568359, |
| "epoch": 0.6186803770351328, |
| "grad_norm": 0.6223089098930359, |
| "kl": 0.0121612548828125, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": 0.0238, |
| "reward": 1.0277777910232544, |
| "reward_std": 0.6448972225189209, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.5, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.7083282470703, |
| "epoch": 0.6203941730934018, |
| "grad_norm": 0.5173635482788086, |
| "kl": 0.00988006591796875, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0152, |
| "reward": 0.7708333283662796, |
| "reward_std": 0.4405169114470482, |
| "rewards/accuracy_reward": 0.13888889364898205, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 723.5555725097656, |
| "epoch": 0.622107969151671, |
| "grad_norm": 0.49564608931541443, |
| "kl": 0.0084381103515625, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": -0.0065, |
| "reward": 1.0138888955116272, |
| "reward_std": 0.3242111261934042, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.4722290039062, |
| "epoch": 0.62382176520994, |
| "grad_norm": 0.39930129051208496, |
| "kl": 0.01010894775390625, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": -0.0025, |
| "reward": 0.9444444328546524, |
| "reward_std": 0.38147754967212677, |
| "rewards/accuracy_reward": 0.22222222946584225, |
| "rewards/format_reward": 0.5, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 837.6944732666016, |
| "epoch": 0.6255355612682091, |
| "grad_norm": 0.3432636260986328, |
| "kl": 0.00820159912109375, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.1109, |
| "reward": 0.7986111268401146, |
| "reward_std": 0.36156320944428444, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 699.8750152587891, |
| "epoch": 0.6272493573264781, |
| "grad_norm": 0.3920920789241791, |
| "kl": 0.00817108154296875, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": -0.0084, |
| "reward": 0.8263888955116272, |
| "reward_std": 0.153093121945858, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 733.0416717529297, |
| "epoch": 0.6289631533847472, |
| "grad_norm": 0.6963837146759033, |
| "kl": 0.01009368896484375, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": 0.1349, |
| "reward": 1.2152777910232544, |
| "reward_std": 0.6726825386285782, |
| "rewards/accuracy_reward": 0.3611111156642437, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 789.3610992431641, |
| "epoch": 0.6306769494430163, |
| "grad_norm": 0.48074987530708313, |
| "kl": 0.00942230224609375, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": 0.1273, |
| "reward": 0.7777777835726738, |
| "reward_std": 0.36897341534495354, |
| "rewards/accuracy_reward": 0.15277778450399637, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 684.4861297607422, |
| "epoch": 0.6323907455012854, |
| "grad_norm": 0.5865346789360046, |
| "kl": 0.0122222900390625, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0003, |
| "reward": 1.0277777910232544, |
| "reward_std": 0.3762567415833473, |
| "rewards/accuracy_reward": 0.26388888992369175, |
| "rewards/format_reward": 0.5, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.6528015136719, |
| "epoch": 0.6341045415595544, |
| "grad_norm": 0.5562211871147156, |
| "kl": 0.0120849609375, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0188, |
| "reward": 1.1388888955116272, |
| "reward_std": 0.3075363263487816, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.1111297607422, |
| "epoch": 0.6358183376178235, |
| "grad_norm": 0.35350364446640015, |
| "kl": 0.0099639892578125, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": -0.007, |
| "reward": 1.0902777761220932, |
| "reward_std": 0.3765922859311104, |
| "rewards/accuracy_reward": 0.3055555550381541, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.4861297607422, |
| "epoch": 0.6375321336760925, |
| "grad_norm": 0.3661295771598816, |
| "kl": 0.00765228271484375, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": 0.0188, |
| "reward": 0.826388880610466, |
| "reward_std": 0.4112919941544533, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 756.375, |
| "epoch": 0.6392459297343616, |
| "grad_norm": 0.3112243115901947, |
| "kl": 0.01010894775390625, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": -0.0013, |
| "reward": 0.8611111044883728, |
| "reward_std": 0.24017397314310074, |
| "rewards/accuracy_reward": 0.18055555876344442, |
| "rewards/format_reward": 0.5, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 813.3194427490234, |
| "epoch": 0.6409597257926307, |
| "grad_norm": 0.3174319863319397, |
| "kl": 0.007965087890625, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": -0.0123, |
| "reward": 1.0833333432674408, |
| "reward_std": 0.2901904284954071, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/format_reward": 0.5, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.0972290039062, |
| "epoch": 0.6426735218508998, |
| "grad_norm": 0.46651625633239746, |
| "kl": 0.0082550048828125, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0328, |
| "reward": 1.0486111044883728, |
| "reward_std": 0.6196977943181992, |
| "rewards/accuracy_reward": 0.27777778171002865, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.0972290039062, |
| "epoch": 0.6443873179091688, |
| "grad_norm": 0.5062349438667297, |
| "kl": 0.0101165771484375, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": -0.0277, |
| "reward": 0.9861111342906952, |
| "reward_std": 0.3804878890514374, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.2361145019531, |
| "epoch": 0.6461011139674379, |
| "grad_norm": 0.4397139549255371, |
| "kl": 0.008392333984375, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": -0.0174, |
| "reward": 1.0277777910232544, |
| "reward_std": 0.47276463359594345, |
| "rewards/accuracy_reward": 0.26388890016824007, |
| "rewards/format_reward": 0.5, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 680.9861145019531, |
| "epoch": 0.6478149100257069, |
| "grad_norm": 0.20873984694480896, |
| "kl": 0.0089569091796875, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": -0.0031, |
| "reward": 0.5555555522441864, |
| "reward_std": 0.13608276098966599, |
| "rewards/accuracy_reward": 0.02777777798473835, |
| "rewards/format_reward": 0.5, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.6666717529297, |
| "epoch": 0.6495287060839761, |
| "grad_norm": 0.4596330225467682, |
| "kl": 0.0101165771484375, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": -0.009, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.31242159754037857, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.8194427490234, |
| "epoch": 0.6512425021422451, |
| "grad_norm": 0.3971538543701172, |
| "kl": 0.011444091796875, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0003, |
| "reward": 0.972222238779068, |
| "reward_std": 0.4123065695166588, |
| "rewards/accuracy_reward": 0.23611111380159855, |
| "rewards/format_reward": 0.5, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 829.1250152587891, |
| "epoch": 0.6529562982005142, |
| "grad_norm": 0.2756218910217285, |
| "kl": 0.0085296630859375, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": 0.0465, |
| "reward": 0.5624999925494194, |
| "reward_std": 0.23096106760203838, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.0833435058594, |
| "epoch": 0.6546700942587832, |
| "grad_norm": 0.42576533555984497, |
| "kl": 0.00997161865234375, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": -0.0305, |
| "reward": 1.0000000149011612, |
| "reward_std": 0.39428164809942245, |
| "rewards/accuracy_reward": 0.25000000838190317, |
| "rewards/format_reward": 0.5, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 768.0277862548828, |
| "epoch": 0.6563838903170522, |
| "grad_norm": 0.4228789508342743, |
| "kl": 0.010772705078125, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": 0.0427, |
| "reward": 0.7361110970377922, |
| "reward_std": 0.36172348074615, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 730.5555572509766, |
| "epoch": 0.6580976863753213, |
| "grad_norm": 0.25273385643959045, |
| "kl": 0.0084991455078125, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": -0.0127, |
| "reward": 0.6388888955116272, |
| "reward_std": 0.15410767495632172, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.5, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 778.0833587646484, |
| "epoch": 0.6598114824335904, |
| "grad_norm": 0.39399486780166626, |
| "kl": 0.01232147216796875, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0025, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.26772547513246536, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.8750152587891, |
| "epoch": 0.6615252784918595, |
| "grad_norm": 0.35121214389801025, |
| "kl": 0.01195526123046875, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": -0.0003, |
| "reward": 0.6111111044883728, |
| "reward_std": 0.222149059176445, |
| "rewards/accuracy_reward": 0.0555555559694767, |
| "rewards/format_reward": 0.5, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.2222366333008, |
| "epoch": 0.6632390745501285, |
| "grad_norm": 0.3854869604110718, |
| "kl": 0.01076507568359375, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": -0.016, |
| "reward": 0.8611110895872116, |
| "reward_std": 0.20412414520978928, |
| "rewards/accuracy_reward": 0.18055556062608957, |
| "rewards/format_reward": 0.5, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 707.0277786254883, |
| "epoch": 0.6649528706083976, |
| "grad_norm": 0.6442806720733643, |
| "kl": 0.017303466796875, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": -0.0025, |
| "reward": 0.8611111044883728, |
| "reward_std": 0.5538673847913742, |
| "rewards/accuracy_reward": 0.19444445054978132, |
| "rewards/format_reward": 0.4722222238779068, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.8611145019531, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.5503394603729248, |
| "kl": 0.0092926025390625, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0202, |
| "reward": 0.9861111119389534, |
| "reward_std": 0.5015645399689674, |
| "rewards/accuracy_reward": 0.25000000186264515, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 731.2638854980469, |
| "epoch": 0.6683804627249358, |
| "grad_norm": 0.44946956634521484, |
| "kl": 0.00981903076171875, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.1489, |
| "reward": 0.9583333432674408, |
| "reward_std": 0.35901258140802383, |
| "rewards/accuracy_reward": 0.236111119389534, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 808.2083435058594, |
| "epoch": 0.6700942587832048, |
| "grad_norm": 0.4584062695503235, |
| "kl": 0.00930023193359375, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": -0.0126, |
| "reward": 1.020833358168602, |
| "reward_std": 0.5443559736013412, |
| "rewards/accuracy_reward": 0.2638888917863369, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.8055572509766, |
| "epoch": 0.6718080548414739, |
| "grad_norm": 0.5026288628578186, |
| "kl": 0.01027679443359375, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": 0.0043, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.25819889456033707, |
| "rewards/accuracy_reward": 0.1111111119389534, |
| "rewards/format_reward": 0.5, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 827.4861145019531, |
| "epoch": 0.6735218508997429, |
| "grad_norm": 0.4576079845428467, |
| "kl": 0.01000213623046875, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": 0.0398, |
| "reward": 0.7708333134651184, |
| "reward_std": 0.31242159754037857, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.625, |
| "epoch": 0.675235646958012, |
| "grad_norm": 0.46405985951423645, |
| "kl": 0.010711669921875, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": -0.0166, |
| "reward": 0.7569444477558136, |
| "reward_std": 0.36649633944034576, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.5069444477558136, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 740.8055572509766, |
| "epoch": 0.676949443016281, |
| "grad_norm": 0.33941328525543213, |
| "kl": 0.0098724365234375, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0659, |
| "reward": 0.743055559694767, |
| "reward_std": 0.24438020400702953, |
| "rewards/accuracy_reward": 0.12500000465661287, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 810.3055725097656, |
| "epoch": 0.6786632390745502, |
| "grad_norm": 0.3110896348953247, |
| "kl": 0.00872802734375, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": 0.0229, |
| "reward": 0.819444440305233, |
| "reward_std": 0.17010344192385674, |
| "rewards/accuracy_reward": 0.16666667256504297, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 735.9027862548828, |
| "epoch": 0.6803770351328192, |
| "grad_norm": 0.8936165571212769, |
| "kl": 0.01611328125, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": -0.0202, |
| "reward": 0.7013889029622078, |
| "reward_std": 0.26149011217057705, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.8472290039062, |
| "epoch": 0.6820908311910883, |
| "grad_norm": 0.23036593198776245, |
| "kl": 0.00927734375, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": 0.0021, |
| "reward": 0.8055555671453476, |
| "reward_std": 0.1773533970117569, |
| "rewards/accuracy_reward": 0.1527777798473835, |
| "rewards/format_reward": 0.5, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.8750076293945, |
| "epoch": 0.6838046272493573, |
| "grad_norm": 0.5848769545555115, |
| "kl": 0.009002685546875, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": 0.0063, |
| "reward": 1.0208333432674408, |
| "reward_std": 0.5025790855288506, |
| "rewards/accuracy_reward": 0.2638888955116272, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 822.8333435058594, |
| "epoch": 0.6855184233076264, |
| "grad_norm": 0.5785830020904541, |
| "kl": 0.00983428955078125, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0642, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.6504513919353485, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.5138854980469, |
| "epoch": 0.6872322193658955, |
| "grad_norm": 0.3859395980834961, |
| "kl": 0.009033203125, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": -0.0078, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.3910152539610863, |
| "rewards/accuracy_reward": 0.19444444589316845, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.9583435058594, |
| "epoch": 0.6889460154241646, |
| "grad_norm": 0.46305710077285767, |
| "kl": 0.009429931640625, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": 0.0405, |
| "reward": 0.659722238779068, |
| "reward_std": 0.3002174627035856, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.0972290039062, |
| "epoch": 0.6906598114824336, |
| "grad_norm": 0.30027008056640625, |
| "kl": 0.01239013671875, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": 0.0017, |
| "reward": 0.986111119389534, |
| "reward_std": 0.3444380611181259, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.8333282470703, |
| "epoch": 0.6923736075407027, |
| "grad_norm": 0.0383455790579319, |
| "kl": 0.00807952880859375, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": 0.0003, |
| "reward": 0.6666666567325592, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.5, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.8194427490234, |
| "epoch": 0.6940874035989717, |
| "grad_norm": 0.36720189452171326, |
| "kl": 0.00794219970703125, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0015, |
| "reward": 1.1944444626569748, |
| "reward_std": 0.3762567341327667, |
| "rewards/accuracy_reward": 0.3472222276031971, |
| "rewards/format_reward": 0.5, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.4305725097656, |
| "epoch": 0.6958011996572407, |
| "grad_norm": 0.5361925959587097, |
| "kl": 0.00843048095703125, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": 0.0433, |
| "reward": 0.9166666716337204, |
| "reward_std": 0.3995024636387825, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/format_reward": 0.5, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 680.8055725097656, |
| "epoch": 0.6975149957155099, |
| "grad_norm": 0.3511025905609131, |
| "kl": 0.010101318359375, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": -0.0073, |
| "reward": 1.0138888955116272, |
| "reward_std": 0.40500660240650177, |
| "rewards/accuracy_reward": 0.2638888917863369, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.3472213745117, |
| "epoch": 0.699228791773779, |
| "grad_norm": 0.56728196144104, |
| "kl": 0.0164031982421875, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": -0.0141, |
| "reward": 1.3819444477558136, |
| "reward_std": 0.46130847185850143, |
| "rewards/accuracy_reward": 0.44444444961845875, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 706.125, |
| "epoch": 0.700942587832048, |
| "grad_norm": 0.5061081647872925, |
| "kl": 0.0098876953125, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": -0.0202, |
| "reward": 0.8958333358168602, |
| "reward_std": 0.524095680564642, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.4791666641831398, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.6944580078125, |
| "epoch": 0.702656383890317, |
| "grad_norm": 0.4515543282032013, |
| "kl": 0.0114288330078125, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0007, |
| "reward": 0.7430555745959282, |
| "reward_std": 0.33044650219380856, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.875, |
| "epoch": 0.7043701799485861, |
| "grad_norm": 0.4085945785045624, |
| "kl": 0.0094146728515625, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": 0.0977, |
| "reward": 0.8402777835726738, |
| "reward_std": 0.2934070285409689, |
| "rewards/accuracy_reward": 0.18055555690079927, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 741.5000152587891, |
| "epoch": 0.7060839760068551, |
| "grad_norm": 0.4242144823074341, |
| "kl": 0.010284423828125, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.0346, |
| "reward": 0.5624999925494194, |
| "reward_std": 0.25515517219901085, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.5694580078125, |
| "epoch": 0.7077977720651243, |
| "grad_norm": 0.48833325505256653, |
| "kl": 0.0101318359375, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": 0.0055, |
| "reward": 0.7777777761220932, |
| "reward_std": 0.40472327172756195, |
| "rewards/accuracy_reward": 0.13888889085501432, |
| "rewards/format_reward": 0.5, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.5833587646484, |
| "epoch": 0.7095115681233933, |
| "grad_norm": 0.49302709102630615, |
| "kl": 0.00957489013671875, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.069, |
| "reward": 0.6458333432674408, |
| "reward_std": 0.28473581932485104, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.7916870117188, |
| "epoch": 0.7112253641816624, |
| "grad_norm": 0.568556010723114, |
| "kl": 0.0110321044921875, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.042, |
| "reward": 0.6875, |
| "reward_std": 0.2086303625255823, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.6527862548828, |
| "epoch": 0.7129391602399314, |
| "grad_norm": 0.33500367403030396, |
| "kl": 0.0119171142578125, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": -0.0129, |
| "reward": 0.9444444626569748, |
| "reward_std": 0.2453947812318802, |
| "rewards/accuracy_reward": 0.2222222276031971, |
| "rewards/format_reward": 0.5, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.6250076293945, |
| "epoch": 0.7146529562982005, |
| "grad_norm": 0.42712926864624023, |
| "kl": 0.010406494140625, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": -0.0008, |
| "reward": 0.7152777910232544, |
| "reward_std": 0.4368143603205681, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.2222366333008, |
| "epoch": 0.7163667523564696, |
| "grad_norm": 0.47915688157081604, |
| "kl": 0.0113983154296875, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.0123, |
| "reward": 0.5694444477558136, |
| "reward_std": 0.12530778720974922, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.3750152587891, |
| "epoch": 0.7180805484147387, |
| "grad_norm": 0.43631237745285034, |
| "kl": 0.01261138916015625, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": 0.0184, |
| "reward": 0.8263888955116272, |
| "reward_std": 0.35694384574890137, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.5833358764648, |
| "epoch": 0.7197943444730077, |
| "grad_norm": 0.3256857693195343, |
| "kl": 0.00872039794921875, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0162, |
| "reward": 0.7361111268401146, |
| "reward_std": 0.26139055751264095, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.9722137451172, |
| "epoch": 0.7215081405312768, |
| "grad_norm": 0.5653530359268188, |
| "kl": 0.0140533447265625, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0693, |
| "reward": 1.0208333432674408, |
| "reward_std": 0.4548392668366432, |
| "rewards/accuracy_reward": 0.2638888927176595, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.0555572509766, |
| "epoch": 0.7232219365895458, |
| "grad_norm": 0.47506195306777954, |
| "kl": 0.00907135009765625, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": 0.0037, |
| "reward": 0.6597222238779068, |
| "reward_std": 0.23915940523147583, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 802.1111297607422, |
| "epoch": 0.7249357326478149, |
| "grad_norm": 0.41230887174606323, |
| "kl": 0.01018524169921875, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": 0.0106, |
| "reward": 0.9652777761220932, |
| "reward_std": 0.41896694898605347, |
| "rewards/accuracy_reward": 0.23611111752688885, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 806.0416717529297, |
| "epoch": 0.726649528706084, |
| "grad_norm": 0.41215652227401733, |
| "kl": 0.0081787109375, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0068, |
| "reward": 0.784722238779068, |
| "reward_std": 0.359107568860054, |
| "rewards/accuracy_reward": 0.15277778077870607, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.6111221313477, |
| "epoch": 0.7283633247643531, |
| "grad_norm": 0.6025084257125854, |
| "kl": 0.0133056640625, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0072, |
| "reward": 1.0972222238779068, |
| "reward_std": 0.6032442003488541, |
| "rewards/accuracy_reward": 0.3055555634200573, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.37500762939453, |
| "epoch": 0.7300771208226221, |
| "grad_norm": 0.6861910223960876, |
| "kl": 0.0125579833984375, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": -0.0144, |
| "reward": 1.0555555671453476, |
| "reward_std": 0.5035936608910561, |
| "rewards/accuracy_reward": 0.2777777798473835, |
| "rewards/format_reward": 0.5, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.6805572509766, |
| "epoch": 0.7317909168808912, |
| "grad_norm": 0.5222618579864502, |
| "kl": 0.01123046875, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0373, |
| "reward": 0.7291666716337204, |
| "reward_std": 0.34122148901224136, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.2916793823242, |
| "epoch": 0.7335047129391602, |
| "grad_norm": 0.510343074798584, |
| "kl": 0.01007843017578125, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": 0.0694, |
| "reward": 1.0277777761220932, |
| "reward_std": 0.6949137225747108, |
| "rewards/accuracy_reward": 0.26388889644294977, |
| "rewards/format_reward": 0.5, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.0277709960938, |
| "epoch": 0.7352185089974294, |
| "grad_norm": 0.5112901329994202, |
| "kl": 0.010711669921875, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": -0.0032, |
| "reward": 0.770833320915699, |
| "reward_std": 0.4112919941544533, |
| "rewards/accuracy_reward": 0.13888889271765947, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 779.0555572509766, |
| "epoch": 0.7369323050556984, |
| "grad_norm": 0.40487441420555115, |
| "kl": 0.00792694091796875, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": -0.007, |
| "reward": 0.8541666716337204, |
| "reward_std": 0.3304464966058731, |
| "rewards/accuracy_reward": 0.18055555690079927, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.5278015136719, |
| "epoch": 0.7386461011139674, |
| "grad_norm": 0.5640285015106201, |
| "kl": 0.0105438232421875, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": 0.016, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.31113363057374954, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.7639007568359, |
| "epoch": 0.7403598971722365, |
| "grad_norm": 0.5031344890594482, |
| "kl": 0.011260986328125, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": 0.0137, |
| "reward": 0.986111119389534, |
| "reward_std": 0.4052800089120865, |
| "rewards/accuracy_reward": 0.2500000046566129, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 764.3055572509766, |
| "epoch": 0.7420736932305055, |
| "grad_norm": 0.30649498105049133, |
| "kl": 0.00821685791015625, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": 0.0368, |
| "reward": 0.9027778059244156, |
| "reward_std": 0.28722215443849564, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 690.9027709960938, |
| "epoch": 0.7437874892887746, |
| "grad_norm": 0.3682223856449127, |
| "kl": 0.00897216796875, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": -0.0162, |
| "reward": 0.6874999925494194, |
| "reward_std": 0.26762592047452927, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.2777862548828, |
| "epoch": 0.7455012853470437, |
| "grad_norm": 0.5453760027885437, |
| "kl": 0.0087127685546875, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": -0.002, |
| "reward": 0.9097222164273262, |
| "reward_std": 0.416512792930007, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 720.7500152587891, |
| "epoch": 0.7472150814053128, |
| "grad_norm": 0.4208390712738037, |
| "kl": 0.00977325439453125, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": -0.0004, |
| "reward": 1.0486111342906952, |
| "reward_std": 0.4112920016050339, |
| "rewards/accuracy_reward": 0.27777778822928667, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.1666717529297, |
| "epoch": 0.7489288774635818, |
| "grad_norm": 0.38405895233154297, |
| "kl": 0.00957489013671875, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": 0.0228, |
| "reward": 0.861111119389534, |
| "reward_std": 0.2901904284954071, |
| "rewards/accuracy_reward": 0.18055555783212185, |
| "rewards/format_reward": 0.5, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.0555572509766, |
| "epoch": 0.7506426735218509, |
| "grad_norm": 0.2572639584541321, |
| "kl": 0.0116729736328125, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.0052, |
| "reward": 0.9861111044883728, |
| "reward_std": 0.12496887892484665, |
| "rewards/accuracy_reward": 0.2500000009313226, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 721.8333435058594, |
| "epoch": 0.7523564695801199, |
| "grad_norm": 0.3360980451107025, |
| "kl": 0.00910186767578125, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": -0.0078, |
| "reward": 0.909722238779068, |
| "reward_std": 0.25718431919813156, |
| "rewards/accuracy_reward": 0.20833334047347307, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.875, |
| "epoch": 0.7540702656383891, |
| "grad_norm": 0.4243048131465912, |
| "kl": 0.0103912353515625, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0002, |
| "reward": 0.9375, |
| "reward_std": 0.3150074779987335, |
| "rewards/accuracy_reward": 0.22222222108393908, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 776.7083282470703, |
| "epoch": 0.7557840616966581, |
| "grad_norm": 0.25701966881752014, |
| "kl": 0.00785064697265625, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": 0.0297, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.1584134679287672, |
| "rewards/accuracy_reward": 0.19444444961845875, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.2500076293945, |
| "epoch": 0.7574978577549272, |
| "grad_norm": 0.42774713039398193, |
| "kl": 0.00933837890625, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": 0.0248, |
| "reward": 0.9583333432674408, |
| "reward_std": 0.3544755354523659, |
| "rewards/accuracy_reward": 0.236111119389534, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 714.0833282470703, |
| "epoch": 0.7592116538131962, |
| "grad_norm": 0.5273145437240601, |
| "kl": 0.01111602783203125, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": 0.0447, |
| "reward": 1.0486111342906952, |
| "reward_std": 0.421733595430851, |
| "rewards/accuracy_reward": 0.2777777835726738, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 805.0972442626953, |
| "epoch": 0.7609254498714653, |
| "grad_norm": 0.28000712394714355, |
| "kl": 0.00943756103515625, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0138, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.40107376128435135, |
| "rewards/accuracy_reward": 0.13888889364898205, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.0416870117188, |
| "epoch": 0.7626392459297343, |
| "grad_norm": 0.48471271991729736, |
| "kl": 0.0093841552734375, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": -0.0408, |
| "reward": 0.7708333283662796, |
| "reward_std": 0.399602010846138, |
| "rewards/accuracy_reward": 0.13888889085501432, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.1944427490234, |
| "epoch": 0.7643530419880035, |
| "grad_norm": 0.2905956208705902, |
| "kl": 0.0093841552734375, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": -0.0041, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.2211344838142395, |
| "rewards/accuracy_reward": 0.18055556155741215, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 744.8194580078125, |
| "epoch": 0.7660668380462725, |
| "grad_norm": 0.6048784255981445, |
| "kl": 0.0104522705078125, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": -0.0198, |
| "reward": 1.0555555820465088, |
| "reward_std": 0.4907895475625992, |
| "rewards/accuracy_reward": 0.2777777835726738, |
| "rewards/format_reward": 0.5, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.8194580078125, |
| "epoch": 0.7677806341045416, |
| "grad_norm": 0.3953067362308502, |
| "kl": 0.00873565673828125, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": 0.0348, |
| "reward": 1.0694444477558136, |
| "reward_std": 0.43352314084768295, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.0555877685547, |
| "epoch": 0.7694944301628106, |
| "grad_norm": 0.3929743766784668, |
| "kl": 0.0112152099609375, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": 0.0141, |
| "reward": 0.798611119389534, |
| "reward_std": 0.15942803025245667, |
| "rewards/accuracy_reward": 0.15277778450399637, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.0, |
| "epoch": 0.7712082262210797, |
| "grad_norm": 0.38888290524482727, |
| "kl": 0.0111541748046875, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": -0.0292, |
| "reward": 0.888888880610466, |
| "reward_std": 0.4547397494316101, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.5, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 732.7083587646484, |
| "epoch": 0.7729220222793488, |
| "grad_norm": 0.4212208688259125, |
| "kl": 0.0086212158203125, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": 0.0172, |
| "reward": 0.6597222164273262, |
| "reward_std": 0.2624051198363304, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 701.0416870117188, |
| "epoch": 0.7746358183376179, |
| "grad_norm": 0.4472486078739166, |
| "kl": 0.0110626220703125, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": -0.0389, |
| "reward": 0.9027777761220932, |
| "reward_std": 0.44951891899108887, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 750.6527862548828, |
| "epoch": 0.7763496143958869, |
| "grad_norm": 0.31978559494018555, |
| "kl": 0.00965118408203125, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0029, |
| "reward": 0.6527777761220932, |
| "reward_std": 0.24447975307703018, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.0555725097656, |
| "epoch": 0.778063410454156, |
| "grad_norm": 0.5572839975357056, |
| "kl": 0.0136260986328125, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": -0.0167, |
| "reward": 1.1875000149011612, |
| "reward_std": 0.4453127048909664, |
| "rewards/accuracy_reward": 0.3472222248092294, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.1110992431641, |
| "epoch": 0.779777206512425, |
| "grad_norm": 0.3036157190799713, |
| "kl": 0.0082244873046875, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0153, |
| "reward": 0.6944444328546524, |
| "reward_std": 0.2634196951985359, |
| "rewards/accuracy_reward": 0.0972222238779068, |
| "rewards/format_reward": 0.5, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 696.0000152587891, |
| "epoch": 0.781491002570694, |
| "grad_norm": 0.304671049118042, |
| "kl": 0.0107574462890625, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": -0.0076, |
| "reward": 0.8333333283662796, |
| "reward_std": 0.2221490517258644, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.875, |
| "epoch": 0.7832047986289632, |
| "grad_norm": 0.5803426504135132, |
| "kl": 0.010406494140625, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": 0.0092, |
| "reward": 0.9861111417412758, |
| "reward_std": 0.3805625271052122, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.5277862548828, |
| "epoch": 0.7849185946872322, |
| "grad_norm": 0.7371811270713806, |
| "kl": 0.0137481689453125, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": 0.0169, |
| "reward": 0.8333333283662796, |
| "reward_std": 0.43149399757385254, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.5, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.7500152587891, |
| "epoch": 0.7866323907455013, |
| "grad_norm": 0.5148778557777405, |
| "kl": 0.011505126953125, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": 0.0304, |
| "reward": 1.0763888955116272, |
| "reward_std": 0.2211344838142395, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 757.3889007568359, |
| "epoch": 0.7883461868037703, |
| "grad_norm": 0.4826764464378357, |
| "kl": 0.0094757080078125, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": -0.0019, |
| "reward": 0.7152777761220932, |
| "reward_std": 0.384667344391346, |
| "rewards/accuracy_reward": 0.1111111119389534, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.8055572509766, |
| "epoch": 0.7900599828620394, |
| "grad_norm": 0.5709467530250549, |
| "kl": 0.01129150390625, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": 0.0253, |
| "reward": 0.9027778059244156, |
| "reward_std": 0.5824100151658058, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 782.0694274902344, |
| "epoch": 0.7917737789203085, |
| "grad_norm": 0.359210729598999, |
| "kl": 0.01105499267578125, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.0259, |
| "reward": 0.8194444477558136, |
| "reward_std": 0.27941547334194183, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 722.0555572509766, |
| "epoch": 0.7934875749785776, |
| "grad_norm": 0.47103169560432434, |
| "kl": 0.01102447509765625, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": 0.0043, |
| "reward": 0.8124999925494194, |
| "reward_std": 0.43280857615172863, |
| "rewards/accuracy_reward": 0.16666667256504297, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.9722290039062, |
| "epoch": 0.7952013710368466, |
| "grad_norm": 0.34840986132621765, |
| "kl": 0.0095672607421875, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": -0.0008, |
| "reward": 0.888888880610466, |
| "reward_std": 0.13608276098966599, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.5, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.2083282470703, |
| "epoch": 0.7969151670951157, |
| "grad_norm": 0.34385910630226135, |
| "kl": 0.0106964111328125, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0174, |
| "reward": 0.8263888955116272, |
| "reward_std": 0.3135357052087784, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 738.0416717529297, |
| "epoch": 0.7986289631533847, |
| "grad_norm": 0.39174365997314453, |
| "kl": 0.0086212158203125, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": 0.0083, |
| "reward": 1.0833333730697632, |
| "reward_std": 0.4855687543749809, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/format_reward": 0.5, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.3611145019531, |
| "epoch": 0.8003427592116538, |
| "grad_norm": 0.21360760927200317, |
| "kl": 0.0073394775390625, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": 0.0061, |
| "reward": 0.7222222238779068, |
| "reward_std": 0.13608276098966599, |
| "rewards/accuracy_reward": 0.11111111287027597, |
| "rewards/format_reward": 0.5, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.0416564941406, |
| "epoch": 0.8020565552699229, |
| "grad_norm": 0.44213274121284485, |
| "kl": 0.0121307373046875, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": -0.0334, |
| "reward": 0.9375000074505806, |
| "reward_std": 0.176338829100132, |
| "rewards/accuracy_reward": 0.2222222238779068, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 908.9305419921875, |
| "epoch": 0.803770351328192, |
| "grad_norm": 2.4339747428894043, |
| "kl": 0.04512786865234375, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.0283, |
| "reward": 0.8472222164273262, |
| "reward_std": 0.4613333996385336, |
| "rewards/accuracy_reward": 0.18055556155741215, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.7222290039062, |
| "epoch": 0.805484147386461, |
| "grad_norm": 0.3775832951068878, |
| "kl": 0.00971221923828125, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0084, |
| "reward": 0.826388880610466, |
| "reward_std": 0.2337997630238533, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 837.1944580078125, |
| "epoch": 0.8071979434447301, |
| "grad_norm": 0.3247397840023041, |
| "kl": 0.0085601806640625, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": 0.0263, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.40380824357271194, |
| "rewards/accuracy_reward": 0.19444444682449102, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.1527862548828, |
| "epoch": 0.8089117395029991, |
| "grad_norm": 0.30879315733909607, |
| "kl": 0.0115203857421875, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": -0.0045, |
| "reward": 0.7708333432674408, |
| "reward_std": 0.22086109220981598, |
| "rewards/accuracy_reward": 0.1388888955116272, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 697.8194427490234, |
| "epoch": 0.8106255355612683, |
| "grad_norm": 0.5799990296363831, |
| "kl": 0.0116119384765625, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.0063, |
| "reward": 0.8333333656191826, |
| "reward_std": 0.2221490480005741, |
| "rewards/accuracy_reward": 0.18055556155741215, |
| "rewards/format_reward": 0.472222238779068, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.6944580078125, |
| "epoch": 0.8123393316195373, |
| "grad_norm": 0.673372209072113, |
| "kl": 0.012725830078125, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": -0.0041, |
| "reward": 1.1527777910232544, |
| "reward_std": 0.6239039897918701, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.6944427490234, |
| "epoch": 0.8140531276778064, |
| "grad_norm": 0.47942498326301575, |
| "kl": 0.0100555419921875, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": -0.0141, |
| "reward": 0.8888888955116272, |
| "reward_std": 0.4675438329577446, |
| "rewards/accuracy_reward": 0.1944444514811039, |
| "rewards/format_reward": 0.5, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 785.625, |
| "epoch": 0.8157669237360754, |
| "grad_norm": 0.2584112882614136, |
| "kl": 0.00859832763671875, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.0052, |
| "reward": 0.6944444328546524, |
| "reward_std": 0.0680413767695427, |
| "rewards/accuracy_reward": 0.09722222480922937, |
| "rewards/format_reward": 0.5, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 718.2083282470703, |
| "epoch": 0.8174807197943444, |
| "grad_norm": 0.33669352531433105, |
| "kl": 0.01165771484375, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0043, |
| "reward": 0.9861111342906952, |
| "reward_std": 0.32973192632198334, |
| "rewards/accuracy_reward": 0.25000000838190317, |
| "rewards/format_reward": 0.4861111119389534, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.8611145019531, |
| "epoch": 0.8191945158526135, |
| "grad_norm": 0.8135344982147217, |
| "kl": 0.01825714111328125, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": -0.0315, |
| "reward": 0.9722222238779068, |
| "reward_std": 0.4262731894850731, |
| "rewards/accuracy_reward": 0.23611111473292112, |
| "rewards/format_reward": 0.5, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.4305648803711, |
| "epoch": 0.8209083119108826, |
| "grad_norm": 0.4805351197719574, |
| "kl": 0.0129547119140625, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": -0.0022, |
| "reward": 1.194444477558136, |
| "reward_std": 0.5820766538381577, |
| "rewards/accuracy_reward": 0.3472222238779068, |
| "rewards/format_reward": 0.5, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.7777862548828, |
| "epoch": 0.8226221079691517, |
| "grad_norm": 0.37174245715141296, |
| "kl": 0.01177215576171875, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": -0.0185, |
| "reward": 1.4375, |
| "reward_std": 0.4637626111507416, |
| "rewards/accuracy_reward": 0.4722222313284874, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.3750152587891, |
| "epoch": 0.8243359040274207, |
| "grad_norm": 0.5397735834121704, |
| "kl": 0.0081024169921875, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": -0.0052, |
| "reward": 1.166666641831398, |
| "reward_std": 0.41752735525369644, |
| "rewards/accuracy_reward": 0.3333333367481828, |
| "rewards/format_reward": 0.5, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.2777862548828, |
| "epoch": 0.8260497000856898, |
| "grad_norm": 0.3484320342540741, |
| "kl": 0.011199951171875, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": 0.0018, |
| "reward": 0.7430555671453476, |
| "reward_std": 0.33303238451480865, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.263916015625, |
| "epoch": 0.8277634961439588, |
| "grad_norm": 0.43313318490982056, |
| "kl": 0.014739990234375, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": 0.0133, |
| "reward": 0.6527777761220932, |
| "reward_std": 0.25279081612825394, |
| "rewards/accuracy_reward": 0.08333333488553762, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.8888854980469, |
| "epoch": 0.829477292202228, |
| "grad_norm": 0.5132657289505005, |
| "kl": 0.009918212890625, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": 0.01, |
| "reward": 1.0277778059244156, |
| "reward_std": 0.49601035565137863, |
| "rewards/accuracy_reward": 0.2638888917863369, |
| "rewards/format_reward": 0.5, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.1388854980469, |
| "epoch": 0.831191088260497, |
| "grad_norm": 0.470441997051239, |
| "kl": 0.01073455810546875, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": -0.0112, |
| "reward": 0.888888880610466, |
| "reward_std": 0.4803479462862015, |
| "rewards/accuracy_reward": 0.19444444868713617, |
| "rewards/format_reward": 0.5, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 705.2777862548828, |
| "epoch": 0.8329048843187661, |
| "grad_norm": 0.574053943157196, |
| "kl": 0.0115509033203125, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": 0.0604, |
| "reward": 1.1041666567325592, |
| "reward_std": 0.4360002353787422, |
| "rewards/accuracy_reward": 0.3194444486871362, |
| "rewards/format_reward": 0.4652777835726738, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 708.1666793823242, |
| "epoch": 0.8346186803770351, |
| "grad_norm": 0.3975141942501068, |
| "kl": 0.01026153564453125, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": 0.011, |
| "reward": 0.6250000074505806, |
| "reward_std": 0.24970055185258389, |
| "rewards/accuracy_reward": 0.06944444496184587, |
| "rewards/format_reward": 0.486111119389534, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.0555572509766, |
| "epoch": 0.8363324764353042, |
| "grad_norm": 0.4273344576358795, |
| "kl": 0.0111236572265625, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": -0.0084, |
| "reward": 1.2708333283662796, |
| "reward_std": 0.32522569596767426, |
| "rewards/accuracy_reward": 0.38888888619840145, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 709.2083435058594, |
| "epoch": 0.8380462724935732, |
| "grad_norm": 0.3746323883533478, |
| "kl": 0.008514404296875, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": -0.0182, |
| "reward": 0.6944444477558136, |
| "reward_std": 0.2901904359459877, |
| "rewards/accuracy_reward": 0.09722222574055195, |
| "rewards/format_reward": 0.5, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.0555572509766, |
| "epoch": 0.8397600685518424, |
| "grad_norm": 0.44212105870246887, |
| "kl": 0.0089874267578125, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0007, |
| "reward": 1.1666666567325592, |
| "reward_std": 0.30821534991264343, |
| "rewards/accuracy_reward": 0.33333333767950535, |
| "rewards/format_reward": 0.5, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.7777938842773, |
| "epoch": 0.8414738646101114, |
| "grad_norm": 0.45065274834632874, |
| "kl": 0.0085906982421875, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": 0.0092, |
| "reward": 0.8055555522441864, |
| "reward_std": 0.3995024636387825, |
| "rewards/accuracy_reward": 0.15277778077870607, |
| "rewards/format_reward": 0.5, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 724.4444580078125, |
| "epoch": 0.8431876606683805, |
| "grad_norm": 0.2776910364627838, |
| "kl": 0.008880615234375, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": -0.019, |
| "reward": 0.7430555745959282, |
| "reward_std": 0.22113448940217495, |
| "rewards/accuracy_reward": 0.12500000093132257, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 734.7500152587891, |
| "epoch": 0.8449014567266495, |
| "grad_norm": 0.273231565952301, |
| "kl": 0.01227569580078125, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": 0.0169, |
| "reward": 0.75, |
| "reward_std": 0.15410767495632172, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.5, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 817.6666717529297, |
| "epoch": 0.8466152527849186, |
| "grad_norm": 0.3405468463897705, |
| "kl": 0.0100250244140625, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": -0.0042, |
| "reward": 0.9097222238779068, |
| "reward_std": 0.3072007745504379, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.8611068725586, |
| "epoch": 0.8483290488431876, |
| "grad_norm": 0.8697577714920044, |
| "kl": 0.0175018310546875, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": -0.0113, |
| "reward": 1.0625000149011612, |
| "reward_std": 0.4665292650461197, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.8472290039062, |
| "epoch": 0.8500428449014568, |
| "grad_norm": 0.5277899503707886, |
| "kl": 0.0098876953125, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0341, |
| "reward": 1.2222222238779068, |
| "reward_std": 0.686167873442173, |
| "rewards/accuracy_reward": 0.3611111231148243, |
| "rewards/format_reward": 0.5, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 800.9861145019531, |
| "epoch": 0.8517566409597258, |
| "grad_norm": 0.44679829478263855, |
| "kl": 0.012115478515625, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": 0.0143, |
| "reward": 0.8819444552063942, |
| "reward_std": 0.34861752949655056, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/format_reward": 0.4652777910232544, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 856.0694580078125, |
| "epoch": 0.8534704370179949, |
| "grad_norm": 0.22854942083358765, |
| "kl": 0.01018524169921875, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": 0.0058, |
| "reward": 0.6597222238779068, |
| "reward_std": 0.13479479402303696, |
| "rewards/accuracy_reward": 0.08333333674818277, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.8888854980469, |
| "epoch": 0.8551842330762639, |
| "grad_norm": 0.7382153272628784, |
| "kl": 0.0165252685546875, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": -0.0162, |
| "reward": 1.0277777761220932, |
| "reward_std": 0.2901904284954071, |
| "rewards/accuracy_reward": 0.26388889644294977, |
| "rewards/format_reward": 0.5, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.8194351196289, |
| "epoch": 0.856898029134533, |
| "grad_norm": 0.5685946941375732, |
| "kl": 0.0124969482421875, |
| "learning_rate": 1e-07, |
| "loss": -0.0212, |
| "reward": 1.1597222089767456, |
| "reward_std": 0.4405168890953064, |
| "rewards/accuracy_reward": 0.3333333330228925, |
| "rewards/format_reward": 0.493055559694767, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.856898029134533, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.01467308583567501, |
| "train_runtime": 25494.6867, |
| "train_samples_per_second": 1.412, |
| "train_steps_per_second": 0.02 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|