{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2547121752419766, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 321.578125, "epoch": 0.0005094243504839531, "grad_norm": 21.497011168465292, "kl": 0.0, "learning_rate": 9.997452878247579e-07, "loss": -0.0, "reward": -0.492842435836792, "reward_std": 0.7784243226051331, "rewards/accuracy_reward": -0.4125000238418579, "rewards/cosine_rewards": -0.08018936403095722, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -0.0001530575300421333, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 211.796875, "epoch": 0.0010188487009679063, "grad_norm": 8.570878529351686, "kl": 0.00115203857421875, "learning_rate": 9.99490575649516e-07, "loss": 0.0, "reward": -0.2021125927567482, "reward_std": 0.686398446559906, "rewards/accuracy_reward": -0.18437501601874828, "rewards/cosine_rewards": -0.01752197090536356, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -0.00021561131143243983, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 242.640625, "epoch": 0.0015282730514518594, "grad_norm": 7.698910727869972, "kl": 0.0014190673828125, "learning_rate": 9.99235863474274e-07, "loss": 0.0001, "reward": -0.6304773092269897, "reward_std": 0.5950716435909271, "rewards/accuracy_reward": -0.6093750298023224, "rewards/cosine_rewards": -0.03664374351501465, "rewards/format_reward": 0.015625, "rewards/repetition_rewards": -8.355615136679262e-05, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 192.765625, "epoch": 0.0020376974019358125, "grad_norm": 8.264023776311538, "kl": 0.00258636474609375, "learning_rate": 9.98981151299032e-07, "loss": 0.0001, "reward": -0.4020528346300125, "reward_std": 0.7227448225021362, "rewards/accuracy_reward": -0.38750001788139343, "rewards/cosine_rewards": -0.014348747674375772, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -0.00020408956333994865, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 199.953125, "epoch": 0.0025471217524197657, "grad_norm": 9.41735274952485, "kl": 0.00286865234375, "learning_rate": 9.9872643912379e-07, "loss": 0.0001, "reward": -0.45950669050216675, "reward_std": 0.6219092607498169, "rewards/accuracy_reward": -0.4343750476837158, "rewards/cosine_rewards": -0.02503613755106926, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -9.553764903103001e-05, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 197.9375, "epoch": 0.003056546102903719, "grad_norm": 12.944765767909546, "kl": 0.008697509765625, "learning_rate": 9.984717269485481e-07, "loss": 0.0003, "reward": -0.42242346704006195, "reward_std": 0.6794147342443466, "rewards/accuracy_reward": -0.40937504172325134, "rewards/cosine_rewards": -0.028209966607391834, "rewards/format_reward": 0.015625, "rewards/repetition_rewards": -0.0004634863289538771, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 131.859375, "epoch": 0.003565970453387672, "grad_norm": 10.259430825273313, "kl": 0.013763427734375, "learning_rate": 9.98217014773306e-07, "loss": 0.0005, "reward": -0.33318234980106354, "reward_std": 0.7437820434570312, "rewards/accuracy_reward": -0.35625000298023224, "rewards/cosine_rewards": -0.00804880098439753, "rewards/format_reward": 0.03125, "rewards/repetition_rewards": -0.00013354701513890177, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 138.0625, "epoch": 0.004075394803871625, "grad_norm": 8.664940595308508, "kl": 0.01800537109375, "learning_rate": 9.979623025980642e-07, "loss": 0.0007, "reward": -0.3353596553206444, "reward_std": 0.7424190640449524, "rewards/accuracy_reward": -0.32500002533197403, "rewards/cosine_rewards": -0.010187382809817791, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -0.00017226976342499256, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 130.609375, "epoch": 0.004584819154355578, "grad_norm": 12.906962146752678, "kl": 0.013641357421875, "learning_rate": 9.977075904228221e-07, "loss": 0.0005, "reward": -0.5576262176036835, "reward_std": 0.38936011493206024, "rewards/accuracy_reward": -0.546875, "rewards/cosine_rewards": -0.010545612312853336, "rewards/format_reward": 0.0, "rewards/repetition_rewards": -0.00020559210679493845, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 118.375, "epoch": 0.005094243504839531, "grad_norm": 12.675664846435772, "kl": 0.014129638671875, "learning_rate": 9.974528782475803e-07, "loss": 0.0006, "reward": -0.5825353264808655, "reward_std": 0.32141495356336236, "rewards/accuracy_reward": -0.5750000178813934, "rewards/cosine_rewards": -0.0075353041756898165, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 116.5625, "epoch": 0.0056036678553234845, "grad_norm": 83.14378275688269, "kl": 0.011932373046875, "learning_rate": 9.971981660723382e-07, "loss": 0.0005, "reward": -0.4973638355731964, "reward_std": 0.6479763090610504, "rewards/accuracy_reward": -0.4906250536441803, "rewards/cosine_rewards": -0.006738818949088454, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 122.5, "epoch": 0.006113092205807438, "grad_norm": 10.015051037156322, "kl": 0.01776123046875, "learning_rate": 9.969434538970963e-07, "loss": 0.0007, "reward": -0.5842953324317932, "reward_std": 0.3923248201608658, "rewards/accuracy_reward": -0.5750000029802322, "rewards/cosine_rewards": -0.009295305702835321, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 113.984375, "epoch": 0.006622516556291391, "grad_norm": 11.394446741932766, "kl": 0.018157958984375, "learning_rate": 9.966887417218542e-07, "loss": 0.0007, "reward": -0.5545713007450104, "reward_std": 0.5603736639022827, "rewards/accuracy_reward": -0.5468750298023224, "rewards/cosine_rewards": -0.007696274435147643, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 105.8125, "epoch": 0.007131940906775344, "grad_norm": 11.774338615514537, "kl": 0.017730712890625, "learning_rate": 9.964340295466124e-07, "loss": 0.0007, "reward": -0.24103393778204918, "reward_std": 0.770084798336029, "rewards/accuracy_reward": -0.23750002682209015, "rewards/cosine_rewards": -0.0035339330206625164, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 99.5, "epoch": 0.007641365257259297, "grad_norm": 12.461454822945, "kl": 0.01995849609375, "learning_rate": 9.961793173713703e-07, "loss": 0.0008, "reward": -0.7055607736110687, "reward_std": 0.2303236834704876, "rewards/accuracy_reward": -0.7156250178813934, "rewards/cosine_rewards": -0.005560769001021981, "rewards/format_reward": 0.015625, "rewards/repetition_rewards": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 101.53125, "epoch": 0.00815078960774325, "grad_norm": 16.951183982865736, "kl": 0.0206298828125, "learning_rate": 9.959246051961282e-07, "loss": 0.0008, "reward": -0.3540929928421974, "reward_std": 0.7245323657989502, "rewards/accuracy_reward": -0.3500000238418579, "rewards/cosine_rewards": -0.004092983668670058, "rewards/format_reward": 0.0, "rewards/repetition_rewards": 0.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 96.578125, "epoch": 0.008660213958227204, "grad_norm": 9.458837096460513, "kl": 0.025634765625, "learning_rate": 9.956698930208864e-07, "loss": 0.001, "reward": -0.36599001288414, "reward_std": 0.6569808125495911, "rewards/accuracy_reward": -0.37812502682209015, "rewards/cosine_rewards": -0.003489995375275612, "rewards/format_reward": 0.015625, "rewards/repetition_rewards": 0.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 96.625, "epoch": 0.009169638308711156, "grad_norm": 11.937426620152417, "kl": 0.02801513671875, "learning_rate": 9.954151808456443e-07, "loss": 0.0011, "reward": -0.40710097551345825, "reward_std": 0.7412720322608948, "rewards/accuracy_reward": -0.43437501788139343, "rewards/cosine_rewards": -0.003975986503064632, "rewards/format_reward": 0.03125, "rewards/repetition_rewards": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 97.21875, "epoch": 0.00967906265919511, "grad_norm": 12.317962197180934, "kl": 0.03466796875, "learning_rate": 9.951604686704024e-07, "loss": 0.0014, "reward": -0.25013431906700134, "reward_std": 0.7123757898807526, "rewards/accuracy_reward": -0.32500001788139343, "rewards/cosine_rewards": -0.003259307239204645, "rewards/format_reward": 0.078125, "rewards/repetition_rewards": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 97.109375, "epoch": 0.010188487009679063, "grad_norm": 24.27751022595849, "kl": 0.037109375, "learning_rate": 9.949057564951603e-07, "loss": 0.0015, "reward": -0.2632312625646591, "reward_std": 0.6930468529462814, "rewards/accuracy_reward": -0.4624999985098839, "rewards/cosine_rewards": -0.003856247873045504, "rewards/format_reward": 0.203125, "rewards/repetition_rewards": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 109.03125, "epoch": 0.010697911360163017, "grad_norm": 12.508736780907405, "kl": 0.053955078125, "learning_rate": 9.946510443199185e-07, "loss": 0.0022, "reward": -0.010567170567810535, "reward_std": 0.7874742448329926, "rewards/accuracy_reward": -0.4125000238418579, "rewards/cosine_rewards": -0.004317150334827602, "rewards/format_reward": 0.40625, "rewards/repetition_rewards": 0.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 113.390625, "epoch": 0.011207335710646969, "grad_norm": 10.983519481785477, "kl": 0.073974609375, "learning_rate": 9.943963321446764e-07, "loss": 0.003, "reward": 0.5529356598854065, "reward_std": 0.9540310502052307, "rewards/accuracy_reward": -0.2093750163912773, "rewards/cosine_rewards": -0.003314302652142942, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": 0.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 113.796875, "epoch": 0.011716760061130923, "grad_norm": 59.13650095831239, "kl": 0.084716796875, "learning_rate": 9.941416199694345e-07, "loss": 0.0034, "reward": 0.49799469113349915, "reward_std": 0.6547213792800903, "rewards/accuracy_reward": -0.43437501788139343, "rewards/cosine_rewards": -0.005130313569679856, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 114.34375, "epoch": 0.012226184411614875, "grad_norm": 33.85321217925331, "kl": 0.078369140625, "learning_rate": 9.938869077941925e-07, "loss": 0.0031, "reward": 0.5440552532672882, "reward_std": 0.4689805209636688, "rewards/accuracy_reward": -0.43437501788139343, "rewards/cosine_rewards": -0.005944762844592333, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 99.078125, "epoch": 0.01273560876209883, "grad_norm": 27.614415529764607, "kl": 0.23876953125, "learning_rate": 9.936321956189506e-07, "loss": 0.0096, "reward": 0.319291889667511, "reward_std": 0.2991320895962417, "rewards/accuracy_reward": -0.659375011920929, "rewards/cosine_rewards": -0.005708091426640749, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 21.09375, "epoch": 0.013245033112582781, "grad_norm": 78.27860464199816, "kl": 0.810546875, "learning_rate": 9.933774834437085e-07, "loss": 0.0324, "reward": 0.758573591709137, "reward_std": 0.8151377141475677, "rewards/accuracy_reward": -0.24062500894069672, "rewards/cosine_rewards": -0.0008013773494894849, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 18.75, "epoch": 0.013754457463066735, "grad_norm": 15.792726008582374, "kl": 0.876953125, "learning_rate": 9.931227712684667e-07, "loss": 0.0351, "reward": 0.5177058726549149, "reward_std": 0.6054319739341736, "rewards/accuracy_reward": -0.46562501788139343, "rewards/cosine_rewards": -0.000942649960052222, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.000101461038866546, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 15.15625, "epoch": 0.014263881813550688, "grad_norm": 28.538085950991302, "kl": 0.853515625, "learning_rate": 9.928680590932246e-07, "loss": 0.0342, "reward": 0.30591557919979095, "reward_std": 0.3449897766113281, "rewards/accuracy_reward": -0.6625000238418579, "rewards/cosine_rewards": -0.00033439824983361177, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 13.1875, "epoch": 0.014773306164034642, "grad_norm": 19.672166251005525, "kl": 0.939453125, "learning_rate": 9.926133469179825e-07, "loss": 0.0375, "reward": 0.4091247171163559, "reward_std": 0.46140581369400024, "rewards/accuracy_reward": -0.5750000476837158, "rewards/cosine_rewards": -0.0002502501738490537, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 16.171875, "epoch": 0.015282730514518594, "grad_norm": 35.96869326683099, "kl": 1.416015625, "learning_rate": 9.923586347427406e-07, "loss": 0.0566, "reward": 0.5554585456848145, "reward_std": 0.7011753022670746, "rewards/accuracy_reward": -0.3812499940395355, "rewards/cosine_rewards": -0.0007914370798971504, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 16.15625, "epoch": 0.015792154865002548, "grad_norm": 24.745191247130563, "kl": 1.01171875, "learning_rate": 9.921039225674986e-07, "loss": 0.0405, "reward": 0.6306657046079636, "reward_std": 0.7620185613632202, "rewards/accuracy_reward": -0.32187502086162567, "rewards/cosine_rewards": -0.0005842609098181129, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": 0.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 39.140625, "epoch": 0.0163015792154865, "grad_norm": 12.709104159306316, "kl": 0.7265625, "learning_rate": 9.918492103922567e-07, "loss": 0.0291, "reward": 0.38606902956962585, "reward_std": 0.8792209327220917, "rewards/accuracy_reward": -0.39375001192092896, "rewards/cosine_rewards": -0.001430943259038031, "rewards/format_reward": 0.78125, "rewards/repetition_rewards": 0.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 15.03125, "epoch": 0.016811003565970453, "grad_norm": 17.976804747397264, "kl": 0.904296875, "learning_rate": 9.915944982170146e-07, "loss": 0.0361, "reward": 0.4996982365846634, "reward_std": 0.7649624943733215, "rewards/accuracy_reward": -0.4687500149011612, "rewards/cosine_rewards": -0.00030174180574249476, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 22.609375, "epoch": 0.017320427916454408, "grad_norm": 39.58286880123024, "kl": 0.8828125, "learning_rate": 9.913397860417728e-07, "loss": 0.0353, "reward": 0.4207390695810318, "reward_std": 0.8459209501743317, "rewards/accuracy_reward": -0.4375000298023224, "rewards/cosine_rewards": -0.0011358977280906402, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": 0.0, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 13.328125, "epoch": 0.01782985226693836, "grad_norm": 19.17123986238676, "kl": 0.955078125, "learning_rate": 9.910850738665307e-07, "loss": 0.0383, "reward": 0.474868506193161, "reward_std": 0.6449769139289856, "rewards/accuracy_reward": -0.49375005066394806, "rewards/cosine_rewards": -0.0001314536166319158, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 24.328125, "epoch": 0.018339276617422313, "grad_norm": 23.5658227799872, "kl": 0.95703125, "learning_rate": 9.908303616912888e-07, "loss": 0.0382, "reward": 0.4700201153755188, "reward_std": 0.7454200983047485, "rewards/accuracy_reward": -0.41875001788139343, "rewards/cosine_rewards": -0.0018548529915278777, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": 0.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 13.921875, "epoch": 0.018848700967906265, "grad_norm": 11.872294898362206, "kl": 1.001953125, "learning_rate": 9.905756495160467e-07, "loss": 0.0401, "reward": 0.5029261708259583, "reward_std": 0.7742039263248444, "rewards/accuracy_reward": -0.4343750327825546, "rewards/cosine_rewards": -0.0001988118929148186, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 16.40625, "epoch": 0.01935812531839022, "grad_norm": 14.39070050703297, "kl": 0.978515625, "learning_rate": 9.903209373408049e-07, "loss": 0.0391, "reward": 0.4616774320602417, "reward_std": 0.7915183901786804, "rewards/accuracy_reward": -0.4125000238418579, "rewards/cosine_rewards": -0.000822544090624433, "rewards/format_reward": 0.875, "rewards/repetition_rewards": 0.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 22.1875, "epoch": 0.019867549668874173, "grad_norm": 9.423514240680602, "kl": 0.9375, "learning_rate": 9.900662251655628e-07, "loss": 0.0376, "reward": 0.5430571883916855, "reward_std": 0.5502887666225433, "rewards/accuracy_reward": -0.4062500447034836, "rewards/cosine_rewards": -0.003600762978749117, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.00021701389050576836, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 12.4375, "epoch": 0.020376974019358125, "grad_norm": 25.806979588800377, "kl": 0.9140625, "learning_rate": 9.89811512990321e-07, "loss": 0.0366, "reward": 0.503069132566452, "reward_std": 0.6060213148593903, "rewards/accuracy_reward": -0.4656250327825546, "rewards/cosine_rewards": -5.582944686466362e-05, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 12.46875, "epoch": 0.020886398369842078, "grad_norm": 20.104239930601235, "kl": 0.9296875, "learning_rate": 9.895568008150789e-07, "loss": 0.0372, "reward": 0.631201758980751, "reward_std": 0.7148115336894989, "rewards/accuracy_reward": -0.35312502086162567, "rewards/cosine_rewards": -4.822800292458851e-05, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 12.828125, "epoch": 0.021395822720326033, "grad_norm": 7.720832433302504, "kl": 0.841796875, "learning_rate": 9.89302088639837e-07, "loss": 0.0336, "reward": 0.5936954319477081, "reward_std": 0.4961870163679123, "rewards/accuracy_reward": -0.4062500298023224, "rewards/cosine_rewards": -5.458852319861762e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 12.984375, "epoch": 0.021905247070809986, "grad_norm": 9.831243087089065, "kl": 0.76953125, "learning_rate": 9.89047376464595e-07, "loss": 0.0308, "reward": 0.6499472558498383, "reward_std": 0.7755721807479858, "rewards/accuracy_reward": -0.3500000238418579, "rewards/cosine_rewards": -5.273178430797998e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.022414671421293938, "grad_norm": 15.405185965961632, "kl": 0.79296875, "learning_rate": 9.88792664289353e-07, "loss": 0.0318, "reward": 0.8749629557132721, "reward_std": 0.8532125055789948, "rewards/accuracy_reward": -0.1250000149011612, "rewards/cosine_rewards": -3.706023017002735e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.02292409577177789, "grad_norm": 73.99173140679622, "kl": 0.814453125, "learning_rate": 9.88537952114111e-07, "loss": 0.0326, "reward": 0.8468359708786011, "reward_std": 0.6202812939882278, "rewards/accuracy_reward": -0.15312501415610313, "rewards/cosine_rewards": -3.904559889633674e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 13.09375, "epoch": 0.023433520122261846, "grad_norm": 57.01467559353994, "kl": 0.802734375, "learning_rate": 9.882832399388691e-07, "loss": 0.0321, "reward": 0.7187013626098633, "reward_std": 0.7317405939102173, "rewards/accuracy_reward": -0.26562502793967724, "rewards/cosine_rewards": -4.8641444664099254e-05, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 14.8125, "epoch": 0.023942944472745798, "grad_norm": 49.94239079179156, "kl": 0.8125, "learning_rate": 9.88028527763627e-07, "loss": 0.0325, "reward": 0.7904289066791534, "reward_std": 0.6411640644073486, "rewards/accuracy_reward": -0.2093750163912773, "rewards/cosine_rewards": -0.00019609702576417476, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 13.984375, "epoch": 0.02445236882322975, "grad_norm": 29.421172213478044, "kl": 0.8046875, "learning_rate": 9.877738155883852e-07, "loss": 0.0322, "reward": 0.7342777252197266, "reward_std": 0.3429698422551155, "rewards/accuracy_reward": -0.2656249850988388, "rewards/cosine_rewards": -9.731029422255233e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.024961793173713703, "grad_norm": 30.98665699286148, "kl": 0.86328125, "learning_rate": 9.87519103413143e-07, "loss": 0.0346, "reward": 1.0437248945236206, "reward_std": 0.6164620369672775, "rewards/accuracy_reward": 0.04374997317790985, "rewards/cosine_rewards": -2.5148013037323835e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.02547121752419766, "grad_norm": 25.805195350433394, "kl": 0.787109375, "learning_rate": 9.872643912379012e-07, "loss": 0.0315, "reward": 0.6499470472335815, "reward_std": 0.4753982424736023, "rewards/accuracy_reward": -0.3500000238418579, "rewards/cosine_rewards": -5.2943185437470675e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.02598064187468161, "grad_norm": 60.37753917289171, "kl": 0.865234375, "learning_rate": 9.870096790626592e-07, "loss": 0.0347, "reward": 1.0999788641929626, "reward_std": 0.716822475194931, "rewards/accuracy_reward": 0.09999999590218067, "rewards/cosine_rewards": -2.117727399308933e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.026490066225165563, "grad_norm": 60.29300770886218, "kl": 0.859375, "learning_rate": 9.867549668874173e-07, "loss": 0.0343, "reward": 1.3249947428703308, "reward_std": 0.6325759440660477, "rewards/accuracy_reward": 0.32499997690320015, "rewards/cosine_rewards": -5.294318725646008e-06, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 14.859375, "epoch": 0.026999490575649515, "grad_norm": 33.49731491963465, "kl": 0.96484375, "learning_rate": 9.865002547121752e-07, "loss": 0.0386, "reward": 0.6497911810874939, "reward_std": 0.23335448652505875, "rewards/accuracy_reward": -0.3500000163912773, "rewards/cosine_rewards": -0.00020882973694824614, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.02750891492613347, "grad_norm": 21.106152145400298, "kl": 0.85546875, "learning_rate": 9.862455425369333e-07, "loss": 0.0342, "reward": 1.3812487125396729, "reward_std": 0.26327238231897354, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": -1.3235799087851774e-06, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.028018339276617423, "grad_norm": 50.8468456202969, "kl": 0.767578125, "learning_rate": 9.859908303616913e-07, "loss": 0.0307, "reward": 1.493756651878357, "reward_std": 0.3182205259799957, "rewards/accuracy_reward": 0.4937499910593033, "rewards/cosine_rewards": 6.617898179683834e-06, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.028527763627101375, "grad_norm": 45.22229728431362, "kl": 0.833984375, "learning_rate": 9.857361181864494e-07, "loss": 0.0334, "reward": 0.9030899405479431, "reward_std": 0.2386654019355774, "rewards/accuracy_reward": -0.09687501937150955, "rewards/cosine_rewards": -3.507485962472856e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.029037187977585328, "grad_norm": 251.48941881136554, "kl": 0.828125, "learning_rate": 9.854814060112073e-07, "loss": 0.0331, "reward": 1.5781376361846924, "reward_std": 0.3039933070540428, "rewards/accuracy_reward": 0.5781249701976776, "rewards/cosine_rewards": 1.2574006632348755e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 14.453125, "epoch": 0.029546612328069283, "grad_norm": 36.77436472817121, "kl": 0.939453125, "learning_rate": 9.852266938359653e-07, "loss": 0.0376, "reward": 1.334273636341095, "reward_std": 0.34448733925819397, "rewards/accuracy_reward": 0.34999997913837433, "rewards/cosine_rewards": -0.00010142281280423049, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 14.4375, "epoch": 0.030056036678553236, "grad_norm": 46.54068675299219, "kl": 0.89453125, "learning_rate": 9.849719816607234e-07, "loss": 0.0358, "reward": 0.9967500269412994, "reward_std": 0.4488208740949631, "rewards/accuracy_reward": 0.012499993667006493, "rewards/cosine_rewards": -0.00012501747096393956, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.030565461029037188, "grad_norm": 63.09725081890949, "kl": 0.837890625, "learning_rate": 9.847172694854813e-07, "loss": 0.0335, "reward": 0.9874708652496338, "reward_std": 0.33707569539546967, "rewards/accuracy_reward": -0.012500002980232239, "rewards/cosine_rewards": -2.9118752991053043e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.03107488537952114, "grad_norm": 105.45023488529014, "kl": 0.8359375, "learning_rate": 9.844625573102394e-07, "loss": 0.0334, "reward": 1.1281058490276337, "reward_std": 0.3039932996034622, "rewards/accuracy_reward": 0.12812498584389687, "rewards/cosine_rewards": -1.919190435728524e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 14.640625, "epoch": 0.031584309730005096, "grad_norm": 85.12929156788996, "kl": 0.84375, "learning_rate": 9.842078451349974e-07, "loss": 0.0337, "reward": 0.7748350501060486, "reward_std": 0.5448895841836929, "rewards/accuracy_reward": -0.2093750238418579, "rewards/cosine_rewards": -0.00016493651855853386, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 19.078125, "epoch": 0.032093734080489045, "grad_norm": 8.426167428667783, "kl": 0.814453125, "learning_rate": 9.839531329597555e-07, "loss": 0.0326, "reward": 0.874523401260376, "reward_std": 0.0010828198865056038, "rewards/accuracy_reward": -0.1250000149011612, "rewards/cosine_rewards": -0.0004766158472193638, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 19.34375, "epoch": 0.032603158430973, "grad_norm": 86.47647424288853, "kl": 0.810546875, "learning_rate": 9.836984207845134e-07, "loss": 0.0324, "reward": 1.6625866889953613, "reward_std": 0.19662056118249893, "rewards/accuracy_reward": 0.6625000238418579, "rewards/cosine_rewards": 8.658922160975635e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 17.6875, "epoch": 0.033112582781456956, "grad_norm": 59.22678939682069, "kl": 0.86328125, "learning_rate": 9.834437086092716e-07, "loss": 0.0345, "reward": 0.915763258934021, "reward_std": 0.082692209049128, "rewards/accuracy_reward": -0.06875000894069672, "rewards/cosine_rewards": 0.00013823993504047394, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 24.671875, "epoch": 0.033622007131940905, "grad_norm": 170.96252285475958, "kl": 0.7734375, "learning_rate": 9.831889964340295e-07, "loss": 0.0309, "reward": 1.2965829372406006, "reward_std": 0.32569222897291183, "rewards/accuracy_reward": 0.2968750074505806, "rewards/cosine_rewards": -0.0002921203849837184, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 18.890625, "epoch": 0.03413143148242486, "grad_norm": 302.02752287161115, "kl": 0.84765625, "learning_rate": 9.829342842587876e-07, "loss": 0.0339, "reward": 1.2968038320541382, "reward_std": 0.27610647678375244, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -7.123823161236942e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 22.984375, "epoch": 0.034640855832908816, "grad_norm": 617.6786375620823, "kl": 0.77734375, "learning_rate": 9.826795720835456e-07, "loss": 0.0311, "reward": 1.4656760096549988, "reward_std": 0.2886117473244667, "rewards/accuracy_reward": 0.46562498807907104, "rewards/cosine_rewards": 5.1008202717639506e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 24.34375, "epoch": 0.035150280183392765, "grad_norm": 29.79960642054238, "kl": 0.728515625, "learning_rate": 9.824248599083037e-07, "loss": 0.0292, "reward": 1.309334635734558, "reward_std": 0.20424916595220566, "rewards/accuracy_reward": 0.32500000298023224, "rewards/cosine_rewards": -4.041045031044632e-05, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 23.90625, "epoch": 0.03565970453387672, "grad_norm": 91.00871807242451, "kl": 0.744140625, "learning_rate": 9.821701477330616e-07, "loss": 0.0298, "reward": 1.2686043679714203, "reward_std": 0.10558865318307653, "rewards/accuracy_reward": 0.26874998211860657, "rewards/cosine_rewards": -0.00014566810568794608, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 36.328125, "epoch": 0.03616912888436067, "grad_norm": 159.05100875041754, "kl": 0.765625, "learning_rate": 9.819154355578195e-07, "loss": 0.0306, "reward": 1.2812767028808594, "reward_std": 0.6231541335582733, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": 2.6669338694773614e-05, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 38.4375, "epoch": 0.036678553234844626, "grad_norm": 97.83490373613579, "kl": 0.666015625, "learning_rate": 9.816607233825777e-07, "loss": 0.0266, "reward": 1.647216558456421, "reward_std": 0.32463081181049347, "rewards/accuracy_reward": 0.6625000089406967, "rewards/cosine_rewards": 0.00034145097015425563, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 40.046875, "epoch": 0.03718797758532858, "grad_norm": 88.5872574021467, "kl": 0.630859375, "learning_rate": 9.814060112073356e-07, "loss": 0.0253, "reward": 1.7599374055862427, "reward_std": 0.3454015702009201, "rewards/accuracy_reward": 0.7750000059604645, "rewards/cosine_rewards": 0.0005623315373668447, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 60.140625, "epoch": 0.03769740193581253, "grad_norm": 16.49274288281998, "kl": 0.501953125, "learning_rate": 9.811512990320937e-07, "loss": 0.0201, "reward": 1.7728378772735596, "reward_std": 0.2738931328058243, "rewards/accuracy_reward": 0.8031250238418579, "rewards/cosine_rewards": 0.0009629083215259016, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 100.9375, "epoch": 0.038206826286296486, "grad_norm": 18.283090697254373, "kl": 0.18359375, "learning_rate": 9.808965868568517e-07, "loss": 0.0074, "reward": 1.437682330608368, "reward_std": 0.19817885756492615, "rewards/accuracy_reward": 0.4375000149011612, "rewards/cosine_rewards": 0.00018233060836791992, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 107.34375, "epoch": 0.03871625063678044, "grad_norm": 21.656183371701722, "kl": 0.13330078125, "learning_rate": 9.806418746816098e-07, "loss": 0.0054, "reward": 1.2524056434631348, "reward_std": 0.14976192265748978, "rewards/accuracy_reward": 0.26874999701976776, "rewards/cosine_rewards": -0.0007193188357632607, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 109.5, "epoch": 0.03922567498726439, "grad_norm": 12.450187143986858, "kl": 0.1328125, "learning_rate": 9.803871625063677e-07, "loss": 0.0053, "reward": 1.535181999206543, "reward_std": 0.04510992762516253, "rewards/accuracy_reward": 0.5499999970197678, "rewards/cosine_rewards": 0.0008070359472185373, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 112.96875, "epoch": 0.039735099337748346, "grad_norm": 96.22924405795379, "kl": 0.12744140625, "learning_rate": 9.801324503311258e-07, "loss": 0.0051, "reward": 1.4221445322036743, "reward_std": 0.5387175530195236, "rewards/accuracy_reward": 0.4374999850988388, "rewards/cosine_rewards": 0.0002695363436941989, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 117.46875, "epoch": 0.040244523688232295, "grad_norm": 22.356744283314942, "kl": 0.12451171875, "learning_rate": 9.798777381558838e-07, "loss": 0.005, "reward": 0.9280500411987305, "reward_std": 0.3032594621181488, "rewards/accuracy_reward": -0.06875001452863216, "rewards/cosine_rewards": -0.003199932281859219, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 110.09375, "epoch": 0.04075394803871625, "grad_norm": 11.587196398677966, "kl": 0.12353515625, "learning_rate": 9.79623025980642e-07, "loss": 0.0049, "reward": 1.0698014497756958, "reward_std": 0.306557297706604, "rewards/accuracy_reward": 0.07187498360872269, "rewards/cosine_rewards": -0.0020735373545903713, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 119.359375, "epoch": 0.041263372389200206, "grad_norm": 15.552459183086345, "kl": 0.115234375, "learning_rate": 9.793683138053998e-07, "loss": 0.0046, "reward": 1.902881920337677, "reward_std": 0.2866080105304718, "rewards/accuracy_reward": 0.9156250059604645, "rewards/cosine_rewards": 0.0028820185689255595, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 121.28125, "epoch": 0.041772796739684155, "grad_norm": 21.56424441435487, "kl": 0.110107421875, "learning_rate": 9.79113601630158e-07, "loss": 0.0044, "reward": 1.2677271366119385, "reward_std": 0.10610348492627963, "rewards/accuracy_reward": 0.26874999701976776, "rewards/cosine_rewards": -0.0010228125611320138, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 115.484375, "epoch": 0.04228222109016811, "grad_norm": 11.018514876954287, "kl": 0.125244140625, "learning_rate": 9.788588894549159e-07, "loss": 0.005, "reward": 1.2675296068191528, "reward_std": 0.16161296842619777, "rewards/accuracy_reward": 0.26874999701976776, "rewards/cosine_rewards": -0.0012203185469843447, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 119.078125, "epoch": 0.04279164544065207, "grad_norm": 20.603407343348504, "kl": 0.1083984375, "learning_rate": 9.78604177279674e-07, "loss": 0.0043, "reward": 1.1548139452934265, "reward_std": 0.537171483039856, "rewards/accuracy_reward": 0.1562499925494194, "rewards/cosine_rewards": -0.0014360386412590742, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 122.640625, "epoch": 0.043301069791136015, "grad_norm": 60.83530697951784, "kl": 0.18359375, "learning_rate": 9.78349465104432e-07, "loss": 0.0073, "reward": 1.5197246074676514, "reward_std": 0.5150813460350037, "rewards/accuracy_reward": 0.518750011920929, "rewards/cosine_rewards": 0.000974582158960402, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 121.015625, "epoch": 0.04381049414161997, "grad_norm": 13.721540678774238, "kl": 0.12451171875, "learning_rate": 9.780947529291899e-07, "loss": 0.005, "reward": 1.1832407712936401, "reward_std": 0.18641822785139084, "rewards/accuracy_reward": 0.18437500298023224, "rewards/cosine_rewards": -0.001134182559326291, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 119.578125, "epoch": 0.04431991849210392, "grad_norm": 215.40977191184217, "kl": 0.115478515625, "learning_rate": 9.77840040753948e-07, "loss": 0.0046, "reward": 1.2237018644809723, "reward_std": 0.23010382801294327, "rewards/accuracy_reward": 0.24062499776482582, "rewards/cosine_rewards": -0.0012981001054868102, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 127.125, "epoch": 0.044829342842587876, "grad_norm": 11.570945117677702, "kl": 0.110595703125, "learning_rate": 9.77585328578706e-07, "loss": 0.0044, "reward": 1.5510605573654175, "reward_std": 0.0015471973456442356, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.001060541602782905, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 129.953125, "epoch": 0.04533876719307183, "grad_norm": 9.451343704964001, "kl": 0.104248046875, "learning_rate": 9.77330616403464e-07, "loss": 0.0042, "reward": 1.5073344111442566, "reward_std": 0.35981758683919907, "rewards/accuracy_reward": 0.5218750052154064, "rewards/cosine_rewards": 0.0010844313073903322, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 129.71875, "epoch": 0.04584819154355578, "grad_norm": 16.032910799390205, "kl": 0.094970703125, "learning_rate": 9.77075904228222e-07, "loss": 0.0038, "reward": 1.919905662536621, "reward_std": 0.24129686888772994, "rewards/accuracy_reward": 0.9156250059604645, "rewards/cosine_rewards": 0.0042806623969227076, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 130.390625, "epoch": 0.046357615894039736, "grad_norm": 21.086614900693085, "kl": 0.101806640625, "learning_rate": 9.768211920529801e-07, "loss": 0.0041, "reward": 1.5919697284698486, "reward_std": 0.2428576573729515, "rewards/accuracy_reward": 0.6062500029802322, "rewards/cosine_rewards": 0.0013447333476506174, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 142.734375, "epoch": 0.04686704024452369, "grad_norm": 8.093804874468816, "kl": 0.095458984375, "learning_rate": 9.76566479877738e-07, "loss": 0.0038, "reward": 1.6935226917266846, "reward_std": 0.1869470328092575, "rewards/accuracy_reward": 0.690625011920929, "rewards/cosine_rewards": 0.0028977063193451613, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 143.203125, "epoch": 0.04737646459500764, "grad_norm": 10.624707519768712, "kl": 0.099609375, "learning_rate": 9.763117677024962e-07, "loss": 0.004, "reward": 1.4030739068984985, "reward_std": 0.3680836334824562, "rewards/accuracy_reward": 0.43437500298023224, "rewards/cosine_rewards": -5.1158247515559196e-05, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 150.90625, "epoch": 0.047885888945491596, "grad_norm": 25.156466743552734, "kl": 0.101318359375, "learning_rate": 9.760570555272541e-07, "loss": 0.0041, "reward": 1.5800000429153442, "reward_std": 0.5084549486637115, "rewards/accuracy_reward": 0.578125, "rewards/cosine_rewards": 0.0018750545859802514, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 148.46875, "epoch": 0.048395313295975545, "grad_norm": 12.684719084813777, "kl": 0.10205078125, "learning_rate": 9.758023433520122e-07, "loss": 0.0041, "reward": 1.5234779119491577, "reward_std": 0.18682076036930084, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.0016028713434934616, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 161.109375, "epoch": 0.0489047376464595, "grad_norm": 13.854458043447748, "kl": 0.108154296875, "learning_rate": 9.755476311767702e-07, "loss": 0.0043, "reward": 1.6655999422073364, "reward_std": 0.4286635220050812, "rewards/accuracy_reward": 0.6624999791383743, "rewards/cosine_rewards": 0.0030998505535535514, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 167.609375, "epoch": 0.049414161996943456, "grad_norm": 15.632405914543359, "kl": 0.098388671875, "learning_rate": 9.752929190015283e-07, "loss": 0.0039, "reward": 1.1535860896110535, "reward_std": 0.36188751459121704, "rewards/accuracy_reward": 0.1562499888241291, "rewards/cosine_rewards": -0.002663849270902574, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 168.453125, "epoch": 0.049923586347427405, "grad_norm": 8.747829969093605, "kl": 0.112060546875, "learning_rate": 9.750382068262862e-07, "loss": 0.0045, "reward": 1.3531205654144287, "reward_std": 0.18947682529687881, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": -4.528439603745937e-06, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 168.71875, "epoch": 0.05043301069791136, "grad_norm": 12.880266278774922, "kl": 0.112060546875, "learning_rate": 9.747834946510442e-07, "loss": 0.0045, "reward": 1.619386613368988, "reward_std": 0.5798123776912689, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.0037616335321217775, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": 0.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 173.859375, "epoch": 0.05094243504839532, "grad_norm": 18.637994264229288, "kl": 0.109619140625, "learning_rate": 9.745287824758023e-07, "loss": 0.0044, "reward": 1.448248565196991, "reward_std": 0.4085986465215683, "rewards/accuracy_reward": 0.4624999687075615, "rewards/cosine_rewards": 0.0013735336251556873, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 175.375, "epoch": 0.051451859398879266, "grad_norm": 26.647397602965935, "kl": 0.110107421875, "learning_rate": 9.742740703005602e-07, "loss": 0.0044, "reward": 1.0668614506721497, "reward_std": 0.35026729106903076, "rewards/accuracy_reward": 0.07187499292194843, "rewards/cosine_rewards": -0.004894306650385261, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00011927480954909697, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 174.0, "epoch": 0.05196128374936322, "grad_norm": 12.612763516820905, "kl": 0.112060546875, "learning_rate": 9.740193581253183e-07, "loss": 0.0045, "reward": 1.4354371428489685, "reward_std": 0.20804932340979576, "rewards/accuracy_reward": 0.46562499552965164, "rewards/cosine_rewards": 0.0010621265973895788, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 176.984375, "epoch": 0.05247070809984717, "grad_norm": 19.00540852037756, "kl": 0.116455078125, "learning_rate": 9.737646459500763e-07, "loss": 0.0047, "reward": 1.0997494161128998, "reward_std": 0.5674505531787872, "rewards/accuracy_reward": 0.1499999761581421, "rewards/cosine_rewards": -0.003250634763389826, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0001250000059371814, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 187.3125, "epoch": 0.052980132450331126, "grad_norm": 8.993773014446798, "kl": 0.115478515625, "learning_rate": 9.735099337748344e-07, "loss": 0.0046, "reward": 1.547185480594635, "reward_std": 0.5772347450256348, "rewards/accuracy_reward": 0.5750000029802322, "rewards/cosine_rewards": 0.0034354651579633355, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 182.09375, "epoch": 0.05348955680081508, "grad_norm": 21.83379704072219, "kl": 0.11279296875, "learning_rate": 9.732552215995923e-07, "loss": 0.0045, "reward": 0.9665651321411133, "reward_std": 0.19240357726812363, "rewards/accuracy_reward": -0.012500010430812836, "rewards/cosine_rewards": -0.005309856729581952, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 184.296875, "epoch": 0.05399898115129903, "grad_norm": 15.74936299058057, "kl": 0.1240234375, "learning_rate": 9.730005094243505e-07, "loss": 0.005, "reward": 0.8526512682437897, "reward_std": 0.45641621947288513, "rewards/accuracy_reward": -0.1250000149011612, "rewards/cosine_rewards": -0.006723731989040971, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 178.203125, "epoch": 0.054508405501782986, "grad_norm": 7.621263779056561, "kl": 0.116455078125, "learning_rate": 9.727457972491084e-07, "loss": 0.0047, "reward": 1.4213617444038391, "reward_std": 0.42073580622673035, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": -0.0005132523947395384, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 180.234375, "epoch": 0.05501782985226694, "grad_norm": 14.098307573524853, "kl": 0.119140625, "learning_rate": 9.724910850738665e-07, "loss": 0.0048, "reward": 1.1232723593711853, "reward_std": 0.45567604154348373, "rewards/accuracy_reward": 0.12812499329447746, "rewards/cosine_rewards": -0.004852580255828798, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 183.328125, "epoch": 0.05552725420275089, "grad_norm": 10.937546247684887, "kl": 0.18994140625, "learning_rate": 9.722363728986245e-07, "loss": 0.0076, "reward": 1.8381596803665161, "reward_std": 0.28626738488674164, "rewards/accuracy_reward": 0.831250011920929, "rewards/cosine_rewards": 0.006909639807417989, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 174.375, "epoch": 0.056036678553234846, "grad_norm": 16.445027328915916, "kl": 0.11181640625, "learning_rate": 9.719816607233826e-07, "loss": 0.0045, "reward": 1.2096136808395386, "reward_std": 0.36066293716430664, "rewards/accuracy_reward": 0.21249999105930328, "rewards/cosine_rewards": -0.002886334084905684, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 168.765625, "epoch": 0.056546102903718795, "grad_norm": 18.874204475299454, "kl": 0.106689453125, "learning_rate": 9.717269485481405e-07, "loss": 0.0043, "reward": 1.3519207835197449, "reward_std": 0.08376272046007216, "rewards/accuracy_reward": 0.3531250059604645, "rewards/cosine_rewards": -0.0012042350135743618, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 157.796875, "epoch": 0.05705552725420275, "grad_norm": 9.104959106458736, "kl": 0.121337890625, "learning_rate": 9.714722363728986e-07, "loss": 0.0049, "reward": 1.381228744983673, "reward_std": 0.16323383897542953, "rewards/accuracy_reward": 0.3812500238418579, "rewards/cosine_rewards": -2.1282234229147434e-05, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 162.171875, "epoch": 0.05756495160468671, "grad_norm": 13.28611466805594, "kl": 0.10888671875, "learning_rate": 9.712175241976566e-07, "loss": 0.0044, "reward": 1.3366525173187256, "reward_std": 0.28694501193240285, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": -0.0008475282229483128, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 159.4375, "epoch": 0.058074375955170655, "grad_norm": 20.0350592953945, "kl": 0.107666015625, "learning_rate": 9.709628120224145e-07, "loss": 0.0043, "reward": 1.4107850790023804, "reward_std": 0.18804995715618134, "rewards/accuracy_reward": 0.40937501937150955, "rewards/cosine_rewards": 0.001410042867064476, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 159.6875, "epoch": 0.05858380030565461, "grad_norm": 9.526099983425397, "kl": 0.10595703125, "learning_rate": 9.707080998471726e-07, "loss": 0.0042, "reward": 1.4221826791763306, "reward_std": 0.2918977811932564, "rewards/accuracy_reward": 0.4374999925494194, "rewards/cosine_rewards": 0.00030758429784327745, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 157.984375, "epoch": 0.05909322465613857, "grad_norm": 14.046929728525855, "kl": 0.11572265625, "learning_rate": 9.704533876719306e-07, "loss": 0.0046, "reward": 1.2389479279518127, "reward_std": 0.4534989148378372, "rewards/accuracy_reward": 0.24062498658895493, "rewards/cosine_rewards": -0.00167706364300102, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 167.390625, "epoch": 0.059602649006622516, "grad_norm": 12.733865198375515, "kl": 0.105224609375, "learning_rate": 9.701986754966887e-07, "loss": 0.0042, "reward": 1.0669120252132416, "reward_std": 0.319850392639637, "rewards/accuracy_reward": 0.07187498360872269, "rewards/cosine_rewards": -0.004963014740496874, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 156.140625, "epoch": 0.06011207335710647, "grad_norm": 11.670513012812588, "kl": 0.093505859375, "learning_rate": 9.699439633214466e-07, "loss": 0.0038, "reward": 1.665140986442566, "reward_std": 0.12403370253741741, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.0026409668498672545, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 155.828125, "epoch": 0.06062149770759042, "grad_norm": 28.993909689663674, "kl": 0.1015625, "learning_rate": 9.696892511462047e-07, "loss": 0.0041, "reward": 1.0673952102661133, "reward_std": 0.30907338857650757, "rewards/accuracy_reward": 0.07187499105930328, "rewards/cosine_rewards": -0.004479756113141775, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 154.40625, "epoch": 0.061130922058074376, "grad_norm": 27.685530130702478, "kl": 0.104736328125, "learning_rate": 9.694345389709627e-07, "loss": 0.0042, "reward": 1.4373126029968262, "reward_std": 0.21315501490607858, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": -0.0001873411238193512, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 152.0625, "epoch": 0.06164034640855833, "grad_norm": 12.6569610895899, "kl": 0.121826171875, "learning_rate": 9.691798267957208e-07, "loss": 0.0049, "reward": 1.3240773677825928, "reward_std": 0.26775629818439484, "rewards/accuracy_reward": 0.32499999552965164, "rewards/cosine_rewards": -0.0009226472466252744, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 151.78125, "epoch": 0.06214977075904228, "grad_norm": 11.557058687556623, "kl": 0.103759765625, "learning_rate": 9.689251146204787e-07, "loss": 0.0041, "reward": 1.7503631114959717, "reward_std": 0.08287379238754511, "rewards/accuracy_reward": 0.7468750178813934, "rewards/cosine_rewards": 0.0034881452447734773, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 158.578125, "epoch": 0.06265919510952624, "grad_norm": 19.676894670897823, "kl": 0.1025390625, "learning_rate": 9.686704024452369e-07, "loss": 0.0041, "reward": 1.3520426154136658, "reward_std": 0.24371477961540222, "rewards/accuracy_reward": 0.3531249761581421, "rewards/cosine_rewards": -0.0010823981137946248, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 167.484375, "epoch": 0.06316861946001019, "grad_norm": 7.7934253879172894, "kl": 0.10107421875, "learning_rate": 9.684156902699948e-07, "loss": 0.004, "reward": 1.4387494623661041, "reward_std": 0.26880691200494766, "rewards/accuracy_reward": 0.4374999888241291, "rewards/cosine_rewards": 0.0015281732194125652, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.000278731546131894, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 160.671875, "epoch": 0.06367804381049415, "grad_norm": 14.376740722468174, "kl": 0.1064453125, "learning_rate": 9.68160978094753e-07, "loss": 0.0043, "reward": 1.2107464671134949, "reward_std": 0.20096861571073532, "rewards/accuracy_reward": 0.21249999292194843, "rewards/cosine_rewards": -0.0017535560764372349, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 158.25, "epoch": 0.06418746816097809, "grad_norm": 24.20355073697258, "kl": 0.10693359375, "learning_rate": 9.679062659195109e-07, "loss": 0.0043, "reward": 1.09614896774292, "reward_std": 0.4781967103481293, "rewards/accuracy_reward": 0.09999998658895493, "rewards/cosine_rewards": -0.0038510175654664636, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 165.203125, "epoch": 0.06469689251146205, "grad_norm": 8.56510891629326, "kl": 0.111083984375, "learning_rate": 9.676515537442688e-07, "loss": 0.0044, "reward": 1.5521512031555176, "reward_std": 0.46633191406726837, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.002151212247554213, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 164.421875, "epoch": 0.065206316861946, "grad_norm": 13.379306876395301, "kl": 0.120849609375, "learning_rate": 9.67396841569027e-07, "loss": 0.0048, "reward": 1.722820222377777, "reward_std": 0.32213538885116577, "rewards/accuracy_reward": 0.71875, "rewards/cosine_rewards": 0.004070190014317632, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 173.703125, "epoch": 0.06571574121242996, "grad_norm": 153.40445361752842, "kl": 0.1162109375, "learning_rate": 9.67142129393785e-07, "loss": 0.0047, "reward": 1.2643532752990723, "reward_std": 0.23417328391224146, "rewards/accuracy_reward": 0.2656250037252903, "rewards/cosine_rewards": -0.0012717264471575618, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 168.5625, "epoch": 0.06622516556291391, "grad_norm": 9.275290295194138, "kl": 0.10302734375, "learning_rate": 9.66887417218543e-07, "loss": 0.0041, "reward": 1.0960015654563904, "reward_std": 0.21426187455654144, "rewards/accuracy_reward": 0.09999999403953552, "rewards/cosine_rewards": -0.003998432832304388, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 176.265625, "epoch": 0.06673458991339785, "grad_norm": 19.81840956306445, "kl": 0.105224609375, "learning_rate": 9.66632705043301e-07, "loss": 0.0042, "reward": 1.6365814805030823, "reward_std": 0.20723329484462738, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": 0.0022064344957470894, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 174.234375, "epoch": 0.06724401426388181, "grad_norm": 8.864924483945437, "kl": 0.108642578125, "learning_rate": 9.66377992868059e-07, "loss": 0.0044, "reward": 1.3077268600463867, "reward_std": 0.3278057724237442, "rewards/accuracy_reward": 0.32500000670552254, "rewards/cosine_rewards": -0.0012397709069773555, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00040839536814019084, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 175.6875, "epoch": 0.06775343861436577, "grad_norm": 14.31694516925494, "kl": 0.112060546875, "learning_rate": 9.661232806928172e-07, "loss": 0.0045, "reward": 1.3927981853485107, "reward_std": 0.3101032227277756, "rewards/accuracy_reward": 0.40937499701976776, "rewards/cosine_rewards": -0.0007292817026609555, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00022258506942307577, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 174.515625, "epoch": 0.06826286296484972, "grad_norm": 13.206286933876525, "kl": 0.110107421875, "learning_rate": 9.65868568517575e-07, "loss": 0.0044, "reward": 1.4952284097671509, "reward_std": 0.16513758851215243, "rewards/accuracy_reward": 0.4937499910593033, "rewards/cosine_rewards": 0.0014784452505409718, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 188.15625, "epoch": 0.06877228731533368, "grad_norm": 8.196723017225267, "kl": 0.110107421875, "learning_rate": 9.656138563423332e-07, "loss": 0.0044, "reward": 1.3523318767547607, "reward_std": 0.19119784235954285, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": -0.0007931197178550065, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 184.375, "epoch": 0.06928171166581763, "grad_norm": 7.96588213615428, "kl": 0.10400390625, "learning_rate": 9.653591441670911e-07, "loss": 0.0042, "reward": 1.3809208273887634, "reward_std": 0.16492938250303268, "rewards/accuracy_reward": 0.3812500238418579, "rewards/cosine_rewards": -0.00032906350679695606, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 179.78125, "epoch": 0.06979113601630157, "grad_norm": 15.45343619628501, "kl": 0.1142578125, "learning_rate": 9.651044319918493e-07, "loss": 0.0046, "reward": 1.5364066362380981, "reward_std": 0.3309681713581085, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.002031611278653145, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 191.484375, "epoch": 0.07030056036678553, "grad_norm": 10.658123621710919, "kl": 0.1142578125, "learning_rate": 9.648497198166072e-07, "loss": 0.0046, "reward": 1.5228744149208069, "reward_std": 0.08533496968448162, "rewards/accuracy_reward": 0.5218750089406967, "rewards/cosine_rewards": 0.0011002181563526392, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00010080645006382838, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 205.734375, "epoch": 0.07080998471726949, "grad_norm": 13.847520067525743, "kl": 0.118408203125, "learning_rate": 9.645950076413653e-07, "loss": 0.0047, "reward": 0.6919489502906799, "reward_std": 0.29361478984355927, "rewards/accuracy_reward": -0.29375001788139343, "rewards/cosine_rewards": -0.014301038347184658, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 187.6875, "epoch": 0.07131940906775344, "grad_norm": 32.08146525993247, "kl": 0.115234375, "learning_rate": 9.643402954661233e-07, "loss": 0.0046, "reward": 1.3814507126808167, "reward_std": 0.10938079445622861, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": 0.00031310925260186195, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00011241007450735196, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 201.890625, "epoch": 0.0718288334182374, "grad_norm": 13.580365765026054, "kl": 0.12158203125, "learning_rate": 9.640855832908814e-07, "loss": 0.0049, "reward": 1.2904618978500366, "reward_std": 0.09482555650174618, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -0.006186658749356866, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00022644927958026528, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 196.5, "epoch": 0.07233825776872134, "grad_norm": 24.285612348709225, "kl": 0.113525390625, "learning_rate": 9.638308711156393e-07, "loss": 0.0045, "reward": 1.4387189745903015, "reward_std": 0.30863603949546814, "rewards/accuracy_reward": 0.4374999888241291, "rewards/cosine_rewards": 0.0012189627159386873, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 197.5, "epoch": 0.0728476821192053, "grad_norm": 22.094925488371874, "kl": 0.11669921875, "learning_rate": 9.635761589403972e-07, "loss": 0.0047, "reward": 1.495344638824463, "reward_std": 0.46879828721284866, "rewards/accuracy_reward": 0.4937499761581421, "rewards/cosine_rewards": 0.0015945886261761189, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 195.125, "epoch": 0.07335710646968925, "grad_norm": 7.954348731599845, "kl": 0.1259765625, "learning_rate": 9.633214467651554e-07, "loss": 0.005, "reward": 1.5794875025749207, "reward_std": 0.2703954949975014, "rewards/accuracy_reward": 0.6062500178813934, "rewards/cosine_rewards": 0.0044874417362734675, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 203.234375, "epoch": 0.0738665308201732, "grad_norm": 10.240352920236468, "kl": 0.1240234375, "learning_rate": 9.630667345899133e-07, "loss": 0.005, "reward": 1.323024868965149, "reward_std": 0.3642221838235855, "rewards/accuracy_reward": 0.32499999552965164, "rewards/cosine_rewards": -0.0019751336076296866, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 210.96875, "epoch": 0.07437595517065716, "grad_norm": 7.757169620409243, "kl": 0.1318359375, "learning_rate": 9.628120224146714e-07, "loss": 0.0053, "reward": 1.4797114729881287, "reward_std": 0.40076301991939545, "rewards/accuracy_reward": 0.4937500078231096, "rewards/cosine_rewards": 0.001799287972971797, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00021284000831656158, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 203.015625, "epoch": 0.0748853795211411, "grad_norm": 9.791138031684504, "kl": 0.1123046875, "learning_rate": 9.625573102394294e-07, "loss": 0.0045, "reward": 1.5510019659996033, "reward_std": 0.27960680425167084, "rewards/accuracy_reward": 0.5781249850988388, "rewards/cosine_rewards": 0.0043075907160528, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0001806358341127634, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 211.6875, "epoch": 0.07539480387162506, "grad_norm": 7.852116556950569, "kl": 0.12060546875, "learning_rate": 9.623025980641875e-07, "loss": 0.0048, "reward": 1.2943141460418701, "reward_std": 0.43064263463020325, "rewards/accuracy_reward": 0.2968749701976776, "rewards/cosine_rewards": -0.0023121244739741087, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0002487746678525582, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 220.8125, "epoch": 0.07590422822210902, "grad_norm": 19.19937094751421, "kl": 0.1240234375, "learning_rate": 9.620478858889454e-07, "loss": 0.005, "reward": 1.8400413990020752, "reward_std": 0.39075249433517456, "rewards/accuracy_reward": 0.8593749701976776, "rewards/cosine_rewards": 0.011916308663785458, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 219.890625, "epoch": 0.07641365257259297, "grad_norm": 17.532821778348946, "kl": 0.1376953125, "learning_rate": 9.617931737137036e-07, "loss": 0.0055, "reward": 1.5498095750808716, "reward_std": 0.29365313798189163, "rewards/accuracy_reward": 0.5781250298023224, "rewards/cosine_rewards": 0.0030243303044699132, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -8.979885024018586e-05, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 215.3125, "epoch": 0.07692307692307693, "grad_norm": 8.653969191960044, "kl": 0.120361328125, "learning_rate": 9.615384615384615e-07, "loss": 0.0048, "reward": 1.2203205227851868, "reward_std": 0.5703159868717194, "rewards/accuracy_reward": 0.24062499403953552, "rewards/cosine_rewards": -0.004613903176505119, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -6.565126386703923e-05, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 214.484375, "epoch": 0.07743250127356088, "grad_norm": 56.921468540439704, "kl": 0.119873046875, "learning_rate": 9.612837493632196e-07, "loss": 0.0048, "reward": 1.2310086488723755, "reward_std": 0.41925153136253357, "rewards/accuracy_reward": 0.2656249925494194, "rewards/cosine_rewards": -0.00321156473364681, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.00015470296784769744, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 219.0, "epoch": 0.07794192562404482, "grad_norm": 9.400079372239615, "kl": 0.107666015625, "learning_rate": 9.610290371879775e-07, "loss": 0.0043, "reward": 1.6124141216278076, "reward_std": 0.487982913851738, "rewards/accuracy_reward": 0.606249988079071, "rewards/cosine_rewards": 0.006320342654362321, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00015624999650754035, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 214.90625, "epoch": 0.07845134997452878, "grad_norm": 16.320123460893743, "kl": 0.125732421875, "learning_rate": 9.607743250127357e-07, "loss": 0.005, "reward": 1.5100122094154358, "reward_std": 0.4279818534851074, "rewards/accuracy_reward": 0.5218749791383743, "rewards/cosine_rewards": 0.0037621970986947417, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 226.125, "epoch": 0.07896077432501274, "grad_norm": 12.345482711993489, "kl": 0.163330078125, "learning_rate": 9.605196128374936e-07, "loss": 0.0065, "reward": 0.9318991005420685, "reward_std": 0.23740804940462112, "rewards/accuracy_reward": -0.04062497615814209, "rewards/cosine_rewards": -0.011850890005007386, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 223.671875, "epoch": 0.07947019867549669, "grad_norm": 6.991107442023172, "kl": 0.1240234375, "learning_rate": 9.602649006622515e-07, "loss": 0.005, "reward": 0.9476701319217682, "reward_std": 0.33865927904844284, "rewards/accuracy_reward": -0.04062502086162567, "rewards/cosine_rewards": -0.011704806645866483, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 239.46875, "epoch": 0.07997962302598065, "grad_norm": 11.299778697394407, "kl": 0.117919921875, "learning_rate": 9.600101884870097e-07, "loss": 0.0047, "reward": 1.3649136424064636, "reward_std": 0.42557042837142944, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": -0.0007113651372492313, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 231.015625, "epoch": 0.08048904737646459, "grad_norm": 16.322542283478924, "kl": 0.12255859375, "learning_rate": 9.597554763117676e-07, "loss": 0.0049, "reward": 1.3941306471824646, "reward_std": 0.4155275672674179, "rewards/accuracy_reward": 0.40937498211860657, "rewards/cosine_rewards": 0.000547687232028693, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00016711230273358524, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 213.3125, "epoch": 0.08099847172694855, "grad_norm": 6.393101239111367, "kl": 0.11865234375, "learning_rate": 9.595007641365257e-07, "loss": 0.0047, "reward": 1.2930153012275696, "reward_std": 0.2976529533043504, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -0.0038597104139626026, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 215.125, "epoch": 0.0815078960774325, "grad_norm": 15.2053157125375, "kl": 0.119140625, "learning_rate": 9.592460519612836e-07, "loss": 0.0048, "reward": 1.2102863192558289, "reward_std": 0.43368688225746155, "rewards/accuracy_reward": 0.2124999836087227, "rewards/cosine_rewards": -0.0022136420011520386, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 209.109375, "epoch": 0.08201732042791646, "grad_norm": 8.230345044429809, "kl": 0.11376953125, "learning_rate": 9.589913397860418e-07, "loss": 0.0045, "reward": 1.5247125625610352, "reward_std": 0.313697911798954, "rewards/accuracy_reward": 0.5218749791383743, "rewards/cosine_rewards": 0.0028375727706588805, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 208.421875, "epoch": 0.08252674477840041, "grad_norm": 7.440050437747031, "kl": 0.132568359375, "learning_rate": 9.587366276107997e-07, "loss": 0.0053, "reward": 1.4958758354187012, "reward_std": 0.2720055654644966, "rewards/accuracy_reward": 0.4937499761581421, "rewards/cosine_rewards": 0.002125886792782694, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 201.125, "epoch": 0.08303616912888435, "grad_norm": 44.81125981696038, "kl": 0.119384765625, "learning_rate": 9.584819154355578e-07, "loss": 0.0048, "reward": 1.5242316722869873, "reward_std": 0.6014019548892975, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.0023566827294416726, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 207.046875, "epoch": 0.08354559347936831, "grad_norm": 13.052733655899614, "kl": 0.119140625, "learning_rate": 9.582272032603158e-07, "loss": 0.0048, "reward": 1.6689130067825317, "reward_std": 0.2884200101252645, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.0064131125109270215, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 204.53125, "epoch": 0.08405501782985227, "grad_norm": 45.85990240379708, "kl": 0.455078125, "learning_rate": 9.57972491085074e-07, "loss": 0.0181, "reward": 1.72576242685318, "reward_std": 0.48727013170719147, "rewards/accuracy_reward": 0.7187499701976776, "rewards/cosine_rewards": 0.007012464571744204, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 203.890625, "epoch": 0.08456444218033622, "grad_norm": 62.93710363981597, "kl": 0.117431640625, "learning_rate": 9.577177789098318e-07, "loss": 0.0047, "reward": 0.9781621694564819, "reward_std": 0.20786645263433456, "rewards/accuracy_reward": -0.012500008568167686, "rewards/cosine_rewards": -0.009250549599528313, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -8.729050023248419e-05, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 201.109375, "epoch": 0.08507386653082018, "grad_norm": 11.046432411232663, "kl": 0.13134765625, "learning_rate": 9.5746306673459e-07, "loss": 0.0052, "reward": 1.3762089014053345, "reward_std": 0.32985249161720276, "rewards/accuracy_reward": 0.37812500819563866, "rewards/cosine_rewards": -0.0019161199452355504, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 201.578125, "epoch": 0.08558329088130413, "grad_norm": 5.950394983602238, "kl": 0.11279296875, "learning_rate": 9.572083545593479e-07, "loss": 0.0045, "reward": 1.0190700888633728, "reward_std": 0.6297826766967773, "rewards/accuracy_reward": 0.04062497615814209, "rewards/cosine_rewards": -0.005929919425398111, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 204.71875, "epoch": 0.08609271523178808, "grad_norm": 17.640294707452878, "kl": 0.11572265625, "learning_rate": 9.56953642384106e-07, "loss": 0.0046, "reward": 0.9798631221055984, "reward_std": 0.20544240390881896, "rewards/accuracy_reward": -0.012500017881393433, "rewards/cosine_rewards": -0.007636879570782185, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 190.34375, "epoch": 0.08660213958227203, "grad_norm": 7.762505201079191, "kl": 0.112548828125, "learning_rate": 9.56698930208864e-07, "loss": 0.0045, "reward": 1.152494490146637, "reward_std": 0.30975981056690216, "rewards/accuracy_reward": 0.15625, "rewards/cosine_rewards": -0.0037555836606770754, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 194.875, "epoch": 0.08711156393275599, "grad_norm": 11.506198147130426, "kl": 0.111083984375, "learning_rate": 9.564442180336219e-07, "loss": 0.0045, "reward": 0.9780029058456421, "reward_std": 0.6867689490318298, "rewards/accuracy_reward": -0.012500010430812836, "rewards/cosine_rewards": -0.009497055783867836, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 189.125, "epoch": 0.08762098828323994, "grad_norm": 11.819517524957538, "kl": 0.10546875, "learning_rate": 9.5618950585838e-07, "loss": 0.0042, "reward": 1.3241556882858276, "reward_std": 0.3773365914821625, "rewards/accuracy_reward": 0.32499998807907104, "rewards/cosine_rewards": -0.0008443233091384172, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 182.34375, "epoch": 0.0881304126337239, "grad_norm": 8.416955648144409, "kl": 0.116943359375, "learning_rate": 9.55934793683138e-07, "loss": 0.0047, "reward": 1.6661878824234009, "reward_std": 0.20251824986189604, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.003687863936647773, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 191.921875, "epoch": 0.08863983698420784, "grad_norm": 7.131693684668596, "kl": 0.12060546875, "learning_rate": 9.55680081507896e-07, "loss": 0.0048, "reward": 1.0765551328659058, "reward_std": 0.3972722738981247, "rewards/accuracy_reward": 0.09999998845160007, "rewards/cosine_rewards": -0.007819817401468754, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 181.75, "epoch": 0.0891492613346918, "grad_norm": 11.781793838489648, "kl": 0.11083984375, "learning_rate": 9.55425369332654e-07, "loss": 0.0044, "reward": 1.5515506863594055, "reward_std": 0.3071342632174492, "rewards/accuracy_reward": 0.5500000268220901, "rewards/cosine_rewards": 0.0015506702475249767, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 175.609375, "epoch": 0.08965868568517575, "grad_norm": 40.566147710572594, "kl": 0.109130859375, "learning_rate": 9.551706571574121e-07, "loss": 0.0044, "reward": 1.5527549982070923, "reward_std": 0.39126846194267273, "rewards/accuracy_reward": 0.5499999895691872, "rewards/cosine_rewards": 0.0027549704536795616, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 178.53125, "epoch": 0.09016811003565971, "grad_norm": 12.817541000164553, "kl": 0.10595703125, "learning_rate": 9.5491594498217e-07, "loss": 0.0042, "reward": 1.9808745980262756, "reward_std": 0.08480274910107255, "rewards/accuracy_reward": 0.971875011920929, "rewards/cosine_rewards": 0.009283588267862797, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0002840909000951797, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 186.75, "epoch": 0.09067753438614366, "grad_norm": 7.914504626786531, "kl": 0.103759765625, "learning_rate": 9.546612328069282e-07, "loss": 0.0041, "reward": 1.5245178937911987, "reward_std": 0.34963520616292953, "rewards/accuracy_reward": 0.5218750089406967, "rewards/cosine_rewards": 0.0026429439894855022, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 188.9375, "epoch": 0.0911869587366276, "grad_norm": 8.759963126963402, "kl": 0.13037109375, "learning_rate": 9.544065206316861e-07, "loss": 0.0052, "reward": 1.638785481452942, "reward_std": 0.2284149518236518, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": 0.0044105148408561945, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 197.90625, "epoch": 0.09169638308711156, "grad_norm": 6.418887244829745, "kl": 0.116943359375, "learning_rate": 9.541518084564442e-07, "loss": 0.0047, "reward": 1.3810052275657654, "reward_std": 0.4032685235142708, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": -0.0001616678200662136, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -8.311169949593022e-05, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 186.078125, "epoch": 0.09220580743759552, "grad_norm": 6.304857280986285, "kl": 0.12890625, "learning_rate": 9.538970962812022e-07, "loss": 0.0051, "reward": 1.2630045115947723, "reward_std": 0.17365956178400666, "rewards/accuracy_reward": 0.2656250037252903, "rewards/cosine_rewards": -0.002620481769554317, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 190.265625, "epoch": 0.09271523178807947, "grad_norm": 11.35970580998742, "kl": 0.11181640625, "learning_rate": 9.536423841059602e-07, "loss": 0.0045, "reward": 1.6366259455680847, "reward_std": 0.2085256204009056, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": 0.002349784132093191, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -9.889240755001083e-05, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 195.96875, "epoch": 0.09322465613856343, "grad_norm": 33.125698049494765, "kl": 0.118408203125, "learning_rate": 9.533876719307182e-07, "loss": 0.0048, "reward": 1.553468942642212, "reward_std": 0.16592675540596247, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.003468883689492941, "rewards/format_reward": 1.0, "rewards/repetition_rewards": 0.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 205.171875, "epoch": 0.09373408048904738, "grad_norm": 15.866784423418947, "kl": 0.115234375, "learning_rate": 9.531329597554763e-07, "loss": 0.0046, "reward": 1.1207141280174255, "reward_std": 0.19428733736276627, "rewards/accuracy_reward": 0.12812499701976776, "rewards/cosine_rewards": -0.007209272123873234, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.00020161290012765676, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 205.640625, "epoch": 0.09424350483953133, "grad_norm": 20.352317051449248, "kl": 0.3115234375, "learning_rate": 9.528782475802343e-07, "loss": 0.0124, "reward": 1.6525439023971558, "reward_std": 0.38362888991832733, "rewards/accuracy_reward": 0.6625000238418579, "rewards/cosine_rewards": 0.005779681145213544, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00011081559932790697, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 201.234375, "epoch": 0.09475292919001528, "grad_norm": 9.691736527471202, "kl": 0.124755859375, "learning_rate": 9.526235354049923e-07, "loss": 0.005, "reward": 0.9626118838787079, "reward_std": 0.40054861456155777, "rewards/accuracy_reward": -0.015625011175870895, "rewards/cosine_rewards": -0.006138101452961564, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 223.578125, "epoch": 0.09526235354049924, "grad_norm": 9.437447186660206, "kl": 0.123046875, "learning_rate": 9.523688232297503e-07, "loss": 0.0049, "reward": 1.569740116596222, "reward_std": 0.1397167220711708, "rewards/accuracy_reward": 0.5781250298023224, "rewards/cosine_rewards": 0.007240177597850561, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": 0.0, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 233.625, "epoch": 0.09577177789098319, "grad_norm": 14.943967064784786, "kl": 0.15087890625, "learning_rate": 9.521141110545084e-07, "loss": 0.006, "reward": 1.0288785099983215, "reward_std": 0.2980290725827217, "rewards/accuracy_reward": 0.07187499105930328, "rewards/cosine_rewards": -0.011638639261946082, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.00010775862028822303, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 213.921875, "epoch": 0.09628120224146715, "grad_norm": 15.000304282598657, "kl": 0.12744140625, "learning_rate": 9.518593988792664e-07, "loss": 0.0051, "reward": 1.3486477732658386, "reward_std": 0.30507488548755646, "rewards/accuracy_reward": 0.34999997913837433, "rewards/cosine_rewards": -0.0010681524872779846, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0002840909000951797, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 212.96875, "epoch": 0.09679062659195109, "grad_norm": 5.941418391788204, "kl": 0.13916015625, "learning_rate": 9.516046867040244e-07, "loss": 0.0056, "reward": 1.6924657821655273, "reward_std": 0.3630830645561218, "rewards/accuracy_reward": 0.7187499850988388, "rewards/cosine_rewards": 0.004965720232576132, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 230.421875, "epoch": 0.09730005094243505, "grad_norm": 7.708020896616392, "kl": 0.14453125, "learning_rate": 9.513499745287824e-07, "loss": 0.0058, "reward": 1.250920683145523, "reward_std": 0.4801155626773834, "rewards/accuracy_reward": 0.2968749776482582, "rewards/cosine_rewards": 0.0009206933900713921, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 247.3125, "epoch": 0.097809475292919, "grad_norm": 13.916574397802314, "kl": 0.13671875, "learning_rate": 9.510952623535404e-07, "loss": 0.0055, "reward": 1.129820704460144, "reward_std": 0.717576265335083, "rewards/accuracy_reward": 0.21249999850988388, "rewards/cosine_rewards": -0.0045542995212599635, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 244.21875, "epoch": 0.09831889964340296, "grad_norm": 5.201799859391353, "kl": 0.13427734375, "learning_rate": 9.508405501782984e-07, "loss": 0.0054, "reward": 1.3024629950523376, "reward_std": 0.4307016432285309, "rewards/accuracy_reward": 0.37812498956918716, "rewards/cosine_rewards": 0.0024630045518279076, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 232.203125, "epoch": 0.09882832399388691, "grad_norm": 9.365246983313067, "kl": 0.12939453125, "learning_rate": 9.505858380030564e-07, "loss": 0.0052, "reward": 0.7090668827295303, "reward_std": 0.5288920998573303, "rewards/accuracy_reward": -0.23750002309679985, "rewards/cosine_rewards": -0.022183137945830822, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 225.8125, "epoch": 0.09933774834437085, "grad_norm": 12.601377217707842, "kl": 0.14501953125, "learning_rate": 9.503311258278145e-07, "loss": 0.0058, "reward": 1.413894236087799, "reward_std": 0.7254346013069153, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.001518724486231804, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.00012443749437807128, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 226.1875, "epoch": 0.09984717269485481, "grad_norm": 6.45339117400833, "kl": 0.1376953125, "learning_rate": 9.500764136525725e-07, "loss": 0.0055, "reward": 1.5756230354309082, "reward_std": 0.48759835958480835, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.007316130446270108, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0004430353583302349, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 227.796875, "epoch": 0.10035659704533877, "grad_norm": 11.359096408502863, "kl": 0.2138671875, "learning_rate": 9.498217014773305e-07, "loss": 0.0086, "reward": 1.2498727440834045, "reward_std": 0.5983296632766724, "rewards/accuracy_reward": 0.37812499701976776, "rewards/cosine_rewards": -0.0031815596157684922, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -7.070136052789167e-05, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 232.234375, "epoch": 0.10086602139582272, "grad_norm": 13.920508776432966, "kl": 0.12255859375, "learning_rate": 9.495669893020886e-07, "loss": 0.0049, "reward": 0.6862081587314606, "reward_std": 0.7485357820987701, "rewards/accuracy_reward": -0.2656250223517418, "rewards/cosine_rewards": -0.01686593284830451, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -5.089576370664872e-05, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 205.140625, "epoch": 0.10137544574630668, "grad_norm": 8.671114765474545, "kl": 0.124267578125, "learning_rate": 9.493122771268466e-07, "loss": 0.005, "reward": 1.1818422079086304, "reward_std": 0.5584293901920319, "rewards/accuracy_reward": 0.29375000298023224, "rewards/cosine_rewards": -0.002258662148960866, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0002741228090599179, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 216.875, "epoch": 0.10188487009679063, "grad_norm": 10.078840755345926, "kl": 0.126708984375, "learning_rate": 9.490575649516046e-07, "loss": 0.0051, "reward": 1.3181660771369934, "reward_std": 0.6019489467144012, "rewards/accuracy_reward": 0.40937498211860657, "rewards/cosine_rewards": 0.002541057765483856, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 199.515625, "epoch": 0.10239429444727458, "grad_norm": 5.555372749575627, "kl": 0.130859375, "learning_rate": 9.488028527763627e-07, "loss": 0.0052, "reward": 1.5905040502548218, "reward_std": 0.41512130200862885, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.006129102781414986, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 235.03125, "epoch": 0.10290371879775853, "grad_norm": 10.052848791937414, "kl": 0.12890625, "learning_rate": 9.485481406011207e-07, "loss": 0.0051, "reward": 1.1239948272705078, "reward_std": 0.9119550585746765, "rewards/accuracy_reward": 0.26875001192092896, "rewards/cosine_rewards": -0.004009488970041275, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.00012065636838087812, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 243.5, "epoch": 0.10341314314824249, "grad_norm": 11.024825341246498, "kl": 0.115966796875, "learning_rate": 9.482934284258787e-07, "loss": 0.0046, "reward": 0.9447762966156006, "reward_std": 0.800986647605896, "rewards/accuracy_reward": 0.12812498584389687, "rewards/cosine_rewards": -0.011282204184681177, "rewards/format_reward": 0.828125, "rewards/repetition_rewards": -0.00019145716942148283, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 232.375, "epoch": 0.10392256749872644, "grad_norm": 7.842670844504004, "kl": 0.118408203125, "learning_rate": 9.480387162506367e-07, "loss": 0.0047, "reward": 1.2975149750709534, "reward_std": 0.6046717762947083, "rewards/accuracy_reward": 0.40937497094273567, "rewards/cosine_rewards": -0.0023666354827582836, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.00011837121564894915, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 224.234375, "epoch": 0.1044319918492104, "grad_norm": 20.462186918743633, "kl": 0.125732421875, "learning_rate": 9.477840040753947e-07, "loss": 0.005, "reward": 1.0097321271896362, "reward_std": 0.4872446656227112, "rewards/accuracy_reward": 0.1250000149011612, "rewards/cosine_rewards": -0.005829372443258762, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -6.351625779643655e-05, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 224.328125, "epoch": 0.10494141619969434, "grad_norm": 18.578745554695768, "kl": 0.1298828125, "learning_rate": 9.475292919001527e-07, "loss": 0.0052, "reward": 0.8584832549095154, "reward_std": 0.5744369626045227, "rewards/accuracy_reward": 0.040624991059303284, "rewards/cosine_rewards": -0.010225818026810884, "rewards/format_reward": 0.828125, "rewards/repetition_rewards": -4.101049853488803e-05, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 209.234375, "epoch": 0.1054508405501783, "grad_norm": 20.30723423804259, "kl": 0.11376953125, "learning_rate": 9.472745797249107e-07, "loss": 0.0045, "reward": 1.0917281210422516, "reward_std": 0.4984763488173485, "rewards/accuracy_reward": 0.24062497913837433, "rewards/cosine_rewards": -0.008111415430903435, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0001604560275154654, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 202.28125, "epoch": 0.10596026490066225, "grad_norm": 12.813170138938949, "kl": 0.130859375, "learning_rate": 9.470198675496688e-07, "loss": 0.0052, "reward": 1.258288562297821, "reward_std": 0.4630318433046341, "rewards/accuracy_reward": 0.3812500238418579, "rewards/cosine_rewards": 0.0022279657423496246, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.00018934992840513587, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 199.0, "epoch": 0.10646968925114621, "grad_norm": 10.281274330223528, "kl": 0.15185546875, "learning_rate": 9.467651553744268e-07, "loss": 0.0061, "reward": 1.2817729711532593, "reward_std": 0.4518425017595291, "rewards/accuracy_reward": 0.40937501937150955, "rewards/cosine_rewards": -0.0026020415825769305, "rewards/format_reward": 0.875, "rewards/repetition_rewards": 0.0, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 191.90625, "epoch": 0.10697911360163016, "grad_norm": 5.991490542584776, "kl": 0.118896484375, "learning_rate": 9.465104431991848e-07, "loss": 0.0047, "reward": 1.537351131439209, "reward_std": 0.5478895753622055, "rewards/accuracy_reward": 0.6875000149011612, "rewards/cosine_rewards": 0.00610114517621696, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": 0.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 217.015625, "epoch": 0.1074885379521141, "grad_norm": 60.00536554673066, "kl": 0.115966796875, "learning_rate": 9.462557310239428e-07, "loss": 0.0046, "reward": 0.7785031795501709, "reward_std": 0.42832519114017487, "rewards/accuracy_reward": -0.1250000223517418, "rewards/cosine_rewards": -0.018284045159816742, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -8.778089977568015e-05, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 205.21875, "epoch": 0.10799796230259806, "grad_norm": 21.72003474650607, "kl": 0.1220703125, "learning_rate": 9.460010188487009e-07, "loss": 0.0049, "reward": 1.0551989674568176, "reward_std": 0.46487441658973694, "rewards/accuracy_reward": 0.18437497317790985, "rewards/cosine_rewards": -0.004139983095228672, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -3.600230411393568e-05, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 237.609375, "epoch": 0.10850738665308202, "grad_norm": 13.593112035734373, "kl": 0.118896484375, "learning_rate": 9.457463066734589e-07, "loss": 0.0048, "reward": 1.4092811346054077, "reward_std": 0.6851305663585663, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": 0.009402429801411927, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": -0.00012127523950766772, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 266.65625, "epoch": 0.10901681100356597, "grad_norm": 102.50188740817565, "kl": 0.114013671875, "learning_rate": 9.45491594498217e-07, "loss": 0.0046, "reward": 1.3722986578941345, "reward_std": 0.5870523750782013, "rewards/accuracy_reward": 0.5218750089406967, "rewards/cosine_rewards": -0.008539619389921427, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.000411735316447448, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 264.796875, "epoch": 0.10952623535404993, "grad_norm": 17.000809347446246, "kl": 0.114013671875, "learning_rate": 9.452368823229751e-07, "loss": 0.0046, "reward": 1.182218611240387, "reward_std": 0.6600647866725922, "rewards/accuracy_reward": 0.265625, "rewards/cosine_rewards": -0.02086095977574587, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -4.542151145869866e-05, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 305.796875, "epoch": 0.11003565970453388, "grad_norm": 21.15698667263886, "kl": 0.107666015625, "learning_rate": 9.449821701477331e-07, "loss": 0.0043, "reward": 1.1834356784820557, "reward_std": 0.686463937163353, "rewards/accuracy_reward": 0.2968749925494194, "rewards/cosine_rewards": -0.019689313136041164, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": 0.0, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 400.296875, "epoch": 0.11054508405501783, "grad_norm": 7.9150578217370535, "kl": 0.09814453125, "learning_rate": 9.447274579724911e-07, "loss": 0.0039, "reward": 1.0662736892700195, "reward_std": 0.8222787380218506, "rewards/accuracy_reward": 0.18437499552965164, "rewards/cosine_rewards": -0.0396728478372097, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.00030338978831423447, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 345.671875, "epoch": 0.11105450840550178, "grad_norm": 10.397918174345362, "kl": 0.1357421875, "learning_rate": 9.444727457972492e-07, "loss": 0.0054, "reward": 1.6620882153511047, "reward_std": 0.6701975017786026, "rewards/accuracy_reward": 0.71875, "rewards/cosine_rewards": 0.021556629799306393, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -9.343791316496208e-05, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 361.984375, "epoch": 0.11156393275598574, "grad_norm": 16.664810091620872, "kl": 0.09619140625, "learning_rate": 9.442180336220072e-07, "loss": 0.0038, "reward": 0.6033791899681091, "reward_std": 0.47761378437280655, "rewards/accuracy_reward": -0.2656250149011612, "rewards/cosine_rewards": -0.0521757323294878, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0006950152310309932, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 359.359375, "epoch": 0.11207335710646969, "grad_norm": 6.030171938320145, "kl": 0.09423828125, "learning_rate": 9.439633214467651e-07, "loss": 0.0038, "reward": 1.0732125043869019, "reward_std": 0.532948449254036, "rewards/accuracy_reward": 0.18437499180436134, "rewards/cosine_rewards": -0.017268475145101547, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.00014401252064999426, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 467.921875, "epoch": 0.11258278145695365, "grad_norm": 13.578372073106125, "kl": 0.08740234375, "learning_rate": 9.437086092715231e-07, "loss": 0.0035, "reward": 1.089949607849121, "reward_std": 0.7014666199684143, "rewards/accuracy_reward": 0.18437499180436134, "rewards/cosine_rewards": -0.04696316970512271, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0005872593028470874, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 295.15625, "epoch": 0.11309220580743759, "grad_norm": 9.243544591708364, "kl": 0.099609375, "learning_rate": 9.434538970962812e-07, "loss": 0.004, "reward": 1.25474151968956, "reward_std": 0.42495501041412354, "rewards/accuracy_reward": 0.2968750074505806, "rewards/cosine_rewards": -0.010176160372793674, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0007073541928548366, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 327.9375, "epoch": 0.11360163015792155, "grad_norm": 11.854145682727308, "kl": 0.09423828125, "learning_rate": 9.431991849210392e-07, "loss": 0.0038, "reward": 1.336020827293396, "reward_std": 0.5929334163665771, "rewards/accuracy_reward": 0.3812499977648258, "rewards/cosine_rewards": 0.0018316814675927162, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0001858295945567079, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 224.109375, "epoch": 0.1141110545084055, "grad_norm": 9.830726171785136, "kl": 0.13330078125, "learning_rate": 9.429444727457972e-07, "loss": 0.0053, "reward": 0.9984832406044006, "reward_std": 0.45606285333633423, "rewards/accuracy_reward": 0.043749988079071045, "rewards/cosine_rewards": -0.014016739558428526, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 213.640625, "epoch": 0.11462047885888946, "grad_norm": 9.176338530222994, "kl": 0.115234375, "learning_rate": 9.426897605705553e-07, "loss": 0.0046, "reward": 1.2019062638282776, "reward_std": 0.7189642786979675, "rewards/accuracy_reward": 0.29374999925494194, "rewards/cosine_rewards": -0.01371871994342655, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 199.71875, "epoch": 0.11512990320937341, "grad_norm": 11.349158424596924, "kl": 0.110107421875, "learning_rate": 9.424350483953133e-07, "loss": 0.0044, "reward": 1.3144216537475586, "reward_std": 0.4914311468601227, "rewards/accuracy_reward": 0.32499998807907104, "rewards/cosine_rewards": 0.005097148037748411, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -5.056634472566657e-05, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 232.390625, "epoch": 0.11563932755985736, "grad_norm": 8.61981591225416, "kl": 0.105712890625, "learning_rate": 9.421803362200713e-07, "loss": 0.0042, "reward": 1.064522534608841, "reward_std": 0.3509945422410965, "rewards/accuracy_reward": 0.15312501788139343, "rewards/cosine_rewards": -0.010458544362336397, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -1.8939394067274407e-05, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 197.75, "epoch": 0.11614875191034131, "grad_norm": 12.524545820572607, "kl": 0.106689453125, "learning_rate": 9.419256240448294e-07, "loss": 0.0043, "reward": 1.3913479149341583, "reward_std": 0.2862061709165573, "rewards/accuracy_reward": 0.40937498956918716, "rewards/cosine_rewards": -0.0021625147201120853, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00023957982193678617, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 193.5, "epoch": 0.11665817626082527, "grad_norm": 17.83927533701907, "kl": 0.13232421875, "learning_rate": 9.416709118695874e-07, "loss": 0.0053, "reward": 1.5334136486053467, "reward_std": 0.45667168498039246, "rewards/accuracy_reward": 0.6062500029802322, "rewards/cosine_rewards": 0.005288586835376918, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 247.671875, "epoch": 0.11716760061130922, "grad_norm": 23.81656431934888, "kl": 0.108642578125, "learning_rate": 9.414161996943454e-07, "loss": 0.0043, "reward": 1.1264008283615112, "reward_std": 0.6892756521701813, "rewards/accuracy_reward": 0.21249999105930328, "rewards/cosine_rewards": -0.02350334101356566, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -9.586199303157628e-05, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 262.59375, "epoch": 0.11767702496179318, "grad_norm": 28.467466655884312, "kl": 0.12109375, "learning_rate": 9.411614875191034e-07, "loss": 0.0048, "reward": 1.4944193363189697, "reward_std": 0.3786798119544983, "rewards/accuracy_reward": 0.518750011920929, "rewards/cosine_rewards": 0.0069862306118011475, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -6.69164874125272e-05, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 248.40625, "epoch": 0.11818644931227713, "grad_norm": 9.248494107592876, "kl": 0.12060546875, "learning_rate": 9.409067753438615e-07, "loss": 0.0048, "reward": 1.2941021919250488, "reward_std": 0.5259552597999573, "rewards/accuracy_reward": 0.3749999776482582, "rewards/cosine_rewards": 0.012875130865722895, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -2.2944199372432195e-05, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 264.15625, "epoch": 0.11869587366276108, "grad_norm": 14.194855529249866, "kl": 0.107666015625, "learning_rate": 9.406520631686195e-07, "loss": 0.0043, "reward": 1.4532509446144104, "reward_std": 0.47306837141513824, "rewards/accuracy_reward": 0.46562501788139343, "rewards/cosine_rewards": 0.003419560845941305, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00016866176156327128, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 376.875, "epoch": 0.11920529801324503, "grad_norm": 24.881210594835412, "kl": 0.0986328125, "learning_rate": 9.403973509933774e-07, "loss": 0.0039, "reward": 0.9971878528594971, "reward_std": 0.8903799057006836, "rewards/accuracy_reward": 0.09375, "rewards/cosine_rewards": -0.018273995257914066, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.00016315293032675982, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 432.28125, "epoch": 0.11971472236372899, "grad_norm": 4.4996951306965975, "kl": 0.08544921875, "learning_rate": 9.401426388181355e-07, "loss": 0.0034, "reward": 1.3378186225891113, "reward_std": 0.8512288331985474, "rewards/accuracy_reward": 0.4593750089406967, "rewards/cosine_rewards": -0.027322867885231972, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.000483501615235582, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 465.765625, "epoch": 0.12022414671421294, "grad_norm": 5.402908709299499, "kl": 0.080322265625, "learning_rate": 9.398879266428935e-07, "loss": 0.0032, "reward": 1.4937435388565063, "reward_std": 0.35082364082336426, "rewards/accuracy_reward": 0.5499999821186066, "rewards/cosine_rewards": -0.024816589895635843, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0001899001763376873, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 431.796875, "epoch": 0.1207335710646969, "grad_norm": 11.178072528468537, "kl": 0.0947265625, "learning_rate": 9.396332144676515e-07, "loss": 0.0038, "reward": 1.1548867225646973, "reward_std": 0.8218154907226562, "rewards/accuracy_reward": 0.23749998956918716, "rewards/cosine_rewards": -0.0042152018286287785, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0002730985652306117, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 506.640625, "epoch": 0.12124299541518084, "grad_norm": 3.9623481852584983, "kl": 0.078857421875, "learning_rate": 9.393785022924095e-07, "loss": 0.0032, "reward": 1.2667301297187805, "reward_std": 0.8781076371669769, "rewards/accuracy_reward": 0.40937498211860657, "rewards/cosine_rewards": -0.04850983805954456, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0003849874483421445, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 443.515625, "epoch": 0.1217524197656648, "grad_norm": 5.742693044990549, "kl": 0.094482421875, "learning_rate": 9.391237901171676e-07, "loss": 0.0038, "reward": 0.6565631031990051, "reward_std": 0.7225559949874878, "rewards/accuracy_reward": -0.15312501043081284, "rewards/cosine_rewards": -0.08046763762831688, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.00046927113726269454, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 347.265625, "epoch": 0.12226184411614875, "grad_norm": 7.476762059683771, "kl": 0.08984375, "learning_rate": 9.388690779419256e-07, "loss": 0.0036, "reward": 1.2646641731262207, "reward_std": 0.35142165422439575, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -0.016350463964045048, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00023539320682175457, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 241.125, "epoch": 0.12277126846663271, "grad_norm": 7.077545719384053, "kl": 0.101318359375, "learning_rate": 9.386143657666836e-07, "loss": 0.0041, "reward": 0.9919856488704681, "reward_std": 0.6525652855634689, "rewards/accuracy_reward": 0.043749988079071045, "rewards/cosine_rewards": -0.020295456051826477, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.00021890102652832866, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 240.765625, "epoch": 0.12328069281711666, "grad_norm": 8.441107538365218, "kl": 0.1044921875, "learning_rate": 9.383596535914417e-07, "loss": 0.0042, "reward": 1.616421401500702, "reward_std": 0.3022947758436203, "rewards/accuracy_reward": 0.6593749970197678, "rewards/cosine_rewards": 0.004121019504964352, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.00019959894416388124, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 175.796875, "epoch": 0.1237901171676006, "grad_norm": 9.380134906229868, "kl": 0.11083984375, "learning_rate": 9.381049414161997e-07, "loss": 0.0044, "reward": 1.3176445960998535, "reward_std": 0.3997122645378113, "rewards/accuracy_reward": 0.32499998807907104, "rewards/cosine_rewards": -0.0073250585701316595, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -3.028100763913244e-05, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 220.53125, "epoch": 0.12429954151808456, "grad_norm": 5.4472083052212765, "kl": 0.109375, "learning_rate": 9.378502292409577e-07, "loss": 0.0044, "reward": 1.6422365307807922, "reward_std": 0.2728146519511938, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.01141381449997425, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.00042724609375, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 248.828125, "epoch": 0.12480896586856852, "grad_norm": 8.442731808091501, "kl": 0.1083984375, "learning_rate": 9.375955170657157e-07, "loss": 0.0043, "reward": 1.3157773613929749, "reward_std": 0.4320952445268631, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": 0.009643017314374447, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.00011561772407731041, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 222.6875, "epoch": 0.12531839021905247, "grad_norm": 14.294535330077375, "kl": 0.1171875, "learning_rate": 9.373408048904738e-07, "loss": 0.0047, "reward": 1.331631362438202, "reward_std": 0.4216170907020569, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": 0.00982090923935175, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -6.454958565882407e-05, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 219.984375, "epoch": 0.12582781456953643, "grad_norm": 9.55932148178992, "kl": 0.108642578125, "learning_rate": 9.370860927152318e-07, "loss": 0.0043, "reward": 1.340530276298523, "reward_std": 0.4534093588590622, "rewards/accuracy_reward": 0.3531250096857548, "rewards/cosine_rewards": 0.003091069171205163, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -6.0797665355494246e-05, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 325.234375, "epoch": 0.12633723892002038, "grad_norm": 17.80951371391722, "kl": 0.110595703125, "learning_rate": 9.368313805399897e-07, "loss": 0.0044, "reward": 1.2751246690750122, "reward_std": 0.520209550857544, "rewards/accuracy_reward": 0.2968749850988388, "rewards/cosine_rewards": 0.010451191570609808, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0009515111669315957, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 366.625, "epoch": 0.12684666327050434, "grad_norm": 12.086775377390172, "kl": 0.111083984375, "learning_rate": 9.365766683647478e-07, "loss": 0.0044, "reward": 0.8402246385812759, "reward_std": 0.6177513003349304, "rewards/accuracy_reward": -0.04062502086162567, "rewards/cosine_rewards": -0.04077841015532613, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0002469850951456465, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 449.125, "epoch": 0.1273560876209883, "grad_norm": 11.748922331365623, "kl": 0.0869140625, "learning_rate": 9.363219561895058e-07, "loss": 0.0035, "reward": 1.7429784536361694, "reward_std": 0.6669142842292786, "rewards/accuracy_reward": 0.746874988079071, "rewards/cosine_rewards": 0.02791230659931898, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.000558776329853572, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 523.34375, "epoch": 0.12786551197147222, "grad_norm": 5.635036318066103, "kl": 0.073974609375, "learning_rate": 9.360672440142638e-07, "loss": 0.003, "reward": 1.3756027221679688, "reward_std": 0.3430413454771042, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": -0.01460547186434269, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0004167625156696886, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 583.78125, "epoch": 0.12837493632195618, "grad_norm": 3.7721855945126013, "kl": 0.071533203125, "learning_rate": 9.358125318390219e-07, "loss": 0.0029, "reward": 1.1054343283176422, "reward_std": 0.9506143927574158, "rewards/accuracy_reward": 0.23749998211860657, "rewards/cosine_rewards": -0.06930245459079742, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.00026325164799345657, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 691.8125, "epoch": 0.12888436067244013, "grad_norm": 4.515905173052182, "kl": 0.06494140625, "learning_rate": 9.355578196637799e-07, "loss": 0.0026, "reward": 1.1395662426948547, "reward_std": 1.164560616016388, "rewards/accuracy_reward": 0.24062499403953552, "rewards/cosine_rewards": -0.06932513415813446, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0004836731095565483, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 750.703125, "epoch": 0.1293937850229241, "grad_norm": 3.759822074251192, "kl": 0.0609130859375, "learning_rate": 9.353031074885379e-07, "loss": 0.0024, "reward": 1.3212904930114746, "reward_std": 0.9738726019859314, "rewards/accuracy_reward": 0.4625000059604645, "rewards/cosine_rewards": -0.03070250153541565, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0011320026533212513, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 604.40625, "epoch": 0.12990320937340805, "grad_norm": 4.697641184697458, "kl": 0.082763671875, "learning_rate": 9.350483953132959e-07, "loss": 0.0033, "reward": 1.1115484535694122, "reward_std": 0.7478219866752625, "rewards/accuracy_reward": 0.23125000298023224, "rewards/cosine_rewards": -0.05675292294472456, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.00044860908383270726, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 635.84375, "epoch": 0.130412633723892, "grad_norm": 4.028619362240205, "kl": 0.09765625, "learning_rate": 9.34793683138054e-07, "loss": 0.0039, "reward": 1.457118034362793, "reward_std": 0.788001298904419, "rewards/accuracy_reward": 0.5218749791383743, "rewards/cosine_rewards": -0.0015127966180443764, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0007440973713528365, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 589.40625, "epoch": 0.13092205807437596, "grad_norm": 5.63417318829876, "kl": 0.07275390625, "learning_rate": 9.34538970962812e-07, "loss": 0.0029, "reward": 1.4154019951820374, "reward_std": 0.815990686416626, "rewards/accuracy_reward": 0.518750011920929, "rewards/cosine_rewards": -0.02476619742810726, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.00045683811185881495, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 676.984375, "epoch": 0.1314314824248599, "grad_norm": 6.021417699991294, "kl": 0.065673828125, "learning_rate": 9.3428425878757e-07, "loss": 0.0026, "reward": 0.6927553117275238, "reward_std": 0.8777336776256561, "rewards/accuracy_reward": -0.0781250074505806, "rewards/cosine_rewards": -0.15041033178567886, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0005843567778356373, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 555.203125, "epoch": 0.13194090677534387, "grad_norm": 8.306056976533926, "kl": 0.082763671875, "learning_rate": 9.340295466123281e-07, "loss": 0.0033, "reward": 1.2112269699573517, "reward_std": 0.9674933552742004, "rewards/accuracy_reward": 0.43437499552965164, "rewards/cosine_rewards": -0.08218972198665142, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0003333477216074243, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 694.484375, "epoch": 0.13245033112582782, "grad_norm": 6.232070420425315, "kl": 0.06689453125, "learning_rate": 9.337748344370861e-07, "loss": 0.0027, "reward": 1.0117461681365967, "reward_std": 0.7802118062973022, "rewards/accuracy_reward": 0.21249999478459358, "rewards/cosine_rewards": -0.09079772233963013, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0005810301227029413, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 543.5625, "epoch": 0.13295975547631178, "grad_norm": 6.334798789966088, "kl": 0.08544921875, "learning_rate": 9.335201222618441e-07, "loss": 0.0034, "reward": 1.0542153716087341, "reward_std": 0.8291297852993011, "rewards/accuracy_reward": 0.18124999105930328, "rewards/cosine_rewards": -0.032575659453868866, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0007089868013281375, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 356.015625, "epoch": 0.1334691798267957, "grad_norm": 10.460916199726386, "kl": 0.098388671875, "learning_rate": 9.33265410086602e-07, "loss": 0.0039, "reward": 0.6651052087545395, "reward_std": 0.9073293209075928, "rewards/accuracy_reward": -0.09687501192092896, "rewards/cosine_rewards": -0.03463773522526026, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.000257108491496183, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 345.109375, "epoch": 0.13397860417727966, "grad_norm": 105.36035227056938, "kl": 0.10546875, "learning_rate": 9.330106979113601e-07, "loss": 0.0042, "reward": 1.6959076523780823, "reward_std": 0.6433850526809692, "rewards/accuracy_reward": 0.7374999523162842, "rewards/cosine_rewards": 0.036790573969483376, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0002579164138296619, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 225.671875, "epoch": 0.13448802852776362, "grad_norm": 12.269079038415155, "kl": 0.1044921875, "learning_rate": 9.327559857361181e-07, "loss": 0.0042, "reward": 1.3304521441459656, "reward_std": 0.7337057292461395, "rewards/accuracy_reward": 0.40312500298023224, "rewards/cosine_rewards": -0.009734044317156076, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0004387954395497218, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 191.421875, "epoch": 0.13499745287824758, "grad_norm": 5.0495702848173805, "kl": 0.12451171875, "learning_rate": 9.325012735608761e-07, "loss": 0.005, "reward": 1.5114508867263794, "reward_std": 0.4991532266139984, "rewards/accuracy_reward": 0.6031249910593033, "rewards/cosine_rewards": 0.0021946561755612493, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0001188212918350473, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 212.890625, "epoch": 0.13550687722873153, "grad_norm": 8.380147213080475, "kl": 0.11376953125, "learning_rate": 9.322465613856342e-07, "loss": 0.0046, "reward": 1.3185867071151733, "reward_std": 0.5081266015768051, "rewards/accuracy_reward": 0.37812499701976776, "rewards/cosine_rewards": 0.0029779861215502024, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -1.6225338185904548e-05, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 163.4375, "epoch": 0.1360163015792155, "grad_norm": 6.7670554711392725, "kl": 0.1259765625, "learning_rate": 9.319918492103922e-07, "loss": 0.005, "reward": 1.917210876941681, "reward_std": 0.2323581874370575, "rewards/accuracy_reward": 0.96875, "rewards/cosine_rewards": 0.011114767286926508, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.00015385003644041717, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 156.0, "epoch": 0.13652572592969944, "grad_norm": 7.285618988190272, "kl": 0.119873046875, "learning_rate": 9.317371370351502e-07, "loss": 0.0048, "reward": 1.2626032829284668, "reward_std": 0.6921159029006958, "rewards/accuracy_reward": 0.34687499701976776, "rewards/cosine_rewards": -0.006146675441414118, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": 0.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 152.953125, "epoch": 0.1370351502801834, "grad_norm": 10.577916333197056, "kl": 0.140625, "learning_rate": 9.314824248599083e-07, "loss": 0.0056, "reward": 1.2036974430084229, "reward_std": 0.5991593599319458, "rewards/accuracy_reward": 0.2968749850988388, "rewards/cosine_rewards": 0.0007108037825673819, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0001382743357680738, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 158.34375, "epoch": 0.13754457463066735, "grad_norm": 14.293416719573916, "kl": 0.1201171875, "learning_rate": 9.312277126846663e-07, "loss": 0.0048, "reward": 1.2185573279857635, "reward_std": 0.43015679717063904, "rewards/accuracy_reward": 0.24062500149011612, "rewards/cosine_rewards": -0.006263321032747626, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00017934850984602235, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 166.46875, "epoch": 0.1380539989811513, "grad_norm": 9.393354579467495, "kl": 0.1240234375, "learning_rate": 9.309730005094243e-07, "loss": 0.005, "reward": 1.5472444295883179, "reward_std": 0.5279964953660965, "rewards/accuracy_reward": 0.606249988079071, "rewards/cosine_rewards": 0.003494387026876211, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 149.609375, "epoch": 0.13856342333163527, "grad_norm": 7.200843784037537, "kl": 0.117431640625, "learning_rate": 9.307182883341823e-07, "loss": 0.0047, "reward": 1.3769221901893616, "reward_std": 0.4806235730648041, "rewards/accuracy_reward": 0.40937501937150955, "rewards/cosine_rewards": -0.0011419787188060582, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -6.0797665355494246e-05, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 143.828125, "epoch": 0.1390728476821192, "grad_norm": 11.003082967675637, "kl": 0.18359375, "learning_rate": 9.304635761589404e-07, "loss": 0.0073, "reward": 1.3800683617591858, "reward_std": 0.4095611423254013, "rewards/accuracy_reward": 0.40937498211860657, "rewards/cosine_rewards": 0.0019433526322245598, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": 0.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 144.453125, "epoch": 0.13958227203260315, "grad_norm": 6.9562429559567285, "kl": 0.130859375, "learning_rate": 9.302088639836984e-07, "loss": 0.0052, "reward": 1.424567699432373, "reward_std": 0.2551300157792866, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": 0.0028896235453430563, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0001969077275134623, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 149.296875, "epoch": 0.1400916963830871, "grad_norm": 9.330566123712217, "kl": 0.1240234375, "learning_rate": 9.299541518084564e-07, "loss": 0.005, "reward": 1.2650930285453796, "reward_std": 0.42955365777015686, "rewards/accuracy_reward": 0.32499999552965164, "rewards/cosine_rewards": 0.0025930306874215603, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 207.75, "epoch": 0.14060112073357106, "grad_norm": 8.495983071057866, "kl": 0.11962890625, "learning_rate": 9.296994396332144e-07, "loss": 0.0048, "reward": 1.8627826571464539, "reward_std": 0.2839447557926178, "rewards/accuracy_reward": 0.859375, "rewards/cosine_rewards": 0.019193909130990505, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0001612851265235804, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 162.515625, "epoch": 0.14111054508405502, "grad_norm": 12.855580556594866, "kl": 0.14306640625, "learning_rate": 9.294447274579724e-07, "loss": 0.0057, "reward": 1.5162805318832397, "reward_std": 0.6588033437728882, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": -0.008702149149030447, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -1.7208149074576795e-05, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 281.484375, "epoch": 0.14161996943453897, "grad_norm": 10.357622467045976, "kl": 0.102783203125, "learning_rate": 9.291900152827304e-07, "loss": 0.0041, "reward": 1.1203789710998535, "reward_std": 0.6814777851104736, "rewards/accuracy_reward": 0.17812500894069672, "rewards/cosine_rewards": -0.010577938985079527, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0002930604387074709, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 235.390625, "epoch": 0.14212939378502293, "grad_norm": 23.483530147678458, "kl": 0.114013671875, "learning_rate": 9.289353031074884e-07, "loss": 0.0046, "reward": 1.3522316813468933, "reward_std": 0.28182537853717804, "rewards/accuracy_reward": 0.3812500238418579, "rewards/cosine_rewards": 0.0023158364929258823, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -8.418447396252304e-05, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 271.96875, "epoch": 0.14263881813550688, "grad_norm": 5.804128123317947, "kl": 0.109619140625, "learning_rate": 9.286805909322465e-07, "loss": 0.0044, "reward": 1.2828457355499268, "reward_std": 0.5574119389057159, "rewards/accuracy_reward": 0.3500000163912773, "rewards/cosine_rewards": -0.004654169548302889, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": 0.0, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 346.984375, "epoch": 0.14314824248599084, "grad_norm": 9.062411976412948, "kl": 0.09130859375, "learning_rate": 9.284258787570045e-07, "loss": 0.0037, "reward": 1.9385767579078674, "reward_std": 0.3151838555932045, "rewards/accuracy_reward": 0.9437500238418579, "rewards/cosine_rewards": 0.04203657992184162, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0003348248792462982, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 395.875, "epoch": 0.1436576668364748, "grad_norm": 8.221976106115973, "kl": 0.104248046875, "learning_rate": 9.281711665817625e-07, "loss": 0.0042, "reward": 1.323907494544983, "reward_std": 0.6098371148109436, "rewards/accuracy_reward": 0.40312501788139343, "rewards/cosine_rewards": 0.014727211673744023, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.00019470852021186147, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 603.21875, "epoch": 0.14416709118695872, "grad_norm": 7.812929807725803, "kl": 0.084228515625, "learning_rate": 9.279164544065206e-07, "loss": 0.0034, "reward": 1.3660696744918823, "reward_std": 0.6207956671714783, "rewards/accuracy_reward": 0.46562500298023224, "rewards/cosine_rewards": -0.005538210505619645, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0002670584217412397, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 525.6875, "epoch": 0.14467651553744268, "grad_norm": 8.491615227525342, "kl": 0.08056640625, "learning_rate": 9.276617422312786e-07, "loss": 0.0032, "reward": 1.3353699743747711, "reward_std": 0.5946642160415649, "rewards/accuracy_reward": 0.40937500447034836, "rewards/cosine_rewards": -0.02690817415714264, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0002218634108430706, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 599.984375, "epoch": 0.14518593988792663, "grad_norm": 16.68462964732329, "kl": 0.077880859375, "learning_rate": 9.274070300560366e-07, "loss": 0.0031, "reward": 0.9605185687541962, "reward_std": 0.7793702185153961, "rewards/accuracy_reward": 0.09999999031424522, "rewards/cosine_rewards": -0.06100003980100155, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0003564156068023294, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 649.3125, "epoch": 0.1456953642384106, "grad_norm": 10.597989534798653, "kl": 0.068115234375, "learning_rate": 9.271523178807946e-07, "loss": 0.0027, "reward": 1.1927469968795776, "reward_std": 1.0099957585334778, "rewards/accuracy_reward": 0.34999997913837433, "rewards/cosine_rewards": -0.04736426845192909, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0005137407861184329, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 621.234375, "epoch": 0.14620478858889455, "grad_norm": 5.399955402557674, "kl": 0.072265625, "learning_rate": 9.268976057055527e-07, "loss": 0.0029, "reward": 0.821646511554718, "reward_std": 0.9464232325553894, "rewards/accuracy_reward": 0.03749999776482582, "rewards/cosine_rewards": -0.10573448240756989, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0007440397967002355, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 646.796875, "epoch": 0.1467142129393785, "grad_norm": 5.9108976297695355, "kl": 0.075439453125, "learning_rate": 9.266428935303107e-07, "loss": 0.003, "reward": 1.8053097128868103, "reward_std": 0.5278272330760956, "rewards/accuracy_reward": 0.7749999761581421, "rewards/cosine_rewards": 0.061956772580742836, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0003969733224948868, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 628.484375, "epoch": 0.14722363728986246, "grad_norm": 4.280094122642851, "kl": 0.0692138671875, "learning_rate": 9.263881813550687e-07, "loss": 0.0028, "reward": 0.7580513060092926, "reward_std": 0.9215057492256165, "rewards/accuracy_reward": -0.04062502086162567, "rewards/cosine_rewards": -0.1223737820982933, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0008249446109402925, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 697.03125, "epoch": 0.1477330616403464, "grad_norm": 4.704795726585343, "kl": 0.068359375, "learning_rate": 9.261334691798267e-07, "loss": 0.0027, "reward": 1.0915009379386902, "reward_std": 0.6004486382007599, "rewards/accuracy_reward": 0.21249999105930328, "rewards/cosine_rewards": -0.05759305879473686, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0009060115553438663, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 739.375, "epoch": 0.14824248599083037, "grad_norm": 5.468968593755717, "kl": 0.065185546875, "learning_rate": 9.258787570045847e-07, "loss": 0.0026, "reward": 1.328648567199707, "reward_std": 0.8502229452133179, "rewards/accuracy_reward": 0.40312500298023224, "rewards/cosine_rewards": -0.027191368862986565, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.00041003923979587853, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 800.390625, "epoch": 0.14875191034131433, "grad_norm": 2.644913201802289, "kl": 0.07861328125, "learning_rate": 9.256240448293427e-07, "loss": 0.0031, "reward": 1.5775163769721985, "reward_std": 0.6978716552257538, "rewards/accuracy_reward": 0.6562500149011612, "rewards/cosine_rewards": 0.031187113374471664, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0005458263913169503, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 969.328125, "epoch": 0.14926133469179828, "grad_norm": 3.858694298808609, "kl": 0.0548095703125, "learning_rate": 9.253693326541008e-07, "loss": 0.0022, "reward": 0.39561687409877777, "reward_std": 1.1356619894504547, "rewards/accuracy_reward": -0.1625000238418579, "rewards/cosine_rewards": -0.23805859684944153, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.0006995665607973933, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 1044.703125, "epoch": 0.1497707590422822, "grad_norm": 2.0660275837501643, "kl": 0.0902099609375, "learning_rate": 9.251146204788588e-07, "loss": 0.0036, "reward": 1.0626700818538666, "reward_std": 1.1662874221801758, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": -0.055325835943222046, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": -0.0007540385995525867, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 934.28125, "epoch": 0.15028018339276616, "grad_norm": 7.548230617974183, "kl": 0.0538330078125, "learning_rate": 9.248599083036168e-07, "loss": 0.0022, "reward": 1.2535955309867859, "reward_std": 1.0525287985801697, "rewards/accuracy_reward": 0.3750000223517418, "rewards/cosine_rewards": -0.04287016252055764, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.00040922046173363924, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 790.09375, "epoch": 0.15078960774325012, "grad_norm": 3.7401563979919654, "kl": 0.0584716796875, "learning_rate": 9.246051961283748e-07, "loss": 0.0023, "reward": 1.1489249467849731, "reward_std": 0.5376773178577423, "rewards/accuracy_reward": 0.2937499899417162, "rewards/cosine_rewards": -0.08189126010984182, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0004337812424637377, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 836.375, "epoch": 0.15129903209373408, "grad_norm": 2.8153067499507105, "kl": 0.0618896484375, "learning_rate": 9.243504839531329e-07, "loss": 0.0025, "reward": 1.3525272011756897, "reward_std": 0.8126451969146729, "rewards/accuracy_reward": 0.4906250238418579, "rewards/cosine_rewards": -0.012518584728240967, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.000579186889808625, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 890.625, "epoch": 0.15180845644421803, "grad_norm": 5.316949650260697, "kl": 0.0552978515625, "learning_rate": 9.240957717778909e-07, "loss": 0.0022, "reward": 1.2640092372894287, "reward_std": 0.8870376944541931, "rewards/accuracy_reward": 0.4062499850988388, "rewards/cosine_rewards": -0.0009442958980798721, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0006714609917253256, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 813.953125, "epoch": 0.152317880794702, "grad_norm": 3.8825478721674953, "kl": 0.0574951171875, "learning_rate": 9.23841059602649e-07, "loss": 0.0023, "reward": 1.2717376947402954, "reward_std": 0.830648809671402, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": -0.03994514420628548, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0008170758956111968, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 752.140625, "epoch": 0.15282730514518594, "grad_norm": 5.06920521582769, "kl": 0.059814453125, "learning_rate": 9.235863474274071e-07, "loss": 0.0024, "reward": 1.1217154264450073, "reward_std": 0.8524642586708069, "rewards/accuracy_reward": 0.24062498286366463, "rewards/cosine_rewards": -0.04011305421590805, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0006714656192343682, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 684.625, "epoch": 0.1533367294956699, "grad_norm": 8.555507127767159, "kl": 0.0672607421875, "learning_rate": 9.233316352521651e-07, "loss": 0.0027, "reward": 1.1471417546272278, "reward_std": 0.7909112870693207, "rewards/accuracy_reward": 0.2656249962747097, "rewards/cosine_rewards": -0.039835451170802116, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0005227623041719198, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 667.78125, "epoch": 0.15384615384615385, "grad_norm": 2.9040824114504877, "kl": 0.064697265625, "learning_rate": 9.230769230769231e-07, "loss": 0.0026, "reward": 0.9261243343353271, "reward_std": 0.668161928653717, "rewards/accuracy_reward": 0.1281249988824129, "rewards/cosine_rewards": -0.06077958270907402, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0005960852140560746, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 702.484375, "epoch": 0.1543555781966378, "grad_norm": 4.4461209381275655, "kl": 0.06298828125, "learning_rate": 9.228222109016812e-07, "loss": 0.0025, "reward": 1.506935715675354, "reward_std": 0.6653757691383362, "rewards/accuracy_reward": 0.5468749850988388, "rewards/cosine_rewards": 0.03871871158480644, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0005329845298547298, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 633.140625, "epoch": 0.15486500254712177, "grad_norm": 3.9254091150548933, "kl": 0.069091796875, "learning_rate": 9.225674987264391e-07, "loss": 0.0028, "reward": 1.3886016011238098, "reward_std": 0.9017740190029144, "rewards/accuracy_reward": 0.5749999731779099, "rewards/cosine_rewards": -0.02933959849178791, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.000808820070233196, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 648.625, "epoch": 0.1553744268976057, "grad_norm": 6.070022774209878, "kl": 0.068115234375, "learning_rate": 9.223127865511971e-07, "loss": 0.0027, "reward": 1.6925800442695618, "reward_std": 0.6231902837753296, "rewards/accuracy_reward": 0.6625000238418579, "rewards/cosine_rewards": 0.06164960749447346, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0003195497556589544, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 614.15625, "epoch": 0.15588385124808965, "grad_norm": 11.091865062468658, "kl": 0.317138671875, "learning_rate": 9.220580743759551e-07, "loss": 0.0127, "reward": 1.5423057079315186, "reward_std": 0.3847469687461853, "rewards/accuracy_reward": 0.5468749962747097, "rewards/cosine_rewards": 0.05880427733063698, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0008736126183066517, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 612.578125, "epoch": 0.1563932755985736, "grad_norm": 3.103459103182676, "kl": 0.0673828125, "learning_rate": 9.218033622007132e-07, "loss": 0.0027, "reward": 1.6744784712791443, "reward_std": 0.659433513879776, "rewards/accuracy_reward": 0.6875, "rewards/cosine_rewards": 0.06587037723511457, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0007668640464544296, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 657.265625, "epoch": 0.15690269994905756, "grad_norm": 4.50781660421839, "kl": 0.068115234375, "learning_rate": 9.215486500254712e-07, "loss": 0.0027, "reward": 1.145881563425064, "reward_std": 1.0458006858825684, "rewards/accuracy_reward": 0.34062499180436134, "rewards/cosine_rewards": -0.03727734461426735, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0012161528575234115, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 771.5625, "epoch": 0.15741212429954152, "grad_norm": 6.920330329994176, "kl": 0.064208984375, "learning_rate": 9.212939378502292e-07, "loss": 0.0026, "reward": 0.8615269958972931, "reward_std": 0.9165626764297485, "rewards/accuracy_reward": 0.140625, "rewards/cosine_rewards": -0.07544910162687302, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.0005239159800112247, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 758.96875, "epoch": 0.15792154865002547, "grad_norm": 11.706103376756111, "kl": 0.056396484375, "learning_rate": 9.210392256749873e-07, "loss": 0.0023, "reward": 1.567901074886322, "reward_std": 1.1157508492469788, "rewards/accuracy_reward": 0.6437499821186066, "rewards/cosine_rewards": 0.08178849518299103, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0013874000869691372, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 694.390625, "epoch": 0.15843097300050943, "grad_norm": 4.42128730127276, "kl": 0.062255859375, "learning_rate": 9.207845134997453e-07, "loss": 0.0025, "reward": 0.944963201880455, "reward_std": 0.9125352203845978, "rewards/accuracy_reward": 0.16249998658895493, "rewards/cosine_rewards": -0.04496639594435692, "rewards/format_reward": 0.828125, "rewards/repetition_rewards": -0.0006953877746127546, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 812.46875, "epoch": 0.15894039735099338, "grad_norm": 5.888648418898334, "kl": 0.0587158203125, "learning_rate": 9.205298013245033e-07, "loss": 0.0023, "reward": 0.6825668215751648, "reward_std": 1.0514086484909058, "rewards/accuracy_reward": 0.04999999701976776, "rewards/cosine_rewards": -0.13240730948746204, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": -0.0006508340884465724, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 703.703125, "epoch": 0.15944982170147734, "grad_norm": 5.201587660434957, "kl": 0.0626220703125, "learning_rate": 9.202750891492613e-07, "loss": 0.0025, "reward": 0.849999725818634, "reward_std": 1.2490254640579224, "rewards/accuracy_reward": 0.16249999590218067, "rewards/cosine_rewards": -0.04634671099483967, "rewards/format_reward": 0.734375, "rewards/repetition_rewards": -0.0005285786173772067, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 732.84375, "epoch": 0.1599592460519613, "grad_norm": 41.79369545195822, "kl": 0.0654296875, "learning_rate": 9.200203769740194e-07, "loss": 0.0026, "reward": 1.359117031097412, "reward_std": 1.1281075477600098, "rewards/accuracy_reward": 0.49687501788139343, "rewards/cosine_rewards": 0.06599474605172873, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.0006276974454522133, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 633.609375, "epoch": 0.16046867040244522, "grad_norm": 5.659472242303819, "kl": 0.090087890625, "learning_rate": 9.197656647987774e-07, "loss": 0.0036, "reward": 1.149334043264389, "reward_std": 1.1551178693771362, "rewards/accuracy_reward": 0.3593749962747097, "rewards/cosine_rewards": 0.025284748524427414, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": -0.0009507373906672001, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 704.09375, "epoch": 0.16097809475292918, "grad_norm": 5.702114455603425, "kl": 0.071044921875, "learning_rate": 9.195109526235354e-07, "loss": 0.0028, "reward": 1.3667227029800415, "reward_std": 0.6237545907497406, "rewards/accuracy_reward": 0.4031249713152647, "rewards/cosine_rewards": 0.01146969199180603, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0009969472303055227, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 615.984375, "epoch": 0.16148751910341314, "grad_norm": 7.41926766137556, "kl": 0.072998046875, "learning_rate": 9.192562404482935e-07, "loss": 0.0029, "reward": 1.2655977010726929, "reward_std": 0.7071200311183929, "rewards/accuracy_reward": 0.37187498807907104, "rewards/cosine_rewards": -0.0119027029722929, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.00062458252068609, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 655.65625, "epoch": 0.1619969434538971, "grad_norm": 6.476802877604566, "kl": 0.072265625, "learning_rate": 9.190015282730514e-07, "loss": 0.0029, "reward": 1.4926868677139282, "reward_std": 0.5651115030050278, "rewards/accuracy_reward": 0.4906250089406967, "rewards/cosine_rewards": 0.05030408315360546, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0013672530185431242, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 694.4375, "epoch": 0.16250636780438105, "grad_norm": 5.651262358287829, "kl": 0.078125, "learning_rate": 9.187468160978094e-07, "loss": 0.0031, "reward": 1.6467930674552917, "reward_std": 0.6130897700786591, "rewards/accuracy_reward": 0.6343749761581421, "rewards/cosine_rewards": 0.060116952285170555, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008239042945206165, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 627.1875, "epoch": 0.163015792154865, "grad_norm": 8.752414113774137, "kl": 0.087890625, "learning_rate": 9.184921039225674e-07, "loss": 0.0035, "reward": 1.2824658155441284, "reward_std": 0.6804981231689453, "rewards/accuracy_reward": 0.4281250089406967, "rewards/cosine_rewards": -0.004211767576634884, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0008223777404054999, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 647.421875, "epoch": 0.16352521650534896, "grad_norm": 24.555397071153298, "kl": 0.10791015625, "learning_rate": 9.182373917473255e-07, "loss": 0.0043, "reward": 1.5703404545783997, "reward_std": 0.6466428339481354, "rewards/accuracy_reward": 0.5781249850988388, "rewards/cosine_rewards": 0.023975687101483345, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0005103159819555003, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 679.453125, "epoch": 0.1640346408558329, "grad_norm": 10.65571265954387, "kl": 0.0751953125, "learning_rate": 9.179826795720835e-07, "loss": 0.003, "reward": 1.6510714292526245, "reward_std": 1.0072646141052246, "rewards/accuracy_reward": 0.7062499523162842, "rewards/cosine_rewards": 0.0703657679259777, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0005443187110358849, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 815.84375, "epoch": 0.16454406520631687, "grad_norm": 3.0291641455139042, "kl": 0.0577392578125, "learning_rate": 9.177279673968415e-07, "loss": 0.0023, "reward": 0.7281904220581055, "reward_std": 0.7364227771759033, "rewards/accuracy_reward": -0.07187500596046448, "rewards/cosine_rewards": -0.15170371532440186, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0013558552600443363, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 644.84375, "epoch": 0.16505348955680083, "grad_norm": 4.950572350987556, "kl": 0.081787109375, "learning_rate": 9.174732552215996e-07, "loss": 0.0033, "reward": 1.5456467270851135, "reward_std": 0.3990190625190735, "rewards/accuracy_reward": 0.5750000178813934, "rewards/cosine_rewards": 0.03384638950228691, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.000699635551427491, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 692.96875, "epoch": 0.16556291390728478, "grad_norm": 5.615028773473959, "kl": 0.0675048828125, "learning_rate": 9.172185430463576e-07, "loss": 0.0027, "reward": 1.4643962979316711, "reward_std": 0.538501039147377, "rewards/accuracy_reward": 0.4906250089406967, "rewards/cosine_rewards": 0.021510865539312363, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.000864640751387924, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 748.71875, "epoch": 0.1660723382577687, "grad_norm": 12.207464085563803, "kl": 0.071533203125, "learning_rate": 9.169638308711156e-07, "loss": 0.0029, "reward": 1.1908642947673798, "reward_std": 0.8150831162929535, "rewards/accuracy_reward": 0.3156250100582838, "rewards/cosine_rewards": 0.0008599106222391129, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.000620643695583567, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 692.828125, "epoch": 0.16658176260825266, "grad_norm": 4.633110149052728, "kl": 0.067626953125, "learning_rate": 9.167091186958737e-07, "loss": 0.0027, "reward": 1.3975687623023987, "reward_std": 0.6602180898189545, "rewards/accuracy_reward": 0.40937499701976776, "rewards/cosine_rewards": 0.020020989701151848, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0005772198055638, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 886.234375, "epoch": 0.16709118695873662, "grad_norm": 11.480819900317456, "kl": 0.0567626953125, "learning_rate": 9.164544065206317e-07, "loss": 0.0023, "reward": 1.3322511315345764, "reward_std": 0.7808408439159393, "rewards/accuracy_reward": 0.3812499865889549, "rewards/cosine_rewards": -0.0007541030645370483, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0013698027469217777, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 869.515625, "epoch": 0.16760061130922058, "grad_norm": 7.997306620734284, "kl": 0.0577392578125, "learning_rate": 9.161996943453897e-07, "loss": 0.0023, "reward": 1.1871361136436462, "reward_std": 0.9155566692352295, "rewards/accuracy_reward": 0.3218750059604645, "rewards/cosine_rewards": -0.0396097619086504, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0013791794190183282, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 879.125, "epoch": 0.16811003565970453, "grad_norm": 3.8086916601970175, "kl": 0.05810546875, "learning_rate": 9.159449821701477e-07, "loss": 0.0023, "reward": 1.3816418051719666, "reward_std": 0.8457719385623932, "rewards/accuracy_reward": 0.43437500298023224, "rewards/cosine_rewards": 0.026831649709492922, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0014398820349015296, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1077.125, "epoch": 0.1686194600101885, "grad_norm": 3.3635095209387, "kl": 0.05029296875, "learning_rate": 9.156902699949058e-07, "loss": 0.002, "reward": 1.4233552813529968, "reward_std": 0.8923040926456451, "rewards/accuracy_reward": 0.5718750357627869, "rewards/cosine_rewards": 0.05618499033153057, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.0015796992811374366, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1025.625, "epoch": 0.16912888436067244, "grad_norm": 2.6795836472345886, "kl": 0.053955078125, "learning_rate": 9.154355578196637e-07, "loss": 0.0022, "reward": 1.5009884238243103, "reward_std": 0.7616147696971893, "rewards/accuracy_reward": 0.46562496945261955, "rewards/cosine_rewards": 0.08306753821671009, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008290903642773628, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 1130.390625, "epoch": 0.1696383087111564, "grad_norm": 2.8897374461041068, "kl": 0.05615234375, "learning_rate": 9.151808456444217e-07, "loss": 0.0022, "reward": 0.9568201899528503, "reward_std": 0.883324146270752, "rewards/accuracy_reward": 0.18437499552965164, "rewards/cosine_rewards": -0.14675537310540676, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0026744193164631724, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 1140.796875, "epoch": 0.17014773306164035, "grad_norm": 3.24780768226595, "kl": 0.053955078125, "learning_rate": 9.149261334691798e-07, "loss": 0.0022, "reward": 0.4763996750116348, "reward_std": 1.3050541877746582, "rewards/accuracy_reward": -0.07187501713633537, "rewards/cosine_rewards": -0.26269275695085526, "rewards/format_reward": 0.8125, "rewards/repetition_rewards": -0.0015325736021623015, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 1162.21875, "epoch": 0.1706571574121243, "grad_norm": 7.96370989509143, "kl": 0.0509033203125, "learning_rate": 9.146714212939378e-07, "loss": 0.002, "reward": 1.0168579816818237, "reward_std": 1.0622537732124329, "rewards/accuracy_reward": 0.23749998211860657, "rewards/cosine_rewards": -0.06289426982402802, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0014976929523982108, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 1223.59375, "epoch": 0.17116658176260827, "grad_norm": 5.774523253643062, "kl": 0.083251953125, "learning_rate": 9.144167091186958e-07, "loss": 0.0033, "reward": 0.9260146915912628, "reward_std": 1.3471828699111938, "rewards/accuracy_reward": 0.26249998807907104, "rewards/cosine_rewards": -0.11641103774309158, "rewards/format_reward": 0.78125, "rewards/repetition_rewards": -0.0013242715504020452, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 1069.484375, "epoch": 0.1716760061130922, "grad_norm": 7.732047491992381, "kl": 0.0555419921875, "learning_rate": 9.141619969434538e-07, "loss": 0.0022, "reward": 1.0389263331890106, "reward_std": 0.9250738620758057, "rewards/accuracy_reward": 0.20937499403953552, "rewards/cosine_rewards": -0.09034883230924606, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.001974849379621446, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 846.234375, "epoch": 0.17218543046357615, "grad_norm": 6.146520612031918, "kl": 0.06689453125, "learning_rate": 9.139072847682119e-07, "loss": 0.0027, "reward": 1.5287657380104065, "reward_std": 0.7281034886837006, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.055092147551476955, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0013264745939522982, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 878.59375, "epoch": 0.1726948548140601, "grad_norm": 5.859040533770109, "kl": 0.059814453125, "learning_rate": 9.136525725929699e-07, "loss": 0.0024, "reward": 1.309591829776764, "reward_std": 0.8282720148563385, "rewards/accuracy_reward": 0.3781249839812517, "rewards/cosine_rewards": 0.02607971802353859, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0008628710638731718, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 706.234375, "epoch": 0.17320427916454406, "grad_norm": 4.068095402441544, "kl": 0.066162109375, "learning_rate": 9.133978604177279e-07, "loss": 0.0026, "reward": 1.1101016998291016, "reward_std": 0.7019257247447968, "rewards/accuracy_reward": 0.20624998956918716, "rewards/cosine_rewards": -0.03270102944225073, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0009472573874518275, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 782.09375, "epoch": 0.17371370351502802, "grad_norm": 8.092723935778833, "kl": 0.07080078125, "learning_rate": 9.13143148242486e-07, "loss": 0.0028, "reward": 1.3624014258384705, "reward_std": 0.6876442432403564, "rewards/accuracy_reward": 0.40937499701976776, "rewards/cosine_rewards": 0.0012194328010082245, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0013180217938497663, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 725.109375, "epoch": 0.17422312786551197, "grad_norm": 8.756999335905311, "kl": 0.130126953125, "learning_rate": 9.12888436067244e-07, "loss": 0.0052, "reward": 1.1084296703338623, "reward_std": 1.0551597476005554, "rewards/accuracy_reward": 0.2343750037252903, "rewards/cosine_rewards": -0.062263866886496544, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0011814486351795495, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 664.5625, "epoch": 0.17473255221599593, "grad_norm": 4.670760280414413, "kl": 0.07275390625, "learning_rate": 9.12633723892002e-07, "loss": 0.0029, "reward": 1.379169523715973, "reward_std": 0.6884946823120117, "rewards/accuracy_reward": 0.40937499701976776, "rewards/cosine_rewards": -0.014050468802452087, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0005300141347106546, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 667.359375, "epoch": 0.17524197656647988, "grad_norm": 27.098795999332967, "kl": 0.08056640625, "learning_rate": 9.123790117167601e-07, "loss": 0.0032, "reward": 1.6130830645561218, "reward_std": 0.44565099477767944, "rewards/accuracy_reward": 0.5781249701976776, "rewards/cosine_rewards": 0.051279583014547825, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0006965193606447428, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 654.65625, "epoch": 0.17575140091696384, "grad_norm": 10.133659329005896, "kl": 0.075439453125, "learning_rate": 9.121242995415181e-07, "loss": 0.003, "reward": 1.6888669729232788, "reward_std": 0.506424754858017, "rewards/accuracy_reward": 0.690625011920929, "rewards/cosine_rewards": 0.06190674379467964, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0011647465871647, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 647.6875, "epoch": 0.1762608252674478, "grad_norm": 5.129606334927334, "kl": 0.07958984375, "learning_rate": 9.11869587366276e-07, "loss": 0.0032, "reward": 1.2593636512756348, "reward_std": 0.41098763048648834, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -0.005569446831941605, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0006919201114214957, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 696.46875, "epoch": 0.17677024961793172, "grad_norm": 11.616711482358948, "kl": 0.074462890625, "learning_rate": 9.11614875191034e-07, "loss": 0.003, "reward": 1.4469356536865234, "reward_std": 0.6090122163295746, "rewards/accuracy_reward": 0.46562500298023224, "rewards/cosine_rewards": -0.0018481542356312275, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.001216164615470916, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 660.0, "epoch": 0.17727967396841568, "grad_norm": 16.70982931235053, "kl": 0.092041015625, "learning_rate": 9.113601630157921e-07, "loss": 0.0037, "reward": 1.3860605359077454, "reward_std": 0.5821886360645294, "rewards/accuracy_reward": 0.40625, "rewards/cosine_rewards": 0.01206381805241108, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0010032225982286036, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 819.25, "epoch": 0.17778909831889964, "grad_norm": 10.312325223777075, "kl": 0.0694580078125, "learning_rate": 9.111054508405501e-07, "loss": 0.0028, "reward": 1.3597615957260132, "reward_std": 0.5677385032176971, "rewards/accuracy_reward": 0.4375000074505806, "rewards/cosine_rewards": 0.0019306838512420654, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0015441215364262462, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 816.125, "epoch": 0.1782985226693836, "grad_norm": 3.8656926374469416, "kl": 0.07080078125, "learning_rate": 9.108507386653081e-07, "loss": 0.0028, "reward": 1.1428874135017395, "reward_std": 0.40452495217323303, "rewards/accuracy_reward": 0.21249999105930328, "rewards/cosine_rewards": -0.05335182696580887, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0006357444362947717, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 839.09375, "epoch": 0.17880794701986755, "grad_norm": 10.545025641089767, "kl": 0.062744140625, "learning_rate": 9.105960264900662e-07, "loss": 0.0025, "reward": 1.440682828426361, "reward_std": 0.7122917473316193, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": 0.004494791850447655, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.001311894680839032, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 745.34375, "epoch": 0.1793173713703515, "grad_norm": 5.236449549563, "kl": 0.081787109375, "learning_rate": 9.103413143148242e-07, "loss": 0.0033, "reward": 1.7106852531433105, "reward_std": 0.4475601017475128, "rewards/accuracy_reward": 0.6875, "rewards/cosine_rewards": 0.07085046917200089, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0007901439967099577, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 782.65625, "epoch": 0.17982679572083546, "grad_norm": 4.40390803368756, "kl": 0.07568359375, "learning_rate": 9.100866021395822e-07, "loss": 0.003, "reward": 1.321226179599762, "reward_std": 0.5729265064001083, "rewards/accuracy_reward": 0.3812500238418579, "rewards/cosine_rewards": -0.04316529631614685, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.001233478484209627, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 911.4375, "epoch": 0.18033622007131941, "grad_norm": 4.585368026245459, "kl": 0.083740234375, "learning_rate": 9.098318899643402e-07, "loss": 0.0034, "reward": 1.2610972821712494, "reward_std": 0.5936008393764496, "rewards/accuracy_reward": 0.3812499828636646, "rewards/cosine_rewards": -0.02527322620153427, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0011294231517240405, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 961.59375, "epoch": 0.18084564442180337, "grad_norm": 7.408796362493787, "kl": 0.0693359375, "learning_rate": 9.095771777890983e-07, "loss": 0.0028, "reward": 1.2509925812482834, "reward_std": 0.5566798448562622, "rewards/accuracy_reward": 0.3499999940395355, "rewards/cosine_rewards": -0.034961797297000885, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.001545542269013822, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 875.421875, "epoch": 0.18135506877228733, "grad_norm": 6.46350391356739, "kl": 0.08251953125, "learning_rate": 9.093224656138563e-07, "loss": 0.0033, "reward": 1.1181039810180664, "reward_std": 0.674926146864891, "rewards/accuracy_reward": 0.23749998956918716, "rewards/cosine_rewards": -0.05567748658359051, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0012184783699922264, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 1032.265625, "epoch": 0.18186449312277128, "grad_norm": 6.831510020206218, "kl": 0.0609130859375, "learning_rate": 9.090677534386143e-07, "loss": 0.0024, "reward": 1.559360921382904, "reward_std": 0.6407117247581482, "rewards/accuracy_reward": 0.518750011920929, "rewards/cosine_rewards": 0.057725198566913605, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0014893330517224967, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 1343.859375, "epoch": 0.1823739174732552, "grad_norm": 4.523279934522272, "kl": 0.05419921875, "learning_rate": 9.088130412633724e-07, "loss": 0.0022, "reward": 1.3108936548233032, "reward_std": 1.3749122023582458, "rewards/accuracy_reward": 0.4843749850988388, "rewards/cosine_rewards": -0.015468426048755646, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0017629386857151985, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 1283.875, "epoch": 0.18288334182373917, "grad_norm": 2.9278358224956733, "kl": 0.046875, "learning_rate": 9.085583290881304e-07, "loss": 0.0019, "reward": 0.899000346660614, "reward_std": 1.2357721328735352, "rewards/accuracy_reward": 0.20000001043081284, "rewards/cosine_rewards": -0.12764177471399307, "rewards/format_reward": 0.828125, "rewards/repetition_rewards": -0.0014829274150542915, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 1351.765625, "epoch": 0.18339276617422312, "grad_norm": 5.747269025294299, "kl": 0.05029296875, "learning_rate": 9.083036169128883e-07, "loss": 0.002, "reward": 0.685440868139267, "reward_std": 1.0775729417800903, "rewards/accuracy_reward": 0.062499986961483955, "rewards/cosine_rewards": -0.23488027602434158, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0015538162551820278, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 1328.359375, "epoch": 0.18390219052470708, "grad_norm": 5.5666444239146, "kl": 0.046630859375, "learning_rate": 9.080489047376463e-07, "loss": 0.0019, "reward": 1.440912902355194, "reward_std": 1.3688839673995972, "rewards/accuracy_reward": 0.546875, "rewards/cosine_rewards": 0.05198000371456146, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0016921277856454253, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1222.359375, "epoch": 0.18441161487519103, "grad_norm": 3.8469994374841496, "kl": 0.063720703125, "learning_rate": 9.077941925624044e-07, "loss": 0.0025, "reward": 1.2567678689956665, "reward_std": 1.0096549689769745, "rewards/accuracy_reward": 0.3531249761581421, "rewards/cosine_rewards": -0.06352230161428452, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.001584898098371923, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 1071.90625, "epoch": 0.184921039225675, "grad_norm": 10.748025785151507, "kl": 0.080810546875, "learning_rate": 9.075394803871624e-07, "loss": 0.0032, "reward": 1.4773434400558472, "reward_std": 0.7611989676952362, "rewards/accuracy_reward": 0.518750011920929, "rewards/cosine_rewards": 0.022458821535110474, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0013653661007992923, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 1068.84375, "epoch": 0.18543046357615894, "grad_norm": 6.961531569684862, "kl": 0.0966796875, "learning_rate": 9.072847682119204e-07, "loss": 0.0039, "reward": 1.3342331051826477, "reward_std": 0.9612607657909393, "rewards/accuracy_reward": 0.4906249940395355, "rewards/cosine_rewards": -0.029875734820961952, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0015161921037361026, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1151.765625, "epoch": 0.1859398879266429, "grad_norm": 4.787495962824196, "kl": 0.0526123046875, "learning_rate": 9.070300560366785e-07, "loss": 0.0021, "reward": 0.44851796329021454, "reward_std": 0.6482652425765991, "rewards/accuracy_reward": -0.18125002831220627, "rewards/cosine_rewards": -0.3225611299276352, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0007959024223964661, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 1087.453125, "epoch": 0.18644931227712686, "grad_norm": 3.5882611622656704, "kl": 0.05517578125, "learning_rate": 9.067753438614365e-07, "loss": 0.0022, "reward": 1.0280417203903198, "reward_std": 0.8365518152713776, "rewards/accuracy_reward": 0.2093750163912773, "rewards/cosine_rewards": -0.10154062137007713, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0016676230588927865, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 921.3125, "epoch": 0.1869587366276108, "grad_norm": 10.03430669886217, "kl": 0.07080078125, "learning_rate": 9.065206316861945e-07, "loss": 0.0028, "reward": 1.1769609451293945, "reward_std": 0.880241334438324, "rewards/accuracy_reward": 0.2656249925494194, "rewards/cosine_rewards": -0.04036855325102806, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0014205531333573163, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 841.6875, "epoch": 0.18746816097809477, "grad_norm": 27.23508906311087, "kl": 0.07861328125, "learning_rate": 9.062659195109526e-07, "loss": 0.0031, "reward": 1.685433030128479, "reward_std": 0.49864277243614197, "rewards/accuracy_reward": 0.6875, "rewards/cosine_rewards": 0.1084844060242176, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0011764070368371904, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 737.03125, "epoch": 0.1879775853285787, "grad_norm": 18.41490109995637, "kl": 0.08740234375, "learning_rate": 9.060112073357106e-07, "loss": 0.0035, "reward": 1.3734083771705627, "reward_std": 0.4119359850883484, "rewards/accuracy_reward": 0.37812499701976776, "rewards/cosine_rewards": 0.01141296117566526, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.000504543146234937, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 707.796875, "epoch": 0.18848700967906265, "grad_norm": 35.58611363543668, "kl": 0.084716796875, "learning_rate": 9.057564951604686e-07, "loss": 0.0034, "reward": 1.6782256960868835, "reward_std": 0.5156250298023224, "rewards/accuracy_reward": 0.6343749761581421, "rewards/cosine_rewards": 0.07835755217820406, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0032568235765211284, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 647.015625, "epoch": 0.1889964340295466, "grad_norm": 7.982503076191807, "kl": 0.086669921875, "learning_rate": 9.055017829852266e-07, "loss": 0.0035, "reward": 1.760904848575592, "reward_std": 0.49070215225219727, "rewards/accuracy_reward": 0.690625011920929, "rewards/cosine_rewards": 0.0864610131829977, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0005561279249377549, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 716.359375, "epoch": 0.18950585838003056, "grad_norm": 10.914576943859945, "kl": 0.077880859375, "learning_rate": 9.052470708099847e-07, "loss": 0.0031, "reward": 1.9701185822486877, "reward_std": 0.40086938440799713, "rewards/accuracy_reward": 0.831250011920929, "rewards/cosine_rewards": 0.1397455483675003, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0008769762353040278, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 738.546875, "epoch": 0.19001528273051452, "grad_norm": 6.034059111318338, "kl": 0.08544921875, "learning_rate": 9.049923586347427e-07, "loss": 0.0034, "reward": 1.8058127164840698, "reward_std": 0.41330619156360626, "rewards/accuracy_reward": 0.7468750178813934, "rewards/cosine_rewards": 0.1067701168358326, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0009574841533321887, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 728.0, "epoch": 0.19052470708099847, "grad_norm": 8.532753416320839, "kl": 0.07861328125, "learning_rate": 9.047376464595006e-07, "loss": 0.0031, "reward": 1.0842646658420563, "reward_std": 0.44039003551006317, "rewards/accuracy_reward": 0.15312500298023224, "rewards/cosine_rewards": -0.052397772669792175, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0008375749748665839, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 804.21875, "epoch": 0.19103413143148243, "grad_norm": 7.4599789196405375, "kl": 0.078125, "learning_rate": 9.044829342842587e-07, "loss": 0.0031, "reward": 0.974018394947052, "reward_std": 0.5849625766277313, "rewards/accuracy_reward": 0.09999999403953552, "rewards/cosine_rewards": -0.10963174607604742, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0007248484616866335, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 767.84375, "epoch": 0.19154355578196638, "grad_norm": 7.073657321582574, "kl": 0.0703125, "learning_rate": 9.042282221090167e-07, "loss": 0.0028, "reward": 0.914261519908905, "reward_std": 0.7159627079963684, "rewards/accuracy_reward": 0.09999998658895493, "rewards/cosine_rewards": -0.13804471492767334, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008187246276065707, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 898.65625, "epoch": 0.19205298013245034, "grad_norm": 5.152812978669099, "kl": 0.060791015625, "learning_rate": 9.039735099337747e-07, "loss": 0.0024, "reward": 1.3070060014724731, "reward_std": 0.5369542390108109, "rewards/accuracy_reward": 0.3531250059604645, "rewards/cosine_rewards": 0.01811320334672928, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0017322039348073304, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 824.65625, "epoch": 0.1925624044829343, "grad_norm": 4.4264228290413055, "kl": 0.071044921875, "learning_rate": 9.037187977585327e-07, "loss": 0.0028, "reward": 1.9969289302825928, "reward_std": 0.36238182336091995, "rewards/accuracy_reward": 0.887499988079071, "rewards/cosine_rewards": 0.1572401076555252, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0009361990523757413, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 1041.234375, "epoch": 0.19307182883341822, "grad_norm": 2.966764644317296, "kl": 0.0531005859375, "learning_rate": 9.034640855832908e-07, "loss": 0.0021, "reward": 1.9203879237174988, "reward_std": 0.6297050192952156, "rewards/accuracy_reward": 0.831250011920929, "rewards/cosine_rewards": 0.1525670364499092, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0009290309972129762, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 1066.578125, "epoch": 0.19358125318390218, "grad_norm": 6.45607299399273, "kl": 0.0604248046875, "learning_rate": 9.032093734080488e-07, "loss": 0.0024, "reward": 1.5978580713272095, "reward_std": 0.7550583779811859, "rewards/accuracy_reward": 0.546875, "rewards/cosine_rewards": 0.0831909030675888, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0009578557801432908, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 1159.84375, "epoch": 0.19409067753438614, "grad_norm": 15.218618980246333, "kl": 0.0557861328125, "learning_rate": 9.029546612328068e-07, "loss": 0.0022, "reward": 1.495898723602295, "reward_std": 0.8071758449077606, "rewards/accuracy_reward": 0.5187499970197678, "rewards/cosine_rewards": 0.04102367162704468, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0013749129138886929, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 1371.671875, "epoch": 0.1946001018848701, "grad_norm": 2.5671889777891415, "kl": 0.0416259765625, "learning_rate": 9.026999490575649e-07, "loss": 0.0017, "reward": 1.4654145240783691, "reward_std": 0.9137448668479919, "rewards/accuracy_reward": 0.5468749925494194, "rewards/cosine_rewards": 0.029642254114151, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0017276888247579336, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 1303.015625, "epoch": 0.19510952623535405, "grad_norm": 3.460987727073742, "kl": 0.0421142578125, "learning_rate": 9.024452368823229e-07, "loss": 0.0017, "reward": 1.348323106765747, "reward_std": 0.42551596462726593, "rewards/accuracy_reward": 0.40937499701976776, "rewards/cosine_rewards": 0.0025482475757598877, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0011001455131918192, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 1507.828125, "epoch": 0.195618950585838, "grad_norm": 2.4731324069110165, "kl": 0.040771484375, "learning_rate": 9.021905247070809e-07, "loss": 0.0016, "reward": 1.2840899229049683, "reward_std": 1.3389369249343872, "rewards/accuracy_reward": 0.43437498807907104, "rewards/cosine_rewards": 0.007216873578727245, "rewards/format_reward": 0.84375, "rewards/repetition_rewards": -0.0012519625015556812, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 1392.5625, "epoch": 0.19612837493632196, "grad_norm": 3.9847528096417464, "kl": 0.0401611328125, "learning_rate": 9.019358125318391e-07, "loss": 0.0016, "reward": 0.952269122004509, "reward_std": 1.110903412103653, "rewards/accuracy_reward": 0.21249999478459358, "rewards/cosine_rewards": -0.16534814983606339, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.001132699428126216, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 1509.515625, "epoch": 0.19663779928680591, "grad_norm": 1.3917929153423205, "kl": 0.0386962890625, "learning_rate": 9.016811003565971e-07, "loss": 0.0015, "reward": 1.3951207399368286, "reward_std": 1.3895853757858276, "rewards/accuracy_reward": 0.49062497913837433, "rewards/cosine_rewards": 0.030975546687841415, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0014798620832152665, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 1453.75, "epoch": 0.19714722363728987, "grad_norm": 4.5123123100553215, "kl": 0.040283203125, "learning_rate": 9.014263881813551e-07, "loss": 0.0016, "reward": 1.0250075459480286, "reward_std": 1.1121925115585327, "rewards/accuracy_reward": 0.2656249925494194, "rewards/cosine_rewards": -0.06678299978375435, "rewards/format_reward": 0.828125, "rewards/repetition_rewards": -0.001959475106559694, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 1502.953125, "epoch": 0.19765664798777383, "grad_norm": 6.3687413391252665, "kl": 0.0384521484375, "learning_rate": 9.011716760061131e-07, "loss": 0.0015, "reward": 0.6134699061512947, "reward_std": 0.8420631885528564, "rewards/accuracy_reward": 0.015624985098838806, "rewards/cosine_rewards": -0.2909963075071573, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0017838198109529912, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 1398.71875, "epoch": 0.19816607233825778, "grad_norm": 3.252398002176852, "kl": 0.04052734375, "learning_rate": 9.009169638308711e-07, "loss": 0.0016, "reward": 0.6156338006258011, "reward_std": 1.171474575996399, "rewards/accuracy_reward": -0.012500010430812836, "rewards/cosine_rewards": -0.2769355773925781, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0011806105903815478, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 1356.796875, "epoch": 0.1986754966887417, "grad_norm": 2.9564973631467386, "kl": 0.0411376953125, "learning_rate": 9.006622516556291e-07, "loss": 0.0016, "reward": 1.4809187650680542, "reward_std": 0.4241075813770294, "rewards/accuracy_reward": 0.4656249899417162, "rewards/cosine_rewards": 0.0635819137096405, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0014131638454273343, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1314.8125, "epoch": 0.19918492103922567, "grad_norm": 1.9868954496027869, "kl": 0.040283203125, "learning_rate": 9.004075394803871e-07, "loss": 0.0016, "reward": 0.3289404660463333, "reward_std": 0.6832451522350311, "rewards/accuracy_reward": -0.23750004172325134, "rewards/cosine_rewards": -0.38559940457344055, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0010851426632143557, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 1211.65625, "epoch": 0.19969434538970962, "grad_norm": 2.3384012636409524, "kl": 0.0426025390625, "learning_rate": 9.001528273051452e-07, "loss": 0.0017, "reward": 1.7431849241256714, "reward_std": 0.5287438631057739, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.0974309928715229, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0011210814118385315, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 1199.40625, "epoch": 0.20020376974019358, "grad_norm": 8.037383660003005, "kl": 0.0426025390625, "learning_rate": 8.998981151299032e-07, "loss": 0.0017, "reward": 1.205706238746643, "reward_std": 0.5482289791107178, "rewards/accuracy_reward": 0.2968749925494194, "rewards/cosine_rewards": -0.09018014371395111, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0009886454208754003, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1215.25, "epoch": 0.20071319409067753, "grad_norm": 2.7015176132022205, "kl": 0.04150390625, "learning_rate": 8.996434029546612e-07, "loss": 0.0017, "reward": 1.3461086750030518, "reward_std": 0.36276355385780334, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": -0.033333455212414265, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0018078879220411181, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 1148.140625, "epoch": 0.2012226184411615, "grad_norm": 2.4525739585224064, "kl": 0.0447998046875, "learning_rate": 8.993886907794193e-07, "loss": 0.0018, "reward": 1.6304560899734497, "reward_std": 0.6783818304538727, "rewards/accuracy_reward": 0.5781249850988388, "rewards/cosine_rewards": 0.0690329410135746, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0010768624488264322, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1234.03125, "epoch": 0.20173204279164544, "grad_norm": 2.620518407503657, "kl": 0.0426025390625, "learning_rate": 8.991339786041773e-07, "loss": 0.0017, "reward": 1.0580366849899292, "reward_std": 0.45367684960365295, "rewards/accuracy_reward": 0.18437499552965164, "rewards/cosine_rewards": -0.09430436789989471, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0007839706668164581, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 1255.140625, "epoch": 0.2022414671421294, "grad_norm": 2.848324792333859, "kl": 0.0416259765625, "learning_rate": 8.988792664289353e-07, "loss": 0.0017, "reward": 1.396336853504181, "reward_std": 0.6851004362106323, "rewards/accuracy_reward": 0.40937498584389687, "rewards/cosine_rewards": 0.003251887857913971, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00066499671083875, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 1243.234375, "epoch": 0.20275089149261336, "grad_norm": 2.5122988909394457, "kl": 0.04150390625, "learning_rate": 8.986245542536933e-07, "loss": 0.0017, "reward": 2.053937077522278, "reward_std": 0.5187530070543289, "rewards/accuracy_reward": 0.8312499821186066, "rewards/cosine_rewards": 0.22372649610042572, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0010393889679107815, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 1395.28125, "epoch": 0.2032603158430973, "grad_norm": 8.131421667160394, "kl": 0.039306640625, "learning_rate": 8.983698420784514e-07, "loss": 0.0016, "reward": 1.9118317365646362, "reward_std": 0.3381110727787018, "rewards/accuracy_reward": 0.7187500149011612, "rewards/cosine_rewards": 0.19487697072327137, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.001795282296370715, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 1477.328125, "epoch": 0.20376974019358127, "grad_norm": 2.5663546513961992, "kl": 0.0489501953125, "learning_rate": 8.981151299032094e-07, "loss": 0.002, "reward": 0.616385743021965, "reward_std": 0.5365406274795532, "rewards/accuracy_reward": -0.012500017881393433, "rewards/cosine_rewards": -0.27611421793699265, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0012499869335442781, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 1641.53125, "epoch": 0.2042791645440652, "grad_norm": 2.3476982545455947, "kl": 0.0382080078125, "learning_rate": 8.978604177279674e-07, "loss": 0.0015, "reward": 0.38990160822868347, "reward_std": 1.22097048163414, "rewards/accuracy_reward": -0.06875001266598701, "rewards/cosine_rewards": -0.352715402841568, "rewards/format_reward": 0.8125, "rewards/repetition_rewards": -0.0011329837725497782, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 1740.96875, "epoch": 0.20478858889454915, "grad_norm": 1.6789909982175664, "kl": 0.036376953125, "learning_rate": 8.976057055527255e-07, "loss": 0.0015, "reward": 0.7690124660730362, "reward_std": 1.7883394956588745, "rewards/accuracy_reward": 0.24062499403953552, "rewards/cosine_rewards": -0.15724666975438595, "rewards/format_reward": 0.6875, "rewards/repetition_rewards": -0.001865879981778562, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 1715.625, "epoch": 0.2052980132450331, "grad_norm": 1.732486740072958, "kl": 0.035400390625, "learning_rate": 8.973509933774834e-07, "loss": 0.0014, "reward": 0.6791011095046997, "reward_std": 1.0334843397140503, "rewards/accuracy_reward": 0.1249999925494194, "rewards/cosine_rewards": -0.21049801260232925, "rewards/format_reward": 0.765625, "rewards/repetition_rewards": -0.0010258048423565924, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 1585.046875, "epoch": 0.20580743759551706, "grad_norm": 1.6162362158227377, "kl": 0.037109375, "learning_rate": 8.970962812022414e-07, "loss": 0.0015, "reward": 0.9881232976913452, "reward_std": 1.0253838300704956, "rewards/accuracy_reward": 0.24062499403953552, "rewards/cosine_rewards": -0.12615075334906578, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.001350913429632783, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 1497.734375, "epoch": 0.20631686194600102, "grad_norm": 5.362930427796704, "kl": 0.039306640625, "learning_rate": 8.968415690269994e-07, "loss": 0.0016, "reward": 1.5187935531139374, "reward_std": 0.5071015954017639, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.07592727243900299, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0008836896740831435, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 1471.890625, "epoch": 0.20682628629648497, "grad_norm": 2.5474971754837896, "kl": 0.0374755859375, "learning_rate": 8.965868568517575e-07, "loss": 0.0015, "reward": 1.7093470096588135, "reward_std": 0.26929083466529846, "rewards/accuracy_reward": 0.6062499955296516, "rewards/cosine_rewards": 0.13566255569458008, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0013155650231055915, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 1517.765625, "epoch": 0.20733571064696893, "grad_norm": 2.3954440093211695, "kl": 0.0372314453125, "learning_rate": 8.963321446765155e-07, "loss": 0.0015, "reward": 1.6693125367164612, "reward_std": 0.8508188724517822, "rewards/accuracy_reward": 0.5781250149011612, "rewards/cosine_rewards": 0.12336396798491478, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.000926460576010868, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 1427.40625, "epoch": 0.20784513499745289, "grad_norm": 4.487502302070771, "kl": 0.037109375, "learning_rate": 8.960774325012735e-07, "loss": 0.0015, "reward": 1.6373254656791687, "reward_std": 0.37433764338493347, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.11931294947862625, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0007375250570476055, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 1500.875, "epoch": 0.20835455934793684, "grad_norm": 5.555469475832445, "kl": 0.0374755859375, "learning_rate": 8.958227203260316e-07, "loss": 0.0015, "reward": 1.398006021976471, "reward_std": 1.336867332458496, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": 0.02401774376630783, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.001011726533761248, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 1434.859375, "epoch": 0.2088639836984208, "grad_norm": 3.6143044040934105, "kl": 0.0435791015625, "learning_rate": 8.955680081507896e-07, "loss": 0.0017, "reward": 1.618862271308899, "reward_std": 0.7050271332263947, "rewards/accuracy_reward": 0.5468750074505806, "rewards/cosine_rewards": 0.1038745865225792, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.000637321179965511, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1506.828125, "epoch": 0.20937340804890472, "grad_norm": 3.854404990598997, "kl": 0.0361328125, "learning_rate": 8.953132959755476e-07, "loss": 0.0014, "reward": 1.6651726961135864, "reward_std": 0.45976050198078156, "rewards/accuracy_reward": 0.5781250074505806, "rewards/cosine_rewards": 0.11916181445121765, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0008640679297968745, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 1526.96875, "epoch": 0.20988283239938868, "grad_norm": 2.3422021364641736, "kl": 0.03662109375, "learning_rate": 8.950585838003057e-07, "loss": 0.0015, "reward": 0.6352521181106567, "reward_std": 1.1320685744285583, "rewards/accuracy_reward": -0.012500017881393433, "rewards/cosine_rewards": -0.28875819593667984, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0009897005802486092, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 1510.03125, "epoch": 0.21039225674987264, "grad_norm": 2.1794044547587275, "kl": 0.0567626953125, "learning_rate": 8.948038716250637e-07, "loss": 0.0023, "reward": 1.4266446828842163, "reward_std": 0.8459653854370117, "rewards/accuracy_reward": 0.4624999910593033, "rewards/cosine_rewards": 0.07424483820796013, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0007251804636325687, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 1505.8125, "epoch": 0.2109016811003566, "grad_norm": 2.019375037035424, "kl": 0.042236328125, "learning_rate": 8.945491594498217e-07, "loss": 0.0017, "reward": 1.4379878044128418, "reward_std": 0.6174334287643433, "rewards/accuracy_reward": 0.4374999888241291, "rewards/cosine_rewards": 0.04812653362751007, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0007637535745743662, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 1476.9375, "epoch": 0.21141110545084055, "grad_norm": 2.647595808512136, "kl": 0.041259765625, "learning_rate": 8.942944472745797e-07, "loss": 0.0016, "reward": 0.9988905191421509, "reward_std": 0.6921209692955017, "rewards/accuracy_reward": 0.20937499403953552, "rewards/cosine_rewards": -0.1462814100086689, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0017031602037604898, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1508.578125, "epoch": 0.2119205298013245, "grad_norm": 2.6566052420282933, "kl": 0.03466796875, "learning_rate": 8.940397350993378e-07, "loss": 0.0014, "reward": 1.211571991443634, "reward_std": 1.0560529828071594, "rewards/accuracy_reward": 0.32499998807907104, "rewards/cosine_rewards": -0.0497976616024971, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.00113033052184619, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 1513.40625, "epoch": 0.21242995415180846, "grad_norm": 2.084079343038072, "kl": 0.0411376953125, "learning_rate": 8.937850229240957e-07, "loss": 0.0016, "reward": 0.5206416845321655, "reward_std": 0.49498558044433594, "rewards/accuracy_reward": -0.09687501192092896, "rewards/cosine_rewards": -0.36537329852581024, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0014850463485345244, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 1506.921875, "epoch": 0.21293937850229241, "grad_norm": 1.7101934155068432, "kl": 0.036865234375, "learning_rate": 8.935303107488537e-07, "loss": 0.0015, "reward": 1.16130793094635, "reward_std": 0.738935075700283, "rewards/accuracy_reward": 0.2968749850988388, "rewards/cosine_rewards": -0.08809526264667511, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0005968308250885457, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 1452.859375, "epoch": 0.21344880285277637, "grad_norm": 2.6364264984634236, "kl": 0.037109375, "learning_rate": 8.932755985736118e-07, "loss": 0.0015, "reward": 1.4882609844207764, "reward_std": 0.6527669131755829, "rewards/accuracy_reward": 0.4937499836087227, "rewards/cosine_rewards": 0.042041175067424774, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0006552368577104062, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 1425.0, "epoch": 0.21395822720326033, "grad_norm": 22.24294483419425, "kl": 0.0374755859375, "learning_rate": 8.930208863983698e-07, "loss": 0.0015, "reward": 1.5828353762626648, "reward_std": 0.6265529096126556, "rewards/accuracy_reward": 0.5468749850988388, "rewards/cosine_rewards": 0.08372939098626375, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008939505496528, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 1396.171875, "epoch": 0.21446765155374428, "grad_norm": 2.8168572000468366, "kl": 0.049560546875, "learning_rate": 8.927661742231278e-07, "loss": 0.002, "reward": 1.6206639409065247, "reward_std": 0.5450826287269592, "rewards/accuracy_reward": 0.546875, "rewards/cosine_rewards": 0.12136101722717285, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0006969515234231949, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 1395.9375, "epoch": 0.2149770759042282, "grad_norm": 1.840733711397487, "kl": 0.0379638671875, "learning_rate": 8.925114620478858e-07, "loss": 0.0015, "reward": 1.8798171877861023, "reward_std": 0.5979900360107422, "rewards/accuracy_reward": 0.690625011920929, "rewards/cosine_rewards": 0.18991604819893837, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0007238158723339438, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 1503.703125, "epoch": 0.21548650025471217, "grad_norm": 2.327429653832842, "kl": 0.0377197265625, "learning_rate": 8.922567498726439e-07, "loss": 0.0015, "reward": 1.1887712478637695, "reward_std": 0.615043044090271, "rewards/accuracy_reward": 0.2968749850988388, "rewards/cosine_rewards": -0.09194361418485641, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0005351053987396881, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 1528.765625, "epoch": 0.21599592460519612, "grad_norm": 3.1639646848610017, "kl": 0.0347900390625, "learning_rate": 8.920020376974019e-07, "loss": 0.0014, "reward": 1.1957539916038513, "reward_std": 1.2394747734069824, "rewards/accuracy_reward": 0.3531249985098839, "rewards/cosine_rewards": -0.03115752711892128, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.001213467272464186, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 1674.765625, "epoch": 0.21650534895568008, "grad_norm": 2.5043711144165126, "kl": 0.0338134765625, "learning_rate": 8.917473255221599e-07, "loss": 0.0014, "reward": 1.1731443107128143, "reward_std": 0.8068048655986786, "rewards/accuracy_reward": 0.3218749836087227, "rewards/cosine_rewards": -0.038288604468107224, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0010670205520000309, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 1664.5, "epoch": 0.21701477330616403, "grad_norm": 3.6500533940846327, "kl": 0.03515625, "learning_rate": 8.91492613346918e-07, "loss": 0.0014, "reward": 0.6680706441402435, "reward_std": 1.1470927596092224, "rewards/accuracy_reward": 0.012499995529651642, "rewards/cosine_rewards": -0.28094063699245453, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.000988698098808527, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1733.359375, "epoch": 0.217524197656648, "grad_norm": 1.7264860203774577, "kl": 0.033203125, "learning_rate": 8.91237901171676e-07, "loss": 0.0013, "reward": 1.151515543460846, "reward_std": 1.0065627694129944, "rewards/accuracy_reward": 0.37812499701976776, "rewards/cosine_rewards": -0.02256488800048828, "rewards/format_reward": 0.796875, "rewards/repetition_rewards": -0.0009195689344778657, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1738.171875, "epoch": 0.21803362200713194, "grad_norm": 1.8028678441679502, "kl": 0.033447265625, "learning_rate": 8.90983188996434e-07, "loss": 0.0013, "reward": 0.33622707426548004, "reward_std": 1.554500699043274, "rewards/accuracy_reward": -0.046875011175870895, "rewards/cosine_rewards": -0.35029861330986023, "rewards/format_reward": 0.734375, "rewards/repetition_rewards": -0.0009743365517351776, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1710.0, "epoch": 0.2185430463576159, "grad_norm": 1.7315216890300187, "kl": 0.0386962890625, "learning_rate": 8.90728476821192e-07, "loss": 0.0015, "reward": 1.2531213760375977, "reward_std": 1.7619973421096802, "rewards/accuracy_reward": 0.4624999910593033, "rewards/cosine_rewards": 0.010814379900693893, "rewards/format_reward": 0.78125, "rewards/repetition_rewards": -0.0014429978909902275, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 1681.5625, "epoch": 0.21905247070809986, "grad_norm": 1.448581293364632, "kl": 0.0350341796875, "learning_rate": 8.904737646459501e-07, "loss": 0.0014, "reward": 0.5936174094676971, "reward_std": 1.1982838213443756, "rewards/accuracy_reward": 0.015624940395355225, "rewards/cosine_rewards": -0.2807646095752716, "rewards/format_reward": 0.859375, "rewards/repetition_rewards": -0.0006179730116855353, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 1480.1875, "epoch": 0.2195618950585838, "grad_norm": 4.830088485887878, "kl": 0.0394287109375, "learning_rate": 8.90219052470708e-07, "loss": 0.0016, "reward": 1.1779060363769531, "reward_std": 1.0625053942203522, "rewards/accuracy_reward": 0.31562499701976776, "rewards/cosine_rewards": -0.05869085341691971, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0009031399386003613, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 1440.796875, "epoch": 0.22007131940906777, "grad_norm": 2.4405157931321124, "kl": 0.037109375, "learning_rate": 8.89964340295466e-07, "loss": 0.0015, "reward": 0.9002698361873627, "reward_std": 0.7880153059959412, "rewards/accuracy_reward": 0.09999998845160007, "rewards/cosine_rewards": -0.18298358470201492, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0011215846752747893, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 1348.515625, "epoch": 0.2205807437595517, "grad_norm": 2.1326390000811806, "kl": 0.0418701171875, "learning_rate": 8.897096281202241e-07, "loss": 0.0017, "reward": 0.7409723997116089, "reward_std": 0.7918355762958527, "rewards/accuracy_reward": 0.015624990686774254, "rewards/cosine_rewards": -0.21155225485563278, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.000600404484430328, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 1298.125, "epoch": 0.22109016811003565, "grad_norm": 3.815337891096848, "kl": 0.0418701171875, "learning_rate": 8.894549159449821e-07, "loss": 0.0017, "reward": 1.8587952256202698, "reward_std": 0.6939655542373657, "rewards/accuracy_reward": 0.7187499701976776, "rewards/cosine_rewards": 0.1717987135052681, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0005034840432927012, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1156.0, "epoch": 0.2215995924605196, "grad_norm": 4.479065196602373, "kl": 0.0440673828125, "learning_rate": 8.892002037697401e-07, "loss": 0.0018, "reward": 1.4347090125083923, "reward_std": 0.3772214949131012, "rewards/accuracy_reward": 0.43749997206032276, "rewards/cosine_rewards": -0.0022302046418190002, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0005607931379927322, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 1158.46875, "epoch": 0.22210901681100356, "grad_norm": 4.149290240827553, "kl": 0.0455322265625, "learning_rate": 8.889454915944982e-07, "loss": 0.0018, "reward": 1.0905642956495285, "reward_std": 0.5234603583812714, "rewards/accuracy_reward": 0.2124999761581421, "rewards/cosine_rewards": -0.10600101202726364, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.00030972264357842505, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 1058.734375, "epoch": 0.22261844116148752, "grad_norm": 6.6780990750455205, "kl": 0.046630859375, "learning_rate": 8.886907794192562e-07, "loss": 0.0019, "reward": 0.9731817841529846, "reward_std": 0.8772869110107422, "rewards/accuracy_reward": 0.09687498956918716, "rewards/cosine_rewards": -0.09189720638096333, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0005459659732878208, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 1056.5, "epoch": 0.22312786551197147, "grad_norm": 2.737447142182153, "kl": 0.044189453125, "learning_rate": 8.884360672440142e-07, "loss": 0.0018, "reward": 1.1522070169448853, "reward_std": 0.6963326930999756, "rewards/accuracy_reward": 0.24062499403953552, "rewards/cosine_rewards": -0.07206200063228607, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0007310137443710119, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 1072.171875, "epoch": 0.22363728986245543, "grad_norm": 2.6298041678582953, "kl": 0.046875, "learning_rate": 8.881813550687722e-07, "loss": 0.0019, "reward": 1.4510762691497803, "reward_std": 0.5045955777168274, "rewards/accuracy_reward": 0.49375002086162567, "rewards/cosine_rewards": 0.020084097981452942, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0002578186395112425, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 1094.546875, "epoch": 0.22414671421293939, "grad_norm": 2.047145097208152, "kl": 0.0438232421875, "learning_rate": 8.879266428935303e-07, "loss": 0.0018, "reward": 1.5189008712768555, "reward_std": 0.34239334613084793, "rewards/accuracy_reward": 0.4906250089406967, "rewards/cosine_rewards": 0.0758383758366108, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0006874670943943784, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 1083.09375, "epoch": 0.22465613856342334, "grad_norm": 2.966919929118076, "kl": 0.0457763671875, "learning_rate": 8.876719307182883e-07, "loss": 0.0018, "reward": 1.2056291699409485, "reward_std": 0.828714907169342, "rewards/accuracy_reward": 0.29687498696148396, "rewards/cosine_rewards": -0.043857116252183914, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0005137350672157481, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 1103.375, "epoch": 0.2251655629139073, "grad_norm": 3.389083005059361, "kl": 0.042724609375, "learning_rate": 8.874172185430463e-07, "loss": 0.0017, "reward": 1.483572542667389, "reward_std": 0.5207121074199677, "rewards/accuracy_reward": 0.4624999910593033, "rewards/cosine_rewards": 0.053015733137726784, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0006932187097845599, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 1217.421875, "epoch": 0.22567498726439122, "grad_norm": 1.8084576051013326, "kl": 0.0426025390625, "learning_rate": 8.871625063678044e-07, "loss": 0.0017, "reward": 1.604416847229004, "reward_std": 0.68864506483078, "rewards/accuracy_reward": 0.578125, "rewards/cosine_rewards": 0.10491618514060974, "rewards/format_reward": 0.921875, "rewards/repetition_rewards": -0.0004992800822947174, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 1285.484375, "epoch": 0.22618441161487518, "grad_norm": 20.129612374292474, "kl": 0.042236328125, "learning_rate": 8.869077941925624e-07, "loss": 0.0017, "reward": 1.7994786500930786, "reward_std": 0.3120774105191231, "rewards/accuracy_reward": 0.6624999940395355, "rewards/cosine_rewards": 0.15342308580875397, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0008193884277716279, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 1370.578125, "epoch": 0.22669383596535914, "grad_norm": 3.234143174544009, "kl": 0.0433349609375, "learning_rate": 8.866530820173203e-07, "loss": 0.0017, "reward": 1.3398171067237854, "reward_std": 0.7532171607017517, "rewards/accuracy_reward": 0.3812500014901161, "rewards/cosine_rewards": -0.0252380333840847, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0005697726446669549, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1474.875, "epoch": 0.2272032603158431, "grad_norm": 1.7926893560129409, "kl": 0.040283203125, "learning_rate": 8.863983698420783e-07, "loss": 0.0016, "reward": 1.4312800765037537, "reward_std": 0.6859093904495239, "rewards/accuracy_reward": 0.43437500298023224, "rewards/cosine_rewards": 0.02916320227086544, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0010081499349325895, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 1589.5625, "epoch": 0.22771268466632705, "grad_norm": 1.796069908482133, "kl": 0.036865234375, "learning_rate": 8.861436576668364e-07, "loss": 0.0015, "reward": 1.443231225013733, "reward_std": 0.6114392578601837, "rewards/accuracy_reward": 0.4375000074505806, "rewards/cosine_rewards": 0.038026634603738785, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.001045387762133032, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1672.09375, "epoch": 0.228222109016811, "grad_norm": 2.1300405555940802, "kl": 0.0377197265625, "learning_rate": 8.858889454915944e-07, "loss": 0.0015, "reward": 1.5443891882896423, "reward_std": 0.5743480771780014, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.10473084449768066, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0009665640536695719, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1766.0, "epoch": 0.22873153336729496, "grad_norm": 4.287974732646472, "kl": 0.037841796875, "learning_rate": 8.856342333163524e-07, "loss": 0.0015, "reward": 1.3031042218208313, "reward_std": 1.7085354328155518, "rewards/accuracy_reward": 0.4593749940395355, "rewards/cosine_rewards": 0.06326716393232346, "rewards/format_reward": 0.78125, "rewards/repetition_rewards": -0.0007880023040343076, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 1844.640625, "epoch": 0.22924095771777891, "grad_norm": 1.6528382190432698, "kl": 0.0341796875, "learning_rate": 8.853795211411105e-07, "loss": 0.0014, "reward": 0.5885469168424606, "reward_std": 1.7182486653327942, "rewards/accuracy_reward": 0.17812500149011612, "rewards/cosine_rewards": -0.1980201005935669, "rewards/format_reward": 0.609375, "rewards/repetition_rewards": -0.0009329892345704138, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 1887.9375, "epoch": 0.22975038206826287, "grad_norm": 1.8787523511209072, "kl": 0.0335693359375, "learning_rate": 8.851248089658685e-07, "loss": 0.0013, "reward": 0.7307622581720352, "reward_std": 1.600571632385254, "rewards/accuracy_reward": 0.24062500894069672, "rewards/cosine_rewards": -0.13401341438293457, "rewards/format_reward": 0.625, "rewards/repetition_rewards": -0.0008493586792610586, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1833.78125, "epoch": 0.23025980641874683, "grad_norm": 6.893897138558536, "kl": 0.0357666015625, "learning_rate": 8.848700967906265e-07, "loss": 0.0014, "reward": 1.0294001996517181, "reward_std": 1.7062013149261475, "rewards/accuracy_reward": 0.40312499552965164, "rewards/cosine_rewards": 0.0025026053190231323, "rewards/format_reward": 0.625, "rewards/repetition_rewards": -0.0012274246546439826, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 1943.453125, "epoch": 0.23076923076923078, "grad_norm": 2.3588444541934273, "kl": 0.0322265625, "learning_rate": 8.846153846153846e-07, "loss": 0.0013, "reward": 0.19596866890788078, "reward_std": 1.8146210312843323, "rewards/accuracy_reward": -0.02500000223517418, "rewards/cosine_rewards": -0.3406580686569214, "rewards/format_reward": 0.5625, "rewards/repetition_rewards": -0.0008732638962101191, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 1863.109375, "epoch": 0.2312786551197147, "grad_norm": 1.5475872027740656, "kl": 0.0400390625, "learning_rate": 8.843606724401426e-07, "loss": 0.0016, "reward": 0.2073364406824112, "reward_std": 1.7386137247085571, "rewards/accuracy_reward": -0.043750010430812836, "rewards/cosine_rewards": -0.3414689302444458, "rewards/format_reward": 0.59375, "rewards/repetition_rewards": -0.0011946168669965118, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 1659.65625, "epoch": 0.23178807947019867, "grad_norm": 3.381623642320965, "kl": 0.0540771484375, "learning_rate": 8.841059602649006e-07, "loss": 0.0022, "reward": 1.5414963960647583, "reward_std": 1.3864411413669586, "rewards/accuracy_reward": 0.6218750178813934, "rewards/cosine_rewards": 0.20184022560715675, "rewards/format_reward": 0.71875, "rewards/repetition_rewards": -0.000968798267422244, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1617.046875, "epoch": 0.23229750382068262, "grad_norm": 7.137141646753771, "kl": 0.0372314453125, "learning_rate": 8.838512480896586e-07, "loss": 0.0015, "reward": 1.1092736423015594, "reward_std": 0.9871836006641388, "rewards/accuracy_reward": 0.2687499839812517, "rewards/cosine_rewards": -0.09614543057978153, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0008309493132401258, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 1511.703125, "epoch": 0.23280692817116658, "grad_norm": 2.858407735203425, "kl": 0.0447998046875, "learning_rate": 8.835965359144167e-07, "loss": 0.0018, "reward": 1.4474474489688873, "reward_std": 0.8567388504743576, "rewards/accuracy_reward": 0.4937500078231096, "rewards/cosine_rewards": 0.06434839963912964, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0012760092504322529, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1550.828125, "epoch": 0.23331635252165053, "grad_norm": 2.432372091234863, "kl": 0.0408935546875, "learning_rate": 8.833418237391747e-07, "loss": 0.0016, "reward": 1.0046057403087616, "reward_std": 1.0828097462654114, "rewards/accuracy_reward": 0.20937498658895493, "rewards/cosine_rewards": -0.14123845472931862, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0010308316559530795, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 1519.9375, "epoch": 0.2338257768721345, "grad_norm": 2.65385943980829, "kl": 0.0380859375, "learning_rate": 8.830871115639326e-07, "loss": 0.0015, "reward": 1.5737290382385254, "reward_std": 0.676769882440567, "rewards/accuracy_reward": 0.5187499821186066, "rewards/cosine_rewards": 0.10273971408605576, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008856799395289272, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 1412.171875, "epoch": 0.23433520122261844, "grad_norm": 7.668944991953695, "kl": 0.03955078125, "learning_rate": 8.828323993886907e-07, "loss": 0.0016, "reward": 1.2575648427009583, "reward_std": 0.8219007402658463, "rewards/accuracy_reward": 0.3499999940395355, "rewards/cosine_rewards": -0.029430712573230267, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0005044575809733942, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 1428.671875, "epoch": 0.2348446255731024, "grad_norm": 3.3174885087942747, "kl": 0.041259765625, "learning_rate": 8.825776872134487e-07, "loss": 0.0017, "reward": 0.5390121340751648, "reward_std": 0.6499587297439575, "rewards/accuracy_reward": -0.09687501192092896, "rewards/cosine_rewards": -0.3163621127605438, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008757157484069467, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 1389.328125, "epoch": 0.23535404992358636, "grad_norm": 1.976395582002063, "kl": 0.040771484375, "learning_rate": 8.823229750382067e-07, "loss": 0.0016, "reward": 1.6086109280586243, "reward_std": 0.5066869556903839, "rewards/accuracy_reward": 0.5218749716877937, "rewards/cosine_rewards": 0.08817524462938309, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0014392710290849209, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 1436.625, "epoch": 0.2358634742740703, "grad_norm": 2.4713376444103146, "kl": 0.039794921875, "learning_rate": 8.820682628629647e-07, "loss": 0.0016, "reward": 1.111421525478363, "reward_std": 0.9693822264671326, "rewards/accuracy_reward": 0.24062500149011612, "rewards/cosine_rewards": -0.11270357295870781, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0008748299151193351, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 1397.96875, "epoch": 0.23637289862455427, "grad_norm": 2.688736588573913, "kl": 0.0450439453125, "learning_rate": 8.818135506877228e-07, "loss": 0.0018, "reward": 1.0273907780647278, "reward_std": 0.6092932820320129, "rewards/accuracy_reward": 0.20937500149011612, "rewards/cosine_rewards": -0.11841067671775818, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0010735246760305017, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 1502.5, "epoch": 0.2368823229750382, "grad_norm": 2.233484328017706, "kl": 0.03955078125, "learning_rate": 8.815588385124808e-07, "loss": 0.0016, "reward": 2.0390628576278687, "reward_std": 0.4271709471940994, "rewards/accuracy_reward": 0.7750000059604645, "rewards/cosine_rewards": 0.26496873423457146, "rewards/format_reward": 1.0, "rewards/repetition_rewards": -0.0009058607101906091, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 1610.734375, "epoch": 0.23739174732552215, "grad_norm": 3.1291938497316867, "kl": 0.0394287109375, "learning_rate": 8.813041263372388e-07, "loss": 0.0016, "reward": 1.7407687306404114, "reward_std": 0.8337388634681702, "rewards/accuracy_reward": 0.659375011920929, "rewards/cosine_rewards": 0.17613628506660461, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0009925005142576993, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 1568.890625, "epoch": 0.2379011716760061, "grad_norm": 2.121477514968572, "kl": 0.0380859375, "learning_rate": 8.810494141619969e-07, "loss": 0.0015, "reward": 1.3490102887153625, "reward_std": 0.7418502867221832, "rewards/accuracy_reward": 0.37812500447034836, "rewards/cosine_rewards": 0.003062829375267029, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0009275085176341236, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1698.8125, "epoch": 0.23841059602649006, "grad_norm": 2.4246076523727025, "kl": 0.03662109375, "learning_rate": 8.807947019867549e-07, "loss": 0.0015, "reward": 1.3666119575500488, "reward_std": 1.175959825515747, "rewards/accuracy_reward": 0.46562501788139343, "rewards/cosine_rewards": 0.02737235650420189, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0013854140415787697, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 1691.90625, "epoch": 0.23892002037697402, "grad_norm": 1.3535821296606787, "kl": 0.039306640625, "learning_rate": 8.805399898115129e-07, "loss": 0.0016, "reward": 1.2414605617523193, "reward_std": 1.0560136437416077, "rewards/accuracy_reward": 0.3812499940395355, "rewards/cosine_rewards": -0.02948123589158058, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0009331759065389633, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 1686.46875, "epoch": 0.23942944472745797, "grad_norm": 1.6084637763929415, "kl": 0.0467529296875, "learning_rate": 8.802852776362711e-07, "loss": 0.0019, "reward": 2.0340508222579956, "reward_std": 1.1313848793506622, "rewards/accuracy_reward": 0.8312499821186066, "rewards/cosine_rewards": 0.3132530748844147, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.001077289809472859, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 1638.4375, "epoch": 0.23993886907794193, "grad_norm": 3.9206215218528553, "kl": 0.0384521484375, "learning_rate": 8.800305654610291e-07, "loss": 0.0015, "reward": 1.417995810508728, "reward_std": 0.7844535112380981, "rewards/accuracy_reward": 0.4375, "rewards/cosine_rewards": 0.04390082508325577, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0009050128574017435, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 1583.171875, "epoch": 0.24044829342842589, "grad_norm": 1.862615408125007, "kl": 0.0404052734375, "learning_rate": 8.797758532857871e-07, "loss": 0.0016, "reward": 1.3552428185939789, "reward_std": 0.831163614988327, "rewards/accuracy_reward": 0.40937500074505806, "rewards/cosine_rewards": -0.005982518196105957, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.001274671230930835, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 1606.359375, "epoch": 0.24095771777890984, "grad_norm": 6.217207017794225, "kl": 0.039794921875, "learning_rate": 8.795211411105451e-07, "loss": 0.0016, "reward": 1.677711844444275, "reward_std": 0.8612502366304398, "rewards/accuracy_reward": 0.5781249701976776, "rewards/cosine_rewards": 0.11638512089848518, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0011732576531358063, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 1521.84375, "epoch": 0.2414671421293938, "grad_norm": 2.9743621746841677, "kl": 0.0421142578125, "learning_rate": 8.792664289353031e-07, "loss": 0.0017, "reward": 1.578629732131958, "reward_std": 0.6186130940914154, "rewards/accuracy_reward": 0.5218749940395355, "rewards/cosine_rewards": 0.08938230201601982, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0013775942497886717, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 1560.25, "epoch": 0.24197656647987772, "grad_norm": 3.931202850710427, "kl": 0.0396728515625, "learning_rate": 8.790117167600611e-07, "loss": 0.0016, "reward": 1.8553311824798584, "reward_std": 0.5484062433242798, "rewards/accuracy_reward": 0.6906249970197678, "rewards/cosine_rewards": 0.1969544254243374, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0009982050396502018, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1487.171875, "epoch": 0.24248599083036168, "grad_norm": 1.8115703582475124, "kl": 0.0428466796875, "learning_rate": 8.787570045848191e-07, "loss": 0.0017, "reward": 1.0939862728118896, "reward_std": 0.652959406375885, "rewards/accuracy_reward": 0.24062499776482582, "rewards/cosine_rewards": -0.0988575927913189, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0009061352466233075, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 1421.203125, "epoch": 0.24299541518084564, "grad_norm": 50.509798046645564, "kl": 0.0455322265625, "learning_rate": 8.785022924095772e-07, "loss": 0.0018, "reward": 1.1556105613708496, "reward_std": 0.8080581426620483, "rewards/accuracy_reward": 0.26874998956918716, "rewards/cosine_rewards": -0.06538418680429459, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008802659867797047, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 1423.25, "epoch": 0.2435048395313296, "grad_norm": 2.143196092673566, "kl": 0.042724609375, "learning_rate": 8.782475802343352e-07, "loss": 0.0017, "reward": 1.4646123051643372, "reward_std": 0.4019291028380394, "rewards/accuracy_reward": 0.4374999925494194, "rewards/cosine_rewards": 0.04411640763282776, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0013792455429211259, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 1525.359375, "epoch": 0.24401426388181355, "grad_norm": 1.4650345234014313, "kl": 0.043701171875, "learning_rate": 8.779928680590932e-07, "loss": 0.0018, "reward": 1.7237411737442017, "reward_std": 0.6819100677967072, "rewards/accuracy_reward": 0.6031249761581421, "rewards/cosine_rewards": 0.13805609196424484, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0018149468814954162, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1518.6875, "epoch": 0.2445236882322975, "grad_norm": 2.4194184684625166, "kl": 0.0440673828125, "learning_rate": 8.777381558838512e-07, "loss": 0.0018, "reward": 1.4634617269039154, "reward_std": 0.4558331221342087, "rewards/accuracy_reward": 0.46562497317790985, "rewards/cosine_rewards": 0.03047458827495575, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0013878352474421263, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 1559.859375, "epoch": 0.24503311258278146, "grad_norm": 5.778258363606284, "kl": 0.041015625, "learning_rate": 8.774834437086093e-07, "loss": 0.0016, "reward": 1.1754435896873474, "reward_std": 0.629539430141449, "rewards/accuracy_reward": 0.2968749850988388, "rewards/cosine_rewards": -0.058087632060050964, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.000843802816234529, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 1613.171875, "epoch": 0.24554253693326542, "grad_norm": 1.6143411312623293, "kl": 0.0389404296875, "learning_rate": 8.772287315333673e-07, "loss": 0.0016, "reward": 0.8586589694023132, "reward_std": 0.45200832188129425, "rewards/accuracy_reward": 0.09999999403953552, "rewards/cosine_rewards": -0.22472049295902252, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0009954352863132954, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1670.359375, "epoch": 0.24605196128374937, "grad_norm": 2.239924603374423, "kl": 0.0592041015625, "learning_rate": 8.769740193581253e-07, "loss": 0.0024, "reward": 1.522126853466034, "reward_std": 0.8742709904909134, "rewards/accuracy_reward": 0.4937499910593033, "rewards/cosine_rewards": 0.060823358595371246, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0011965514277108014, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 1628.625, "epoch": 0.24656138563423333, "grad_norm": 8.106075584628705, "kl": 0.0419921875, "learning_rate": 8.767193071828834e-07, "loss": 0.0017, "reward": 0.9253878593444824, "reward_std": 1.307717740535736, "rewards/accuracy_reward": 0.18437500298023224, "rewards/cosine_rewards": -0.14811599627137184, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.0014961253036744893, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 1648.875, "epoch": 0.24707080998471728, "grad_norm": 1.6847159437413002, "kl": 0.0389404296875, "learning_rate": 8.764645950076414e-07, "loss": 0.0016, "reward": 1.4973651766777039, "reward_std": 0.9533334523439407, "rewards/accuracy_reward": 0.5218749791383743, "rewards/cosine_rewards": 0.0704129058867693, "rewards/format_reward": 0.90625, "rewards/repetition_rewards": -0.0011727037781383842, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 1511.140625, "epoch": 0.2475802343352012, "grad_norm": 1.736665525794252, "kl": 0.0399169921875, "learning_rate": 8.762098828323994e-07, "loss": 0.0016, "reward": 0.7075473368167877, "reward_std": 0.8960316479206085, "rewards/accuracy_reward": 0.015624992549419403, "rewards/cosine_rewards": -0.2443552017211914, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.001222497143317014, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 1567.671875, "epoch": 0.24808965868568517, "grad_norm": 3.031931829741236, "kl": 0.0386962890625, "learning_rate": 8.759551706571575e-07, "loss": 0.0015, "reward": 1.3812061548233032, "reward_std": 0.8888083398342133, "rewards/accuracy_reward": 0.4093749672174454, "rewards/cosine_rewards": 0.004479339346289635, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0013981764786876738, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 1457.359375, "epoch": 0.24859908303616912, "grad_norm": 5.524727047487839, "kl": 0.0506591796875, "learning_rate": 8.757004584819154e-07, "loss": 0.002, "reward": 1.8092041611671448, "reward_std": 0.5159921646118164, "rewards/accuracy_reward": 0.6343750059604645, "rewards/cosine_rewards": 0.19165128469467163, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0011971485218964517, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 1379.078125, "epoch": 0.24910850738665308, "grad_norm": 7.278746642143023, "kl": 0.055419921875, "learning_rate": 8.754457463066734e-07, "loss": 0.0022, "reward": 1.194389447569847, "reward_std": 0.5000828057527542, "rewards/accuracy_reward": 0.26874998211860657, "rewards/cosine_rewards": -0.04208715260028839, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0010234276414848864, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 1397.78125, "epoch": 0.24961793173713703, "grad_norm": 2.9100594613125352, "kl": 0.0543212890625, "learning_rate": 8.751910341314314e-07, "loss": 0.0022, "reward": 1.6665399670600891, "reward_std": 0.6835527420043945, "rewards/accuracy_reward": 0.6625000089406967, "rewards/cosine_rewards": 0.11463410407304764, "rewards/format_reward": 0.890625, "rewards/repetition_rewards": -0.00121912601753138, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1359.703125, "epoch": 0.250127356087621, "grad_norm": 12.325171247664876, "kl": 0.0457763671875, "learning_rate": 8.749363219561895e-07, "loss": 0.0018, "reward": 1.8410940766334534, "reward_std": 0.4001428484916687, "rewards/accuracy_reward": 0.690625011920929, "rewards/cosine_rewards": 0.18309018202126026, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0013711884384974837, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 1470.75, "epoch": 0.25063678043810494, "grad_norm": 15.076912789505334, "kl": 0.041015625, "learning_rate": 8.746816097809475e-07, "loss": 0.0016, "reward": 1.2887136340141296, "reward_std": 0.8450455367565155, "rewards/accuracy_reward": 0.3531249761581421, "rewards/cosine_rewards": -0.03244372457265854, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0007176562794484198, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 1348.140625, "epoch": 0.2511462047885889, "grad_norm": 10.32545410862146, "kl": 0.05810546875, "learning_rate": 8.744268976057055e-07, "loss": 0.0023, "reward": 1.227342277765274, "reward_std": 1.0202240645885468, "rewards/accuracy_reward": 0.3500000238418579, "rewards/cosine_rewards": 0.002939566969871521, "rewards/format_reward": 0.875, "rewards/repetition_rewards": -0.0005973072838969529, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 1301.828125, "epoch": 0.25165562913907286, "grad_norm": 3.667270735290933, "kl": 0.0645751953125, "learning_rate": 8.741721854304636e-07, "loss": 0.0026, "reward": 1.2533040046691895, "reward_std": 0.7434202134609222, "rewards/accuracy_reward": 0.32500000298023224, "rewards/cosine_rewards": -0.03934769332408905, "rewards/format_reward": 0.96875, "rewards/repetition_rewards": -0.0010982811218127608, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 1290.1875, "epoch": 0.2521650534895568, "grad_norm": 4.3465734782527345, "kl": 0.0535888671875, "learning_rate": 8.739174732552216e-07, "loss": 0.0021, "reward": 0.6353173404932022, "reward_std": 0.6600025594234467, "rewards/accuracy_reward": -0.040625013411045074, "rewards/cosine_rewards": -0.2607284113764763, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0008293068385683, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 1355.375, "epoch": 0.25267447784004077, "grad_norm": 4.901543791199254, "kl": 0.060546875, "learning_rate": 8.736627610799796e-07, "loss": 0.0024, "reward": 1.1404387950897217, "reward_std": 0.670623242855072, "rewards/accuracy_reward": 0.26874999701976776, "rewards/cosine_rewards": -0.11177334189414978, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0009129364043474197, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 1328.6875, "epoch": 0.2531839021905247, "grad_norm": 3.9728562721335745, "kl": 0.0489501953125, "learning_rate": 8.734080489047376e-07, "loss": 0.002, "reward": 1.1998997032642365, "reward_std": 0.630705714225769, "rewards/accuracy_reward": 0.32500001788139343, "rewards/cosine_rewards": -0.06150183826684952, "rewards/format_reward": 0.9375, "rewards/repetition_rewards": -0.0010984738764818758, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 1373.671875, "epoch": 0.2536933265410087, "grad_norm": 3.325255325148765, "kl": 0.04833984375, "learning_rate": 8.731533367294957e-07, "loss": 0.0019, "reward": 1.2019822597503662, "reward_std": 0.38447779417037964, "rewards/accuracy_reward": 0.296875, "rewards/cosine_rewards": -0.04712319001555443, "rewards/format_reward": 0.953125, "rewards/repetition_rewards": -0.0008945107110776007, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 1309.28125, "epoch": 0.25420275089149263, "grad_norm": 5.499632802088616, "kl": 0.072265625, "learning_rate": 8.728986245542537e-07, "loss": 0.0029, "reward": 1.6111189126968384, "reward_std": 0.1892632469534874, "rewards/accuracy_reward": 0.550000011920929, "rewards/cosine_rewards": 0.07773812115192413, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0009941596072167158, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 1363.625, "epoch": 0.2547121752419766, "grad_norm": 7.195546109062687, "kl": 0.0482177734375, "learning_rate": 8.726439123790117e-07, "loss": 0.0019, "reward": 1.9320534467697144, "reward_std": 0.4148600548505783, "rewards/accuracy_reward": 0.7468750178813934, "rewards/cosine_rewards": 0.20207761228084564, "rewards/format_reward": 0.984375, "rewards/repetition_rewards": -0.0012741541431751102, "step": 500 } ], "logging_steps": 1.0, "max_steps": 3926, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }