{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.44605022525536375, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 225.1666717529297, "epoch": 8.921004505107276e-05, "grad_norm": 62.695068359375, "learning_rate": 2.5e-07, "loss": 12.7202, "reward": 0.19306249171495438, "reward_std": 0.5882241576910019, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22360415756702423, "step": 1, "zero_std_ratio": 0.0 }, { "epoch": 0.00017842009010214551, "grad_norm": 59.6503791809082, "learning_rate": 5e-07, "loss": 13.0732, "step": 2 }, { "epoch": 0.00026763013515321824, "grad_norm": 69.65516662597656, "learning_rate": 7.5e-07, "loss": 12.9681, "step": 3 }, { "epoch": 0.00035684018020429103, "grad_norm": 57.81648635864258, "learning_rate": 1e-06, "loss": 8.1042, "step": 4 }, { "epoch": 0.00044605022525536376, "grad_norm": 57.6408576965332, "learning_rate": 1.25e-06, "loss": 8.6056, "step": 5 }, { "epoch": 0.0005352602703064365, "grad_norm": 58.459903717041016, "learning_rate": 1.5e-06, "loss": 10.4929, "step": 6 }, { "epoch": 0.0006244703153575092, "grad_norm": 62.41658020019531, "learning_rate": 1.7500000000000002e-06, "loss": 13.1206, "step": 7 }, { "epoch": 0.0007136803604085821, "grad_norm": 66.22370910644531, "learning_rate": 2e-06, "loss": 13.2007, "step": 8 }, { "epoch": 0.0008028904054596548, "grad_norm": 66.21946716308594, "learning_rate": 2.25e-06, "loss": 12.3522, "step": 9 }, { "epoch": 0.0008921004505107275, "grad_norm": 65.43058776855469, "learning_rate": 2.5e-06, "loss": 7.9566, "step": 10 }, { "epoch": 0.0009813104955618004, "grad_norm": 54.532962799072266, "learning_rate": 2.75e-06, "loss": 8.8616, "step": 11 }, { "epoch": 0.001070520540612873, "grad_norm": 56.53645706176758, "learning_rate": 3e-06, "loss": 10.3095, "step": 12 }, { "completion_length": 222.9791717529297, "epoch": 0.0011597305856639458, "grad_norm": 78.4708023071289, "learning_rate": 3e-06, "loss": -24.6031, "reward": 0.011666670441627502, "reward_std": 0.524684801697731, "rewards/correctness_reward_func": 0.1250000037252903, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550000049173832, "step": 13, "zero_std_ratio": 0.0 }, { "epoch": 0.0012489406307150184, "grad_norm": 88.30101776123047, "learning_rate": 3e-06, "loss": -18.3145, "step": 14 }, { "epoch": 0.0013381506757660913, "grad_norm": 101.97128295898438, "learning_rate": 3e-06, "loss": -7.4151, "step": 15 }, { "epoch": 0.0014273607208171641, "grad_norm": 91.58382415771484, "learning_rate": 3e-06, "loss": -8.9073, "step": 16 }, { "epoch": 0.0015165707658682367, "grad_norm": 90.10670471191406, "learning_rate": 3e-06, "loss": -13.5176, "step": 17 }, { "epoch": 0.0016057808109193096, "grad_norm": 80.67254638671875, "learning_rate": 3e-06, "loss": -17.2813, "step": 18 }, { "epoch": 0.0016949908559703822, "grad_norm": 75.51331329345703, "learning_rate": 3e-06, "loss": -24.6926, "step": 19 }, { "epoch": 0.001784200901021455, "grad_norm": 78.15167999267578, "learning_rate": 3e-06, "loss": -18.5973, "step": 20 }, { "epoch": 0.0018734109460725277, "grad_norm": 89.70745086669922, "learning_rate": 3e-06, "loss": -7.9364, "step": 21 }, { "epoch": 0.0019626209911236007, "grad_norm": 89.28164672851562, "learning_rate": 3e-06, "loss": -9.434, "step": 22 }, { "epoch": 0.002051831036174673, "grad_norm": 98.30489349365234, "learning_rate": 3e-06, "loss": -14.6494, "step": 23 }, { "epoch": 0.002141041081225746, "grad_norm": 92.3221206665039, "learning_rate": 3e-06, "loss": -17.5654, "step": 24 }, { "completion_length": 224.95834350585938, "epoch": 0.002230251126276819, "grad_norm": 77.60931396484375, "learning_rate": 3e-06, "loss": 12.7299, "reward": 0.08664583414793015, "reward_std": 0.529650554060936, "rewards/correctness_reward_func": 0.1666666679084301, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12168749794363976, "step": 25, "zero_std_ratio": 0.0 }, { "epoch": 0.0023194611713278916, "grad_norm": 55.48664474487305, "learning_rate": 3e-06, "loss": 11.3297, "step": 26 }, { "epoch": 0.0024086712163789645, "grad_norm": 64.88197326660156, "learning_rate": 3e-06, "loss": 7.6398, "step": 27 }, { "epoch": 0.002497881261430037, "grad_norm": 66.41521453857422, "learning_rate": 3e-06, "loss": 10.9742, "step": 28 }, { "epoch": 0.0025870913064811097, "grad_norm": 60.356266021728516, "learning_rate": 3e-06, "loss": 18.3629, "step": 29 }, { "epoch": 0.0026763013515321826, "grad_norm": 67.53816986083984, "learning_rate": 3e-06, "loss": 10.3122, "step": 30 }, { "epoch": 0.0027655113965832554, "grad_norm": 81.81299591064453, "learning_rate": 3e-06, "loss": 12.4031, "step": 31 }, { "epoch": 0.0028547214416343282, "grad_norm": 58.01384735107422, "learning_rate": 3e-06, "loss": 11.3115, "step": 32 }, { "epoch": 0.0029439314866854006, "grad_norm": 60.38798522949219, "learning_rate": 3e-06, "loss": 7.5438, "step": 33 }, { "epoch": 0.0030331415317364735, "grad_norm": 76.68485260009766, "learning_rate": 3e-06, "loss": 9.8314, "step": 34 }, { "epoch": 0.0031223515767875463, "grad_norm": 63.667381286621094, "learning_rate": 3e-06, "loss": 18.0907, "step": 35 }, { "epoch": 0.003211561621838619, "grad_norm": 64.93324279785156, "learning_rate": 3e-06, "loss": 9.6529, "step": 36 }, { "completion_length": 200.9166717529297, "epoch": 0.003300771666889692, "grad_norm": 55.603302001953125, "learning_rate": 3e-06, "loss": -2.1381, "reward": 0.20900000631809235, "reward_std": 0.5408279597759247, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1347499955445528, "step": 37, "zero_std_ratio": 0.0 }, { "epoch": 0.0033899817119407644, "grad_norm": 59.070777893066406, "learning_rate": 3e-06, "loss": -6.7825, "step": 38 }, { "epoch": 0.0034791917569918372, "grad_norm": 73.52457427978516, "learning_rate": 3e-06, "loss": -11.5592, "step": 39 }, { "epoch": 0.00356840180204291, "grad_norm": 68.8139419555664, "learning_rate": 3e-06, "loss": -3.9847, "step": 40 }, { "epoch": 0.003657611847093983, "grad_norm": 74.64259338378906, "learning_rate": 3e-06, "loss": -7.7023, "step": 41 }, { "epoch": 0.0037468218921450553, "grad_norm": 68.76261901855469, "learning_rate": 3e-06, "loss": -11.4536, "step": 42 }, { "epoch": 0.003836031937196128, "grad_norm": 57.10056686401367, "learning_rate": 3e-06, "loss": -3.0195, "step": 43 }, { "epoch": 0.003925241982247201, "grad_norm": 57.4798583984375, "learning_rate": 3e-06, "loss": -7.3677, "step": 44 }, { "epoch": 0.004014452027298274, "grad_norm": 62.251949310302734, "learning_rate": 3e-06, "loss": -12.4481, "step": 45 }, { "epoch": 0.004103662072349346, "grad_norm": 67.0556640625, "learning_rate": 3e-06, "loss": -4.2431, "step": 46 }, { "epoch": 0.0041928721174004195, "grad_norm": 79.22687530517578, "learning_rate": 3e-06, "loss": -8.9896, "step": 47 }, { "epoch": 0.004282082162451492, "grad_norm": 83.6895980834961, "learning_rate": 3e-06, "loss": -12.4645, "step": 48 }, { "completion_length": 186.0, "epoch": 0.004371292207502565, "grad_norm": 187.8282928466797, "learning_rate": 3e-06, "loss": -44.8522, "reward": 0.5453333556652069, "reward_std": 0.9364342093467712, "rewards/correctness_reward_func": 0.4583333358168602, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06924999551847577, "step": 49, "zero_std_ratio": 0.0 }, { "epoch": 0.004460502252553638, "grad_norm": 92.9270248413086, "learning_rate": 3e-06, "loss": -41.2773, "step": 50 }, { "epoch": 0.00454971229760471, "grad_norm": 212.18917846679688, "learning_rate": 3e-06, "loss": -42.3882, "step": 51 }, { "epoch": 0.004638922342655783, "grad_norm": 102.22235870361328, "learning_rate": 3e-06, "loss": -42.8879, "step": 52 }, { "epoch": 0.004728132387706856, "grad_norm": 79.1269302368164, "learning_rate": 3e-06, "loss": -44.708, "step": 53 }, { "epoch": 0.004817342432757929, "grad_norm": 94.53079986572266, "learning_rate": 3e-06, "loss": -41.656, "step": 54 }, { "epoch": 0.004906552477809001, "grad_norm": 91.7303695678711, "learning_rate": 3e-06, "loss": -45.3257, "step": 55 }, { "epoch": 0.004995762522860074, "grad_norm": 92.66773986816406, "learning_rate": 3e-06, "loss": -41.4113, "step": 56 }, { "epoch": 0.005084972567911147, "grad_norm": 123.76467895507812, "learning_rate": 3e-06, "loss": -43.4643, "step": 57 }, { "epoch": 0.0051741826129622194, "grad_norm": 109.21142578125, "learning_rate": 3e-06, "loss": -44.7136, "step": 58 }, { "epoch": 0.005263392658013293, "grad_norm": 83.24272155761719, "learning_rate": 3e-06, "loss": -45.7862, "step": 59 }, { "epoch": 0.005352602703064365, "grad_norm": 94.45966339111328, "learning_rate": 3e-06, "loss": -42.5492, "step": 60 }, { "completion_length": 225.625, "epoch": 0.0054418127481154375, "grad_norm": 125.85611724853516, "learning_rate": 3e-06, "loss": 28.3825, "reward": 0.31822918355464935, "reward_std": 0.9613562524318695, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.1145833320915699, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17135417833924294, "step": 61, "zero_std_ratio": 0.0 }, { "epoch": 0.005531022793166511, "grad_norm": 128.64669799804688, "learning_rate": 3e-06, "loss": 4.8596, "step": 62 }, { "epoch": 0.005620232838217583, "grad_norm": 287.9391784667969, "learning_rate": 3e-06, "loss": 20.5521, "step": 63 }, { "epoch": 0.0057094428832686565, "grad_norm": 111.01509857177734, "learning_rate": 3e-06, "loss": 16.6241, "step": 64 }, { "epoch": 0.005798652928319729, "grad_norm": 123.25679016113281, "learning_rate": 3e-06, "loss": 6.8919, "step": 65 }, { "epoch": 0.005887862973370801, "grad_norm": 115.68987274169922, "learning_rate": 3e-06, "loss": 19.3061, "step": 66 }, { "epoch": 0.0059770730184218746, "grad_norm": 128.9923553466797, "learning_rate": 3e-06, "loss": 27.4792, "step": 67 }, { "epoch": 0.006066283063472947, "grad_norm": 130.64230346679688, "learning_rate": 3e-06, "loss": 3.8702, "step": 68 }, { "epoch": 0.00615549310852402, "grad_norm": 169.2925262451172, "learning_rate": 3e-06, "loss": 19.2163, "step": 69 }, { "epoch": 0.006244703153575093, "grad_norm": 104.88905334472656, "learning_rate": 3e-06, "loss": 14.5854, "step": 70 }, { "epoch": 0.006333913198626165, "grad_norm": 134.32022094726562, "learning_rate": 3e-06, "loss": 5.6117, "step": 71 }, { "epoch": 0.006423123243677238, "grad_norm": 124.52132415771484, "learning_rate": 3e-06, "loss": 18.0908, "step": 72 }, { "completion_length": 203.81250762939453, "epoch": 0.006512333288728311, "grad_norm": 87.01981353759766, "learning_rate": 3e-06, "loss": 14.5961, "reward": 0.25443750619888306, "reward_std": 0.6893003582954407, "rewards/correctness_reward_func": 0.2916666641831398, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13097916916012764, "step": 73, "zero_std_ratio": 0.0 }, { "epoch": 0.006601543333779384, "grad_norm": 83.70246887207031, "learning_rate": 3e-06, "loss": 20.7203, "step": 74 }, { "epoch": 0.006690753378830456, "grad_norm": 80.23466491699219, "learning_rate": 3e-06, "loss": 30.3319, "step": 75 }, { "epoch": 0.006779963423881529, "grad_norm": 74.68209838867188, "learning_rate": 3e-06, "loss": 23.2, "step": 76 }, { "epoch": 0.006869173468932602, "grad_norm": 81.28849029541016, "learning_rate": 3e-06, "loss": 11.7216, "step": 77 }, { "epoch": 0.0069583835139836745, "grad_norm": 85.60411071777344, "learning_rate": 3e-06, "loss": 19.9348, "step": 78 }, { "epoch": 0.007047593559034747, "grad_norm": 95.26403045654297, "learning_rate": 3e-06, "loss": 13.5735, "step": 79 }, { "epoch": 0.00713680360408582, "grad_norm": 81.69352722167969, "learning_rate": 3e-06, "loss": 19.4906, "step": 80 }, { "epoch": 0.0072260136491368926, "grad_norm": 80.9581527709961, "learning_rate": 3e-06, "loss": 29.1989, "step": 81 }, { "epoch": 0.007315223694187966, "grad_norm": 87.37995147705078, "learning_rate": 3e-06, "loss": 23.4541, "step": 82 }, { "epoch": 0.007404433739239038, "grad_norm": 90.7470932006836, "learning_rate": 3e-06, "loss": 10.7907, "step": 83 }, { "epoch": 0.007493643784290111, "grad_norm": 352.26953125, "learning_rate": 3e-06, "loss": 18.1423, "step": 84 }, { "completion_length": 185.6875, "epoch": 0.007582853829341184, "grad_norm": 78.5768051147461, "learning_rate": 3e-06, "loss": -2.0743, "reward": 0.6054166778922081, "reward_std": 0.8349271714687347, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.019583335146307945, "step": 85, "zero_std_ratio": 0.0 }, { "epoch": 0.007672063874392256, "grad_norm": 125.65332794189453, "learning_rate": 3e-06, "loss": -18.0183, "step": 86 }, { "epoch": 0.00776127391944333, "grad_norm": 93.01673889160156, "learning_rate": 3e-06, "loss": -2.9219, "step": 87 }, { "epoch": 0.007850483964494403, "grad_norm": 85.38358306884766, "learning_rate": 3e-06, "loss": -7.3606, "step": 88 }, { "epoch": 0.007939694009545474, "grad_norm": 99.59243774414062, "learning_rate": 3e-06, "loss": -18.9376, "step": 89 }, { "epoch": 0.008028904054596548, "grad_norm": 96.83404541015625, "learning_rate": 3e-06, "loss": -7.9748, "step": 90 }, { "epoch": 0.008118114099647621, "grad_norm": 81.16954803466797, "learning_rate": 3e-06, "loss": -4.2134, "step": 91 }, { "epoch": 0.008207324144698692, "grad_norm": 123.15869140625, "learning_rate": 3e-06, "loss": -19.8823, "step": 92 }, { "epoch": 0.008296534189749766, "grad_norm": 93.05419158935547, "learning_rate": 3e-06, "loss": -4.5813, "step": 93 }, { "epoch": 0.008385744234800839, "grad_norm": 106.2331314086914, "learning_rate": 3e-06, "loss": -8.6969, "step": 94 }, { "epoch": 0.00847495427985191, "grad_norm": 99.65939331054688, "learning_rate": 3e-06, "loss": -21.3275, "step": 95 }, { "epoch": 0.008564164324902984, "grad_norm": 94.40375518798828, "learning_rate": 3e-06, "loss": -9.7937, "step": 96 }, { "completion_length": 217.25000762939453, "epoch": 0.008653374369954057, "grad_norm": 133.5598907470703, "learning_rate": 3e-06, "loss": -68.7329, "reward": 0.6968958526849747, "reward_std": 0.7409922480583191, "rewards/correctness_reward_func": 0.5833333283662796, "rewards/int_reward_func": 0.1979166641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08435416780412197, "step": 97, "zero_std_ratio": 0.0 }, { "epoch": 0.00874258441500513, "grad_norm": 136.60848999023438, "learning_rate": 3e-06, "loss": -74.5256, "step": 98 }, { "epoch": 0.008831794460056202, "grad_norm": 123.70120239257812, "learning_rate": 3e-06, "loss": -59.6774, "step": 99 }, { "epoch": 0.008921004505107275, "grad_norm": 150.22532653808594, "learning_rate": 3e-06, "loss": -69.5624, "step": 100 }, { "epoch": 0.009010214550158348, "grad_norm": 126.68507385253906, "learning_rate": 3e-06, "loss": -62.8973, "step": 101 }, { "epoch": 0.00909942459520942, "grad_norm": 105.47962951660156, "learning_rate": 3e-06, "loss": -61.6182, "step": 102 }, { "epoch": 0.009188634640260493, "grad_norm": 144.26048278808594, "learning_rate": 3e-06, "loss": -70.5109, "step": 103 }, { "epoch": 0.009277844685311567, "grad_norm": 141.22325134277344, "learning_rate": 3e-06, "loss": -76.5479, "step": 104 }, { "epoch": 0.009367054730362638, "grad_norm": 139.37173461914062, "learning_rate": 3e-06, "loss": -62.353, "step": 105 }, { "epoch": 0.009456264775413711, "grad_norm": 150.77801513671875, "learning_rate": 3e-06, "loss": -72.2384, "step": 106 }, { "epoch": 0.009545474820464785, "grad_norm": 138.2374267578125, "learning_rate": 3e-06, "loss": -65.3746, "step": 107 }, { "epoch": 0.009634684865515858, "grad_norm": 132.50453186035156, "learning_rate": 3e-06, "loss": -64.141, "step": 108 }, { "completion_length": 198.125, "epoch": 0.00972389491056693, "grad_norm": 187.5413055419922, "learning_rate": 3e-06, "loss": 44.1489, "reward": 0.7788957953453064, "reward_std": 0.7549726963043213, "rewards/correctness_reward_func": 0.6666666567325592, "rewards/int_reward_func": 0.1770833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06485416181385517, "step": 109, "zero_std_ratio": 0.0 }, { "epoch": 0.009813104955618003, "grad_norm": 138.89434814453125, "learning_rate": 3e-06, "loss": 51.6646, "step": 110 }, { "epoch": 0.009902315000669076, "grad_norm": 128.95484924316406, "learning_rate": 3e-06, "loss": 31.7993, "step": 111 }, { "epoch": 0.009991525045720148, "grad_norm": 126.7931900024414, "learning_rate": 3e-06, "loss": 38.5454, "step": 112 }, { "epoch": 0.01008073509077122, "grad_norm": 125.33599853515625, "learning_rate": 3e-06, "loss": 40.3822, "step": 113 }, { "epoch": 0.010169945135822294, "grad_norm": 139.41482543945312, "learning_rate": 3e-06, "loss": 32.5052, "step": 114 }, { "epoch": 0.010259155180873366, "grad_norm": 169.09432983398438, "learning_rate": 3e-06, "loss": 43.5542, "step": 115 }, { "epoch": 0.010348365225924439, "grad_norm": 133.872802734375, "learning_rate": 3e-06, "loss": 50.3469, "step": 116 }, { "epoch": 0.010437575270975512, "grad_norm": 125.77018737792969, "learning_rate": 3e-06, "loss": 31.112, "step": 117 }, { "epoch": 0.010526785316026585, "grad_norm": 128.32257080078125, "learning_rate": 3e-06, "loss": 36.629, "step": 118 }, { "epoch": 0.010615995361077657, "grad_norm": 124.38401794433594, "learning_rate": 3e-06, "loss": 39.8284, "step": 119 }, { "epoch": 0.01070520540612873, "grad_norm": 138.24668884277344, "learning_rate": 3e-06, "loss": 31.1433, "step": 120 }, { "completion_length": 191.6041717529297, "epoch": 0.010794415451179804, "grad_norm": 199.1646270751953, "learning_rate": 3e-06, "loss": 86.314, "reward": 1.0014583468437195, "reward_std": 0.8148851096630096, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.2291666641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10270833596587181, "step": 121, "zero_std_ratio": 0.0 }, { "epoch": 0.010883625496230875, "grad_norm": 195.7254638671875, "learning_rate": 3e-06, "loss": 76.1289, "step": 122 }, { "epoch": 0.010972835541281948, "grad_norm": 175.13900756835938, "learning_rate": 3e-06, "loss": 86.1448, "step": 123 }, { "epoch": 0.011062045586333022, "grad_norm": 182.21661376953125, "learning_rate": 3e-06, "loss": 90.0805, "step": 124 }, { "epoch": 0.011151255631384093, "grad_norm": 189.17214965820312, "learning_rate": 3e-06, "loss": 76.0951, "step": 125 }, { "epoch": 0.011240465676435166, "grad_norm": 195.55718994140625, "learning_rate": 3e-06, "loss": 89.5242, "step": 126 }, { "epoch": 0.01132967572148624, "grad_norm": 171.1396484375, "learning_rate": 3e-06, "loss": 82.3705, "step": 127 }, { "epoch": 0.011418885766537313, "grad_norm": 189.04995727539062, "learning_rate": 3e-06, "loss": 71.8677, "step": 128 }, { "epoch": 0.011508095811588384, "grad_norm": 162.9297332763672, "learning_rate": 3e-06, "loss": 81.2432, "step": 129 }, { "epoch": 0.011597305856639458, "grad_norm": 173.23104858398438, "learning_rate": 3e-06, "loss": 85.8069, "step": 130 }, { "epoch": 0.011686515901690531, "grad_norm": 162.6637420654297, "learning_rate": 3e-06, "loss": 69.8347, "step": 131 }, { "epoch": 0.011775725946741603, "grad_norm": 190.06675720214844, "learning_rate": 3e-06, "loss": 84.5222, "step": 132 }, { "completion_length": 207.33334350585938, "epoch": 0.011864935991792676, "grad_norm": 147.16957092285156, "learning_rate": 3e-06, "loss": -63.4697, "reward": 0.6791666746139526, "reward_std": 1.0425111949443817, "rewards/correctness_reward_func": 0.5833333134651184, "rewards/int_reward_func": 0.1770833358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08124999818392098, "step": 133, "zero_std_ratio": 0.0 }, { "epoch": 0.011954146036843749, "grad_norm": 154.6719970703125, "learning_rate": 3e-06, "loss": -70.5112, "step": 134 }, { "epoch": 0.01204335608189482, "grad_norm": 137.32408142089844, "learning_rate": 3e-06, "loss": -41.1322, "step": 135 }, { "epoch": 0.012132566126945894, "grad_norm": 126.37704467773438, "learning_rate": 3e-06, "loss": -53.2367, "step": 136 }, { "epoch": 0.012221776171996967, "grad_norm": 152.24891662597656, "learning_rate": 3e-06, "loss": -58.4567, "step": 137 }, { "epoch": 0.01231098621704804, "grad_norm": 116.28028106689453, "learning_rate": 3e-06, "loss": -46.2973, "step": 138 }, { "epoch": 0.012400196262099112, "grad_norm": 152.08795166015625, "learning_rate": 3e-06, "loss": -62.8325, "step": 139 }, { "epoch": 0.012489406307150185, "grad_norm": 146.10671997070312, "learning_rate": 3e-06, "loss": -71.6559, "step": 140 }, { "epoch": 0.012578616352201259, "grad_norm": 149.14556884765625, "learning_rate": 3e-06, "loss": -42.1534, "step": 141 }, { "epoch": 0.01266782639725233, "grad_norm": 151.06182861328125, "learning_rate": 3e-06, "loss": -55.6968, "step": 142 }, { "epoch": 0.012757036442303403, "grad_norm": 145.29530334472656, "learning_rate": 3e-06, "loss": -60.2759, "step": 143 }, { "epoch": 0.012846246487354477, "grad_norm": 124.00696563720703, "learning_rate": 3e-06, "loss": -48.5856, "step": 144 }, { "completion_length": 200.14583587646484, "epoch": 0.012935456532405548, "grad_norm": 104.97675323486328, "learning_rate": 3e-06, "loss": -22.2294, "reward": 0.21922918409109116, "reward_std": 0.6296879947185516, "rewards/correctness_reward_func": 0.2916666716337204, "rewards/int_reward_func": 0.062499999068677425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13493750616908073, "step": 145, "zero_std_ratio": 0.0 }, { "epoch": 0.013024666577456621, "grad_norm": 83.18937683105469, "learning_rate": 3e-06, "loss": -22.3765, "step": 146 }, { "epoch": 0.013113876622507695, "grad_norm": 96.22801971435547, "learning_rate": 3e-06, "loss": -22.5564, "step": 147 }, { "epoch": 0.013203086667558768, "grad_norm": 102.87374877929688, "learning_rate": 3e-06, "loss": -24.9001, "step": 148 }, { "epoch": 0.01329229671260984, "grad_norm": 110.96674346923828, "learning_rate": 3e-06, "loss": -18.8972, "step": 149 }, { "epoch": 0.013381506757660913, "grad_norm": 91.87604522705078, "learning_rate": 3e-06, "loss": -18.1615, "step": 150 }, { "epoch": 0.013470716802711986, "grad_norm": 88.4422836303711, "learning_rate": 3e-06, "loss": -23.0431, "step": 151 }, { "epoch": 0.013559926847763058, "grad_norm": 83.86327362060547, "learning_rate": 3e-06, "loss": -23.25, "step": 152 }, { "epoch": 0.01364913689281413, "grad_norm": 82.81922149658203, "learning_rate": 3e-06, "loss": -23.1331, "step": 153 }, { "epoch": 0.013738346937865204, "grad_norm": 104.8452377319336, "learning_rate": 3e-06, "loss": -26.8428, "step": 154 }, { "epoch": 0.013827556982916276, "grad_norm": 92.94257354736328, "learning_rate": 3e-06, "loss": -20.0667, "step": 155 }, { "epoch": 0.013916767027967349, "grad_norm": 84.95638275146484, "learning_rate": 3e-06, "loss": -19.1472, "step": 156 }, { "completion_length": 231.7916717529297, "epoch": 0.014005977073018422, "grad_norm": 139.96688842773438, "learning_rate": 3e-06, "loss": 21.1797, "reward": 0.6720625460147858, "reward_std": 0.9181468784809113, "rewards/correctness_reward_func": 0.5833333432674408, "rewards/int_reward_func": 0.2187500074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13002083078026772, "step": 157, "zero_std_ratio": 0.0 }, { "epoch": 0.014095187118069494, "grad_norm": 109.121337890625, "learning_rate": 3e-06, "loss": 10.5428, "step": 158 }, { "epoch": 0.014184397163120567, "grad_norm": 94.9039306640625, "learning_rate": 3e-06, "loss": 4.8126, "step": 159 }, { "epoch": 0.01427360720817164, "grad_norm": 109.7251968383789, "learning_rate": 3e-06, "loss": 5.2961, "step": 160 }, { "epoch": 0.014362817253222714, "grad_norm": 103.42703247070312, "learning_rate": 3e-06, "loss": 4.0648, "step": 161 }, { "epoch": 0.014452027298273785, "grad_norm": 127.93770599365234, "learning_rate": 3e-06, "loss": 7.0101, "step": 162 }, { "epoch": 0.014541237343324858, "grad_norm": 145.8150634765625, "learning_rate": 3e-06, "loss": 18.3559, "step": 163 }, { "epoch": 0.014630447388375932, "grad_norm": 116.2653579711914, "learning_rate": 3e-06, "loss": 8.3424, "step": 164 }, { "epoch": 0.014719657433427003, "grad_norm": 104.55130767822266, "learning_rate": 3e-06, "loss": 2.084, "step": 165 }, { "epoch": 0.014808867478478076, "grad_norm": 114.84294128417969, "learning_rate": 3e-06, "loss": 2.2571, "step": 166 }, { "epoch": 0.01489807752352915, "grad_norm": 99.8189468383789, "learning_rate": 3e-06, "loss": 0.9219, "step": 167 }, { "epoch": 0.014987287568580221, "grad_norm": 142.80715942382812, "learning_rate": 3e-06, "loss": 3.505, "step": 168 }, { "completion_length": 214.77083587646484, "epoch": 0.015076497613631295, "grad_norm": 114.3720703125, "learning_rate": 3e-06, "loss": -32.1329, "reward": 1.0511458218097687, "reward_std": 1.0028848350048065, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17802083492279053, "step": 169, "zero_std_ratio": 0.0 }, { "epoch": 0.015165707658682368, "grad_norm": 100.3597412109375, "learning_rate": 3e-06, "loss": -38.7511, "step": 170 }, { "epoch": 0.015254917703733441, "grad_norm": 108.30574035644531, "learning_rate": 3e-06, "loss": -46.3083, "step": 171 }, { "epoch": 0.015344127748784513, "grad_norm": 116.34545135498047, "learning_rate": 3e-06, "loss": -39.7363, "step": 172 }, { "epoch": 0.015433337793835586, "grad_norm": 113.52851104736328, "learning_rate": 3e-06, "loss": -34.5686, "step": 173 }, { "epoch": 0.01552254783888666, "grad_norm": 110.65509796142578, "learning_rate": 3e-06, "loss": -32.8796, "step": 174 }, { "epoch": 0.01561175788393773, "grad_norm": 107.06590270996094, "learning_rate": 3e-06, "loss": -32.6552, "step": 175 }, { "epoch": 0.015700967928988806, "grad_norm": 100.2861557006836, "learning_rate": 3e-06, "loss": -39.6106, "step": 176 }, { "epoch": 0.015790177974039876, "grad_norm": 107.69467163085938, "learning_rate": 3e-06, "loss": -46.9244, "step": 177 }, { "epoch": 0.01587938801909095, "grad_norm": 96.8420181274414, "learning_rate": 3e-06, "loss": -40.93, "step": 178 }, { "epoch": 0.015968598064142022, "grad_norm": 113.12389373779297, "learning_rate": 3e-06, "loss": -37.0258, "step": 179 }, { "epoch": 0.016057808109193095, "grad_norm": 116.10971069335938, "learning_rate": 3e-06, "loss": -34.9046, "step": 180 }, { "completion_length": 218.64583587646484, "epoch": 0.01614701815424417, "grad_norm": 307.6673889160156, "learning_rate": 3e-06, "loss": -29.0457, "reward": 1.4360832571983337, "reward_std": 1.0610616505146027, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15766665898263454, "step": 181, "zero_std_ratio": 0.0 }, { "epoch": 0.016236228199295242, "grad_norm": 125.99212646484375, "learning_rate": 3e-06, "loss": 1.016, "step": 182 }, { "epoch": 0.01632543824434631, "grad_norm": 112.21588897705078, "learning_rate": 3e-06, "loss": -17.115, "step": 183 }, { "epoch": 0.016414648289397385, "grad_norm": 118.06622314453125, "learning_rate": 3e-06, "loss": -0.4864, "step": 184 }, { "epoch": 0.016503858334448458, "grad_norm": 116.36631774902344, "learning_rate": 3e-06, "loss": 3.4437, "step": 185 }, { "epoch": 0.01659306837949953, "grad_norm": 124.60052490234375, "learning_rate": 3e-06, "loss": -27.5515, "step": 186 }, { "epoch": 0.016682278424550605, "grad_norm": 160.65628051757812, "learning_rate": 3e-06, "loss": -29.3863, "step": 187 }, { "epoch": 0.016771488469601678, "grad_norm": 127.9763412475586, "learning_rate": 3e-06, "loss": 0.7423, "step": 188 }, { "epoch": 0.01686069851465275, "grad_norm": 116.69316101074219, "learning_rate": 3e-06, "loss": -18.6518, "step": 189 }, { "epoch": 0.01694990855970382, "grad_norm": 114.2183609008789, "learning_rate": 3e-06, "loss": -0.87, "step": 190 }, { "epoch": 0.017039118604754894, "grad_norm": 126.1614761352539, "learning_rate": 3e-06, "loss": 2.8213, "step": 191 }, { "epoch": 0.017128328649805968, "grad_norm": 134.43527221679688, "learning_rate": 3e-06, "loss": -28.8518, "step": 192 }, { "completion_length": 247.08334350585938, "epoch": 0.01721753869485704, "grad_norm": 111.3412094116211, "learning_rate": 3e-06, "loss": -7.8814, "reward": 1.0157291293144226, "reward_std": 0.7945153564214706, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1926041767001152, "step": 193, "zero_std_ratio": 0.0 }, { "epoch": 0.017306748739908114, "grad_norm": 116.42599487304688, "learning_rate": 3e-06, "loss": -15.1245, "step": 194 }, { "epoch": 0.017395958784959187, "grad_norm": 136.37391662597656, "learning_rate": 3e-06, "loss": -9.2426, "step": 195 }, { "epoch": 0.01748516883001026, "grad_norm": 97.36872863769531, "learning_rate": 3e-06, "loss": -10.8671, "step": 196 }, { "epoch": 0.01757437887506133, "grad_norm": 125.0397720336914, "learning_rate": 3e-06, "loss": -7.0755, "step": 197 }, { "epoch": 0.017663588920112404, "grad_norm": 171.17971801757812, "learning_rate": 3e-06, "loss": -16.797, "step": 198 }, { "epoch": 0.017752798965163477, "grad_norm": 100.81266021728516, "learning_rate": 3e-06, "loss": -8.4577, "step": 199 }, { "epoch": 0.01784200901021455, "grad_norm": 127.79389953613281, "learning_rate": 3e-06, "loss": -16.3874, "step": 200 }, { "epoch": 0.017931219055265624, "grad_norm": 131.9748077392578, "learning_rate": 3e-06, "loss": -10.7973, "step": 201 }, { "epoch": 0.018020429100316697, "grad_norm": 100.95606231689453, "learning_rate": 3e-06, "loss": -12.0026, "step": 202 }, { "epoch": 0.018109639145367767, "grad_norm": 131.19261169433594, "learning_rate": 3e-06, "loss": -8.3155, "step": 203 }, { "epoch": 0.01819884919041884, "grad_norm": 164.74656677246094, "learning_rate": 3e-06, "loss": -18.9275, "step": 204 }, { "completion_length": 191.7291717529297, "epoch": 0.018288059235469913, "grad_norm": 150.95191955566406, "learning_rate": 3e-06, "loss": 50.6719, "reward": 1.3118958473205566, "reward_std": 0.8902758955955505, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03185417060740292, "step": 205, "zero_std_ratio": 0.0 }, { "epoch": 0.018377269280520987, "grad_norm": 128.34344482421875, "learning_rate": 3e-06, "loss": 32.3302, "step": 206 }, { "epoch": 0.01846647932557206, "grad_norm": 136.15789794921875, "learning_rate": 3e-06, "loss": 33.8857, "step": 207 }, { "epoch": 0.018555689370623133, "grad_norm": 140.50901794433594, "learning_rate": 3e-06, "loss": 28.924, "step": 208 }, { "epoch": 0.018644899415674206, "grad_norm": 168.0647430419922, "learning_rate": 3e-06, "loss": 31.3019, "step": 209 }, { "epoch": 0.018734109460725276, "grad_norm": 133.79208374023438, "learning_rate": 3e-06, "loss": 31.6401, "step": 210 }, { "epoch": 0.01882331950577635, "grad_norm": 161.34898376464844, "learning_rate": 3e-06, "loss": 49.2047, "step": 211 }, { "epoch": 0.018912529550827423, "grad_norm": 129.22007751464844, "learning_rate": 3e-06, "loss": 30.2, "step": 212 }, { "epoch": 0.019001739595878496, "grad_norm": 143.37449645996094, "learning_rate": 3e-06, "loss": 31.2762, "step": 213 }, { "epoch": 0.01909094964092957, "grad_norm": 140.57894897460938, "learning_rate": 3e-06, "loss": 26.7715, "step": 214 }, { "epoch": 0.019180159685980643, "grad_norm": 148.71348571777344, "learning_rate": 3e-06, "loss": 28.729, "step": 215 }, { "epoch": 0.019269369731031716, "grad_norm": 137.0448455810547, "learning_rate": 3e-06, "loss": 29.3048, "step": 216 }, { "completion_length": 241.0, "epoch": 0.019358579776082786, "grad_norm": 167.66650390625, "learning_rate": 3e-06, "loss": -43.2087, "reward": 1.6041667461395264, "reward_std": 0.9945478439331055, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1979166641831398, "step": 217, "zero_std_ratio": 0.0 }, { "epoch": 0.01944778982113386, "grad_norm": 168.0265350341797, "learning_rate": 3e-06, "loss": -45.6767, "step": 218 }, { "epoch": 0.019536999866184932, "grad_norm": 148.4340362548828, "learning_rate": 3e-06, "loss": -30.5571, "step": 219 }, { "epoch": 0.019626209911236005, "grad_norm": 139.6564178466797, "learning_rate": 3e-06, "loss": -44.7743, "step": 220 }, { "epoch": 0.01971541995628708, "grad_norm": 147.22129821777344, "learning_rate": 3e-06, "loss": -41.9365, "step": 221 }, { "epoch": 0.019804630001338152, "grad_norm": 190.81561279296875, "learning_rate": 3e-06, "loss": -48.2229, "step": 222 }, { "epoch": 0.019893840046389222, "grad_norm": 165.86917114257812, "learning_rate": 3e-06, "loss": -43.317, "step": 223 }, { "epoch": 0.019983050091440295, "grad_norm": 162.9475555419922, "learning_rate": 3e-06, "loss": -48.1878, "step": 224 }, { "epoch": 0.02007226013649137, "grad_norm": 179.08360290527344, "learning_rate": 3e-06, "loss": -33.3052, "step": 225 }, { "epoch": 0.02016147018154244, "grad_norm": 133.29290771484375, "learning_rate": 3e-06, "loss": -45.8993, "step": 226 }, { "epoch": 0.020250680226593515, "grad_norm": 155.86611938476562, "learning_rate": 3e-06, "loss": -43.9261, "step": 227 }, { "epoch": 0.020339890271644588, "grad_norm": 154.34974670410156, "learning_rate": 3e-06, "loss": -50.4381, "step": 228 }, { "completion_length": 239.43750762939453, "epoch": 0.02042910031669566, "grad_norm": 105.5196304321289, "learning_rate": 3e-06, "loss": 0.4059, "reward": 1.5316042304039001, "reward_std": 0.8583633303642273, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19756250455975533, "step": 229, "zero_std_ratio": 0.0 }, { "epoch": 0.02051831036174673, "grad_norm": 119.5712890625, "learning_rate": 3e-06, "loss": -3.2594, "step": 230 }, { "epoch": 0.020607520406797804, "grad_norm": 128.1366424560547, "learning_rate": 3e-06, "loss": -12.0605, "step": 231 }, { "epoch": 0.020696730451848878, "grad_norm": 126.55559539794922, "learning_rate": 3e-06, "loss": -15.4799, "step": 232 }, { "epoch": 0.02078594049689995, "grad_norm": 137.93882751464844, "learning_rate": 3e-06, "loss": -18.5312, "step": 233 }, { "epoch": 0.020875150541951024, "grad_norm": 108.0162124633789, "learning_rate": 3e-06, "loss": -11.7573, "step": 234 }, { "epoch": 0.020964360587002098, "grad_norm": 118.95193481445312, "learning_rate": 3e-06, "loss": -1.1434, "step": 235 }, { "epoch": 0.02105357063205317, "grad_norm": 126.50416564941406, "learning_rate": 3e-06, "loss": -3.7423, "step": 236 }, { "epoch": 0.02114278067710424, "grad_norm": 130.68190002441406, "learning_rate": 3e-06, "loss": -14.5207, "step": 237 }, { "epoch": 0.021231990722155314, "grad_norm": 129.162109375, "learning_rate": 3e-06, "loss": -16.3237, "step": 238 }, { "epoch": 0.021321200767206387, "grad_norm": 145.95396423339844, "learning_rate": 3e-06, "loss": -20.6294, "step": 239 }, { "epoch": 0.02141041081225746, "grad_norm": 107.8385009765625, "learning_rate": 3e-06, "loss": -14.0773, "step": 240 }, { "completion_length": 206.9791717529297, "epoch": 0.021499620857308534, "grad_norm": 102.38019561767578, "learning_rate": 3e-06, "loss": -32.3486, "reward": 1.035479187965393, "reward_std": 0.7589404881000519, "rewards/correctness_reward_func": 0.8750000149011612, "rewards/int_reward_func": 0.3020833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14160417020320892, "step": 241, "zero_std_ratio": 0.0 }, { "epoch": 0.021588830902359607, "grad_norm": 110.24679565429688, "learning_rate": 3e-06, "loss": -38.6199, "step": 242 }, { "epoch": 0.021678040947410677, "grad_norm": 118.22930145263672, "learning_rate": 3e-06, "loss": -52.9139, "step": 243 }, { "epoch": 0.02176725099246175, "grad_norm": 118.6080322265625, "learning_rate": 3e-06, "loss": -43.3805, "step": 244 }, { "epoch": 0.021856461037512823, "grad_norm": 106.9905776977539, "learning_rate": 3e-06, "loss": -36.7945, "step": 245 }, { "epoch": 0.021945671082563897, "grad_norm": 111.37010955810547, "learning_rate": 3e-06, "loss": -36.4452, "step": 246 }, { "epoch": 0.02203488112761497, "grad_norm": 104.93065643310547, "learning_rate": 3e-06, "loss": -34.2096, "step": 247 }, { "epoch": 0.022124091172666043, "grad_norm": 117.96737670898438, "learning_rate": 3e-06, "loss": -40.621, "step": 248 }, { "epoch": 0.022213301217717116, "grad_norm": 118.701904296875, "learning_rate": 3e-06, "loss": -54.4138, "step": 249 }, { "epoch": 0.022302511262768186, "grad_norm": 118.43307495117188, "learning_rate": 3e-06, "loss": -45.0393, "step": 250 }, { "epoch": 0.02239172130781926, "grad_norm": 114.41901397705078, "learning_rate": 3e-06, "loss": -37.6304, "step": 251 }, { "epoch": 0.022480931352870333, "grad_norm": 123.03970336914062, "learning_rate": 3e-06, "loss": -39.0638, "step": 252 }, { "completion_length": 211.20833587646484, "epoch": 0.022570141397921406, "grad_norm": 139.677734375, "learning_rate": 3e-06, "loss": -27.6756, "reward": 1.5570417046546936, "reward_std": 1.1208258867263794, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14087500423192978, "step": 253, "zero_std_ratio": 0.0 }, { "epoch": 0.02265935144297248, "grad_norm": 145.56021118164062, "learning_rate": 3e-06, "loss": -40.3289, "step": 254 }, { "epoch": 0.022748561488023553, "grad_norm": 138.8564453125, "learning_rate": 3e-06, "loss": -39.1766, "step": 255 }, { "epoch": 0.022837771533074626, "grad_norm": 229.50186157226562, "learning_rate": 3e-06, "loss": -43.9568, "step": 256 }, { "epoch": 0.022926981578125696, "grad_norm": 138.42791748046875, "learning_rate": 3e-06, "loss": -52.4297, "step": 257 }, { "epoch": 0.02301619162317677, "grad_norm": 147.58364868164062, "learning_rate": 3e-06, "loss": -53.5477, "step": 258 }, { "epoch": 0.023105401668227842, "grad_norm": 140.5048828125, "learning_rate": 3e-06, "loss": -28.1418, "step": 259 }, { "epoch": 0.023194611713278915, "grad_norm": 139.11508178710938, "learning_rate": 3e-06, "loss": -42.7612, "step": 260 }, { "epoch": 0.02328382175832999, "grad_norm": 146.18580627441406, "learning_rate": 3e-06, "loss": -39.909, "step": 261 }, { "epoch": 0.023373031803381062, "grad_norm": 264.3643493652344, "learning_rate": 3e-06, "loss": -46.2595, "step": 262 }, { "epoch": 0.023462241848432132, "grad_norm": 154.1084747314453, "learning_rate": 3e-06, "loss": -55.2424, "step": 263 }, { "epoch": 0.023551451893483205, "grad_norm": 156.28662109375, "learning_rate": 3e-06, "loss": -55.6531, "step": 264 }, { "completion_length": 227.14583587646484, "epoch": 0.02364066193853428, "grad_norm": 115.90379333496094, "learning_rate": 3e-06, "loss": -30.7492, "reward": 1.6066043376922607, "reward_std": 0.8875448107719421, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.3958333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": -0.09131250530481339, "step": 265, "zero_std_ratio": 0.0 }, { "epoch": 0.02372987198358535, "grad_norm": 113.86587524414062, "learning_rate": 3e-06, "loss": -29.6989, "step": 266 }, { "epoch": 0.023819082028636425, "grad_norm": 110.4273681640625, "learning_rate": 3e-06, "loss": -32.0614, "step": 267 }, { "epoch": 0.023908292073687498, "grad_norm": 111.84119415283203, "learning_rate": 3e-06, "loss": -31.9073, "step": 268 }, { "epoch": 0.02399750211873857, "grad_norm": 103.93081665039062, "learning_rate": 3e-06, "loss": -22.3506, "step": 269 }, { "epoch": 0.02408671216378964, "grad_norm": 120.32383728027344, "learning_rate": 3e-06, "loss": -28.5629, "step": 270 }, { "epoch": 0.024175922208840715, "grad_norm": 124.92536163330078, "learning_rate": 3e-06, "loss": -33.0776, "step": 271 }, { "epoch": 0.024265132253891788, "grad_norm": 119.54340362548828, "learning_rate": 3e-06, "loss": -31.6735, "step": 272 }, { "epoch": 0.02435434229894286, "grad_norm": 128.8444061279297, "learning_rate": 3e-06, "loss": -33.8033, "step": 273 }, { "epoch": 0.024443552343993934, "grad_norm": 123.08969116210938, "learning_rate": 3e-06, "loss": -34.4538, "step": 274 }, { "epoch": 0.024532762389045008, "grad_norm": 111.98983001708984, "learning_rate": 3e-06, "loss": -24.6449, "step": 275 }, { "epoch": 0.02462197243409608, "grad_norm": 123.31842041015625, "learning_rate": 3e-06, "loss": -31.1417, "step": 276 }, { "completion_length": 211.20833587646484, "epoch": 0.02471118247914715, "grad_norm": 83.42295837402344, "learning_rate": 3e-06, "loss": -49.3896, "reward": 1.6565834283828735, "reward_std": 0.7390342950820923, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13508333638310432, "step": 277, "zero_std_ratio": 0.0 }, { "epoch": 0.024800392524198224, "grad_norm": 78.70240783691406, "learning_rate": 3e-06, "loss": -60.2538, "step": 278 }, { "epoch": 0.024889602569249297, "grad_norm": 87.03772735595703, "learning_rate": 3e-06, "loss": -54.701, "step": 279 }, { "epoch": 0.02497881261430037, "grad_norm": 105.03215789794922, "learning_rate": 3e-06, "loss": -50.647, "step": 280 }, { "epoch": 0.025068022659351444, "grad_norm": 94.19722747802734, "learning_rate": 3e-06, "loss": -53.7356, "step": 281 }, { "epoch": 0.025157232704402517, "grad_norm": 71.46943664550781, "learning_rate": 3e-06, "loss": -54.5847, "step": 282 }, { "epoch": 0.025246442749453587, "grad_norm": 90.4788589477539, "learning_rate": 3e-06, "loss": -50.5539, "step": 283 }, { "epoch": 0.02533565279450466, "grad_norm": 74.81779479980469, "learning_rate": 3e-06, "loss": -61.3813, "step": 284 }, { "epoch": 0.025424862839555733, "grad_norm": 85.80409240722656, "learning_rate": 3e-06, "loss": -55.7379, "step": 285 }, { "epoch": 0.025514072884606807, "grad_norm": 135.24191284179688, "learning_rate": 3e-06, "loss": -52.1614, "step": 286 }, { "epoch": 0.02560328292965788, "grad_norm": 94.01042175292969, "learning_rate": 3e-06, "loss": -55.4857, "step": 287 }, { "epoch": 0.025692492974708953, "grad_norm": 72.32071685791016, "learning_rate": 3e-06, "loss": -56.3565, "step": 288 }, { "completion_length": 221.45833587646484, "epoch": 0.025781703019760027, "grad_norm": 225.52276611328125, "learning_rate": 3e-06, "loss": -67.7081, "reward": 1.960687518119812, "reward_std": 0.8211362063884735, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11222916468977928, "step": 289, "zero_std_ratio": 0.0 }, { "epoch": 0.025870913064811096, "grad_norm": 239.94651794433594, "learning_rate": 3e-06, "loss": -66.0587, "step": 290 }, { "epoch": 0.02596012310986217, "grad_norm": 173.2037353515625, "learning_rate": 3e-06, "loss": -59.416, "step": 291 }, { "epoch": 0.026049333154913243, "grad_norm": 228.50621032714844, "learning_rate": 3e-06, "loss": -70.6059, "step": 292 }, { "epoch": 0.026138543199964316, "grad_norm": 213.36802673339844, "learning_rate": 3e-06, "loss": -68.8733, "step": 293 }, { "epoch": 0.02622775324501539, "grad_norm": 389.8759460449219, "learning_rate": 3e-06, "loss": -108.725, "step": 294 }, { "epoch": 0.026316963290066463, "grad_norm": 241.96009826660156, "learning_rate": 3e-06, "loss": -73.0107, "step": 295 }, { "epoch": 0.026406173335117536, "grad_norm": 282.705322265625, "learning_rate": 3e-06, "loss": -71.4601, "step": 296 }, { "epoch": 0.026495383380168606, "grad_norm": 182.99859619140625, "learning_rate": 3e-06, "loss": -62.8503, "step": 297 }, { "epoch": 0.02658459342521968, "grad_norm": 237.8432159423828, "learning_rate": 3e-06, "loss": -76.095, "step": 298 }, { "epoch": 0.026673803470270752, "grad_norm": 224.10140991210938, "learning_rate": 3e-06, "loss": -71.9696, "step": 299 }, { "epoch": 0.026763013515321826, "grad_norm": 401.25421142578125, "learning_rate": 3e-06, "loss": -119.7468, "step": 300 }, { "completion_length": 181.00000762939453, "epoch": 0.0268522235603729, "grad_norm": 102.42095184326172, "learning_rate": 3e-06, "loss": 11.7663, "reward": 1.6927291750907898, "reward_std": 0.8399400115013123, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0468541644513607, "step": 301, "zero_std_ratio": 0.0 }, { "epoch": 0.026941433605423972, "grad_norm": 97.59688568115234, "learning_rate": 3e-06, "loss": 4.3671, "step": 302 }, { "epoch": 0.027030643650475042, "grad_norm": 119.19691467285156, "learning_rate": 3e-06, "loss": 3.0122, "step": 303 }, { "epoch": 0.027119853695526115, "grad_norm": 102.54327392578125, "learning_rate": 3e-06, "loss": 5.7653, "step": 304 }, { "epoch": 0.02720906374057719, "grad_norm": 127.24678802490234, "learning_rate": 3e-06, "loss": 9.0808, "step": 305 }, { "epoch": 0.02729827378562826, "grad_norm": 115.35128784179688, "learning_rate": 3e-06, "loss": 9.7375, "step": 306 }, { "epoch": 0.027387483830679335, "grad_norm": 109.96597290039062, "learning_rate": 3e-06, "loss": 10.9794, "step": 307 }, { "epoch": 0.02747669387573041, "grad_norm": 116.67013549804688, "learning_rate": 3e-06, "loss": 4.0605, "step": 308 }, { "epoch": 0.02756590392078148, "grad_norm": 100.0082015991211, "learning_rate": 3e-06, "loss": 2.0719, "step": 309 }, { "epoch": 0.02765511396583255, "grad_norm": 103.2455062866211, "learning_rate": 3e-06, "loss": 3.753, "step": 310 }, { "epoch": 0.027744324010883625, "grad_norm": 139.74317932128906, "learning_rate": 3e-06, "loss": 7.435, "step": 311 }, { "epoch": 0.027833534055934698, "grad_norm": 126.05006408691406, "learning_rate": 3e-06, "loss": 8.1927, "step": 312 }, { "completion_length": 187.87500762939453, "epoch": 0.02792274410098577, "grad_norm": 182.04180908203125, "learning_rate": 3e-06, "loss": -90.2214, "reward": 1.3959375023841858, "reward_std": 0.7920421957969666, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09364583343267441, "step": 313, "zero_std_ratio": 0.0 }, { "epoch": 0.028011954146036844, "grad_norm": 238.57090759277344, "learning_rate": 3e-06, "loss": -100.422, "step": 314 }, { "epoch": 0.028101164191087918, "grad_norm": 235.39544677734375, "learning_rate": 3e-06, "loss": -102.3354, "step": 315 }, { "epoch": 0.028190374236138988, "grad_norm": 223.8190460205078, "learning_rate": 3e-06, "loss": -109.9957, "step": 316 }, { "epoch": 0.02827958428119006, "grad_norm": 225.00672912597656, "learning_rate": 3e-06, "loss": -109.9375, "step": 317 }, { "epoch": 0.028368794326241134, "grad_norm": 247.57774353027344, "learning_rate": 3e-06, "loss": -125.4302, "step": 318 }, { "epoch": 0.028458004371292207, "grad_norm": 193.24212646484375, "learning_rate": 3e-06, "loss": -93.1797, "step": 319 }, { "epoch": 0.02854721441634328, "grad_norm": 264.4795227050781, "learning_rate": 3e-06, "loss": -104.7149, "step": 320 }, { "epoch": 0.028636424461394354, "grad_norm": 226.05810546875, "learning_rate": 3e-06, "loss": -107.5763, "step": 321 }, { "epoch": 0.028725634506445427, "grad_norm": 239.6378173828125, "learning_rate": 3e-06, "loss": -115.9954, "step": 322 }, { "epoch": 0.028814844551496497, "grad_norm": 240.8443145751953, "learning_rate": 3e-06, "loss": -117.6999, "step": 323 }, { "epoch": 0.02890405459654757, "grad_norm": 261.65643310546875, "learning_rate": 3e-06, "loss": -132.8027, "step": 324 }, { "completion_length": 189.8541717529297, "epoch": 0.028993264641598643, "grad_norm": 168.10452270507812, "learning_rate": 3e-06, "loss": 12.7588, "reward": 1.7772499918937683, "reward_std": 0.9346717596054077, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.045666664838790894, "step": 325, "zero_std_ratio": 0.0 }, { "epoch": 0.029082474686649717, "grad_norm": 182.863037109375, "learning_rate": 3e-06, "loss": 0.3851, "step": 326 }, { "epoch": 0.02917168473170079, "grad_norm": 214.54574584960938, "learning_rate": 3e-06, "loss": 17.6492, "step": 327 }, { "epoch": 0.029260894776751863, "grad_norm": 187.80931091308594, "learning_rate": 3e-06, "loss": 7.692, "step": 328 }, { "epoch": 0.029350104821802937, "grad_norm": 195.0843505859375, "learning_rate": 3e-06, "loss": 12.8044, "step": 329 }, { "epoch": 0.029439314866854006, "grad_norm": 168.82028198242188, "learning_rate": 3e-06, "loss": -7.5147, "step": 330 }, { "epoch": 0.02952852491190508, "grad_norm": 176.14859008789062, "learning_rate": 3e-06, "loss": 10.9965, "step": 331 }, { "epoch": 0.029617734956956153, "grad_norm": 202.02247619628906, "learning_rate": 3e-06, "loss": -1.8138, "step": 332 }, { "epoch": 0.029706945002007226, "grad_norm": 216.37252807617188, "learning_rate": 3e-06, "loss": 15.5179, "step": 333 }, { "epoch": 0.0297961550470583, "grad_norm": 200.23558044433594, "learning_rate": 3e-06, "loss": 5.0549, "step": 334 }, { "epoch": 0.029885365092109373, "grad_norm": 177.7020263671875, "learning_rate": 3e-06, "loss": 11.199, "step": 335 }, { "epoch": 0.029974575137160443, "grad_norm": 170.23106384277344, "learning_rate": 3e-06, "loss": -10.9367, "step": 336 }, { "completion_length": 224.9166717529297, "epoch": 0.030063785182211516, "grad_norm": 127.8658218383789, "learning_rate": 3e-06, "loss": -59.3587, "reward": 1.6053959131240845, "reward_std": 0.5731277614831924, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550208330154419, "step": 337, "zero_std_ratio": 0.0 }, { "epoch": 0.03015299522726259, "grad_norm": 133.41494750976562, "learning_rate": 3e-06, "loss": -53.4241, "step": 338 }, { "epoch": 0.030242205272313662, "grad_norm": 170.7308807373047, "learning_rate": 3e-06, "loss": -65.4722, "step": 339 }, { "epoch": 0.030331415317364736, "grad_norm": 172.28118896484375, "learning_rate": 3e-06, "loss": -53.201, "step": 340 }, { "epoch": 0.03042062536241581, "grad_norm": 118.70462799072266, "learning_rate": 3e-06, "loss": -50.8363, "step": 341 }, { "epoch": 0.030509835407466882, "grad_norm": 143.119384765625, "learning_rate": 3e-06, "loss": -60.764, "step": 342 }, { "epoch": 0.030599045452517952, "grad_norm": 143.7277374267578, "learning_rate": 3e-06, "loss": -62.8186, "step": 343 }, { "epoch": 0.030688255497569025, "grad_norm": 157.5625, "learning_rate": 3e-06, "loss": -57.4741, "step": 344 }, { "epoch": 0.0307774655426201, "grad_norm": 191.64804077148438, "learning_rate": 3e-06, "loss": -71.2662, "step": 345 }, { "epoch": 0.030866675587671172, "grad_norm": 206.0039520263672, "learning_rate": 3e-06, "loss": -56.9883, "step": 346 }, { "epoch": 0.030955885632722245, "grad_norm": 132.1703643798828, "learning_rate": 3e-06, "loss": -54.0822, "step": 347 }, { "epoch": 0.03104509567777332, "grad_norm": 144.338623046875, "learning_rate": 3e-06, "loss": -66.1545, "step": 348 }, { "completion_length": 161.1041717529297, "epoch": 0.03113430572282439, "grad_norm": 176.29396057128906, "learning_rate": 3e-06, "loss": 2.568, "reward": 1.8858751058578491, "reward_std": 0.5198497474193573, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021291667595505714, "step": 349, "zero_std_ratio": 0.0 }, { "epoch": 0.03122351576787546, "grad_norm": 162.1043701171875, "learning_rate": 3e-06, "loss": 11.6461, "step": 350 }, { "epoch": 0.03131272581292654, "grad_norm": 147.42918395996094, "learning_rate": 3e-06, "loss": 19.3425, "step": 351 }, { "epoch": 0.03140193585797761, "grad_norm": 146.65992736816406, "learning_rate": 3e-06, "loss": 4.1848, "step": 352 }, { "epoch": 0.03149114590302868, "grad_norm": 137.17474365234375, "learning_rate": 3e-06, "loss": 9.5202, "step": 353 }, { "epoch": 0.03158035594807975, "grad_norm": 176.24244689941406, "learning_rate": 3e-06, "loss": 24.3653, "step": 354 }, { "epoch": 0.031669565993130824, "grad_norm": 176.59144592285156, "learning_rate": 3e-06, "loss": 1.4274, "step": 355 }, { "epoch": 0.0317587760381819, "grad_norm": 161.0966339111328, "learning_rate": 3e-06, "loss": 10.7367, "step": 356 }, { "epoch": 0.03184798608323297, "grad_norm": 164.76675415039062, "learning_rate": 3e-06, "loss": 18.5651, "step": 357 }, { "epoch": 0.031937196128284044, "grad_norm": 227.15631103515625, "learning_rate": 3e-06, "loss": 2.7137, "step": 358 }, { "epoch": 0.03202640617333512, "grad_norm": 152.36514282226562, "learning_rate": 3e-06, "loss": 8.0471, "step": 359 }, { "epoch": 0.03211561621838619, "grad_norm": 162.82603454589844, "learning_rate": 3e-06, "loss": 24.2858, "step": 360 }, { "completion_length": 173.6666717529297, "epoch": 0.032204826263437264, "grad_norm": 112.11152648925781, "learning_rate": 3e-06, "loss": -47.6777, "reward": 1.9486668109893799, "reward_std": 0.614804282784462, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04091667756438255, "step": 361, "zero_std_ratio": 0.0 }, { "epoch": 0.03229403630848834, "grad_norm": 212.41893005371094, "learning_rate": 3e-06, "loss": -57.8799, "step": 362 }, { "epoch": 0.03238324635353941, "grad_norm": 129.3856201171875, "learning_rate": 3e-06, "loss": -42.8209, "step": 363 }, { "epoch": 0.032472456398590484, "grad_norm": 90.02410888671875, "learning_rate": 3e-06, "loss": -37.0157, "step": 364 }, { "epoch": 0.03256166644364156, "grad_norm": 122.2002944946289, "learning_rate": 3e-06, "loss": -42.0898, "step": 365 }, { "epoch": 0.03265087648869262, "grad_norm": 114.75045776367188, "learning_rate": 3e-06, "loss": -37.9465, "step": 366 }, { "epoch": 0.0327400865337437, "grad_norm": 117.86136627197266, "learning_rate": 3e-06, "loss": -50.0162, "step": 367 }, { "epoch": 0.03282929657879477, "grad_norm": 224.55755615234375, "learning_rate": 3e-06, "loss": -62.4077, "step": 368 }, { "epoch": 0.03291850662384584, "grad_norm": 145.33380126953125, "learning_rate": 3e-06, "loss": -45.7888, "step": 369 }, { "epoch": 0.033007716668896916, "grad_norm": 107.85284423828125, "learning_rate": 3e-06, "loss": -38.8828, "step": 370 }, { "epoch": 0.03309692671394799, "grad_norm": 143.64854431152344, "learning_rate": 3e-06, "loss": -44.4827, "step": 371 }, { "epoch": 0.03318613675899906, "grad_norm": 120.4244155883789, "learning_rate": 3e-06, "loss": -40.0586, "step": 372 }, { "completion_length": 170.3125, "epoch": 0.033275346804050136, "grad_norm": 199.8765869140625, "learning_rate": 3e-06, "loss": -16.0424, "reward": 1.5020000338554382, "reward_std": 0.6375356912612915, "rewards/correctness_reward_func": 1.0416666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012416664510965347, "step": 373, "zero_std_ratio": 0.0 }, { "epoch": 0.03336455684910121, "grad_norm": 186.22103881835938, "learning_rate": 3e-06, "loss": -12.8464, "step": 374 }, { "epoch": 0.03345376689415228, "grad_norm": 133.84971618652344, "learning_rate": 3e-06, "loss": -16.0581, "step": 375 }, { "epoch": 0.033542976939203356, "grad_norm": 124.62361145019531, "learning_rate": 3e-06, "loss": -20.0662, "step": 376 }, { "epoch": 0.03363218698425443, "grad_norm": 177.62574768066406, "learning_rate": 3e-06, "loss": -16.7656, "step": 377 }, { "epoch": 0.0337213970293055, "grad_norm": 162.92381286621094, "learning_rate": 3e-06, "loss": -19.1936, "step": 378 }, { "epoch": 0.033810607074356576, "grad_norm": 167.49449157714844, "learning_rate": 3e-06, "loss": -16.9396, "step": 379 }, { "epoch": 0.03389981711940764, "grad_norm": 180.7197723388672, "learning_rate": 3e-06, "loss": -14.2949, "step": 380 }, { "epoch": 0.033989027164458716, "grad_norm": 158.6161346435547, "learning_rate": 3e-06, "loss": -18.1083, "step": 381 }, { "epoch": 0.03407823720950979, "grad_norm": 136.7860870361328, "learning_rate": 3e-06, "loss": -21.9306, "step": 382 }, { "epoch": 0.03416744725456086, "grad_norm": 200.51185607910156, "learning_rate": 3e-06, "loss": -18.8755, "step": 383 }, { "epoch": 0.034256657299611935, "grad_norm": 174.00477600097656, "learning_rate": 3e-06, "loss": -21.6131, "step": 384 }, { "completion_length": 168.0416717529297, "epoch": 0.03434586734466301, "grad_norm": 270.6558837890625, "learning_rate": 3e-06, "loss": -108.9543, "reward": 1.5282500386238098, "reward_std": 0.8121029734611511, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03866666788235307, "step": 385, "zero_std_ratio": 0.0 }, { "epoch": 0.03443507738971408, "grad_norm": 316.0130615234375, "learning_rate": 3e-06, "loss": -121.254, "step": 386 }, { "epoch": 0.034524287434765155, "grad_norm": 257.7696228027344, "learning_rate": 3e-06, "loss": -102.8911, "step": 387 }, { "epoch": 0.03461349747981623, "grad_norm": 305.755126953125, "learning_rate": 3e-06, "loss": -114.5659, "step": 388 }, { "epoch": 0.0347027075248673, "grad_norm": 219.5818328857422, "learning_rate": 3e-06, "loss": -114.7208, "step": 389 }, { "epoch": 0.034791917569918375, "grad_norm": 255.7522430419922, "learning_rate": 3e-06, "loss": -109.7934, "step": 390 }, { "epoch": 0.03488112761496945, "grad_norm": 291.77642822265625, "learning_rate": 3e-06, "loss": -116.5826, "step": 391 }, { "epoch": 0.03497033766002052, "grad_norm": 333.5157165527344, "learning_rate": 3e-06, "loss": -132.3897, "step": 392 }, { "epoch": 0.03505954770507159, "grad_norm": 267.8763122558594, "learning_rate": 3e-06, "loss": -110.8159, "step": 393 }, { "epoch": 0.03514875775012266, "grad_norm": 312.3733215332031, "learning_rate": 3e-06, "loss": -127.8078, "step": 394 }, { "epoch": 0.035237967795173734, "grad_norm": 242.0186309814453, "learning_rate": 3e-06, "loss": -123.3703, "step": 395 }, { "epoch": 0.03532717784022481, "grad_norm": 297.62847900390625, "learning_rate": 3e-06, "loss": -120.3945, "step": 396 }, { "completion_length": 171.45833587646484, "epoch": 0.03541638788527588, "grad_norm": 192.1062774658203, "learning_rate": 3e-06, "loss": -81.6189, "reward": 1.9622292518615723, "reward_std": 0.8264816105365753, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0351458303630352, "step": 397, "zero_std_ratio": 0.0 }, { "epoch": 0.035505597930326954, "grad_norm": 202.8217010498047, "learning_rate": 3e-06, "loss": -67.341, "step": 398 }, { "epoch": 0.03559480797537803, "grad_norm": 218.82249450683594, "learning_rate": 3e-06, "loss": -65.3347, "step": 399 }, { "epoch": 0.0356840180204291, "grad_norm": 232.65196228027344, "learning_rate": 3e-06, "loss": -76.8989, "step": 400 }, { "epoch": 0.035773228065480174, "grad_norm": 198.04103088378906, "learning_rate": 3e-06, "loss": -74.488, "step": 401 }, { "epoch": 0.03586243811053125, "grad_norm": 211.86273193359375, "learning_rate": 3e-06, "loss": -75.6096, "step": 402 }, { "epoch": 0.03595164815558232, "grad_norm": 218.77589416503906, "learning_rate": 3e-06, "loss": -87.9174, "step": 403 }, { "epoch": 0.036040858200633394, "grad_norm": 243.4962615966797, "learning_rate": 3e-06, "loss": -70.1026, "step": 404 }, { "epoch": 0.03613006824568447, "grad_norm": 242.30494689941406, "learning_rate": 3e-06, "loss": -71.4579, "step": 405 }, { "epoch": 0.03621927829073553, "grad_norm": 274.28948974609375, "learning_rate": 3e-06, "loss": -83.8044, "step": 406 }, { "epoch": 0.03630848833578661, "grad_norm": 257.0942077636719, "learning_rate": 3e-06, "loss": -82.445, "step": 407 }, { "epoch": 0.03639769838083768, "grad_norm": 255.2320556640625, "learning_rate": 3e-06, "loss": -81.9402, "step": 408 }, { "completion_length": 125.70833587646484, "epoch": 0.03648690842588875, "grad_norm": 185.7193145751953, "learning_rate": 3e-06, "loss": 46.4742, "reward": 1.8683959245681763, "reward_std": 0.8172085583209991, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10797916725277901, "step": 409, "zero_std_ratio": 0.0 }, { "epoch": 0.03657611847093983, "grad_norm": 218.23338317871094, "learning_rate": 3e-06, "loss": 29.3517, "step": 410 }, { "epoch": 0.0366653285159909, "grad_norm": 180.90330505371094, "learning_rate": 3e-06, "loss": 55.3162, "step": 411 }, { "epoch": 0.03675453856104197, "grad_norm": 216.37953186035156, "learning_rate": 3e-06, "loss": 50.2096, "step": 412 }, { "epoch": 0.036843748606093046, "grad_norm": 198.8724822998047, "learning_rate": 3e-06, "loss": 51.636, "step": 413 }, { "epoch": 0.03693295865114412, "grad_norm": 184.89627075195312, "learning_rate": 3e-06, "loss": 44.8369, "step": 414 }, { "epoch": 0.03702216869619519, "grad_norm": 167.6713104248047, "learning_rate": 3e-06, "loss": 44.1546, "step": 415 }, { "epoch": 0.037111378741246266, "grad_norm": 192.13140869140625, "learning_rate": 3e-06, "loss": 27.4686, "step": 416 }, { "epoch": 0.03720058878629734, "grad_norm": 177.4408721923828, "learning_rate": 3e-06, "loss": 53.37, "step": 417 }, { "epoch": 0.03728979883134841, "grad_norm": 223.81668090820312, "learning_rate": 3e-06, "loss": 45.2759, "step": 418 }, { "epoch": 0.037379008876399486, "grad_norm": 207.5684356689453, "learning_rate": 3e-06, "loss": 46.924, "step": 419 }, { "epoch": 0.03746821892145055, "grad_norm": 180.81484985351562, "learning_rate": 3e-06, "loss": 42.0928, "step": 420 }, { "completion_length": 153.89583587646484, "epoch": 0.037557428966501626, "grad_norm": 247.00067138671875, "learning_rate": 3e-06, "loss": 14.4654, "reward": 1.776770830154419, "reward_std": 0.6972799003124237, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07885416969656944, "step": 421, "zero_std_ratio": 0.0 }, { "epoch": 0.0376466390115527, "grad_norm": 244.66824340820312, "learning_rate": 3e-06, "loss": 34.9601, "step": 422 }, { "epoch": 0.03773584905660377, "grad_norm": 276.21539306640625, "learning_rate": 3e-06, "loss": 14.6261, "step": 423 }, { "epoch": 0.037825059101654845, "grad_norm": 288.96246337890625, "learning_rate": 3e-06, "loss": 41.9368, "step": 424 }, { "epoch": 0.03791426914670592, "grad_norm": 303.6945495605469, "learning_rate": 3e-06, "loss": 23.9119, "step": 425 }, { "epoch": 0.03800347919175699, "grad_norm": 274.27142333984375, "learning_rate": 3e-06, "loss": 27.399, "step": 426 }, { "epoch": 0.038092689236808065, "grad_norm": 233.245361328125, "learning_rate": 3e-06, "loss": 10.175, "step": 427 }, { "epoch": 0.03818189928185914, "grad_norm": 256.8597412109375, "learning_rate": 3e-06, "loss": 31.9908, "step": 428 }, { "epoch": 0.03827110932691021, "grad_norm": 270.4859619140625, "learning_rate": 3e-06, "loss": 10.7867, "step": 429 }, { "epoch": 0.038360319371961285, "grad_norm": 301.17181396484375, "learning_rate": 3e-06, "loss": 40.6524, "step": 430 }, { "epoch": 0.03844952941701236, "grad_norm": 303.94488525390625, "learning_rate": 3e-06, "loss": 21.1706, "step": 431 }, { "epoch": 0.03853873946206343, "grad_norm": 258.3034973144531, "learning_rate": 3e-06, "loss": 22.7622, "step": 432 }, { "completion_length": 107.54166793823242, "epoch": 0.0386279495071145, "grad_norm": 134.02154541015625, "learning_rate": 3e-06, "loss": -22.4999, "reward": 2.2951666712760925, "reward_std": 0.38126008585095406, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21183334290981293, "step": 433, "zero_std_ratio": 0.0 }, { "epoch": 0.03871715955216557, "grad_norm": 191.98023986816406, "learning_rate": 3e-06, "loss": -14.0492, "step": 434 }, { "epoch": 0.038806369597216644, "grad_norm": 154.30328369140625, "learning_rate": 3e-06, "loss": -10.9444, "step": 435 }, { "epoch": 0.03889557964226772, "grad_norm": 134.01214599609375, "learning_rate": 3e-06, "loss": -17.7574, "step": 436 }, { "epoch": 0.03898478968731879, "grad_norm": 132.3379364013672, "learning_rate": 3e-06, "loss": -18.487, "step": 437 }, { "epoch": 0.039073999732369864, "grad_norm": 146.31573486328125, "learning_rate": 3e-06, "loss": -12.7164, "step": 438 }, { "epoch": 0.03916320977742094, "grad_norm": 136.05592346191406, "learning_rate": 3e-06, "loss": -23.1439, "step": 439 }, { "epoch": 0.03925241982247201, "grad_norm": 138.1117706298828, "learning_rate": 3e-06, "loss": -15.7255, "step": 440 }, { "epoch": 0.039341629867523084, "grad_norm": 166.34922790527344, "learning_rate": 3e-06, "loss": -12.6375, "step": 441 }, { "epoch": 0.03943083991257416, "grad_norm": 132.994140625, "learning_rate": 3e-06, "loss": -20.1431, "step": 442 }, { "epoch": 0.03952004995762523, "grad_norm": 129.54771423339844, "learning_rate": 3e-06, "loss": -19.3478, "step": 443 }, { "epoch": 0.039609260002676304, "grad_norm": 152.91607666015625, "learning_rate": 3e-06, "loss": -14.3018, "step": 444 }, { "completion_length": 160.7916717529297, "epoch": 0.03969847004772738, "grad_norm": 165.9615020751953, "learning_rate": 3e-06, "loss": 38.8687, "reward": 1.6081042885780334, "reward_std": 0.45602357387542725, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0768541656434536, "step": 445, "zero_std_ratio": 0.0 }, { "epoch": 0.039787680092778444, "grad_norm": 165.71353149414062, "learning_rate": 3e-06, "loss": 32.8818, "step": 446 }, { "epoch": 0.03987689013782952, "grad_norm": 185.27174377441406, "learning_rate": 3e-06, "loss": 29.8206, "step": 447 }, { "epoch": 0.03996610018288059, "grad_norm": 163.79954528808594, "learning_rate": 3e-06, "loss": 28.2399, "step": 448 }, { "epoch": 0.04005531022793166, "grad_norm": 167.2331085205078, "learning_rate": 3e-06, "loss": 42.2504, "step": 449 }, { "epoch": 0.04014452027298274, "grad_norm": 157.44320678710938, "learning_rate": 3e-06, "loss": 47.0631, "step": 450 }, { "epoch": 0.04023373031803381, "grad_norm": 167.7976837158203, "learning_rate": 3e-06, "loss": 37.7299, "step": 451 }, { "epoch": 0.04032294036308488, "grad_norm": 171.96420288085938, "learning_rate": 3e-06, "loss": 31.7018, "step": 452 }, { "epoch": 0.040412150408135956, "grad_norm": 164.95046997070312, "learning_rate": 3e-06, "loss": 28.9306, "step": 453 }, { "epoch": 0.04050136045318703, "grad_norm": 146.903076171875, "learning_rate": 3e-06, "loss": 26.551, "step": 454 }, { "epoch": 0.0405905704982381, "grad_norm": 182.6881561279297, "learning_rate": 3e-06, "loss": 41.0788, "step": 455 }, { "epoch": 0.040679780543289176, "grad_norm": 147.5907440185547, "learning_rate": 3e-06, "loss": 44.0529, "step": 456 }, { "completion_length": 143.8541717529297, "epoch": 0.04076899058834025, "grad_norm": 252.46615600585938, "learning_rate": 3e-06, "loss": -60.641, "reward": 1.8890208005905151, "reward_std": 0.4024546667933464, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09735416248440742, "step": 457, "zero_std_ratio": 0.0 }, { "epoch": 0.04085820063339132, "grad_norm": 223.07171630859375, "learning_rate": 3e-06, "loss": -42.0093, "step": 458 }, { "epoch": 0.040947410678442396, "grad_norm": 237.3083953857422, "learning_rate": 3e-06, "loss": -57.9657, "step": 459 }, { "epoch": 0.04103662072349346, "grad_norm": 225.29269409179688, "learning_rate": 3e-06, "loss": -41.8661, "step": 460 }, { "epoch": 0.041125830768544536, "grad_norm": 210.0297088623047, "learning_rate": 3e-06, "loss": -47.2305, "step": 461 }, { "epoch": 0.04121504081359561, "grad_norm": 262.04473876953125, "learning_rate": 3e-06, "loss": -50.9342, "step": 462 }, { "epoch": 0.04130425085864668, "grad_norm": 252.53802490234375, "learning_rate": 3e-06, "loss": -62.6208, "step": 463 }, { "epoch": 0.041393460903697755, "grad_norm": 221.40121459960938, "learning_rate": 3e-06, "loss": -43.7282, "step": 464 }, { "epoch": 0.04148267094874883, "grad_norm": 231.59335327148438, "learning_rate": 3e-06, "loss": -60.5224, "step": 465 }, { "epoch": 0.0415718809937999, "grad_norm": 207.73471069335938, "learning_rate": 3e-06, "loss": -44.5223, "step": 466 }, { "epoch": 0.041661091038850975, "grad_norm": 217.08779907226562, "learning_rate": 3e-06, "loss": -50.6021, "step": 467 }, { "epoch": 0.04175030108390205, "grad_norm": 254.29269409179688, "learning_rate": 3e-06, "loss": -55.4887, "step": 468 }, { "completion_length": 122.50000381469727, "epoch": 0.04183951112895312, "grad_norm": 147.28335571289062, "learning_rate": 3e-06, "loss": 12.9864, "reward": 2.310395896434784, "reward_std": 0.44204503297805786, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.16456249356269836, "step": 469, "zero_std_ratio": 0.0 }, { "epoch": 0.041928721174004195, "grad_norm": 161.9978485107422, "learning_rate": 3e-06, "loss": 34.7546, "step": 470 }, { "epoch": 0.04201793121905527, "grad_norm": 165.3116455078125, "learning_rate": 3e-06, "loss": 17.4188, "step": 471 }, { "epoch": 0.04210714126410634, "grad_norm": 142.81861877441406, "learning_rate": 3e-06, "loss": 18.3006, "step": 472 }, { "epoch": 0.04219635130915741, "grad_norm": 168.01116943359375, "learning_rate": 3e-06, "loss": 17.6413, "step": 473 }, { "epoch": 0.04228556135420848, "grad_norm": 207.03326416015625, "learning_rate": 3e-06, "loss": 15.9259, "step": 474 }, { "epoch": 0.042374771399259555, "grad_norm": 141.62599182128906, "learning_rate": 3e-06, "loss": 12.4355, "step": 475 }, { "epoch": 0.04246398144431063, "grad_norm": 180.9537353515625, "learning_rate": 3e-06, "loss": 32.9299, "step": 476 }, { "epoch": 0.0425531914893617, "grad_norm": 163.92254638671875, "learning_rate": 3e-06, "loss": 17.1089, "step": 477 }, { "epoch": 0.042642401534412774, "grad_norm": 145.9250030517578, "learning_rate": 3e-06, "loss": 17.069, "step": 478 }, { "epoch": 0.04273161157946385, "grad_norm": 152.68319702148438, "learning_rate": 3e-06, "loss": 16.6025, "step": 479 }, { "epoch": 0.04282082162451492, "grad_norm": 222.65997314453125, "learning_rate": 3e-06, "loss": 14.2435, "step": 480 }, { "completion_length": 116.66666793823242, "epoch": 0.042910031669565994, "grad_norm": 166.38722229003906, "learning_rate": 3e-06, "loss": 13.4616, "reward": 1.5636458992958069, "reward_std": 0.6559399664402008, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1469791643321514, "step": 481, "zero_std_ratio": 0.0 }, { "epoch": 0.04299924171461707, "grad_norm": 222.22879028320312, "learning_rate": 3e-06, "loss": 18.8121, "step": 482 }, { "epoch": 0.04308845175966814, "grad_norm": 221.17059326171875, "learning_rate": 3e-06, "loss": 21.4877, "step": 483 }, { "epoch": 0.043177661804719214, "grad_norm": 142.53189086914062, "learning_rate": 3e-06, "loss": 16.8492, "step": 484 }, { "epoch": 0.04326687184977029, "grad_norm": 170.13198852539062, "learning_rate": 3e-06, "loss": 20.7898, "step": 485 }, { "epoch": 0.043356081894821354, "grad_norm": 161.23110961914062, "learning_rate": 3e-06, "loss": 12.8851, "step": 486 }, { "epoch": 0.04344529193987243, "grad_norm": 175.66587829589844, "learning_rate": 3e-06, "loss": 10.8873, "step": 487 }, { "epoch": 0.0435345019849235, "grad_norm": 195.75050354003906, "learning_rate": 3e-06, "loss": 15.6694, "step": 488 }, { "epoch": 0.04362371202997457, "grad_norm": 190.2042236328125, "learning_rate": 3e-06, "loss": 19.0926, "step": 489 }, { "epoch": 0.04371292207502565, "grad_norm": 146.10504150390625, "learning_rate": 3e-06, "loss": 13.8907, "step": 490 }, { "epoch": 0.04380213212007672, "grad_norm": 149.26614379882812, "learning_rate": 3e-06, "loss": 17.0683, "step": 491 }, { "epoch": 0.04389134216512779, "grad_norm": 178.4593963623047, "learning_rate": 3e-06, "loss": 10.2799, "step": 492 }, { "completion_length": 137.50000762939453, "epoch": 0.043980552210178867, "grad_norm": 87.93815612792969, "learning_rate": 3e-06, "loss": -35.7257, "reward": 2.5712709426879883, "reward_std": 0.18458116799592972, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11293749511241913, "step": 493, "zero_std_ratio": 0.0 }, { "epoch": 0.04406976225522994, "grad_norm": 78.96405029296875, "learning_rate": 3e-06, "loss": -37.1011, "step": 494 }, { "epoch": 0.04415897230028101, "grad_norm": 113.70293426513672, "learning_rate": 3e-06, "loss": -46.3151, "step": 495 }, { "epoch": 0.044248182345332086, "grad_norm": 90.45478820800781, "learning_rate": 3e-06, "loss": -43.5143, "step": 496 }, { "epoch": 0.04433739239038316, "grad_norm": 106.42904663085938, "learning_rate": 3e-06, "loss": -42.7944, "step": 497 }, { "epoch": 0.04442660243543423, "grad_norm": 106.20608520507812, "learning_rate": 3e-06, "loss": -48.1815, "step": 498 }, { "epoch": 0.0445158124804853, "grad_norm": 93.41876220703125, "learning_rate": 3e-06, "loss": -37.7992, "step": 499 }, { "epoch": 0.04460502252553637, "grad_norm": 91.40050506591797, "learning_rate": 3e-06, "loss": -38.651, "step": 500 }, { "epoch": 0.044694232570587446, "grad_norm": 116.4251480102539, "learning_rate": 3e-06, "loss": -49.2276, "step": 501 }, { "epoch": 0.04478344261563852, "grad_norm": 92.2903060913086, "learning_rate": 3e-06, "loss": -46.591, "step": 502 }, { "epoch": 0.04487265266068959, "grad_norm": 110.7293472290039, "learning_rate": 3e-06, "loss": -45.7648, "step": 503 }, { "epoch": 0.044961862705740666, "grad_norm": 108.03923797607422, "learning_rate": 3e-06, "loss": -51.9054, "step": 504 }, { "completion_length": 131.14583587646484, "epoch": 0.04505107275079174, "grad_norm": 452.2289733886719, "learning_rate": 3e-06, "loss": 29.1255, "reward": 1.9235833883285522, "reward_std": 0.813209742307663, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16316666454076767, "step": 505, "zero_std_ratio": 0.0 }, { "epoch": 0.04514028279584281, "grad_norm": 284.8124694824219, "learning_rate": 3e-06, "loss": 31.7163, "step": 506 }, { "epoch": 0.045229492840893885, "grad_norm": 361.7442626953125, "learning_rate": 3e-06, "loss": 27.2282, "step": 507 }, { "epoch": 0.04531870288594496, "grad_norm": 283.879638671875, "learning_rate": 3e-06, "loss": 42.2817, "step": 508 }, { "epoch": 0.04540791293099603, "grad_norm": 316.19000244140625, "learning_rate": 3e-06, "loss": 26.7891, "step": 509 }, { "epoch": 0.045497122976047105, "grad_norm": 370.62652587890625, "learning_rate": 3e-06, "loss": 34.1636, "step": 510 }, { "epoch": 0.04558633302109818, "grad_norm": 273.1391296386719, "learning_rate": 3e-06, "loss": 27.6705, "step": 511 }, { "epoch": 0.04567554306614925, "grad_norm": 307.9808044433594, "learning_rate": 3e-06, "loss": 28.9577, "step": 512 }, { "epoch": 0.04576475311120032, "grad_norm": 374.3335876464844, "learning_rate": 3e-06, "loss": 22.2669, "step": 513 }, { "epoch": 0.04585396315625139, "grad_norm": 395.0052795410156, "learning_rate": 3e-06, "loss": 39.239, "step": 514 }, { "epoch": 0.045943173201302465, "grad_norm": 539.7128295898438, "learning_rate": 3e-06, "loss": 25.6475, "step": 515 }, { "epoch": 0.04603238324635354, "grad_norm": 348.81170654296875, "learning_rate": 3e-06, "loss": 31.7762, "step": 516 }, { "completion_length": 115.83333587646484, "epoch": 0.04612159329140461, "grad_norm": 177.44480895996094, "learning_rate": 3e-06, "loss": -44.7672, "reward": 2.343208432197571, "reward_std": 0.4359329864382744, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18695833534002304, "step": 517, "zero_std_ratio": 0.0 }, { "epoch": 0.046210803336455684, "grad_norm": 199.65908813476562, "learning_rate": 3e-06, "loss": -51.4622, "step": 518 }, { "epoch": 0.04630001338150676, "grad_norm": 175.2034149169922, "learning_rate": 3e-06, "loss": -55.1536, "step": 519 }, { "epoch": 0.04638922342655783, "grad_norm": 160.91688537597656, "learning_rate": 3e-06, "loss": -37.3164, "step": 520 }, { "epoch": 0.046478433471608904, "grad_norm": 165.5592498779297, "learning_rate": 3e-06, "loss": -42.9147, "step": 521 }, { "epoch": 0.04656764351665998, "grad_norm": 154.5955047607422, "learning_rate": 3e-06, "loss": -48.9731, "step": 522 }, { "epoch": 0.04665685356171105, "grad_norm": 202.06838989257812, "learning_rate": 3e-06, "loss": -46.1992, "step": 523 }, { "epoch": 0.046746063606762124, "grad_norm": 216.6766357421875, "learning_rate": 3e-06, "loss": -54.2271, "step": 524 }, { "epoch": 0.0468352736518132, "grad_norm": 212.4103240966797, "learning_rate": 3e-06, "loss": -59.7351, "step": 525 }, { "epoch": 0.046924483696864264, "grad_norm": 160.86546325683594, "learning_rate": 3e-06, "loss": -40.5866, "step": 526 }, { "epoch": 0.04701369374191534, "grad_norm": 171.24478149414062, "learning_rate": 3e-06, "loss": -47.2665, "step": 527 }, { "epoch": 0.04710290378696641, "grad_norm": 165.52357482910156, "learning_rate": 3e-06, "loss": -53.477, "step": 528 }, { "completion_length": 119.77083587646484, "epoch": 0.047192113832017483, "grad_norm": 382.3139953613281, "learning_rate": 3e-06, "loss": 86.4678, "reward": 1.9181458950042725, "reward_std": 0.6776820421218872, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1681458279490471, "step": 529, "zero_std_ratio": 0.0 }, { "epoch": 0.04728132387706856, "grad_norm": 264.3744812011719, "learning_rate": 3e-06, "loss": 87.2703, "step": 530 }, { "epoch": 0.04737053392211963, "grad_norm": 273.3477783203125, "learning_rate": 3e-06, "loss": 74.9615, "step": 531 }, { "epoch": 0.0474597439671707, "grad_norm": 326.87078857421875, "learning_rate": 3e-06, "loss": 90.7838, "step": 532 }, { "epoch": 0.04754895401222178, "grad_norm": 294.74041748046875, "learning_rate": 3e-06, "loss": 102.1572, "step": 533 }, { "epoch": 0.04763816405727285, "grad_norm": 312.48626708984375, "learning_rate": 3e-06, "loss": 93.4423, "step": 534 }, { "epoch": 0.04772737410232392, "grad_norm": 383.2833557128906, "learning_rate": 3e-06, "loss": 86.6238, "step": 535 }, { "epoch": 0.047816584147374996, "grad_norm": 316.42926025390625, "learning_rate": 3e-06, "loss": 86.6783, "step": 536 }, { "epoch": 0.04790579419242607, "grad_norm": 268.37506103515625, "learning_rate": 3e-06, "loss": 72.0206, "step": 537 }, { "epoch": 0.04799500423747714, "grad_norm": 337.4726867675781, "learning_rate": 3e-06, "loss": 87.3427, "step": 538 }, { "epoch": 0.04808421428252821, "grad_norm": 284.79827880859375, "learning_rate": 3e-06, "loss": 98.66, "step": 539 }, { "epoch": 0.04817342432757928, "grad_norm": 321.74609375, "learning_rate": 3e-06, "loss": 87.6142, "step": 540 }, { "completion_length": 125.37500762939453, "epoch": 0.048262634372630356, "grad_norm": 187.67727661132812, "learning_rate": 3e-06, "loss": -1.5514, "reward": 1.9883333444595337, "reward_std": 0.6124217808246613, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1341666616499424, "step": 541, "zero_std_ratio": 0.0 }, { "epoch": 0.04835184441768143, "grad_norm": 202.66070556640625, "learning_rate": 3e-06, "loss": 1.9568, "step": 542 }, { "epoch": 0.0484410544627325, "grad_norm": 180.55126953125, "learning_rate": 3e-06, "loss": -6.8013, "step": 543 }, { "epoch": 0.048530264507783576, "grad_norm": 161.08514404296875, "learning_rate": 3e-06, "loss": -3.4725, "step": 544 }, { "epoch": 0.04861947455283465, "grad_norm": 220.28076171875, "learning_rate": 3e-06, "loss": 0.0972, "step": 545 }, { "epoch": 0.04870868459788572, "grad_norm": 321.00994873046875, "learning_rate": 3e-06, "loss": -10.5751, "step": 546 }, { "epoch": 0.048797894642936795, "grad_norm": 197.3623046875, "learning_rate": 3e-06, "loss": -2.6929, "step": 547 }, { "epoch": 0.04888710468798787, "grad_norm": 213.94691467285156, "learning_rate": 3e-06, "loss": 1.3407, "step": 548 }, { "epoch": 0.04897631473303894, "grad_norm": 254.2111053466797, "learning_rate": 3e-06, "loss": -7.3544, "step": 549 }, { "epoch": 0.049065524778090015, "grad_norm": 155.93460083007812, "learning_rate": 3e-06, "loss": -5.7191, "step": 550 }, { "epoch": 0.04915473482314109, "grad_norm": 175.17147827148438, "learning_rate": 3e-06, "loss": -2.0149, "step": 551 }, { "epoch": 0.04924394486819216, "grad_norm": 220.0244140625, "learning_rate": 3e-06, "loss": -12.1544, "step": 552 }, { "completion_length": 138.62500762939453, "epoch": 0.04933315491324323, "grad_norm": 255.7532501220703, "learning_rate": 3e-06, "loss": -42.4297, "reward": 2.4352500438690186, "reward_std": 0.32932066917419434, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12275000661611557, "step": 553, "zero_std_ratio": 0.0 }, { "epoch": 0.0494223649582943, "grad_norm": 263.04913330078125, "learning_rate": 3e-06, "loss": -27.7121, "step": 554 }, { "epoch": 0.049511575003345375, "grad_norm": 187.12600708007812, "learning_rate": 3e-06, "loss": -32.293, "step": 555 }, { "epoch": 0.04960078504839645, "grad_norm": 233.73802185058594, "learning_rate": 3e-06, "loss": -34.2649, "step": 556 }, { "epoch": 0.04968999509344752, "grad_norm": 238.03567504882812, "learning_rate": 3e-06, "loss": -48.0373, "step": 557 }, { "epoch": 0.049779205138498595, "grad_norm": 211.17440795898438, "learning_rate": 3e-06, "loss": -34.5469, "step": 558 }, { "epoch": 0.04986841518354967, "grad_norm": 209.6473388671875, "learning_rate": 3e-06, "loss": -45.0123, "step": 559 }, { "epoch": 0.04995762522860074, "grad_norm": 219.1716766357422, "learning_rate": 3e-06, "loss": -29.3907, "step": 560 }, { "epoch": 0.050046835273651814, "grad_norm": 194.5946502685547, "learning_rate": 3e-06, "loss": -33.3813, "step": 561 }, { "epoch": 0.05013604531870289, "grad_norm": 230.82928466796875, "learning_rate": 3e-06, "loss": -37.409, "step": 562 }, { "epoch": 0.05022525536375396, "grad_norm": 268.3168640136719, "learning_rate": 3e-06, "loss": -50.4616, "step": 563 }, { "epoch": 0.050314465408805034, "grad_norm": 225.2816925048828, "learning_rate": 3e-06, "loss": -36.9552, "step": 564 }, { "completion_length": 138.81250381469727, "epoch": 0.05040367545385611, "grad_norm": 236.8852996826172, "learning_rate": 3e-06, "loss": 45.2003, "reward": 1.8692501783370972, "reward_std": 0.7652427852153778, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15049999579787254, "step": 565, "zero_std_ratio": 0.0 }, { "epoch": 0.050492885498907174, "grad_norm": 258.247802734375, "learning_rate": 3e-06, "loss": 49.6383, "step": 566 }, { "epoch": 0.05058209554395825, "grad_norm": 318.1617126464844, "learning_rate": 3e-06, "loss": 58.9489, "step": 567 }, { "epoch": 0.05067130558900932, "grad_norm": 226.23045349121094, "learning_rate": 3e-06, "loss": 44.3005, "step": 568 }, { "epoch": 0.050760515634060394, "grad_norm": 303.47760009765625, "learning_rate": 3e-06, "loss": 28.8653, "step": 569 }, { "epoch": 0.05084972567911147, "grad_norm": 248.53013610839844, "learning_rate": 3e-06, "loss": 45.7073, "step": 570 }, { "epoch": 0.05093893572416254, "grad_norm": 228.76365661621094, "learning_rate": 3e-06, "loss": 40.9719, "step": 571 }, { "epoch": 0.05102814576921361, "grad_norm": 236.98915100097656, "learning_rate": 3e-06, "loss": 44.3296, "step": 572 }, { "epoch": 0.05111735581426469, "grad_norm": 318.7423400878906, "learning_rate": 3e-06, "loss": 54.9659, "step": 573 }, { "epoch": 0.05120656585931576, "grad_norm": 226.2831268310547, "learning_rate": 3e-06, "loss": 40.0793, "step": 574 }, { "epoch": 0.05129577590436683, "grad_norm": 321.7300109863281, "learning_rate": 3e-06, "loss": 25.4049, "step": 575 }, { "epoch": 0.051384985949417906, "grad_norm": 225.1118927001953, "learning_rate": 3e-06, "loss": 42.5627, "step": 576 }, { "completion_length": 114.56250381469727, "epoch": 0.05147419599446898, "grad_norm": 151.06048583984375, "learning_rate": 3e-06, "loss": 33.113, "reward": 2.1812918186187744, "reward_std": 0.530484139919281, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16045832633972168, "step": 577, "zero_std_ratio": 0.0 }, { "epoch": 0.05156340603952005, "grad_norm": 182.15431213378906, "learning_rate": 3e-06, "loss": 19.2473, "step": 578 }, { "epoch": 0.05165261608457112, "grad_norm": 169.3698272705078, "learning_rate": 3e-06, "loss": 25.8161, "step": 579 }, { "epoch": 0.05174182612962219, "grad_norm": 155.8734588623047, "learning_rate": 3e-06, "loss": 19.9518, "step": 580 }, { "epoch": 0.051831036174673266, "grad_norm": 177.54641723632812, "learning_rate": 3e-06, "loss": 23.4249, "step": 581 }, { "epoch": 0.05192024621972434, "grad_norm": 143.13719177246094, "learning_rate": 3e-06, "loss": 7.9228, "step": 582 }, { "epoch": 0.05200945626477541, "grad_norm": 146.39906311035156, "learning_rate": 3e-06, "loss": 30.8804, "step": 583 }, { "epoch": 0.052098666309826486, "grad_norm": 166.2614288330078, "learning_rate": 3e-06, "loss": 17.3062, "step": 584 }, { "epoch": 0.05218787635487756, "grad_norm": 158.1491241455078, "learning_rate": 3e-06, "loss": 23.1792, "step": 585 }, { "epoch": 0.05227708639992863, "grad_norm": 124.15723419189453, "learning_rate": 3e-06, "loss": 18.8908, "step": 586 }, { "epoch": 0.052366296444979706, "grad_norm": 155.26602172851562, "learning_rate": 3e-06, "loss": 21.968, "step": 587 }, { "epoch": 0.05245550649003078, "grad_norm": 155.0635528564453, "learning_rate": 3e-06, "loss": 6.3548, "step": 588 }, { "completion_length": 161.45833587646484, "epoch": 0.05254471653508185, "grad_norm": 317.9678649902344, "learning_rate": 3e-06, "loss": 6.4115, "reward": 1.8582292199134827, "reward_std": 0.48372724652290344, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03531250241212547, "step": 589, "zero_std_ratio": 0.0 }, { "epoch": 0.052633926580132925, "grad_norm": 267.894287109375, "learning_rate": 3e-06, "loss": 6.0494, "step": 590 }, { "epoch": 0.052723136625184, "grad_norm": 197.32470703125, "learning_rate": 3e-06, "loss": 8.4095, "step": 591 }, { "epoch": 0.05281234667023507, "grad_norm": 208.84291076660156, "learning_rate": 3e-06, "loss": 2.183, "step": 592 }, { "epoch": 0.05290155671528614, "grad_norm": 213.50672912597656, "learning_rate": 3e-06, "loss": -5.6327, "step": 593 }, { "epoch": 0.05299076676033721, "grad_norm": 264.34210205078125, "learning_rate": 3e-06, "loss": 0.0463, "step": 594 }, { "epoch": 0.053079976805388285, "grad_norm": 269.38372802734375, "learning_rate": 3e-06, "loss": 3.4921, "step": 595 }, { "epoch": 0.05316918685043936, "grad_norm": 233.82005310058594, "learning_rate": 3e-06, "loss": 3.9645, "step": 596 }, { "epoch": 0.05325839689549043, "grad_norm": 174.2704620361328, "learning_rate": 3e-06, "loss": 6.6277, "step": 597 }, { "epoch": 0.053347606940541505, "grad_norm": 197.27203369140625, "learning_rate": 3e-06, "loss": -0.034, "step": 598 }, { "epoch": 0.05343681698559258, "grad_norm": 195.1741943359375, "learning_rate": 3e-06, "loss": -7.7007, "step": 599 }, { "epoch": 0.05352602703064365, "grad_norm": 218.35403442382812, "learning_rate": 3e-06, "loss": -2.732, "step": 600 }, { "completion_length": 163.20833587646484, "epoch": 0.053615237075694724, "grad_norm": 89.40003204345703, "learning_rate": 3e-06, "loss": -15.5412, "reward": 1.958250105381012, "reward_std": 0.20268601924180984, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06241666525602341, "step": 601, "zero_std_ratio": 0.0 }, { "epoch": 0.0537044471207458, "grad_norm": 85.92318725585938, "learning_rate": 3e-06, "loss": -19.9741, "step": 602 }, { "epoch": 0.05379365716579687, "grad_norm": 89.12326049804688, "learning_rate": 3e-06, "loss": -28.9968, "step": 603 }, { "epoch": 0.053882867210847944, "grad_norm": 109.64755249023438, "learning_rate": 3e-06, "loss": -21.6802, "step": 604 }, { "epoch": 0.05397207725589902, "grad_norm": 99.89476776123047, "learning_rate": 3e-06, "loss": -16.4453, "step": 605 }, { "epoch": 0.054061287300950084, "grad_norm": 140.71066284179688, "learning_rate": 3e-06, "loss": -24.1026, "step": 606 }, { "epoch": 0.05415049734600116, "grad_norm": 80.75763702392578, "learning_rate": 3e-06, "loss": -16.613, "step": 607 }, { "epoch": 0.05423970739105223, "grad_norm": 85.42610168457031, "learning_rate": 3e-06, "loss": -21.2449, "step": 608 }, { "epoch": 0.054328917436103304, "grad_norm": 93.39994812011719, "learning_rate": 3e-06, "loss": -30.1845, "step": 609 }, { "epoch": 0.05441812748115438, "grad_norm": 96.1513671875, "learning_rate": 3e-06, "loss": -22.8781, "step": 610 }, { "epoch": 0.05450733752620545, "grad_norm": 98.65193176269531, "learning_rate": 3e-06, "loss": -17.6772, "step": 611 }, { "epoch": 0.05459654757125652, "grad_norm": 130.4186248779297, "learning_rate": 3e-06, "loss": -25.9751, "step": 612 }, { "completion_length": 129.4375, "epoch": 0.0546857576163076, "grad_norm": 178.97259521484375, "learning_rate": 3e-06, "loss": -41.7741, "reward": 1.9304792881011963, "reward_std": 0.4192664921283722, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14922916889190674, "step": 613, "zero_std_ratio": 0.0 }, { "epoch": 0.05477496766135867, "grad_norm": 159.00680541992188, "learning_rate": 3e-06, "loss": -36.4024, "step": 614 }, { "epoch": 0.05486417770640974, "grad_norm": 154.65304565429688, "learning_rate": 3e-06, "loss": -35.2622, "step": 615 }, { "epoch": 0.05495338775146082, "grad_norm": 239.50408935546875, "learning_rate": 3e-06, "loss": -34.9883, "step": 616 }, { "epoch": 0.05504259779651189, "grad_norm": 191.45263671875, "learning_rate": 3e-06, "loss": -30.4894, "step": 617 }, { "epoch": 0.05513180784156296, "grad_norm": 172.67025756835938, "learning_rate": 3e-06, "loss": -40.9277, "step": 618 }, { "epoch": 0.05522101788661403, "grad_norm": 180.4842071533203, "learning_rate": 3e-06, "loss": -44.2844, "step": 619 }, { "epoch": 0.0553102279316651, "grad_norm": 175.10528564453125, "learning_rate": 3e-06, "loss": -39.7696, "step": 620 }, { "epoch": 0.055399437976716176, "grad_norm": 224.33847045898438, "learning_rate": 3e-06, "loss": -38.3443, "step": 621 }, { "epoch": 0.05548864802176725, "grad_norm": 196.98231506347656, "learning_rate": 3e-06, "loss": -36.4789, "step": 622 }, { "epoch": 0.05557785806681832, "grad_norm": 205.22146606445312, "learning_rate": 3e-06, "loss": -32.5847, "step": 623 }, { "epoch": 0.055667068111869396, "grad_norm": 189.9784393310547, "learning_rate": 3e-06, "loss": -43.7861, "step": 624 }, { "completion_length": 135.1666717529297, "epoch": 0.05575627815692047, "grad_norm": 190.61593627929688, "learning_rate": 3e-06, "loss": -23.515, "reward": 1.7549793124198914, "reward_std": 0.5575149804353714, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14039582759141922, "step": 625, "zero_std_ratio": 0.0 }, { "epoch": 0.05584548820197154, "grad_norm": 263.34173583984375, "learning_rate": 3e-06, "loss": -26.4535, "step": 626 }, { "epoch": 0.055934698247022616, "grad_norm": 203.12969970703125, "learning_rate": 3e-06, "loss": -20.2943, "step": 627 }, { "epoch": 0.05602390829207369, "grad_norm": 186.3466033935547, "learning_rate": 3e-06, "loss": -22.5875, "step": 628 }, { "epoch": 0.05611311833712476, "grad_norm": 206.43478393554688, "learning_rate": 3e-06, "loss": -14.8606, "step": 629 }, { "epoch": 0.056202328382175835, "grad_norm": 230.95394897460938, "learning_rate": 3e-06, "loss": -11.2027, "step": 630 }, { "epoch": 0.05629153842722691, "grad_norm": 208.40184020996094, "learning_rate": 3e-06, "loss": -24.9903, "step": 631 }, { "epoch": 0.056380748472277975, "grad_norm": 283.0361328125, "learning_rate": 3e-06, "loss": -28.8647, "step": 632 }, { "epoch": 0.05646995851732905, "grad_norm": 243.43634033203125, "learning_rate": 3e-06, "loss": -20.9854, "step": 633 }, { "epoch": 0.05655916856238012, "grad_norm": 196.8306121826172, "learning_rate": 3e-06, "loss": -25.1397, "step": 634 }, { "epoch": 0.056648378607431195, "grad_norm": 204.37130737304688, "learning_rate": 3e-06, "loss": -17.3989, "step": 635 }, { "epoch": 0.05673758865248227, "grad_norm": 222.9701385498047, "learning_rate": 3e-06, "loss": -14.9143, "step": 636 }, { "completion_length": 117.18750381469727, "epoch": 0.05682679869753334, "grad_norm": 152.56382751464844, "learning_rate": 3e-06, "loss": -53.4148, "reward": 2.4275625944137573, "reward_std": 0.3748088702559471, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18797916173934937, "step": 637, "zero_std_ratio": 0.0 }, { "epoch": 0.056916008742584415, "grad_norm": 187.37356567382812, "learning_rate": 3e-06, "loss": -49.109, "step": 638 }, { "epoch": 0.05700521878763549, "grad_norm": 188.10548400878906, "learning_rate": 3e-06, "loss": -49.7143, "step": 639 }, { "epoch": 0.05709442883268656, "grad_norm": 152.7540283203125, "learning_rate": 3e-06, "loss": -46.4195, "step": 640 }, { "epoch": 0.057183638877737634, "grad_norm": 191.8466796875, "learning_rate": 3e-06, "loss": -54.4961, "step": 641 }, { "epoch": 0.05727284892278871, "grad_norm": 160.72772216796875, "learning_rate": 3e-06, "loss": -46.0513, "step": 642 }, { "epoch": 0.05736205896783978, "grad_norm": 163.2805938720703, "learning_rate": 3e-06, "loss": -55.7387, "step": 643 }, { "epoch": 0.057451269012890854, "grad_norm": 189.55470275878906, "learning_rate": 3e-06, "loss": -51.4447, "step": 644 }, { "epoch": 0.05754047905794193, "grad_norm": 182.82583618164062, "learning_rate": 3e-06, "loss": -51.1561, "step": 645 }, { "epoch": 0.057629689102992994, "grad_norm": 177.5033721923828, "learning_rate": 3e-06, "loss": -49.5066, "step": 646 }, { "epoch": 0.05771889914804407, "grad_norm": 208.1852569580078, "learning_rate": 3e-06, "loss": -59.1063, "step": 647 }, { "epoch": 0.05780810919309514, "grad_norm": 185.0785369873047, "learning_rate": 3e-06, "loss": -49.7641, "step": 648 }, { "completion_length": 127.75000762939453, "epoch": 0.057897319238146214, "grad_norm": 332.4126892089844, "learning_rate": 3e-06, "loss": 24.9465, "reward": 2.10916668176651, "reward_std": 0.7774414718151093, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15083333104848862, "step": 649, "zero_std_ratio": 0.0 }, { "epoch": 0.05798652928319729, "grad_norm": 274.673828125, "learning_rate": 3e-06, "loss": 19.6904, "step": 650 }, { "epoch": 0.05807573932824836, "grad_norm": 225.66183471679688, "learning_rate": 3e-06, "loss": 17.079, "step": 651 }, { "epoch": 0.058164949373299434, "grad_norm": 236.45230102539062, "learning_rate": 3e-06, "loss": 16.4363, "step": 652 }, { "epoch": 0.05825415941835051, "grad_norm": 304.5491943359375, "learning_rate": 3e-06, "loss": 33.7894, "step": 653 }, { "epoch": 0.05834336946340158, "grad_norm": 294.2102966308594, "learning_rate": 3e-06, "loss": 12.1637, "step": 654 }, { "epoch": 0.05843257950845265, "grad_norm": 307.0853271484375, "learning_rate": 3e-06, "loss": 23.9968, "step": 655 }, { "epoch": 0.05852178955350373, "grad_norm": 267.52960205078125, "learning_rate": 3e-06, "loss": 18.2069, "step": 656 }, { "epoch": 0.0586109995985548, "grad_norm": 219.6736602783203, "learning_rate": 3e-06, "loss": 16.1471, "step": 657 }, { "epoch": 0.05870020964360587, "grad_norm": 312.0810852050781, "learning_rate": 3e-06, "loss": 15.3629, "step": 658 }, { "epoch": 0.05878941968865694, "grad_norm": 355.7622985839844, "learning_rate": 3e-06, "loss": 32.1285, "step": 659 }, { "epoch": 0.05887862973370801, "grad_norm": 243.60047912597656, "learning_rate": 3e-06, "loss": 10.9719, "step": 660 }, { "completion_length": 128.37500381469727, "epoch": 0.058967839778759086, "grad_norm": 160.17491149902344, "learning_rate": 3e-06, "loss": -4.7968, "reward": 2.220729112625122, "reward_std": 0.37828393280506134, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13739583641290665, "step": 661, "zero_std_ratio": 0.0 }, { "epoch": 0.05905704982381016, "grad_norm": 213.04843139648438, "learning_rate": 3e-06, "loss": -17.5158, "step": 662 }, { "epoch": 0.05914625986886123, "grad_norm": 360.1976318359375, "learning_rate": 3e-06, "loss": -5.8696, "step": 663 }, { "epoch": 0.059235469913912306, "grad_norm": 165.9031524658203, "learning_rate": 3e-06, "loss": 9.1976, "step": 664 }, { "epoch": 0.05932467995896338, "grad_norm": 185.02491760253906, "learning_rate": 3e-06, "loss": 0.8362, "step": 665 }, { "epoch": 0.05941389000401445, "grad_norm": 186.4868927001953, "learning_rate": 3e-06, "loss": -4.8959, "step": 666 }, { "epoch": 0.059503100049065526, "grad_norm": 172.38906860351562, "learning_rate": 3e-06, "loss": -8.3078, "step": 667 }, { "epoch": 0.0595923100941166, "grad_norm": 205.17637634277344, "learning_rate": 3e-06, "loss": -19.4471, "step": 668 }, { "epoch": 0.05968152013916767, "grad_norm": 412.4108581542969, "learning_rate": 3e-06, "loss": -9.0223, "step": 669 }, { "epoch": 0.059770730184218746, "grad_norm": 165.48020935058594, "learning_rate": 3e-06, "loss": 5.0711, "step": 670 }, { "epoch": 0.05985994022926982, "grad_norm": 185.8058624267578, "learning_rate": 3e-06, "loss": -3.3781, "step": 671 }, { "epoch": 0.059949150274320885, "grad_norm": 214.62686157226562, "learning_rate": 3e-06, "loss": -9.8208, "step": 672 }, { "completion_length": 110.20833587646484, "epoch": 0.06003836031937196, "grad_norm": 223.97824096679688, "learning_rate": 3e-06, "loss": 17.6495, "reward": 1.9757083654403687, "reward_std": 0.7511122822761536, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16320832818746567, "step": 673, "zero_std_ratio": 0.0 }, { "epoch": 0.06012757036442303, "grad_norm": 180.975341796875, "learning_rate": 3e-06, "loss": -3.9399, "step": 674 }, { "epoch": 0.060216780409474105, "grad_norm": 222.6065216064453, "learning_rate": 3e-06, "loss": 10.3809, "step": 675 }, { "epoch": 0.06030599045452518, "grad_norm": 192.075439453125, "learning_rate": 3e-06, "loss": 14.9998, "step": 676 }, { "epoch": 0.06039520049957625, "grad_norm": 200.7101593017578, "learning_rate": 3e-06, "loss": 2.2906, "step": 677 }, { "epoch": 0.060484410544627325, "grad_norm": 200.1593017578125, "learning_rate": 3e-06, "loss": 13.137, "step": 678 }, { "epoch": 0.0605736205896784, "grad_norm": 185.83168029785156, "learning_rate": 3e-06, "loss": 16.2497, "step": 679 }, { "epoch": 0.06066283063472947, "grad_norm": 208.6911163330078, "learning_rate": 3e-06, "loss": -5.308, "step": 680 }, { "epoch": 0.060752040679780545, "grad_norm": 231.81312561035156, "learning_rate": 3e-06, "loss": 8.1785, "step": 681 }, { "epoch": 0.06084125072483162, "grad_norm": 187.45535278320312, "learning_rate": 3e-06, "loss": 12.5542, "step": 682 }, { "epoch": 0.06093046076988269, "grad_norm": 182.04257202148438, "learning_rate": 3e-06, "loss": -1.0645, "step": 683 }, { "epoch": 0.061019670814933764, "grad_norm": 186.71937561035156, "learning_rate": 3e-06, "loss": 10.3416, "step": 684 }, { "completion_length": 130.4791717529297, "epoch": 0.06110888085998484, "grad_norm": 169.67178344726562, "learning_rate": 3e-06, "loss": 32.9847, "reward": 2.2736042737960815, "reward_std": 0.5084125399589539, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14860416948795319, "step": 685, "zero_std_ratio": 0.0 }, { "epoch": 0.061198090905035904, "grad_norm": 231.2995147705078, "learning_rate": 3e-06, "loss": 18.3002, "step": 686 }, { "epoch": 0.06128730095008698, "grad_norm": 162.75083923339844, "learning_rate": 3e-06, "loss": 29.4583, "step": 687 }, { "epoch": 0.06137651099513805, "grad_norm": 209.54966735839844, "learning_rate": 3e-06, "loss": 42.4661, "step": 688 }, { "epoch": 0.061465721040189124, "grad_norm": 396.1800537109375, "learning_rate": 3e-06, "loss": 42.6963, "step": 689 }, { "epoch": 0.0615549310852402, "grad_norm": 144.02049255371094, "learning_rate": 3e-06, "loss": 23.2087, "step": 690 }, { "epoch": 0.06164414113029127, "grad_norm": 179.72213745117188, "learning_rate": 3e-06, "loss": 28.8186, "step": 691 }, { "epoch": 0.061733351175342344, "grad_norm": 143.9263153076172, "learning_rate": 3e-06, "loss": 16.2638, "step": 692 }, { "epoch": 0.06182256122039342, "grad_norm": 148.50750732421875, "learning_rate": 3e-06, "loss": 27.5098, "step": 693 }, { "epoch": 0.06191177126544449, "grad_norm": 178.09515380859375, "learning_rate": 3e-06, "loss": 38.1567, "step": 694 }, { "epoch": 0.06200098131049556, "grad_norm": 319.7694396972656, "learning_rate": 3e-06, "loss": 36.9073, "step": 695 }, { "epoch": 0.06209019135554664, "grad_norm": 137.5644989013672, "learning_rate": 3e-06, "loss": 19.9691, "step": 696 }, { "completion_length": 138.20833587646484, "epoch": 0.06217940140059771, "grad_norm": 243.18804931640625, "learning_rate": 3e-06, "loss": 63.9782, "reward": 1.6311666369438171, "reward_std": 0.6314830482006073, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13116667047142982, "step": 697, "zero_std_ratio": 0.0 }, { "epoch": 0.06226861144564878, "grad_norm": 248.19979858398438, "learning_rate": 3e-06, "loss": 62.0708, "step": 698 }, { "epoch": 0.06235782149069985, "grad_norm": 192.6903839111328, "learning_rate": 3e-06, "loss": 44.2788, "step": 699 }, { "epoch": 0.06244703153575092, "grad_norm": 187.61729431152344, "learning_rate": 3e-06, "loss": 45.635, "step": 700 }, { "epoch": 0.062536241580802, "grad_norm": 270.56439208984375, "learning_rate": 3e-06, "loss": 61.8371, "step": 701 }, { "epoch": 0.06262545162585308, "grad_norm": 186.34654235839844, "learning_rate": 3e-06, "loss": 35.0754, "step": 702 }, { "epoch": 0.06271466167090414, "grad_norm": 193.72666931152344, "learning_rate": 3e-06, "loss": 56.4102, "step": 703 }, { "epoch": 0.06280387171595522, "grad_norm": 213.4738006591797, "learning_rate": 3e-06, "loss": 55.6052, "step": 704 }, { "epoch": 0.06289308176100629, "grad_norm": 178.9663543701172, "learning_rate": 3e-06, "loss": 38.9399, "step": 705 }, { "epoch": 0.06298229180605736, "grad_norm": 155.2235870361328, "learning_rate": 3e-06, "loss": 40.2563, "step": 706 }, { "epoch": 0.06307150185110844, "grad_norm": 190.83424377441406, "learning_rate": 3e-06, "loss": 54.1227, "step": 707 }, { "epoch": 0.0631607118961595, "grad_norm": 151.27175903320312, "learning_rate": 3e-06, "loss": 29.2768, "step": 708 }, { "completion_length": 177.89584350585938, "epoch": 0.06324992194121058, "grad_norm": 383.90155029296875, "learning_rate": 3e-06, "loss": -57.4567, "reward": 1.6143542528152466, "reward_std": 1.1507561206817627, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.041437502950429916, "step": 709, "zero_std_ratio": 0.0 }, { "epoch": 0.06333913198626165, "grad_norm": 228.4014892578125, "learning_rate": 3e-06, "loss": -22.536, "step": 710 }, { "epoch": 0.06342834203131273, "grad_norm": 247.7998809814453, "learning_rate": 3e-06, "loss": -34.7459, "step": 711 }, { "epoch": 0.0635175520763638, "grad_norm": 256.2286376953125, "learning_rate": 3e-06, "loss": -30.4245, "step": 712 }, { "epoch": 0.06360676212141488, "grad_norm": 254.9169158935547, "learning_rate": 3e-06, "loss": -38.2843, "step": 713 }, { "epoch": 0.06369597216646594, "grad_norm": 321.43609619140625, "learning_rate": 3e-06, "loss": -35.5213, "step": 714 }, { "epoch": 0.06378518221151702, "grad_norm": 349.0517272949219, "learning_rate": 3e-06, "loss": -55.9526, "step": 715 }, { "epoch": 0.06387439225656809, "grad_norm": 209.25282287597656, "learning_rate": 3e-06, "loss": -22.3323, "step": 716 }, { "epoch": 0.06396360230161917, "grad_norm": 247.7156219482422, "learning_rate": 3e-06, "loss": -35.9537, "step": 717 }, { "epoch": 0.06405281234667023, "grad_norm": 274.3576965332031, "learning_rate": 3e-06, "loss": -33.4144, "step": 718 }, { "epoch": 0.0641420223917213, "grad_norm": 287.4893798828125, "learning_rate": 3e-06, "loss": -40.4848, "step": 719 }, { "epoch": 0.06423123243677238, "grad_norm": 375.7614440917969, "learning_rate": 3e-06, "loss": -39.3635, "step": 720 }, { "completion_length": 109.22916793823242, "epoch": 0.06432044248182345, "grad_norm": 123.53705596923828, "learning_rate": 3e-06, "loss": -30.919, "reward": 2.3711042404174805, "reward_std": 0.4830681085586548, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1940208300948143, "step": 721, "zero_std_ratio": 0.0 }, { "epoch": 0.06440965252687453, "grad_norm": 114.69866943359375, "learning_rate": 3e-06, "loss": -29.2645, "step": 722 }, { "epoch": 0.0644988625719256, "grad_norm": 118.97370910644531, "learning_rate": 3e-06, "loss": -25.0985, "step": 723 }, { "epoch": 0.06458807261697667, "grad_norm": 121.6566162109375, "learning_rate": 3e-06, "loss": -29.0591, "step": 724 }, { "epoch": 0.06467728266202774, "grad_norm": 182.92691040039062, "learning_rate": 3e-06, "loss": -28.0901, "step": 725 }, { "epoch": 0.06476649270707882, "grad_norm": 156.50718688964844, "learning_rate": 3e-06, "loss": -30.7831, "step": 726 }, { "epoch": 0.06485570275212989, "grad_norm": 132.53089904785156, "learning_rate": 3e-06, "loss": -33.3058, "step": 727 }, { "epoch": 0.06494491279718097, "grad_norm": 112.21791076660156, "learning_rate": 3e-06, "loss": -31.8015, "step": 728 }, { "epoch": 0.06503412284223203, "grad_norm": 130.97052001953125, "learning_rate": 3e-06, "loss": -27.7953, "step": 729 }, { "epoch": 0.06512333288728311, "grad_norm": 128.9853515625, "learning_rate": 3e-06, "loss": -32.2601, "step": 730 }, { "epoch": 0.06521254293233418, "grad_norm": 146.0636444091797, "learning_rate": 3e-06, "loss": -31.9068, "step": 731 }, { "epoch": 0.06530175297738525, "grad_norm": 148.94358825683594, "learning_rate": 3e-06, "loss": -35.98, "step": 732 }, { "completion_length": 133.77083587646484, "epoch": 0.06539096302243633, "grad_norm": 200.87583923339844, "learning_rate": 3e-06, "loss": 5.8156, "reward": 1.7448542714118958, "reward_std": 0.7897588908672333, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1302708424627781, "step": 733, "zero_std_ratio": 0.0 }, { "epoch": 0.0654801730674874, "grad_norm": 254.33050537109375, "learning_rate": 3e-06, "loss": 39.6914, "step": 734 }, { "epoch": 0.06556938311253847, "grad_norm": 207.40354919433594, "learning_rate": 3e-06, "loss": 18.8055, "step": 735 }, { "epoch": 0.06565859315758954, "grad_norm": 255.23114013671875, "learning_rate": 3e-06, "loss": 12.5947, "step": 736 }, { "epoch": 0.06574780320264062, "grad_norm": 183.82200622558594, "learning_rate": 3e-06, "loss": 11.0044, "step": 737 }, { "epoch": 0.06583701324769169, "grad_norm": 226.2420654296875, "learning_rate": 3e-06, "loss": 0.4847, "step": 738 }, { "epoch": 0.06592622329274277, "grad_norm": 233.6065673828125, "learning_rate": 3e-06, "loss": 4.5426, "step": 739 }, { "epoch": 0.06601543333779383, "grad_norm": 264.205078125, "learning_rate": 3e-06, "loss": 39.3686, "step": 740 }, { "epoch": 0.06610464338284491, "grad_norm": 241.85284423828125, "learning_rate": 3e-06, "loss": 17.7268, "step": 741 }, { "epoch": 0.06619385342789598, "grad_norm": 221.0516357421875, "learning_rate": 3e-06, "loss": 13.2982, "step": 742 }, { "epoch": 0.06628306347294706, "grad_norm": 212.37222290039062, "learning_rate": 3e-06, "loss": 7.7467, "step": 743 }, { "epoch": 0.06637227351799813, "grad_norm": 263.42919921875, "learning_rate": 3e-06, "loss": -2.1754, "step": 744 }, { "completion_length": 125.64583587646484, "epoch": 0.0664614835630492, "grad_norm": 193.34835815429688, "learning_rate": 3e-06, "loss": 40.2546, "reward": 1.7169584035873413, "reward_std": 0.4941745698451996, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15445833653211594, "step": 745, "zero_std_ratio": 0.0 }, { "epoch": 0.06655069360810027, "grad_norm": 212.19464111328125, "learning_rate": 3e-06, "loss": 41.4745, "step": 746 }, { "epoch": 0.06663990365315134, "grad_norm": 237.55845642089844, "learning_rate": 3e-06, "loss": 47.5379, "step": 747 }, { "epoch": 0.06672911369820242, "grad_norm": 185.4857635498047, "learning_rate": 3e-06, "loss": 43.0404, "step": 748 }, { "epoch": 0.06681832374325349, "grad_norm": 158.21678161621094, "learning_rate": 3e-06, "loss": 32.8558, "step": 749 }, { "epoch": 0.06690753378830457, "grad_norm": 212.29397583007812, "learning_rate": 3e-06, "loss": 41.9557, "step": 750 }, { "epoch": 0.06699674383335563, "grad_norm": 181.2362060546875, "learning_rate": 3e-06, "loss": 38.4185, "step": 751 }, { "epoch": 0.06708595387840671, "grad_norm": 186.73841857910156, "learning_rate": 3e-06, "loss": 37.1992, "step": 752 }, { "epoch": 0.06717516392345778, "grad_norm": 182.0499267578125, "learning_rate": 3e-06, "loss": 42.4944, "step": 753 }, { "epoch": 0.06726437396850886, "grad_norm": 161.4265899658203, "learning_rate": 3e-06, "loss": 40.1143, "step": 754 }, { "epoch": 0.06735358401355993, "grad_norm": 145.66175842285156, "learning_rate": 3e-06, "loss": 29.066, "step": 755 }, { "epoch": 0.067442794058611, "grad_norm": 188.43362426757812, "learning_rate": 3e-06, "loss": 34.8647, "step": 756 }, { "completion_length": 115.64583587646484, "epoch": 0.06753200410366207, "grad_norm": 176.6951446533203, "learning_rate": 3e-06, "loss": 23.7273, "reward": 1.9728541374206543, "reward_std": 0.22583025321364403, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17077083885669708, "step": 757, "zero_std_ratio": 0.0 }, { "epoch": 0.06762121414871315, "grad_norm": 134.009521484375, "learning_rate": 3e-06, "loss": 29.2333, "step": 758 }, { "epoch": 0.06771042419376422, "grad_norm": 131.3091278076172, "learning_rate": 3e-06, "loss": 29.9313, "step": 759 }, { "epoch": 0.06779963423881528, "grad_norm": 128.05767822265625, "learning_rate": 3e-06, "loss": 22.7093, "step": 760 }, { "epoch": 0.06788884428386636, "grad_norm": 115.9305419921875, "learning_rate": 3e-06, "loss": 23.3578, "step": 761 }, { "epoch": 0.06797805432891743, "grad_norm": 127.96971893310547, "learning_rate": 3e-06, "loss": 26.8069, "step": 762 }, { "epoch": 0.06806726437396851, "grad_norm": 172.38279724121094, "learning_rate": 3e-06, "loss": 20.0156, "step": 763 }, { "epoch": 0.06815647441901958, "grad_norm": 122.19217681884766, "learning_rate": 3e-06, "loss": 23.256, "step": 764 }, { "epoch": 0.06824568446407066, "grad_norm": 98.1166763305664, "learning_rate": 3e-06, "loss": 25.2087, "step": 765 }, { "epoch": 0.06833489450912172, "grad_norm": 104.1299819946289, "learning_rate": 3e-06, "loss": 18.9453, "step": 766 }, { "epoch": 0.0684241045541728, "grad_norm": 108.13124084472656, "learning_rate": 3e-06, "loss": 18.3841, "step": 767 }, { "epoch": 0.06851331459922387, "grad_norm": 107.33203887939453, "learning_rate": 3e-06, "loss": 21.8827, "step": 768 }, { "completion_length": 140.7291717529297, "epoch": 0.06860252464427495, "grad_norm": 176.08241271972656, "learning_rate": 3e-06, "loss": 1.9837, "reward": 2.0843957662582397, "reward_std": 0.670438677072525, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11564583331346512, "step": 769, "zero_std_ratio": 0.0 }, { "epoch": 0.06869173468932602, "grad_norm": 184.2529754638672, "learning_rate": 3e-06, "loss": -9.1612, "step": 770 }, { "epoch": 0.0687809447343771, "grad_norm": 124.57987213134766, "learning_rate": 3e-06, "loss": -1.167, "step": 771 }, { "epoch": 0.06887015477942816, "grad_norm": 146.37869262695312, "learning_rate": 3e-06, "loss": 7.9908, "step": 772 }, { "epoch": 0.06895936482447923, "grad_norm": 159.84788513183594, "learning_rate": 3e-06, "loss": -7.8619, "step": 773 }, { "epoch": 0.06904857486953031, "grad_norm": 165.3255157470703, "learning_rate": 3e-06, "loss": -9.9255, "step": 774 }, { "epoch": 0.06913778491458138, "grad_norm": 147.72352600097656, "learning_rate": 3e-06, "loss": 0.4622, "step": 775 }, { "epoch": 0.06922699495963246, "grad_norm": 161.2108917236328, "learning_rate": 3e-06, "loss": -9.3763, "step": 776 }, { "epoch": 0.06931620500468352, "grad_norm": 117.9613265991211, "learning_rate": 3e-06, "loss": -2.9584, "step": 777 }, { "epoch": 0.0694054150497346, "grad_norm": 127.0103988647461, "learning_rate": 3e-06, "loss": 4.5212, "step": 778 }, { "epoch": 0.06949462509478567, "grad_norm": 161.12701416015625, "learning_rate": 3e-06, "loss": -7.3502, "step": 779 }, { "epoch": 0.06958383513983675, "grad_norm": 147.03277587890625, "learning_rate": 3e-06, "loss": -12.0696, "step": 780 }, { "completion_length": 146.75000762939453, "epoch": 0.06967304518488782, "grad_norm": 93.77994537353516, "learning_rate": 3e-06, "loss": -9.5819, "reward": 2.0504584312438965, "reward_std": 0.46765226125717163, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.12337498925626278, "step": 781, "zero_std_ratio": 0.0 }, { "epoch": 0.0697622552299389, "grad_norm": 109.34754180908203, "learning_rate": 3e-06, "loss": -14.1501, "step": 782 }, { "epoch": 0.06985146527498996, "grad_norm": 83.16600799560547, "learning_rate": 3e-06, "loss": -17.3924, "step": 783 }, { "epoch": 0.06994067532004104, "grad_norm": 83.6520767211914, "learning_rate": 3e-06, "loss": -3.3917, "step": 784 }, { "epoch": 0.07002988536509211, "grad_norm": 111.59048461914062, "learning_rate": 3e-06, "loss": -18.6001, "step": 785 }, { "epoch": 0.07011909541014318, "grad_norm": 81.26487731933594, "learning_rate": 3e-06, "loss": -4.8149, "step": 786 }, { "epoch": 0.07020830545519426, "grad_norm": 88.5013198852539, "learning_rate": 3e-06, "loss": -10.8751, "step": 787 }, { "epoch": 0.07029751550024532, "grad_norm": 106.22066497802734, "learning_rate": 3e-06, "loss": -14.8712, "step": 788 }, { "epoch": 0.0703867255452964, "grad_norm": 95.8133544921875, "learning_rate": 3e-06, "loss": -17.8064, "step": 789 }, { "epoch": 0.07047593559034747, "grad_norm": 105.98171997070312, "learning_rate": 3e-06, "loss": -4.9442, "step": 790 }, { "epoch": 0.07056514563539855, "grad_norm": 108.74724578857422, "learning_rate": 3e-06, "loss": -20.1345, "step": 791 }, { "epoch": 0.07065435568044962, "grad_norm": 217.0005340576172, "learning_rate": 3e-06, "loss": -5.632, "step": 792 }, { "completion_length": 137.1041717529297, "epoch": 0.0707435657255007, "grad_norm": 186.0244140625, "learning_rate": 3e-06, "loss": 20.4605, "reward": 1.8869168162345886, "reward_std": 0.7256337702274323, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13691667094826698, "step": 793, "zero_std_ratio": 0.0 }, { "epoch": 0.07083277577055176, "grad_norm": 139.76405334472656, "learning_rate": 3e-06, "loss": 14.9958, "step": 794 }, { "epoch": 0.07092198581560284, "grad_norm": 155.77352905273438, "learning_rate": 3e-06, "loss": 24.0522, "step": 795 }, { "epoch": 0.07101119586065391, "grad_norm": 147.90013122558594, "learning_rate": 3e-06, "loss": 30.8391, "step": 796 }, { "epoch": 0.07110040590570499, "grad_norm": 154.99143981933594, "learning_rate": 3e-06, "loss": 24.957, "step": 797 }, { "epoch": 0.07118961595075605, "grad_norm": 154.12411499023438, "learning_rate": 3e-06, "loss": 32.2635, "step": 798 }, { "epoch": 0.07127882599580712, "grad_norm": 142.52955627441406, "learning_rate": 3e-06, "loss": 19.5332, "step": 799 }, { "epoch": 0.0713680360408582, "grad_norm": 140.1791229248047, "learning_rate": 3e-06, "loss": 13.5796, "step": 800 }, { "epoch": 0.07145724608590927, "grad_norm": 144.1186981201172, "learning_rate": 3e-06, "loss": 21.398, "step": 801 }, { "epoch": 0.07154645613096035, "grad_norm": 139.25230407714844, "learning_rate": 3e-06, "loss": 28.6905, "step": 802 }, { "epoch": 0.07163566617601141, "grad_norm": 151.95538330078125, "learning_rate": 3e-06, "loss": 21.6279, "step": 803 }, { "epoch": 0.0717248762210625, "grad_norm": 143.84974670410156, "learning_rate": 3e-06, "loss": 29.0759, "step": 804 }, { "completion_length": 136.22916793823242, "epoch": 0.07181408626611356, "grad_norm": 92.62751770019531, "learning_rate": 3e-06, "loss": -21.4484, "reward": 2.414271116256714, "reward_std": 0.33920496702194214, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12260417267680168, "step": 805, "zero_std_ratio": 0.0 }, { "epoch": 0.07190329631116464, "grad_norm": 60.42007827758789, "learning_rate": 3e-06, "loss": -12.7491, "step": 806 }, { "epoch": 0.07199250635621571, "grad_norm": 94.1617660522461, "learning_rate": 3e-06, "loss": -14.5131, "step": 807 }, { "epoch": 0.07208171640126679, "grad_norm": 68.65794372558594, "learning_rate": 3e-06, "loss": -14.1192, "step": 808 }, { "epoch": 0.07217092644631785, "grad_norm": 79.4446792602539, "learning_rate": 3e-06, "loss": -12.4833, "step": 809 }, { "epoch": 0.07226013649136893, "grad_norm": 82.45279693603516, "learning_rate": 3e-06, "loss": -8.9581, "step": 810 }, { "epoch": 0.07234934653642, "grad_norm": 101.833984375, "learning_rate": 3e-06, "loss": -22.4901, "step": 811 }, { "epoch": 0.07243855658147107, "grad_norm": 78.60984802246094, "learning_rate": 3e-06, "loss": -12.9701, "step": 812 }, { "epoch": 0.07252776662652215, "grad_norm": 100.63545989990234, "learning_rate": 3e-06, "loss": -15.4353, "step": 813 }, { "epoch": 0.07261697667157321, "grad_norm": 66.36518096923828, "learning_rate": 3e-06, "loss": -14.7871, "step": 814 }, { "epoch": 0.0727061867166243, "grad_norm": 73.31499481201172, "learning_rate": 3e-06, "loss": -13.5328, "step": 815 }, { "epoch": 0.07279539676167536, "grad_norm": 81.8609848022461, "learning_rate": 3e-06, "loss": -10.202, "step": 816 }, { "completion_length": 176.0416717529297, "epoch": 0.07288460680672644, "grad_norm": 85.71199035644531, "learning_rate": 3e-06, "loss": -2.3775, "reward": 1.308291733264923, "reward_std": 0.4305167943239212, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016624998301267624, "step": 817, "zero_std_ratio": 0.0 }, { "epoch": 0.0729738168517775, "grad_norm": 113.99066162109375, "learning_rate": 3e-06, "loss": 3.9682, "step": 818 }, { "epoch": 0.07306302689682859, "grad_norm": 90.42432403564453, "learning_rate": 3e-06, "loss": -8.968, "step": 819 }, { "epoch": 0.07315223694187965, "grad_norm": 86.66998291015625, "learning_rate": 3e-06, "loss": 5.9931, "step": 820 }, { "epoch": 0.07324144698693073, "grad_norm": 112.87352752685547, "learning_rate": 3e-06, "loss": 3.7934, "step": 821 }, { "epoch": 0.0733306570319818, "grad_norm": 116.59276580810547, "learning_rate": 3e-06, "loss": -12.3296, "step": 822 }, { "epoch": 0.07341986707703288, "grad_norm": 95.53129577636719, "learning_rate": 3e-06, "loss": -3.2342, "step": 823 }, { "epoch": 0.07350907712208395, "grad_norm": 134.6486053466797, "learning_rate": 3e-06, "loss": 2.4981, "step": 824 }, { "epoch": 0.07359828716713501, "grad_norm": 80.79833221435547, "learning_rate": 3e-06, "loss": -9.3798, "step": 825 }, { "epoch": 0.07368749721218609, "grad_norm": 107.38970184326172, "learning_rate": 3e-06, "loss": 4.8459, "step": 826 }, { "epoch": 0.07377670725723716, "grad_norm": 91.76937866210938, "learning_rate": 3e-06, "loss": 2.6928, "step": 827 }, { "epoch": 0.07386591730228824, "grad_norm": 112.15656280517578, "learning_rate": 3e-06, "loss": -13.4598, "step": 828 }, { "completion_length": 161.2916717529297, "epoch": 0.0739551273473393, "grad_norm": 225.7854461669922, "learning_rate": 3e-06, "loss": -7.2711, "reward": 1.8568333387374878, "reward_std": 0.38829553686082363, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0755833312869072, "step": 829, "zero_std_ratio": 0.0 }, { "epoch": 0.07404433739239039, "grad_norm": 203.64381408691406, "learning_rate": 3e-06, "loss": -24.0722, "step": 830 }, { "epoch": 0.07413354743744145, "grad_norm": 470.9283752441406, "learning_rate": 3e-06, "loss": 7.9581, "step": 831 }, { "epoch": 0.07422275748249253, "grad_norm": 261.8198547363281, "learning_rate": 3e-06, "loss": -13.8961, "step": 832 }, { "epoch": 0.0743119675275436, "grad_norm": 238.60263061523438, "learning_rate": 3e-06, "loss": -17.5778, "step": 833 }, { "epoch": 0.07440117757259468, "grad_norm": 251.20684814453125, "learning_rate": 3e-06, "loss": -13.8026, "step": 834 }, { "epoch": 0.07449038761764575, "grad_norm": 233.15805053710938, "learning_rate": 3e-06, "loss": -9.8419, "step": 835 }, { "epoch": 0.07457959766269683, "grad_norm": 188.42831420898438, "learning_rate": 3e-06, "loss": -27.1095, "step": 836 }, { "epoch": 0.07466880770774789, "grad_norm": 330.0888671875, "learning_rate": 3e-06, "loss": 2.663, "step": 837 }, { "epoch": 0.07475801775279897, "grad_norm": 187.619873046875, "learning_rate": 3e-06, "loss": -17.3052, "step": 838 }, { "epoch": 0.07484722779785004, "grad_norm": 273.087646484375, "learning_rate": 3e-06, "loss": -20.1137, "step": 839 }, { "epoch": 0.0749364378429011, "grad_norm": 231.94540405273438, "learning_rate": 3e-06, "loss": -17.6819, "step": 840 }, { "completion_length": 124.5625, "epoch": 0.07502564788795218, "grad_norm": 169.8292236328125, "learning_rate": 3e-06, "loss": -14.9169, "reward": 2.1858333349227905, "reward_std": 0.5357859879732132, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17541665583848953, "step": 841, "zero_std_ratio": 0.0 }, { "epoch": 0.07511485793300325, "grad_norm": 151.44129943847656, "learning_rate": 3e-06, "loss": -38.7775, "step": 842 }, { "epoch": 0.07520406797805433, "grad_norm": 141.34671020507812, "learning_rate": 3e-06, "loss": -32.3555, "step": 843 }, { "epoch": 0.0752932780231054, "grad_norm": 117.83955383300781, "learning_rate": 3e-06, "loss": -35.7298, "step": 844 }, { "epoch": 0.07538248806815648, "grad_norm": 113.38582611083984, "learning_rate": 3e-06, "loss": -36.8355, "step": 845 }, { "epoch": 0.07547169811320754, "grad_norm": 147.53521728515625, "learning_rate": 3e-06, "loss": -34.7305, "step": 846 }, { "epoch": 0.07556090815825862, "grad_norm": 167.8444061279297, "learning_rate": 3e-06, "loss": -17.6609, "step": 847 }, { "epoch": 0.07565011820330969, "grad_norm": 177.19976806640625, "learning_rate": 3e-06, "loss": -42.6765, "step": 848 }, { "epoch": 0.07573932824836077, "grad_norm": 207.4672393798828, "learning_rate": 3e-06, "loss": -36.7629, "step": 849 }, { "epoch": 0.07582853829341184, "grad_norm": 124.84293365478516, "learning_rate": 3e-06, "loss": -39.1349, "step": 850 }, { "epoch": 0.07591774833846292, "grad_norm": 134.89764404296875, "learning_rate": 3e-06, "loss": -41.2224, "step": 851 }, { "epoch": 0.07600695838351398, "grad_norm": 161.6527862548828, "learning_rate": 3e-06, "loss": -39.203, "step": 852 }, { "completion_length": 147.7291717529297, "epoch": 0.07609616842856505, "grad_norm": 371.6798095703125, "learning_rate": 3e-06, "loss": 88.7083, "reward": 2.0986876487731934, "reward_std": 0.5909168422222137, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09868749976158142, "step": 853, "zero_std_ratio": 0.0 }, { "epoch": 0.07618537847361613, "grad_norm": 385.34136962890625, "learning_rate": 3e-06, "loss": 86.0723, "step": 854 }, { "epoch": 0.0762745885186672, "grad_norm": 360.78021240234375, "learning_rate": 3e-06, "loss": 71.1204, "step": 855 }, { "epoch": 0.07636379856371828, "grad_norm": 293.267333984375, "learning_rate": 3e-06, "loss": 66.9494, "step": 856 }, { "epoch": 0.07645300860876934, "grad_norm": 440.7154846191406, "learning_rate": 3e-06, "loss": 88.5771, "step": 857 }, { "epoch": 0.07654221865382042, "grad_norm": 327.457275390625, "learning_rate": 3e-06, "loss": 58.8516, "step": 858 }, { "epoch": 0.07663142869887149, "grad_norm": 371.9436340332031, "learning_rate": 3e-06, "loss": 85.2973, "step": 859 }, { "epoch": 0.07672063874392257, "grad_norm": 389.5568542480469, "learning_rate": 3e-06, "loss": 79.8231, "step": 860 }, { "epoch": 0.07680984878897364, "grad_norm": 321.9656066894531, "learning_rate": 3e-06, "loss": 63.4841, "step": 861 }, { "epoch": 0.07689905883402472, "grad_norm": 284.66876220703125, "learning_rate": 3e-06, "loss": 59.0441, "step": 862 }, { "epoch": 0.07698826887907578, "grad_norm": 410.6514587402344, "learning_rate": 3e-06, "loss": 76.3094, "step": 863 }, { "epoch": 0.07707747892412686, "grad_norm": 284.0197448730469, "learning_rate": 3e-06, "loss": 49.468, "step": 864 }, { "completion_length": 144.1041717529297, "epoch": 0.07716668896917793, "grad_norm": 226.17822265625, "learning_rate": 3e-06, "loss": 56.9406, "reward": 2.379916787147522, "reward_std": 0.430880606174469, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08824999909847975, "step": 865, "zero_std_ratio": 0.0 }, { "epoch": 0.077255899014229, "grad_norm": 186.55560302734375, "learning_rate": 3e-06, "loss": 52.0502, "step": 866 }, { "epoch": 0.07734510905928008, "grad_norm": 184.80516052246094, "learning_rate": 3e-06, "loss": 46.5618, "step": 867 }, { "epoch": 0.07743431910433114, "grad_norm": 178.5349884033203, "learning_rate": 3e-06, "loss": 46.2696, "step": 868 }, { "epoch": 0.07752352914938222, "grad_norm": 148.83154296875, "learning_rate": 3e-06, "loss": 29.464, "step": 869 }, { "epoch": 0.07761273919443329, "grad_norm": 161.14889526367188, "learning_rate": 3e-06, "loss": 51.0483, "step": 870 }, { "epoch": 0.07770194923948437, "grad_norm": 192.32308959960938, "learning_rate": 3e-06, "loss": 47.9675, "step": 871 }, { "epoch": 0.07779115928453544, "grad_norm": 152.79583740234375, "learning_rate": 3e-06, "loss": 43.9552, "step": 872 }, { "epoch": 0.07788036932958652, "grad_norm": 151.7612762451172, "learning_rate": 3e-06, "loss": 38.7329, "step": 873 }, { "epoch": 0.07796957937463758, "grad_norm": 133.1282196044922, "learning_rate": 3e-06, "loss": 38.5692, "step": 874 }, { "epoch": 0.07805878941968866, "grad_norm": 103.07962036132812, "learning_rate": 3e-06, "loss": 24.2915, "step": 875 }, { "epoch": 0.07814799946473973, "grad_norm": 129.3807373046875, "learning_rate": 3e-06, "loss": 43.1067, "step": 876 }, { "completion_length": 127.54166793823242, "epoch": 0.07823720950979081, "grad_norm": 125.33885192871094, "learning_rate": 3e-06, "loss": -23.6161, "reward": 1.667020857334137, "reward_std": 0.5597978234291077, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17743750661611557, "step": 877, "zero_std_ratio": 0.0 }, { "epoch": 0.07832641955484188, "grad_norm": 138.50013732910156, "learning_rate": 3e-06, "loss": -16.1757, "step": 878 }, { "epoch": 0.07841562959989294, "grad_norm": 130.26280212402344, "learning_rate": 3e-06, "loss": -21.8816, "step": 879 }, { "epoch": 0.07850483964494402, "grad_norm": 141.026123046875, "learning_rate": 3e-06, "loss": -22.0761, "step": 880 }, { "epoch": 0.07859404968999509, "grad_norm": 126.53893280029297, "learning_rate": 3e-06, "loss": -23.4112, "step": 881 }, { "epoch": 0.07868325973504617, "grad_norm": 153.45120239257812, "learning_rate": 3e-06, "loss": -17.7169, "step": 882 }, { "epoch": 0.07877246978009723, "grad_norm": 122.84283447265625, "learning_rate": 3e-06, "loss": -24.8704, "step": 883 }, { "epoch": 0.07886167982514831, "grad_norm": 157.95201110839844, "learning_rate": 3e-06, "loss": -16.2334, "step": 884 }, { "epoch": 0.07895088987019938, "grad_norm": 136.01124572753906, "learning_rate": 3e-06, "loss": -23.3998, "step": 885 }, { "epoch": 0.07904009991525046, "grad_norm": 135.98423767089844, "learning_rate": 3e-06, "loss": -23.6646, "step": 886 }, { "epoch": 0.07912930996030153, "grad_norm": 131.24002075195312, "learning_rate": 3e-06, "loss": -25.7454, "step": 887 }, { "epoch": 0.07921852000535261, "grad_norm": 124.26398468017578, "learning_rate": 3e-06, "loss": -20.446, "step": 888 }, { "completion_length": 153.58333587646484, "epoch": 0.07930773005040367, "grad_norm": 240.47964477539062, "learning_rate": 3e-06, "loss": -52.9796, "reward": 1.8959583044052124, "reward_std": 0.6871494352817535, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10429166257381439, "step": 889, "zero_std_ratio": 0.0 }, { "epoch": 0.07939694009545475, "grad_norm": 205.4910430908203, "learning_rate": 3e-06, "loss": -52.7249, "step": 890 }, { "epoch": 0.07948615014050582, "grad_norm": 242.2780303955078, "learning_rate": 3e-06, "loss": -67.3242, "step": 891 }, { "epoch": 0.07957536018555689, "grad_norm": 262.0589599609375, "learning_rate": 3e-06, "loss": -66.9661, "step": 892 }, { "epoch": 0.07966457023060797, "grad_norm": 186.11415100097656, "learning_rate": 3e-06, "loss": -68.4566, "step": 893 }, { "epoch": 0.07975378027565903, "grad_norm": 254.95228576660156, "learning_rate": 3e-06, "loss": -66.0288, "step": 894 }, { "epoch": 0.07984299032071011, "grad_norm": 270.2388000488281, "learning_rate": 3e-06, "loss": -59.1402, "step": 895 }, { "epoch": 0.07993220036576118, "grad_norm": 232.1254119873047, "learning_rate": 3e-06, "loss": -58.3748, "step": 896 }, { "epoch": 0.08002141041081226, "grad_norm": 423.2415466308594, "learning_rate": 3e-06, "loss": -74.5377, "step": 897 }, { "epoch": 0.08011062045586333, "grad_norm": 289.6065673828125, "learning_rate": 3e-06, "loss": -76.1074, "step": 898 }, { "epoch": 0.0801998305009144, "grad_norm": 212.4766845703125, "learning_rate": 3e-06, "loss": -74.2601, "step": 899 }, { "epoch": 0.08028904054596547, "grad_norm": 286.6225891113281, "learning_rate": 3e-06, "loss": -74.3892, "step": 900 }, { "completion_length": 119.62500381469727, "epoch": 0.08037825059101655, "grad_norm": 123.23504638671875, "learning_rate": 3e-06, "loss": -22.0192, "reward": 2.5415626764297485, "reward_std": 0.190566536039114, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1457291655242443, "step": 901, "zero_std_ratio": 0.0 }, { "epoch": 0.08046746063606762, "grad_norm": 134.62196350097656, "learning_rate": 3e-06, "loss": -26.971, "step": 902 }, { "epoch": 0.0805566706811187, "grad_norm": 211.32479858398438, "learning_rate": 3e-06, "loss": -31.3724, "step": 903 }, { "epoch": 0.08064588072616977, "grad_norm": 160.55055236816406, "learning_rate": 3e-06, "loss": -22.228, "step": 904 }, { "epoch": 0.08073509077122083, "grad_norm": 125.40478515625, "learning_rate": 3e-06, "loss": -21.2257, "step": 905 }, { "epoch": 0.08082430081627191, "grad_norm": 111.1106948852539, "learning_rate": 3e-06, "loss": -22.3095, "step": 906 }, { "epoch": 0.08091351086132298, "grad_norm": 122.1114501953125, "learning_rate": 3e-06, "loss": -24.7909, "step": 907 }, { "epoch": 0.08100272090637406, "grad_norm": 156.01158142089844, "learning_rate": 3e-06, "loss": -31.0448, "step": 908 }, { "epoch": 0.08109193095142513, "grad_norm": 158.0888214111328, "learning_rate": 3e-06, "loss": -35.1506, "step": 909 }, { "epoch": 0.0811811409964762, "grad_norm": 156.11680603027344, "learning_rate": 3e-06, "loss": -26.2504, "step": 910 }, { "epoch": 0.08127035104152727, "grad_norm": 136.36370849609375, "learning_rate": 3e-06, "loss": -24.5191, "step": 911 }, { "epoch": 0.08135956108657835, "grad_norm": 138.4123077392578, "learning_rate": 3e-06, "loss": -25.2287, "step": 912 }, { "completion_length": 123.97917175292969, "epoch": 0.08144877113162942, "grad_norm": 69.13970184326172, "learning_rate": 3e-06, "loss": -4.2059, "reward": 2.349874973297119, "reward_std": 0.39924251288175583, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14154166728258133, "step": 913, "zero_std_ratio": 0.0 }, { "epoch": 0.0815379811766805, "grad_norm": 109.77488708496094, "learning_rate": 3e-06, "loss": -6.6804, "step": 914 }, { "epoch": 0.08162719122173157, "grad_norm": 108.82147216796875, "learning_rate": 3e-06, "loss": 1.8191, "step": 915 }, { "epoch": 0.08171640126678265, "grad_norm": 88.40335083007812, "learning_rate": 3e-06, "loss": -5.8692, "step": 916 }, { "epoch": 0.08180561131183371, "grad_norm": 76.1854019165039, "learning_rate": 3e-06, "loss": -1.3803, "step": 917 }, { "epoch": 0.08189482135688479, "grad_norm": 94.09133911132812, "learning_rate": 3e-06, "loss": -2.3375, "step": 918 }, { "epoch": 0.08198403140193586, "grad_norm": 84.88536071777344, "learning_rate": 3e-06, "loss": -5.6229, "step": 919 }, { "epoch": 0.08207324144698692, "grad_norm": 92.1208267211914, "learning_rate": 3e-06, "loss": -7.5509, "step": 920 }, { "epoch": 0.082162451492038, "grad_norm": 89.02661895751953, "learning_rate": 3e-06, "loss": 0.5948, "step": 921 }, { "epoch": 0.08225166153708907, "grad_norm": 95.09249114990234, "learning_rate": 3e-06, "loss": -6.4904, "step": 922 }, { "epoch": 0.08234087158214015, "grad_norm": 83.8741683959961, "learning_rate": 3e-06, "loss": -2.875, "step": 923 }, { "epoch": 0.08243008162719122, "grad_norm": 129.45420837402344, "learning_rate": 3e-06, "loss": -3.085, "step": 924 }, { "completion_length": 115.27083587646484, "epoch": 0.0825192916722423, "grad_norm": 250.4253387451172, "learning_rate": 3e-06, "loss": 9.4605, "reward": 2.1223334074020386, "reward_std": 0.7158277630805969, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.205666683614254, "step": 925, "zero_std_ratio": 0.0 }, { "epoch": 0.08260850171729336, "grad_norm": 318.8369140625, "learning_rate": 3e-06, "loss": 16.4358, "step": 926 }, { "epoch": 0.08269771176234444, "grad_norm": 314.372314453125, "learning_rate": 3e-06, "loss": 5.4103, "step": 927 }, { "epoch": 0.08278692180739551, "grad_norm": 255.00933837890625, "learning_rate": 3e-06, "loss": -3.1256, "step": 928 }, { "epoch": 0.08287613185244659, "grad_norm": 357.3619384765625, "learning_rate": 3e-06, "loss": 3.4132, "step": 929 }, { "epoch": 0.08296534189749766, "grad_norm": 409.3254089355469, "learning_rate": 3e-06, "loss": 23.0602, "step": 930 }, { "epoch": 0.08305455194254874, "grad_norm": 270.6861877441406, "learning_rate": 3e-06, "loss": 8.4527, "step": 931 }, { "epoch": 0.0831437619875998, "grad_norm": 507.520263671875, "learning_rate": 3e-06, "loss": 14.712, "step": 932 }, { "epoch": 0.08323297203265087, "grad_norm": 281.0194091796875, "learning_rate": 3e-06, "loss": 4.5989, "step": 933 }, { "epoch": 0.08332218207770195, "grad_norm": 275.3479309082031, "learning_rate": 3e-06, "loss": -5.1609, "step": 934 }, { "epoch": 0.08341139212275302, "grad_norm": 358.3206481933594, "learning_rate": 3e-06, "loss": 3.3467, "step": 935 }, { "epoch": 0.0835006021678041, "grad_norm": 403.45440673828125, "learning_rate": 3e-06, "loss": 20.7584, "step": 936 }, { "completion_length": 131.7291717529297, "epoch": 0.08358981221285516, "grad_norm": 112.89104461669922, "learning_rate": 3e-06, "loss": -12.4613, "reward": 1.9657083749771118, "reward_std": 0.33454202115535736, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13237499818205833, "step": 937, "zero_std_ratio": 0.0 }, { "epoch": 0.08367902225790624, "grad_norm": 158.0906524658203, "learning_rate": 3e-06, "loss": -6.7935, "step": 938 }, { "epoch": 0.08376823230295731, "grad_norm": 127.69352722167969, "learning_rate": 3e-06, "loss": -15.3734, "step": 939 }, { "epoch": 0.08385744234800839, "grad_norm": 207.05262756347656, "learning_rate": 3e-06, "loss": -25.0891, "step": 940 }, { "epoch": 0.08394665239305946, "grad_norm": 546.4678344726562, "learning_rate": 3e-06, "loss": -29.0194, "step": 941 }, { "epoch": 0.08403586243811054, "grad_norm": 141.02198791503906, "learning_rate": 3e-06, "loss": -10.7265, "step": 942 }, { "epoch": 0.0841250724831616, "grad_norm": 137.6843719482422, "learning_rate": 3e-06, "loss": -13.3029, "step": 943 }, { "epoch": 0.08421428252821268, "grad_norm": 211.74227905273438, "learning_rate": 3e-06, "loss": -8.8958, "step": 944 }, { "epoch": 0.08430349257326375, "grad_norm": 123.87110900878906, "learning_rate": 3e-06, "loss": -16.9913, "step": 945 }, { "epoch": 0.08439270261831482, "grad_norm": 206.8551025390625, "learning_rate": 3e-06, "loss": -26.9321, "step": 946 }, { "epoch": 0.0844819126633659, "grad_norm": 193.33346557617188, "learning_rate": 3e-06, "loss": -30.975, "step": 947 }, { "epoch": 0.08457112270841696, "grad_norm": 147.73297119140625, "learning_rate": 3e-06, "loss": -13.515, "step": 948 }, { "completion_length": 159.95833587646484, "epoch": 0.08466033275346804, "grad_norm": 186.3380889892578, "learning_rate": 3e-06, "loss": 9.8972, "reward": 1.8234166502952576, "reward_std": 0.42612800002098083, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09425000101327896, "step": 949, "zero_std_ratio": 0.0 }, { "epoch": 0.08474954279851911, "grad_norm": 178.0104522705078, "learning_rate": 3e-06, "loss": 10.3947, "step": 950 }, { "epoch": 0.08483875284357019, "grad_norm": 160.924560546875, "learning_rate": 3e-06, "loss": 8.1323, "step": 951 }, { "epoch": 0.08492796288862126, "grad_norm": 144.38978576660156, "learning_rate": 3e-06, "loss": 5.1531, "step": 952 }, { "epoch": 0.08501717293367234, "grad_norm": 174.2298126220703, "learning_rate": 3e-06, "loss": 5.4548, "step": 953 }, { "epoch": 0.0851063829787234, "grad_norm": 164.93479919433594, "learning_rate": 3e-06, "loss": 10.7603, "step": 954 }, { "epoch": 0.08519559302377448, "grad_norm": 198.3860626220703, "learning_rate": 3e-06, "loss": 9.1507, "step": 955 }, { "epoch": 0.08528480306882555, "grad_norm": 160.76519775390625, "learning_rate": 3e-06, "loss": 9.4591, "step": 956 }, { "epoch": 0.08537401311387663, "grad_norm": 170.39776611328125, "learning_rate": 3e-06, "loss": 7.0709, "step": 957 }, { "epoch": 0.0854632231589277, "grad_norm": 145.32798767089844, "learning_rate": 3e-06, "loss": 4.1507, "step": 958 }, { "epoch": 0.08555243320397876, "grad_norm": 170.50514221191406, "learning_rate": 3e-06, "loss": 4.4292, "step": 959 }, { "epoch": 0.08564164324902984, "grad_norm": 197.32290649414062, "learning_rate": 3e-06, "loss": 10.1245, "step": 960 }, { "completion_length": 149.68750762939453, "epoch": 0.08573085329408091, "grad_norm": 530.301025390625, "learning_rate": 3e-06, "loss": 6.2918, "reward": 2.106416702270508, "reward_std": 0.5645134299993515, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08558332687243819, "step": 961, "zero_std_ratio": 0.0 }, { "epoch": 0.08582006333913199, "grad_norm": 593.3743286132812, "learning_rate": 3e-06, "loss": -27.7628, "step": 962 }, { "epoch": 0.08590927338418305, "grad_norm": 378.6949157714844, "learning_rate": 3e-06, "loss": -25.6701, "step": 963 }, { "epoch": 0.08599848342923413, "grad_norm": 402.416748046875, "learning_rate": 3e-06, "loss": 10.4982, "step": 964 }, { "epoch": 0.0860876934742852, "grad_norm": 443.3346862792969, "learning_rate": 3e-06, "loss": -39.2448, "step": 965 }, { "epoch": 0.08617690351933628, "grad_norm": 401.20574951171875, "learning_rate": 3e-06, "loss": -68.5925, "step": 966 }, { "epoch": 0.08626611356438735, "grad_norm": 553.38720703125, "learning_rate": 3e-06, "loss": 1.8602, "step": 967 }, { "epoch": 0.08635532360943843, "grad_norm": 628.0134887695312, "learning_rate": 3e-06, "loss": -36.1923, "step": 968 }, { "epoch": 0.0864445336544895, "grad_norm": 380.9430847167969, "learning_rate": 3e-06, "loss": -32.3079, "step": 969 }, { "epoch": 0.08653374369954057, "grad_norm": 385.8163146972656, "learning_rate": 3e-06, "loss": 3.3819, "step": 970 }, { "epoch": 0.08662295374459164, "grad_norm": 432.78118896484375, "learning_rate": 3e-06, "loss": -46.8594, "step": 971 }, { "epoch": 0.08671216378964271, "grad_norm": 439.5821533203125, "learning_rate": 3e-06, "loss": -77.4002, "step": 972 }, { "completion_length": 121.68750381469727, "epoch": 0.08680137383469379, "grad_norm": 35.27005386352539, "learning_rate": 3e-06, "loss": 3.8068, "reward": 2.284437596797943, "reward_std": 0.15820645913481712, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15943749248981476, "step": 973, "zero_std_ratio": 0.0 }, { "epoch": 0.08689058387974485, "grad_norm": 68.60123443603516, "learning_rate": 3e-06, "loss": 5.8697, "step": 974 }, { "epoch": 0.08697979392479593, "grad_norm": 69.07678985595703, "learning_rate": 3e-06, "loss": 2.0005, "step": 975 }, { "epoch": 0.087069003969847, "grad_norm": 39.30900955200195, "learning_rate": 3e-06, "loss": 6.0659, "step": 976 }, { "epoch": 0.08715821401489808, "grad_norm": 77.51853942871094, "learning_rate": 3e-06, "loss": 5.9335, "step": 977 }, { "epoch": 0.08724742405994915, "grad_norm": 60.07703399658203, "learning_rate": 3e-06, "loss": 3.043, "step": 978 }, { "epoch": 0.08733663410500023, "grad_norm": 39.25843811035156, "learning_rate": 3e-06, "loss": 3.6489, "step": 979 }, { "epoch": 0.0874258441500513, "grad_norm": 46.68893051147461, "learning_rate": 3e-06, "loss": 5.0957, "step": 980 }, { "epoch": 0.08751505419510237, "grad_norm": 55.4852180480957, "learning_rate": 3e-06, "loss": 0.9956, "step": 981 }, { "epoch": 0.08760426424015344, "grad_norm": 51.21168518066406, "learning_rate": 3e-06, "loss": 5.5624, "step": 982 }, { "epoch": 0.08769347428520452, "grad_norm": 64.15937805175781, "learning_rate": 3e-06, "loss": 4.0323, "step": 983 }, { "epoch": 0.08778268433025559, "grad_norm": 65.25579833984375, "learning_rate": 3e-06, "loss": 1.7638, "step": 984 }, { "completion_length": 138.68750762939453, "epoch": 0.08787189437530665, "grad_norm": 277.1241760253906, "learning_rate": 3e-06, "loss": 10.9365, "reward": 2.176750063896179, "reward_std": 0.40325865149497986, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11425000056624413, "step": 985, "zero_std_ratio": 0.0 }, { "epoch": 0.08796110442035773, "grad_norm": 363.5508117675781, "learning_rate": 3e-06, "loss": -2.6393, "step": 986 }, { "epoch": 0.0880503144654088, "grad_norm": 337.4767150878906, "learning_rate": 3e-06, "loss": 9.4007, "step": 987 }, { "epoch": 0.08813952451045988, "grad_norm": 292.9395751953125, "learning_rate": 3e-06, "loss": -2.1309, "step": 988 }, { "epoch": 0.08822873455551095, "grad_norm": 246.76112365722656, "learning_rate": 3e-06, "loss": -9.132, "step": 989 }, { "epoch": 0.08831794460056203, "grad_norm": 267.3565368652344, "learning_rate": 3e-06, "loss": -11.7626, "step": 990 }, { "epoch": 0.08840715464561309, "grad_norm": 257.4312438964844, "learning_rate": 3e-06, "loss": 5.7132, "step": 991 }, { "epoch": 0.08849636469066417, "grad_norm": 317.8547058105469, "learning_rate": 3e-06, "loss": -9.7682, "step": 992 }, { "epoch": 0.08858557473571524, "grad_norm": 260.3039855957031, "learning_rate": 3e-06, "loss": 2.4633, "step": 993 }, { "epoch": 0.08867478478076632, "grad_norm": 261.14697265625, "learning_rate": 3e-06, "loss": -9.4669, "step": 994 }, { "epoch": 0.08876399482581739, "grad_norm": 181.8609161376953, "learning_rate": 3e-06, "loss": -13.1009, "step": 995 }, { "epoch": 0.08885320487086847, "grad_norm": 236.33563232421875, "learning_rate": 3e-06, "loss": -15.7579, "step": 996 }, { "completion_length": 168.9791717529297, "epoch": 0.08894241491591953, "grad_norm": 502.4021911621094, "learning_rate": 3e-06, "loss": -65.14, "reward": 1.9697707891464233, "reward_std": 0.505499929189682, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06352083757519722, "step": 997, "zero_std_ratio": 0.0 }, { "epoch": 0.0890316249609706, "grad_norm": 564.5300903320312, "learning_rate": 3e-06, "loss": -45.4159, "step": 998 }, { "epoch": 0.08912083500602168, "grad_norm": 453.83160400390625, "learning_rate": 3e-06, "loss": -31.2339, "step": 999 }, { "epoch": 0.08921004505107274, "grad_norm": 449.2571716308594, "learning_rate": 3e-06, "loss": -71.5707, "step": 1000 }, { "epoch": 0.08929925509612383, "grad_norm": 639.439208984375, "learning_rate": 3e-06, "loss": -2.7696, "step": 1001 }, { "epoch": 0.08938846514117489, "grad_norm": 494.1471862792969, "learning_rate": 3e-06, "loss": -8.904, "step": 1002 }, { "epoch": 0.08947767518622597, "grad_norm": 483.2057800292969, "learning_rate": 3e-06, "loss": -68.9682, "step": 1003 }, { "epoch": 0.08956688523127704, "grad_norm": 587.0855712890625, "learning_rate": 3e-06, "loss": -55.2685, "step": 1004 }, { "epoch": 0.08965609527632812, "grad_norm": 493.4284362792969, "learning_rate": 3e-06, "loss": -40.0821, "step": 1005 }, { "epoch": 0.08974530532137918, "grad_norm": 458.56134033203125, "learning_rate": 3e-06, "loss": -81.0491, "step": 1006 }, { "epoch": 0.08983451536643026, "grad_norm": 604.5118408203125, "learning_rate": 3e-06, "loss": -12.8738, "step": 1007 }, { "epoch": 0.08992372541148133, "grad_norm": 968.2217407226562, "learning_rate": 3e-06, "loss": -16.2694, "step": 1008 }, { "completion_length": 135.1666717529297, "epoch": 0.09001293545653241, "grad_norm": 199.69468688964844, "learning_rate": 3e-06, "loss": -0.1011, "reward": 1.9899166822433472, "reward_std": 0.5430259108543396, "rewards/correctness_reward_func": 1.4166666567325592, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13575000129640102, "step": 1009, "zero_std_ratio": 0.125 }, { "epoch": 0.09010214550158348, "grad_norm": 165.88571166992188, "learning_rate": 3e-06, "loss": -2.8061, "step": 1010 }, { "epoch": 0.09019135554663456, "grad_norm": 199.79214477539062, "learning_rate": 3e-06, "loss": 15.1017, "step": 1011 }, { "epoch": 0.09028056559168562, "grad_norm": 176.049072265625, "learning_rate": 3e-06, "loss": 11.1242, "step": 1012 }, { "epoch": 0.09036977563673669, "grad_norm": 195.1369171142578, "learning_rate": 3e-06, "loss": 5.1197, "step": 1013 }, { "epoch": 0.09045898568178777, "grad_norm": 233.49134826660156, "learning_rate": 3e-06, "loss": -0.4895, "step": 1014 }, { "epoch": 0.09054819572683884, "grad_norm": 211.94871520996094, "learning_rate": 3e-06, "loss": -2.5614, "step": 1015 }, { "epoch": 0.09063740577188992, "grad_norm": 156.52188110351562, "learning_rate": 3e-06, "loss": -2.6152, "step": 1016 }, { "epoch": 0.09072661581694098, "grad_norm": 162.7987823486328, "learning_rate": 3e-06, "loss": 13.4839, "step": 1017 }, { "epoch": 0.09081582586199206, "grad_norm": 194.3466033935547, "learning_rate": 3e-06, "loss": 9.1157, "step": 1018 }, { "epoch": 0.09090503590704313, "grad_norm": 219.33090209960938, "learning_rate": 3e-06, "loss": 1.8747, "step": 1019 }, { "epoch": 0.09099424595209421, "grad_norm": 237.62643432617188, "learning_rate": 3e-06, "loss": -3.0257, "step": 1020 }, { "completion_length": 145.125, "epoch": 0.09108345599714528, "grad_norm": 347.146484375, "learning_rate": 3e-06, "loss": -57.3183, "reward": 2.3361042737960815, "reward_std": 0.1686728447675705, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12777083739638329, "step": 1021, "zero_std_ratio": 0.0 }, { "epoch": 0.09117266604219636, "grad_norm": 307.16357421875, "learning_rate": 3e-06, "loss": -59.7916, "step": 1022 }, { "epoch": 0.09126187608724742, "grad_norm": 239.71444702148438, "learning_rate": 3e-06, "loss": -68.856, "step": 1023 }, { "epoch": 0.0913510861322985, "grad_norm": 293.3377380371094, "learning_rate": 3e-06, "loss": -52.4881, "step": 1024 }, { "epoch": 0.09144029617734957, "grad_norm": 355.85980224609375, "learning_rate": 3e-06, "loss": -60.7326, "step": 1025 }, { "epoch": 0.09152950622240064, "grad_norm": 443.2692565917969, "learning_rate": 3e-06, "loss": -50.5898, "step": 1026 }, { "epoch": 0.09161871626745172, "grad_norm": 392.3480224609375, "learning_rate": 3e-06, "loss": -63.6762, "step": 1027 }, { "epoch": 0.09170792631250278, "grad_norm": 337.5072021484375, "learning_rate": 3e-06, "loss": -65.8663, "step": 1028 }, { "epoch": 0.09179713635755386, "grad_norm": 255.06619262695312, "learning_rate": 3e-06, "loss": -74.5813, "step": 1029 }, { "epoch": 0.09188634640260493, "grad_norm": 303.1604919433594, "learning_rate": 3e-06, "loss": -60.9415, "step": 1030 }, { "epoch": 0.09197555644765601, "grad_norm": 339.84039306640625, "learning_rate": 3e-06, "loss": -68.1991, "step": 1031 }, { "epoch": 0.09206476649270708, "grad_norm": 868.9529418945312, "learning_rate": 3e-06, "loss": -65.8396, "step": 1032 }, { "completion_length": 140.9375, "epoch": 0.09215397653775816, "grad_norm": 402.7937316894531, "learning_rate": 3e-06, "loss": -15.3165, "reward": 2.3517916202545166, "reward_std": 0.60741326212883, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13304166495800018, "step": 1033, "zero_std_ratio": 0.0 }, { "epoch": 0.09224318658280922, "grad_norm": 365.8189697265625, "learning_rate": 3e-06, "loss": 13.667, "step": 1034 }, { "epoch": 0.0923323966278603, "grad_norm": 427.4906921386719, "learning_rate": 3e-06, "loss": -26.0149, "step": 1035 }, { "epoch": 0.09242160667291137, "grad_norm": 378.3236999511719, "learning_rate": 3e-06, "loss": -5.2592, "step": 1036 }, { "epoch": 0.09251081671796245, "grad_norm": 524.1071166992188, "learning_rate": 3e-06, "loss": 20.8697, "step": 1037 }, { "epoch": 0.09260002676301352, "grad_norm": 393.7771301269531, "learning_rate": 3e-06, "loss": -12.4555, "step": 1038 }, { "epoch": 0.09268923680806458, "grad_norm": 588.3152465820312, "learning_rate": 3e-06, "loss": -15.8752, "step": 1039 }, { "epoch": 0.09277844685311566, "grad_norm": 354.5204162597656, "learning_rate": 3e-06, "loss": 13.4668, "step": 1040 }, { "epoch": 0.09286765689816673, "grad_norm": 511.9700927734375, "learning_rate": 3e-06, "loss": -31.7805, "step": 1041 }, { "epoch": 0.09295686694321781, "grad_norm": 518.6276245117188, "learning_rate": 3e-06, "loss": -7.4808, "step": 1042 }, { "epoch": 0.09304607698826887, "grad_norm": 377.42205810546875, "learning_rate": 3e-06, "loss": 20.1054, "step": 1043 }, { "epoch": 0.09313528703331996, "grad_norm": 447.12945556640625, "learning_rate": 3e-06, "loss": -15.5087, "step": 1044 }, { "completion_length": 138.9166717529297, "epoch": 0.09322449707837102, "grad_norm": 993.244873046875, "learning_rate": 3e-06, "loss": 45.6857, "reward": 2.4459375143051147, "reward_std": 0.3905292749404907, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13343750312924385, "step": 1045, "zero_std_ratio": 0.0 }, { "epoch": 0.0933137071234221, "grad_norm": 672.6893310546875, "learning_rate": 3e-06, "loss": 38.2007, "step": 1046 }, { "epoch": 0.09340291716847317, "grad_norm": 513.650390625, "learning_rate": 3e-06, "loss": 54.7761, "step": 1047 }, { "epoch": 0.09349212721352425, "grad_norm": 527.8473510742188, "learning_rate": 3e-06, "loss": 32.3591, "step": 1048 }, { "epoch": 0.09358133725857531, "grad_norm": 506.8796081542969, "learning_rate": 3e-06, "loss": 46.2954, "step": 1049 }, { "epoch": 0.0936705473036264, "grad_norm": 496.89971923828125, "learning_rate": 3e-06, "loss": 49.5723, "step": 1050 }, { "epoch": 0.09375975734867746, "grad_norm": 896.3591918945312, "learning_rate": 3e-06, "loss": 37.4608, "step": 1051 }, { "epoch": 0.09384896739372853, "grad_norm": 463.4916076660156, "learning_rate": 3e-06, "loss": 32.8312, "step": 1052 }, { "epoch": 0.09393817743877961, "grad_norm": 536.8302612304688, "learning_rate": 3e-06, "loss": 51.5421, "step": 1053 }, { "epoch": 0.09402738748383067, "grad_norm": 560.4818115234375, "learning_rate": 3e-06, "loss": 25.2527, "step": 1054 }, { "epoch": 0.09411659752888175, "grad_norm": 527.0779418945312, "learning_rate": 3e-06, "loss": 38.5143, "step": 1055 }, { "epoch": 0.09420580757393282, "grad_norm": 404.8234558105469, "learning_rate": 3e-06, "loss": 44.4837, "step": 1056 }, { "completion_length": 153.75, "epoch": 0.0942950176189839, "grad_norm": 320.6294250488281, "learning_rate": 3e-06, "loss": -28.9113, "reward": 1.784208357334137, "reward_std": 0.5137874186038971, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06545832939445972, "step": 1057, "zero_std_ratio": 0.0 }, { "epoch": 0.09438422766403497, "grad_norm": 405.81268310546875, "learning_rate": 3e-06, "loss": -18.9334, "step": 1058 }, { "epoch": 0.09447343770908605, "grad_norm": 371.4535217285156, "learning_rate": 3e-06, "loss": -41.9034, "step": 1059 }, { "epoch": 0.09456264775413711, "grad_norm": 301.6942443847656, "learning_rate": 3e-06, "loss": -23.3866, "step": 1060 }, { "epoch": 0.0946518577991882, "grad_norm": 365.9558410644531, "learning_rate": 3e-06, "loss": -14.9156, "step": 1061 }, { "epoch": 0.09474106784423926, "grad_norm": 377.2384948730469, "learning_rate": 3e-06, "loss": -36.7945, "step": 1062 }, { "epoch": 0.09483027788929034, "grad_norm": 319.00262451171875, "learning_rate": 3e-06, "loss": -31.6703, "step": 1063 }, { "epoch": 0.0949194879343414, "grad_norm": 374.5344543457031, "learning_rate": 3e-06, "loss": -20.8262, "step": 1064 }, { "epoch": 0.09500869797939247, "grad_norm": 309.783447265625, "learning_rate": 3e-06, "loss": -44.4576, "step": 1065 }, { "epoch": 0.09509790802444355, "grad_norm": 300.767578125, "learning_rate": 3e-06, "loss": -27.8658, "step": 1066 }, { "epoch": 0.09518711806949462, "grad_norm": 382.4290466308594, "learning_rate": 3e-06, "loss": -21.0255, "step": 1067 }, { "epoch": 0.0952763281145457, "grad_norm": 380.5618591308594, "learning_rate": 3e-06, "loss": -40.5874, "step": 1068 }, { "completion_length": 133.43750762939453, "epoch": 0.09536553815959677, "grad_norm": 338.35491943359375, "learning_rate": 3e-06, "loss": -59.0288, "reward": 1.9337083101272583, "reward_std": 0.28059020824730396, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1316249892115593, "step": 1069, "zero_std_ratio": 0.125 }, { "epoch": 0.09545474820464785, "grad_norm": 294.65740966796875, "learning_rate": 3e-06, "loss": -50.4247, "step": 1070 }, { "epoch": 0.09554395824969891, "grad_norm": 245.4026641845703, "learning_rate": 3e-06, "loss": -41.1492, "step": 1071 }, { "epoch": 0.09563316829474999, "grad_norm": 274.2242126464844, "learning_rate": 3e-06, "loss": -37.5643, "step": 1072 }, { "epoch": 0.09572237833980106, "grad_norm": 272.3935546875, "learning_rate": 3e-06, "loss": -48.5536, "step": 1073 }, { "epoch": 0.09581158838485214, "grad_norm": 411.1688232421875, "learning_rate": 3e-06, "loss": -63.078, "step": 1074 }, { "epoch": 0.0959007984299032, "grad_norm": 350.3167724609375, "learning_rate": 3e-06, "loss": -68.731, "step": 1075 }, { "epoch": 0.09599000847495429, "grad_norm": 308.54302978515625, "learning_rate": 3e-06, "loss": -59.1035, "step": 1076 }, { "epoch": 0.09607921852000535, "grad_norm": 272.44537353515625, "learning_rate": 3e-06, "loss": -49.064, "step": 1077 }, { "epoch": 0.09616842856505642, "grad_norm": 349.4591064453125, "learning_rate": 3e-06, "loss": -45.6581, "step": 1078 }, { "epoch": 0.0962576386101075, "grad_norm": 404.52557373046875, "learning_rate": 3e-06, "loss": -57.2088, "step": 1079 }, { "epoch": 0.09634684865515857, "grad_norm": 418.94580078125, "learning_rate": 3e-06, "loss": -74.9134, "step": 1080 }, { "completion_length": 141.1458396911621, "epoch": 0.09643605870020965, "grad_norm": 733.7314453125, "learning_rate": 3e-06, "loss": -79.7336, "reward": 1.6318541765213013, "reward_std": 0.570192813873291, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11102083884179592, "step": 1081, "zero_std_ratio": 0.0 }, { "epoch": 0.09652526874526071, "grad_norm": 607.6722412109375, "learning_rate": 3e-06, "loss": -140.3347, "step": 1082 }, { "epoch": 0.09661447879031179, "grad_norm": 667.3759155273438, "learning_rate": 3e-06, "loss": -95.2953, "step": 1083 }, { "epoch": 0.09670368883536286, "grad_norm": 1115.2962646484375, "learning_rate": 3e-06, "loss": -92.1048, "step": 1084 }, { "epoch": 0.09679289888041394, "grad_norm": 624.0264282226562, "learning_rate": 3e-06, "loss": -155.7364, "step": 1085 }, { "epoch": 0.096882108925465, "grad_norm": 1003.4823608398438, "learning_rate": 3e-06, "loss": -133.9341, "step": 1086 }, { "epoch": 0.09697131897051608, "grad_norm": 735.7012329101562, "learning_rate": 3e-06, "loss": -100.9845, "step": 1087 }, { "epoch": 0.09706052901556715, "grad_norm": 606.860107421875, "learning_rate": 3e-06, "loss": -162.1261, "step": 1088 }, { "epoch": 0.09714973906061823, "grad_norm": 645.7361450195312, "learning_rate": 3e-06, "loss": -116.5875, "step": 1089 }, { "epoch": 0.0972389491056693, "grad_norm": 852.1995239257812, "learning_rate": 3e-06, "loss": -110.4707, "step": 1090 }, { "epoch": 0.09732815915072038, "grad_norm": 728.5518188476562, "learning_rate": 3e-06, "loss": -176.5155, "step": 1091 }, { "epoch": 0.09741736919577144, "grad_norm": 802.650634765625, "learning_rate": 3e-06, "loss": -157.6771, "step": 1092 }, { "completion_length": 134.12500381469727, "epoch": 0.09750657924082251, "grad_norm": 373.08319091796875, "learning_rate": 3e-06, "loss": 6.5518, "reward": 2.1240209341049194, "reward_std": 0.493463397026062, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15527082234621048, "step": 1093, "zero_std_ratio": 0.0 }, { "epoch": 0.09759578928587359, "grad_norm": 483.0643005371094, "learning_rate": 3e-06, "loss": 9.2641, "step": 1094 }, { "epoch": 0.09768499933092466, "grad_norm": 392.19927978515625, "learning_rate": 3e-06, "loss": -14.7997, "step": 1095 }, { "epoch": 0.09777420937597574, "grad_norm": 344.8969421386719, "learning_rate": 3e-06, "loss": 3.3186, "step": 1096 }, { "epoch": 0.0978634194210268, "grad_norm": 425.2146911621094, "learning_rate": 3e-06, "loss": -9.1396, "step": 1097 }, { "epoch": 0.09795262946607788, "grad_norm": 405.4781188964844, "learning_rate": 3e-06, "loss": -33.2548, "step": 1098 }, { "epoch": 0.09804183951112895, "grad_norm": 381.5223083496094, "learning_rate": 3e-06, "loss": 6.69, "step": 1099 }, { "epoch": 0.09813104955618003, "grad_norm": 597.418701171875, "learning_rate": 3e-06, "loss": 8.1762, "step": 1100 }, { "epoch": 0.0982202596012311, "grad_norm": 354.68548583984375, "learning_rate": 3e-06, "loss": -21.8024, "step": 1101 }, { "epoch": 0.09830946964628218, "grad_norm": 368.8730773925781, "learning_rate": 3e-06, "loss": 2.2751, "step": 1102 }, { "epoch": 0.09839867969133324, "grad_norm": 407.3045959472656, "learning_rate": 3e-06, "loss": -13.4736, "step": 1103 }, { "epoch": 0.09848788973638432, "grad_norm": 474.1021728515625, "learning_rate": 3e-06, "loss": -40.1604, "step": 1104 }, { "completion_length": 144.56250762939453, "epoch": 0.09857709978143539, "grad_norm": 509.6014099121094, "learning_rate": 3e-06, "loss": -99.5888, "reward": 1.8781040906906128, "reward_std": 0.4346280097961426, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12810416892170906, "step": 1105, "zero_std_ratio": 0.0 }, { "epoch": 0.09866630982648646, "grad_norm": 533.192626953125, "learning_rate": 3e-06, "loss": -97.968, "step": 1106 }, { "epoch": 0.09875551987153754, "grad_norm": 571.8197631835938, "learning_rate": 3e-06, "loss": -106.1057, "step": 1107 }, { "epoch": 0.0988447299165886, "grad_norm": 525.690673828125, "learning_rate": 3e-06, "loss": -75.4768, "step": 1108 }, { "epoch": 0.09893393996163968, "grad_norm": 478.89398193359375, "learning_rate": 3e-06, "loss": -110.4997, "step": 1109 }, { "epoch": 0.09902315000669075, "grad_norm": 812.3494262695312, "learning_rate": 3e-06, "loss": -112.8618, "step": 1110 }, { "epoch": 0.09911236005174183, "grad_norm": 603.4788818359375, "learning_rate": 3e-06, "loss": -106.561, "step": 1111 }, { "epoch": 0.0992015700967929, "grad_norm": 672.5433959960938, "learning_rate": 3e-06, "loss": -104.4456, "step": 1112 }, { "epoch": 0.09929078014184398, "grad_norm": 582.0712890625, "learning_rate": 3e-06, "loss": -113.8237, "step": 1113 }, { "epoch": 0.09937999018689504, "grad_norm": 536.4866333007812, "learning_rate": 3e-06, "loss": -83.1406, "step": 1114 }, { "epoch": 0.09946920023194612, "grad_norm": 531.5173950195312, "learning_rate": 3e-06, "loss": -121.5239, "step": 1115 }, { "epoch": 0.09955841027699719, "grad_norm": 898.2598266601562, "learning_rate": 3e-06, "loss": -127.2398, "step": 1116 }, { "completion_length": 113.00000381469727, "epoch": 0.09964762032204827, "grad_norm": 492.4721984863281, "learning_rate": 3e-06, "loss": -70.4249, "reward": 2.4547500610351562, "reward_std": 0.45376959443092346, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16308333724737167, "step": 1117, "zero_std_ratio": 0.0 }, { "epoch": 0.09973683036709934, "grad_norm": 393.4570617675781, "learning_rate": 3e-06, "loss": -52.5425, "step": 1118 }, { "epoch": 0.0998260404121504, "grad_norm": 808.8583984375, "learning_rate": 3e-06, "loss": -32.6061, "step": 1119 }, { "epoch": 0.09991525045720148, "grad_norm": 576.81640625, "learning_rate": 3e-06, "loss": -55.5892, "step": 1120 }, { "epoch": 0.10000446050225255, "grad_norm": 898.8334350585938, "learning_rate": 3e-06, "loss": -54.6342, "step": 1121 }, { "epoch": 0.10009367054730363, "grad_norm": 438.3951110839844, "learning_rate": 3e-06, "loss": -43.4653, "step": 1122 }, { "epoch": 0.1001828805923547, "grad_norm": 708.05908203125, "learning_rate": 3e-06, "loss": -77.0374, "step": 1123 }, { "epoch": 0.10027209063740578, "grad_norm": 425.1826477050781, "learning_rate": 3e-06, "loss": -59.1794, "step": 1124 }, { "epoch": 0.10036130068245684, "grad_norm": 833.095947265625, "learning_rate": 3e-06, "loss": -43.4435, "step": 1125 }, { "epoch": 0.10045051072750792, "grad_norm": 588.0106811523438, "learning_rate": 3e-06, "loss": -62.997, "step": 1126 }, { "epoch": 0.10053972077255899, "grad_norm": 713.5040283203125, "learning_rate": 3e-06, "loss": -71.8443, "step": 1127 }, { "epoch": 0.10062893081761007, "grad_norm": 584.412353515625, "learning_rate": 3e-06, "loss": -53.4193, "step": 1128 }, { "completion_length": 136.2291717529297, "epoch": 0.10071814086266113, "grad_norm": 665.6400146484375, "learning_rate": 3e-06, "loss": 41.7356, "reward": 2.1188125014305115, "reward_std": 0.2311352714896202, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11881250143051147, "step": 1129, "zero_std_ratio": 0.0 }, { "epoch": 0.10080735090771221, "grad_norm": 394.85174560546875, "learning_rate": 3e-06, "loss": 37.6043, "step": 1130 }, { "epoch": 0.10089656095276328, "grad_norm": 596.6718139648438, "learning_rate": 3e-06, "loss": 20.8374, "step": 1131 }, { "epoch": 0.10098577099781435, "grad_norm": 396.1944274902344, "learning_rate": 3e-06, "loss": 40.8713, "step": 1132 }, { "epoch": 0.10107498104286543, "grad_norm": 1378.33544921875, "learning_rate": 3e-06, "loss": -18.756, "step": 1133 }, { "epoch": 0.1011641910879165, "grad_norm": 572.9339599609375, "learning_rate": 3e-06, "loss": 40.0698, "step": 1134 }, { "epoch": 0.10125340113296757, "grad_norm": 500.0650939941406, "learning_rate": 3e-06, "loss": 33.2779, "step": 1135 }, { "epoch": 0.10134261117801864, "grad_norm": 423.95916748046875, "learning_rate": 3e-06, "loss": 37.4019, "step": 1136 }, { "epoch": 0.10143182122306972, "grad_norm": 354.7851867675781, "learning_rate": 3e-06, "loss": 14.2, "step": 1137 }, { "epoch": 0.10152103126812079, "grad_norm": 435.8161315917969, "learning_rate": 3e-06, "loss": 35.5527, "step": 1138 }, { "epoch": 0.10161024131317187, "grad_norm": 1665.0882568359375, "learning_rate": 3e-06, "loss": -32.409, "step": 1139 }, { "epoch": 0.10169945135822293, "grad_norm": 427.7786865234375, "learning_rate": 3e-06, "loss": 38.618, "step": 1140 }, { "completion_length": 152.77084350585938, "epoch": 0.10178866140327401, "grad_norm": 739.2212524414062, "learning_rate": 3e-06, "loss": -77.8406, "reward": 2.052833318710327, "reward_std": 0.4878626614809036, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06325000338256359, "step": 1141, "zero_std_ratio": 0.0 }, { "epoch": 0.10187787144832508, "grad_norm": 798.5391845703125, "learning_rate": 3e-06, "loss": -79.5304, "step": 1142 }, { "epoch": 0.10196708149337616, "grad_norm": 723.3178100585938, "learning_rate": 3e-06, "loss": -79.9193, "step": 1143 }, { "epoch": 0.10205629153842723, "grad_norm": 678.71484375, "learning_rate": 3e-06, "loss": -48.8422, "step": 1144 }, { "epoch": 0.1021455015834783, "grad_norm": 898.7178955078125, "learning_rate": 3e-06, "loss": -78.0983, "step": 1145 }, { "epoch": 0.10223471162852937, "grad_norm": 719.983154296875, "learning_rate": 3e-06, "loss": -108.0717, "step": 1146 }, { "epoch": 0.10232392167358044, "grad_norm": 728.4072265625, "learning_rate": 3e-06, "loss": -84.2578, "step": 1147 }, { "epoch": 0.10241313171863152, "grad_norm": 851.9353637695312, "learning_rate": 3e-06, "loss": -89.1878, "step": 1148 }, { "epoch": 0.10250234176368259, "grad_norm": 711.1327514648438, "learning_rate": 3e-06, "loss": -87.8992, "step": 1149 }, { "epoch": 0.10259155180873367, "grad_norm": 785.3473510742188, "learning_rate": 3e-06, "loss": -57.3662, "step": 1150 }, { "epoch": 0.10268076185378473, "grad_norm": 926.0802612304688, "learning_rate": 3e-06, "loss": -92.5817, "step": 1151 }, { "epoch": 0.10276997189883581, "grad_norm": 769.6296997070312, "learning_rate": 3e-06, "loss": -116.0675, "step": 1152 }, { "completion_length": 117.62500381469727, "epoch": 0.10285918194388688, "grad_norm": 841.6162109375, "learning_rate": 3e-06, "loss": 84.3578, "reward": 2.0038751363754272, "reward_std": 0.5935890823602676, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1809583306312561, "step": 1153, "zero_std_ratio": 0.0 }, { "epoch": 0.10294839198893796, "grad_norm": 819.4031982421875, "learning_rate": 3e-06, "loss": 47.4932, "step": 1154 }, { "epoch": 0.10303760203398903, "grad_norm": 983.420654296875, "learning_rate": 3e-06, "loss": 114.4865, "step": 1155 }, { "epoch": 0.1031268120790401, "grad_norm": 818.0223388671875, "learning_rate": 3e-06, "loss": 49.3857, "step": 1156 }, { "epoch": 0.10321602212409117, "grad_norm": 825.228759765625, "learning_rate": 3e-06, "loss": 135.8829, "step": 1157 }, { "epoch": 0.10330523216914224, "grad_norm": 814.0230102539062, "learning_rate": 3e-06, "loss": 58.7408, "step": 1158 }, { "epoch": 0.10339444221419332, "grad_norm": 933.5665283203125, "learning_rate": 3e-06, "loss": 79.1231, "step": 1159 }, { "epoch": 0.10348365225924439, "grad_norm": 787.4599609375, "learning_rate": 3e-06, "loss": 38.9872, "step": 1160 }, { "epoch": 0.10357286230429547, "grad_norm": 807.5901489257812, "learning_rate": 3e-06, "loss": 103.1935, "step": 1161 }, { "epoch": 0.10366207234934653, "grad_norm": 903.9038696289062, "learning_rate": 3e-06, "loss": 35.8445, "step": 1162 }, { "epoch": 0.10375128239439761, "grad_norm": 849.3576049804688, "learning_rate": 3e-06, "loss": 121.1534, "step": 1163 }, { "epoch": 0.10384049243944868, "grad_norm": 772.4011840820312, "learning_rate": 3e-06, "loss": 43.3309, "step": 1164 }, { "completion_length": 129.95834350585938, "epoch": 0.10392970248449976, "grad_norm": 639.35693359375, "learning_rate": 3e-06, "loss": -5.3389, "reward": 2.4029585123062134, "reward_std": 0.3480468839406967, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15295833349227905, "step": 1165, "zero_std_ratio": 0.0 }, { "epoch": 0.10401891252955082, "grad_norm": 668.5325927734375, "learning_rate": 3e-06, "loss": -32.1924, "step": 1166 }, { "epoch": 0.1041081225746019, "grad_norm": 512.8074340820312, "learning_rate": 3e-06, "loss": -10.3926, "step": 1167 }, { "epoch": 0.10419733261965297, "grad_norm": 600.4286499023438, "learning_rate": 3e-06, "loss": -19.6917, "step": 1168 }, { "epoch": 0.10428654266470405, "grad_norm": 520.0072631835938, "learning_rate": 3e-06, "loss": 10.3206, "step": 1169 }, { "epoch": 0.10437575270975512, "grad_norm": 477.1594543457031, "learning_rate": 3e-06, "loss": 7.4605, "step": 1170 }, { "epoch": 0.10446496275480618, "grad_norm": 677.6570434570312, "learning_rate": 3e-06, "loss": -7.1203, "step": 1171 }, { "epoch": 0.10455417279985726, "grad_norm": 595.1732177734375, "learning_rate": 3e-06, "loss": -36.923, "step": 1172 }, { "epoch": 0.10464338284490833, "grad_norm": 493.15252685546875, "learning_rate": 3e-06, "loss": -13.682, "step": 1173 }, { "epoch": 0.10473259288995941, "grad_norm": 761.3983154296875, "learning_rate": 3e-06, "loss": -24.4599, "step": 1174 }, { "epoch": 0.10482180293501048, "grad_norm": 635.04150390625, "learning_rate": 3e-06, "loss": 1.3747, "step": 1175 }, { "epoch": 0.10491101298006156, "grad_norm": 515.3302001953125, "learning_rate": 3e-06, "loss": 1.0323, "step": 1176 }, { "completion_length": 112.1875, "epoch": 0.10500022302511262, "grad_norm": 469.7004699707031, "learning_rate": 3e-06, "loss": -0.444, "reward": 2.1683751344680786, "reward_std": 0.5464861989021301, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22045832872390747, "step": 1177, "zero_std_ratio": 0.0 }, { "epoch": 0.1050894330701637, "grad_norm": 548.61572265625, "learning_rate": 3e-06, "loss": 23.8497, "step": 1178 }, { "epoch": 0.10517864311521477, "grad_norm": 470.7399597167969, "learning_rate": 3e-06, "loss": -7.3893, "step": 1179 }, { "epoch": 0.10526785316026585, "grad_norm": 548.130859375, "learning_rate": 3e-06, "loss": -8.4577, "step": 1180 }, { "epoch": 0.10535706320531692, "grad_norm": 747.939453125, "learning_rate": 3e-06, "loss": 25.5028, "step": 1181 }, { "epoch": 0.105446273250368, "grad_norm": 383.6826171875, "learning_rate": 3e-06, "loss": -1.7932, "step": 1182 }, { "epoch": 0.10553548329541906, "grad_norm": 433.5135498046875, "learning_rate": 3e-06, "loss": -3.3989, "step": 1183 }, { "epoch": 0.10562469334047014, "grad_norm": 548.0432739257812, "learning_rate": 3e-06, "loss": 17.3918, "step": 1184 }, { "epoch": 0.10571390338552121, "grad_norm": 412.3025207519531, "learning_rate": 3e-06, "loss": -13.2844, "step": 1185 }, { "epoch": 0.10580311343057228, "grad_norm": 390.11663818359375, "learning_rate": 3e-06, "loss": -16.4993, "step": 1186 }, { "epoch": 0.10589232347562336, "grad_norm": 766.78857421875, "learning_rate": 3e-06, "loss": 14.6288, "step": 1187 }, { "epoch": 0.10598153352067442, "grad_norm": 359.71173095703125, "learning_rate": 3e-06, "loss": -8.8216, "step": 1188 }, { "completion_length": 139.31250381469727, "epoch": 0.1060707435657255, "grad_norm": 1056.218994140625, "learning_rate": 3e-06, "loss": -19.8215, "reward": 2.124916732311249, "reward_std": 0.4207773655653, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1353333331644535, "step": 1189, "zero_std_ratio": 0.0 }, { "epoch": 0.10615995361077657, "grad_norm": 1287.11962890625, "learning_rate": 3e-06, "loss": -46.0924, "step": 1190 }, { "epoch": 0.10624916365582765, "grad_norm": 951.5126953125, "learning_rate": 3e-06, "loss": 40.7922, "step": 1191 }, { "epoch": 0.10633837370087872, "grad_norm": 918.715087890625, "learning_rate": 3e-06, "loss": -68.4776, "step": 1192 }, { "epoch": 0.1064275837459298, "grad_norm": 1028.908447265625, "learning_rate": 3e-06, "loss": -24.5844, "step": 1193 }, { "epoch": 0.10651679379098086, "grad_norm": 998.2527465820312, "learning_rate": 3e-06, "loss": -34.4867, "step": 1194 }, { "epoch": 0.10660600383603194, "grad_norm": 1073.2415771484375, "learning_rate": 3e-06, "loss": -28.3823, "step": 1195 }, { "epoch": 0.10669521388108301, "grad_norm": 980.68701171875, "learning_rate": 3e-06, "loss": -57.3113, "step": 1196 }, { "epoch": 0.10678442392613409, "grad_norm": 914.2239990234375, "learning_rate": 3e-06, "loss": 36.5887, "step": 1197 }, { "epoch": 0.10687363397118516, "grad_norm": 907.9400634765625, "learning_rate": 3e-06, "loss": -67.0974, "step": 1198 }, { "epoch": 0.10696284401623622, "grad_norm": 944.126220703125, "learning_rate": 3e-06, "loss": -33.4221, "step": 1199 }, { "epoch": 0.1070520540612873, "grad_norm": 1056.388671875, "learning_rate": 3e-06, "loss": -47.9315, "step": 1200 }, { "completion_length": 123.27083587646484, "epoch": 0.10714126410633837, "grad_norm": 662.4260864257812, "learning_rate": 3e-06, "loss": 8.406, "reward": 2.4690834283828735, "reward_std": 0.40174539387226105, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17741665989160538, "step": 1201, "zero_std_ratio": 0.0 }, { "epoch": 0.10723047415138945, "grad_norm": 471.24163818359375, "learning_rate": 3e-06, "loss": -1.3977, "step": 1202 }, { "epoch": 0.10731968419644052, "grad_norm": 460.9804382324219, "learning_rate": 3e-06, "loss": -2.6373, "step": 1203 }, { "epoch": 0.1074088942414916, "grad_norm": 426.29827880859375, "learning_rate": 3e-06, "loss": 13.5829, "step": 1204 }, { "epoch": 0.10749810428654266, "grad_norm": 461.0411071777344, "learning_rate": 3e-06, "loss": -8.1952, "step": 1205 }, { "epoch": 0.10758731433159374, "grad_norm": 441.34466552734375, "learning_rate": 3e-06, "loss": -2.8955, "step": 1206 }, { "epoch": 0.10767652437664481, "grad_norm": 569.8471069335938, "learning_rate": 3e-06, "loss": 3.5701, "step": 1207 }, { "epoch": 0.10776573442169589, "grad_norm": 437.7737121582031, "learning_rate": 3e-06, "loss": -7.5943, "step": 1208 }, { "epoch": 0.10785494446674695, "grad_norm": 465.7676696777344, "learning_rate": 3e-06, "loss": -8.1374, "step": 1209 }, { "epoch": 0.10794415451179804, "grad_norm": 412.93511962890625, "learning_rate": 3e-06, "loss": 7.502, "step": 1210 }, { "epoch": 0.1080333645568491, "grad_norm": 481.81951904296875, "learning_rate": 3e-06, "loss": -8.5634, "step": 1211 }, { "epoch": 0.10812257460190017, "grad_norm": 459.6242980957031, "learning_rate": 3e-06, "loss": -3.8228, "step": 1212 }, { "completion_length": 137.0833396911621, "epoch": 0.10821178464695125, "grad_norm": 554.8159790039062, "learning_rate": 3e-06, "loss": 62.0314, "reward": 1.7515416145324707, "reward_std": 0.3914555162191391, "rewards/correctness_reward_func": 1.1249999701976776, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13695833832025528, "step": 1213, "zero_std_ratio": 0.0 }, { "epoch": 0.10830099469200231, "grad_norm": 598.4298706054688, "learning_rate": 3e-06, "loss": 49.5125, "step": 1214 }, { "epoch": 0.1083902047370534, "grad_norm": 621.552490234375, "learning_rate": 3e-06, "loss": 63.9503, "step": 1215 }, { "epoch": 0.10847941478210446, "grad_norm": 622.46875, "learning_rate": 3e-06, "loss": 76.0874, "step": 1216 }, { "epoch": 0.10856862482715554, "grad_norm": 547.4149780273438, "learning_rate": 3e-06, "loss": 76.761, "step": 1217 }, { "epoch": 0.10865783487220661, "grad_norm": 464.8030700683594, "learning_rate": 3e-06, "loss": 56.6468, "step": 1218 }, { "epoch": 0.10874704491725769, "grad_norm": 483.95391845703125, "learning_rate": 3e-06, "loss": 48.4273, "step": 1219 }, { "epoch": 0.10883625496230875, "grad_norm": 417.3274841308594, "learning_rate": 3e-06, "loss": 40.6516, "step": 1220 }, { "epoch": 0.10892546500735983, "grad_norm": 611.0287475585938, "learning_rate": 3e-06, "loss": 51.9757, "step": 1221 }, { "epoch": 0.1090146750524109, "grad_norm": 534.5418090820312, "learning_rate": 3e-06, "loss": 55.8806, "step": 1222 }, { "epoch": 0.10910388509746198, "grad_norm": 455.1130676269531, "learning_rate": 3e-06, "loss": 57.7598, "step": 1223 }, { "epoch": 0.10919309514251305, "grad_norm": 375.9675598144531, "learning_rate": 3e-06, "loss": 39.7899, "step": 1224 }, { "completion_length": 130.31250762939453, "epoch": 0.10928230518756411, "grad_norm": 515.6392211914062, "learning_rate": 3e-06, "loss": 2.711, "reward": 2.421500086784363, "reward_std": 0.37992818653583527, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12983334064483643, "step": 1225, "zero_std_ratio": 0.125 }, { "epoch": 0.1093715152326152, "grad_norm": 517.3201904296875, "learning_rate": 3e-06, "loss": 2.5469, "step": 1226 }, { "epoch": 0.10946072527766626, "grad_norm": 476.81585693359375, "learning_rate": 3e-06, "loss": -3.3801, "step": 1227 }, { "epoch": 0.10954993532271734, "grad_norm": 389.1577453613281, "learning_rate": 3e-06, "loss": 2.5568, "step": 1228 }, { "epoch": 0.1096391453677684, "grad_norm": 368.0989685058594, "learning_rate": 3e-06, "loss": 13.2772, "step": 1229 }, { "epoch": 0.10972835541281949, "grad_norm": 471.196044921875, "learning_rate": 3e-06, "loss": -11.5976, "step": 1230 }, { "epoch": 0.10981756545787055, "grad_norm": 1111.9873046875, "learning_rate": 3e-06, "loss": -0.3085, "step": 1231 }, { "epoch": 0.10990677550292163, "grad_norm": 546.0790405273438, "learning_rate": 3e-06, "loss": 1.0671, "step": 1232 }, { "epoch": 0.1099959855479727, "grad_norm": 400.2040710449219, "learning_rate": 3e-06, "loss": -7.1922, "step": 1233 }, { "epoch": 0.11008519559302378, "grad_norm": 368.6521301269531, "learning_rate": 3e-06, "loss": -0.5552, "step": 1234 }, { "epoch": 0.11017440563807485, "grad_norm": 350.799560546875, "learning_rate": 3e-06, "loss": 10.4996, "step": 1235 }, { "epoch": 0.11026361568312593, "grad_norm": 538.6735229492188, "learning_rate": 3e-06, "loss": -15.7319, "step": 1236 }, { "completion_length": 113.14583587646484, "epoch": 0.11035282572817699, "grad_norm": 272.7051696777344, "learning_rate": 3e-06, "loss": 5.5426, "reward": 2.4681875705718994, "reward_std": 0.30294275283813477, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18693750351667404, "step": 1237, "zero_std_ratio": 0.0 }, { "epoch": 0.11044203577322806, "grad_norm": 704.2574462890625, "learning_rate": 3e-06, "loss": -12.312, "step": 1238 }, { "epoch": 0.11053124581827914, "grad_norm": 246.17315673828125, "learning_rate": 3e-06, "loss": 13.9124, "step": 1239 }, { "epoch": 0.1106204558633302, "grad_norm": 290.6758117675781, "learning_rate": 3e-06, "loss": -2.8205, "step": 1240 }, { "epoch": 0.11070966590838129, "grad_norm": 250.3959197998047, "learning_rate": 3e-06, "loss": -0.9022, "step": 1241 }, { "epoch": 0.11079887595343235, "grad_norm": 219.3252410888672, "learning_rate": 3e-06, "loss": -11.9642, "step": 1242 }, { "epoch": 0.11088808599848343, "grad_norm": 222.90179443359375, "learning_rate": 3e-06, "loss": 2.4097, "step": 1243 }, { "epoch": 0.1109772960435345, "grad_norm": 186.19627380371094, "learning_rate": 3e-06, "loss": -12.2427, "step": 1244 }, { "epoch": 0.11106650608858558, "grad_norm": 269.6827087402344, "learning_rate": 3e-06, "loss": 12.3411, "step": 1245 }, { "epoch": 0.11115571613363665, "grad_norm": 237.4910430908203, "learning_rate": 3e-06, "loss": -5.9803, "step": 1246 }, { "epoch": 0.11124492617868773, "grad_norm": 219.88262939453125, "learning_rate": 3e-06, "loss": -2.1975, "step": 1247 }, { "epoch": 0.11133413622373879, "grad_norm": 188.75503540039062, "learning_rate": 3e-06, "loss": -11.695, "step": 1248 }, { "completion_length": 138.06250381469727, "epoch": 0.11142334626878987, "grad_norm": 550.1611328125, "learning_rate": 3e-06, "loss": 8.1996, "reward": 2.1077709197998047, "reward_std": 0.6573592722415924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12860416434705257, "step": 1249, "zero_std_ratio": 0.0 }, { "epoch": 0.11151255631384094, "grad_norm": 739.56591796875, "learning_rate": 3e-06, "loss": 97.7263, "step": 1250 }, { "epoch": 0.111601766358892, "grad_norm": 684.5728759765625, "learning_rate": 3e-06, "loss": 37.5632, "step": 1251 }, { "epoch": 0.11169097640394308, "grad_norm": 567.6437377929688, "learning_rate": 3e-06, "loss": 10.2892, "step": 1252 }, { "epoch": 0.11178018644899415, "grad_norm": 876.806640625, "learning_rate": 3e-06, "loss": -13.9692, "step": 1253 }, { "epoch": 0.11186939649404523, "grad_norm": 592.7362670898438, "learning_rate": 3e-06, "loss": 10.123, "step": 1254 }, { "epoch": 0.1119586065390963, "grad_norm": 509.6954345703125, "learning_rate": 3e-06, "loss": 4.082, "step": 1255 }, { "epoch": 0.11204781658414738, "grad_norm": 781.4515380859375, "learning_rate": 3e-06, "loss": 90.9954, "step": 1256 }, { "epoch": 0.11213702662919844, "grad_norm": 621.8211669921875, "learning_rate": 3e-06, "loss": 32.1321, "step": 1257 }, { "epoch": 0.11222623667424952, "grad_norm": 594.2699584960938, "learning_rate": 3e-06, "loss": 7.6137, "step": 1258 }, { "epoch": 0.11231544671930059, "grad_norm": 815.78857421875, "learning_rate": 3e-06, "loss": -16.6171, "step": 1259 }, { "epoch": 0.11240465676435167, "grad_norm": 552.5240478515625, "learning_rate": 3e-06, "loss": 8.0781, "step": 1260 }, { "completion_length": 130.56250381469727, "epoch": 0.11249386680940274, "grad_norm": 737.4078979492188, "learning_rate": 3e-06, "loss": 59.8285, "reward": 2.060583472251892, "reward_std": 0.43678246438503265, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15433333814144135, "step": 1261, "zero_std_ratio": 0.0 }, { "epoch": 0.11258307685445382, "grad_norm": 522.5460205078125, "learning_rate": 3e-06, "loss": 33.9124, "step": 1262 }, { "epoch": 0.11267228689950488, "grad_norm": 680.2973022460938, "learning_rate": 3e-06, "loss": 63.2628, "step": 1263 }, { "epoch": 0.11276149694455595, "grad_norm": 631.2474365234375, "learning_rate": 3e-06, "loss": 63.7001, "step": 1264 }, { "epoch": 0.11285070698960703, "grad_norm": 618.1221313476562, "learning_rate": 3e-06, "loss": 51.6552, "step": 1265 }, { "epoch": 0.1129399170346581, "grad_norm": 677.8065185546875, "learning_rate": 3e-06, "loss": 56.3779, "step": 1266 }, { "epoch": 0.11302912707970918, "grad_norm": 694.61865234375, "learning_rate": 3e-06, "loss": 47.5276, "step": 1267 }, { "epoch": 0.11311833712476024, "grad_norm": 319.64862060546875, "learning_rate": 3e-06, "loss": 24.7961, "step": 1268 }, { "epoch": 0.11320754716981132, "grad_norm": 711.6901245117188, "learning_rate": 3e-06, "loss": 41.8441, "step": 1269 }, { "epoch": 0.11329675721486239, "grad_norm": 506.5478515625, "learning_rate": 3e-06, "loss": 50.1798, "step": 1270 }, { "epoch": 0.11338596725991347, "grad_norm": 537.6961669921875, "learning_rate": 3e-06, "loss": 32.1026, "step": 1271 }, { "epoch": 0.11347517730496454, "grad_norm": 495.77386474609375, "learning_rate": 3e-06, "loss": 34.678, "step": 1272 }, { "completion_length": 152.4583396911621, "epoch": 0.11356438735001562, "grad_norm": 540.7517700195312, "learning_rate": 3e-06, "loss": -25.9926, "reward": 1.7821251153945923, "reward_std": 0.4489431008696556, "rewards/correctness_reward_func": 1.1666666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12587500177323818, "step": 1273, "zero_std_ratio": 0.0 }, { "epoch": 0.11365359739506668, "grad_norm": 533.8948974609375, "learning_rate": 3e-06, "loss": -10.4261, "step": 1274 }, { "epoch": 0.11374280744011776, "grad_norm": 407.5013122558594, "learning_rate": 3e-06, "loss": -1.9956, "step": 1275 }, { "epoch": 0.11383201748516883, "grad_norm": 435.1839599609375, "learning_rate": 3e-06, "loss": -14.8616, "step": 1276 }, { "epoch": 0.11392122753021991, "grad_norm": 501.2892761230469, "learning_rate": 3e-06, "loss": 3.4901, "step": 1277 }, { "epoch": 0.11401043757527098, "grad_norm": 419.8990173339844, "learning_rate": 3e-06, "loss": -6.3263, "step": 1278 }, { "epoch": 0.11409964762032204, "grad_norm": 508.3893737792969, "learning_rate": 3e-06, "loss": -31.5865, "step": 1279 }, { "epoch": 0.11418885766537312, "grad_norm": 393.2361755371094, "learning_rate": 3e-06, "loss": -9.3518, "step": 1280 }, { "epoch": 0.11427806771042419, "grad_norm": 370.1325378417969, "learning_rate": 3e-06, "loss": -5.1139, "step": 1281 }, { "epoch": 0.11436727775547527, "grad_norm": 382.4687805175781, "learning_rate": 3e-06, "loss": -17.6292, "step": 1282 }, { "epoch": 0.11445648780052634, "grad_norm": 424.23553466796875, "learning_rate": 3e-06, "loss": -1.1541, "step": 1283 }, { "epoch": 0.11454569784557742, "grad_norm": 366.4822998046875, "learning_rate": 3e-06, "loss": -10.3862, "step": 1284 }, { "completion_length": 150.7916717529297, "epoch": 0.11463490789062848, "grad_norm": 64.43208312988281, "learning_rate": 3e-06, "loss": -3.6021, "reward": 2.220250129699707, "reward_std": 0.1756780087016523, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13691666349768639, "step": 1285, "zero_std_ratio": 0.125 }, { "epoch": 0.11472411793567956, "grad_norm": 146.30032348632812, "learning_rate": 3e-06, "loss": -5.2668, "step": 1286 }, { "epoch": 0.11481332798073063, "grad_norm": 63.97313690185547, "learning_rate": 3e-06, "loss": 1.0993, "step": 1287 }, { "epoch": 0.11490253802578171, "grad_norm": 62.98701858520508, "learning_rate": 3e-06, "loss": 0.0564, "step": 1288 }, { "epoch": 0.11499174807083277, "grad_norm": 70.9837417602539, "learning_rate": 3e-06, "loss": -1.8786, "step": 1289 }, { "epoch": 0.11508095811588386, "grad_norm": 127.5965576171875, "learning_rate": 3e-06, "loss": 2.6502, "step": 1290 }, { "epoch": 0.11517016816093492, "grad_norm": 109.8124008178711, "learning_rate": 3e-06, "loss": -4.5564, "step": 1291 }, { "epoch": 0.11525937820598599, "grad_norm": 165.30950927734375, "learning_rate": 3e-06, "loss": -6.4512, "step": 1292 }, { "epoch": 0.11534858825103707, "grad_norm": 52.5026741027832, "learning_rate": 3e-06, "loss": 0.4806, "step": 1293 }, { "epoch": 0.11543779829608813, "grad_norm": 116.0346908569336, "learning_rate": 3e-06, "loss": -0.2058, "step": 1294 }, { "epoch": 0.11552700834113921, "grad_norm": 83.33243560791016, "learning_rate": 3e-06, "loss": -2.1249, "step": 1295 }, { "epoch": 0.11561621838619028, "grad_norm": 105.78126525878906, "learning_rate": 3e-06, "loss": 2.4684, "step": 1296 }, { "completion_length": 132.31250381469727, "epoch": 0.11570542843124136, "grad_norm": 134.2357635498047, "learning_rate": 3e-06, "loss": -3.0816, "reward": 1.8828958868980408, "reward_std": 0.34478074312210083, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1537291705608368, "step": 1297, "zero_std_ratio": 0.0 }, { "epoch": 0.11579463847629243, "grad_norm": 103.42467498779297, "learning_rate": 3e-06, "loss": -8.8164, "step": 1298 }, { "epoch": 0.11588384852134351, "grad_norm": 182.32949829101562, "learning_rate": 3e-06, "loss": -11.4319, "step": 1299 }, { "epoch": 0.11597305856639457, "grad_norm": 208.2166748046875, "learning_rate": 3e-06, "loss": -10.9038, "step": 1300 }, { "epoch": 0.11606226861144565, "grad_norm": 130.95465087890625, "learning_rate": 3e-06, "loss": -10.8062, "step": 1301 }, { "epoch": 0.11615147865649672, "grad_norm": 119.11116790771484, "learning_rate": 3e-06, "loss": -9.441, "step": 1302 }, { "epoch": 0.1162406887015478, "grad_norm": 140.15191650390625, "learning_rate": 3e-06, "loss": -5.4138, "step": 1303 }, { "epoch": 0.11632989874659887, "grad_norm": 133.99444580078125, "learning_rate": 3e-06, "loss": -11.1186, "step": 1304 }, { "epoch": 0.11641910879164993, "grad_norm": 186.12327575683594, "learning_rate": 3e-06, "loss": -13.2229, "step": 1305 }, { "epoch": 0.11650831883670101, "grad_norm": 237.92056274414062, "learning_rate": 3e-06, "loss": -14.0843, "step": 1306 }, { "epoch": 0.11659752888175208, "grad_norm": 166.06137084960938, "learning_rate": 3e-06, "loss": -14.3065, "step": 1307 }, { "epoch": 0.11668673892680316, "grad_norm": 162.08094787597656, "learning_rate": 3e-06, "loss": -13.5576, "step": 1308 }, { "completion_length": 153.43750762939453, "epoch": 0.11677594897185423, "grad_norm": 816.9434814453125, "learning_rate": 3e-06, "loss": 65.7541, "reward": 2.333416700363159, "reward_std": 0.44685766100883484, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1042499989271164, "step": 1309, "zero_std_ratio": 0.125 }, { "epoch": 0.1168651590169053, "grad_norm": 713.5402221679688, "learning_rate": 3e-06, "loss": 86.7146, "step": 1310 }, { "epoch": 0.11695436906195637, "grad_norm": 1150.6640625, "learning_rate": 3e-06, "loss": 87.0367, "step": 1311 }, { "epoch": 0.11704357910700745, "grad_norm": 696.25537109375, "learning_rate": 3e-06, "loss": 58.2521, "step": 1312 }, { "epoch": 0.11713278915205852, "grad_norm": 869.322998046875, "learning_rate": 3e-06, "loss": 60.5598, "step": 1313 }, { "epoch": 0.1172219991971096, "grad_norm": 908.8890991210938, "learning_rate": 3e-06, "loss": 108.3106, "step": 1314 }, { "epoch": 0.11731120924216067, "grad_norm": 816.9879760742188, "learning_rate": 3e-06, "loss": 61.6733, "step": 1315 }, { "epoch": 0.11740041928721175, "grad_norm": 776.9385375976562, "learning_rate": 3e-06, "loss": 78.5087, "step": 1316 }, { "epoch": 0.11748962933226281, "grad_norm": 1054.038330078125, "learning_rate": 3e-06, "loss": 77.8217, "step": 1317 }, { "epoch": 0.11757883937731388, "grad_norm": 593.8987426757812, "learning_rate": 3e-06, "loss": 50.989, "step": 1318 }, { "epoch": 0.11766804942236496, "grad_norm": 748.6307373046875, "learning_rate": 3e-06, "loss": 44.6558, "step": 1319 }, { "epoch": 0.11775725946741603, "grad_norm": 1137.890625, "learning_rate": 3e-06, "loss": 86.6074, "step": 1320 }, { "completion_length": 116.97917175292969, "epoch": 0.1178464695124671, "grad_norm": 59.700103759765625, "learning_rate": 3e-06, "loss": -3.8916, "reward": 2.350229024887085, "reward_std": 0.22116341721266508, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1835625022649765, "step": 1321, "zero_std_ratio": 0.0 }, { "epoch": 0.11793567955751817, "grad_norm": 52.00520706176758, "learning_rate": 3e-06, "loss": -4.6376, "step": 1322 }, { "epoch": 0.11802488960256925, "grad_norm": 61.93866729736328, "learning_rate": 3e-06, "loss": -5.3856, "step": 1323 }, { "epoch": 0.11811409964762032, "grad_norm": 75.79090118408203, "learning_rate": 3e-06, "loss": -10.1807, "step": 1324 }, { "epoch": 0.1182033096926714, "grad_norm": 57.52518081665039, "learning_rate": 3e-06, "loss": -6.9506, "step": 1325 }, { "epoch": 0.11829251973772247, "grad_norm": 54.51726531982422, "learning_rate": 3e-06, "loss": -6.9744, "step": 1326 }, { "epoch": 0.11838172978277355, "grad_norm": 54.5262336730957, "learning_rate": 3e-06, "loss": -4.3897, "step": 1327 }, { "epoch": 0.11847093982782461, "grad_norm": 46.70759201049805, "learning_rate": 3e-06, "loss": -5.4283, "step": 1328 }, { "epoch": 0.11856014987287569, "grad_norm": 84.02616882324219, "learning_rate": 3e-06, "loss": -6.4225, "step": 1329 }, { "epoch": 0.11864935991792676, "grad_norm": 84.2325668334961, "learning_rate": 3e-06, "loss": -11.725, "step": 1330 }, { "epoch": 0.11873856996297782, "grad_norm": 61.59962844848633, "learning_rate": 3e-06, "loss": -8.3019, "step": 1331 }, { "epoch": 0.1188277800080289, "grad_norm": 47.83831024169922, "learning_rate": 3e-06, "loss": -8.035, "step": 1332 }, { "completion_length": 118.75000381469727, "epoch": 0.11891699005307997, "grad_norm": 255.8539276123047, "learning_rate": 3e-06, "loss": -9.5405, "reward": 2.0334584712982178, "reward_std": 0.5094788670539856, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14804166555404663, "step": 1333, "zero_std_ratio": 0.0 }, { "epoch": 0.11900620009813105, "grad_norm": 288.6775207519531, "learning_rate": 3e-06, "loss": 0.6784, "step": 1334 }, { "epoch": 0.11909541014318212, "grad_norm": 197.23643493652344, "learning_rate": 3e-06, "loss": 2.0711, "step": 1335 }, { "epoch": 0.1191846201882332, "grad_norm": 383.6775207519531, "learning_rate": 3e-06, "loss": 7.2175, "step": 1336 }, { "epoch": 0.11927383023328426, "grad_norm": 509.5005187988281, "learning_rate": 3e-06, "loss": 24.5104, "step": 1337 }, { "epoch": 0.11936304027833534, "grad_norm": 453.14947509765625, "learning_rate": 3e-06, "loss": 30.6734, "step": 1338 }, { "epoch": 0.11945225032338641, "grad_norm": 275.91925048828125, "learning_rate": 3e-06, "loss": -11.3542, "step": 1339 }, { "epoch": 0.11954146036843749, "grad_norm": 288.1315612792969, "learning_rate": 3e-06, "loss": -1.6763, "step": 1340 }, { "epoch": 0.11963067041348856, "grad_norm": 190.97845458984375, "learning_rate": 3e-06, "loss": 0.7165, "step": 1341 }, { "epoch": 0.11971988045853964, "grad_norm": 380.86102294921875, "learning_rate": 3e-06, "loss": 4.2454, "step": 1342 }, { "epoch": 0.1198090905035907, "grad_norm": 470.7318420410156, "learning_rate": 3e-06, "loss": 18.4597, "step": 1343 }, { "epoch": 0.11989830054864177, "grad_norm": 439.4569091796875, "learning_rate": 3e-06, "loss": 25.8113, "step": 1344 }, { "completion_length": 138.70834350585938, "epoch": 0.11998751059369285, "grad_norm": 1121.697021484375, "learning_rate": 3e-06, "loss": -1.833, "reward": 1.5940208435058594, "reward_std": 0.1865759715437889, "rewards/correctness_reward_func": 1.0416666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13568750768899918, "step": 1345, "zero_std_ratio": 0.0 }, { "epoch": 0.12007672063874392, "grad_norm": 361.7314758300781, "learning_rate": 3e-06, "loss": -5.4719, "step": 1346 }, { "epoch": 0.120165930683795, "grad_norm": 722.0169067382812, "learning_rate": 3e-06, "loss": 26.6872, "step": 1347 }, { "epoch": 0.12025514072884606, "grad_norm": 521.7423706054688, "learning_rate": 3e-06, "loss": 12.9836, "step": 1348 }, { "epoch": 0.12034435077389714, "grad_norm": 270.7301025390625, "learning_rate": 3e-06, "loss": 10.6828, "step": 1349 }, { "epoch": 0.12043356081894821, "grad_norm": 339.52813720703125, "learning_rate": 3e-06, "loss": 19.5909, "step": 1350 }, { "epoch": 0.12052277086399929, "grad_norm": 288.1322326660156, "learning_rate": 3e-06, "loss": -2.7182, "step": 1351 }, { "epoch": 0.12061198090905036, "grad_norm": 404.0463562011719, "learning_rate": 3e-06, "loss": -6.4436, "step": 1352 }, { "epoch": 0.12070119095410144, "grad_norm": 480.42559814453125, "learning_rate": 3e-06, "loss": 27.9973, "step": 1353 }, { "epoch": 0.1207904009991525, "grad_norm": 454.6216735839844, "learning_rate": 3e-06, "loss": 10.5823, "step": 1354 }, { "epoch": 0.12087961104420358, "grad_norm": 292.9732971191406, "learning_rate": 3e-06, "loss": 8.2813, "step": 1355 }, { "epoch": 0.12096882108925465, "grad_norm": 302.1980285644531, "learning_rate": 3e-06, "loss": 18.4478, "step": 1356 }, { "completion_length": 114.14583587646484, "epoch": 0.12105803113430573, "grad_norm": 789.7766723632812, "learning_rate": 3e-06, "loss": -361.7816, "reward": 2.2812918424606323, "reward_std": 0.7483960092067719, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1771249994635582, "step": 1357, "zero_std_ratio": 0.0 }, { "epoch": 0.1211472411793568, "grad_norm": 1342.53662109375, "learning_rate": 3e-06, "loss": -332.2092, "step": 1358 }, { "epoch": 0.12123645122440786, "grad_norm": 852.587158203125, "learning_rate": 3e-06, "loss": -354.9008, "step": 1359 }, { "epoch": 0.12132566126945894, "grad_norm": 1136.1865234375, "learning_rate": 3e-06, "loss": -338.6482, "step": 1360 }, { "epoch": 0.12141487131451001, "grad_norm": 1198.8544921875, "learning_rate": 3e-06, "loss": -362.0203, "step": 1361 }, { "epoch": 0.12150408135956109, "grad_norm": 767.9398193359375, "learning_rate": 3e-06, "loss": -368.6167, "step": 1362 }, { "epoch": 0.12159329140461216, "grad_norm": 798.4600830078125, "learning_rate": 3e-06, "loss": -382.5811, "step": 1363 }, { "epoch": 0.12168250144966324, "grad_norm": 905.6390991210938, "learning_rate": 3e-06, "loss": -359.7239, "step": 1364 }, { "epoch": 0.1217717114947143, "grad_norm": 813.3554077148438, "learning_rate": 3e-06, "loss": -387.0817, "step": 1365 }, { "epoch": 0.12186092153976538, "grad_norm": 1077.1717529296875, "learning_rate": 3e-06, "loss": -374.2281, "step": 1366 }, { "epoch": 0.12195013158481645, "grad_norm": 1666.1490478515625, "learning_rate": 3e-06, "loss": -393.2914, "step": 1367 }, { "epoch": 0.12203934162986753, "grad_norm": 792.9961547851562, "learning_rate": 3e-06, "loss": -407.0092, "step": 1368 }, { "completion_length": 143.4791717529297, "epoch": 0.1221285516749186, "grad_norm": 346.3370056152344, "learning_rate": 3e-06, "loss": -0.7849, "reward": 1.621000051498413, "reward_std": 0.5591593682765961, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12099999561905861, "step": 1369, "zero_std_ratio": 0.0 }, { "epoch": 0.12221776171996968, "grad_norm": 331.146240234375, "learning_rate": 3e-06, "loss": 22.5103, "step": 1370 }, { "epoch": 0.12230697176502074, "grad_norm": 303.2864990234375, "learning_rate": 3e-06, "loss": -13.1863, "step": 1371 }, { "epoch": 0.12239618181007181, "grad_norm": 314.0926513671875, "learning_rate": 3e-06, "loss": 7.2222, "step": 1372 }, { "epoch": 0.12248539185512289, "grad_norm": 282.6340637207031, "learning_rate": 3e-06, "loss": 3.6854, "step": 1373 }, { "epoch": 0.12257460190017395, "grad_norm": 327.83856201171875, "learning_rate": 3e-06, "loss": 6.0168, "step": 1374 }, { "epoch": 0.12266381194522503, "grad_norm": 409.41033935546875, "learning_rate": 3e-06, "loss": -3.9087, "step": 1375 }, { "epoch": 0.1227530219902761, "grad_norm": 341.80078125, "learning_rate": 3e-06, "loss": 17.7844, "step": 1376 }, { "epoch": 0.12284223203532718, "grad_norm": 320.9837646484375, "learning_rate": 3e-06, "loss": -18.6977, "step": 1377 }, { "epoch": 0.12293144208037825, "grad_norm": 309.8314208984375, "learning_rate": 3e-06, "loss": 4.6537, "step": 1378 }, { "epoch": 0.12302065212542933, "grad_norm": 286.1745910644531, "learning_rate": 3e-06, "loss": -0.5282, "step": 1379 }, { "epoch": 0.1231098621704804, "grad_norm": 367.7530517578125, "learning_rate": 3e-06, "loss": 2.5712, "step": 1380 }, { "completion_length": 140.4375, "epoch": 0.12319907221553147, "grad_norm": 441.84033203125, "learning_rate": 3e-06, "loss": -35.237, "reward": 2.2606041431427, "reward_std": 0.5069623440504074, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09393750131130219, "step": 1381, "zero_std_ratio": 0.0 }, { "epoch": 0.12328828226058254, "grad_norm": 424.677001953125, "learning_rate": 3e-06, "loss": -30.367, "step": 1382 }, { "epoch": 0.12337749230563362, "grad_norm": 515.3092041015625, "learning_rate": 3e-06, "loss": -114.4816, "step": 1383 }, { "epoch": 0.12346670235068469, "grad_norm": 880.9327392578125, "learning_rate": 3e-06, "loss": -72.5326, "step": 1384 }, { "epoch": 0.12355591239573575, "grad_norm": 943.7388305664062, "learning_rate": 3e-06, "loss": -106.8934, "step": 1385 }, { "epoch": 0.12364512244078683, "grad_norm": 729.3348999023438, "learning_rate": 3e-06, "loss": -63.3284, "step": 1386 }, { "epoch": 0.1237343324858379, "grad_norm": 696.5101928710938, "learning_rate": 3e-06, "loss": -46.2135, "step": 1387 }, { "epoch": 0.12382354253088898, "grad_norm": 893.1636352539062, "learning_rate": 3e-06, "loss": -44.905, "step": 1388 }, { "epoch": 0.12391275257594005, "grad_norm": 477.4311218261719, "learning_rate": 3e-06, "loss": -128.0508, "step": 1389 }, { "epoch": 0.12400196262099113, "grad_norm": 746.9646606445312, "learning_rate": 3e-06, "loss": -93.2043, "step": 1390 }, { "epoch": 0.1240911726660422, "grad_norm": 955.401123046875, "learning_rate": 3e-06, "loss": -134.5935, "step": 1391 }, { "epoch": 0.12418038271109327, "grad_norm": 859.1612548828125, "learning_rate": 3e-06, "loss": -86.3171, "step": 1392 }, { "completion_length": 117.87500381469727, "epoch": 0.12426959275614434, "grad_norm": 664.340576171875, "learning_rate": 3e-06, "loss": -104.9476, "reward": 2.5831665992736816, "reward_std": 0.2565724179148674, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.176916666328907, "step": 1393, "zero_std_ratio": 0.0 }, { "epoch": 0.12435880280119542, "grad_norm": 669.9559936523438, "learning_rate": 3e-06, "loss": -139.0697, "step": 1394 }, { "epoch": 0.12444801284624649, "grad_norm": 674.5648803710938, "learning_rate": 3e-06, "loss": -111.2749, "step": 1395 }, { "epoch": 0.12453722289129757, "grad_norm": 601.0291748046875, "learning_rate": 3e-06, "loss": -83.8575, "step": 1396 }, { "epoch": 0.12462643293634863, "grad_norm": 679.1978759765625, "learning_rate": 3e-06, "loss": -110.6151, "step": 1397 }, { "epoch": 0.1247156429813997, "grad_norm": 724.3170776367188, "learning_rate": 3e-06, "loss": -124.7426, "step": 1398 }, { "epoch": 0.12480485302645078, "grad_norm": 555.1895141601562, "learning_rate": 3e-06, "loss": -126.9873, "step": 1399 }, { "epoch": 0.12489406307150185, "grad_norm": 672.0289916992188, "learning_rate": 3e-06, "loss": -156.5453, "step": 1400 }, { "epoch": 0.12498327311655293, "grad_norm": 560.0007934570312, "learning_rate": 3e-06, "loss": -129.3565, "step": 1401 }, { "epoch": 0.125072483161604, "grad_norm": 564.6636962890625, "learning_rate": 3e-06, "loss": -98.3201, "step": 1402 }, { "epoch": 0.12516169320665507, "grad_norm": 705.2073974609375, "learning_rate": 3e-06, "loss": -137.3138, "step": 1403 }, { "epoch": 0.12525090325170615, "grad_norm": 738.72119140625, "learning_rate": 3e-06, "loss": -152.8548, "step": 1404 }, { "completion_length": 116.75000381469727, "epoch": 0.1253401132967572, "grad_norm": 767.5428466796875, "learning_rate": 3e-06, "loss": -51.0529, "reward": 2.3569791316986084, "reward_std": 0.4473089873790741, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2007291615009308, "step": 1405, "zero_std_ratio": 0.0 }, { "epoch": 0.12542932334180829, "grad_norm": 738.8226928710938, "learning_rate": 3e-06, "loss": -36.4735, "step": 1406 }, { "epoch": 0.12551853338685937, "grad_norm": 532.7732543945312, "learning_rate": 3e-06, "loss": -3.3106, "step": 1407 }, { "epoch": 0.12560774343191045, "grad_norm": 1219.233154296875, "learning_rate": 3e-06, "loss": -10.491, "step": 1408 }, { "epoch": 0.1256969534769615, "grad_norm": 611.2919311523438, "learning_rate": 3e-06, "loss": 5.0358, "step": 1409 }, { "epoch": 0.12578616352201258, "grad_norm": 1330.2183837890625, "learning_rate": 3e-06, "loss": -54.0219, "step": 1410 }, { "epoch": 0.12587537356706366, "grad_norm": 634.2547607421875, "learning_rate": 3e-06, "loss": -61.707, "step": 1411 }, { "epoch": 0.1259645836121147, "grad_norm": 830.1190185546875, "learning_rate": 3e-06, "loss": -44.7945, "step": 1412 }, { "epoch": 0.1260537936571658, "grad_norm": 656.9666748046875, "learning_rate": 3e-06, "loss": -5.745, "step": 1413 }, { "epoch": 0.12614300370221687, "grad_norm": 1234.1181640625, "learning_rate": 3e-06, "loss": -22.9599, "step": 1414 }, { "epoch": 0.12623221374726795, "grad_norm": 810.6359252929688, "learning_rate": 3e-06, "loss": -5.0873, "step": 1415 }, { "epoch": 0.126321423792319, "grad_norm": 1392.664306640625, "learning_rate": 3e-06, "loss": -63.0076, "step": 1416 }, { "completion_length": 107.02083587646484, "epoch": 0.12641063383737008, "grad_norm": 535.7810668945312, "learning_rate": 3e-06, "loss": -19.2557, "reward": 2.204854369163513, "reward_std": 0.5064078867435455, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22568749636411667, "step": 1417, "zero_std_ratio": 0.0 }, { "epoch": 0.12649984388242116, "grad_norm": 547.939453125, "learning_rate": 3e-06, "loss": -52.9769, "step": 1418 }, { "epoch": 0.12658905392747224, "grad_norm": 1266.0494384765625, "learning_rate": 3e-06, "loss": -50.3919, "step": 1419 }, { "epoch": 0.1266782639725233, "grad_norm": 959.639404296875, "learning_rate": 3e-06, "loss": -47.5673, "step": 1420 }, { "epoch": 0.12676747401757438, "grad_norm": 924.2545776367188, "learning_rate": 3e-06, "loss": -11.8101, "step": 1421 }, { "epoch": 0.12685668406262546, "grad_norm": 674.6265869140625, "learning_rate": 3e-06, "loss": -62.9894, "step": 1422 }, { "epoch": 0.12694589410767654, "grad_norm": 611.5636596679688, "learning_rate": 3e-06, "loss": -24.2492, "step": 1423 }, { "epoch": 0.1270351041527276, "grad_norm": 573.57470703125, "learning_rate": 3e-06, "loss": -60.3758, "step": 1424 }, { "epoch": 0.12712431419777867, "grad_norm": 850.60693359375, "learning_rate": 3e-06, "loss": -51.1853, "step": 1425 }, { "epoch": 0.12721352424282975, "grad_norm": 1190.2738037109375, "learning_rate": 3e-06, "loss": -64.8067, "step": 1426 }, { "epoch": 0.1273027342878808, "grad_norm": 884.0977783203125, "learning_rate": 3e-06, "loss": -18.4246, "step": 1427 }, { "epoch": 0.12739194433293188, "grad_norm": 682.3296508789062, "learning_rate": 3e-06, "loss": -72.1404, "step": 1428 }, { "completion_length": 136.56250762939453, "epoch": 0.12748115437798296, "grad_norm": 1359.529052734375, "learning_rate": 3e-06, "loss": 208.4725, "reward": 2.0545417070388794, "reward_std": 0.6211664974689484, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12745832651853561, "step": 1429, "zero_std_ratio": 0.0 }, { "epoch": 0.12757036442303404, "grad_norm": 1307.88232421875, "learning_rate": 3e-06, "loss": 254.5796, "step": 1430 }, { "epoch": 0.1276595744680851, "grad_norm": 1878.1898193359375, "learning_rate": 3e-06, "loss": -34.1773, "step": 1431 }, { "epoch": 0.12774878451313618, "grad_norm": 1812.6014404296875, "learning_rate": 3e-06, "loss": 302.3062, "step": 1432 }, { "epoch": 0.12783799455818726, "grad_norm": 2044.9403076171875, "learning_rate": 3e-06, "loss": 44.0528, "step": 1433 }, { "epoch": 0.12792720460323834, "grad_norm": 1851.6116943359375, "learning_rate": 3e-06, "loss": 49.5851, "step": 1434 }, { "epoch": 0.1280164146482894, "grad_norm": 2113.39794921875, "learning_rate": 3e-06, "loss": 196.8499, "step": 1435 }, { "epoch": 0.12810562469334047, "grad_norm": 1287.9775390625, "learning_rate": 3e-06, "loss": 235.1401, "step": 1436 }, { "epoch": 0.12819483473839155, "grad_norm": 1830.1588134765625, "learning_rate": 3e-06, "loss": -49.5711, "step": 1437 }, { "epoch": 0.1282840447834426, "grad_norm": 1435.050048828125, "learning_rate": 3e-06, "loss": 289.4097, "step": 1438 }, { "epoch": 0.12837325482849368, "grad_norm": 1800.0789794921875, "learning_rate": 3e-06, "loss": 30.6825, "step": 1439 }, { "epoch": 0.12846246487354476, "grad_norm": 2387.952880859375, "learning_rate": 3e-06, "loss": 45.8289, "step": 1440 }, { "completion_length": 116.02083587646484, "epoch": 0.12855167491859584, "grad_norm": 1155.3507080078125, "learning_rate": 3e-06, "loss": 265.5505, "reward": 2.0338125824928284, "reward_std": 0.45823561400175095, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2004791647195816, "step": 1441, "zero_std_ratio": 0.0 }, { "epoch": 0.1286408849636469, "grad_norm": 1068.2740478515625, "learning_rate": 3e-06, "loss": 231.8086, "step": 1442 }, { "epoch": 0.12873009500869798, "grad_norm": 1055.8021240234375, "learning_rate": 3e-06, "loss": 265.7319, "step": 1443 }, { "epoch": 0.12881930505374906, "grad_norm": 654.5939331054688, "learning_rate": 3e-06, "loss": 225.5054, "step": 1444 }, { "epoch": 0.12890851509880014, "grad_norm": 853.0970458984375, "learning_rate": 3e-06, "loss": 232.0057, "step": 1445 }, { "epoch": 0.1289977251438512, "grad_norm": 1135.107666015625, "learning_rate": 3e-06, "loss": 221.9774, "step": 1446 }, { "epoch": 0.12908693518890227, "grad_norm": 908.7825927734375, "learning_rate": 3e-06, "loss": 245.362, "step": 1447 }, { "epoch": 0.12917614523395335, "grad_norm": 1166.5682373046875, "learning_rate": 3e-06, "loss": 206.0756, "step": 1448 }, { "epoch": 0.12926535527900443, "grad_norm": 866.4660034179688, "learning_rate": 3e-06, "loss": 237.2835, "step": 1449 }, { "epoch": 0.12935456532405548, "grad_norm": 635.88623046875, "learning_rate": 3e-06, "loss": 216.354, "step": 1450 }, { "epoch": 0.12944377536910656, "grad_norm": 740.2254028320312, "learning_rate": 3e-06, "loss": 211.3737, "step": 1451 }, { "epoch": 0.12953298541415764, "grad_norm": 841.6773681640625, "learning_rate": 3e-06, "loss": 204.9694, "step": 1452 }, { "completion_length": 106.0, "epoch": 0.1296221954592087, "grad_norm": 81.25074768066406, "learning_rate": 3e-06, "loss": -11.2076, "reward": 2.7076042890548706, "reward_std": 0.04550948552787304, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20760416984558105, "step": 1453, "zero_std_ratio": 0.0 }, { "epoch": 0.12971140550425977, "grad_norm": 86.57603454589844, "learning_rate": 3e-06, "loss": -11.9418, "step": 1454 }, { "epoch": 0.12980061554931085, "grad_norm": 72.47956848144531, "learning_rate": 3e-06, "loss": -10.8762, "step": 1455 }, { "epoch": 0.12988982559436194, "grad_norm": 87.24005889892578, "learning_rate": 3e-06, "loss": -14.0014, "step": 1456 }, { "epoch": 0.129979035639413, "grad_norm": 70.69519805908203, "learning_rate": 3e-06, "loss": -11.64, "step": 1457 }, { "epoch": 0.13006824568446407, "grad_norm": 67.05101776123047, "learning_rate": 3e-06, "loss": -11.5829, "step": 1458 }, { "epoch": 0.13015745572951515, "grad_norm": 80.47762298583984, "learning_rate": 3e-06, "loss": -11.7077, "step": 1459 }, { "epoch": 0.13024666577456623, "grad_norm": 80.73660278320312, "learning_rate": 3e-06, "loss": -13.0816, "step": 1460 }, { "epoch": 0.13033587581961728, "grad_norm": 82.17903900146484, "learning_rate": 3e-06, "loss": -11.8801, "step": 1461 }, { "epoch": 0.13042508586466836, "grad_norm": 96.77212524414062, "learning_rate": 3e-06, "loss": -15.6827, "step": 1462 }, { "epoch": 0.13051429590971944, "grad_norm": 65.8167724609375, "learning_rate": 3e-06, "loss": -12.5171, "step": 1463 }, { "epoch": 0.1306035059547705, "grad_norm": 70.0600814819336, "learning_rate": 3e-06, "loss": -12.4502, "step": 1464 }, { "completion_length": 121.4375, "epoch": 0.13069271599982157, "grad_norm": 697.7041625976562, "learning_rate": 3e-06, "loss": -71.914, "reward": 2.5268125534057617, "reward_std": 0.3090652823448181, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17264583706855774, "step": 1465, "zero_std_ratio": 0.0 }, { "epoch": 0.13078192604487265, "grad_norm": 969.583740234375, "learning_rate": 3e-06, "loss": -96.3899, "step": 1466 }, { "epoch": 0.13087113608992373, "grad_norm": 1708.633056640625, "learning_rate": 3e-06, "loss": -93.2228, "step": 1467 }, { "epoch": 0.1309603461349748, "grad_norm": 865.3252563476562, "learning_rate": 3e-06, "loss": -123.6049, "step": 1468 }, { "epoch": 0.13104955618002587, "grad_norm": 816.3767700195312, "learning_rate": 3e-06, "loss": -128.9213, "step": 1469 }, { "epoch": 0.13113876622507695, "grad_norm": 763.7314453125, "learning_rate": 3e-06, "loss": -72.8294, "step": 1470 }, { "epoch": 0.13122797627012803, "grad_norm": 745.7823486328125, "learning_rate": 3e-06, "loss": -96.6334, "step": 1471 }, { "epoch": 0.13131718631517908, "grad_norm": 955.34521484375, "learning_rate": 3e-06, "loss": -129.8123, "step": 1472 }, { "epoch": 0.13140639636023016, "grad_norm": 862.6156616210938, "learning_rate": 3e-06, "loss": -139.8425, "step": 1473 }, { "epoch": 0.13149560640528124, "grad_norm": 670.8921508789062, "learning_rate": 3e-06, "loss": -151.7562, "step": 1474 }, { "epoch": 0.13158481645033232, "grad_norm": 995.1624145507812, "learning_rate": 3e-06, "loss": -160.9276, "step": 1475 }, { "epoch": 0.13167402649538337, "grad_norm": 596.0221557617188, "learning_rate": 3e-06, "loss": -98.9438, "step": 1476 }, { "completion_length": 117.20833587646484, "epoch": 0.13176323654043445, "grad_norm": 1281.7259521484375, "learning_rate": 3e-06, "loss": 66.3909, "reward": 2.37918758392334, "reward_std": 0.5254138112068176, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21252083033323288, "step": 1477, "zero_std_ratio": 0.125 }, { "epoch": 0.13185244658548553, "grad_norm": 1832.419189453125, "learning_rate": 3e-06, "loss": 152.6254, "step": 1478 }, { "epoch": 0.13194165663053659, "grad_norm": 2156.6796875, "learning_rate": 3e-06, "loss": 64.6652, "step": 1479 }, { "epoch": 0.13203086667558767, "grad_norm": 1099.5675048828125, "learning_rate": 3e-06, "loss": 74.1128, "step": 1480 }, { "epoch": 0.13212007672063875, "grad_norm": 1159.567138671875, "learning_rate": 3e-06, "loss": 153.6367, "step": 1481 }, { "epoch": 0.13220928676568983, "grad_norm": 1700.0208740234375, "learning_rate": 3e-06, "loss": 110.7726, "step": 1482 }, { "epoch": 0.13229849681074088, "grad_norm": 1182.3519287109375, "learning_rate": 3e-06, "loss": 63.6161, "step": 1483 }, { "epoch": 0.13238770685579196, "grad_norm": 1845.9815673828125, "learning_rate": 3e-06, "loss": 150.0304, "step": 1484 }, { "epoch": 0.13247691690084304, "grad_norm": 2293.36083984375, "learning_rate": 3e-06, "loss": 40.1143, "step": 1485 }, { "epoch": 0.13256612694589412, "grad_norm": 1194.2135009765625, "learning_rate": 3e-06, "loss": 61.8674, "step": 1486 }, { "epoch": 0.13265533699094517, "grad_norm": 1036.80615234375, "learning_rate": 3e-06, "loss": 138.8943, "step": 1487 }, { "epoch": 0.13274454703599625, "grad_norm": 1460.7186279296875, "learning_rate": 3e-06, "loss": 89.2599, "step": 1488 }, { "completion_length": 111.79167175292969, "epoch": 0.13283375708104733, "grad_norm": 765.440185546875, "learning_rate": 3e-06, "loss": 34.1591, "reward": 2.5814167261123657, "reward_std": 0.36231209337711334, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21683332324028015, "step": 1489, "zero_std_ratio": 0.0 }, { "epoch": 0.1329229671260984, "grad_norm": 890.7492065429688, "learning_rate": 3e-06, "loss": 39.8823, "step": 1490 }, { "epoch": 0.13301217717114946, "grad_norm": 581.3748779296875, "learning_rate": 3e-06, "loss": 19.2035, "step": 1491 }, { "epoch": 0.13310138721620055, "grad_norm": 697.298828125, "learning_rate": 3e-06, "loss": 23.3238, "step": 1492 }, { "epoch": 0.13319059726125163, "grad_norm": 535.7828369140625, "learning_rate": 3e-06, "loss": 35.7368, "step": 1493 }, { "epoch": 0.13327980730630268, "grad_norm": 719.7819213867188, "learning_rate": 3e-06, "loss": 5.173, "step": 1494 }, { "epoch": 0.13336901735135376, "grad_norm": 803.5640869140625, "learning_rate": 3e-06, "loss": 25.9877, "step": 1495 }, { "epoch": 0.13345822739640484, "grad_norm": 1086.519287109375, "learning_rate": 3e-06, "loss": 30.5472, "step": 1496 }, { "epoch": 0.13354743744145592, "grad_norm": 955.2864379882812, "learning_rate": 3e-06, "loss": 14.5545, "step": 1497 }, { "epoch": 0.13363664748650697, "grad_norm": 729.1488037109375, "learning_rate": 3e-06, "loss": 17.2955, "step": 1498 }, { "epoch": 0.13372585753155805, "grad_norm": 532.2217407226562, "learning_rate": 3e-06, "loss": 28.6444, "step": 1499 }, { "epoch": 0.13381506757660913, "grad_norm": 681.295166015625, "learning_rate": 3e-06, "loss": -3.4288, "step": 1500 }, { "completion_length": 109.10417175292969, "epoch": 0.1339042776216602, "grad_norm": 874.5466918945312, "learning_rate": 3e-06, "loss": -251.7181, "reward": 2.537354350090027, "reward_std": 0.38086244463920593, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21443749219179153, "step": 1501, "zero_std_ratio": 0.0 }, { "epoch": 0.13399348766671126, "grad_norm": 794.2526245117188, "learning_rate": 3e-06, "loss": -232.9978, "step": 1502 }, { "epoch": 0.13408269771176234, "grad_norm": 898.3654174804688, "learning_rate": 3e-06, "loss": -258.013, "step": 1503 }, { "epoch": 0.13417190775681342, "grad_norm": 950.7967529296875, "learning_rate": 3e-06, "loss": -268.5499, "step": 1504 }, { "epoch": 0.13426111780186448, "grad_norm": 883.3775634765625, "learning_rate": 3e-06, "loss": -260.9937, "step": 1505 }, { "epoch": 0.13435032784691556, "grad_norm": 841.09716796875, "learning_rate": 3e-06, "loss": -201.0721, "step": 1506 }, { "epoch": 0.13443953789196664, "grad_norm": 833.3287353515625, "learning_rate": 3e-06, "loss": -267.1441, "step": 1507 }, { "epoch": 0.13452874793701772, "grad_norm": 800.0440673828125, "learning_rate": 3e-06, "loss": -246.4116, "step": 1508 }, { "epoch": 0.13461795798206877, "grad_norm": 894.3017578125, "learning_rate": 3e-06, "loss": -274.9581, "step": 1509 }, { "epoch": 0.13470716802711985, "grad_norm": 1036.5513916015625, "learning_rate": 3e-06, "loss": -290.6367, "step": 1510 }, { "epoch": 0.13479637807217093, "grad_norm": 768.5258178710938, "learning_rate": 3e-06, "loss": -286.0197, "step": 1511 }, { "epoch": 0.134885588117222, "grad_norm": 835.9033203125, "learning_rate": 3e-06, "loss": -223.6343, "step": 1512 }, { "completion_length": 135.37500381469727, "epoch": 0.13497479816227306, "grad_norm": 1273.8145751953125, "learning_rate": 3e-06, "loss": -118.7228, "reward": 2.0659791231155396, "reward_std": 0.6296385675668716, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14931250736117363, "step": 1513, "zero_std_ratio": 0.0 }, { "epoch": 0.13506400820732414, "grad_norm": 1386.9256591796875, "learning_rate": 3e-06, "loss": -3.7572, "step": 1514 }, { "epoch": 0.13515321825237522, "grad_norm": 1552.3677978515625, "learning_rate": 3e-06, "loss": -51.4518, "step": 1515 }, { "epoch": 0.1352424282974263, "grad_norm": 1948.14111328125, "learning_rate": 3e-06, "loss": -100.8674, "step": 1516 }, { "epoch": 0.13533163834247736, "grad_norm": 1424.5458984375, "learning_rate": 3e-06, "loss": -47.4962, "step": 1517 }, { "epoch": 0.13542084838752844, "grad_norm": 1114.11181640625, "learning_rate": 3e-06, "loss": -90.961, "step": 1518 }, { "epoch": 0.13551005843257952, "grad_norm": 1333.2547607421875, "learning_rate": 3e-06, "loss": -132.5085, "step": 1519 }, { "epoch": 0.13559926847763057, "grad_norm": 1323.2939453125, "learning_rate": 3e-06, "loss": -16.2315, "step": 1520 }, { "epoch": 0.13568847852268165, "grad_norm": 1497.0128173828125, "learning_rate": 3e-06, "loss": -72.5231, "step": 1521 }, { "epoch": 0.13577768856773273, "grad_norm": 1749.5069580078125, "learning_rate": 3e-06, "loss": -125.3563, "step": 1522 }, { "epoch": 0.1358668986127838, "grad_norm": 1598.31787109375, "learning_rate": 3e-06, "loss": -74.3843, "step": 1523 }, { "epoch": 0.13595610865783486, "grad_norm": 1268.6849365234375, "learning_rate": 3e-06, "loss": -109.1455, "step": 1524 }, { "completion_length": 134.52083587646484, "epoch": 0.13604531870288594, "grad_norm": 1321.5076904296875, "learning_rate": 3e-06, "loss": 120.2129, "reward": 2.0626251697540283, "reward_std": 0.5234281718730927, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1563750095665455, "step": 1525, "zero_std_ratio": 0.0 }, { "epoch": 0.13613452874793702, "grad_norm": 1222.2694091796875, "learning_rate": 3e-06, "loss": 130.8681, "step": 1526 }, { "epoch": 0.1362237387929881, "grad_norm": 1433.10498046875, "learning_rate": 3e-06, "loss": 153.9208, "step": 1527 }, { "epoch": 0.13631294883803916, "grad_norm": 1158.427001953125, "learning_rate": 3e-06, "loss": 113.2684, "step": 1528 }, { "epoch": 0.13640215888309024, "grad_norm": 1139.9339599609375, "learning_rate": 3e-06, "loss": 76.0094, "step": 1529 }, { "epoch": 0.13649136892814132, "grad_norm": 1212.093017578125, "learning_rate": 3e-06, "loss": 174.7632, "step": 1530 }, { "epoch": 0.13658057897319237, "grad_norm": 1497.928955078125, "learning_rate": 3e-06, "loss": 110.0752, "step": 1531 }, { "epoch": 0.13666978901824345, "grad_norm": 1279.407958984375, "learning_rate": 3e-06, "loss": 125.8692, "step": 1532 }, { "epoch": 0.13675899906329453, "grad_norm": 1465.5513916015625, "learning_rate": 3e-06, "loss": 142.0573, "step": 1533 }, { "epoch": 0.1368482091083456, "grad_norm": 1114.577880859375, "learning_rate": 3e-06, "loss": 97.0018, "step": 1534 }, { "epoch": 0.13693741915339666, "grad_norm": 1208.373779296875, "learning_rate": 3e-06, "loss": 63.0144, "step": 1535 }, { "epoch": 0.13702662919844774, "grad_norm": 1987.0606689453125, "learning_rate": 3e-06, "loss": 161.4095, "step": 1536 }, { "completion_length": 107.4375, "epoch": 0.13711583924349882, "grad_norm": 1091.0252685546875, "learning_rate": 3e-06, "loss": -297.7603, "reward": 2.2568334341049194, "reward_std": 0.5629529803991318, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.21516666561365128, "step": 1537, "zero_std_ratio": 0.0 }, { "epoch": 0.1372050492885499, "grad_norm": 1411.4197998046875, "learning_rate": 3e-06, "loss": -316.0208, "step": 1538 }, { "epoch": 0.13729425933360095, "grad_norm": 990.6038818359375, "learning_rate": 3e-06, "loss": -275.7917, "step": 1539 }, { "epoch": 0.13738346937865203, "grad_norm": 1163.958740234375, "learning_rate": 3e-06, "loss": -357.1367, "step": 1540 }, { "epoch": 0.13747267942370311, "grad_norm": 1167.002685546875, "learning_rate": 3e-06, "loss": -276.4684, "step": 1541 }, { "epoch": 0.1375618894687542, "grad_norm": 1276.5869140625, "learning_rate": 3e-06, "loss": -369.6291, "step": 1542 }, { "epoch": 0.13765109951380525, "grad_norm": 1164.8736572265625, "learning_rate": 3e-06, "loss": -308.2538, "step": 1543 }, { "epoch": 0.13774030955885633, "grad_norm": 1285.190673828125, "learning_rate": 3e-06, "loss": -326.7666, "step": 1544 }, { "epoch": 0.1378295196039074, "grad_norm": 1247.821533203125, "learning_rate": 3e-06, "loss": -290.6486, "step": 1545 }, { "epoch": 0.13791872964895846, "grad_norm": 1194.13232421875, "learning_rate": 3e-06, "loss": -382.5776, "step": 1546 }, { "epoch": 0.13800793969400954, "grad_norm": 1206.8460693359375, "learning_rate": 3e-06, "loss": -302.577, "step": 1547 }, { "epoch": 0.13809714973906062, "grad_norm": 1196.979248046875, "learning_rate": 3e-06, "loss": -390.5432, "step": 1548 }, { "completion_length": 109.41667175292969, "epoch": 0.1381863597841117, "grad_norm": 326.03936767578125, "learning_rate": 3e-06, "loss": 45.8013, "reward": 2.609562635421753, "reward_std": 0.17681674100458622, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20331250876188278, "step": 1549, "zero_std_ratio": 0.0 }, { "epoch": 0.13827556982916275, "grad_norm": 402.5303649902344, "learning_rate": 3e-06, "loss": 45.5476, "step": 1550 }, { "epoch": 0.13836477987421383, "grad_norm": 445.0027160644531, "learning_rate": 3e-06, "loss": 33.9855, "step": 1551 }, { "epoch": 0.1384539899192649, "grad_norm": 304.2209167480469, "learning_rate": 3e-06, "loss": 74.906, "step": 1552 }, { "epoch": 0.138543199964316, "grad_norm": 268.453369140625, "learning_rate": 3e-06, "loss": 40.646, "step": 1553 }, { "epoch": 0.13863241000936705, "grad_norm": 386.8885498046875, "learning_rate": 3e-06, "loss": 9.6944, "step": 1554 }, { "epoch": 0.13872162005441813, "grad_norm": 319.0711364746094, "learning_rate": 3e-06, "loss": 41.8327, "step": 1555 }, { "epoch": 0.1388108300994692, "grad_norm": 390.6893615722656, "learning_rate": 3e-06, "loss": 42.7597, "step": 1556 }, { "epoch": 0.13890004014452026, "grad_norm": 446.2587890625, "learning_rate": 3e-06, "loss": 28.6514, "step": 1557 }, { "epoch": 0.13898925018957134, "grad_norm": 413.8212585449219, "learning_rate": 3e-06, "loss": 69.6053, "step": 1558 }, { "epoch": 0.13907846023462242, "grad_norm": 313.9552001953125, "learning_rate": 3e-06, "loss": 34.7553, "step": 1559 }, { "epoch": 0.1391676702796735, "grad_norm": 311.7652587890625, "learning_rate": 3e-06, "loss": 8.1029, "step": 1560 }, { "completion_length": 125.66667175292969, "epoch": 0.13925688032472455, "grad_norm": 1157.206787109375, "learning_rate": 3e-06, "loss": -11.8294, "reward": 2.3268543481826782, "reward_std": 0.42937734723091125, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1601875051856041, "step": 1561, "zero_std_ratio": 0.0 }, { "epoch": 0.13934609036977563, "grad_norm": 2671.814453125, "learning_rate": 3e-06, "loss": 55.7136, "step": 1562 }, { "epoch": 0.1394353004148267, "grad_norm": 1623.3624267578125, "learning_rate": 3e-06, "loss": -81.8166, "step": 1563 }, { "epoch": 0.1395245104598778, "grad_norm": 931.291748046875, "learning_rate": 3e-06, "loss": -37.318, "step": 1564 }, { "epoch": 0.13961372050492885, "grad_norm": 1049.9766845703125, "learning_rate": 3e-06, "loss": -43.7951, "step": 1565 }, { "epoch": 0.13970293054997993, "grad_norm": 1135.0767822265625, "learning_rate": 3e-06, "loss": -29.7987, "step": 1566 }, { "epoch": 0.139792140595031, "grad_norm": 1037.486083984375, "learning_rate": 3e-06, "loss": -17.2992, "step": 1567 }, { "epoch": 0.13988135064008209, "grad_norm": 2672.843017578125, "learning_rate": 3e-06, "loss": 22.6063, "step": 1568 }, { "epoch": 0.13997056068513314, "grad_norm": 1275.9603271484375, "learning_rate": 3e-06, "loss": -81.5765, "step": 1569 }, { "epoch": 0.14005977073018422, "grad_norm": 1064.6248779296875, "learning_rate": 3e-06, "loss": -37.3489, "step": 1570 }, { "epoch": 0.1401489807752353, "grad_norm": 1112.7813720703125, "learning_rate": 3e-06, "loss": -46.93, "step": 1571 }, { "epoch": 0.14023819082028635, "grad_norm": 986.8911743164062, "learning_rate": 3e-06, "loss": -37.3976, "step": 1572 }, { "completion_length": 108.14583587646484, "epoch": 0.14032740086533743, "grad_norm": 828.357421875, "learning_rate": 3e-06, "loss": 38.523, "reward": 2.4648125171661377, "reward_std": 0.5029634684324265, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2252291589975357, "step": 1573, "zero_std_ratio": 0.0 }, { "epoch": 0.1404166109103885, "grad_norm": 874.63916015625, "learning_rate": 3e-06, "loss": 17.8019, "step": 1574 }, { "epoch": 0.1405058209554396, "grad_norm": 922.2013549804688, "learning_rate": 3e-06, "loss": 5.0649, "step": 1575 }, { "epoch": 0.14059503100049064, "grad_norm": 740.4825439453125, "learning_rate": 3e-06, "loss": -33.9352, "step": 1576 }, { "epoch": 0.14068424104554172, "grad_norm": 935.9121704101562, "learning_rate": 3e-06, "loss": 22.481, "step": 1577 }, { "epoch": 0.1407734510905928, "grad_norm": 982.98876953125, "learning_rate": 3e-06, "loss": 2.7561, "step": 1578 }, { "epoch": 0.14086266113564389, "grad_norm": 804.1883544921875, "learning_rate": 3e-06, "loss": 32.3521, "step": 1579 }, { "epoch": 0.14095187118069494, "grad_norm": 822.947265625, "learning_rate": 3e-06, "loss": -1.7136, "step": 1580 }, { "epoch": 0.14104108122574602, "grad_norm": 923.2521362304688, "learning_rate": 3e-06, "loss": -5.8192, "step": 1581 }, { "epoch": 0.1411302912707971, "grad_norm": 978.51708984375, "learning_rate": 3e-06, "loss": -40.5703, "step": 1582 }, { "epoch": 0.14121950131584818, "grad_norm": 856.7340698242188, "learning_rate": 3e-06, "loss": 10.946, "step": 1583 }, { "epoch": 0.14130871136089923, "grad_norm": 784.1134643554688, "learning_rate": 3e-06, "loss": -15.1785, "step": 1584 }, { "completion_length": 103.85416793823242, "epoch": 0.1413979214059503, "grad_norm": 757.7247924804688, "learning_rate": 3e-06, "loss": -115.5896, "reward": 2.35756254196167, "reward_std": 0.28461553901433945, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22214583307504654, "step": 1585, "zero_std_ratio": 0.0 }, { "epoch": 0.1414871314510014, "grad_norm": 904.9032592773438, "learning_rate": 3e-06, "loss": -102.5051, "step": 1586 }, { "epoch": 0.14157634149605244, "grad_norm": 940.2787475585938, "learning_rate": 3e-06, "loss": -99.6384, "step": 1587 }, { "epoch": 0.14166555154110352, "grad_norm": 1431.8402099609375, "learning_rate": 3e-06, "loss": -109.1234, "step": 1588 }, { "epoch": 0.1417547615861546, "grad_norm": 1079.49072265625, "learning_rate": 3e-06, "loss": -93.5136, "step": 1589 }, { "epoch": 0.14184397163120568, "grad_norm": 957.7713623046875, "learning_rate": 3e-06, "loss": -114.6748, "step": 1590 }, { "epoch": 0.14193318167625674, "grad_norm": 796.9354858398438, "learning_rate": 3e-06, "loss": -118.4429, "step": 1591 }, { "epoch": 0.14202239172130782, "grad_norm": 874.4432983398438, "learning_rate": 3e-06, "loss": -114.6303, "step": 1592 }, { "epoch": 0.1421116017663589, "grad_norm": 1131.59521484375, "learning_rate": 3e-06, "loss": -116.7429, "step": 1593 }, { "epoch": 0.14220081181140998, "grad_norm": 1635.96044921875, "learning_rate": 3e-06, "loss": -132.6347, "step": 1594 }, { "epoch": 0.14229002185646103, "grad_norm": 1009.574951171875, "learning_rate": 3e-06, "loss": -113.8876, "step": 1595 }, { "epoch": 0.1423792319015121, "grad_norm": 1217.6295166015625, "learning_rate": 3e-06, "loss": -117.312, "step": 1596 }, { "completion_length": 113.77083587646484, "epoch": 0.1424684419465632, "grad_norm": 306.18682861328125, "learning_rate": 3e-06, "loss": 10.6702, "reward": 2.6016459465026855, "reward_std": 0.2558625042438507, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1849791705608368, "step": 1597, "zero_std_ratio": 0.0 }, { "epoch": 0.14255765199161424, "grad_norm": 373.6190490722656, "learning_rate": 3e-06, "loss": 13.2399, "step": 1598 }, { "epoch": 0.14264686203666532, "grad_norm": 604.7830200195312, "learning_rate": 3e-06, "loss": 1.0527, "step": 1599 }, { "epoch": 0.1427360720817164, "grad_norm": 464.7652893066406, "learning_rate": 3e-06, "loss": 7.1036, "step": 1600 }, { "epoch": 0.14282528212676748, "grad_norm": 711.72607421875, "learning_rate": 3e-06, "loss": -19.469, "step": 1601 }, { "epoch": 0.14291449217181854, "grad_norm": 902.3880004882812, "learning_rate": 3e-06, "loss": -15.7111, "step": 1602 }, { "epoch": 0.14300370221686962, "grad_norm": 393.73492431640625, "learning_rate": 3e-06, "loss": 9.011, "step": 1603 }, { "epoch": 0.1430929122619207, "grad_norm": 426.3011779785156, "learning_rate": 3e-06, "loss": 9.2521, "step": 1604 }, { "epoch": 0.14318212230697178, "grad_norm": 622.9773559570312, "learning_rate": 3e-06, "loss": -0.7366, "step": 1605 }, { "epoch": 0.14327133235202283, "grad_norm": 402.3107604980469, "learning_rate": 3e-06, "loss": 4.3138, "step": 1606 }, { "epoch": 0.1433605423970739, "grad_norm": 829.762451171875, "learning_rate": 3e-06, "loss": -23.0001, "step": 1607 }, { "epoch": 0.143449752442125, "grad_norm": 1001.1775512695312, "learning_rate": 3e-06, "loss": -21.5458, "step": 1608 }, { "completion_length": 118.16667175292969, "epoch": 0.14353896248717607, "grad_norm": 1849.2244873046875, "learning_rate": 3e-06, "loss": -563.4475, "reward": 2.283812642097473, "reward_std": 0.7450865209102631, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2108958289027214, "step": 1609, "zero_std_ratio": 0.0 }, { "epoch": 0.14362817253222712, "grad_norm": 1622.8909912109375, "learning_rate": 3e-06, "loss": -383.3872, "step": 1610 }, { "epoch": 0.1437173825772782, "grad_norm": 1626.39599609375, "learning_rate": 3e-06, "loss": -388.6346, "step": 1611 }, { "epoch": 0.14380659262232928, "grad_norm": 1461.013427734375, "learning_rate": 3e-06, "loss": -516.7939, "step": 1612 }, { "epoch": 0.14389580266738033, "grad_norm": 1593.8126220703125, "learning_rate": 3e-06, "loss": -459.248, "step": 1613 }, { "epoch": 0.14398501271243141, "grad_norm": 1501.4661865234375, "learning_rate": 3e-06, "loss": -361.4013, "step": 1614 }, { "epoch": 0.1440742227574825, "grad_norm": 1959.175537109375, "learning_rate": 3e-06, "loss": -599.3701, "step": 1615 }, { "epoch": 0.14416343280253358, "grad_norm": 1524.2763671875, "learning_rate": 3e-06, "loss": -406.2586, "step": 1616 }, { "epoch": 0.14425264284758463, "grad_norm": 2186.95947265625, "learning_rate": 3e-06, "loss": -424.5395, "step": 1617 }, { "epoch": 0.1443418528926357, "grad_norm": 1751.9039306640625, "learning_rate": 3e-06, "loss": -537.6578, "step": 1618 }, { "epoch": 0.1444310629376868, "grad_norm": 1491.2742919921875, "learning_rate": 3e-06, "loss": -481.118, "step": 1619 }, { "epoch": 0.14452027298273787, "grad_norm": 1423.106689453125, "learning_rate": 3e-06, "loss": -393.4251, "step": 1620 }, { "completion_length": 123.56250381469727, "epoch": 0.14460948302778892, "grad_norm": 1347.345703125, "learning_rate": 3e-06, "loss": -484.3067, "reward": 2.2743124961853027, "reward_std": 0.4947032183408737, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19097916781902313, "step": 1621, "zero_std_ratio": 0.0 }, { "epoch": 0.14469869307284, "grad_norm": 1622.2376708984375, "learning_rate": 3e-06, "loss": -543.3698, "step": 1622 }, { "epoch": 0.14478790311789108, "grad_norm": 1387.509521484375, "learning_rate": 3e-06, "loss": -551.8273, "step": 1623 }, { "epoch": 0.14487711316294213, "grad_norm": 1271.8485107421875, "learning_rate": 3e-06, "loss": -501.6958, "step": 1624 }, { "epoch": 0.14496632320799321, "grad_norm": 1232.3084716796875, "learning_rate": 3e-06, "loss": -512.8577, "step": 1625 }, { "epoch": 0.1450555332530443, "grad_norm": 1346.7532958984375, "learning_rate": 3e-06, "loss": -458.5431, "step": 1626 }, { "epoch": 0.14514474329809537, "grad_norm": 1523.1427001953125, "learning_rate": 3e-06, "loss": -518.0624, "step": 1627 }, { "epoch": 0.14523395334314643, "grad_norm": 1862.5703125, "learning_rate": 3e-06, "loss": -572.3401, "step": 1628 }, { "epoch": 0.1453231633881975, "grad_norm": 1202.2999267578125, "learning_rate": 3e-06, "loss": -586.3882, "step": 1629 }, { "epoch": 0.1454123734332486, "grad_norm": 1310.3470458984375, "learning_rate": 3e-06, "loss": -542.1135, "step": 1630 }, { "epoch": 0.14550158347829967, "grad_norm": 1226.2174072265625, "learning_rate": 3e-06, "loss": -556.5128, "step": 1631 }, { "epoch": 0.14559079352335072, "grad_norm": 1245.717041015625, "learning_rate": 3e-06, "loss": -500.3018, "step": 1632 }, { "completion_length": 161.08333587646484, "epoch": 0.1456800035684018, "grad_norm": 1047.9556884765625, "learning_rate": 3e-06, "loss": 62.7677, "reward": 1.9896875023841858, "reward_std": 0.36980947852134705, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08343750238418579, "step": 1633, "zero_std_ratio": 0.0 }, { "epoch": 0.14576921361345288, "grad_norm": 1155.3780517578125, "learning_rate": 3e-06, "loss": 45.3963, "step": 1634 }, { "epoch": 0.14585842365850396, "grad_norm": 1296.7313232421875, "learning_rate": 3e-06, "loss": 47.438, "step": 1635 }, { "epoch": 0.145947633703555, "grad_norm": 1112.166015625, "learning_rate": 3e-06, "loss": -102.1916, "step": 1636 }, { "epoch": 0.1460368437486061, "grad_norm": 979.8106689453125, "learning_rate": 3e-06, "loss": 49.4149, "step": 1637 }, { "epoch": 0.14612605379365717, "grad_norm": 1428.285888671875, "learning_rate": 3e-06, "loss": -44.5905, "step": 1638 }, { "epoch": 0.14621526383870823, "grad_norm": 1178.86572265625, "learning_rate": 3e-06, "loss": 48.5944, "step": 1639 }, { "epoch": 0.1463044738837593, "grad_norm": 1211.369384765625, "learning_rate": 3e-06, "loss": 43.942, "step": 1640 }, { "epoch": 0.1463936839288104, "grad_norm": 1153.5445556640625, "learning_rate": 3e-06, "loss": 34.518, "step": 1641 }, { "epoch": 0.14648289397386147, "grad_norm": 1077.5166015625, "learning_rate": 3e-06, "loss": -116.0781, "step": 1642 }, { "epoch": 0.14657210401891252, "grad_norm": 1041.2066650390625, "learning_rate": 3e-06, "loss": 40.4303, "step": 1643 }, { "epoch": 0.1466613140639636, "grad_norm": 1450.99609375, "learning_rate": 3e-06, "loss": -44.6822, "step": 1644 }, { "completion_length": 127.64583969116211, "epoch": 0.14675052410901468, "grad_norm": 2347.143310546875, "learning_rate": 3e-06, "loss": 384.8905, "reward": 1.9467709064483643, "reward_std": 0.565439760684967, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15510417520999908, "step": 1645, "zero_std_ratio": 0.0 }, { "epoch": 0.14683973415406576, "grad_norm": 1446.45458984375, "learning_rate": 3e-06, "loss": 170.8209, "step": 1646 }, { "epoch": 0.1469289441991168, "grad_norm": 1259.9295654296875, "learning_rate": 3e-06, "loss": 182.3417, "step": 1647 }, { "epoch": 0.1470181542441679, "grad_norm": 1511.966796875, "learning_rate": 3e-06, "loss": 90.1429, "step": 1648 }, { "epoch": 0.14710736428921897, "grad_norm": 1601.6256103515625, "learning_rate": 3e-06, "loss": 212.7898, "step": 1649 }, { "epoch": 0.14719657433427003, "grad_norm": 1477.3070068359375, "learning_rate": 3e-06, "loss": 107.817, "step": 1650 }, { "epoch": 0.1472857843793211, "grad_norm": 4009.107666015625, "learning_rate": 3e-06, "loss": 349.6491, "step": 1651 }, { "epoch": 0.14737499442437219, "grad_norm": 1204.2269287109375, "learning_rate": 3e-06, "loss": 160.3832, "step": 1652 }, { "epoch": 0.14746420446942327, "grad_norm": 1213.4332275390625, "learning_rate": 3e-06, "loss": 168.7216, "step": 1653 }, { "epoch": 0.14755341451447432, "grad_norm": 1349.8916015625, "learning_rate": 3e-06, "loss": 70.4885, "step": 1654 }, { "epoch": 0.1476426245595254, "grad_norm": 1503.792236328125, "learning_rate": 3e-06, "loss": 177.7434, "step": 1655 }, { "epoch": 0.14773183460457648, "grad_norm": 1194.224365234375, "learning_rate": 3e-06, "loss": 81.2416, "step": 1656 }, { "completion_length": 123.64583587646484, "epoch": 0.14782104464962756, "grad_norm": 1002.905517578125, "learning_rate": 3e-06, "loss": -0.0244, "reward": 2.277187466621399, "reward_std": 0.6942009925842285, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19385415315628052, "step": 1657, "zero_std_ratio": 0.0 }, { "epoch": 0.1479102546946786, "grad_norm": 1149.109619140625, "learning_rate": 3e-06, "loss": 41.6179, "step": 1658 }, { "epoch": 0.1479994647397297, "grad_norm": 1113.2333984375, "learning_rate": 3e-06, "loss": 78.8026, "step": 1659 }, { "epoch": 0.14808867478478077, "grad_norm": 1029.0260009765625, "learning_rate": 3e-06, "loss": 47.1482, "step": 1660 }, { "epoch": 0.14817788482983185, "grad_norm": 1232.3424072265625, "learning_rate": 3e-06, "loss": -10.8712, "step": 1661 }, { "epoch": 0.1482670948748829, "grad_norm": 1049.901123046875, "learning_rate": 3e-06, "loss": 100.9177, "step": 1662 }, { "epoch": 0.14835630491993398, "grad_norm": 904.3103637695312, "learning_rate": 3e-06, "loss": -14.9282, "step": 1663 }, { "epoch": 0.14844551496498506, "grad_norm": 1009.290771484375, "learning_rate": 3e-06, "loss": 28.7193, "step": 1664 }, { "epoch": 0.14853472501003612, "grad_norm": 1085.2960205078125, "learning_rate": 3e-06, "loss": 64.3064, "step": 1665 }, { "epoch": 0.1486239350550872, "grad_norm": 885.9617919921875, "learning_rate": 3e-06, "loss": 34.0377, "step": 1666 }, { "epoch": 0.14871314510013828, "grad_norm": 1138.4622802734375, "learning_rate": 3e-06, "loss": -29.1467, "step": 1667 }, { "epoch": 0.14880235514518936, "grad_norm": 1003.6993408203125, "learning_rate": 3e-06, "loss": 87.8599, "step": 1668 }, { "completion_length": 124.60417175292969, "epoch": 0.1488915651902404, "grad_norm": 148.32191467285156, "learning_rate": 3e-06, "loss": -30.0848, "reward": 2.4312500953674316, "reward_std": 0.03657746687531471, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18124999850988388, "step": 1669, "zero_std_ratio": 0.0 }, { "epoch": 0.1489807752352915, "grad_norm": 148.70933532714844, "learning_rate": 3e-06, "loss": -23.8765, "step": 1670 }, { "epoch": 0.14906998528034257, "grad_norm": 151.1920623779297, "learning_rate": 3e-06, "loss": -26.6285, "step": 1671 }, { "epoch": 0.14915919532539365, "grad_norm": 136.03228759765625, "learning_rate": 3e-06, "loss": -21.6761, "step": 1672 }, { "epoch": 0.1492484053704447, "grad_norm": 177.11851501464844, "learning_rate": 3e-06, "loss": -24.8027, "step": 1673 }, { "epoch": 0.14933761541549578, "grad_norm": 122.48726654052734, "learning_rate": 3e-06, "loss": -24.5533, "step": 1674 }, { "epoch": 0.14942682546054686, "grad_norm": 145.15347290039062, "learning_rate": 3e-06, "loss": -32.7347, "step": 1675 }, { "epoch": 0.14951603550559794, "grad_norm": 147.5530242919922, "learning_rate": 3e-06, "loss": -26.6998, "step": 1676 }, { "epoch": 0.149605245550649, "grad_norm": 186.55825805664062, "learning_rate": 3e-06, "loss": -29.4036, "step": 1677 }, { "epoch": 0.14969445559570008, "grad_norm": 165.18231201171875, "learning_rate": 3e-06, "loss": -26.6914, "step": 1678 }, { "epoch": 0.14978366564075116, "grad_norm": 170.1136016845703, "learning_rate": 3e-06, "loss": -30.6937, "step": 1679 }, { "epoch": 0.1498728756858022, "grad_norm": 131.10781860351562, "learning_rate": 3e-06, "loss": -29.9625, "step": 1680 }, { "completion_length": 149.37500762939453, "epoch": 0.1499620857308533, "grad_norm": 796.8214111328125, "learning_rate": 3e-06, "loss": -24.992, "reward": 2.1661041378974915, "reward_std": 0.49460718035697937, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08277083188295364, "step": 1681, "zero_std_ratio": 0.125 }, { "epoch": 0.15005129577590437, "grad_norm": 995.2013549804688, "learning_rate": 3e-06, "loss": -59.4395, "step": 1682 }, { "epoch": 0.15014050582095545, "grad_norm": 1657.536376953125, "learning_rate": 3e-06, "loss": -37.3866, "step": 1683 }, { "epoch": 0.1502297158660065, "grad_norm": 1062.8961181640625, "learning_rate": 3e-06, "loss": -45.4256, "step": 1684 }, { "epoch": 0.15031892591105758, "grad_norm": 1196.541259765625, "learning_rate": 3e-06, "loss": -21.982, "step": 1685 }, { "epoch": 0.15040813595610866, "grad_norm": 1009.1981811523438, "learning_rate": 3e-06, "loss": 1.7535, "step": 1686 }, { "epoch": 0.15049734600115974, "grad_norm": 897.9784545898438, "learning_rate": 3e-06, "loss": -18.5637, "step": 1687 }, { "epoch": 0.1505865560462108, "grad_norm": 1156.031494140625, "learning_rate": 3e-06, "loss": -74.5365, "step": 1688 }, { "epoch": 0.15067576609126188, "grad_norm": 962.795166015625, "learning_rate": 3e-06, "loss": -42.1126, "step": 1689 }, { "epoch": 0.15076497613631296, "grad_norm": 1172.703125, "learning_rate": 3e-06, "loss": -52.0831, "step": 1690 }, { "epoch": 0.150854186181364, "grad_norm": 1299.485595703125, "learning_rate": 3e-06, "loss": -18.9543, "step": 1691 }, { "epoch": 0.1509433962264151, "grad_norm": 1174.2447509765625, "learning_rate": 3e-06, "loss": -0.5707, "step": 1692 }, { "completion_length": 115.62500381469727, "epoch": 0.15103260627146617, "grad_norm": 932.6162109375, "learning_rate": 3e-06, "loss": -99.4483, "reward": 2.3194793462753296, "reward_std": 0.6202397346496582, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17364583909511566, "step": 1693, "zero_std_ratio": 0.0 }, { "epoch": 0.15112181631651725, "grad_norm": 1099.0394287109375, "learning_rate": 3e-06, "loss": -107.816, "step": 1694 }, { "epoch": 0.1512110263615683, "grad_norm": 919.58740234375, "learning_rate": 3e-06, "loss": -80.3083, "step": 1695 }, { "epoch": 0.15130023640661938, "grad_norm": 1122.83349609375, "learning_rate": 3e-06, "loss": -19.2087, "step": 1696 }, { "epoch": 0.15138944645167046, "grad_norm": 1361.2589111328125, "learning_rate": 3e-06, "loss": -164.0316, "step": 1697 }, { "epoch": 0.15147865649672154, "grad_norm": 1414.1632080078125, "learning_rate": 3e-06, "loss": -114.2859, "step": 1698 }, { "epoch": 0.1515678665417726, "grad_norm": 968.30126953125, "learning_rate": 3e-06, "loss": -110.0306, "step": 1699 }, { "epoch": 0.15165707658682367, "grad_norm": 1144.3707275390625, "learning_rate": 3e-06, "loss": -118.6632, "step": 1700 }, { "epoch": 0.15174628663187475, "grad_norm": 971.5307006835938, "learning_rate": 3e-06, "loss": -92.3818, "step": 1701 }, { "epoch": 0.15183549667692584, "grad_norm": 917.9314575195312, "learning_rate": 3e-06, "loss": -20.9716, "step": 1702 }, { "epoch": 0.1519247067219769, "grad_norm": 1136.1475830078125, "learning_rate": 3e-06, "loss": -178.9898, "step": 1703 }, { "epoch": 0.15201391676702797, "grad_norm": 1357.263916015625, "learning_rate": 3e-06, "loss": -127.0251, "step": 1704 }, { "completion_length": 123.16667175292969, "epoch": 0.15210312681207905, "grad_norm": 881.5326538085938, "learning_rate": 3e-06, "loss": 93.0848, "reward": 2.370583415031433, "reward_std": 0.38110819458961487, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16224999725818634, "step": 1705, "zero_std_ratio": 0.125 }, { "epoch": 0.1521923368571301, "grad_norm": 744.7655029296875, "learning_rate": 3e-06, "loss": 42.1843, "step": 1706 }, { "epoch": 0.15228154690218118, "grad_norm": 918.53125, "learning_rate": 3e-06, "loss": 65.0944, "step": 1707 }, { "epoch": 0.15237075694723226, "grad_norm": 670.804443359375, "learning_rate": 3e-06, "loss": 58.4442, "step": 1708 }, { "epoch": 0.15245996699228334, "grad_norm": 811.63134765625, "learning_rate": 3e-06, "loss": 75.2296, "step": 1709 }, { "epoch": 0.1525491770373344, "grad_norm": 890.50537109375, "learning_rate": 3e-06, "loss": 76.3318, "step": 1710 }, { "epoch": 0.15263838708238547, "grad_norm": 802.3552856445312, "learning_rate": 3e-06, "loss": 74.1002, "step": 1711 }, { "epoch": 0.15272759712743655, "grad_norm": 622.7321166992188, "learning_rate": 3e-06, "loss": 32.5934, "step": 1712 }, { "epoch": 0.15281680717248763, "grad_norm": 636.275390625, "learning_rate": 3e-06, "loss": 44.7401, "step": 1713 }, { "epoch": 0.1529060172175387, "grad_norm": 480.5521240234375, "learning_rate": 3e-06, "loss": 44.6325, "step": 1714 }, { "epoch": 0.15299522726258977, "grad_norm": 526.2603759765625, "learning_rate": 3e-06, "loss": 49.214, "step": 1715 }, { "epoch": 0.15308443730764085, "grad_norm": 582.4856567382812, "learning_rate": 3e-06, "loss": 46.1537, "step": 1716 }, { "completion_length": 133.3333396911621, "epoch": 0.1531736473526919, "grad_norm": 535.0222778320312, "learning_rate": 3e-06, "loss": -6.0903, "reward": 2.4653126001358032, "reward_std": 0.3633167892694473, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14239583536982536, "step": 1717, "zero_std_ratio": 0.0 }, { "epoch": 0.15326285739774298, "grad_norm": 436.4328308105469, "learning_rate": 3e-06, "loss": -2.8542, "step": 1718 }, { "epoch": 0.15335206744279406, "grad_norm": 1064.8994140625, "learning_rate": 3e-06, "loss": -67.8062, "step": 1719 }, { "epoch": 0.15344127748784514, "grad_norm": 577.8828735351562, "learning_rate": 3e-06, "loss": 43.8438, "step": 1720 }, { "epoch": 0.1535304875328962, "grad_norm": 503.5760803222656, "learning_rate": 3e-06, "loss": 6.5897, "step": 1721 }, { "epoch": 0.15361969757794727, "grad_norm": 328.094970703125, "learning_rate": 3e-06, "loss": 12.6563, "step": 1722 }, { "epoch": 0.15370890762299835, "grad_norm": 471.933349609375, "learning_rate": 3e-06, "loss": -12.6933, "step": 1723 }, { "epoch": 0.15379811766804943, "grad_norm": 385.73516845703125, "learning_rate": 3e-06, "loss": -1.7539, "step": 1724 }, { "epoch": 0.15388732771310049, "grad_norm": 986.2044067382812, "learning_rate": 3e-06, "loss": -54.8942, "step": 1725 }, { "epoch": 0.15397653775815157, "grad_norm": 443.2298583984375, "learning_rate": 3e-06, "loss": 31.631, "step": 1726 }, { "epoch": 0.15406574780320265, "grad_norm": 531.0643920898438, "learning_rate": 3e-06, "loss": 1.8442, "step": 1727 }, { "epoch": 0.15415495784825373, "grad_norm": 283.9168701171875, "learning_rate": 3e-06, "loss": 12.0514, "step": 1728 }, { "completion_length": 146.81250762939453, "epoch": 0.15424416789330478, "grad_norm": 249.7176971435547, "learning_rate": 3e-06, "loss": 8.7164, "reward": 2.526937484741211, "reward_std": 0.22450600564479828, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11027083173394203, "step": 1729, "zero_std_ratio": 0.0 }, { "epoch": 0.15433337793835586, "grad_norm": 415.48431396484375, "learning_rate": 3e-06, "loss": 37.7652, "step": 1730 }, { "epoch": 0.15442258798340694, "grad_norm": 312.5050354003906, "learning_rate": 3e-06, "loss": 13.5347, "step": 1731 }, { "epoch": 0.154511798028458, "grad_norm": 272.3165588378906, "learning_rate": 3e-06, "loss": 17.2606, "step": 1732 }, { "epoch": 0.15460100807350907, "grad_norm": 345.10382080078125, "learning_rate": 3e-06, "loss": 4.1699, "step": 1733 }, { "epoch": 0.15469021811856015, "grad_norm": 221.7129364013672, "learning_rate": 3e-06, "loss": 9.5225, "step": 1734 }, { "epoch": 0.15477942816361123, "grad_norm": 285.18927001953125, "learning_rate": 3e-06, "loss": 5.8041, "step": 1735 }, { "epoch": 0.15486863820866228, "grad_norm": 374.7469482421875, "learning_rate": 3e-06, "loss": 29.7232, "step": 1736 }, { "epoch": 0.15495784825371337, "grad_norm": 265.818603515625, "learning_rate": 3e-06, "loss": 8.8928, "step": 1737 }, { "epoch": 0.15504705829876445, "grad_norm": 227.65782165527344, "learning_rate": 3e-06, "loss": 12.3003, "step": 1738 }, { "epoch": 0.15513626834381553, "grad_norm": 232.1507110595703, "learning_rate": 3e-06, "loss": 3.8034, "step": 1739 }, { "epoch": 0.15522547838886658, "grad_norm": 177.14718627929688, "learning_rate": 3e-06, "loss": 5.6435, "step": 1740 }, { "completion_length": 162.33333587646484, "epoch": 0.15531468843391766, "grad_norm": 994.4199829101562, "learning_rate": 3e-06, "loss": 58.5382, "reward": 1.847833514213562, "reward_std": 0.5970688164234161, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06658333353698254, "step": 1741, "zero_std_ratio": 0.0 }, { "epoch": 0.15540389847896874, "grad_norm": 1349.40283203125, "learning_rate": 3e-06, "loss": 54.8541, "step": 1742 }, { "epoch": 0.1554931085240198, "grad_norm": 1288.0029296875, "learning_rate": 3e-06, "loss": 70.4871, "step": 1743 }, { "epoch": 0.15558231856907087, "grad_norm": 1065.3143310546875, "learning_rate": 3e-06, "loss": 80.8355, "step": 1744 }, { "epoch": 0.15567152861412195, "grad_norm": 1030.62939453125, "learning_rate": 3e-06, "loss": 58.8967, "step": 1745 }, { "epoch": 0.15576073865917303, "grad_norm": 676.4586791992188, "learning_rate": 3e-06, "loss": 32.0503, "step": 1746 }, { "epoch": 0.15584994870422408, "grad_norm": 754.9002685546875, "learning_rate": 3e-06, "loss": 46.5822, "step": 1747 }, { "epoch": 0.15593915874927516, "grad_norm": 771.5171508789062, "learning_rate": 3e-06, "loss": 53.0914, "step": 1748 }, { "epoch": 0.15602836879432624, "grad_norm": 624.7284545898438, "learning_rate": 3e-06, "loss": 54.3331, "step": 1749 }, { "epoch": 0.15611757883937732, "grad_norm": 721.7028198242188, "learning_rate": 3e-06, "loss": 50.9435, "step": 1750 }, { "epoch": 0.15620678888442838, "grad_norm": 981.5281982421875, "learning_rate": 3e-06, "loss": 41.9982, "step": 1751 }, { "epoch": 0.15629599892947946, "grad_norm": 476.7873840332031, "learning_rate": 3e-06, "loss": 23.7931, "step": 1752 }, { "completion_length": 157.45833587646484, "epoch": 0.15638520897453054, "grad_norm": 1073.9844970703125, "learning_rate": 3e-06, "loss": -17.6326, "reward": 1.8527084589004517, "reward_std": 0.6355842351913452, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10270833969116211, "step": 1753, "zero_std_ratio": 0.0 }, { "epoch": 0.15647441901958162, "grad_norm": 1561.85595703125, "learning_rate": 3e-06, "loss": -52.6263, "step": 1754 }, { "epoch": 0.15656362906463267, "grad_norm": 979.494140625, "learning_rate": 3e-06, "loss": -94.7826, "step": 1755 }, { "epoch": 0.15665283910968375, "grad_norm": 868.07177734375, "learning_rate": 3e-06, "loss": -37.3175, "step": 1756 }, { "epoch": 0.15674204915473483, "grad_norm": 891.9551391601562, "learning_rate": 3e-06, "loss": -5.4981, "step": 1757 }, { "epoch": 0.15683125919978588, "grad_norm": 816.58740234375, "learning_rate": 3e-06, "loss": 12.5619, "step": 1758 }, { "epoch": 0.15692046924483696, "grad_norm": 836.6050415039062, "learning_rate": 3e-06, "loss": -22.0006, "step": 1759 }, { "epoch": 0.15700967928988804, "grad_norm": 1234.0372314453125, "learning_rate": 3e-06, "loss": -45.7349, "step": 1760 }, { "epoch": 0.15709888933493912, "grad_norm": 931.4359130859375, "learning_rate": 3e-06, "loss": -87.1049, "step": 1761 }, { "epoch": 0.15718809937999018, "grad_norm": 1074.96435546875, "learning_rate": 3e-06, "loss": -39.8578, "step": 1762 }, { "epoch": 0.15727730942504126, "grad_norm": 807.4719848632812, "learning_rate": 3e-06, "loss": -15.7909, "step": 1763 }, { "epoch": 0.15736651947009234, "grad_norm": 701.7471313476562, "learning_rate": 3e-06, "loss": 2.0572, "step": 1764 }, { "completion_length": 156.6041717529297, "epoch": 0.15745572951514342, "grad_norm": 802.6090087890625, "learning_rate": 3e-06, "loss": -34.1906, "reward": 2.0988959670066833, "reward_std": 0.31426893174648285, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0780624970793724, "step": 1765, "zero_std_ratio": 0.0 }, { "epoch": 0.15754493956019447, "grad_norm": 714.1509399414062, "learning_rate": 3e-06, "loss": -53.6734, "step": 1766 }, { "epoch": 0.15763414960524555, "grad_norm": 691.9019165039062, "learning_rate": 3e-06, "loss": -43.5893, "step": 1767 }, { "epoch": 0.15772335965029663, "grad_norm": 890.5856323242188, "learning_rate": 3e-06, "loss": 10.3226, "step": 1768 }, { "epoch": 0.1578125696953477, "grad_norm": 834.9835205078125, "learning_rate": 3e-06, "loss": 0.0275, "step": 1769 }, { "epoch": 0.15790177974039876, "grad_norm": 480.59625244140625, "learning_rate": 3e-06, "loss": -26.698, "step": 1770 }, { "epoch": 0.15799098978544984, "grad_norm": 842.9605102539062, "learning_rate": 3e-06, "loss": -37.832, "step": 1771 }, { "epoch": 0.15808019983050092, "grad_norm": 766.251220703125, "learning_rate": 3e-06, "loss": -56.4611, "step": 1772 }, { "epoch": 0.15816940987555198, "grad_norm": 809.439453125, "learning_rate": 3e-06, "loss": -51.1308, "step": 1773 }, { "epoch": 0.15825861992060306, "grad_norm": 959.1751098632812, "learning_rate": 3e-06, "loss": 11.5874, "step": 1774 }, { "epoch": 0.15834782996565414, "grad_norm": 868.1351318359375, "learning_rate": 3e-06, "loss": -4.9394, "step": 1775 }, { "epoch": 0.15843704001070522, "grad_norm": 572.6174926757812, "learning_rate": 3e-06, "loss": -29.5914, "step": 1776 }, { "completion_length": 121.64583587646484, "epoch": 0.15852625005575627, "grad_norm": 828.8180541992188, "learning_rate": 3e-06, "loss": 69.2331, "reward": 2.4088125228881836, "reward_std": 0.2123552095144987, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1588124930858612, "step": 1777, "zero_std_ratio": 0.0 }, { "epoch": 0.15861546010080735, "grad_norm": 726.0809326171875, "learning_rate": 3e-06, "loss": 61.3269, "step": 1778 }, { "epoch": 0.15870467014585843, "grad_norm": 640.3132934570312, "learning_rate": 3e-06, "loss": 62.9404, "step": 1779 }, { "epoch": 0.1587938801909095, "grad_norm": 819.57958984375, "learning_rate": 3e-06, "loss": 75.4607, "step": 1780 }, { "epoch": 0.15888309023596056, "grad_norm": 799.89453125, "learning_rate": 3e-06, "loss": 72.5996, "step": 1781 }, { "epoch": 0.15897230028101164, "grad_norm": 847.7570190429688, "learning_rate": 3e-06, "loss": 96.3295, "step": 1782 }, { "epoch": 0.15906151032606272, "grad_norm": 610.136474609375, "learning_rate": 3e-06, "loss": 52.1926, "step": 1783 }, { "epoch": 0.15915072037111377, "grad_norm": 457.82208251953125, "learning_rate": 3e-06, "loss": 46.0544, "step": 1784 }, { "epoch": 0.15923993041616485, "grad_norm": 419.1843566894531, "learning_rate": 3e-06, "loss": 43.62, "step": 1785 }, { "epoch": 0.15932914046121593, "grad_norm": 517.3756103515625, "learning_rate": 3e-06, "loss": 47.5319, "step": 1786 }, { "epoch": 0.15941835050626701, "grad_norm": 427.8025207519531, "learning_rate": 3e-06, "loss": 43.6472, "step": 1787 }, { "epoch": 0.15950756055131807, "grad_norm": 639.5134887695312, "learning_rate": 3e-06, "loss": 60.5944, "step": 1788 }, { "completion_length": 152.4791717529297, "epoch": 0.15959677059636915, "grad_norm": 1021.4591064453125, "learning_rate": 3e-06, "loss": -505.314, "reward": 2.077250123023987, "reward_std": 0.42116162180900574, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11891666986048222, "step": 1789, "zero_std_ratio": 0.125 }, { "epoch": 0.15968598064142023, "grad_norm": 1055.1492919921875, "learning_rate": 3e-06, "loss": -514.8371, "step": 1790 }, { "epoch": 0.1597751906864713, "grad_norm": 1102.408447265625, "learning_rate": 3e-06, "loss": -510.6017, "step": 1791 }, { "epoch": 0.15986440073152236, "grad_norm": 826.0501098632812, "learning_rate": 3e-06, "loss": -504.185, "step": 1792 }, { "epoch": 0.15995361077657344, "grad_norm": 924.1912841796875, "learning_rate": 3e-06, "loss": -528.0456, "step": 1793 }, { "epoch": 0.16004282082162452, "grad_norm": 1201.7523193359375, "learning_rate": 3e-06, "loss": -485.692, "step": 1794 }, { "epoch": 0.1601320308666756, "grad_norm": 1054.49560546875, "learning_rate": 3e-06, "loss": -517.5522, "step": 1795 }, { "epoch": 0.16022124091172665, "grad_norm": 1037.0382080078125, "learning_rate": 3e-06, "loss": -533.0304, "step": 1796 }, { "epoch": 0.16031045095677773, "grad_norm": 1121.322509765625, "learning_rate": 3e-06, "loss": -528.4745, "step": 1797 }, { "epoch": 0.1603996610018288, "grad_norm": 869.9371948242188, "learning_rate": 3e-06, "loss": -530.1343, "step": 1798 }, { "epoch": 0.16048887104687987, "grad_norm": 993.0130004882812, "learning_rate": 3e-06, "loss": -561.1622, "step": 1799 }, { "epoch": 0.16057808109193095, "grad_norm": 1424.33056640625, "learning_rate": 3e-06, "loss": -523.6362, "step": 1800 }, { "completion_length": 132.87500762939453, "epoch": 0.16066729113698203, "grad_norm": 516.1659545898438, "learning_rate": 3e-06, "loss": -39.2026, "reward": 2.2690415382385254, "reward_std": 0.5185650140047073, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14404166117310524, "step": 1801, "zero_std_ratio": 0.125 }, { "epoch": 0.1607565011820331, "grad_norm": 393.27093505859375, "learning_rate": 3e-06, "loss": -18.2417, "step": 1802 }, { "epoch": 0.16084571122708416, "grad_norm": 274.265625, "learning_rate": 3e-06, "loss": -7.7819, "step": 1803 }, { "epoch": 0.16093492127213524, "grad_norm": 303.0129699707031, "learning_rate": 3e-06, "loss": -12.3068, "step": 1804 }, { "epoch": 0.16102413131718632, "grad_norm": 194.52011108398438, "learning_rate": 3e-06, "loss": -16.6636, "step": 1805 }, { "epoch": 0.1611133413622374, "grad_norm": 361.7630920410156, "learning_rate": 3e-06, "loss": -64.1357, "step": 1806 }, { "epoch": 0.16120255140728845, "grad_norm": 475.3822326660156, "learning_rate": 3e-06, "loss": -49.0127, "step": 1807 }, { "epoch": 0.16129176145233953, "grad_norm": 457.8487548828125, "learning_rate": 3e-06, "loss": -26.1585, "step": 1808 }, { "epoch": 0.1613809714973906, "grad_norm": 345.77685546875, "learning_rate": 3e-06, "loss": -9.8069, "step": 1809 }, { "epoch": 0.16147018154244167, "grad_norm": 430.1940002441406, "learning_rate": 3e-06, "loss": -17.8251, "step": 1810 }, { "epoch": 0.16155939158749275, "grad_norm": 268.2777099609375, "learning_rate": 3e-06, "loss": -21.4181, "step": 1811 }, { "epoch": 0.16164860163254383, "grad_norm": 360.336181640625, "learning_rate": 3e-06, "loss": -69.655, "step": 1812 }, { "completion_length": 123.20833969116211, "epoch": 0.1617378116775949, "grad_norm": 752.1405029296875, "learning_rate": 3e-06, "loss": -52.6682, "reward": 2.5705000162124634, "reward_std": 0.14856409095227718, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15383333712816238, "step": 1813, "zero_std_ratio": 0.0 }, { "epoch": 0.16182702172264596, "grad_norm": 750.3876342773438, "learning_rate": 3e-06, "loss": -51.5828, "step": 1814 }, { "epoch": 0.16191623176769704, "grad_norm": 721.2176513671875, "learning_rate": 3e-06, "loss": -79.6287, "step": 1815 }, { "epoch": 0.16200544181274812, "grad_norm": 722.4852905273438, "learning_rate": 3e-06, "loss": -70.3175, "step": 1816 }, { "epoch": 0.1620946518577992, "grad_norm": 797.6476440429688, "learning_rate": 3e-06, "loss": -80.2729, "step": 1817 }, { "epoch": 0.16218386190285025, "grad_norm": 883.1192016601562, "learning_rate": 3e-06, "loss": -71.263, "step": 1818 }, { "epoch": 0.16227307194790133, "grad_norm": 1220.3101806640625, "learning_rate": 3e-06, "loss": -69.1647, "step": 1819 }, { "epoch": 0.1623622819929524, "grad_norm": 1046.2666015625, "learning_rate": 3e-06, "loss": -72.9542, "step": 1820 }, { "epoch": 0.1624514920380035, "grad_norm": 762.8731079101562, "learning_rate": 3e-06, "loss": -107.0888, "step": 1821 }, { "epoch": 0.16254070208305454, "grad_norm": 989.8717651367188, "learning_rate": 3e-06, "loss": -105.7402, "step": 1822 }, { "epoch": 0.16262991212810562, "grad_norm": 810.9016723632812, "learning_rate": 3e-06, "loss": -113.7269, "step": 1823 }, { "epoch": 0.1627191221731567, "grad_norm": 794.078369140625, "learning_rate": 3e-06, "loss": -101.5978, "step": 1824 }, { "completion_length": 154.45833587646484, "epoch": 0.16280833221820776, "grad_norm": 1521.7867431640625, "learning_rate": 3e-06, "loss": 198.4336, "reward": 1.999895989894867, "reward_std": 0.5427521467208862, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09364583343267441, "step": 1825, "zero_std_ratio": 0.0 }, { "epoch": 0.16289754226325884, "grad_norm": 1178.328369140625, "learning_rate": 3e-06, "loss": 186.775, "step": 1826 }, { "epoch": 0.16298675230830992, "grad_norm": 1188.1805419921875, "learning_rate": 3e-06, "loss": 262.2266, "step": 1827 }, { "epoch": 0.163075962353361, "grad_norm": 1261.284423828125, "learning_rate": 3e-06, "loss": 232.0647, "step": 1828 }, { "epoch": 0.16316517239841205, "grad_norm": 1211.0538330078125, "learning_rate": 3e-06, "loss": 245.9915, "step": 1829 }, { "epoch": 0.16325438244346313, "grad_norm": 1118.0828857421875, "learning_rate": 3e-06, "loss": 269.5143, "step": 1830 }, { "epoch": 0.1633435924885142, "grad_norm": 1425.6029052734375, "learning_rate": 3e-06, "loss": 198.2645, "step": 1831 }, { "epoch": 0.1634328025335653, "grad_norm": 969.9822998046875, "learning_rate": 3e-06, "loss": 184.7289, "step": 1832 }, { "epoch": 0.16352201257861634, "grad_norm": 1341.745849609375, "learning_rate": 3e-06, "loss": 260.7448, "step": 1833 }, { "epoch": 0.16361122262366742, "grad_norm": 1239.0595703125, "learning_rate": 3e-06, "loss": 221.661, "step": 1834 }, { "epoch": 0.1637004326687185, "grad_norm": 1348.814453125, "learning_rate": 3e-06, "loss": 242.47, "step": 1835 }, { "epoch": 0.16378964271376958, "grad_norm": 1185.1468505859375, "learning_rate": 3e-06, "loss": 263.2233, "step": 1836 }, { "completion_length": 119.66666793823242, "epoch": 0.16387885275882064, "grad_norm": 527.2208251953125, "learning_rate": 3e-06, "loss": 34.5997, "reward": 2.168979287147522, "reward_std": 0.27117825858294964, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17939583212137222, "step": 1837, "zero_std_ratio": 0.0 }, { "epoch": 0.16396806280387172, "grad_norm": 730.4968872070312, "learning_rate": 3e-06, "loss": 45.4926, "step": 1838 }, { "epoch": 0.1640572728489228, "grad_norm": 559.9202270507812, "learning_rate": 3e-06, "loss": 43.0753, "step": 1839 }, { "epoch": 0.16414648289397385, "grad_norm": 586.70849609375, "learning_rate": 3e-06, "loss": 45.5974, "step": 1840 }, { "epoch": 0.16423569293902493, "grad_norm": 731.3250732421875, "learning_rate": 3e-06, "loss": 33.5908, "step": 1841 }, { "epoch": 0.164324902984076, "grad_norm": 1114.0931396484375, "learning_rate": 3e-06, "loss": 25.0561, "step": 1842 }, { "epoch": 0.1644141130291271, "grad_norm": 509.1001281738281, "learning_rate": 3e-06, "loss": 28.6634, "step": 1843 }, { "epoch": 0.16450332307417814, "grad_norm": 1331.419677734375, "learning_rate": 3e-06, "loss": 46.7599, "step": 1844 }, { "epoch": 0.16459253311922922, "grad_norm": 434.1399230957031, "learning_rate": 3e-06, "loss": 38.0593, "step": 1845 }, { "epoch": 0.1646817431642803, "grad_norm": 653.2589111328125, "learning_rate": 3e-06, "loss": 41.9517, "step": 1846 }, { "epoch": 0.16477095320933138, "grad_norm": 662.3316040039062, "learning_rate": 3e-06, "loss": 32.4191, "step": 1847 }, { "epoch": 0.16486016325438244, "grad_norm": 946.3890380859375, "learning_rate": 3e-06, "loss": 13.5773, "step": 1848 }, { "completion_length": 113.20833587646484, "epoch": 0.16494937329943352, "grad_norm": 607.1404418945312, "learning_rate": 3e-06, "loss": -34.7088, "reward": 2.585312604904175, "reward_std": 0.15480694454163313, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21031249314546585, "step": 1849, "zero_std_ratio": 0.0 }, { "epoch": 0.1650385833444846, "grad_norm": 658.881103515625, "learning_rate": 3e-06, "loss": 4.1338, "step": 1850 }, { "epoch": 0.16512779338953565, "grad_norm": 527.572998046875, "learning_rate": 3e-06, "loss": -27.4651, "step": 1851 }, { "epoch": 0.16521700343458673, "grad_norm": 710.48828125, "learning_rate": 3e-06, "loss": -0.9042, "step": 1852 }, { "epoch": 0.1653062134796378, "grad_norm": 775.1302490234375, "learning_rate": 3e-06, "loss": -12.5198, "step": 1853 }, { "epoch": 0.1653954235246889, "grad_norm": 557.5072021484375, "learning_rate": 3e-06, "loss": -32.8515, "step": 1854 }, { "epoch": 0.16548463356973994, "grad_norm": 621.2504272460938, "learning_rate": 3e-06, "loss": -41.5699, "step": 1855 }, { "epoch": 0.16557384361479102, "grad_norm": 654.5400390625, "learning_rate": 3e-06, "loss": 2.162, "step": 1856 }, { "epoch": 0.1656630536598421, "grad_norm": 577.8197631835938, "learning_rate": 3e-06, "loss": -35.462, "step": 1857 }, { "epoch": 0.16575226370489318, "grad_norm": 820.9921264648438, "learning_rate": 3e-06, "loss": -11.7584, "step": 1858 }, { "epoch": 0.16584147374994423, "grad_norm": 765.3721313476562, "learning_rate": 3e-06, "loss": -20.577, "step": 1859 }, { "epoch": 0.16593068379499532, "grad_norm": 568.7098388671875, "learning_rate": 3e-06, "loss": -30.3173, "step": 1860 }, { "completion_length": 128.0208396911621, "epoch": 0.1660198938400464, "grad_norm": 562.7349243164062, "learning_rate": 3e-06, "loss": -0.1389, "reward": 2.4199376106262207, "reward_std": 0.4903542250394821, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14910416305065155, "step": 1861, "zero_std_ratio": 0.0 }, { "epoch": 0.16610910388509748, "grad_norm": 1208.946533203125, "learning_rate": 3e-06, "loss": 22.0607, "step": 1862 }, { "epoch": 0.16619831393014853, "grad_norm": 641.0849609375, "learning_rate": 3e-06, "loss": 15.9128, "step": 1863 }, { "epoch": 0.1662875239751996, "grad_norm": 610.1945190429688, "learning_rate": 3e-06, "loss": 45.7852, "step": 1864 }, { "epoch": 0.1663767340202507, "grad_norm": 591.8941650390625, "learning_rate": 3e-06, "loss": 22.7616, "step": 1865 }, { "epoch": 0.16646594406530174, "grad_norm": 466.8150939941406, "learning_rate": 3e-06, "loss": 26.8555, "step": 1866 }, { "epoch": 0.16655515411035282, "grad_norm": 823.4788818359375, "learning_rate": 3e-06, "loss": -7.2144, "step": 1867 }, { "epoch": 0.1666443641554039, "grad_norm": 1031.0870361328125, "learning_rate": 3e-06, "loss": 6.1348, "step": 1868 }, { "epoch": 0.16673357420045498, "grad_norm": 640.3084716796875, "learning_rate": 3e-06, "loss": 7.8973, "step": 1869 }, { "epoch": 0.16682278424550603, "grad_norm": 587.38916015625, "learning_rate": 3e-06, "loss": 38.7696, "step": 1870 }, { "epoch": 0.16691199429055711, "grad_norm": 470.4759216308594, "learning_rate": 3e-06, "loss": 10.2833, "step": 1871 }, { "epoch": 0.1670012043356082, "grad_norm": 400.15093994140625, "learning_rate": 3e-06, "loss": 16.9143, "step": 1872 }, { "completion_length": 121.5, "epoch": 0.16709041438065927, "grad_norm": 1434.84765625, "learning_rate": 3e-06, "loss": 127.8875, "reward": 2.1267499923706055, "reward_std": 0.28236258029937744, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16841667145490646, "step": 1873, "zero_std_ratio": 0.0 }, { "epoch": 0.16717962442571033, "grad_norm": 1496.07373046875, "learning_rate": 3e-06, "loss": 88.3829, "step": 1874 }, { "epoch": 0.1672688344707614, "grad_norm": 1312.2996826171875, "learning_rate": 3e-06, "loss": 84.1219, "step": 1875 }, { "epoch": 0.1673580445158125, "grad_norm": 1233.7979736328125, "learning_rate": 3e-06, "loss": 70.5536, "step": 1876 }, { "epoch": 0.16744725456086354, "grad_norm": 896.58837890625, "learning_rate": 3e-06, "loss": 35.9209, "step": 1877 }, { "epoch": 0.16753646460591462, "grad_norm": 1668.3927001953125, "learning_rate": 3e-06, "loss": 82.6518, "step": 1878 }, { "epoch": 0.1676256746509657, "grad_norm": 1143.520263671875, "learning_rate": 3e-06, "loss": 97.7647, "step": 1879 }, { "epoch": 0.16771488469601678, "grad_norm": 964.7223510742188, "learning_rate": 3e-06, "loss": 55.6952, "step": 1880 }, { "epoch": 0.16780409474106783, "grad_norm": 1036.915283203125, "learning_rate": 3e-06, "loss": 61.6463, "step": 1881 }, { "epoch": 0.1678933047861189, "grad_norm": 889.5004272460938, "learning_rate": 3e-06, "loss": 39.2985, "step": 1882 }, { "epoch": 0.16798251483117, "grad_norm": 711.8017578125, "learning_rate": 3e-06, "loss": 21.9326, "step": 1883 }, { "epoch": 0.16807172487622107, "grad_norm": 1177.8270263671875, "learning_rate": 3e-06, "loss": 45.9468, "step": 1884 }, { "completion_length": 116.22916793823242, "epoch": 0.16816093492127213, "grad_norm": 313.5957336425781, "learning_rate": 3e-06, "loss": 13.1803, "reward": 2.5328749418258667, "reward_std": 0.35810738801956177, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19954168051481247, "step": 1885, "zero_std_ratio": 0.0 }, { "epoch": 0.1682501449663232, "grad_norm": 333.3410949707031, "learning_rate": 3e-06, "loss": 14.3001, "step": 1886 }, { "epoch": 0.1683393550113743, "grad_norm": 330.1903076171875, "learning_rate": 3e-06, "loss": 13.6081, "step": 1887 }, { "epoch": 0.16842856505642537, "grad_norm": 313.8829345703125, "learning_rate": 3e-06, "loss": 21.504, "step": 1888 }, { "epoch": 0.16851777510147642, "grad_norm": 282.533203125, "learning_rate": 3e-06, "loss": 10.1669, "step": 1889 }, { "epoch": 0.1686069851465275, "grad_norm": 203.84523010253906, "learning_rate": 3e-06, "loss": 15.1211, "step": 1890 }, { "epoch": 0.16869619519157858, "grad_norm": 297.6282653808594, "learning_rate": 3e-06, "loss": 6.1928, "step": 1891 }, { "epoch": 0.16878540523662963, "grad_norm": 246.8819122314453, "learning_rate": 3e-06, "loss": 10.9529, "step": 1892 }, { "epoch": 0.1688746152816807, "grad_norm": 225.11219787597656, "learning_rate": 3e-06, "loss": 6.8239, "step": 1893 }, { "epoch": 0.1689638253267318, "grad_norm": 194.10739135742188, "learning_rate": 3e-06, "loss": 14.7224, "step": 1894 }, { "epoch": 0.16905303537178287, "grad_norm": 191.75978088378906, "learning_rate": 3e-06, "loss": 3.1039, "step": 1895 }, { "epoch": 0.16914224541683393, "grad_norm": 121.65093994140625, "learning_rate": 3e-06, "loss": 11.2102, "step": 1896 }, { "completion_length": 157.00000762939453, "epoch": 0.169231455461885, "grad_norm": 1251.7188720703125, "learning_rate": 3e-06, "loss": -10.517, "reward": 2.062812566757202, "reward_std": 0.7439534962177277, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11489583179354668, "step": 1897, "zero_std_ratio": 0.0 }, { "epoch": 0.16932066550693609, "grad_norm": 1017.1513671875, "learning_rate": 3e-06, "loss": -8.0786, "step": 1898 }, { "epoch": 0.16940987555198717, "grad_norm": 937.3341674804688, "learning_rate": 3e-06, "loss": -53.4306, "step": 1899 }, { "epoch": 0.16949908559703822, "grad_norm": 1166.1446533203125, "learning_rate": 3e-06, "loss": -58.4623, "step": 1900 }, { "epoch": 0.1695882956420893, "grad_norm": 627.0881958007812, "learning_rate": 3e-06, "loss": -60.1449, "step": 1901 }, { "epoch": 0.16967750568714038, "grad_norm": 1087.1383056640625, "learning_rate": 3e-06, "loss": -14.86, "step": 1902 }, { "epoch": 0.16976671573219143, "grad_norm": 1112.518798828125, "learning_rate": 3e-06, "loss": -13.7934, "step": 1903 }, { "epoch": 0.1698559257772425, "grad_norm": 1089.9168701171875, "learning_rate": 3e-06, "loss": -17.8976, "step": 1904 }, { "epoch": 0.1699451358222936, "grad_norm": 765.9096069335938, "learning_rate": 3e-06, "loss": -62.3282, "step": 1905 }, { "epoch": 0.17003434586734467, "grad_norm": 1779.7637939453125, "learning_rate": 3e-06, "loss": -69.9882, "step": 1906 }, { "epoch": 0.17012355591239572, "grad_norm": 627.7999877929688, "learning_rate": 3e-06, "loss": -67.9675, "step": 1907 }, { "epoch": 0.1702127659574468, "grad_norm": 1196.003173828125, "learning_rate": 3e-06, "loss": -24.7852, "step": 1908 }, { "completion_length": 139.52083587646484, "epoch": 0.17030197600249788, "grad_norm": 137.36314392089844, "learning_rate": 3e-06, "loss": -6.2761, "reward": 2.261833429336548, "reward_std": 0.2501897104084492, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1368333362042904, "step": 1909, "zero_std_ratio": 0.0 }, { "epoch": 0.17039118604754896, "grad_norm": 145.57534790039062, "learning_rate": 3e-06, "loss": -16.6773, "step": 1910 }, { "epoch": 0.17048039609260002, "grad_norm": 566.4564208984375, "learning_rate": 3e-06, "loss": -29.7986, "step": 1911 }, { "epoch": 0.1705696061376511, "grad_norm": 186.74420166015625, "learning_rate": 3e-06, "loss": -10.6402, "step": 1912 }, { "epoch": 0.17065881618270218, "grad_norm": 155.60208129882812, "learning_rate": 3e-06, "loss": -14.5853, "step": 1913 }, { "epoch": 0.17074802622775326, "grad_norm": 186.169921875, "learning_rate": 3e-06, "loss": -7.9219, "step": 1914 }, { "epoch": 0.1708372362728043, "grad_norm": 170.4557342529297, "learning_rate": 3e-06, "loss": -9.1424, "step": 1915 }, { "epoch": 0.1709264463178554, "grad_norm": 173.5465087890625, "learning_rate": 3e-06, "loss": -16.6805, "step": 1916 }, { "epoch": 0.17101565636290647, "grad_norm": 975.5637817382812, "learning_rate": 3e-06, "loss": -46.2821, "step": 1917 }, { "epoch": 0.17110486640795752, "grad_norm": 277.77490234375, "learning_rate": 3e-06, "loss": -12.9822, "step": 1918 }, { "epoch": 0.1711940764530086, "grad_norm": 203.00672912597656, "learning_rate": 3e-06, "loss": -17.6713, "step": 1919 }, { "epoch": 0.17128328649805968, "grad_norm": 243.0919647216797, "learning_rate": 3e-06, "loss": -11.2019, "step": 1920 }, { "completion_length": 124.27083969116211, "epoch": 0.17137249654311076, "grad_norm": 2301.88134765625, "learning_rate": 3e-06, "loss": -253.2357, "reward": 2.5085625648498535, "reward_std": 0.26843154430389404, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13356249406933784, "step": 1921, "zero_std_ratio": 0.0 }, { "epoch": 0.17146170658816182, "grad_norm": 1669.822021484375, "learning_rate": 3e-06, "loss": -404.5188, "step": 1922 }, { "epoch": 0.1715509166332129, "grad_norm": 1267.5242919921875, "learning_rate": 3e-06, "loss": -460.9462, "step": 1923 }, { "epoch": 0.17164012667826398, "grad_norm": 1655.473388671875, "learning_rate": 3e-06, "loss": -412.0498, "step": 1924 }, { "epoch": 0.17172933672331506, "grad_norm": 1569.4832763671875, "learning_rate": 3e-06, "loss": -305.7101, "step": 1925 }, { "epoch": 0.1718185467683661, "grad_norm": 1087.016845703125, "learning_rate": 3e-06, "loss": -636.9792, "step": 1926 }, { "epoch": 0.1719077568134172, "grad_norm": 2180.221435546875, "learning_rate": 3e-06, "loss": -309.4447, "step": 1927 }, { "epoch": 0.17199696685846827, "grad_norm": 1617.933349609375, "learning_rate": 3e-06, "loss": -462.1522, "step": 1928 }, { "epoch": 0.17208617690351935, "grad_norm": 985.1787109375, "learning_rate": 3e-06, "loss": -494.9189, "step": 1929 }, { "epoch": 0.1721753869485704, "grad_norm": 1417.375732421875, "learning_rate": 3e-06, "loss": -479.894, "step": 1930 }, { "epoch": 0.17226459699362148, "grad_norm": 1510.1646728515625, "learning_rate": 3e-06, "loss": -378.276, "step": 1931 }, { "epoch": 0.17235380703867256, "grad_norm": 1007.6100463867188, "learning_rate": 3e-06, "loss": -662.9573, "step": 1932 }, { "completion_length": 144.41666793823242, "epoch": 0.17244301708372362, "grad_norm": 136.9275360107422, "learning_rate": 3e-06, "loss": -8.754, "reward": 2.550520896911621, "reward_std": 0.2396542876958847, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13385416939854622, "step": 1933, "zero_std_ratio": 0.0 }, { "epoch": 0.1725322271287747, "grad_norm": 83.86649322509766, "learning_rate": 3e-06, "loss": 0.1167, "step": 1934 }, { "epoch": 0.17262143717382578, "grad_norm": 77.8895263671875, "learning_rate": 3e-06, "loss": -1.4578, "step": 1935 }, { "epoch": 0.17271064721887686, "grad_norm": 122.21768951416016, "learning_rate": 3e-06, "loss": -4.8938, "step": 1936 }, { "epoch": 0.1727998572639279, "grad_norm": 86.19290161132812, "learning_rate": 3e-06, "loss": -0.7627, "step": 1937 }, { "epoch": 0.172889067308979, "grad_norm": 82.05410766601562, "learning_rate": 3e-06, "loss": -3.3958, "step": 1938 }, { "epoch": 0.17297827735403007, "grad_norm": 225.0465850830078, "learning_rate": 3e-06, "loss": -13.7727, "step": 1939 }, { "epoch": 0.17306748739908115, "grad_norm": 90.8927001953125, "learning_rate": 3e-06, "loss": -1.4379, "step": 1940 }, { "epoch": 0.1731566974441322, "grad_norm": 108.59156799316406, "learning_rate": 3e-06, "loss": -4.1789, "step": 1941 }, { "epoch": 0.17324590748918328, "grad_norm": 116.51862335205078, "learning_rate": 3e-06, "loss": -6.4501, "step": 1942 }, { "epoch": 0.17333511753423436, "grad_norm": 92.47396850585938, "learning_rate": 3e-06, "loss": -2.9944, "step": 1943 }, { "epoch": 0.17342432757928541, "grad_norm": 119.22129821777344, "learning_rate": 3e-06, "loss": -6.243, "step": 1944 }, { "completion_length": 145.5416717529297, "epoch": 0.1735135376243365, "grad_norm": 2110.79443359375, "learning_rate": 3e-06, "loss": 94.0773, "reward": 1.6517499685287476, "reward_std": 0.7779462337493896, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13091666996479034, "step": 1945, "zero_std_ratio": 0.125 }, { "epoch": 0.17360274766938757, "grad_norm": 1997.6201171875, "learning_rate": 3e-06, "loss": 99.2755, "step": 1946 }, { "epoch": 0.17369195771443866, "grad_norm": 2617.997802734375, "learning_rate": 3e-06, "loss": -170.4776, "step": 1947 }, { "epoch": 0.1737811677594897, "grad_norm": 2001.4349365234375, "learning_rate": 3e-06, "loss": -48.466, "step": 1948 }, { "epoch": 0.1738703778045408, "grad_norm": 3665.568359375, "learning_rate": 3e-06, "loss": 56.4173, "step": 1949 }, { "epoch": 0.17395958784959187, "grad_norm": 3150.109619140625, "learning_rate": 3e-06, "loss": -52.6612, "step": 1950 }, { "epoch": 0.17404879789464295, "grad_norm": 2255.739013671875, "learning_rate": 3e-06, "loss": 86.9101, "step": 1951 }, { "epoch": 0.174138007939694, "grad_norm": 2205.787353515625, "learning_rate": 3e-06, "loss": 99.2739, "step": 1952 }, { "epoch": 0.17422721798474508, "grad_norm": 2441.709716796875, "learning_rate": 3e-06, "loss": -200.8932, "step": 1953 }, { "epoch": 0.17431642802979616, "grad_norm": 2390.3076171875, "learning_rate": 3e-06, "loss": -55.9708, "step": 1954 }, { "epoch": 0.17440563807484724, "grad_norm": 3095.658203125, "learning_rate": 3e-06, "loss": 39.4443, "step": 1955 }, { "epoch": 0.1744948481198983, "grad_norm": 2838.15966796875, "learning_rate": 3e-06, "loss": -70.563, "step": 1956 }, { "completion_length": 136.4791717529297, "epoch": 0.17458405816494937, "grad_norm": 706.7903442382812, "learning_rate": 3e-06, "loss": -24.1702, "reward": 2.388875126838684, "reward_std": 0.29309016466140747, "rewards/correctness_reward_func": 1.7916666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11804166808724403, "step": 1957, "zero_std_ratio": 0.125 }, { "epoch": 0.17467326821000045, "grad_norm": 1125.595458984375, "learning_rate": 3e-06, "loss": 43.8321, "step": 1958 }, { "epoch": 0.1747624782550515, "grad_norm": 891.260009765625, "learning_rate": 3e-06, "loss": -32.6333, "step": 1959 }, { "epoch": 0.1748516883001026, "grad_norm": 1020.5130615234375, "learning_rate": 3e-06, "loss": 101.2067, "step": 1960 }, { "epoch": 0.17494089834515367, "grad_norm": 619.5036010742188, "learning_rate": 3e-06, "loss": 33.6314, "step": 1961 }, { "epoch": 0.17503010839020475, "grad_norm": 1052.5194091796875, "learning_rate": 3e-06, "loss": 52.0245, "step": 1962 }, { "epoch": 0.1751193184352558, "grad_norm": 637.0113525390625, "learning_rate": 3e-06, "loss": -27.3594, "step": 1963 }, { "epoch": 0.17520852848030688, "grad_norm": 1216.4146728515625, "learning_rate": 3e-06, "loss": 31.9169, "step": 1964 }, { "epoch": 0.17529773852535796, "grad_norm": 799.7539672851562, "learning_rate": 3e-06, "loss": -45.1417, "step": 1965 }, { "epoch": 0.17538694857040904, "grad_norm": 1073.3214111328125, "learning_rate": 3e-06, "loss": 79.6225, "step": 1966 }, { "epoch": 0.1754761586154601, "grad_norm": 606.6988525390625, "learning_rate": 3e-06, "loss": 22.0892, "step": 1967 }, { "epoch": 0.17556536866051117, "grad_norm": 1075.4769287109375, "learning_rate": 3e-06, "loss": 34.4998, "step": 1968 }, { "completion_length": 127.02083587646484, "epoch": 0.17565457870556225, "grad_norm": 1479.780517578125, "learning_rate": 3e-06, "loss": 22.8573, "reward": 2.3632084131240845, "reward_std": 0.3230869174003601, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.154874999076128, "step": 1969, "zero_std_ratio": 0.0 }, { "epoch": 0.1757437887506133, "grad_norm": 770.041748046875, "learning_rate": 3e-06, "loss": 36.1886, "step": 1970 }, { "epoch": 0.17583299879566439, "grad_norm": 634.0138549804688, "learning_rate": 3e-06, "loss": 4.3242, "step": 1971 }, { "epoch": 0.17592220884071547, "grad_norm": 1109.409912109375, "learning_rate": 3e-06, "loss": -24.1577, "step": 1972 }, { "epoch": 0.17601141888576655, "grad_norm": 950.0133056640625, "learning_rate": 3e-06, "loss": -30.5734, "step": 1973 }, { "epoch": 0.1761006289308176, "grad_norm": 1003.349853515625, "learning_rate": 3e-06, "loss": -125.0075, "step": 1974 }, { "epoch": 0.17618983897586868, "grad_norm": 1581.892578125, "learning_rate": 3e-06, "loss": 7.7013, "step": 1975 }, { "epoch": 0.17627904902091976, "grad_norm": 858.9105224609375, "learning_rate": 3e-06, "loss": 28.9444, "step": 1976 }, { "epoch": 0.17636825906597084, "grad_norm": 506.2606201171875, "learning_rate": 3e-06, "loss": -3.128, "step": 1977 }, { "epoch": 0.1764574691110219, "grad_norm": 1361.7171630859375, "learning_rate": 3e-06, "loss": -22.6255, "step": 1978 }, { "epoch": 0.17654667915607297, "grad_norm": 945.2322387695312, "learning_rate": 3e-06, "loss": -31.6284, "step": 1979 }, { "epoch": 0.17663588920112405, "grad_norm": 1066.5302734375, "learning_rate": 3e-06, "loss": -122.7752, "step": 1980 }, { "completion_length": 129.52083587646484, "epoch": 0.17672509924617513, "grad_norm": 522.1423950195312, "learning_rate": 3e-06, "loss": -6.6804, "reward": 2.387354254722595, "reward_std": 0.42842997610569, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14777083322405815, "step": 1981, "zero_std_ratio": 0.0 }, { "epoch": 0.17681430929122618, "grad_norm": 1231.280517578125, "learning_rate": 3e-06, "loss": 31.9041, "step": 1982 }, { "epoch": 0.17690351933627727, "grad_norm": 663.7846069335938, "learning_rate": 3e-06, "loss": 16.5326, "step": 1983 }, { "epoch": 0.17699272938132835, "grad_norm": 530.1618041992188, "learning_rate": 3e-06, "loss": 25.7806, "step": 1984 }, { "epoch": 0.1770819394263794, "grad_norm": 703.8350830078125, "learning_rate": 3e-06, "loss": 3.6452, "step": 1985 }, { "epoch": 0.17717114947143048, "grad_norm": 1063.3558349609375, "learning_rate": 3e-06, "loss": 37.3771, "step": 1986 }, { "epoch": 0.17726035951648156, "grad_norm": 429.0755920410156, "learning_rate": 3e-06, "loss": -9.0188, "step": 1987 }, { "epoch": 0.17734956956153264, "grad_norm": 1193.805419921875, "learning_rate": 3e-06, "loss": 24.0102, "step": 1988 }, { "epoch": 0.1774387796065837, "grad_norm": 550.2122802734375, "learning_rate": 3e-06, "loss": 5.9939, "step": 1989 }, { "epoch": 0.17752798965163477, "grad_norm": 480.5219421386719, "learning_rate": 3e-06, "loss": 20.8503, "step": 1990 }, { "epoch": 0.17761719969668585, "grad_norm": 359.2476806640625, "learning_rate": 3e-06, "loss": 1.5336, "step": 1991 }, { "epoch": 0.17770640974173693, "grad_norm": 754.3380737304688, "learning_rate": 3e-06, "loss": 29.4998, "step": 1992 }, { "completion_length": 150.93750762939453, "epoch": 0.17779561978678798, "grad_norm": 1932.5921630859375, "learning_rate": 3e-06, "loss": -288.8813, "reward": 2.269333302974701, "reward_std": 0.5021546930074692, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1443333402276039, "step": 1993, "zero_std_ratio": 0.0 }, { "epoch": 0.17788482983183906, "grad_norm": 2357.911865234375, "learning_rate": 3e-06, "loss": -134.0228, "step": 1994 }, { "epoch": 0.17797403987689014, "grad_norm": 1959.8599853515625, "learning_rate": 3e-06, "loss": -81.068, "step": 1995 }, { "epoch": 0.1780632499219412, "grad_norm": 1494.47412109375, "learning_rate": 3e-06, "loss": -185.6545, "step": 1996 }, { "epoch": 0.17815245996699228, "grad_norm": 1894.7733154296875, "learning_rate": 3e-06, "loss": -266.9818, "step": 1997 }, { "epoch": 0.17824167001204336, "grad_norm": 2350.27490234375, "learning_rate": 3e-06, "loss": -234.3397, "step": 1998 }, { "epoch": 0.17833088005709444, "grad_norm": 2210.189453125, "learning_rate": 3e-06, "loss": -288.7056, "step": 1999 }, { "epoch": 0.1784200901021455, "grad_norm": 1879.7398681640625, "learning_rate": 3e-06, "loss": -138.2824, "step": 2000 }, { "epoch": 0.17850930014719657, "grad_norm": 2152.613525390625, "learning_rate": 3e-06, "loss": -97.7832, "step": 2001 }, { "epoch": 0.17859851019224765, "grad_norm": 1610.9349365234375, "learning_rate": 3e-06, "loss": -200.0292, "step": 2002 }, { "epoch": 0.17868772023729873, "grad_norm": 1528.112060546875, "learning_rate": 3e-06, "loss": -275.8097, "step": 2003 }, { "epoch": 0.17877693028234978, "grad_norm": 2053.736328125, "learning_rate": 3e-06, "loss": -258.0316, "step": 2004 }, { "completion_length": 141.9375, "epoch": 0.17886614032740086, "grad_norm": 193.6829833984375, "learning_rate": 3e-06, "loss": 20.2948, "reward": 2.5581459999084473, "reward_std": 0.2487168163061142, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15189583599567413, "step": 2005, "zero_std_ratio": 0.0 }, { "epoch": 0.17895535037245194, "grad_norm": 155.00283813476562, "learning_rate": 3e-06, "loss": 17.0246, "step": 2006 }, { "epoch": 0.17904456041750302, "grad_norm": 259.50830078125, "learning_rate": 3e-06, "loss": 22.8199, "step": 2007 }, { "epoch": 0.17913377046255408, "grad_norm": 266.588134765625, "learning_rate": 3e-06, "loss": 24.9054, "step": 2008 }, { "epoch": 0.17922298050760516, "grad_norm": 651.2266845703125, "learning_rate": 3e-06, "loss": 49.409, "step": 2009 }, { "epoch": 0.17931219055265624, "grad_norm": 211.16615295410156, "learning_rate": 3e-06, "loss": 24.0546, "step": 2010 }, { "epoch": 0.1794014005977073, "grad_norm": 190.15692138671875, "learning_rate": 3e-06, "loss": 18.0055, "step": 2011 }, { "epoch": 0.17949061064275837, "grad_norm": 160.15274047851562, "learning_rate": 3e-06, "loss": 15.9223, "step": 2012 }, { "epoch": 0.17957982068780945, "grad_norm": 181.6917724609375, "learning_rate": 3e-06, "loss": 21.005, "step": 2013 }, { "epoch": 0.17966903073286053, "grad_norm": 256.7626953125, "learning_rate": 3e-06, "loss": 20.4393, "step": 2014 }, { "epoch": 0.17975824077791158, "grad_norm": 425.4783630371094, "learning_rate": 3e-06, "loss": 37.1911, "step": 2015 }, { "epoch": 0.17984745082296266, "grad_norm": 152.9344482421875, "learning_rate": 3e-06, "loss": 20.1633, "step": 2016 }, { "completion_length": 141.95833587646484, "epoch": 0.17993666086801374, "grad_norm": 913.6690063476562, "learning_rate": 3e-06, "loss": -38.0152, "reward": 2.531229257583618, "reward_std": 0.2668761610984802, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.124979168176651, "step": 2017, "zero_std_ratio": 0.25 }, { "epoch": 0.18002587091306482, "grad_norm": 955.5236206054688, "learning_rate": 3e-06, "loss": -14.7569, "step": 2018 }, { "epoch": 0.18011508095811588, "grad_norm": 767.3821411132812, "learning_rate": 3e-06, "loss": -35.5258, "step": 2019 }, { "epoch": 0.18020429100316696, "grad_norm": 1576.59521484375, "learning_rate": 3e-06, "loss": -63.1452, "step": 2020 }, { "epoch": 0.18029350104821804, "grad_norm": 411.6076965332031, "learning_rate": 3e-06, "loss": -3.7112, "step": 2021 }, { "epoch": 0.18038271109326912, "grad_norm": 621.1591186523438, "learning_rate": 3e-06, "loss": -5.7912, "step": 2022 }, { "epoch": 0.18047192113832017, "grad_norm": 1037.3421630859375, "learning_rate": 3e-06, "loss": -40.5176, "step": 2023 }, { "epoch": 0.18056113118337125, "grad_norm": 730.2128295898438, "learning_rate": 3e-06, "loss": -14.0285, "step": 2024 }, { "epoch": 0.18065034122842233, "grad_norm": 1322.21826171875, "learning_rate": 3e-06, "loss": -39.4187, "step": 2025 }, { "epoch": 0.18073955127347338, "grad_norm": 1288.70361328125, "learning_rate": 3e-06, "loss": -74.1179, "step": 2026 }, { "epoch": 0.18082876131852446, "grad_norm": 477.3416442871094, "learning_rate": 3e-06, "loss": -7.2589, "step": 2027 }, { "epoch": 0.18091797136357554, "grad_norm": 718.8214721679688, "learning_rate": 3e-06, "loss": -11.7694, "step": 2028 }, { "completion_length": 149.4791717529297, "epoch": 0.18100718140862662, "grad_norm": 1311.2017822265625, "learning_rate": 3e-06, "loss": -69.5996, "reward": 2.306437611579895, "reward_std": 0.4377765115350485, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11893749982118607, "step": 2029, "zero_std_ratio": 0.125 }, { "epoch": 0.18109639145367767, "grad_norm": 1927.8277587890625, "learning_rate": 3e-06, "loss": -19.9164, "step": 2030 }, { "epoch": 0.18118560149872875, "grad_norm": 2200.863525390625, "learning_rate": 3e-06, "loss": 19.4123, "step": 2031 }, { "epoch": 0.18127481154377983, "grad_norm": 1609.12255859375, "learning_rate": 3e-06, "loss": 132.9534, "step": 2032 }, { "epoch": 0.18136402158883091, "grad_norm": 1760.5648193359375, "learning_rate": 3e-06, "loss": 74.6029, "step": 2033 }, { "epoch": 0.18145323163388197, "grad_norm": 1445.6087646484375, "learning_rate": 3e-06, "loss": 91.8306, "step": 2034 }, { "epoch": 0.18154244167893305, "grad_norm": 1266.7987060546875, "learning_rate": 3e-06, "loss": -78.4848, "step": 2035 }, { "epoch": 0.18163165172398413, "grad_norm": 2177.941650390625, "learning_rate": 3e-06, "loss": -23.6046, "step": 2036 }, { "epoch": 0.18172086176903518, "grad_norm": 1841.123046875, "learning_rate": 3e-06, "loss": 7.9887, "step": 2037 }, { "epoch": 0.18181007181408626, "grad_norm": 2021.9366455078125, "learning_rate": 3e-06, "loss": 115.5819, "step": 2038 }, { "epoch": 0.18189928185913734, "grad_norm": 1844.681640625, "learning_rate": 3e-06, "loss": 60.9947, "step": 2039 }, { "epoch": 0.18198849190418842, "grad_norm": 1540.6129150390625, "learning_rate": 3e-06, "loss": 94.1448, "step": 2040 }, { "completion_length": 122.04167175292969, "epoch": 0.18207770194923947, "grad_norm": 431.05072021484375, "learning_rate": 3e-06, "loss": -45.7214, "reward": 2.455124855041504, "reward_std": 0.2788470536470413, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18429165333509445, "step": 2041, "zero_std_ratio": 0.125 }, { "epoch": 0.18216691199429055, "grad_norm": 363.893798828125, "learning_rate": 3e-06, "loss": -43.1347, "step": 2042 }, { "epoch": 0.18225612203934163, "grad_norm": 588.0111083984375, "learning_rate": 3e-06, "loss": -30.2104, "step": 2043 }, { "epoch": 0.18234533208439271, "grad_norm": 432.35784912109375, "learning_rate": 3e-06, "loss": -37.9299, "step": 2044 }, { "epoch": 0.18243454212944377, "grad_norm": 515.4808349609375, "learning_rate": 3e-06, "loss": -40.889, "step": 2045 }, { "epoch": 0.18252375217449485, "grad_norm": 1244.8402099609375, "learning_rate": 3e-06, "loss": -30.4561, "step": 2046 }, { "epoch": 0.18261296221954593, "grad_norm": 758.7728271484375, "learning_rate": 3e-06, "loss": -51.244, "step": 2047 }, { "epoch": 0.182702172264597, "grad_norm": 447.59246826171875, "learning_rate": 3e-06, "loss": -48.6483, "step": 2048 }, { "epoch": 0.18279138230964806, "grad_norm": 542.2373046875, "learning_rate": 3e-06, "loss": -40.3981, "step": 2049 }, { "epoch": 0.18288059235469914, "grad_norm": 358.4046325683594, "learning_rate": 3e-06, "loss": -45.3628, "step": 2050 }, { "epoch": 0.18296980239975022, "grad_norm": 600.2781982421875, "learning_rate": 3e-06, "loss": -51.8527, "step": 2051 }, { "epoch": 0.18305901244480127, "grad_norm": 435.05706787109375, "learning_rate": 3e-06, "loss": -38.3583, "step": 2052 }, { "completion_length": 161.66666793823242, "epoch": 0.18314822248985235, "grad_norm": 1581.74951171875, "learning_rate": 3e-06, "loss": 29.2414, "reward": 2.3151042461395264, "reward_std": 0.6183367669582367, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0546875037252903, "step": 2053, "zero_std_ratio": 0.0 }, { "epoch": 0.18323743253490343, "grad_norm": 1009.0924072265625, "learning_rate": 3e-06, "loss": 39.8569, "step": 2054 }, { "epoch": 0.1833266425799545, "grad_norm": 1006.994873046875, "learning_rate": 3e-06, "loss": -25.3739, "step": 2055 }, { "epoch": 0.18341585262500557, "grad_norm": 1858.0333251953125, "learning_rate": 3e-06, "loss": -156.3526, "step": 2056 }, { "epoch": 0.18350506267005665, "grad_norm": 1687.8575439453125, "learning_rate": 3e-06, "loss": -102.6945, "step": 2057 }, { "epoch": 0.18359427271510773, "grad_norm": 3122.237548828125, "learning_rate": 3e-06, "loss": -448.7975, "step": 2058 }, { "epoch": 0.1836834827601588, "grad_norm": 2170.154541015625, "learning_rate": 3e-06, "loss": -11.5904, "step": 2059 }, { "epoch": 0.18377269280520986, "grad_norm": 1241.9334716796875, "learning_rate": 3e-06, "loss": 14.9715, "step": 2060 }, { "epoch": 0.18386190285026094, "grad_norm": 1282.6116943359375, "learning_rate": 3e-06, "loss": -53.5031, "step": 2061 }, { "epoch": 0.18395111289531202, "grad_norm": 2399.23193359375, "learning_rate": 3e-06, "loss": -207.4761, "step": 2062 }, { "epoch": 0.18404032294036307, "grad_norm": 3210.094482421875, "learning_rate": 3e-06, "loss": -169.572, "step": 2063 }, { "epoch": 0.18412953298541415, "grad_norm": 3275.695556640625, "learning_rate": 3e-06, "loss": -540.9293, "step": 2064 }, { "completion_length": 127.64583969116211, "epoch": 0.18421874303046523, "grad_norm": 1122.905517578125, "learning_rate": 3e-06, "loss": 10.8385, "reward": 2.301645874977112, "reward_std": 0.4431246966123581, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14539583399891853, "step": 2065, "zero_std_ratio": 0.0 }, { "epoch": 0.1843079530755163, "grad_norm": 1122.392578125, "learning_rate": 3e-06, "loss": 49.955, "step": 2066 }, { "epoch": 0.18439716312056736, "grad_norm": 1480.3697509765625, "learning_rate": 3e-06, "loss": 58.7913, "step": 2067 }, { "epoch": 0.18448637316561844, "grad_norm": 1758.8323974609375, "learning_rate": 3e-06, "loss": 20.1775, "step": 2068 }, { "epoch": 0.18457558321066952, "grad_norm": 1807.8433837890625, "learning_rate": 3e-06, "loss": 6.628, "step": 2069 }, { "epoch": 0.1846647932557206, "grad_norm": 1256.463623046875, "learning_rate": 3e-06, "loss": -21.4105, "step": 2070 }, { "epoch": 0.18475400330077166, "grad_norm": 1165.8466796875, "learning_rate": 3e-06, "loss": 9.5585, "step": 2071 }, { "epoch": 0.18484321334582274, "grad_norm": 1214.4351806640625, "learning_rate": 3e-06, "loss": 27.7133, "step": 2072 }, { "epoch": 0.18493242339087382, "grad_norm": 1408.9798583984375, "learning_rate": 3e-06, "loss": 41.0192, "step": 2073 }, { "epoch": 0.1850216334359249, "grad_norm": 2391.484130859375, "learning_rate": 3e-06, "loss": 11.1923, "step": 2074 }, { "epoch": 0.18511084348097595, "grad_norm": 1309.9111328125, "learning_rate": 3e-06, "loss": -8.1838, "step": 2075 }, { "epoch": 0.18520005352602703, "grad_norm": 1430.7237548828125, "learning_rate": 3e-06, "loss": -34.23, "step": 2076 }, { "completion_length": 127.29167175292969, "epoch": 0.1852892635710781, "grad_norm": 760.6821899414062, "learning_rate": 3e-06, "loss": 28.2148, "reward": 2.2379584312438965, "reward_std": 0.45385661721229553, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13379166647791862, "step": 2077, "zero_std_ratio": 0.0 }, { "epoch": 0.18537847361612916, "grad_norm": 1331.1378173828125, "learning_rate": 3e-06, "loss": 147.6262, "step": 2078 }, { "epoch": 0.18546768366118024, "grad_norm": 1488.8958740234375, "learning_rate": 3e-06, "loss": 170.4627, "step": 2079 }, { "epoch": 0.18555689370623132, "grad_norm": 1463.0443115234375, "learning_rate": 3e-06, "loss": 122.8418, "step": 2080 }, { "epoch": 0.1856461037512824, "grad_norm": 1204.493408203125, "learning_rate": 3e-06, "loss": 72.0359, "step": 2081 }, { "epoch": 0.18573531379633346, "grad_norm": 1609.120849609375, "learning_rate": 3e-06, "loss": 160.9981, "step": 2082 }, { "epoch": 0.18582452384138454, "grad_norm": 735.0900268554688, "learning_rate": 3e-06, "loss": 23.1434, "step": 2083 }, { "epoch": 0.18591373388643562, "grad_norm": 1322.2744140625, "learning_rate": 3e-06, "loss": 134.6119, "step": 2084 }, { "epoch": 0.1860029439314867, "grad_norm": 1587.6624755859375, "learning_rate": 3e-06, "loss": 135.2965, "step": 2085 }, { "epoch": 0.18609215397653775, "grad_norm": 1264.227294921875, "learning_rate": 3e-06, "loss": 90.2438, "step": 2086 }, { "epoch": 0.18618136402158883, "grad_norm": 769.9957275390625, "learning_rate": 3e-06, "loss": 61.2736, "step": 2087 }, { "epoch": 0.1862705740666399, "grad_norm": 1437.9520263671875, "learning_rate": 3e-06, "loss": 127.8669, "step": 2088 }, { "completion_length": 124.0, "epoch": 0.18635978411169096, "grad_norm": 2092.324462890625, "learning_rate": 3e-06, "loss": -305.053, "reward": 2.428458333015442, "reward_std": 0.4015738368034363, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1784583330154419, "step": 2089, "zero_std_ratio": 0.0 }, { "epoch": 0.18644899415674204, "grad_norm": 2426.65625, "learning_rate": 3e-06, "loss": -232.8155, "step": 2090 }, { "epoch": 0.18653820420179312, "grad_norm": 2513.125244140625, "learning_rate": 3e-06, "loss": -294.7675, "step": 2091 }, { "epoch": 0.1866274142468442, "grad_norm": 1736.9534912109375, "learning_rate": 3e-06, "loss": -373.3613, "step": 2092 }, { "epoch": 0.18671662429189526, "grad_norm": 1675.3985595703125, "learning_rate": 3e-06, "loss": -312.3737, "step": 2093 }, { "epoch": 0.18680583433694634, "grad_norm": 1661.304443359375, "learning_rate": 3e-06, "loss": -274.9287, "step": 2094 }, { "epoch": 0.18689504438199742, "grad_norm": 2010.2904052734375, "learning_rate": 3e-06, "loss": -307.0745, "step": 2095 }, { "epoch": 0.1869842544270485, "grad_norm": 2005.97119140625, "learning_rate": 3e-06, "loss": -248.6287, "step": 2096 }, { "epoch": 0.18707346447209955, "grad_norm": 2148.4443359375, "learning_rate": 3e-06, "loss": -324.0838, "step": 2097 }, { "epoch": 0.18716267451715063, "grad_norm": 1827.4686279296875, "learning_rate": 3e-06, "loss": -405.5588, "step": 2098 }, { "epoch": 0.1872518845622017, "grad_norm": 1557.4544677734375, "learning_rate": 3e-06, "loss": -350.384, "step": 2099 }, { "epoch": 0.1873410946072528, "grad_norm": 1810.8126220703125, "learning_rate": 3e-06, "loss": -321.4857, "step": 2100 }, { "completion_length": 140.9791717529297, "epoch": 0.18743030465230384, "grad_norm": 237.09336853027344, "learning_rate": 3e-06, "loss": -24.8648, "reward": 2.407604217529297, "reward_std": 0.22888919711112976, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.157604169100523, "step": 2101, "zero_std_ratio": 0.0 }, { "epoch": 0.18751951469735492, "grad_norm": 318.93609619140625, "learning_rate": 3e-06, "loss": -20.7236, "step": 2102 }, { "epoch": 0.187608724742406, "grad_norm": 286.1955261230469, "learning_rate": 3e-06, "loss": -24.869, "step": 2103 }, { "epoch": 0.18769793478745705, "grad_norm": 392.14410400390625, "learning_rate": 3e-06, "loss": -31.8759, "step": 2104 }, { "epoch": 0.18778714483250813, "grad_norm": 287.25006103515625, "learning_rate": 3e-06, "loss": -25.5882, "step": 2105 }, { "epoch": 0.18787635487755922, "grad_norm": 278.5559387207031, "learning_rate": 3e-06, "loss": -25.7843, "step": 2106 }, { "epoch": 0.1879655649226103, "grad_norm": 339.7559509277344, "learning_rate": 3e-06, "loss": -35.2184, "step": 2107 }, { "epoch": 0.18805477496766135, "grad_norm": 455.73504638671875, "learning_rate": 3e-06, "loss": -32.7528, "step": 2108 }, { "epoch": 0.18814398501271243, "grad_norm": 384.77130126953125, "learning_rate": 3e-06, "loss": -36.0195, "step": 2109 }, { "epoch": 0.1882331950577635, "grad_norm": 390.36846923828125, "learning_rate": 3e-06, "loss": -44.4518, "step": 2110 }, { "epoch": 0.1883224051028146, "grad_norm": 437.9986877441406, "learning_rate": 3e-06, "loss": -26.0731, "step": 2111 }, { "epoch": 0.18841161514786564, "grad_norm": 393.3838806152344, "learning_rate": 3e-06, "loss": -35.506, "step": 2112 }, { "completion_length": 129.43750762939453, "epoch": 0.18850082519291672, "grad_norm": 1305.8514404296875, "learning_rate": 3e-06, "loss": 110.8168, "reward": 2.460416793823242, "reward_std": 0.30926184356212616, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13749999552965164, "step": 2113, "zero_std_ratio": 0.0 }, { "epoch": 0.1885900352379678, "grad_norm": 1914.8585205078125, "learning_rate": 3e-06, "loss": 42.632, "step": 2114 }, { "epoch": 0.18867924528301888, "grad_norm": 1306.41162109375, "learning_rate": 3e-06, "loss": 60.4664, "step": 2115 }, { "epoch": 0.18876845532806993, "grad_norm": 1066.9373779296875, "learning_rate": 3e-06, "loss": 128.6848, "step": 2116 }, { "epoch": 0.18885766537312101, "grad_norm": 1232.74267578125, "learning_rate": 3e-06, "loss": 68.6546, "step": 2117 }, { "epoch": 0.1889468754181721, "grad_norm": 851.8841552734375, "learning_rate": 3e-06, "loss": 92.6301, "step": 2118 }, { "epoch": 0.18903608546322315, "grad_norm": 1269.6170654296875, "learning_rate": 3e-06, "loss": 82.624, "step": 2119 }, { "epoch": 0.18912529550827423, "grad_norm": 1477.7921142578125, "learning_rate": 3e-06, "loss": 46.4268, "step": 2120 }, { "epoch": 0.1892145055533253, "grad_norm": 1204.431640625, "learning_rate": 3e-06, "loss": 47.8916, "step": 2121 }, { "epoch": 0.1893037155983764, "grad_norm": 1141.2989501953125, "learning_rate": 3e-06, "loss": 116.479, "step": 2122 }, { "epoch": 0.18939292564342744, "grad_norm": 1084.2598876953125, "learning_rate": 3e-06, "loss": 64.8248, "step": 2123 }, { "epoch": 0.18948213568847852, "grad_norm": 913.5030517578125, "learning_rate": 3e-06, "loss": 87.6114, "step": 2124 }, { "completion_length": 133.2291717529297, "epoch": 0.1895713457335296, "grad_norm": 1788.35546875, "learning_rate": 3e-06, "loss": 181.0141, "reward": 2.338604211807251, "reward_std": 0.5955996513366699, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2136041596531868, "step": 2125, "zero_std_ratio": 0.125 }, { "epoch": 0.18966055577858068, "grad_norm": 1528.2685546875, "learning_rate": 3e-06, "loss": 107.8506, "step": 2126 }, { "epoch": 0.18974976582363173, "grad_norm": 2611.96630859375, "learning_rate": 3e-06, "loss": 53.3311, "step": 2127 }, { "epoch": 0.1898389758686828, "grad_norm": 1537.9984130859375, "learning_rate": 3e-06, "loss": 112.4603, "step": 2128 }, { "epoch": 0.1899281859137339, "grad_norm": 1148.2452392578125, "learning_rate": 3e-06, "loss": 115.4492, "step": 2129 }, { "epoch": 0.19001739595878495, "grad_norm": 1439.117919921875, "learning_rate": 3e-06, "loss": 107.7209, "step": 2130 }, { "epoch": 0.19010660600383603, "grad_norm": 1633.4248046875, "learning_rate": 3e-06, "loss": 161.3715, "step": 2131 }, { "epoch": 0.1901958160488871, "grad_norm": 1571.0533447265625, "learning_rate": 3e-06, "loss": 95.7953, "step": 2132 }, { "epoch": 0.1902850260939382, "grad_norm": 1700.539794921875, "learning_rate": 3e-06, "loss": 31.5747, "step": 2133 }, { "epoch": 0.19037423613898924, "grad_norm": 1770.425537109375, "learning_rate": 3e-06, "loss": 96.7928, "step": 2134 }, { "epoch": 0.19046344618404032, "grad_norm": 1083.177734375, "learning_rate": 3e-06, "loss": 106.8439, "step": 2135 }, { "epoch": 0.1905526562290914, "grad_norm": 1709.0933837890625, "learning_rate": 3e-06, "loss": 81.4831, "step": 2136 }, { "completion_length": 145.9166717529297, "epoch": 0.19064186627414248, "grad_norm": 1901.0673828125, "learning_rate": 3e-06, "loss": 74.4176, "reward": 2.2969167232513428, "reward_std": 0.5523957759141922, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1406666710972786, "step": 2137, "zero_std_ratio": 0.0 }, { "epoch": 0.19073107631919353, "grad_norm": 2433.760986328125, "learning_rate": 3e-06, "loss": 28.9411, "step": 2138 }, { "epoch": 0.1908202863642446, "grad_norm": 3526.50732421875, "learning_rate": 3e-06, "loss": -3.2852, "step": 2139 }, { "epoch": 0.1909094964092957, "grad_norm": 2794.43310546875, "learning_rate": 3e-06, "loss": 67.5947, "step": 2140 }, { "epoch": 0.19099870645434677, "grad_norm": 2891.499755859375, "learning_rate": 3e-06, "loss": 192.5647, "step": 2141 }, { "epoch": 0.19108791649939783, "grad_norm": 2748.095947265625, "learning_rate": 3e-06, "loss": 100.963, "step": 2142 }, { "epoch": 0.1911771265444489, "grad_norm": 1738.9039306640625, "learning_rate": 3e-06, "loss": 40.8662, "step": 2143 }, { "epoch": 0.19126633658949999, "grad_norm": 2699.06103515625, "learning_rate": 3e-06, "loss": 17.0858, "step": 2144 }, { "epoch": 0.19135554663455104, "grad_norm": 2892.302490234375, "learning_rate": 3e-06, "loss": -27.5113, "step": 2145 }, { "epoch": 0.19144475667960212, "grad_norm": 2559.01318359375, "learning_rate": 3e-06, "loss": 20.1366, "step": 2146 }, { "epoch": 0.1915339667246532, "grad_norm": 3452.719482421875, "learning_rate": 3e-06, "loss": 160.6933, "step": 2147 }, { "epoch": 0.19162317676970428, "grad_norm": 2362.385986328125, "learning_rate": 3e-06, "loss": 78.5965, "step": 2148 }, { "completion_length": 120.25000381469727, "epoch": 0.19171238681475533, "grad_norm": 1334.63623046875, "learning_rate": 3e-06, "loss": 6.4367, "reward": 2.3084168434143066, "reward_std": 0.29630675725638866, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1730000004172325, "step": 2149, "zero_std_ratio": 0.0 }, { "epoch": 0.1918015968598064, "grad_norm": 760.7504272460938, "learning_rate": 3e-06, "loss": -88.3566, "step": 2150 }, { "epoch": 0.1918908069048575, "grad_norm": 1294.0516357421875, "learning_rate": 3e-06, "loss": -41.7255, "step": 2151 }, { "epoch": 0.19198001694990857, "grad_norm": 1805.128662109375, "learning_rate": 3e-06, "loss": -19.25, "step": 2152 }, { "epoch": 0.19206922699495962, "grad_norm": 1069.1192626953125, "learning_rate": 3e-06, "loss": -69.7657, "step": 2153 }, { "epoch": 0.1921584370400107, "grad_norm": 2599.32080078125, "learning_rate": 3e-06, "loss": -44.1199, "step": 2154 }, { "epoch": 0.19224764708506178, "grad_norm": 1054.9842529296875, "learning_rate": 3e-06, "loss": -7.7095, "step": 2155 }, { "epoch": 0.19233685713011284, "grad_norm": 818.435302734375, "learning_rate": 3e-06, "loss": -102.3675, "step": 2156 }, { "epoch": 0.19242606717516392, "grad_norm": 970.5858154296875, "learning_rate": 3e-06, "loss": -53.6351, "step": 2157 }, { "epoch": 0.192515277220215, "grad_norm": 865.392822265625, "learning_rate": 3e-06, "loss": -21.3515, "step": 2158 }, { "epoch": 0.19260448726526608, "grad_norm": 981.1688232421875, "learning_rate": 3e-06, "loss": -87.6568, "step": 2159 }, { "epoch": 0.19269369731031713, "grad_norm": 975.263916015625, "learning_rate": 3e-06, "loss": -57.958, "step": 2160 }, { "completion_length": 136.45833587646484, "epoch": 0.1927829073553682, "grad_norm": 1038.02294921875, "learning_rate": 3e-06, "loss": 3.3305, "reward": 2.5663751363754272, "reward_std": 0.25874343514442444, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1497083343565464, "step": 2161, "zero_std_ratio": 0.0 }, { "epoch": 0.1928721174004193, "grad_norm": 803.9536743164062, "learning_rate": 3e-06, "loss": -15.4779, "step": 2162 }, { "epoch": 0.19296132744547037, "grad_norm": 1474.9486083984375, "learning_rate": 3e-06, "loss": 2.4125, "step": 2163 }, { "epoch": 0.19305053749052142, "grad_norm": 1137.7777099609375, "learning_rate": 3e-06, "loss": -10.7272, "step": 2164 }, { "epoch": 0.1931397475355725, "grad_norm": 1287.4840087890625, "learning_rate": 3e-06, "loss": -36.5552, "step": 2165 }, { "epoch": 0.19322895758062358, "grad_norm": 750.6218872070312, "learning_rate": 3e-06, "loss": 28.6985, "step": 2166 }, { "epoch": 0.19331816762567466, "grad_norm": 1729.5426025390625, "learning_rate": 3e-06, "loss": -13.0334, "step": 2167 }, { "epoch": 0.19340737767072572, "grad_norm": 919.8224487304688, "learning_rate": 3e-06, "loss": -28.3112, "step": 2168 }, { "epoch": 0.1934965877157768, "grad_norm": 1243.5130615234375, "learning_rate": 3e-06, "loss": -7.4271, "step": 2169 }, { "epoch": 0.19358579776082788, "grad_norm": 897.3583374023438, "learning_rate": 3e-06, "loss": -19.3195, "step": 2170 }, { "epoch": 0.19367500780587893, "grad_norm": 1540.0894775390625, "learning_rate": 3e-06, "loss": -66.3459, "step": 2171 }, { "epoch": 0.19376421785093, "grad_norm": 910.990966796875, "learning_rate": 3e-06, "loss": 19.7238, "step": 2172 }, { "completion_length": 116.02083587646484, "epoch": 0.1938534278959811, "grad_norm": 1810.232666015625, "learning_rate": 3e-06, "loss": 39.429, "reward": 2.171999931335449, "reward_std": 0.5019665211439133, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2240833342075348, "step": 2173, "zero_std_ratio": 0.0 }, { "epoch": 0.19394263794103217, "grad_norm": 1911.38525390625, "learning_rate": 3e-06, "loss": -56.2197, "step": 2174 }, { "epoch": 0.19403184798608322, "grad_norm": 1706.150146484375, "learning_rate": 3e-06, "loss": 29.7264, "step": 2175 }, { "epoch": 0.1941210580311343, "grad_norm": 2385.930419921875, "learning_rate": 3e-06, "loss": 32.8588, "step": 2176 }, { "epoch": 0.19421026807618538, "grad_norm": 1925.345947265625, "learning_rate": 3e-06, "loss": 30.2097, "step": 2177 }, { "epoch": 0.19429947812123646, "grad_norm": 1719.54052734375, "learning_rate": 3e-06, "loss": 141.0592, "step": 2178 }, { "epoch": 0.19438868816628752, "grad_norm": 1638.2069091796875, "learning_rate": 3e-06, "loss": 20.0159, "step": 2179 }, { "epoch": 0.1944778982113386, "grad_norm": 2645.457763671875, "learning_rate": 3e-06, "loss": -39.4086, "step": 2180 }, { "epoch": 0.19456710825638968, "grad_norm": 1837.9119873046875, "learning_rate": 3e-06, "loss": 13.8234, "step": 2181 }, { "epoch": 0.19465631830144076, "grad_norm": 1472.4017333984375, "learning_rate": 3e-06, "loss": 23.4693, "step": 2182 }, { "epoch": 0.1947455283464918, "grad_norm": 2043.978759765625, "learning_rate": 3e-06, "loss": 6.3195, "step": 2183 }, { "epoch": 0.1948347383915429, "grad_norm": 2784.806640625, "learning_rate": 3e-06, "loss": 114.7516, "step": 2184 }, { "completion_length": 148.5, "epoch": 0.19492394843659397, "grad_norm": 2347.893798828125, "learning_rate": 3e-06, "loss": 152.3218, "reward": 2.072833240032196, "reward_std": 0.3999442011117935, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.176999993622303, "step": 2185, "zero_std_ratio": 0.125 }, { "epoch": 0.19501315848164502, "grad_norm": 1482.416015625, "learning_rate": 3e-06, "loss": 80.5951, "step": 2186 }, { "epoch": 0.1951023685266961, "grad_norm": 1534.2724609375, "learning_rate": 3e-06, "loss": 75.3633, "step": 2187 }, { "epoch": 0.19519157857174718, "grad_norm": 1607.8106689453125, "learning_rate": 3e-06, "loss": 60.7631, "step": 2188 }, { "epoch": 0.19528078861679826, "grad_norm": 2337.1337890625, "learning_rate": 3e-06, "loss": -126.4198, "step": 2189 }, { "epoch": 0.19536999866184931, "grad_norm": 1377.0809326171875, "learning_rate": 3e-06, "loss": 133.5221, "step": 2190 }, { "epoch": 0.1954592087069004, "grad_norm": 1643.5670166015625, "learning_rate": 3e-06, "loss": 132.6571, "step": 2191 }, { "epoch": 0.19554841875195147, "grad_norm": 1409.347412109375, "learning_rate": 3e-06, "loss": 71.3489, "step": 2192 }, { "epoch": 0.19563762879700256, "grad_norm": 1953.61328125, "learning_rate": 3e-06, "loss": 63.8237, "step": 2193 }, { "epoch": 0.1957268388420536, "grad_norm": 1273.4095458984375, "learning_rate": 3e-06, "loss": 52.83, "step": 2194 }, { "epoch": 0.1958160488871047, "grad_norm": 2235.899169921875, "learning_rate": 3e-06, "loss": -132.9868, "step": 2195 }, { "epoch": 0.19590525893215577, "grad_norm": 2546.96435546875, "learning_rate": 3e-06, "loss": 114.6427, "step": 2196 }, { "completion_length": 124.45833587646484, "epoch": 0.19599446897720682, "grad_norm": 1299.908203125, "learning_rate": 3e-06, "loss": 25.437, "reward": 1.7506250143051147, "reward_std": 0.45970311760902405, "rewards/correctness_reward_func": 1.1250000298023224, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17770832777023315, "step": 2197, "zero_std_ratio": 0.125 }, { "epoch": 0.1960836790222579, "grad_norm": 1442.3870849609375, "learning_rate": 3e-06, "loss": 86.5591, "step": 2198 }, { "epoch": 0.19617288906730898, "grad_norm": 855.531982421875, "learning_rate": 3e-06, "loss": 76.0765, "step": 2199 }, { "epoch": 0.19626209911236006, "grad_norm": 870.2618408203125, "learning_rate": 3e-06, "loss": 80.9394, "step": 2200 }, { "epoch": 0.1963513091574111, "grad_norm": 1157.236572265625, "learning_rate": 3e-06, "loss": 86.6789, "step": 2201 }, { "epoch": 0.1964405192024622, "grad_norm": 1135.1353759765625, "learning_rate": 3e-06, "loss": 36.164, "step": 2202 }, { "epoch": 0.19652972924751327, "grad_norm": 1008.8060913085938, "learning_rate": 3e-06, "loss": 10.2971, "step": 2203 }, { "epoch": 0.19661893929256435, "grad_norm": 1482.61083984375, "learning_rate": 3e-06, "loss": 68.7021, "step": 2204 }, { "epoch": 0.1967081493376154, "grad_norm": 798.7431640625, "learning_rate": 3e-06, "loss": 60.4609, "step": 2205 }, { "epoch": 0.1967973593826665, "grad_norm": 971.9412231445312, "learning_rate": 3e-06, "loss": 71.1141, "step": 2206 }, { "epoch": 0.19688656942771757, "grad_norm": 1165.7591552734375, "learning_rate": 3e-06, "loss": 81.0864, "step": 2207 }, { "epoch": 0.19697577947276865, "grad_norm": 1213.063720703125, "learning_rate": 3e-06, "loss": 22.4711, "step": 2208 }, { "completion_length": 160.8541717529297, "epoch": 0.1970649895178197, "grad_norm": 1580.6722412109375, "learning_rate": 3e-06, "loss": -274.6722, "reward": 1.804187536239624, "reward_std": 0.7367371022701263, "rewards/correctness_reward_func": 1.2916666269302368, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09585416316986084, "step": 2209, "zero_std_ratio": 0.0 }, { "epoch": 0.19715419956287078, "grad_norm": 1305.8641357421875, "learning_rate": 3e-06, "loss": -286.2726, "step": 2210 }, { "epoch": 0.19724340960792186, "grad_norm": 3519.257080078125, "learning_rate": 3e-06, "loss": -514.0822, "step": 2211 }, { "epoch": 0.1973326196529729, "grad_norm": 4824.2421875, "learning_rate": 3e-06, "loss": -367.1955, "step": 2212 }, { "epoch": 0.197421829698024, "grad_norm": 2674.205322265625, "learning_rate": 3e-06, "loss": -301.9088, "step": 2213 }, { "epoch": 0.19751103974307507, "grad_norm": 1980.4049072265625, "learning_rate": 3e-06, "loss": -304.9702, "step": 2214 }, { "epoch": 0.19760024978812615, "grad_norm": 2130.482666015625, "learning_rate": 3e-06, "loss": -305.0632, "step": 2215 }, { "epoch": 0.1976894598331772, "grad_norm": 1712.863525390625, "learning_rate": 3e-06, "loss": -309.973, "step": 2216 }, { "epoch": 0.19777866987822829, "grad_norm": 2488.321533203125, "learning_rate": 3e-06, "loss": -574.7097, "step": 2217 }, { "epoch": 0.19786787992327937, "grad_norm": 2911.133544921875, "learning_rate": 3e-06, "loss": -474.2518, "step": 2218 }, { "epoch": 0.19795708996833045, "grad_norm": 4416.62841796875, "learning_rate": 3e-06, "loss": -422.1913, "step": 2219 }, { "epoch": 0.1980463000133815, "grad_norm": 4086.730224609375, "learning_rate": 3e-06, "loss": -435.4988, "step": 2220 }, { "completion_length": 102.9375, "epoch": 0.19813551005843258, "grad_norm": 803.6179809570312, "learning_rate": 3e-06, "loss": -18.5715, "reward": 2.2100417613983154, "reward_std": 0.26370862126350403, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23087499290704727, "step": 2221, "zero_std_ratio": 0.0 }, { "epoch": 0.19822472010348366, "grad_norm": 617.244384765625, "learning_rate": 3e-06, "loss": -24.6303, "step": 2222 }, { "epoch": 0.1983139301485347, "grad_norm": 502.6885986328125, "learning_rate": 3e-06, "loss": -31.1001, "step": 2223 }, { "epoch": 0.1984031401935858, "grad_norm": 698.3114013671875, "learning_rate": 3e-06, "loss": -35.3177, "step": 2224 }, { "epoch": 0.19849235023863687, "grad_norm": 618.0604858398438, "learning_rate": 3e-06, "loss": -30.079, "step": 2225 }, { "epoch": 0.19858156028368795, "grad_norm": 484.8530578613281, "learning_rate": 3e-06, "loss": -37.8852, "step": 2226 }, { "epoch": 0.198670770328739, "grad_norm": 948.900634765625, "learning_rate": 3e-06, "loss": -15.1709, "step": 2227 }, { "epoch": 0.19875998037379008, "grad_norm": 609.5656127929688, "learning_rate": 3e-06, "loss": -37.6612, "step": 2228 }, { "epoch": 0.19884919041884117, "grad_norm": 672.0021362304688, "learning_rate": 3e-06, "loss": -49.1828, "step": 2229 }, { "epoch": 0.19893840046389225, "grad_norm": 888.1821899414062, "learning_rate": 3e-06, "loss": -55.2511, "step": 2230 }, { "epoch": 0.1990276105089433, "grad_norm": 811.3577880859375, "learning_rate": 3e-06, "loss": -45.7303, "step": 2231 }, { "epoch": 0.19911682055399438, "grad_norm": 717.1616821289062, "learning_rate": 3e-06, "loss": -51.4052, "step": 2232 }, { "completion_length": 140.27084350585938, "epoch": 0.19920603059904546, "grad_norm": 2046.7083740234375, "learning_rate": 3e-06, "loss": 109.0772, "reward": 2.342104196548462, "reward_std": 0.5653499662876129, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17543750256299973, "step": 2233, "zero_std_ratio": 0.0 }, { "epoch": 0.19929524064409654, "grad_norm": 2079.919189453125, "learning_rate": 3e-06, "loss": 253.4095, "step": 2234 }, { "epoch": 0.1993844506891476, "grad_norm": 2711.693115234375, "learning_rate": 3e-06, "loss": 172.1419, "step": 2235 }, { "epoch": 0.19947366073419867, "grad_norm": 2554.647705078125, "learning_rate": 3e-06, "loss": 212.6262, "step": 2236 }, { "epoch": 0.19956287077924975, "grad_norm": 2302.18212890625, "learning_rate": 3e-06, "loss": 216.2501, "step": 2237 }, { "epoch": 0.1996520808243008, "grad_norm": 2337.755126953125, "learning_rate": 3e-06, "loss": 82.4594, "step": 2238 }, { "epoch": 0.19974129086935188, "grad_norm": 2143.60791015625, "learning_rate": 3e-06, "loss": 100.3759, "step": 2239 }, { "epoch": 0.19983050091440296, "grad_norm": 2368.834228515625, "learning_rate": 3e-06, "loss": 246.018, "step": 2240 }, { "epoch": 0.19991971095945404, "grad_norm": 2243.505615234375, "learning_rate": 3e-06, "loss": 138.0225, "step": 2241 }, { "epoch": 0.2000089210045051, "grad_norm": 2373.282470703125, "learning_rate": 3e-06, "loss": 180.4512, "step": 2242 }, { "epoch": 0.20009813104955618, "grad_norm": 2514.03955078125, "learning_rate": 3e-06, "loss": 167.8582, "step": 2243 }, { "epoch": 0.20018734109460726, "grad_norm": 1966.1226806640625, "learning_rate": 3e-06, "loss": 61.633, "step": 2244 }, { "completion_length": 111.00000381469727, "epoch": 0.20027655113965834, "grad_norm": 1399.266845703125, "learning_rate": 3e-06, "loss": -45.4763, "reward": 2.3000833988189697, "reward_std": 0.5667559206485748, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19591666013002396, "step": 2245, "zero_std_ratio": 0.0 }, { "epoch": 0.2003657611847094, "grad_norm": 1209.40625, "learning_rate": 3e-06, "loss": -77.9653, "step": 2246 }, { "epoch": 0.20045497122976047, "grad_norm": 1058.274658203125, "learning_rate": 3e-06, "loss": -114.5617, "step": 2247 }, { "epoch": 0.20054418127481155, "grad_norm": 1116.7257080078125, "learning_rate": 3e-06, "loss": -100.1849, "step": 2248 }, { "epoch": 0.2006333913198626, "grad_norm": 2506.532958984375, "learning_rate": 3e-06, "loss": -81.4074, "step": 2249 }, { "epoch": 0.20072260136491368, "grad_norm": 1058.7152099609375, "learning_rate": 3e-06, "loss": -129.5974, "step": 2250 }, { "epoch": 0.20081181140996476, "grad_norm": 1346.8084716796875, "learning_rate": 3e-06, "loss": -49.5752, "step": 2251 }, { "epoch": 0.20090102145501584, "grad_norm": 1400.63525390625, "learning_rate": 3e-06, "loss": -80.931, "step": 2252 }, { "epoch": 0.2009902315000669, "grad_norm": 979.59228515625, "learning_rate": 3e-06, "loss": -125.4819, "step": 2253 }, { "epoch": 0.20107944154511798, "grad_norm": 1141.697265625, "learning_rate": 3e-06, "loss": -114.8348, "step": 2254 }, { "epoch": 0.20116865159016906, "grad_norm": 1260.422607421875, "learning_rate": 3e-06, "loss": -92.5143, "step": 2255 }, { "epoch": 0.20125786163522014, "grad_norm": 1216.84228515625, "learning_rate": 3e-06, "loss": -142.977, "step": 2256 }, { "completion_length": 123.4375, "epoch": 0.2013470716802712, "grad_norm": 1664.2445068359375, "learning_rate": 3e-06, "loss": -129.5342, "reward": 2.551270842552185, "reward_std": 0.28455013036727905, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.18668748438358307, "step": 2257, "zero_std_ratio": 0.0 }, { "epoch": 0.20143628172532227, "grad_norm": 2019.86083984375, "learning_rate": 3e-06, "loss": -188.5191, "step": 2258 }, { "epoch": 0.20152549177037335, "grad_norm": 1595.3583984375, "learning_rate": 3e-06, "loss": -125.4679, "step": 2259 }, { "epoch": 0.20161470181542443, "grad_norm": 2631.081787109375, "learning_rate": 3e-06, "loss": -152.7189, "step": 2260 }, { "epoch": 0.20170391186047548, "grad_norm": 1726.4815673828125, "learning_rate": 3e-06, "loss": -96.2797, "step": 2261 }, { "epoch": 0.20179312190552656, "grad_norm": 1873.742431640625, "learning_rate": 3e-06, "loss": -135.3711, "step": 2262 }, { "epoch": 0.20188233195057764, "grad_norm": 1621.1622314453125, "learning_rate": 3e-06, "loss": -140.641, "step": 2263 }, { "epoch": 0.2019715419956287, "grad_norm": 1722.911865234375, "learning_rate": 3e-06, "loss": -196.7486, "step": 2264 }, { "epoch": 0.20206075204067978, "grad_norm": 1306.2991943359375, "learning_rate": 3e-06, "loss": -128.2448, "step": 2265 }, { "epoch": 0.20214996208573086, "grad_norm": 2126.202392578125, "learning_rate": 3e-06, "loss": -168.1409, "step": 2266 }, { "epoch": 0.20223917213078194, "grad_norm": 1632.1204833984375, "learning_rate": 3e-06, "loss": -104.4657, "step": 2267 }, { "epoch": 0.202328382175833, "grad_norm": 1830.1990966796875, "learning_rate": 3e-06, "loss": -141.1356, "step": 2268 }, { "completion_length": 123.02083587646484, "epoch": 0.20241759222088407, "grad_norm": 1051.958251953125, "learning_rate": 3e-06, "loss": 63.9873, "reward": 2.0415000915527344, "reward_std": 0.3290978819131851, "rewards/correctness_reward_func": 1.3750000298023224, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1977500021457672, "step": 2269, "zero_std_ratio": 0.0 }, { "epoch": 0.20250680226593515, "grad_norm": 1236.19775390625, "learning_rate": 3e-06, "loss": 99.0337, "step": 2270 }, { "epoch": 0.20259601231098623, "grad_norm": 937.3912353515625, "learning_rate": 3e-06, "loss": 65.646, "step": 2271 }, { "epoch": 0.20268522235603728, "grad_norm": 1138.8841552734375, "learning_rate": 3e-06, "loss": 50.2982, "step": 2272 }, { "epoch": 0.20277443240108836, "grad_norm": 1543.039794921875, "learning_rate": 3e-06, "loss": 105.3749, "step": 2273 }, { "epoch": 0.20286364244613944, "grad_norm": 913.1903076171875, "learning_rate": 3e-06, "loss": 74.6702, "step": 2274 }, { "epoch": 0.20295285249119052, "grad_norm": 916.6128540039062, "learning_rate": 3e-06, "loss": 57.9423, "step": 2275 }, { "epoch": 0.20304206253624157, "grad_norm": 1266.6317138671875, "learning_rate": 3e-06, "loss": 95.4216, "step": 2276 }, { "epoch": 0.20313127258129265, "grad_norm": 932.1137084960938, "learning_rate": 3e-06, "loss": 61.0586, "step": 2277 }, { "epoch": 0.20322048262634373, "grad_norm": 1065.530029296875, "learning_rate": 3e-06, "loss": 39.2222, "step": 2278 }, { "epoch": 0.2033096926713948, "grad_norm": 1048.569580078125, "learning_rate": 3e-06, "loss": 95.8649, "step": 2279 }, { "epoch": 0.20339890271644587, "grad_norm": 975.1697387695312, "learning_rate": 3e-06, "loss": 64.6243, "step": 2280 }, { "completion_length": 123.35417175292969, "epoch": 0.20348811276149695, "grad_norm": 2359.048095703125, "learning_rate": 3e-06, "loss": -129.0191, "reward": 1.7423540949821472, "reward_std": 0.6932721436023712, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1902708262205124, "step": 2281, "zero_std_ratio": 0.0 }, { "epoch": 0.20357732280654803, "grad_norm": 2037.848388671875, "learning_rate": 3e-06, "loss": 32.7358, "step": 2282 }, { "epoch": 0.20366653285159908, "grad_norm": 2450.302978515625, "learning_rate": 3e-06, "loss": -88.7673, "step": 2283 }, { "epoch": 0.20375574289665016, "grad_norm": 1783.2620849609375, "learning_rate": 3e-06, "loss": -99.9251, "step": 2284 }, { "epoch": 0.20384495294170124, "grad_norm": 1748.13330078125, "learning_rate": 3e-06, "loss": -53.0798, "step": 2285 }, { "epoch": 0.20393416298675232, "grad_norm": 2280.80810546875, "learning_rate": 3e-06, "loss": -79.2135, "step": 2286 }, { "epoch": 0.20402337303180337, "grad_norm": 2504.2412109375, "learning_rate": 3e-06, "loss": -153.4697, "step": 2287 }, { "epoch": 0.20411258307685445, "grad_norm": 2130.119873046875, "learning_rate": 3e-06, "loss": 18.6636, "step": 2288 }, { "epoch": 0.20420179312190553, "grad_norm": 2430.63818359375, "learning_rate": 3e-06, "loss": -116.2015, "step": 2289 }, { "epoch": 0.2042910031669566, "grad_norm": 2750.778076171875, "learning_rate": 3e-06, "loss": -129.5705, "step": 2290 }, { "epoch": 0.20438021321200767, "grad_norm": 1790.766845703125, "learning_rate": 3e-06, "loss": -95.7831, "step": 2291 }, { "epoch": 0.20446942325705875, "grad_norm": 2436.076171875, "learning_rate": 3e-06, "loss": -110.9333, "step": 2292 }, { "completion_length": 126.64583969116211, "epoch": 0.20455863330210983, "grad_norm": 1839.115966796875, "learning_rate": 3e-06, "loss": -97.4442, "reward": 2.2966668605804443, "reward_std": 0.4922372102737427, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17166666314005852, "step": 2293, "zero_std_ratio": 0.0 }, { "epoch": 0.20464784334716088, "grad_norm": 1397.8768310546875, "learning_rate": 3e-06, "loss": -101.0346, "step": 2294 }, { "epoch": 0.20473705339221196, "grad_norm": 1341.2484130859375, "learning_rate": 3e-06, "loss": -247.112, "step": 2295 }, { "epoch": 0.20482626343726304, "grad_norm": 2028.7335205078125, "learning_rate": 3e-06, "loss": -72.8786, "step": 2296 }, { "epoch": 0.20491547348231412, "grad_norm": 1479.264892578125, "learning_rate": 3e-06, "loss": -213.6321, "step": 2297 }, { "epoch": 0.20500468352736517, "grad_norm": 1442.75927734375, "learning_rate": 3e-06, "loss": -161.4184, "step": 2298 }, { "epoch": 0.20509389357241625, "grad_norm": 1903.4906005859375, "learning_rate": 3e-06, "loss": -115.0706, "step": 2299 }, { "epoch": 0.20518310361746733, "grad_norm": 1373.6041259765625, "learning_rate": 3e-06, "loss": -128.6296, "step": 2300 }, { "epoch": 0.2052723136625184, "grad_norm": 1395.6331787109375, "learning_rate": 3e-06, "loss": -256.6596, "step": 2301 }, { "epoch": 0.20536152370756947, "grad_norm": 1820.6612548828125, "learning_rate": 3e-06, "loss": -86.9157, "step": 2302 }, { "epoch": 0.20545073375262055, "grad_norm": 1319.0960693359375, "learning_rate": 3e-06, "loss": -233.4003, "step": 2303 }, { "epoch": 0.20553994379767163, "grad_norm": 1100.8870849609375, "learning_rate": 3e-06, "loss": -176.9607, "step": 2304 }, { "completion_length": 130.87500381469727, "epoch": 0.20562915384272268, "grad_norm": 790.4862670898438, "learning_rate": 3e-06, "loss": 48.6722, "reward": 2.4920207262039185, "reward_std": 0.28309868834912777, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16910415515303612, "step": 2305, "zero_std_ratio": 0.0 }, { "epoch": 0.20571836388777376, "grad_norm": 699.5379028320312, "learning_rate": 3e-06, "loss": 23.5835, "step": 2306 }, { "epoch": 0.20580757393282484, "grad_norm": 664.294921875, "learning_rate": 3e-06, "loss": 62.6984, "step": 2307 }, { "epoch": 0.20589678397787592, "grad_norm": 814.1132202148438, "learning_rate": 3e-06, "loss": 58.8113, "step": 2308 }, { "epoch": 0.20598599402292697, "grad_norm": 621.1661376953125, "learning_rate": 3e-06, "loss": 17.8389, "step": 2309 }, { "epoch": 0.20607520406797805, "grad_norm": 754.7105102539062, "learning_rate": 3e-06, "loss": 43.741, "step": 2310 }, { "epoch": 0.20616441411302913, "grad_norm": 765.5684814453125, "learning_rate": 3e-06, "loss": 33.7028, "step": 2311 }, { "epoch": 0.2062536241580802, "grad_norm": 469.3419189453125, "learning_rate": 3e-06, "loss": 9.6402, "step": 2312 }, { "epoch": 0.20634283420313126, "grad_norm": 575.3997192382812, "learning_rate": 3e-06, "loss": 43.3943, "step": 2313 }, { "epoch": 0.20643204424818234, "grad_norm": 636.8104858398438, "learning_rate": 3e-06, "loss": 39.065, "step": 2314 }, { "epoch": 0.20652125429323342, "grad_norm": 401.0857849121094, "learning_rate": 3e-06, "loss": 3.814, "step": 2315 }, { "epoch": 0.20661046433828448, "grad_norm": 442.88726806640625, "learning_rate": 3e-06, "loss": 20.3367, "step": 2316 }, { "completion_length": 109.10417175292969, "epoch": 0.20669967438333556, "grad_norm": 1531.75146484375, "learning_rate": 3e-06, "loss": 87.3807, "reward": 2.5790417194366455, "reward_std": 0.30080675333738327, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20404167473316193, "step": 2317, "zero_std_ratio": 0.125 }, { "epoch": 0.20678888442838664, "grad_norm": 1411.746826171875, "learning_rate": 3e-06, "loss": 71.0098, "step": 2318 }, { "epoch": 0.20687809447343772, "grad_norm": 1122.78759765625, "learning_rate": 3e-06, "loss": 74.4398, "step": 2319 }, { "epoch": 0.20696730451848877, "grad_norm": 1717.5146484375, "learning_rate": 3e-06, "loss": 74.3571, "step": 2320 }, { "epoch": 0.20705651456353985, "grad_norm": 1425.1131591796875, "learning_rate": 3e-06, "loss": 7.1953, "step": 2321 }, { "epoch": 0.20714572460859093, "grad_norm": 1329.4161376953125, "learning_rate": 3e-06, "loss": 84.9138, "step": 2322 }, { "epoch": 0.207234934653642, "grad_norm": 881.3815307617188, "learning_rate": 3e-06, "loss": 59.1147, "step": 2323 }, { "epoch": 0.20732414469869306, "grad_norm": 1501.871826171875, "learning_rate": 3e-06, "loss": 47.1585, "step": 2324 }, { "epoch": 0.20741335474374414, "grad_norm": 786.2328491210938, "learning_rate": 3e-06, "loss": 46.2053, "step": 2325 }, { "epoch": 0.20750256478879522, "grad_norm": 744.2485961914062, "learning_rate": 3e-06, "loss": 39.6594, "step": 2326 }, { "epoch": 0.2075917748338463, "grad_norm": 752.8430786132812, "learning_rate": 3e-06, "loss": 11.047, "step": 2327 }, { "epoch": 0.20768098487889736, "grad_norm": 740.0889892578125, "learning_rate": 3e-06, "loss": 47.685, "step": 2328 }, { "completion_length": 122.72916793823242, "epoch": 0.20777019492394844, "grad_norm": 159.8445587158203, "learning_rate": 3e-06, "loss": -1.327, "reward": 2.3375418186187744, "reward_std": 0.14845435763709247, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17087500542402267, "step": 2329, "zero_std_ratio": 0.0 }, { "epoch": 0.20785940496899952, "grad_norm": 233.4280548095703, "learning_rate": 3e-06, "loss": 13.7035, "step": 2330 }, { "epoch": 0.20794861501405057, "grad_norm": 190.21124267578125, "learning_rate": 3e-06, "loss": 2.0237, "step": 2331 }, { "epoch": 0.20803782505910165, "grad_norm": 202.0067901611328, "learning_rate": 3e-06, "loss": 7.6713, "step": 2332 }, { "epoch": 0.20812703510415273, "grad_norm": 118.27416229248047, "learning_rate": 3e-06, "loss": 0.5948, "step": 2333 }, { "epoch": 0.2082162451492038, "grad_norm": 205.14324951171875, "learning_rate": 3e-06, "loss": 1.5616, "step": 2334 }, { "epoch": 0.20830545519425486, "grad_norm": 115.84490203857422, "learning_rate": 3e-06, "loss": -3.7101, "step": 2335 }, { "epoch": 0.20839466523930594, "grad_norm": 152.4305877685547, "learning_rate": 3e-06, "loss": 6.609, "step": 2336 }, { "epoch": 0.20848387528435702, "grad_norm": 93.83056640625, "learning_rate": 3e-06, "loss": -2.1607, "step": 2337 }, { "epoch": 0.2085730853294081, "grad_norm": 186.54685974121094, "learning_rate": 3e-06, "loss": 2.1806, "step": 2338 }, { "epoch": 0.20866229537445916, "grad_norm": 103.79989624023438, "learning_rate": 3e-06, "loss": -1.846, "step": 2339 }, { "epoch": 0.20875150541951024, "grad_norm": 98.66133117675781, "learning_rate": 3e-06, "loss": -3.5675, "step": 2340 }, { "completion_length": 128.33333587646484, "epoch": 0.20884071546456132, "grad_norm": 508.50653076171875, "learning_rate": 3e-06, "loss": 18.0618, "reward": 2.2047917246818542, "reward_std": 0.27334894239902496, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16312500834465027, "step": 2341, "zero_std_ratio": 0.0 }, { "epoch": 0.20892992550961237, "grad_norm": 535.1369018554688, "learning_rate": 3e-06, "loss": 25.3646, "step": 2342 }, { "epoch": 0.20901913555466345, "grad_norm": 366.54559326171875, "learning_rate": 3e-06, "loss": 4.7151, "step": 2343 }, { "epoch": 0.20910834559971453, "grad_norm": 501.6902160644531, "learning_rate": 3e-06, "loss": -1.087, "step": 2344 }, { "epoch": 0.2091975556447656, "grad_norm": 443.63360595703125, "learning_rate": 3e-06, "loss": 8.3783, "step": 2345 }, { "epoch": 0.20928676568981666, "grad_norm": 368.2359313964844, "learning_rate": 3e-06, "loss": 2.4934, "step": 2346 }, { "epoch": 0.20937597573486774, "grad_norm": 362.5126647949219, "learning_rate": 3e-06, "loss": 8.2788, "step": 2347 }, { "epoch": 0.20946518577991882, "grad_norm": 331.4849548339844, "learning_rate": 3e-06, "loss": 11.4291, "step": 2348 }, { "epoch": 0.2095543958249699, "grad_norm": 373.8135070800781, "learning_rate": 3e-06, "loss": -2.6253, "step": 2349 }, { "epoch": 0.20964360587002095, "grad_norm": 359.12152099609375, "learning_rate": 3e-06, "loss": -9.531, "step": 2350 }, { "epoch": 0.20973281591507204, "grad_norm": 387.0042724609375, "learning_rate": 3e-06, "loss": -0.5337, "step": 2351 }, { "epoch": 0.20982202596012312, "grad_norm": 312.2693786621094, "learning_rate": 3e-06, "loss": -4.1708, "step": 2352 }, { "completion_length": 140.35416793823242, "epoch": 0.2099112360051742, "grad_norm": 3304.60009765625, "learning_rate": 3e-06, "loss": -79.5805, "reward": 2.270458459854126, "reward_std": 0.6086882501840591, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11420833691954613, "step": 2353, "zero_std_ratio": 0.0 }, { "epoch": 0.21000044605022525, "grad_norm": 555.783203125, "learning_rate": 3e-06, "loss": -26.9599, "step": 2354 }, { "epoch": 0.21008965609527633, "grad_norm": 310.6244201660156, "learning_rate": 3e-06, "loss": -20.3898, "step": 2355 }, { "epoch": 0.2101788661403274, "grad_norm": 442.4192810058594, "learning_rate": 3e-06, "loss": -6.1292, "step": 2356 }, { "epoch": 0.21026807618537846, "grad_norm": 380.87841796875, "learning_rate": 3e-06, "loss": -28.3523, "step": 2357 }, { "epoch": 0.21035728623042954, "grad_norm": 546.93310546875, "learning_rate": 3e-06, "loss": -13.9403, "step": 2358 }, { "epoch": 0.21044649627548062, "grad_norm": 1282.0760498046875, "learning_rate": 3e-06, "loss": -75.0629, "step": 2359 }, { "epoch": 0.2105357063205317, "grad_norm": 411.43829345703125, "learning_rate": 3e-06, "loss": -26.5976, "step": 2360 }, { "epoch": 0.21062491636558275, "grad_norm": 287.0208435058594, "learning_rate": 3e-06, "loss": -20.6849, "step": 2361 }, { "epoch": 0.21071412641063383, "grad_norm": 571.956787109375, "learning_rate": 3e-06, "loss": -7.8208, "step": 2362 }, { "epoch": 0.21080333645568491, "grad_norm": 343.234130859375, "learning_rate": 3e-06, "loss": -29.511, "step": 2363 }, { "epoch": 0.210892546500736, "grad_norm": 552.2923583984375, "learning_rate": 3e-06, "loss": -17.5949, "step": 2364 }, { "completion_length": 126.52083587646484, "epoch": 0.21098175654578705, "grad_norm": 243.8163299560547, "learning_rate": 3e-06, "loss": 4.4713, "reward": 2.246416687965393, "reward_std": 0.34428833425045013, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15266667306423187, "step": 2365, "zero_std_ratio": 0.0 }, { "epoch": 0.21107096659083813, "grad_norm": 106.08045959472656, "learning_rate": 3e-06, "loss": 2.346, "step": 2366 }, { "epoch": 0.2111601766358892, "grad_norm": 143.98031616210938, "learning_rate": 3e-06, "loss": 3.9441, "step": 2367 }, { "epoch": 0.2112493866809403, "grad_norm": 106.59970092773438, "learning_rate": 3e-06, "loss": 0.7764, "step": 2368 }, { "epoch": 0.21133859672599134, "grad_norm": 135.23463439941406, "learning_rate": 3e-06, "loss": 5.1002, "step": 2369 }, { "epoch": 0.21142780677104242, "grad_norm": 195.45408630371094, "learning_rate": 3e-06, "loss": 14.522, "step": 2370 }, { "epoch": 0.2115170168160935, "grad_norm": 742.7892456054688, "learning_rate": 3e-06, "loss": 1.9066, "step": 2371 }, { "epoch": 0.21160622686114455, "grad_norm": 128.09744262695312, "learning_rate": 3e-06, "loss": 1.677, "step": 2372 }, { "epoch": 0.21169543690619563, "grad_norm": 118.60245513916016, "learning_rate": 3e-06, "loss": 1.7248, "step": 2373 }, { "epoch": 0.2117846469512467, "grad_norm": 97.40567779541016, "learning_rate": 3e-06, "loss": 0.1819, "step": 2374 }, { "epoch": 0.2118738569962978, "grad_norm": 134.66766357421875, "learning_rate": 3e-06, "loss": 2.7604, "step": 2375 }, { "epoch": 0.21196306704134885, "grad_norm": 491.87567138671875, "learning_rate": 3e-06, "loss": 12.4578, "step": 2376 }, { "completion_length": 113.10416793823242, "epoch": 0.21205227708639993, "grad_norm": 465.9616394042969, "learning_rate": 3e-06, "loss": -32.9042, "reward": 2.663749933242798, "reward_std": 0.13261950528249145, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20541664958000183, "step": 2377, "zero_std_ratio": 0.125 }, { "epoch": 0.212141487131451, "grad_norm": 558.760009765625, "learning_rate": 3e-06, "loss": -9.7375, "step": 2378 }, { "epoch": 0.2122306971765021, "grad_norm": 437.1868591308594, "learning_rate": 3e-06, "loss": -20.5382, "step": 2379 }, { "epoch": 0.21231990722155314, "grad_norm": 571.3277587890625, "learning_rate": 3e-06, "loss": -9.5875, "step": 2380 }, { "epoch": 0.21240911726660422, "grad_norm": 460.2364501953125, "learning_rate": 3e-06, "loss": -7.0278, "step": 2381 }, { "epoch": 0.2124983273116553, "grad_norm": 801.8722534179688, "learning_rate": 3e-06, "loss": -42.3189, "step": 2382 }, { "epoch": 0.21258753735670635, "grad_norm": 486.27691650390625, "learning_rate": 3e-06, "loss": -38.3283, "step": 2383 }, { "epoch": 0.21267674740175743, "grad_norm": 633.953125, "learning_rate": 3e-06, "loss": -15.9907, "step": 2384 }, { "epoch": 0.2127659574468085, "grad_norm": 500.3168640136719, "learning_rate": 3e-06, "loss": -26.8408, "step": 2385 }, { "epoch": 0.2128551674918596, "grad_norm": 779.6222534179688, "learning_rate": 3e-06, "loss": -14.4924, "step": 2386 }, { "epoch": 0.21294437753691065, "grad_norm": 469.1849670410156, "learning_rate": 3e-06, "loss": -10.9935, "step": 2387 }, { "epoch": 0.21303358758196173, "grad_norm": 550.7216796875, "learning_rate": 3e-06, "loss": -53.6791, "step": 2388 }, { "completion_length": 135.33333587646484, "epoch": 0.2131227976270128, "grad_norm": 290.79766845703125, "learning_rate": 3e-06, "loss": 9.7784, "reward": 2.2966458797454834, "reward_std": 0.6616353988647461, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12997916713356972, "step": 2389, "zero_std_ratio": 0.0 }, { "epoch": 0.21321200767206389, "grad_norm": 348.85821533203125, "learning_rate": 3e-06, "loss": 29.5501, "step": 2390 }, { "epoch": 0.21330121771711494, "grad_norm": 424.7781982421875, "learning_rate": 3e-06, "loss": 18.5572, "step": 2391 }, { "epoch": 0.21339042776216602, "grad_norm": 288.4269104003906, "learning_rate": 3e-06, "loss": 11.6147, "step": 2392 }, { "epoch": 0.2134796378072171, "grad_norm": 311.3384704589844, "learning_rate": 3e-06, "loss": 14.8, "step": 2393 }, { "epoch": 0.21356884785226818, "grad_norm": 581.3242797851562, "learning_rate": 3e-06, "loss": 26.3673, "step": 2394 }, { "epoch": 0.21365805789731923, "grad_norm": 243.1928253173828, "learning_rate": 3e-06, "loss": 8.8072, "step": 2395 }, { "epoch": 0.2137472679423703, "grad_norm": 464.5150451660156, "learning_rate": 3e-06, "loss": 27.1507, "step": 2396 }, { "epoch": 0.2138364779874214, "grad_norm": 379.3248596191406, "learning_rate": 3e-06, "loss": 12.6021, "step": 2397 }, { "epoch": 0.21392568803247244, "grad_norm": 306.65716552734375, "learning_rate": 3e-06, "loss": 9.6934, "step": 2398 }, { "epoch": 0.21401489807752352, "grad_norm": 254.15750122070312, "learning_rate": 3e-06, "loss": 9.9346, "step": 2399 }, { "epoch": 0.2141041081225746, "grad_norm": 396.16680908203125, "learning_rate": 3e-06, "loss": 19.5421, "step": 2400 }, { "completion_length": 153.3541717529297, "epoch": 0.21419331816762568, "grad_norm": 837.0029296875, "learning_rate": 3e-06, "loss": 17.6087, "reward": 2.1600000858306885, "reward_std": 0.44815221428871155, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08708333410322666, "step": 2401, "zero_std_ratio": 0.0 }, { "epoch": 0.21428252821267674, "grad_norm": 772.85107421875, "learning_rate": 3e-06, "loss": 8.4741, "step": 2402 }, { "epoch": 0.21437173825772782, "grad_norm": 622.2056884765625, "learning_rate": 3e-06, "loss": 23.1122, "step": 2403 }, { "epoch": 0.2144609483027789, "grad_norm": 931.6195678710938, "learning_rate": 3e-06, "loss": -2.0691, "step": 2404 }, { "epoch": 0.21455015834782998, "grad_norm": 945.195556640625, "learning_rate": 3e-06, "loss": 6.8255, "step": 2405 }, { "epoch": 0.21463936839288103, "grad_norm": 665.4218139648438, "learning_rate": 3e-06, "loss": -10.6024, "step": 2406 }, { "epoch": 0.2147285784379321, "grad_norm": 668.084228515625, "learning_rate": 3e-06, "loss": 12.1126, "step": 2407 }, { "epoch": 0.2148177884829832, "grad_norm": 681.3531494140625, "learning_rate": 3e-06, "loss": 3.2916, "step": 2408 }, { "epoch": 0.21490699852803424, "grad_norm": 429.6943664550781, "learning_rate": 3e-06, "loss": 18.1306, "step": 2409 }, { "epoch": 0.21499620857308532, "grad_norm": 625.8978271484375, "learning_rate": 3e-06, "loss": -4.3924, "step": 2410 }, { "epoch": 0.2150854186181364, "grad_norm": 1044.302978515625, "learning_rate": 3e-06, "loss": -1.7148, "step": 2411 }, { "epoch": 0.21517462866318748, "grad_norm": 526.33447265625, "learning_rate": 3e-06, "loss": -14.9186, "step": 2412 }, { "completion_length": 122.52083969116211, "epoch": 0.21526383870823854, "grad_norm": 1148.2313232421875, "learning_rate": 3e-06, "loss": -46.4753, "reward": 2.430062413215637, "reward_std": 0.4773147590458393, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18006250262260437, "step": 2413, "zero_std_ratio": 0.0 }, { "epoch": 0.21535304875328962, "grad_norm": 882.3959350585938, "learning_rate": 3e-06, "loss": -44.2003, "step": 2414 }, { "epoch": 0.2154422587983407, "grad_norm": 794.4973754882812, "learning_rate": 3e-06, "loss": -10.3375, "step": 2415 }, { "epoch": 0.21553146884339178, "grad_norm": 613.2156982421875, "learning_rate": 3e-06, "loss": -8.6654, "step": 2416 }, { "epoch": 0.21562067888844283, "grad_norm": 739.1935424804688, "learning_rate": 3e-06, "loss": -18.7665, "step": 2417 }, { "epoch": 0.2157098889334939, "grad_norm": 800.4462280273438, "learning_rate": 3e-06, "loss": -48.7075, "step": 2418 }, { "epoch": 0.215799098978545, "grad_norm": 938.549560546875, "learning_rate": 3e-06, "loss": -51.1103, "step": 2419 }, { "epoch": 0.21588830902359607, "grad_norm": 581.0545654296875, "learning_rate": 3e-06, "loss": -46.1839, "step": 2420 }, { "epoch": 0.21597751906864712, "grad_norm": 803.5621337890625, "learning_rate": 3e-06, "loss": -25.6564, "step": 2421 }, { "epoch": 0.2160667291136982, "grad_norm": 815.6629638671875, "learning_rate": 3e-06, "loss": -20.3486, "step": 2422 }, { "epoch": 0.21615593915874928, "grad_norm": 742.8468627929688, "learning_rate": 3e-06, "loss": -30.5293, "step": 2423 }, { "epoch": 0.21624514920380034, "grad_norm": 989.3768310546875, "learning_rate": 3e-06, "loss": -69.2167, "step": 2424 }, { "completion_length": 128.58333587646484, "epoch": 0.21633435924885142, "grad_norm": 279.8984069824219, "learning_rate": 3e-06, "loss": 14.9508, "reward": 1.7630833983421326, "reward_std": 0.2766040712594986, "rewards/correctness_reward_func": 1.1666666567325592, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.117249995470047, "step": 2425, "zero_std_ratio": 0.0 }, { "epoch": 0.2164235692939025, "grad_norm": 536.0889282226562, "learning_rate": 3e-06, "loss": -6.0752, "step": 2426 }, { "epoch": 0.21651277933895358, "grad_norm": 348.53533935546875, "learning_rate": 3e-06, "loss": 17.0228, "step": 2427 }, { "epoch": 0.21660198938400463, "grad_norm": 385.8553466796875, "learning_rate": 3e-06, "loss": 12.1111, "step": 2428 }, { "epoch": 0.2166911994290557, "grad_norm": 267.1854553222656, "learning_rate": 3e-06, "loss": 0.8226, "step": 2429 }, { "epoch": 0.2167804094741068, "grad_norm": 316.3680725097656, "learning_rate": 3e-06, "loss": 9.285, "step": 2430 }, { "epoch": 0.21686961951915787, "grad_norm": 280.73785400390625, "learning_rate": 3e-06, "loss": 13.0207, "step": 2431 }, { "epoch": 0.21695882956420892, "grad_norm": 346.0469970703125, "learning_rate": 3e-06, "loss": -5.4154, "step": 2432 }, { "epoch": 0.21704803960926, "grad_norm": 264.33837890625, "learning_rate": 3e-06, "loss": 11.8802, "step": 2433 }, { "epoch": 0.21713724965431108, "grad_norm": 304.8049011230469, "learning_rate": 3e-06, "loss": 3.9282, "step": 2434 }, { "epoch": 0.21722645969936213, "grad_norm": 163.74391174316406, "learning_rate": 3e-06, "loss": -4.001, "step": 2435 }, { "epoch": 0.21731566974441321, "grad_norm": 224.05996704101562, "learning_rate": 3e-06, "loss": 1.9239, "step": 2436 }, { "completion_length": 148.2916717529297, "epoch": 0.2174048797894643, "grad_norm": 737.8390502929688, "learning_rate": 3e-06, "loss": -72.0648, "reward": 2.1894376277923584, "reward_std": 0.16566037200391293, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1061041597276926, "step": 2437, "zero_std_ratio": 0.0 }, { "epoch": 0.21749408983451538, "grad_norm": 1339.2056884765625, "learning_rate": 3e-06, "loss": -92.2614, "step": 2438 }, { "epoch": 0.21758329987956643, "grad_norm": 112.15784454345703, "learning_rate": 3e-06, "loss": 2.3648, "step": 2439 }, { "epoch": 0.2176725099246175, "grad_norm": 210.3965606689453, "learning_rate": 3e-06, "loss": 1.6453, "step": 2440 }, { "epoch": 0.2177617199696686, "grad_norm": 2733.96630859375, "learning_rate": 3e-06, "loss": 86.5019, "step": 2441 }, { "epoch": 0.21785093001471967, "grad_norm": 440.70416259765625, "learning_rate": 3e-06, "loss": -15.824, "step": 2442 }, { "epoch": 0.21794014005977072, "grad_norm": 733.9408569335938, "learning_rate": 3e-06, "loss": -80.8303, "step": 2443 }, { "epoch": 0.2180293501048218, "grad_norm": 1372.98046875, "learning_rate": 3e-06, "loss": -108.3653, "step": 2444 }, { "epoch": 0.21811856014987288, "grad_norm": 158.97706604003906, "learning_rate": 3e-06, "loss": 0.8506, "step": 2445 }, { "epoch": 0.21820777019492396, "grad_norm": 297.0166320800781, "learning_rate": 3e-06, "loss": -1.8569, "step": 2446 }, { "epoch": 0.218296980239975, "grad_norm": 2724.94775390625, "learning_rate": 3e-06, "loss": 114.5594, "step": 2447 }, { "epoch": 0.2183861902850261, "grad_norm": 631.491943359375, "learning_rate": 3e-06, "loss": -27.9172, "step": 2448 }, { "completion_length": 117.02083969116211, "epoch": 0.21847540033007717, "grad_norm": 72.98030090332031, "learning_rate": 3e-06, "loss": -7.2195, "reward": 2.3603543043136597, "reward_std": 0.15350967459380627, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19368749111890793, "step": 2449, "zero_std_ratio": 0.125 }, { "epoch": 0.21856461037512823, "grad_norm": 69.76261901855469, "learning_rate": 3e-06, "loss": -6.6095, "step": 2450 }, { "epoch": 0.2186538204201793, "grad_norm": 119.00989532470703, "learning_rate": 3e-06, "loss": -3.7843, "step": 2451 }, { "epoch": 0.2187430304652304, "grad_norm": 81.48381805419922, "learning_rate": 3e-06, "loss": -6.6727, "step": 2452 }, { "epoch": 0.21883224051028147, "grad_norm": 96.27825927734375, "learning_rate": 3e-06, "loss": -7.5212, "step": 2453 }, { "epoch": 0.21892145055533252, "grad_norm": 74.60059356689453, "learning_rate": 3e-06, "loss": -8.8148, "step": 2454 }, { "epoch": 0.2190106606003836, "grad_norm": 78.35423278808594, "learning_rate": 3e-06, "loss": -8.4421, "step": 2455 }, { "epoch": 0.21909987064543468, "grad_norm": 144.65966796875, "learning_rate": 3e-06, "loss": -7.9481, "step": 2456 }, { "epoch": 0.21918908069048576, "grad_norm": 93.7354965209961, "learning_rate": 3e-06, "loss": -5.0046, "step": 2457 }, { "epoch": 0.2192782907355368, "grad_norm": 78.02239227294922, "learning_rate": 3e-06, "loss": -8.622, "step": 2458 }, { "epoch": 0.2193675007805879, "grad_norm": 82.25161743164062, "learning_rate": 3e-06, "loss": -9.508, "step": 2459 }, { "epoch": 0.21945671082563897, "grad_norm": 77.15231323242188, "learning_rate": 3e-06, "loss": -10.3714, "step": 2460 }, { "completion_length": 115.97916793823242, "epoch": 0.21954592087069005, "grad_norm": 39.886009216308594, "learning_rate": 3e-06, "loss": -4.294, "reward": 2.4212708473205566, "reward_std": 0.040800848975777626, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18168748915195465, "step": 2461, "zero_std_ratio": 0.0 }, { "epoch": 0.2196351309157411, "grad_norm": 45.25791549682617, "learning_rate": 3e-06, "loss": -2.4001, "step": 2462 }, { "epoch": 0.21972434096079219, "grad_norm": 47.455623626708984, "learning_rate": 3e-06, "loss": 0.16, "step": 2463 }, { "epoch": 0.21981355100584327, "grad_norm": 56.16099166870117, "learning_rate": 3e-06, "loss": -5.4067, "step": 2464 }, { "epoch": 0.21990276105089432, "grad_norm": 58.30009841918945, "learning_rate": 3e-06, "loss": -6.231, "step": 2465 }, { "epoch": 0.2199919710959454, "grad_norm": 55.359840393066406, "learning_rate": 3e-06, "loss": -2.543, "step": 2466 }, { "epoch": 0.22008118114099648, "grad_norm": 50.81299591064453, "learning_rate": 3e-06, "loss": -5.613, "step": 2467 }, { "epoch": 0.22017039118604756, "grad_norm": 54.18115234375, "learning_rate": 3e-06, "loss": -3.8278, "step": 2468 }, { "epoch": 0.2202596012310986, "grad_norm": 70.59385681152344, "learning_rate": 3e-06, "loss": -0.9699, "step": 2469 }, { "epoch": 0.2203488112761497, "grad_norm": 68.53688049316406, "learning_rate": 3e-06, "loss": -7.7968, "step": 2470 }, { "epoch": 0.22043802132120077, "grad_norm": 72.17977142333984, "learning_rate": 3e-06, "loss": -8.891, "step": 2471 }, { "epoch": 0.22052723136625185, "grad_norm": 76.02491760253906, "learning_rate": 3e-06, "loss": -4.5234, "step": 2472 }, { "completion_length": 120.66667175292969, "epoch": 0.2206164414113029, "grad_norm": 217.54934692382812, "learning_rate": 3e-06, "loss": -3.0888, "reward": 2.4115419387817383, "reward_std": 0.12016614899039268, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2032083421945572, "step": 2473, "zero_std_ratio": 0.0 }, { "epoch": 0.22070565145635399, "grad_norm": 235.37335205078125, "learning_rate": 3e-06, "loss": 5.7274, "step": 2474 }, { "epoch": 0.22079486150140507, "grad_norm": 458.1533508300781, "learning_rate": 3e-06, "loss": -5.1241, "step": 2475 }, { "epoch": 0.22088407154645612, "grad_norm": 265.38818359375, "learning_rate": 3e-06, "loss": 9.2465, "step": 2476 }, { "epoch": 0.2209732815915072, "grad_norm": 494.6804504394531, "learning_rate": 3e-06, "loss": -4.1893, "step": 2477 }, { "epoch": 0.22106249163655828, "grad_norm": 272.2371826171875, "learning_rate": 3e-06, "loss": 9.0183, "step": 2478 }, { "epoch": 0.22115170168160936, "grad_norm": 322.984130859375, "learning_rate": 3e-06, "loss": -1.4007, "step": 2479 }, { "epoch": 0.2212409117266604, "grad_norm": 377.4932556152344, "learning_rate": 3e-06, "loss": 6.0472, "step": 2480 }, { "epoch": 0.2213301217717115, "grad_norm": 765.150634765625, "learning_rate": 3e-06, "loss": -13.7039, "step": 2481 }, { "epoch": 0.22141933181676257, "grad_norm": 315.2571716308594, "learning_rate": 3e-06, "loss": 10.4126, "step": 2482 }, { "epoch": 0.22150854186181365, "grad_norm": 702.4007568359375, "learning_rate": 3e-06, "loss": -10.1122, "step": 2483 }, { "epoch": 0.2215977519068647, "grad_norm": 322.5043029785156, "learning_rate": 3e-06, "loss": 7.4748, "step": 2484 }, { "completion_length": 136.4791717529297, "epoch": 0.22168696195191578, "grad_norm": 666.5755004882812, "learning_rate": 3e-06, "loss": 3.2721, "reward": 2.0126249194145203, "reward_std": 0.5076068788766861, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14804167300462723, "step": 2485, "zero_std_ratio": 0.0 }, { "epoch": 0.22177617199696686, "grad_norm": 1157.80908203125, "learning_rate": 3e-06, "loss": 35.3796, "step": 2486 }, { "epoch": 0.22186538204201794, "grad_norm": 964.0335693359375, "learning_rate": 3e-06, "loss": 27.2501, "step": 2487 }, { "epoch": 0.221954592087069, "grad_norm": 1680.4920654296875, "learning_rate": 3e-06, "loss": 11.387, "step": 2488 }, { "epoch": 0.22204380213212008, "grad_norm": 1438.4658203125, "learning_rate": 3e-06, "loss": 46.2915, "step": 2489 }, { "epoch": 0.22213301217717116, "grad_norm": 1312.2196044921875, "learning_rate": 3e-06, "loss": 53.6756, "step": 2490 }, { "epoch": 0.2222222222222222, "grad_norm": 598.6546630859375, "learning_rate": 3e-06, "loss": -0.5956, "step": 2491 }, { "epoch": 0.2223114322672733, "grad_norm": 1167.168212890625, "learning_rate": 3e-06, "loss": 18.1678, "step": 2492 }, { "epoch": 0.22240064231232437, "grad_norm": 714.756103515625, "learning_rate": 3e-06, "loss": 9.6702, "step": 2493 }, { "epoch": 0.22248985235737545, "grad_norm": 964.5927124023438, "learning_rate": 3e-06, "loss": -3.789, "step": 2494 }, { "epoch": 0.2225790624024265, "grad_norm": 930.6976928710938, "learning_rate": 3e-06, "loss": 25.0124, "step": 2495 }, { "epoch": 0.22266827244747758, "grad_norm": 747.74365234375, "learning_rate": 3e-06, "loss": 36.8743, "step": 2496 }, { "completion_length": 115.33333587646484, "epoch": 0.22275748249252866, "grad_norm": 283.8410339355469, "learning_rate": 3e-06, "loss": -6.8542, "reward": 2.154166579246521, "reward_std": 0.4607701599597931, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21666666120290756, "step": 2497, "zero_std_ratio": 0.0 }, { "epoch": 0.22284669253757974, "grad_norm": 285.5289611816406, "learning_rate": 3e-06, "loss": -18.1384, "step": 2498 }, { "epoch": 0.2229359025826308, "grad_norm": 412.27838134765625, "learning_rate": 3e-06, "loss": -1.6925, "step": 2499 }, { "epoch": 0.22302511262768188, "grad_norm": 322.4535827636719, "learning_rate": 3e-06, "loss": -7.0813, "step": 2500 }, { "epoch": 0.22311432267273296, "grad_norm": 294.1036682128906, "learning_rate": 3e-06, "loss": -9.4859, "step": 2501 }, { "epoch": 0.223203532717784, "grad_norm": 302.7540283203125, "learning_rate": 3e-06, "loss": -0.4445, "step": 2502 }, { "epoch": 0.2232927427628351, "grad_norm": 237.27244567871094, "learning_rate": 3e-06, "loss": -11.6923, "step": 2503 }, { "epoch": 0.22338195280788617, "grad_norm": 386.3728332519531, "learning_rate": 3e-06, "loss": -20.7637, "step": 2504 }, { "epoch": 0.22347116285293725, "grad_norm": 268.1266784667969, "learning_rate": 3e-06, "loss": -8.5103, "step": 2505 }, { "epoch": 0.2235603728979883, "grad_norm": 270.6510925292969, "learning_rate": 3e-06, "loss": -10.2348, "step": 2506 }, { "epoch": 0.22364958294303938, "grad_norm": 188.07669067382812, "learning_rate": 3e-06, "loss": -10.9502, "step": 2507 }, { "epoch": 0.22373879298809046, "grad_norm": 206.290771484375, "learning_rate": 3e-06, "loss": -5.0808, "step": 2508 }, { "completion_length": 123.00000762939453, "epoch": 0.22382800303314154, "grad_norm": 248.20083618164062, "learning_rate": 3e-06, "loss": -22.2839, "reward": 2.6338332891464233, "reward_std": 0.24515828490257263, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21716666221618652, "step": 2509, "zero_std_ratio": 0.0 }, { "epoch": 0.2239172130781926, "grad_norm": 277.49224853515625, "learning_rate": 3e-06, "loss": -13.8777, "step": 2510 }, { "epoch": 0.22400642312324368, "grad_norm": 314.8838195800781, "learning_rate": 3e-06, "loss": -11.5577, "step": 2511 }, { "epoch": 0.22409563316829476, "grad_norm": 365.1380310058594, "learning_rate": 3e-06, "loss": -25.7421, "step": 2512 }, { "epoch": 0.22418484321334584, "grad_norm": 738.1392822265625, "learning_rate": 3e-06, "loss": -28.4632, "step": 2513 }, { "epoch": 0.2242740532583969, "grad_norm": 298.1048583984375, "learning_rate": 3e-06, "loss": 20.3454, "step": 2514 }, { "epoch": 0.22436326330344797, "grad_norm": 294.0306701660156, "learning_rate": 3e-06, "loss": -25.6874, "step": 2515 }, { "epoch": 0.22445247334849905, "grad_norm": 307.4908142089844, "learning_rate": 3e-06, "loss": -16.1791, "step": 2516 }, { "epoch": 0.2245416833935501, "grad_norm": 289.4143981933594, "learning_rate": 3e-06, "loss": -13.7269, "step": 2517 }, { "epoch": 0.22463089343860118, "grad_norm": 321.67999267578125, "learning_rate": 3e-06, "loss": -26.8296, "step": 2518 }, { "epoch": 0.22472010348365226, "grad_norm": 690.182861328125, "learning_rate": 3e-06, "loss": -28.0334, "step": 2519 }, { "epoch": 0.22480931352870334, "grad_norm": 309.8341064453125, "learning_rate": 3e-06, "loss": 19.6679, "step": 2520 }, { "completion_length": 138.18750762939453, "epoch": 0.2248985235737544, "grad_norm": 962.9716796875, "learning_rate": 3e-06, "loss": -466.6429, "reward": 1.927125096321106, "reward_std": 0.4949150010943413, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1042083278298378, "step": 2521, "zero_std_ratio": 0.0 }, { "epoch": 0.22498773361880547, "grad_norm": 977.1348876953125, "learning_rate": 3e-06, "loss": -455.5698, "step": 2522 }, { "epoch": 0.22507694366385655, "grad_norm": 1716.0784912109375, "learning_rate": 3e-06, "loss": -472.2814, "step": 2523 }, { "epoch": 0.22516615370890763, "grad_norm": 1554.378662109375, "learning_rate": 3e-06, "loss": -517.5912, "step": 2524 }, { "epoch": 0.2252553637539587, "grad_norm": 1132.1138916015625, "learning_rate": 3e-06, "loss": -500.9869, "step": 2525 }, { "epoch": 0.22534457379900977, "grad_norm": 1298.7674560546875, "learning_rate": 3e-06, "loss": -496.895, "step": 2526 }, { "epoch": 0.22543378384406085, "grad_norm": 1010.2288208007812, "learning_rate": 3e-06, "loss": -483.6955, "step": 2527 }, { "epoch": 0.2255229938891119, "grad_norm": 939.7051391601562, "learning_rate": 3e-06, "loss": -477.8679, "step": 2528 }, { "epoch": 0.22561220393416298, "grad_norm": 2647.501953125, "learning_rate": 3e-06, "loss": -497.0203, "step": 2529 }, { "epoch": 0.22570141397921406, "grad_norm": 1514.046142578125, "learning_rate": 3e-06, "loss": -558.5161, "step": 2530 }, { "epoch": 0.22579062402426514, "grad_norm": 1017.2706909179688, "learning_rate": 3e-06, "loss": -530.007, "step": 2531 }, { "epoch": 0.2258798340693162, "grad_norm": 1210.83984375, "learning_rate": 3e-06, "loss": -530.6325, "step": 2532 }, { "completion_length": 158.08333587646484, "epoch": 0.22596904411436727, "grad_norm": 3164.390869140625, "learning_rate": 3e-06, "loss": -235.1734, "reward": 1.9235208630561829, "reward_std": 0.5265283584594727, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11102083697915077, "step": 2533, "zero_std_ratio": 0.0 }, { "epoch": 0.22605825415941835, "grad_norm": 3456.934814453125, "learning_rate": 3e-06, "loss": -198.7878, "step": 2534 }, { "epoch": 0.22614746420446943, "grad_norm": 3546.382080078125, "learning_rate": 3e-06, "loss": -147.5051, "step": 2535 }, { "epoch": 0.2262366742495205, "grad_norm": 2367.549072265625, "learning_rate": 3e-06, "loss": -172.2864, "step": 2536 }, { "epoch": 0.22632588429457157, "grad_norm": 2736.5947265625, "learning_rate": 3e-06, "loss": -287.8439, "step": 2537 }, { "epoch": 0.22641509433962265, "grad_norm": 3597.607177734375, "learning_rate": 3e-06, "loss": -207.2024, "step": 2538 }, { "epoch": 0.22650430438467373, "grad_norm": 2903.524169921875, "learning_rate": 3e-06, "loss": -340.7718, "step": 2539 }, { "epoch": 0.22659351442972478, "grad_norm": 4115.52783203125, "learning_rate": 3e-06, "loss": -345.8203, "step": 2540 }, { "epoch": 0.22668272447477586, "grad_norm": 3821.200439453125, "learning_rate": 3e-06, "loss": -282.2995, "step": 2541 }, { "epoch": 0.22677193451982694, "grad_norm": 2847.97021484375, "learning_rate": 3e-06, "loss": -290.9111, "step": 2542 }, { "epoch": 0.226861144564878, "grad_norm": 2697.22119140625, "learning_rate": 3e-06, "loss": -436.4271, "step": 2543 }, { "epoch": 0.22695035460992907, "grad_norm": 2621.13134765625, "learning_rate": 3e-06, "loss": -355.8498, "step": 2544 }, { "completion_length": 137.12500762939453, "epoch": 0.22703956465498015, "grad_norm": 1619.9908447265625, "learning_rate": 3e-06, "loss": 56.5571, "reward": 2.2058334350585938, "reward_std": 0.3718770742416382, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1641666665673256, "step": 2545, "zero_std_ratio": 0.0 }, { "epoch": 0.22712877470003123, "grad_norm": 1277.8330078125, "learning_rate": 3e-06, "loss": 39.8159, "step": 2546 }, { "epoch": 0.22721798474508229, "grad_norm": 2199.858154296875, "learning_rate": 3e-06, "loss": 59.8863, "step": 2547 }, { "epoch": 0.22730719479013337, "grad_norm": 2098.661865234375, "learning_rate": 3e-06, "loss": 72.6976, "step": 2548 }, { "epoch": 0.22739640483518445, "grad_norm": 2782.49560546875, "learning_rate": 3e-06, "loss": 57.7231, "step": 2549 }, { "epoch": 0.22748561488023553, "grad_norm": 1974.8419189453125, "learning_rate": 3e-06, "loss": 33.9867, "step": 2550 }, { "epoch": 0.22757482492528658, "grad_norm": 1646.1302490234375, "learning_rate": 3e-06, "loss": 42.3364, "step": 2551 }, { "epoch": 0.22766403497033766, "grad_norm": 1905.888671875, "learning_rate": 3e-06, "loss": 21.0742, "step": 2552 }, { "epoch": 0.22775324501538874, "grad_norm": 2715.112548828125, "learning_rate": 3e-06, "loss": 23.6365, "step": 2553 }, { "epoch": 0.22784245506043982, "grad_norm": 2285.765625, "learning_rate": 3e-06, "loss": 58.3677, "step": 2554 }, { "epoch": 0.22793166510549087, "grad_norm": 2014.7835693359375, "learning_rate": 3e-06, "loss": 29.8156, "step": 2555 }, { "epoch": 0.22802087515054195, "grad_norm": 1831.78125, "learning_rate": 3e-06, "loss": 16.4394, "step": 2556 }, { "completion_length": 136.9375, "epoch": 0.22811008519559303, "grad_norm": 836.2206420898438, "learning_rate": 3e-06, "loss": -400.024, "reward": 2.038104295730591, "reward_std": 0.36657825112342834, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12143750861287117, "step": 2557, "zero_std_ratio": 0.0 }, { "epoch": 0.22819929524064408, "grad_norm": 729.5347900390625, "learning_rate": 3e-06, "loss": -326.6933, "step": 2558 }, { "epoch": 0.22828850528569516, "grad_norm": 1105.0972900390625, "learning_rate": 3e-06, "loss": -290.3208, "step": 2559 }, { "epoch": 0.22837771533074624, "grad_norm": 1039.4100341796875, "learning_rate": 3e-06, "loss": -360.9429, "step": 2560 }, { "epoch": 0.22846692537579733, "grad_norm": 1980.042236328125, "learning_rate": 3e-06, "loss": -323.0144, "step": 2561 }, { "epoch": 0.22855613542084838, "grad_norm": 926.3096313476562, "learning_rate": 3e-06, "loss": -348.8501, "step": 2562 }, { "epoch": 0.22864534546589946, "grad_norm": 1314.363037109375, "learning_rate": 3e-06, "loss": -418.6348, "step": 2563 }, { "epoch": 0.22873455551095054, "grad_norm": 1042.013671875, "learning_rate": 3e-06, "loss": -331.0753, "step": 2564 }, { "epoch": 0.22882376555600162, "grad_norm": 1598.5084228515625, "learning_rate": 3e-06, "loss": -314.2241, "step": 2565 }, { "epoch": 0.22891297560105267, "grad_norm": 1485.787841796875, "learning_rate": 3e-06, "loss": -379.8989, "step": 2566 }, { "epoch": 0.22900218564610375, "grad_norm": 3140.890869140625, "learning_rate": 3e-06, "loss": -314.3493, "step": 2567 }, { "epoch": 0.22909139569115483, "grad_norm": 1227.4429931640625, "learning_rate": 3e-06, "loss": -365.1925, "step": 2568 }, { "completion_length": 114.25000381469727, "epoch": 0.22918060573620588, "grad_norm": 992.5396118164062, "learning_rate": 3e-06, "loss": 145.8403, "reward": 2.355666756629944, "reward_std": 0.2649495005607605, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19941666722297668, "step": 2569, "zero_std_ratio": 0.0 }, { "epoch": 0.22926981578125696, "grad_norm": 920.4397583007812, "learning_rate": 3e-06, "loss": 125.9513, "step": 2570 }, { "epoch": 0.22935902582630804, "grad_norm": 1110.3140869140625, "learning_rate": 3e-06, "loss": 130.2265, "step": 2571 }, { "epoch": 0.22944823587135912, "grad_norm": 945.190185546875, "learning_rate": 3e-06, "loss": 135.5103, "step": 2572 }, { "epoch": 0.22953744591641018, "grad_norm": 1002.4699096679688, "learning_rate": 3e-06, "loss": 101.9883, "step": 2573 }, { "epoch": 0.22962665596146126, "grad_norm": 1004.0828857421875, "learning_rate": 3e-06, "loss": 99.0042, "step": 2574 }, { "epoch": 0.22971586600651234, "grad_norm": 968.288818359375, "learning_rate": 3e-06, "loss": 123.8671, "step": 2575 }, { "epoch": 0.22980507605156342, "grad_norm": 885.2098388671875, "learning_rate": 3e-06, "loss": 100.9767, "step": 2576 }, { "epoch": 0.22989428609661447, "grad_norm": 990.7384033203125, "learning_rate": 3e-06, "loss": 94.6427, "step": 2577 }, { "epoch": 0.22998349614166555, "grad_norm": 765.0950317382812, "learning_rate": 3e-06, "loss": 97.1664, "step": 2578 }, { "epoch": 0.23007270618671663, "grad_norm": 819.2175903320312, "learning_rate": 3e-06, "loss": 59.3393, "step": 2579 }, { "epoch": 0.2301619162317677, "grad_norm": 632.7185668945312, "learning_rate": 3e-06, "loss": 59.3236, "step": 2580 }, { "completion_length": 116.45833969116211, "epoch": 0.23025112627681876, "grad_norm": 1367.638427734375, "learning_rate": 3e-06, "loss": 22.0348, "reward": 2.447500228881836, "reward_std": 0.537164568901062, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21833331882953644, "step": 2581, "zero_std_ratio": 0.0 }, { "epoch": 0.23034033632186984, "grad_norm": 1206.524658203125, "learning_rate": 3e-06, "loss": -17.3179, "step": 2582 }, { "epoch": 0.23042954636692092, "grad_norm": 1213.4954833984375, "learning_rate": 3e-06, "loss": 7.0438, "step": 2583 }, { "epoch": 0.23051875641197198, "grad_norm": 1282.153076171875, "learning_rate": 3e-06, "loss": -11.0693, "step": 2584 }, { "epoch": 0.23060796645702306, "grad_norm": 1258.742919921875, "learning_rate": 3e-06, "loss": 23.2091, "step": 2585 }, { "epoch": 0.23069717650207414, "grad_norm": 891.6541748046875, "learning_rate": 3e-06, "loss": 44.6377, "step": 2586 }, { "epoch": 0.23078638654712522, "grad_norm": 1186.5556640625, "learning_rate": 3e-06, "loss": 13.3836, "step": 2587 }, { "epoch": 0.23087559659217627, "grad_norm": 1031.4847412109375, "learning_rate": 3e-06, "loss": -19.3828, "step": 2588 }, { "epoch": 0.23096480663722735, "grad_norm": 1197.4765625, "learning_rate": 3e-06, "loss": -18.3731, "step": 2589 }, { "epoch": 0.23105401668227843, "grad_norm": 1187.748779296875, "learning_rate": 3e-06, "loss": -34.1627, "step": 2590 }, { "epoch": 0.2311432267273295, "grad_norm": 1293.2896728515625, "learning_rate": 3e-06, "loss": 25.4429, "step": 2591 }, { "epoch": 0.23123243677238056, "grad_norm": 837.5556030273438, "learning_rate": 3e-06, "loss": 29.9603, "step": 2592 }, { "completion_length": 138.5208396911621, "epoch": 0.23132164681743164, "grad_norm": 2396.30810546875, "learning_rate": 3e-06, "loss": -178.7073, "reward": 2.0477917194366455, "reward_std": 0.405208945274353, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14154166355729103, "step": 2593, "zero_std_ratio": 0.0 }, { "epoch": 0.23141085686248272, "grad_norm": 2775.789794921875, "learning_rate": 3e-06, "loss": -198.7789, "step": 2594 }, { "epoch": 0.23150006690753377, "grad_norm": 2544.03515625, "learning_rate": 3e-06, "loss": -201.1037, "step": 2595 }, { "epoch": 0.23158927695258485, "grad_norm": 1803.113525390625, "learning_rate": 3e-06, "loss": -222.0999, "step": 2596 }, { "epoch": 0.23167848699763594, "grad_norm": 1768.275390625, "learning_rate": 3e-06, "loss": -36.627, "step": 2597 }, { "epoch": 0.23176769704268702, "grad_norm": 2948.94482421875, "learning_rate": 3e-06, "loss": -135.6279, "step": 2598 }, { "epoch": 0.23185690708773807, "grad_norm": 2209.195068359375, "learning_rate": 3e-06, "loss": -208.3697, "step": 2599 }, { "epoch": 0.23194611713278915, "grad_norm": 1962.291748046875, "learning_rate": 3e-06, "loss": -240.1463, "step": 2600 }, { "epoch": 0.23203532717784023, "grad_norm": 1832.804931640625, "learning_rate": 3e-06, "loss": -239.5251, "step": 2601 }, { "epoch": 0.2321245372228913, "grad_norm": 1593.4063720703125, "learning_rate": 3e-06, "loss": -271.6163, "step": 2602 }, { "epoch": 0.23221374726794236, "grad_norm": 2859.34375, "learning_rate": 3e-06, "loss": -93.6033, "step": 2603 }, { "epoch": 0.23230295731299344, "grad_norm": 2233.3232421875, "learning_rate": 3e-06, "loss": -203.8326, "step": 2604 }, { "completion_length": 133.08334350585938, "epoch": 0.23239216735804452, "grad_norm": 312.1352233886719, "learning_rate": 3e-06, "loss": 2.6697, "reward": 2.2783544063568115, "reward_std": 0.27657610177993774, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16377082467079163, "step": 2605, "zero_std_ratio": 0.125 }, { "epoch": 0.2324813774030956, "grad_norm": 301.582763671875, "learning_rate": 3e-06, "loss": -7.0407, "step": 2606 }, { "epoch": 0.23257058744814665, "grad_norm": 584.7735595703125, "learning_rate": 3e-06, "loss": -10.2937, "step": 2607 }, { "epoch": 0.23265979749319773, "grad_norm": 259.61553955078125, "learning_rate": 3e-06, "loss": 9.9754, "step": 2608 }, { "epoch": 0.23274900753824881, "grad_norm": 249.1676025390625, "learning_rate": 3e-06, "loss": 3.9731, "step": 2609 }, { "epoch": 0.23283821758329987, "grad_norm": 246.88815307617188, "learning_rate": 3e-06, "loss": 11.3381, "step": 2610 }, { "epoch": 0.23292742762835095, "grad_norm": 547.9613647460938, "learning_rate": 3e-06, "loss": -5.3289, "step": 2611 }, { "epoch": 0.23301663767340203, "grad_norm": 303.4884033203125, "learning_rate": 3e-06, "loss": -11.0554, "step": 2612 }, { "epoch": 0.2331058477184531, "grad_norm": 562.7376708984375, "learning_rate": 3e-06, "loss": -18.1232, "step": 2613 }, { "epoch": 0.23319505776350416, "grad_norm": 373.4956359863281, "learning_rate": 3e-06, "loss": 7.7594, "step": 2614 }, { "epoch": 0.23328426780855524, "grad_norm": 361.0956726074219, "learning_rate": 3e-06, "loss": 6.2606, "step": 2615 }, { "epoch": 0.23337347785360632, "grad_norm": 290.0255432128906, "learning_rate": 3e-06, "loss": 9.9446, "step": 2616 }, { "completion_length": 118.83333587646484, "epoch": 0.2334626878986574, "grad_norm": 947.1046142578125, "learning_rate": 3e-06, "loss": -7.4717, "reward": 2.5243749618530273, "reward_std": 0.33671872317790985, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19104167073965073, "step": 2617, "zero_std_ratio": 0.0 }, { "epoch": 0.23355189794370845, "grad_norm": 795.0842895507812, "learning_rate": 3e-06, "loss": 30.0327, "step": 2618 }, { "epoch": 0.23364110798875953, "grad_norm": 1086.8138427734375, "learning_rate": 3e-06, "loss": 23.2035, "step": 2619 }, { "epoch": 0.2337303180338106, "grad_norm": 1498.2811279296875, "learning_rate": 3e-06, "loss": -21.2059, "step": 2620 }, { "epoch": 0.2338195280788617, "grad_norm": 1175.6298828125, "learning_rate": 3e-06, "loss": 48.3238, "step": 2621 }, { "epoch": 0.23390873812391275, "grad_norm": 961.840087890625, "learning_rate": 3e-06, "loss": 36.8231, "step": 2622 }, { "epoch": 0.23399794816896383, "grad_norm": 1117.9853515625, "learning_rate": 3e-06, "loss": -12.174, "step": 2623 }, { "epoch": 0.2340871582140149, "grad_norm": 926.93798828125, "learning_rate": 3e-06, "loss": 22.8576, "step": 2624 }, { "epoch": 0.23417636825906596, "grad_norm": 1278.1483154296875, "learning_rate": 3e-06, "loss": 8.4446, "step": 2625 }, { "epoch": 0.23426557830411704, "grad_norm": 1131.859130859375, "learning_rate": 3e-06, "loss": -32.4271, "step": 2626 }, { "epoch": 0.23435478834916812, "grad_norm": 1015.3899536132812, "learning_rate": 3e-06, "loss": 36.5816, "step": 2627 }, { "epoch": 0.2344439983942192, "grad_norm": 1250.8270263671875, "learning_rate": 3e-06, "loss": 29.9637, "step": 2628 }, { "completion_length": 145.68750762939453, "epoch": 0.23453320843927025, "grad_norm": 1060.10595703125, "learning_rate": 3e-06, "loss": -38.7825, "reward": 1.9933959245681763, "reward_std": 0.4457727372646332, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11839582771062851, "step": 2629, "zero_std_ratio": 0.0 }, { "epoch": 0.23462241848432133, "grad_norm": 778.2303466796875, "learning_rate": 3e-06, "loss": 27.4781, "step": 2630 }, { "epoch": 0.2347116285293724, "grad_norm": 1917.360595703125, "learning_rate": 3e-06, "loss": -101.2905, "step": 2631 }, { "epoch": 0.2348008385744235, "grad_norm": 922.7222290039062, "learning_rate": 3e-06, "loss": 31.0484, "step": 2632 }, { "epoch": 0.23489004861947455, "grad_norm": 1508.9708251953125, "learning_rate": 3e-06, "loss": 25.8033, "step": 2633 }, { "epoch": 0.23497925866452563, "grad_norm": 3495.319091796875, "learning_rate": 3e-06, "loss": 115.9854, "step": 2634 }, { "epoch": 0.2350684687095767, "grad_norm": 1065.3980712890625, "learning_rate": 3e-06, "loss": -60.1251, "step": 2635 }, { "epoch": 0.23515767875462776, "grad_norm": 640.9755249023438, "learning_rate": 3e-06, "loss": 15.637, "step": 2636 }, { "epoch": 0.23524688879967884, "grad_norm": 2169.97412109375, "learning_rate": 3e-06, "loss": -134.1001, "step": 2637 }, { "epoch": 0.23533609884472992, "grad_norm": 673.7764282226562, "learning_rate": 3e-06, "loss": 16.1889, "step": 2638 }, { "epoch": 0.235425308889781, "grad_norm": 1503.6060791015625, "learning_rate": 3e-06, "loss": 0.1166, "step": 2639 }, { "epoch": 0.23551451893483205, "grad_norm": 3856.886962890625, "learning_rate": 3e-06, "loss": 91.0509, "step": 2640 }, { "completion_length": 136.16666793823242, "epoch": 0.23560372897988313, "grad_norm": 3150.307373046875, "learning_rate": 3e-06, "loss": -2.5998, "reward": 2.161541700363159, "reward_std": 0.492770716547966, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1407083384692669, "step": 2641, "zero_std_ratio": 0.0 }, { "epoch": 0.2356929390249342, "grad_norm": 2879.72265625, "learning_rate": 3e-06, "loss": 21.1676, "step": 2642 }, { "epoch": 0.2357821490699853, "grad_norm": 1765.1065673828125, "learning_rate": 3e-06, "loss": -47.9587, "step": 2643 }, { "epoch": 0.23587135911503634, "grad_norm": 3145.753662109375, "learning_rate": 3e-06, "loss": 28.0295, "step": 2644 }, { "epoch": 0.23596056916008742, "grad_norm": 2760.466796875, "learning_rate": 3e-06, "loss": -24.6202, "step": 2645 }, { "epoch": 0.2360497792051385, "grad_norm": 2200.556640625, "learning_rate": 3e-06, "loss": -30.4654, "step": 2646 }, { "epoch": 0.23613898925018958, "grad_norm": 2981.78515625, "learning_rate": 3e-06, "loss": -9.9669, "step": 2647 }, { "epoch": 0.23622819929524064, "grad_norm": 2706.1416015625, "learning_rate": 3e-06, "loss": -4.2508, "step": 2648 }, { "epoch": 0.23631740934029172, "grad_norm": 1792.266845703125, "learning_rate": 3e-06, "loss": -60.3839, "step": 2649 }, { "epoch": 0.2364066193853428, "grad_norm": 2900.65966796875, "learning_rate": 3e-06, "loss": 3.1149, "step": 2650 }, { "epoch": 0.23649582943039385, "grad_norm": 1927.884521484375, "learning_rate": 3e-06, "loss": -21.4186, "step": 2651 }, { "epoch": 0.23658503947544493, "grad_norm": 1961.5538330078125, "learning_rate": 3e-06, "loss": -29.1293, "step": 2652 }, { "completion_length": 118.79166793823242, "epoch": 0.236674249520496, "grad_norm": 815.8477783203125, "learning_rate": 3e-06, "loss": -72.8621, "reward": 2.1797499656677246, "reward_std": 0.4376053512096405, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.190166674554348, "step": 2653, "zero_std_ratio": 0.0 }, { "epoch": 0.2367634595655471, "grad_norm": 784.2760009765625, "learning_rate": 3e-06, "loss": -84.9872, "step": 2654 }, { "epoch": 0.23685266961059814, "grad_norm": 877.09716796875, "learning_rate": 3e-06, "loss": -89.9346, "step": 2655 }, { "epoch": 0.23694187965564922, "grad_norm": 758.9246826171875, "learning_rate": 3e-06, "loss": -96.6217, "step": 2656 }, { "epoch": 0.2370310897007003, "grad_norm": 677.511962890625, "learning_rate": 3e-06, "loss": -105.1141, "step": 2657 }, { "epoch": 0.23712029974575138, "grad_norm": 657.8270874023438, "learning_rate": 3e-06, "loss": -112.326, "step": 2658 }, { "epoch": 0.23720950979080244, "grad_norm": 816.3355102539062, "learning_rate": 3e-06, "loss": -86.2745, "step": 2659 }, { "epoch": 0.23729871983585352, "grad_norm": 643.4733276367188, "learning_rate": 3e-06, "loss": -97.1522, "step": 2660 }, { "epoch": 0.2373879298809046, "grad_norm": 754.5259399414062, "learning_rate": 3e-06, "loss": -107.4985, "step": 2661 }, { "epoch": 0.23747713992595565, "grad_norm": 595.6665649414062, "learning_rate": 3e-06, "loss": -114.4714, "step": 2662 }, { "epoch": 0.23756634997100673, "grad_norm": 584.2109985351562, "learning_rate": 3e-06, "loss": -121.8344, "step": 2663 }, { "epoch": 0.2376555600160578, "grad_norm": 595.26708984375, "learning_rate": 3e-06, "loss": -132.3993, "step": 2664 }, { "completion_length": 126.27083587646484, "epoch": 0.2377447700611089, "grad_norm": 1131.886474609375, "learning_rate": 3e-06, "loss": -22.9922, "reward": 1.7247709035873413, "reward_std": 0.4915483295917511, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16227082908153534, "step": 2665, "zero_std_ratio": 0.0 }, { "epoch": 0.23783398010615994, "grad_norm": 1314.6025390625, "learning_rate": 3e-06, "loss": -18.614, "step": 2666 }, { "epoch": 0.23792319015121102, "grad_norm": 1549.7657470703125, "learning_rate": 3e-06, "loss": 27.5703, "step": 2667 }, { "epoch": 0.2380124001962621, "grad_norm": 1480.2467041015625, "learning_rate": 3e-06, "loss": -43.6743, "step": 2668 }, { "epoch": 0.23810161024131318, "grad_norm": 1315.4725341796875, "learning_rate": 3e-06, "loss": -62.2457, "step": 2669 }, { "epoch": 0.23819082028636424, "grad_norm": 919.5346069335938, "learning_rate": 3e-06, "loss": 12.7562, "step": 2670 }, { "epoch": 0.23828003033141532, "grad_norm": 1541.4696044921875, "learning_rate": 3e-06, "loss": -36.8959, "step": 2671 }, { "epoch": 0.2383692403764664, "grad_norm": 1359.3785400390625, "learning_rate": 3e-06, "loss": -25.5954, "step": 2672 }, { "epoch": 0.23845845042151748, "grad_norm": 1613.6568603515625, "learning_rate": 3e-06, "loss": 18.7952, "step": 2673 }, { "epoch": 0.23854766046656853, "grad_norm": 1872.6046142578125, "learning_rate": 3e-06, "loss": -55.3274, "step": 2674 }, { "epoch": 0.2386368705116196, "grad_norm": 1405.7705078125, "learning_rate": 3e-06, "loss": -67.8542, "step": 2675 }, { "epoch": 0.2387260805566707, "grad_norm": 1027.1573486328125, "learning_rate": 3e-06, "loss": 5.5983, "step": 2676 }, { "completion_length": 131.02083587646484, "epoch": 0.23881529060172174, "grad_norm": 1824.8133544921875, "learning_rate": 3e-06, "loss": 179.5916, "reward": 2.4202083349227905, "reward_std": 0.39864790439605713, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14937499910593033, "step": 2677, "zero_std_ratio": 0.0 }, { "epoch": 0.23890450064677282, "grad_norm": 1887.1849365234375, "learning_rate": 3e-06, "loss": 279.8454, "step": 2678 }, { "epoch": 0.2389937106918239, "grad_norm": 2179.565673828125, "learning_rate": 3e-06, "loss": 202.9643, "step": 2679 }, { "epoch": 0.23908292073687498, "grad_norm": 2075.03759765625, "learning_rate": 3e-06, "loss": 205.2736, "step": 2680 }, { "epoch": 0.23917213078192603, "grad_norm": 2201.120849609375, "learning_rate": 3e-06, "loss": 245.5202, "step": 2681 }, { "epoch": 0.23926134082697711, "grad_norm": 1846.619873046875, "learning_rate": 3e-06, "loss": 133.1962, "step": 2682 }, { "epoch": 0.2393505508720282, "grad_norm": 1933.9061279296875, "learning_rate": 3e-06, "loss": 139.4566, "step": 2683 }, { "epoch": 0.23943976091707928, "grad_norm": 2089.64697265625, "learning_rate": 3e-06, "loss": 234.4406, "step": 2684 }, { "epoch": 0.23952897096213033, "grad_norm": 2168.590576171875, "learning_rate": 3e-06, "loss": 132.6577, "step": 2685 }, { "epoch": 0.2396181810071814, "grad_norm": 1802.5875244140625, "learning_rate": 3e-06, "loss": 143.3382, "step": 2686 }, { "epoch": 0.2397073910522325, "grad_norm": 1915.945556640625, "learning_rate": 3e-06, "loss": 154.7713, "step": 2687 }, { "epoch": 0.23979660109728354, "grad_norm": 1281.6055908203125, "learning_rate": 3e-06, "loss": 82.1742, "step": 2688 }, { "completion_length": 133.3125, "epoch": 0.23988581114233462, "grad_norm": 383.0669250488281, "learning_rate": 3e-06, "loss": -16.7181, "reward": 2.366750121116638, "reward_std": 0.47807711362838745, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13758333399891853, "step": 2689, "zero_std_ratio": 0.0 }, { "epoch": 0.2399750211873857, "grad_norm": 391.2867736816406, "learning_rate": 3e-06, "loss": -17.6372, "step": 2690 }, { "epoch": 0.24006423123243678, "grad_norm": 296.14239501953125, "learning_rate": 3e-06, "loss": -26.3105, "step": 2691 }, { "epoch": 0.24015344127748783, "grad_norm": 331.05303955078125, "learning_rate": 3e-06, "loss": -19.8325, "step": 2692 }, { "epoch": 0.2402426513225389, "grad_norm": 844.7974853515625, "learning_rate": 3e-06, "loss": -42.8517, "step": 2693 }, { "epoch": 0.24033186136759, "grad_norm": 269.29437255859375, "learning_rate": 3e-06, "loss": -13.9485, "step": 2694 }, { "epoch": 0.24042107141264107, "grad_norm": 239.9994354248047, "learning_rate": 3e-06, "loss": -25.301, "step": 2695 }, { "epoch": 0.24051028145769213, "grad_norm": 255.10848999023438, "learning_rate": 3e-06, "loss": -22.1235, "step": 2696 }, { "epoch": 0.2405994915027432, "grad_norm": 239.280029296875, "learning_rate": 3e-06, "loss": -32.4946, "step": 2697 }, { "epoch": 0.2406887015477943, "grad_norm": 246.9565887451172, "learning_rate": 3e-06, "loss": -26.7659, "step": 2698 }, { "epoch": 0.24077791159284537, "grad_norm": 937.3157958984375, "learning_rate": 3e-06, "loss": -42.2542, "step": 2699 }, { "epoch": 0.24086712163789642, "grad_norm": 249.9385223388672, "learning_rate": 3e-06, "loss": -17.6092, "step": 2700 }, { "completion_length": 138.3125, "epoch": 0.2409563316829475, "grad_norm": 96.49369812011719, "learning_rate": 3e-06, "loss": -10.3369, "reward": 1.8352708220481873, "reward_std": 0.12855645269155502, "rewards/correctness_reward_func": 1.2083333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12693750113248825, "step": 2701, "zero_std_ratio": 0.0 }, { "epoch": 0.24104554172799858, "grad_norm": 79.1015853881836, "learning_rate": 3e-06, "loss": -4.475, "step": 2702 }, { "epoch": 0.24113475177304963, "grad_norm": 394.2448425292969, "learning_rate": 3e-06, "loss": -17.3966, "step": 2703 }, { "epoch": 0.2412239618181007, "grad_norm": 132.28253173828125, "learning_rate": 3e-06, "loss": -10.7963, "step": 2704 }, { "epoch": 0.2413131718631518, "grad_norm": 110.22737884521484, "learning_rate": 3e-06, "loss": -17.1299, "step": 2705 }, { "epoch": 0.24140238190820287, "grad_norm": 101.7492446899414, "learning_rate": 3e-06, "loss": 1.907, "step": 2706 }, { "epoch": 0.24149159195325393, "grad_norm": 95.92127990722656, "learning_rate": 3e-06, "loss": -11.6109, "step": 2707 }, { "epoch": 0.241580801998305, "grad_norm": 93.961181640625, "learning_rate": 3e-06, "loss": -4.9432, "step": 2708 }, { "epoch": 0.24167001204335609, "grad_norm": 510.8270263671875, "learning_rate": 3e-06, "loss": -22.3494, "step": 2709 }, { "epoch": 0.24175922208840717, "grad_norm": 140.4524688720703, "learning_rate": 3e-06, "loss": -12.6703, "step": 2710 }, { "epoch": 0.24184843213345822, "grad_norm": 111.56268310546875, "learning_rate": 3e-06, "loss": -18.3569, "step": 2711 }, { "epoch": 0.2419376421785093, "grad_norm": 137.32752990722656, "learning_rate": 3e-06, "loss": 2.4727, "step": 2712 }, { "completion_length": 116.31250381469727, "epoch": 0.24202685222356038, "grad_norm": 102.95317077636719, "learning_rate": 3e-06, "loss": 1.31, "reward": 2.391145944595337, "reward_std": 0.12602753471583128, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18281250447034836, "step": 2713, "zero_std_ratio": 0.0 }, { "epoch": 0.24211606226861146, "grad_norm": 107.33989715576172, "learning_rate": 3e-06, "loss": 2.0966, "step": 2714 }, { "epoch": 0.2422052723136625, "grad_norm": 157.60411071777344, "learning_rate": 3e-06, "loss": -2.3055, "step": 2715 }, { "epoch": 0.2422944823587136, "grad_norm": 130.59576416015625, "learning_rate": 3e-06, "loss": -1.5137, "step": 2716 }, { "epoch": 0.24238369240376467, "grad_norm": 166.42257690429688, "learning_rate": 3e-06, "loss": -4.5743, "step": 2717 }, { "epoch": 0.24247290244881572, "grad_norm": 103.23640441894531, "learning_rate": 3e-06, "loss": 0.6876, "step": 2718 }, { "epoch": 0.2425621124938668, "grad_norm": 111.08712768554688, "learning_rate": 3e-06, "loss": -0.1601, "step": 2719 }, { "epoch": 0.24265132253891789, "grad_norm": 110.8766860961914, "learning_rate": 3e-06, "loss": 0.275, "step": 2720 }, { "epoch": 0.24274053258396897, "grad_norm": 176.57183837890625, "learning_rate": 3e-06, "loss": -4.0874, "step": 2721 }, { "epoch": 0.24282974262902002, "grad_norm": 114.48674774169922, "learning_rate": 3e-06, "loss": -3.9568, "step": 2722 }, { "epoch": 0.2429189526740711, "grad_norm": 195.21788024902344, "learning_rate": 3e-06, "loss": -6.4559, "step": 2723 }, { "epoch": 0.24300816271912218, "grad_norm": 107.71833801269531, "learning_rate": 3e-06, "loss": -1.4972, "step": 2724 }, { "completion_length": 111.70833587646484, "epoch": 0.24309737276417326, "grad_norm": 758.646728515625, "learning_rate": 3e-06, "loss": -50.9212, "reward": 2.3797292709350586, "reward_std": 0.4224308282136917, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18181249499320984, "step": 2725, "zero_std_ratio": 0.0 }, { "epoch": 0.2431865828092243, "grad_norm": 910.1637573242188, "learning_rate": 3e-06, "loss": -28.3655, "step": 2726 }, { "epoch": 0.2432757928542754, "grad_norm": 788.1729736328125, "learning_rate": 3e-06, "loss": -56.1199, "step": 2727 }, { "epoch": 0.24336500289932647, "grad_norm": 1074.454345703125, "learning_rate": 3e-06, "loss": -46.9052, "step": 2728 }, { "epoch": 0.24345421294437752, "grad_norm": 780.352783203125, "learning_rate": 3e-06, "loss": 9.7858, "step": 2729 }, { "epoch": 0.2435434229894286, "grad_norm": 941.043212890625, "learning_rate": 3e-06, "loss": -36.2944, "step": 2730 }, { "epoch": 0.24363263303447968, "grad_norm": 752.3587036132812, "learning_rate": 3e-06, "loss": -56.5041, "step": 2731 }, { "epoch": 0.24372184307953076, "grad_norm": 841.4219970703125, "learning_rate": 3e-06, "loss": -34.2839, "step": 2732 }, { "epoch": 0.24381105312458182, "grad_norm": 625.3259887695312, "learning_rate": 3e-06, "loss": -71.5521, "step": 2733 }, { "epoch": 0.2439002631696329, "grad_norm": 919.334228515625, "learning_rate": 3e-06, "loss": -52.7569, "step": 2734 }, { "epoch": 0.24398947321468398, "grad_norm": 901.2823486328125, "learning_rate": 3e-06, "loss": 5.3611, "step": 2735 }, { "epoch": 0.24407868325973506, "grad_norm": 1115.445068359375, "learning_rate": 3e-06, "loss": -42.9473, "step": 2736 }, { "completion_length": 115.54166793823242, "epoch": 0.2441678933047861, "grad_norm": 664.6596069335938, "learning_rate": 3e-06, "loss": 64.882, "reward": 2.324125051498413, "reward_std": 0.25276508182287216, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1991249993443489, "step": 2737, "zero_std_ratio": 0.0 }, { "epoch": 0.2442571033498372, "grad_norm": 765.7220458984375, "learning_rate": 3e-06, "loss": 0.9702, "step": 2738 }, { "epoch": 0.24434631339488827, "grad_norm": 924.0851440429688, "learning_rate": 3e-06, "loss": 35.4542, "step": 2739 }, { "epoch": 0.24443552343993935, "grad_norm": 1282.8251953125, "learning_rate": 3e-06, "loss": 16.3642, "step": 2740 }, { "epoch": 0.2445247334849904, "grad_norm": 1375.6214599609375, "learning_rate": 3e-06, "loss": -52.175, "step": 2741 }, { "epoch": 0.24461394353004148, "grad_norm": 642.0603637695312, "learning_rate": 3e-06, "loss": -36.9288, "step": 2742 }, { "epoch": 0.24470315357509256, "grad_norm": 729.1392822265625, "learning_rate": 3e-06, "loss": 64.6104, "step": 2743 }, { "epoch": 0.24479236362014362, "grad_norm": 757.8997192382812, "learning_rate": 3e-06, "loss": -6.2502, "step": 2744 }, { "epoch": 0.2448815736651947, "grad_norm": 895.462646484375, "learning_rate": 3e-06, "loss": 28.867, "step": 2745 }, { "epoch": 0.24497078371024578, "grad_norm": 1581.2919921875, "learning_rate": 3e-06, "loss": 6.4948, "step": 2746 }, { "epoch": 0.24505999375529686, "grad_norm": 1451.12353515625, "learning_rate": 3e-06, "loss": -59.1616, "step": 2747 }, { "epoch": 0.2451492038003479, "grad_norm": 536.7738647460938, "learning_rate": 3e-06, "loss": -45.5196, "step": 2748 }, { "completion_length": 117.20833587646484, "epoch": 0.245238413845399, "grad_norm": 134.61968994140625, "learning_rate": 3e-06, "loss": -2.612, "reward": 2.3929167985916138, "reward_std": 0.1570490561425686, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20541665703058243, "step": 2749, "zero_std_ratio": 0.0 }, { "epoch": 0.24532762389045007, "grad_norm": 169.6995391845703, "learning_rate": 3e-06, "loss": 0.2511, "step": 2750 }, { "epoch": 0.24541683393550115, "grad_norm": 133.6337127685547, "learning_rate": 3e-06, "loss": -6.6203, "step": 2751 }, { "epoch": 0.2455060439805522, "grad_norm": 134.84703063964844, "learning_rate": 3e-06, "loss": -2.6237, "step": 2752 }, { "epoch": 0.24559525402560328, "grad_norm": 121.29273223876953, "learning_rate": 3e-06, "loss": -1.7745, "step": 2753 }, { "epoch": 0.24568446407065436, "grad_norm": 127.68272399902344, "learning_rate": 3e-06, "loss": 3.1492, "step": 2754 }, { "epoch": 0.24577367411570541, "grad_norm": 116.89612579345703, "learning_rate": 3e-06, "loss": -3.8123, "step": 2755 }, { "epoch": 0.2458628841607565, "grad_norm": 110.55398559570312, "learning_rate": 3e-06, "loss": -0.7114, "step": 2756 }, { "epoch": 0.24595209420580758, "grad_norm": 149.1359405517578, "learning_rate": 3e-06, "loss": -9.3238, "step": 2757 }, { "epoch": 0.24604130425085866, "grad_norm": 129.7926025390625, "learning_rate": 3e-06, "loss": -5.0288, "step": 2758 }, { "epoch": 0.2461305142959097, "grad_norm": 106.68339538574219, "learning_rate": 3e-06, "loss": -4.4991, "step": 2759 }, { "epoch": 0.2462197243409608, "grad_norm": 122.20665740966797, "learning_rate": 3e-06, "loss": 0.9642, "step": 2760 }, { "completion_length": 123.87500381469727, "epoch": 0.24630893438601187, "grad_norm": 1535.9466552734375, "learning_rate": 3e-06, "loss": -54.068, "reward": 2.000249981880188, "reward_std": 0.3110375218093395, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18774999678134918, "step": 2761, "zero_std_ratio": 0.0 }, { "epoch": 0.24639814443106295, "grad_norm": 1170.3299560546875, "learning_rate": 3e-06, "loss": 66.1128, "step": 2762 }, { "epoch": 0.246487354476114, "grad_norm": 2449.8046875, "learning_rate": 3e-06, "loss": 19.3362, "step": 2763 }, { "epoch": 0.24657656452116508, "grad_norm": 2181.69921875, "learning_rate": 3e-06, "loss": -118.6144, "step": 2764 }, { "epoch": 0.24666577456621616, "grad_norm": 1696.40380859375, "learning_rate": 3e-06, "loss": -163.0827, "step": 2765 }, { "epoch": 0.24675498461126724, "grad_norm": 2616.766357421875, "learning_rate": 3e-06, "loss": -59.11, "step": 2766 }, { "epoch": 0.2468441946563183, "grad_norm": 1294.1663818359375, "learning_rate": 3e-06, "loss": -68.8212, "step": 2767 }, { "epoch": 0.24693340470136937, "grad_norm": 1346.945068359375, "learning_rate": 3e-06, "loss": 55.1171, "step": 2768 }, { "epoch": 0.24702261474642045, "grad_norm": 2457.05908203125, "learning_rate": 3e-06, "loss": -8.482, "step": 2769 }, { "epoch": 0.2471118247914715, "grad_norm": 4191.4248046875, "learning_rate": 3e-06, "loss": -160.2893, "step": 2770 }, { "epoch": 0.2472010348365226, "grad_norm": 1363.685546875, "learning_rate": 3e-06, "loss": -183.5964, "step": 2771 }, { "epoch": 0.24729024488157367, "grad_norm": 2059.74072265625, "learning_rate": 3e-06, "loss": -113.7664, "step": 2772 }, { "completion_length": 161.52083587646484, "epoch": 0.24737945492662475, "grad_norm": 1604.876953125, "learning_rate": 3e-06, "loss": -42.039, "reward": 1.64020836353302, "reward_std": 0.35881197452545166, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07770833000540733, "step": 2773, "zero_std_ratio": 0.125 }, { "epoch": 0.2474686649716758, "grad_norm": 1848.3817138671875, "learning_rate": 3e-06, "loss": 41.1877, "step": 2774 }, { "epoch": 0.24755787501672688, "grad_norm": 1808.0823974609375, "learning_rate": 3e-06, "loss": -40.5367, "step": 2775 }, { "epoch": 0.24764708506177796, "grad_norm": 1113.06689453125, "learning_rate": 3e-06, "loss": -70.3134, "step": 2776 }, { "epoch": 0.24773629510682904, "grad_norm": 1473.16650390625, "learning_rate": 3e-06, "loss": -54.4106, "step": 2777 }, { "epoch": 0.2478255051518801, "grad_norm": 1608.3349609375, "learning_rate": 3e-06, "loss": -49.2935, "step": 2778 }, { "epoch": 0.24791471519693117, "grad_norm": 1539.8026123046875, "learning_rate": 3e-06, "loss": -35.5182, "step": 2779 }, { "epoch": 0.24800392524198225, "grad_norm": 1777.2486572265625, "learning_rate": 3e-06, "loss": 30.3037, "step": 2780 }, { "epoch": 0.2480931352870333, "grad_norm": 1655.0826416015625, "learning_rate": 3e-06, "loss": -42.6796, "step": 2781 }, { "epoch": 0.2481823453320844, "grad_norm": 1024.0179443359375, "learning_rate": 3e-06, "loss": -89.759, "step": 2782 }, { "epoch": 0.24827155537713547, "grad_norm": 1473.08203125, "learning_rate": 3e-06, "loss": -69.6303, "step": 2783 }, { "epoch": 0.24836076542218655, "grad_norm": 1904.6173095703125, "learning_rate": 3e-06, "loss": -81.8251, "step": 2784 }, { "completion_length": 128.43750762939453, "epoch": 0.2484499754672376, "grad_norm": 1864.3160400390625, "learning_rate": 3e-06, "loss": -212.5905, "reward": 2.2850834131240845, "reward_std": 0.417568564414978, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1705000028014183, "step": 2785, "zero_std_ratio": 0.0 }, { "epoch": 0.24853918551228868, "grad_norm": 1880.1895751953125, "learning_rate": 3e-06, "loss": -123.6046, "step": 2786 }, { "epoch": 0.24862839555733976, "grad_norm": 1683.79736328125, "learning_rate": 3e-06, "loss": -87.2422, "step": 2787 }, { "epoch": 0.24871760560239084, "grad_norm": 1564.4197998046875, "learning_rate": 3e-06, "loss": -154.2475, "step": 2788 }, { "epoch": 0.2488068156474419, "grad_norm": 1567.22216796875, "learning_rate": 3e-06, "loss": -176.831, "step": 2789 }, { "epoch": 0.24889602569249297, "grad_norm": 1535.4984130859375, "learning_rate": 3e-06, "loss": -140.511, "step": 2790 }, { "epoch": 0.24898523573754405, "grad_norm": 1612.568603515625, "learning_rate": 3e-06, "loss": -216.3696, "step": 2791 }, { "epoch": 0.24907444578259513, "grad_norm": 1735.4697265625, "learning_rate": 3e-06, "loss": -147.2878, "step": 2792 }, { "epoch": 0.24916365582764619, "grad_norm": 1873.488525390625, "learning_rate": 3e-06, "loss": -111.5574, "step": 2793 }, { "epoch": 0.24925286587269727, "grad_norm": 1418.802734375, "learning_rate": 3e-06, "loss": -170.793, "step": 2794 }, { "epoch": 0.24934207591774835, "grad_norm": 1663.2034912109375, "learning_rate": 3e-06, "loss": -197.7705, "step": 2795 }, { "epoch": 0.2494312859627994, "grad_norm": 1235.136962890625, "learning_rate": 3e-06, "loss": -157.0547, "step": 2796 }, { "completion_length": 142.64583587646484, "epoch": 0.24952049600785048, "grad_norm": 1619.957275390625, "learning_rate": 3e-06, "loss": -30.3573, "reward": 1.5315624475479126, "reward_std": 0.6786567568778992, "rewards/correctness_reward_func": 0.9583333283662796, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15656249970197678, "step": 2797, "zero_std_ratio": 0.0 }, { "epoch": 0.24960970605290156, "grad_norm": 1342.072509765625, "learning_rate": 3e-06, "loss": -26.1193, "step": 2798 }, { "epoch": 0.24969891609795264, "grad_norm": 1125.7022705078125, "learning_rate": 3e-06, "loss": -24.8344, "step": 2799 }, { "epoch": 0.2497881261430037, "grad_norm": 958.1636352539062, "learning_rate": 3e-06, "loss": -50.3119, "step": 2800 }, { "epoch": 0.24987733618805477, "grad_norm": 1085.536865234375, "learning_rate": 3e-06, "loss": -44.7726, "step": 2801 }, { "epoch": 0.24996654623310585, "grad_norm": 1255.123046875, "learning_rate": 3e-06, "loss": -64.3716, "step": 2802 }, { "epoch": 0.2500557562781569, "grad_norm": 1413.07080078125, "learning_rate": 3e-06, "loss": -37.5393, "step": 2803 }, { "epoch": 0.250144966323208, "grad_norm": 1002.2625732421875, "learning_rate": 3e-06, "loss": -38.3578, "step": 2804 }, { "epoch": 0.25023417636825906, "grad_norm": 1106.23974609375, "learning_rate": 3e-06, "loss": -33.4304, "step": 2805 }, { "epoch": 0.25032338641331014, "grad_norm": 1196.9891357421875, "learning_rate": 3e-06, "loss": -67.9984, "step": 2806 }, { "epoch": 0.2504125964583612, "grad_norm": 1090.32763671875, "learning_rate": 3e-06, "loss": -61.613, "step": 2807 }, { "epoch": 0.2505018065034123, "grad_norm": 1379.7518310546875, "learning_rate": 3e-06, "loss": -87.9652, "step": 2808 }, { "completion_length": 152.2291717529297, "epoch": 0.25059101654846333, "grad_norm": 3175.446044921875, "learning_rate": 3e-06, "loss": -269.122, "reward": 1.9372500777244568, "reward_std": 0.6658292412757874, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10391666647046804, "step": 2809, "zero_std_ratio": 0.0 }, { "epoch": 0.2506802265935144, "grad_norm": 2337.353515625, "learning_rate": 3e-06, "loss": -179.0576, "step": 2810 }, { "epoch": 0.2507694366385655, "grad_norm": 2623.594482421875, "learning_rate": 3e-06, "loss": -216.9014, "step": 2811 }, { "epoch": 0.25085864668361657, "grad_norm": 2270.6611328125, "learning_rate": 3e-06, "loss": -216.638, "step": 2812 }, { "epoch": 0.25094785672866765, "grad_norm": 2236.594482421875, "learning_rate": 3e-06, "loss": -165.7091, "step": 2813 }, { "epoch": 0.25103706677371873, "grad_norm": 2890.3359375, "learning_rate": 3e-06, "loss": -248.6344, "step": 2814 }, { "epoch": 0.2511262768187698, "grad_norm": 2581.38818359375, "learning_rate": 3e-06, "loss": -284.7481, "step": 2815 }, { "epoch": 0.2512154868638209, "grad_norm": 2692.57958984375, "learning_rate": 3e-06, "loss": -202.7348, "step": 2816 }, { "epoch": 0.2513046969088719, "grad_norm": 2182.74951171875, "learning_rate": 3e-06, "loss": -236.8827, "step": 2817 }, { "epoch": 0.251393906953923, "grad_norm": 2094.3203125, "learning_rate": 3e-06, "loss": -262.0088, "step": 2818 }, { "epoch": 0.2514831169989741, "grad_norm": 2779.8896484375, "learning_rate": 3e-06, "loss": -187.7016, "step": 2819 }, { "epoch": 0.25157232704402516, "grad_norm": 2167.91259765625, "learning_rate": 3e-06, "loss": -279.1689, "step": 2820 }, { "completion_length": 117.97916793823242, "epoch": 0.25166153708907624, "grad_norm": 744.908203125, "learning_rate": 3e-06, "loss": -76.7129, "reward": 2.347895860671997, "reward_std": 0.2259850949048996, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1812291517853737, "step": 2821, "zero_std_ratio": 0.0 }, { "epoch": 0.2517507471341273, "grad_norm": 670.07080078125, "learning_rate": 3e-06, "loss": -59.1948, "step": 2822 }, { "epoch": 0.2518399571791784, "grad_norm": 576.8960571289062, "learning_rate": 3e-06, "loss": -81.2326, "step": 2823 }, { "epoch": 0.2519291672242294, "grad_norm": 617.103271484375, "learning_rate": 3e-06, "loss": -74.1532, "step": 2824 }, { "epoch": 0.2520183772692805, "grad_norm": 704.6082763671875, "learning_rate": 3e-06, "loss": -72.6393, "step": 2825 }, { "epoch": 0.2521075873143316, "grad_norm": 467.5483703613281, "learning_rate": 3e-06, "loss": -103.0292, "step": 2826 }, { "epoch": 0.25219679735938266, "grad_norm": 751.9234008789062, "learning_rate": 3e-06, "loss": -81.9601, "step": 2827 }, { "epoch": 0.25228600740443374, "grad_norm": 722.494384765625, "learning_rate": 3e-06, "loss": -61.2682, "step": 2828 }, { "epoch": 0.2523752174494848, "grad_norm": 455.8627014160156, "learning_rate": 3e-06, "loss": -89.9871, "step": 2829 }, { "epoch": 0.2524644274945359, "grad_norm": 562.8233642578125, "learning_rate": 3e-06, "loss": -80.9138, "step": 2830 }, { "epoch": 0.252553637539587, "grad_norm": 606.7791137695312, "learning_rate": 3e-06, "loss": -84.0287, "step": 2831 }, { "epoch": 0.252642847584638, "grad_norm": 418.8397521972656, "learning_rate": 3e-06, "loss": -111.686, "step": 2832 }, { "completion_length": 140.9791717529297, "epoch": 0.2527320576296891, "grad_norm": 1045.3818359375, "learning_rate": 3e-06, "loss": -70.3398, "reward": 2.242041826248169, "reward_std": 0.37531861662864685, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1274583339691162, "step": 2833, "zero_std_ratio": 0.0 }, { "epoch": 0.25282126767474017, "grad_norm": 898.2760009765625, "learning_rate": 3e-06, "loss": -74.9414, "step": 2834 }, { "epoch": 0.25291047771979125, "grad_norm": 1541.7431640625, "learning_rate": 3e-06, "loss": -58.5472, "step": 2835 }, { "epoch": 0.25299968776484233, "grad_norm": 3247.922119140625, "learning_rate": 3e-06, "loss": -229.4754, "step": 2836 }, { "epoch": 0.2530888978098934, "grad_norm": 3208.151123046875, "learning_rate": 3e-06, "loss": -81.4552, "step": 2837 }, { "epoch": 0.2531781078549445, "grad_norm": 1030.628173828125, "learning_rate": 3e-06, "loss": -58.0683, "step": 2838 }, { "epoch": 0.2532673178999955, "grad_norm": 1552.9305419921875, "learning_rate": 3e-06, "loss": -84.6276, "step": 2839 }, { "epoch": 0.2533565279450466, "grad_norm": 1072.2696533203125, "learning_rate": 3e-06, "loss": -88.336, "step": 2840 }, { "epoch": 0.2534457379900977, "grad_norm": 1726.9500732421875, "learning_rate": 3e-06, "loss": -71.3833, "step": 2841 }, { "epoch": 0.25353494803514875, "grad_norm": 3437.212646484375, "learning_rate": 3e-06, "loss": -285.0593, "step": 2842 }, { "epoch": 0.25362415808019984, "grad_norm": 2565.8173828125, "learning_rate": 3e-06, "loss": -127.9671, "step": 2843 }, { "epoch": 0.2537133681252509, "grad_norm": 1189.047607421875, "learning_rate": 3e-06, "loss": -84.2622, "step": 2844 }, { "completion_length": 104.56250381469727, "epoch": 0.253802578170302, "grad_norm": 942.047119140625, "learning_rate": 3e-06, "loss": 13.3562, "reward": 2.6932294368743896, "reward_std": 0.13863282464444637, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23489583283662796, "step": 2845, "zero_std_ratio": 0.0 }, { "epoch": 0.2538917882153531, "grad_norm": 439.05731201171875, "learning_rate": 3e-06, "loss": 0.4311, "step": 2846 }, { "epoch": 0.2539809982604041, "grad_norm": 536.1798095703125, "learning_rate": 3e-06, "loss": -21.4241, "step": 2847 }, { "epoch": 0.2540702083054552, "grad_norm": 482.5874938964844, "learning_rate": 3e-06, "loss": 2.2489, "step": 2848 }, { "epoch": 0.25415941835050626, "grad_norm": 844.9990844726562, "learning_rate": 3e-06, "loss": -45.5204, "step": 2849 }, { "epoch": 0.25424862839555734, "grad_norm": 549.787109375, "learning_rate": 3e-06, "loss": -12.4542, "step": 2850 }, { "epoch": 0.2543378384406084, "grad_norm": 931.2616577148438, "learning_rate": 3e-06, "loss": 7.4838, "step": 2851 }, { "epoch": 0.2544270484856595, "grad_norm": 654.050048828125, "learning_rate": 3e-06, "loss": -4.2283, "step": 2852 }, { "epoch": 0.2545162585307106, "grad_norm": 495.4552917480469, "learning_rate": 3e-06, "loss": -23.7347, "step": 2853 }, { "epoch": 0.2546054685757616, "grad_norm": 604.6043090820312, "learning_rate": 3e-06, "loss": -3.3253, "step": 2854 }, { "epoch": 0.2546946786208127, "grad_norm": 748.7670288085938, "learning_rate": 3e-06, "loss": -52.7906, "step": 2855 }, { "epoch": 0.25478388866586377, "grad_norm": 480.82305908203125, "learning_rate": 3e-06, "loss": -14.6563, "step": 2856 }, { "completion_length": 143.2291717529297, "epoch": 0.25487309871091485, "grad_norm": 1571.882568359375, "learning_rate": 3e-06, "loss": -9.1291, "reward": 1.9929999709129333, "reward_std": 0.5555278956890106, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13883333653211594, "step": 2857, "zero_std_ratio": 0.0 }, { "epoch": 0.2549623087559659, "grad_norm": 1909.6287841796875, "learning_rate": 3e-06, "loss": -1.4751, "step": 2858 }, { "epoch": 0.255051518801017, "grad_norm": 1516.956298828125, "learning_rate": 3e-06, "loss": 77.7763, "step": 2859 }, { "epoch": 0.2551407288460681, "grad_norm": 1635.180419921875, "learning_rate": 3e-06, "loss": 30.0892, "step": 2860 }, { "epoch": 0.2552299388911191, "grad_norm": 1452.2025146484375, "learning_rate": 3e-06, "loss": 10.889, "step": 2861 }, { "epoch": 0.2553191489361702, "grad_norm": 1438.949462890625, "learning_rate": 3e-06, "loss": 19.8305, "step": 2862 }, { "epoch": 0.2554083589812213, "grad_norm": 1551.45751953125, "learning_rate": 3e-06, "loss": -18.7983, "step": 2863 }, { "epoch": 0.25549756902627235, "grad_norm": 2298.835693359375, "learning_rate": 3e-06, "loss": -28.7939, "step": 2864 }, { "epoch": 0.25558677907132343, "grad_norm": 1485.9146728515625, "learning_rate": 3e-06, "loss": 62.8034, "step": 2865 }, { "epoch": 0.2556759891163745, "grad_norm": 1412.7081298828125, "learning_rate": 3e-06, "loss": 12.7132, "step": 2866 }, { "epoch": 0.2557651991614256, "grad_norm": 1433.5377197265625, "learning_rate": 3e-06, "loss": -21.8716, "step": 2867 }, { "epoch": 0.2558544092064767, "grad_norm": 1969.7119140625, "learning_rate": 3e-06, "loss": -10.5038, "step": 2868 }, { "completion_length": 108.58333587646484, "epoch": 0.2559436192515277, "grad_norm": 1309.0294189453125, "learning_rate": 3e-06, "loss": 9.3717, "reward": 2.346583366394043, "reward_std": 0.38520485162734985, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23199999332427979, "step": 2869, "zero_std_ratio": 0.0 }, { "epoch": 0.2560328292965788, "grad_norm": 1052.8076171875, "learning_rate": 3e-06, "loss": 68.964, "step": 2870 }, { "epoch": 0.25612203934162986, "grad_norm": 1088.320068359375, "learning_rate": 3e-06, "loss": 79.2477, "step": 2871 }, { "epoch": 0.25621124938668094, "grad_norm": 1275.5045166015625, "learning_rate": 3e-06, "loss": 0.6862, "step": 2872 }, { "epoch": 0.256300459431732, "grad_norm": 2489.02587890625, "learning_rate": 3e-06, "loss": 5.1994, "step": 2873 }, { "epoch": 0.2563896694767831, "grad_norm": 1260.5767822265625, "learning_rate": 3e-06, "loss": 42.2048, "step": 2874 }, { "epoch": 0.2564788795218342, "grad_norm": 2855.42822265625, "learning_rate": 3e-06, "loss": 9.9973, "step": 2875 }, { "epoch": 0.2565680895668852, "grad_norm": 1059.3900146484375, "learning_rate": 3e-06, "loss": 58.3824, "step": 2876 }, { "epoch": 0.2566572996119363, "grad_norm": 1149.518798828125, "learning_rate": 3e-06, "loss": 71.9122, "step": 2877 }, { "epoch": 0.25674650965698737, "grad_norm": 1287.1915283203125, "learning_rate": 3e-06, "loss": -11.4331, "step": 2878 }, { "epoch": 0.25683571970203845, "grad_norm": 3011.14453125, "learning_rate": 3e-06, "loss": -35.1416, "step": 2879 }, { "epoch": 0.2569249297470895, "grad_norm": 1243.9486083984375, "learning_rate": 3e-06, "loss": 23.6993, "step": 2880 }, { "completion_length": 94.04166793823242, "epoch": 0.2570141397921406, "grad_norm": 864.4456176757812, "learning_rate": 3e-06, "loss": -18.2377, "reward": 2.501083493232727, "reward_std": 0.2294117882847786, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2510833218693733, "step": 2881, "zero_std_ratio": 0.0 }, { "epoch": 0.2571033498371917, "grad_norm": 745.5352172851562, "learning_rate": 3e-06, "loss": -0.7096, "step": 2882 }, { "epoch": 0.25719255988224277, "grad_norm": 753.320068359375, "learning_rate": 3e-06, "loss": 15.4283, "step": 2883 }, { "epoch": 0.2572817699272938, "grad_norm": 752.463623046875, "learning_rate": 3e-06, "loss": 5.8229, "step": 2884 }, { "epoch": 0.25737097997234487, "grad_norm": 1526.0992431640625, "learning_rate": 3e-06, "loss": -46.1967, "step": 2885 }, { "epoch": 0.25746019001739595, "grad_norm": 976.7958984375, "learning_rate": 3e-06, "loss": -38.3941, "step": 2886 }, { "epoch": 0.25754940006244703, "grad_norm": 893.157958984375, "learning_rate": 3e-06, "loss": -25.5575, "step": 2887 }, { "epoch": 0.2576386101074981, "grad_norm": 926.5038452148438, "learning_rate": 3e-06, "loss": -9.8097, "step": 2888 }, { "epoch": 0.2577278201525492, "grad_norm": 712.7392578125, "learning_rate": 3e-06, "loss": 13.1726, "step": 2889 }, { "epoch": 0.25781703019760027, "grad_norm": 850.4429931640625, "learning_rate": 3e-06, "loss": 0.7582, "step": 2890 }, { "epoch": 0.2579062402426513, "grad_norm": 1289.7423095703125, "learning_rate": 3e-06, "loss": -57.286, "step": 2891 }, { "epoch": 0.2579954502877024, "grad_norm": 911.5849609375, "learning_rate": 3e-06, "loss": -40.4834, "step": 2892 }, { "completion_length": 108.77083587646484, "epoch": 0.25808466033275346, "grad_norm": 1608.457275390625, "learning_rate": 3e-06, "loss": 42.447, "reward": 2.397250175476074, "reward_std": 0.5737389028072357, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2305833324790001, "step": 2893, "zero_std_ratio": 0.0 }, { "epoch": 0.25817387037780454, "grad_norm": 1408.6148681640625, "learning_rate": 3e-06, "loss": 97.2545, "step": 2894 }, { "epoch": 0.2582630804228556, "grad_norm": 1850.2611083984375, "learning_rate": 3e-06, "loss": 118.5457, "step": 2895 }, { "epoch": 0.2583522904679067, "grad_norm": 1593.41357421875, "learning_rate": 3e-06, "loss": 88.0971, "step": 2896 }, { "epoch": 0.2584415005129578, "grad_norm": 1482.3663330078125, "learning_rate": 3e-06, "loss": 136.3199, "step": 2897 }, { "epoch": 0.25853071055800886, "grad_norm": 2080.68896484375, "learning_rate": 3e-06, "loss": 45.2217, "step": 2898 }, { "epoch": 0.2586199206030599, "grad_norm": 1491.896484375, "learning_rate": 3e-06, "loss": 13.4847, "step": 2899 }, { "epoch": 0.25870913064811096, "grad_norm": 1294.8150634765625, "learning_rate": 3e-06, "loss": 75.2897, "step": 2900 }, { "epoch": 0.25879834069316204, "grad_norm": 1676.6094970703125, "learning_rate": 3e-06, "loss": 76.2061, "step": 2901 }, { "epoch": 0.2588875507382131, "grad_norm": 1526.220458984375, "learning_rate": 3e-06, "loss": 50.2407, "step": 2902 }, { "epoch": 0.2589767607832642, "grad_norm": 1478.2939453125, "learning_rate": 3e-06, "loss": 101.3816, "step": 2903 }, { "epoch": 0.2590659708283153, "grad_norm": 1666.55419921875, "learning_rate": 3e-06, "loss": 9.6811, "step": 2904 }, { "completion_length": 132.0, "epoch": 0.25915518087336636, "grad_norm": 1786.49609375, "learning_rate": 3e-06, "loss": -78.7828, "reward": 2.2495001554489136, "reward_std": 0.383434534072876, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16616666316986084, "step": 2905, "zero_std_ratio": 0.0 }, { "epoch": 0.2592443909184174, "grad_norm": 1530.810791015625, "learning_rate": 3e-06, "loss": -142.4264, "step": 2906 }, { "epoch": 0.25933360096346847, "grad_norm": 1333.4097900390625, "learning_rate": 3e-06, "loss": -69.7162, "step": 2907 }, { "epoch": 0.25942281100851955, "grad_norm": 1612.8759765625, "learning_rate": 3e-06, "loss": -109.628, "step": 2908 }, { "epoch": 0.25951202105357063, "grad_norm": 1138.9757080078125, "learning_rate": 3e-06, "loss": -109.3496, "step": 2909 }, { "epoch": 0.2596012310986217, "grad_norm": 2154.076171875, "learning_rate": 3e-06, "loss": -82.0285, "step": 2910 }, { "epoch": 0.2596904411436728, "grad_norm": 1556.2987060546875, "learning_rate": 3e-06, "loss": -103.8431, "step": 2911 }, { "epoch": 0.25977965118872387, "grad_norm": 1218.1712646484375, "learning_rate": 3e-06, "loss": -175.3098, "step": 2912 }, { "epoch": 0.25986886123377495, "grad_norm": 1257.9783935546875, "learning_rate": 3e-06, "loss": -85.8033, "step": 2913 }, { "epoch": 0.259958071278826, "grad_norm": 1447.798095703125, "learning_rate": 3e-06, "loss": -134.4778, "step": 2914 }, { "epoch": 0.26004728132387706, "grad_norm": 1196.5157470703125, "learning_rate": 3e-06, "loss": -118.3898, "step": 2915 }, { "epoch": 0.26013649136892814, "grad_norm": 1735.26904296875, "learning_rate": 3e-06, "loss": -109.7503, "step": 2916 }, { "completion_length": 116.3125, "epoch": 0.2602257014139792, "grad_norm": 578.029541015625, "learning_rate": 3e-06, "loss": -74.6343, "reward": 2.18025004863739, "reward_std": 0.6192338168621063, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21150000393390656, "step": 2917, "zero_std_ratio": 0.0 }, { "epoch": 0.2603149114590303, "grad_norm": 873.973388671875, "learning_rate": 3e-06, "loss": -50.678, "step": 2918 }, { "epoch": 0.2604041215040814, "grad_norm": 734.5584106445312, "learning_rate": 3e-06, "loss": -84.7397, "step": 2919 }, { "epoch": 0.26049333154913246, "grad_norm": 672.841064453125, "learning_rate": 3e-06, "loss": -79.0981, "step": 2920 }, { "epoch": 0.2605825415941835, "grad_norm": 632.9013061523438, "learning_rate": 3e-06, "loss": -78.2254, "step": 2921 }, { "epoch": 0.26067175163923456, "grad_norm": 587.878662109375, "learning_rate": 3e-06, "loss": -75.7101, "step": 2922 }, { "epoch": 0.26076096168428564, "grad_norm": 493.7314453125, "learning_rate": 3e-06, "loss": -84.4347, "step": 2923 }, { "epoch": 0.2608501717293367, "grad_norm": 903.7343139648438, "learning_rate": 3e-06, "loss": -58.6569, "step": 2924 }, { "epoch": 0.2609393817743878, "grad_norm": 615.508056640625, "learning_rate": 3e-06, "loss": -98.8108, "step": 2925 }, { "epoch": 0.2610285918194389, "grad_norm": 625.9939575195312, "learning_rate": 3e-06, "loss": -94.0236, "step": 2926 }, { "epoch": 0.26111780186448996, "grad_norm": 541.8840942382812, "learning_rate": 3e-06, "loss": -87.5613, "step": 2927 }, { "epoch": 0.261207011909541, "grad_norm": 745.64501953125, "learning_rate": 3e-06, "loss": -85.1391, "step": 2928 }, { "completion_length": 107.14583587646484, "epoch": 0.26129622195459207, "grad_norm": 1219.5657958984375, "learning_rate": 3e-06, "loss": -53.8114, "reward": 2.276187539100647, "reward_std": 0.7405118346214294, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.213687501847744, "step": 2929, "zero_std_ratio": 0.0 }, { "epoch": 0.26138543199964315, "grad_norm": 1440.60205078125, "learning_rate": 3e-06, "loss": -35.0205, "step": 2930 }, { "epoch": 0.2614746420446942, "grad_norm": 1359.045166015625, "learning_rate": 3e-06, "loss": -43.2593, "step": 2931 }, { "epoch": 0.2615638520897453, "grad_norm": 1127.97314453125, "learning_rate": 3e-06, "loss": -111.7786, "step": 2932 }, { "epoch": 0.2616530621347964, "grad_norm": 1079.3187255859375, "learning_rate": 3e-06, "loss": -57.8533, "step": 2933 }, { "epoch": 0.26174227217984747, "grad_norm": 981.3270263671875, "learning_rate": 3e-06, "loss": -17.5171, "step": 2934 }, { "epoch": 0.26183148222489855, "grad_norm": 1314.573974609375, "learning_rate": 3e-06, "loss": -73.1231, "step": 2935 }, { "epoch": 0.2619206922699496, "grad_norm": 1722.6348876953125, "learning_rate": 3e-06, "loss": -42.8779, "step": 2936 }, { "epoch": 0.26200990231500065, "grad_norm": 1447.8468017578125, "learning_rate": 3e-06, "loss": -55.328, "step": 2937 }, { "epoch": 0.26209911236005173, "grad_norm": 1420.0738525390625, "learning_rate": 3e-06, "loss": -121.9105, "step": 2938 }, { "epoch": 0.2621883224051028, "grad_norm": 1097.515380859375, "learning_rate": 3e-06, "loss": -73.3034, "step": 2939 }, { "epoch": 0.2622775324501539, "grad_norm": 1337.4931640625, "learning_rate": 3e-06, "loss": -38.3407, "step": 2940 }, { "completion_length": 119.6875, "epoch": 0.262366742495205, "grad_norm": 482.51605224609375, "learning_rate": 3e-06, "loss": 0.691, "reward": 2.0625417232513428, "reward_std": 0.39333905279636383, "rewards/correctness_reward_func": 1.3750000298023224, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2396249920129776, "step": 2941, "zero_std_ratio": 0.0 }, { "epoch": 0.26245595254025605, "grad_norm": 556.9758911132812, "learning_rate": 3e-06, "loss": -2.4846, "step": 2942 }, { "epoch": 0.2625451625853071, "grad_norm": 579.5152587890625, "learning_rate": 3e-06, "loss": -19.3386, "step": 2943 }, { "epoch": 0.26263437263035816, "grad_norm": 799.861572265625, "learning_rate": 3e-06, "loss": -1.3492, "step": 2944 }, { "epoch": 0.26272358267540924, "grad_norm": 754.6559448242188, "learning_rate": 3e-06, "loss": 1.3488, "step": 2945 }, { "epoch": 0.2628127927204603, "grad_norm": 812.598876953125, "learning_rate": 3e-06, "loss": 5.2034, "step": 2946 }, { "epoch": 0.2629020027655114, "grad_norm": 756.814208984375, "learning_rate": 3e-06, "loss": -6.1012, "step": 2947 }, { "epoch": 0.2629912128105625, "grad_norm": 632.0143432617188, "learning_rate": 3e-06, "loss": -5.9643, "step": 2948 }, { "epoch": 0.26308042285561356, "grad_norm": 689.2689819335938, "learning_rate": 3e-06, "loss": -25.1921, "step": 2949 }, { "epoch": 0.26316963290066464, "grad_norm": 851.1929931640625, "learning_rate": 3e-06, "loss": -12.9888, "step": 2950 }, { "epoch": 0.26325884294571567, "grad_norm": 1079.9923095703125, "learning_rate": 3e-06, "loss": -10.8567, "step": 2951 }, { "epoch": 0.26334805299076675, "grad_norm": 898.2387084960938, "learning_rate": 3e-06, "loss": -9.645, "step": 2952 }, { "completion_length": 118.5625, "epoch": 0.2634372630358178, "grad_norm": 787.6694946289062, "learning_rate": 3e-06, "loss": -29.457, "reward": 2.2219375371932983, "reward_std": 0.6826577037572861, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20110415667295456, "step": 2953, "zero_std_ratio": 0.0 }, { "epoch": 0.2635264730808689, "grad_norm": 971.4552001953125, "learning_rate": 3e-06, "loss": -27.3094, "step": 2954 }, { "epoch": 0.26361568312592, "grad_norm": 1412.1124267578125, "learning_rate": 3e-06, "loss": -41.0105, "step": 2955 }, { "epoch": 0.26370489317097107, "grad_norm": 1042.5732421875, "learning_rate": 3e-06, "loss": -64.5026, "step": 2956 }, { "epoch": 0.26379410321602215, "grad_norm": 895.4235229492188, "learning_rate": 3e-06, "loss": -29.1741, "step": 2957 }, { "epoch": 0.26388331326107317, "grad_norm": 1083.151123046875, "learning_rate": 3e-06, "loss": -87.1508, "step": 2958 }, { "epoch": 0.26397252330612425, "grad_norm": 873.146484375, "learning_rate": 3e-06, "loss": -44.7478, "step": 2959 }, { "epoch": 0.26406173335117533, "grad_norm": 1301.8702392578125, "learning_rate": 3e-06, "loss": -57.3308, "step": 2960 }, { "epoch": 0.2641509433962264, "grad_norm": 1813.9736328125, "learning_rate": 3e-06, "loss": -59.8182, "step": 2961 }, { "epoch": 0.2642401534412775, "grad_norm": 2201.687255859375, "learning_rate": 3e-06, "loss": -89.2731, "step": 2962 }, { "epoch": 0.2643293634863286, "grad_norm": 913.6972045898438, "learning_rate": 3e-06, "loss": -54.9221, "step": 2963 }, { "epoch": 0.26441857353137965, "grad_norm": 1180.409423828125, "learning_rate": 3e-06, "loss": -113.8787, "step": 2964 }, { "completion_length": 136.6666717529297, "epoch": 0.26450778357643073, "grad_norm": 969.1856079101562, "learning_rate": 3e-06, "loss": -246.5684, "reward": 2.567145824432373, "reward_std": 0.2896235566586256, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16089582443237305, "step": 2965, "zero_std_ratio": 0.0 }, { "epoch": 0.26459699362148176, "grad_norm": 858.3824462890625, "learning_rate": 3e-06, "loss": -244.7534, "step": 2966 }, { "epoch": 0.26468620366653284, "grad_norm": 950.44873046875, "learning_rate": 3e-06, "loss": -228.9067, "step": 2967 }, { "epoch": 0.2647754137115839, "grad_norm": 822.103759765625, "learning_rate": 3e-06, "loss": -210.6689, "step": 2968 }, { "epoch": 0.264864623756635, "grad_norm": 1124.1434326171875, "learning_rate": 3e-06, "loss": -219.4285, "step": 2969 }, { "epoch": 0.2649538338016861, "grad_norm": 3672.276611328125, "learning_rate": 3e-06, "loss": -404.9469, "step": 2970 }, { "epoch": 0.26504304384673716, "grad_norm": 975.5111694335938, "learning_rate": 3e-06, "loss": -268.2276, "step": 2971 }, { "epoch": 0.26513225389178824, "grad_norm": 1334.278076171875, "learning_rate": 3e-06, "loss": -258.803, "step": 2972 }, { "epoch": 0.26522146393683926, "grad_norm": 1281.6295166015625, "learning_rate": 3e-06, "loss": -256.8895, "step": 2973 }, { "epoch": 0.26531067398189034, "grad_norm": 839.9090576171875, "learning_rate": 3e-06, "loss": -234.8308, "step": 2974 }, { "epoch": 0.2653998840269414, "grad_norm": 1169.2425537109375, "learning_rate": 3e-06, "loss": -260.2271, "step": 2975 }, { "epoch": 0.2654890940719925, "grad_norm": 2688.470703125, "learning_rate": 3e-06, "loss": -448.1551, "step": 2976 }, { "completion_length": 110.56250381469727, "epoch": 0.2655783041170436, "grad_norm": 1131.6473388671875, "learning_rate": 3e-06, "loss": 3.8707, "reward": 2.0714792013168335, "reward_std": 0.18218296021223068, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2068958282470703, "step": 2977, "zero_std_ratio": 0.125 }, { "epoch": 0.26566751416209466, "grad_norm": 847.2361450195312, "learning_rate": 3e-06, "loss": 14.9236, "step": 2978 }, { "epoch": 0.26575672420714574, "grad_norm": 1642.1241455078125, "learning_rate": 3e-06, "loss": -39.4793, "step": 2979 }, { "epoch": 0.2658459342521968, "grad_norm": 1092.3753662109375, "learning_rate": 3e-06, "loss": 2.1236, "step": 2980 }, { "epoch": 0.26593514429724785, "grad_norm": 878.7587280273438, "learning_rate": 3e-06, "loss": -10.7814, "step": 2981 }, { "epoch": 0.26602435434229893, "grad_norm": 1027.923095703125, "learning_rate": 3e-06, "loss": -38.2648, "step": 2982 }, { "epoch": 0.26611356438735, "grad_norm": 1022.8844604492188, "learning_rate": 3e-06, "loss": -0.4588, "step": 2983 }, { "epoch": 0.2662027744324011, "grad_norm": 1529.910400390625, "learning_rate": 3e-06, "loss": 8.662, "step": 2984 }, { "epoch": 0.26629198447745217, "grad_norm": 1578.8734130859375, "learning_rate": 3e-06, "loss": -43.4542, "step": 2985 }, { "epoch": 0.26638119452250325, "grad_norm": 1200.3994140625, "learning_rate": 3e-06, "loss": 1.6446, "step": 2986 }, { "epoch": 0.26647040456755433, "grad_norm": 1133.090087890625, "learning_rate": 3e-06, "loss": -23.5441, "step": 2987 }, { "epoch": 0.26655961461260536, "grad_norm": 948.1660766601562, "learning_rate": 3e-06, "loss": -49.3466, "step": 2988 }, { "completion_length": 106.72917175292969, "epoch": 0.26664882465765644, "grad_norm": 955.7079467773438, "learning_rate": 3e-06, "loss": -78.1366, "reward": 2.4230626821517944, "reward_std": 0.14010506123304367, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2147291600704193, "step": 2989, "zero_std_ratio": 0.0 }, { "epoch": 0.2667380347027075, "grad_norm": 1161.521484375, "learning_rate": 3e-06, "loss": -75.2466, "step": 2990 }, { "epoch": 0.2668272447477586, "grad_norm": 909.06005859375, "learning_rate": 3e-06, "loss": -75.9224, "step": 2991 }, { "epoch": 0.2669164547928097, "grad_norm": 815.2526245117188, "learning_rate": 3e-06, "loss": -87.1421, "step": 2992 }, { "epoch": 0.26700566483786076, "grad_norm": 891.2210083007812, "learning_rate": 3e-06, "loss": -66.0872, "step": 2993 }, { "epoch": 0.26709487488291184, "grad_norm": 751.7452392578125, "learning_rate": 3e-06, "loss": -55.2387, "step": 2994 }, { "epoch": 0.26718408492796286, "grad_norm": 776.564697265625, "learning_rate": 3e-06, "loss": -86.0661, "step": 2995 }, { "epoch": 0.26727329497301394, "grad_norm": 843.0634765625, "learning_rate": 3e-06, "loss": -79.6742, "step": 2996 }, { "epoch": 0.267362505018065, "grad_norm": 951.456298828125, "learning_rate": 3e-06, "loss": -85.2094, "step": 2997 }, { "epoch": 0.2674517150631161, "grad_norm": 900.0969848632812, "learning_rate": 3e-06, "loss": -94.7873, "step": 2998 }, { "epoch": 0.2675409251081672, "grad_norm": 977.86474609375, "learning_rate": 3e-06, "loss": -73.4958, "step": 2999 }, { "epoch": 0.26763013515321826, "grad_norm": 683.9253540039062, "learning_rate": 3e-06, "loss": -67.8395, "step": 3000 }, { "completion_length": 111.1875, "epoch": 0.26771934519826934, "grad_norm": 1202.17138671875, "learning_rate": 3e-06, "loss": 46.5333, "reward": 2.185624837875366, "reward_std": 0.26768165081739426, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22729165852069855, "step": 3001, "zero_std_ratio": 0.0 }, { "epoch": 0.2678085552433204, "grad_norm": 1060.165283203125, "learning_rate": 3e-06, "loss": 51.2124, "step": 3002 }, { "epoch": 0.26789776528837145, "grad_norm": 1288.5169677734375, "learning_rate": 3e-06, "loss": 51.4419, "step": 3003 }, { "epoch": 0.26798697533342253, "grad_norm": 1191.4482421875, "learning_rate": 3e-06, "loss": -15.033, "step": 3004 }, { "epoch": 0.2680761853784736, "grad_norm": 1316.1217041015625, "learning_rate": 3e-06, "loss": 9.9512, "step": 3005 }, { "epoch": 0.2681653954235247, "grad_norm": 1167.442626953125, "learning_rate": 3e-06, "loss": 53.6129, "step": 3006 }, { "epoch": 0.26825460546857577, "grad_norm": 1028.405029296875, "learning_rate": 3e-06, "loss": 34.9836, "step": 3007 }, { "epoch": 0.26834381551362685, "grad_norm": 853.8731079101562, "learning_rate": 3e-06, "loss": 38.3951, "step": 3008 }, { "epoch": 0.26843302555867793, "grad_norm": 1154.0389404296875, "learning_rate": 3e-06, "loss": 30.2547, "step": 3009 }, { "epoch": 0.26852223560372895, "grad_norm": 1773.9207763671875, "learning_rate": 3e-06, "loss": -22.6816, "step": 3010 }, { "epoch": 0.26861144564878003, "grad_norm": 962.8943481445312, "learning_rate": 3e-06, "loss": -6.7198, "step": 3011 }, { "epoch": 0.2687006556938311, "grad_norm": 1030.552734375, "learning_rate": 3e-06, "loss": 32.0164, "step": 3012 }, { "completion_length": 110.66666793823242, "epoch": 0.2687898657388822, "grad_norm": 2206.882568359375, "learning_rate": 3e-06, "loss": -73.715, "reward": 2.1887917518615723, "reward_std": 0.6579568237066269, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2096249982714653, "step": 3013, "zero_std_ratio": 0.0 }, { "epoch": 0.2688790757839333, "grad_norm": 1810.1790771484375, "learning_rate": 3e-06, "loss": -2.4885, "step": 3014 }, { "epoch": 0.26896828582898435, "grad_norm": 1863.427001953125, "learning_rate": 3e-06, "loss": -2.5874, "step": 3015 }, { "epoch": 0.26905749587403543, "grad_norm": 3166.76953125, "learning_rate": 3e-06, "loss": -74.9516, "step": 3016 }, { "epoch": 0.2691467059190865, "grad_norm": 1822.0609130859375, "learning_rate": 3e-06, "loss": 12.2712, "step": 3017 }, { "epoch": 0.26923591596413754, "grad_norm": 2559.92626953125, "learning_rate": 3e-06, "loss": -12.2874, "step": 3018 }, { "epoch": 0.2693251260091886, "grad_norm": 2351.409912109375, "learning_rate": 3e-06, "loss": -86.6655, "step": 3019 }, { "epoch": 0.2694143360542397, "grad_norm": 1778.614990234375, "learning_rate": 3e-06, "loss": -12.654, "step": 3020 }, { "epoch": 0.2695035460992908, "grad_norm": 1871.30859375, "learning_rate": 3e-06, "loss": -11.1651, "step": 3021 }, { "epoch": 0.26959275614434186, "grad_norm": 3068.177734375, "learning_rate": 3e-06, "loss": -94.2456, "step": 3022 }, { "epoch": 0.26968196618939294, "grad_norm": 1755.19189453125, "learning_rate": 3e-06, "loss": 1.3564, "step": 3023 }, { "epoch": 0.269771176234444, "grad_norm": 2667.14697265625, "learning_rate": 3e-06, "loss": -8.4249, "step": 3024 }, { "completion_length": 114.47917175292969, "epoch": 0.26986038627949505, "grad_norm": 1128.5185546875, "learning_rate": 3e-06, "loss": -113.1262, "reward": 2.590812563896179, "reward_std": 0.3377353250980377, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21581250429153442, "step": 3025, "zero_std_ratio": 0.0 }, { "epoch": 0.2699495963245461, "grad_norm": 1205.2628173828125, "learning_rate": 3e-06, "loss": -121.8958, "step": 3026 }, { "epoch": 0.2700388063695972, "grad_norm": 1650.044677734375, "learning_rate": 3e-06, "loss": -74.4126, "step": 3027 }, { "epoch": 0.2701280164146483, "grad_norm": 1552.579833984375, "learning_rate": 3e-06, "loss": -37.5396, "step": 3028 }, { "epoch": 0.27021722645969937, "grad_norm": 1209.5589599609375, "learning_rate": 3e-06, "loss": -29.122, "step": 3029 }, { "epoch": 0.27030643650475045, "grad_norm": 2512.024169921875, "learning_rate": 3e-06, "loss": -4.1829, "step": 3030 }, { "epoch": 0.2703956465498015, "grad_norm": 1050.1766357421875, "learning_rate": 3e-06, "loss": -127.4833, "step": 3031 }, { "epoch": 0.2704848565948526, "grad_norm": 1175.8475341796875, "learning_rate": 3e-06, "loss": -129.5383, "step": 3032 }, { "epoch": 0.27057406663990363, "grad_norm": 1434.1722412109375, "learning_rate": 3e-06, "loss": -82.8567, "step": 3033 }, { "epoch": 0.2706632766849547, "grad_norm": 1285.1558837890625, "learning_rate": 3e-06, "loss": -53.0559, "step": 3034 }, { "epoch": 0.2707524867300058, "grad_norm": 961.3186645507812, "learning_rate": 3e-06, "loss": -38.931, "step": 3035 }, { "epoch": 0.2708416967750569, "grad_norm": 2193.393310546875, "learning_rate": 3e-06, "loss": -16.9111, "step": 3036 }, { "completion_length": 140.66666793823242, "epoch": 0.27093090682010795, "grad_norm": 2801.187255859375, "learning_rate": 3e-06, "loss": -196.6357, "reward": 1.7385209202766418, "reward_std": 0.7085210978984833, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18643750250339508, "step": 3037, "zero_std_ratio": 0.0 }, { "epoch": 0.27102011686515903, "grad_norm": 2811.52734375, "learning_rate": 3e-06, "loss": 44.996, "step": 3038 }, { "epoch": 0.2711093269102101, "grad_norm": 3620.3154296875, "learning_rate": 3e-06, "loss": -30.4338, "step": 3039 }, { "epoch": 0.27119853695526114, "grad_norm": 2687.5185546875, "learning_rate": 3e-06, "loss": 149.5113, "step": 3040 }, { "epoch": 0.2712877470003122, "grad_norm": 2482.21142578125, "learning_rate": 3e-06, "loss": -44.9672, "step": 3041 }, { "epoch": 0.2713769570453633, "grad_norm": 3290.331787109375, "learning_rate": 3e-06, "loss": 175.7723, "step": 3042 }, { "epoch": 0.2714661670904144, "grad_norm": 2688.0302734375, "learning_rate": 3e-06, "loss": -227.2278, "step": 3043 }, { "epoch": 0.27155537713546546, "grad_norm": 2466.43408203125, "learning_rate": 3e-06, "loss": 16.3886, "step": 3044 }, { "epoch": 0.27164458718051654, "grad_norm": 2916.62939453125, "learning_rate": 3e-06, "loss": -69.8449, "step": 3045 }, { "epoch": 0.2717337972255676, "grad_norm": 2572.781005859375, "learning_rate": 3e-06, "loss": 116.6977, "step": 3046 }, { "epoch": 0.27182300727061864, "grad_norm": 2047.2603759765625, "learning_rate": 3e-06, "loss": -52.7569, "step": 3047 }, { "epoch": 0.2719122173156697, "grad_norm": 3459.267822265625, "learning_rate": 3e-06, "loss": 85.0563, "step": 3048 }, { "completion_length": 131.02083587646484, "epoch": 0.2720014273607208, "grad_norm": 1489.41259765625, "learning_rate": 3e-06, "loss": -16.1508, "reward": 2.0418750047683716, "reward_std": 0.3964058458805084, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19812500476837158, "step": 3049, "zero_std_ratio": 0.125 }, { "epoch": 0.2720906374057719, "grad_norm": 1750.580078125, "learning_rate": 3e-06, "loss": 43.4089, "step": 3050 }, { "epoch": 0.27217984745082296, "grad_norm": 1699.452880859375, "learning_rate": 3e-06, "loss": 80.7036, "step": 3051 }, { "epoch": 0.27226905749587405, "grad_norm": 1434.2325439453125, "learning_rate": 3e-06, "loss": 72.9774, "step": 3052 }, { "epoch": 0.2723582675409251, "grad_norm": 1672.6483154296875, "learning_rate": 3e-06, "loss": 129.7158, "step": 3053 }, { "epoch": 0.2724474775859762, "grad_norm": 1193.5745849609375, "learning_rate": 3e-06, "loss": 19.0838, "step": 3054 }, { "epoch": 0.27253668763102723, "grad_norm": 1405.1256103515625, "learning_rate": 3e-06, "loss": -42.4678, "step": 3055 }, { "epoch": 0.2726258976760783, "grad_norm": 1926.5093994140625, "learning_rate": 3e-06, "loss": 20.8341, "step": 3056 }, { "epoch": 0.2727151077211294, "grad_norm": 1377.70166015625, "learning_rate": 3e-06, "loss": 43.6751, "step": 3057 }, { "epoch": 0.27280431776618047, "grad_norm": 1210.7298583984375, "learning_rate": 3e-06, "loss": 54.2988, "step": 3058 }, { "epoch": 0.27289352781123155, "grad_norm": 1328.4100341796875, "learning_rate": 3e-06, "loss": 100.938, "step": 3059 }, { "epoch": 0.27298273785628263, "grad_norm": 1012.921142578125, "learning_rate": 3e-06, "loss": 5.0293, "step": 3060 }, { "completion_length": 129.75000381469727, "epoch": 0.2730719479013337, "grad_norm": 2009.1962890625, "learning_rate": 3e-06, "loss": -236.9441, "reward": 1.953624963760376, "reward_std": 0.5693272352218628, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17237499356269836, "step": 3061, "zero_std_ratio": 0.0 }, { "epoch": 0.27316115794638474, "grad_norm": 1963.955078125, "learning_rate": 3e-06, "loss": -283.4947, "step": 3062 }, { "epoch": 0.2732503679914358, "grad_norm": 3396.832275390625, "learning_rate": 3e-06, "loss": -333.5178, "step": 3063 }, { "epoch": 0.2733395780364869, "grad_norm": 2347.00048828125, "learning_rate": 3e-06, "loss": -216.9495, "step": 3064 }, { "epoch": 0.273428788081538, "grad_norm": 3707.431884765625, "learning_rate": 3e-06, "loss": -236.4454, "step": 3065 }, { "epoch": 0.27351799812658906, "grad_norm": 2146.6220703125, "learning_rate": 3e-06, "loss": -236.2018, "step": 3066 }, { "epoch": 0.27360720817164014, "grad_norm": 2053.462158203125, "learning_rate": 3e-06, "loss": -242.0625, "step": 3067 }, { "epoch": 0.2736964182166912, "grad_norm": 2314.38671875, "learning_rate": 3e-06, "loss": -291.3707, "step": 3068 }, { "epoch": 0.2737856282617423, "grad_norm": 2155.58984375, "learning_rate": 3e-06, "loss": -333.5438, "step": 3069 }, { "epoch": 0.2738748383067933, "grad_norm": 1966.490966796875, "learning_rate": 3e-06, "loss": -233.6188, "step": 3070 }, { "epoch": 0.2739640483518444, "grad_norm": 2886.151611328125, "learning_rate": 3e-06, "loss": -245.3511, "step": 3071 }, { "epoch": 0.2740532583968955, "grad_norm": 2648.965576171875, "learning_rate": 3e-06, "loss": -262.1523, "step": 3072 }, { "completion_length": 119.85417175292969, "epoch": 0.27414246844194656, "grad_norm": 1739.1029052734375, "learning_rate": 3e-06, "loss": -34.4111, "reward": 2.1497918367385864, "reward_std": 0.29938701912760735, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2018750011920929, "step": 3073, "zero_std_ratio": 0.0 }, { "epoch": 0.27423167848699764, "grad_norm": 1859.151123046875, "learning_rate": 3e-06, "loss": 36.4689, "step": 3074 }, { "epoch": 0.2743208885320487, "grad_norm": 2193.682373046875, "learning_rate": 3e-06, "loss": 45.3547, "step": 3075 }, { "epoch": 0.2744100985770998, "grad_norm": 1923.550048828125, "learning_rate": 3e-06, "loss": 33.7396, "step": 3076 }, { "epoch": 0.27449930862215083, "grad_norm": 1634.33544921875, "learning_rate": 3e-06, "loss": 123.5694, "step": 3077 }, { "epoch": 0.2745885186672019, "grad_norm": 973.1492919921875, "learning_rate": 3e-06, "loss": 70.8626, "step": 3078 }, { "epoch": 0.274677728712253, "grad_norm": 1792.93701171875, "learning_rate": 3e-06, "loss": -51.9396, "step": 3079 }, { "epoch": 0.27476693875730407, "grad_norm": 2238.671875, "learning_rate": 3e-06, "loss": 6.7037, "step": 3080 }, { "epoch": 0.27485614880235515, "grad_norm": 3862.292236328125, "learning_rate": 3e-06, "loss": 28.2405, "step": 3081 }, { "epoch": 0.27494535884740623, "grad_norm": 2528.921630859375, "learning_rate": 3e-06, "loss": 2.2837, "step": 3082 }, { "epoch": 0.2750345688924573, "grad_norm": 2276.7197265625, "learning_rate": 3e-06, "loss": 88.1544, "step": 3083 }, { "epoch": 0.2751237789375084, "grad_norm": 934.5348510742188, "learning_rate": 3e-06, "loss": 57.3894, "step": 3084 }, { "completion_length": 119.00000762939453, "epoch": 0.2752129889825594, "grad_norm": 4593.513671875, "learning_rate": 3e-06, "loss": 47.8896, "reward": 2.4779791831970215, "reward_std": 0.5189203917980194, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1967291608452797, "step": 3085, "zero_std_ratio": 0.0 }, { "epoch": 0.2753021990276105, "grad_norm": 451.9112243652344, "learning_rate": 3e-06, "loss": 2.5885, "step": 3086 }, { "epoch": 0.2753914090726616, "grad_norm": 506.1065673828125, "learning_rate": 3e-06, "loss": 6.4191, "step": 3087 }, { "epoch": 0.27548061911771266, "grad_norm": 386.3024597167969, "learning_rate": 3e-06, "loss": -7.6148, "step": 3088 }, { "epoch": 0.27556982916276374, "grad_norm": 763.252685546875, "learning_rate": 3e-06, "loss": -2.6826, "step": 3089 }, { "epoch": 0.2756590392078148, "grad_norm": 488.9563293457031, "learning_rate": 3e-06, "loss": -8.4882, "step": 3090 }, { "epoch": 0.2757482492528659, "grad_norm": 1068.6734619140625, "learning_rate": 3e-06, "loss": 12.4432, "step": 3091 }, { "epoch": 0.2758374592979169, "grad_norm": 339.1369934082031, "learning_rate": 3e-06, "loss": -4.5047, "step": 3092 }, { "epoch": 0.275926669342968, "grad_norm": 364.6488342285156, "learning_rate": 3e-06, "loss": -1.7684, "step": 3093 }, { "epoch": 0.2760158793880191, "grad_norm": 338.7989807128906, "learning_rate": 3e-06, "loss": -13.8045, "step": 3094 }, { "epoch": 0.27610508943307016, "grad_norm": 449.4442443847656, "learning_rate": 3e-06, "loss": -12.7494, "step": 3095 }, { "epoch": 0.27619429947812124, "grad_norm": 282.933837890625, "learning_rate": 3e-06, "loss": -9.7291, "step": 3096 }, { "completion_length": 157.00000762939453, "epoch": 0.2762835095231723, "grad_norm": 554.0556640625, "learning_rate": 3e-06, "loss": 27.0297, "reward": 2.5507707595825195, "reward_std": 0.27092354744672775, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1341041661798954, "step": 3097, "zero_std_ratio": 0.0 }, { "epoch": 0.2763727195682234, "grad_norm": 586.5379028320312, "learning_rate": 3e-06, "loss": 20.3531, "step": 3098 }, { "epoch": 0.2764619296132745, "grad_norm": 491.3218078613281, "learning_rate": 3e-06, "loss": 57.8648, "step": 3099 }, { "epoch": 0.2765511396583255, "grad_norm": 500.4717712402344, "learning_rate": 3e-06, "loss": 54.587, "step": 3100 }, { "epoch": 0.2766403497033766, "grad_norm": 1038.388427734375, "learning_rate": 3e-06, "loss": -22.3984, "step": 3101 }, { "epoch": 0.27672955974842767, "grad_norm": 518.1480102539062, "learning_rate": 3e-06, "loss": 21.9942, "step": 3102 }, { "epoch": 0.27681876979347875, "grad_norm": 645.0948486328125, "learning_rate": 3e-06, "loss": 20.7679, "step": 3103 }, { "epoch": 0.2769079798385298, "grad_norm": 721.4098510742188, "learning_rate": 3e-06, "loss": 13.2672, "step": 3104 }, { "epoch": 0.2769971898835809, "grad_norm": 701.0861206054688, "learning_rate": 3e-06, "loss": 54.3506, "step": 3105 }, { "epoch": 0.277086399928632, "grad_norm": 476.0610656738281, "learning_rate": 3e-06, "loss": 45.4997, "step": 3106 }, { "epoch": 0.277175609973683, "grad_norm": 416.7836608886719, "learning_rate": 3e-06, "loss": -33.7216, "step": 3107 }, { "epoch": 0.2772648200187341, "grad_norm": 598.7232666015625, "learning_rate": 3e-06, "loss": 13.1185, "step": 3108 }, { "completion_length": 135.25000762939453, "epoch": 0.2773540300637852, "grad_norm": 1037.9493408203125, "learning_rate": 3e-06, "loss": -86.4669, "reward": 2.482937455177307, "reward_std": 0.3366389195434749, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1496041640639305, "step": 3109, "zero_std_ratio": 0.0 }, { "epoch": 0.27744324010883625, "grad_norm": 956.6131591796875, "learning_rate": 3e-06, "loss": -123.449, "step": 3110 }, { "epoch": 0.27753245015388733, "grad_norm": 800.9594116210938, "learning_rate": 3e-06, "loss": -117.6646, "step": 3111 }, { "epoch": 0.2776216601989384, "grad_norm": 986.456298828125, "learning_rate": 3e-06, "loss": -182.4949, "step": 3112 }, { "epoch": 0.2777108702439895, "grad_norm": 812.1907348632812, "learning_rate": 3e-06, "loss": -85.4342, "step": 3113 }, { "epoch": 0.2778000802890405, "grad_norm": 938.8145751953125, "learning_rate": 3e-06, "loss": -82.5111, "step": 3114 }, { "epoch": 0.2778892903340916, "grad_norm": 871.9295654296875, "learning_rate": 3e-06, "loss": -93.9149, "step": 3115 }, { "epoch": 0.2779785003791427, "grad_norm": 744.1700439453125, "learning_rate": 3e-06, "loss": -121.0674, "step": 3116 }, { "epoch": 0.27806771042419376, "grad_norm": 732.523193359375, "learning_rate": 3e-06, "loss": -124.3064, "step": 3117 }, { "epoch": 0.27815692046924484, "grad_norm": 1220.0933837890625, "learning_rate": 3e-06, "loss": -187.1621, "step": 3118 }, { "epoch": 0.2782461305142959, "grad_norm": 835.1678466796875, "learning_rate": 3e-06, "loss": -95.6963, "step": 3119 }, { "epoch": 0.278335340559347, "grad_norm": 766.2755126953125, "learning_rate": 3e-06, "loss": -92.651, "step": 3120 }, { "completion_length": 129.9583396911621, "epoch": 0.2784245506043981, "grad_norm": 173.5714111328125, "learning_rate": 3e-06, "loss": 2.5545, "reward": 2.3773958683013916, "reward_std": 0.324051845818758, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16906250268220901, "step": 3121, "zero_std_ratio": 0.0 }, { "epoch": 0.2785137606494491, "grad_norm": 169.6839599609375, "learning_rate": 3e-06, "loss": 2.4794, "step": 3122 }, { "epoch": 0.2786029706945002, "grad_norm": 250.81739807128906, "learning_rate": 3e-06, "loss": 0.8429, "step": 3123 }, { "epoch": 0.27869218073955127, "grad_norm": 295.755859375, "learning_rate": 3e-06, "loss": 18.8133, "step": 3124 }, { "epoch": 0.27878139078460235, "grad_norm": 256.17169189453125, "learning_rate": 3e-06, "loss": 4.0764, "step": 3125 }, { "epoch": 0.2788706008296534, "grad_norm": 256.9142150878906, "learning_rate": 3e-06, "loss": 0.1387, "step": 3126 }, { "epoch": 0.2789598108747045, "grad_norm": 181.0319061279297, "learning_rate": 3e-06, "loss": 0.261, "step": 3127 }, { "epoch": 0.2790490209197556, "grad_norm": 214.18463134765625, "learning_rate": 3e-06, "loss": 1.0267, "step": 3128 }, { "epoch": 0.2791382309648066, "grad_norm": 256.59930419921875, "learning_rate": 3e-06, "loss": -1.8326, "step": 3129 }, { "epoch": 0.2792274410098577, "grad_norm": 304.7680358886719, "learning_rate": 3e-06, "loss": 14.8539, "step": 3130 }, { "epoch": 0.27931665105490877, "grad_norm": 225.83462524414062, "learning_rate": 3e-06, "loss": 2.0682, "step": 3131 }, { "epoch": 0.27940586109995985, "grad_norm": 236.8647918701172, "learning_rate": 3e-06, "loss": -1.0186, "step": 3132 }, { "completion_length": 111.39583587646484, "epoch": 0.27949507114501093, "grad_norm": 634.8365478515625, "learning_rate": 3e-06, "loss": 20.6641, "reward": 2.5326459407806396, "reward_std": 0.4156091511249542, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1993125006556511, "step": 3133, "zero_std_ratio": 0.0 }, { "epoch": 0.279584281190062, "grad_norm": 448.44091796875, "learning_rate": 3e-06, "loss": 10.6298, "step": 3134 }, { "epoch": 0.2796734912351131, "grad_norm": 390.3191833496094, "learning_rate": 3e-06, "loss": 13.2447, "step": 3135 }, { "epoch": 0.27976270128016417, "grad_norm": 512.5096435546875, "learning_rate": 3e-06, "loss": 37.3883, "step": 3136 }, { "epoch": 0.2798519113252152, "grad_norm": 478.20611572265625, "learning_rate": 3e-06, "loss": 24.3938, "step": 3137 }, { "epoch": 0.2799411213702663, "grad_norm": 556.0982666015625, "learning_rate": 3e-06, "loss": 18.3117, "step": 3138 }, { "epoch": 0.28003033141531736, "grad_norm": 663.95068359375, "learning_rate": 3e-06, "loss": 16.8996, "step": 3139 }, { "epoch": 0.28011954146036844, "grad_norm": 361.2245788574219, "learning_rate": 3e-06, "loss": 6.8645, "step": 3140 }, { "epoch": 0.2802087515054195, "grad_norm": 298.50439453125, "learning_rate": 3e-06, "loss": 9.3322, "step": 3141 }, { "epoch": 0.2802979615504706, "grad_norm": 1618.78271484375, "learning_rate": 3e-06, "loss": 34.6681, "step": 3142 }, { "epoch": 0.2803871715955217, "grad_norm": 412.96124267578125, "learning_rate": 3e-06, "loss": 18.2687, "step": 3143 }, { "epoch": 0.2804763816405727, "grad_norm": 554.4656982421875, "learning_rate": 3e-06, "loss": 12.3581, "step": 3144 }, { "completion_length": 128.45833587646484, "epoch": 0.2805655916856238, "grad_norm": 287.9043273925781, "learning_rate": 3e-06, "loss": 4.5874, "reward": 2.5003544092178345, "reward_std": 0.36615208350121975, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16702084243297577, "step": 3145, "zero_std_ratio": 0.0 }, { "epoch": 0.28065480173067486, "grad_norm": 258.46197509765625, "learning_rate": 3e-06, "loss": -0.7948, "step": 3146 }, { "epoch": 0.28074401177572594, "grad_norm": 780.818115234375, "learning_rate": 3e-06, "loss": 0.1238, "step": 3147 }, { "epoch": 0.280833221820777, "grad_norm": 335.89825439453125, "learning_rate": 3e-06, "loss": -21.2792, "step": 3148 }, { "epoch": 0.2809224318658281, "grad_norm": 251.9041290283203, "learning_rate": 3e-06, "loss": -1.1268, "step": 3149 }, { "epoch": 0.2810116419108792, "grad_norm": 214.9091796875, "learning_rate": 3e-06, "loss": -4.6157, "step": 3150 }, { "epoch": 0.28110085195593026, "grad_norm": 252.64630126953125, "learning_rate": 3e-06, "loss": 1.5208, "step": 3151 }, { "epoch": 0.2811900620009813, "grad_norm": 310.5806579589844, "learning_rate": 3e-06, "loss": -3.5422, "step": 3152 }, { "epoch": 0.28127927204603237, "grad_norm": 746.1177978515625, "learning_rate": 3e-06, "loss": -3.3449, "step": 3153 }, { "epoch": 0.28136848209108345, "grad_norm": 449.2014465332031, "learning_rate": 3e-06, "loss": -24.2734, "step": 3154 }, { "epoch": 0.28145769213613453, "grad_norm": 242.75436401367188, "learning_rate": 3e-06, "loss": -3.2139, "step": 3155 }, { "epoch": 0.2815469021811856, "grad_norm": 235.21990966796875, "learning_rate": 3e-06, "loss": -5.174, "step": 3156 }, { "completion_length": 144.6041717529297, "epoch": 0.2816361122262367, "grad_norm": 2443.907958984375, "learning_rate": 3e-06, "loss": 92.9171, "reward": 2.298208475112915, "reward_std": 0.438069611787796, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0898750051856041, "step": 3157, "zero_std_ratio": 0.0 }, { "epoch": 0.28172532227128777, "grad_norm": 2332.5009765625, "learning_rate": 3e-06, "loss": 5.1398, "step": 3158 }, { "epoch": 0.2818145323163388, "grad_norm": 2053.478271484375, "learning_rate": 3e-06, "loss": -141.9143, "step": 3159 }, { "epoch": 0.2819037423613899, "grad_norm": 1320.920654296875, "learning_rate": 3e-06, "loss": -168.4594, "step": 3160 }, { "epoch": 0.28199295240644096, "grad_norm": 1368.8167724609375, "learning_rate": 3e-06, "loss": 22.5075, "step": 3161 }, { "epoch": 0.28208216245149204, "grad_norm": 1353.4337158203125, "learning_rate": 3e-06, "loss": -20.4864, "step": 3162 }, { "epoch": 0.2821713724965431, "grad_norm": 2528.395263671875, "learning_rate": 3e-06, "loss": 54.1979, "step": 3163 }, { "epoch": 0.2822605825415942, "grad_norm": 1900.82421875, "learning_rate": 3e-06, "loss": -15.1804, "step": 3164 }, { "epoch": 0.2823497925866453, "grad_norm": 2002.454345703125, "learning_rate": 3e-06, "loss": -167.1213, "step": 3165 }, { "epoch": 0.28243900263169636, "grad_norm": 943.1581420898438, "learning_rate": 3e-06, "loss": -195.2964, "step": 3166 }, { "epoch": 0.2825282126767474, "grad_norm": 1444.2733154296875, "learning_rate": 3e-06, "loss": -6.4129, "step": 3167 }, { "epoch": 0.28261742272179846, "grad_norm": 1901.743896484375, "learning_rate": 3e-06, "loss": -46.1337, "step": 3168 }, { "completion_length": 170.8125, "epoch": 0.28270663276684954, "grad_norm": 221.38189697265625, "learning_rate": 3e-06, "loss": -24.2093, "reward": 1.9977917075157166, "reward_std": 0.19150124862790108, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06029167026281357, "step": 3169, "zero_std_ratio": 0.0 }, { "epoch": 0.2827958428119006, "grad_norm": 796.3472290039062, "learning_rate": 3e-06, "loss": -26.4905, "step": 3170 }, { "epoch": 0.2828850528569517, "grad_norm": 399.228271484375, "learning_rate": 3e-06, "loss": -47.5167, "step": 3171 }, { "epoch": 0.2829742629020028, "grad_norm": 864.2409057617188, "learning_rate": 3e-06, "loss": -19.7004, "step": 3172 }, { "epoch": 0.28306347294705386, "grad_norm": 698.4199829101562, "learning_rate": 3e-06, "loss": 10.421, "step": 3173 }, { "epoch": 0.2831526829921049, "grad_norm": 281.5791015625, "learning_rate": 3e-06, "loss": -7.3077, "step": 3174 }, { "epoch": 0.28324189303715597, "grad_norm": 250.7912139892578, "learning_rate": 3e-06, "loss": -26.8344, "step": 3175 }, { "epoch": 0.28333110308220705, "grad_norm": 503.819091796875, "learning_rate": 3e-06, "loss": -33.8434, "step": 3176 }, { "epoch": 0.2834203131272581, "grad_norm": 409.0915222167969, "learning_rate": 3e-06, "loss": -50.5725, "step": 3177 }, { "epoch": 0.2835095231723092, "grad_norm": 458.4985656738281, "learning_rate": 3e-06, "loss": -28.8809, "step": 3178 }, { "epoch": 0.2835987332173603, "grad_norm": 1216.4083251953125, "learning_rate": 3e-06, "loss": -3.0048, "step": 3179 }, { "epoch": 0.28368794326241137, "grad_norm": 229.0378875732422, "learning_rate": 3e-06, "loss": -11.2557, "step": 3180 }, { "completion_length": 113.41667175292969, "epoch": 0.2837771533074624, "grad_norm": 472.3091735839844, "learning_rate": 3e-06, "loss": 21.2478, "reward": 2.449875235557556, "reward_std": 0.33516695350408554, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19987501204013824, "step": 3181, "zero_std_ratio": 0.0 }, { "epoch": 0.2838663633525135, "grad_norm": 839.1354370117188, "learning_rate": 3e-06, "loss": 27.4166, "step": 3182 }, { "epoch": 0.28395557339756455, "grad_norm": 1058.7283935546875, "learning_rate": 3e-06, "loss": 36.2361, "step": 3183 }, { "epoch": 0.28404478344261563, "grad_norm": 306.9678649902344, "learning_rate": 3e-06, "loss": -19.0696, "step": 3184 }, { "epoch": 0.2841339934876667, "grad_norm": 431.45159912109375, "learning_rate": 3e-06, "loss": -4.6623, "step": 3185 }, { "epoch": 0.2842232035327178, "grad_norm": 594.4414672851562, "learning_rate": 3e-06, "loss": -2.7911, "step": 3186 }, { "epoch": 0.2843124135777689, "grad_norm": 436.5487060546875, "learning_rate": 3e-06, "loss": 16.6563, "step": 3187 }, { "epoch": 0.28440162362281995, "grad_norm": 651.9136352539062, "learning_rate": 3e-06, "loss": 19.0367, "step": 3188 }, { "epoch": 0.284490833667871, "grad_norm": 753.1431884765625, "learning_rate": 3e-06, "loss": 25.9058, "step": 3189 }, { "epoch": 0.28458004371292206, "grad_norm": 294.2620544433594, "learning_rate": 3e-06, "loss": -19.6903, "step": 3190 }, { "epoch": 0.28466925375797314, "grad_norm": 382.62408447265625, "learning_rate": 3e-06, "loss": -7.6563, "step": 3191 }, { "epoch": 0.2847584638030242, "grad_norm": 569.9071044921875, "learning_rate": 3e-06, "loss": -4.1669, "step": 3192 }, { "completion_length": 151.64583587646484, "epoch": 0.2848476738480753, "grad_norm": 454.8868103027344, "learning_rate": 3e-06, "loss": 27.5014, "reward": 1.8396041989326477, "reward_std": 0.14819328393787146, "rewards/correctness_reward_func": 1.2083333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17293750494718552, "step": 3193, "zero_std_ratio": 0.0 }, { "epoch": 0.2849368838931264, "grad_norm": 267.8363342285156, "learning_rate": 3e-06, "loss": 32.0825, "step": 3194 }, { "epoch": 0.28502609393817746, "grad_norm": 391.85369873046875, "learning_rate": 3e-06, "loss": 20.7338, "step": 3195 }, { "epoch": 0.2851153039832285, "grad_norm": 284.7150573730469, "learning_rate": 3e-06, "loss": 11.0047, "step": 3196 }, { "epoch": 0.28520451402827957, "grad_norm": 439.2987365722656, "learning_rate": 3e-06, "loss": 9.9788, "step": 3197 }, { "epoch": 0.28529372407333065, "grad_norm": 467.1644592285156, "learning_rate": 3e-06, "loss": 17.1824, "step": 3198 }, { "epoch": 0.2853829341183817, "grad_norm": 365.1625671386719, "learning_rate": 3e-06, "loss": 19.553, "step": 3199 }, { "epoch": 0.2854721441634328, "grad_norm": 423.6968994140625, "learning_rate": 3e-06, "loss": 25.0349, "step": 3200 }, { "epoch": 0.2855613542084839, "grad_norm": 270.33331298828125, "learning_rate": 3e-06, "loss": 12.1621, "step": 3201 }, { "epoch": 0.28565056425353497, "grad_norm": 268.3208923339844, "learning_rate": 3e-06, "loss": 4.2701, "step": 3202 }, { "epoch": 0.28573977429858605, "grad_norm": 100.51649475097656, "learning_rate": 3e-06, "loss": 3.6041, "step": 3203 }, { "epoch": 0.28582898434363707, "grad_norm": 107.8956298828125, "learning_rate": 3e-06, "loss": 10.4808, "step": 3204 }, { "completion_length": 141.22916793823242, "epoch": 0.28591819438868815, "grad_norm": 208.2997589111328, "learning_rate": 3e-06, "loss": 4.5493, "reward": 1.916812539100647, "reward_std": 0.49208760261535645, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12514584138989449, "step": 3205, "zero_std_ratio": 0.0 }, { "epoch": 0.28600740443373923, "grad_norm": 252.3931884765625, "learning_rate": 3e-06, "loss": -7.7603, "step": 3206 }, { "epoch": 0.2860966144787903, "grad_norm": 218.31919860839844, "learning_rate": 3e-06, "loss": 2.1983, "step": 3207 }, { "epoch": 0.2861858245238414, "grad_norm": 211.39637756347656, "learning_rate": 3e-06, "loss": 5.9205, "step": 3208 }, { "epoch": 0.2862750345688925, "grad_norm": 230.24545288085938, "learning_rate": 3e-06, "loss": 3.4065, "step": 3209 }, { "epoch": 0.28636424461394355, "grad_norm": 216.7148895263672, "learning_rate": 3e-06, "loss": -19.7156, "step": 3210 }, { "epoch": 0.2864534546589946, "grad_norm": 203.72927856445312, "learning_rate": 3e-06, "loss": 1.3779, "step": 3211 }, { "epoch": 0.28654266470404566, "grad_norm": 262.5057067871094, "learning_rate": 3e-06, "loss": -12.2007, "step": 3212 }, { "epoch": 0.28663187474909674, "grad_norm": 260.7509765625, "learning_rate": 3e-06, "loss": 0.383, "step": 3213 }, { "epoch": 0.2867210847941478, "grad_norm": 209.00767517089844, "learning_rate": 3e-06, "loss": 3.1272, "step": 3214 }, { "epoch": 0.2868102948391989, "grad_norm": 199.4011993408203, "learning_rate": 3e-06, "loss": 2.5939, "step": 3215 }, { "epoch": 0.28689950488425, "grad_norm": 191.7748565673828, "learning_rate": 3e-06, "loss": -21.0517, "step": 3216 }, { "completion_length": 149.375, "epoch": 0.28698871492930106, "grad_norm": 147.71131896972656, "learning_rate": 3e-06, "loss": -5.1247, "reward": 2.2130000591278076, "reward_std": 0.3840484172105789, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11925000324845314, "step": 3217, "zero_std_ratio": 0.0 }, { "epoch": 0.28707792497435214, "grad_norm": 156.00924682617188, "learning_rate": 3e-06, "loss": 3.5745, "step": 3218 }, { "epoch": 0.28716713501940316, "grad_norm": 176.4787139892578, "learning_rate": 3e-06, "loss": -4.1422, "step": 3219 }, { "epoch": 0.28725634506445424, "grad_norm": 146.17117309570312, "learning_rate": 3e-06, "loss": 2.1965, "step": 3220 }, { "epoch": 0.2873455551095053, "grad_norm": 163.52188110351562, "learning_rate": 3e-06, "loss": 2.327, "step": 3221 }, { "epoch": 0.2874347651545564, "grad_norm": 190.82752990722656, "learning_rate": 3e-06, "loss": -2.9467, "step": 3222 }, { "epoch": 0.2875239751996075, "grad_norm": 143.25057983398438, "learning_rate": 3e-06, "loss": -5.9213, "step": 3223 }, { "epoch": 0.28761318524465856, "grad_norm": 171.8119659423828, "learning_rate": 3e-06, "loss": 2.7104, "step": 3224 }, { "epoch": 0.28770239528970964, "grad_norm": 247.70462036132812, "learning_rate": 3e-06, "loss": -5.3113, "step": 3225 }, { "epoch": 0.28779160533476067, "grad_norm": 130.56521606445312, "learning_rate": 3e-06, "loss": 1.0596, "step": 3226 }, { "epoch": 0.28788081537981175, "grad_norm": 129.9229736328125, "learning_rate": 3e-06, "loss": 0.1748, "step": 3227 }, { "epoch": 0.28797002542486283, "grad_norm": 204.13160705566406, "learning_rate": 3e-06, "loss": -3.702, "step": 3228 }, { "completion_length": 122.18750381469727, "epoch": 0.2880592354699139, "grad_norm": 71.31390380859375, "learning_rate": 3e-06, "loss": 0.6841, "reward": 2.659354329109192, "reward_std": 0.1278772410005331, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20102083683013916, "step": 3229, "zero_std_ratio": 0.0 }, { "epoch": 0.288148445514965, "grad_norm": 83.12515258789062, "learning_rate": 3e-06, "loss": -1.8636, "step": 3230 }, { "epoch": 0.28823765556001607, "grad_norm": 80.56033325195312, "learning_rate": 3e-06, "loss": -1.1191, "step": 3231 }, { "epoch": 0.28832686560506715, "grad_norm": 163.4371337890625, "learning_rate": 3e-06, "loss": -0.821, "step": 3232 }, { "epoch": 0.28841607565011823, "grad_norm": 133.67848205566406, "learning_rate": 3e-06, "loss": -6.9013, "step": 3233 }, { "epoch": 0.28850528569516926, "grad_norm": 99.9486312866211, "learning_rate": 3e-06, "loss": -0.6863, "step": 3234 }, { "epoch": 0.28859449574022034, "grad_norm": 106.5368423461914, "learning_rate": 3e-06, "loss": 0.9028, "step": 3235 }, { "epoch": 0.2886837057852714, "grad_norm": 82.39705657958984, "learning_rate": 3e-06, "loss": -2.3538, "step": 3236 }, { "epoch": 0.2887729158303225, "grad_norm": 74.2812728881836, "learning_rate": 3e-06, "loss": -1.6313, "step": 3237 }, { "epoch": 0.2888621258753736, "grad_norm": 150.24708557128906, "learning_rate": 3e-06, "loss": -2.0579, "step": 3238 }, { "epoch": 0.28895133592042466, "grad_norm": 124.14253234863281, "learning_rate": 3e-06, "loss": -8.917, "step": 3239 }, { "epoch": 0.28904054596547574, "grad_norm": 95.86636352539062, "learning_rate": 3e-06, "loss": -2.0824, "step": 3240 }, { "completion_length": 143.9166717529297, "epoch": 0.28912975601052676, "grad_norm": 990.1576538085938, "learning_rate": 3e-06, "loss": -37.2769, "reward": 1.797458291053772, "reward_std": 0.7053342461585999, "rewards/correctness_reward_func": 1.2083333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09954165969975293, "step": 3241, "zero_std_ratio": 0.0 }, { "epoch": 0.28921896605557784, "grad_norm": 404.61358642578125, "learning_rate": 3e-06, "loss": -26.8977, "step": 3242 }, { "epoch": 0.2893081761006289, "grad_norm": 845.18701171875, "learning_rate": 3e-06, "loss": -102.4074, "step": 3243 }, { "epoch": 0.28939738614568, "grad_norm": 717.4209594726562, "learning_rate": 3e-06, "loss": -36.9436, "step": 3244 }, { "epoch": 0.2894865961907311, "grad_norm": 991.8800048828125, "learning_rate": 3e-06, "loss": -42.9548, "step": 3245 }, { "epoch": 0.28957580623578216, "grad_norm": 1358.9937744140625, "learning_rate": 3e-06, "loss": -64.8945, "step": 3246 }, { "epoch": 0.28966501628083324, "grad_norm": 807.1321411132812, "learning_rate": 3e-06, "loss": -47.5419, "step": 3247 }, { "epoch": 0.28975422632588427, "grad_norm": 586.22998046875, "learning_rate": 3e-06, "loss": -34.8822, "step": 3248 }, { "epoch": 0.28984343637093535, "grad_norm": 970.878173828125, "learning_rate": 3e-06, "loss": -123.8926, "step": 3249 }, { "epoch": 0.28993264641598643, "grad_norm": 694.0813598632812, "learning_rate": 3e-06, "loss": -48.5033, "step": 3250 }, { "epoch": 0.2900218564610375, "grad_norm": 1660.216796875, "learning_rate": 3e-06, "loss": -79.0184, "step": 3251 }, { "epoch": 0.2901110665060886, "grad_norm": 1258.2376708984375, "learning_rate": 3e-06, "loss": -102.3825, "step": 3252 }, { "completion_length": 147.95833587646484, "epoch": 0.29020027655113967, "grad_norm": 170.14231872558594, "learning_rate": 3e-06, "loss": -16.6486, "reward": 2.226270914077759, "reward_std": 0.45757677406072617, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09085416607558727, "step": 3253, "zero_std_ratio": 0.125 }, { "epoch": 0.29028948659619075, "grad_norm": 297.652587890625, "learning_rate": 3e-06, "loss": 25.3992, "step": 3254 }, { "epoch": 0.29037869664124183, "grad_norm": 324.4555969238281, "learning_rate": 3e-06, "loss": 10.0792, "step": 3255 }, { "epoch": 0.29046790668629285, "grad_norm": 353.401123046875, "learning_rate": 3e-06, "loss": -7.4984, "step": 3256 }, { "epoch": 0.29055711673134393, "grad_norm": 231.3075408935547, "learning_rate": 3e-06, "loss": -9.5692, "step": 3257 }, { "epoch": 0.290646326776395, "grad_norm": 390.5084228515625, "learning_rate": 3e-06, "loss": -10.4636, "step": 3258 }, { "epoch": 0.2907355368214461, "grad_norm": 296.0336608886719, "learning_rate": 3e-06, "loss": -21.1044, "step": 3259 }, { "epoch": 0.2908247468664972, "grad_norm": 330.7455139160156, "learning_rate": 3e-06, "loss": 20.7091, "step": 3260 }, { "epoch": 0.29091395691154825, "grad_norm": 427.682861328125, "learning_rate": 3e-06, "loss": 10.5583, "step": 3261 }, { "epoch": 0.29100316695659934, "grad_norm": 574.2042236328125, "learning_rate": 3e-06, "loss": -20.2204, "step": 3262 }, { "epoch": 0.29109237700165036, "grad_norm": 782.3720092773438, "learning_rate": 3e-06, "loss": -22.3625, "step": 3263 }, { "epoch": 0.29118158704670144, "grad_norm": 860.0758056640625, "learning_rate": 3e-06, "loss": -27.5065, "step": 3264 }, { "completion_length": 112.60417175292969, "epoch": 0.2912707970917525, "grad_norm": 367.41717529296875, "learning_rate": 3e-06, "loss": -22.1894, "reward": 2.3418959379196167, "reward_std": 0.4054219573736191, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20647918432950974, "step": 3265, "zero_std_ratio": 0.0 }, { "epoch": 0.2913600071368036, "grad_norm": 708.2421875, "learning_rate": 3e-06, "loss": -28.1032, "step": 3266 }, { "epoch": 0.2914492171818547, "grad_norm": 488.5195617675781, "learning_rate": 3e-06, "loss": -40.8457, "step": 3267 }, { "epoch": 0.29153842722690576, "grad_norm": 529.2683715820312, "learning_rate": 3e-06, "loss": -16.1758, "step": 3268 }, { "epoch": 0.29162763727195684, "grad_norm": 403.1246032714844, "learning_rate": 3e-06, "loss": -28.7215, "step": 3269 }, { "epoch": 0.2917168473170079, "grad_norm": 550.165771484375, "learning_rate": 3e-06, "loss": -14.4839, "step": 3270 }, { "epoch": 0.29180605736205895, "grad_norm": 495.1943664550781, "learning_rate": 3e-06, "loss": -31.1421, "step": 3271 }, { "epoch": 0.29189526740711, "grad_norm": 458.0103759765625, "learning_rate": 3e-06, "loss": -42.1751, "step": 3272 }, { "epoch": 0.2919844774521611, "grad_norm": 510.6725158691406, "learning_rate": 3e-06, "loss": -55.8295, "step": 3273 }, { "epoch": 0.2920736874972122, "grad_norm": 829.9152221679688, "learning_rate": 3e-06, "loss": -32.635, "step": 3274 }, { "epoch": 0.29216289754226327, "grad_norm": 490.6168518066406, "learning_rate": 3e-06, "loss": -41.7228, "step": 3275 }, { "epoch": 0.29225210758731435, "grad_norm": 654.599609375, "learning_rate": 3e-06, "loss": -32.2169, "step": 3276 }, { "completion_length": 122.04167175292969, "epoch": 0.2923413176323654, "grad_norm": 1395.3751220703125, "learning_rate": 3e-06, "loss": -45.335, "reward": 2.4845833778381348, "reward_std": 0.5396375358104706, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22416667640209198, "step": 3277, "zero_std_ratio": 0.0 }, { "epoch": 0.29243052767741645, "grad_norm": 1533.7100830078125, "learning_rate": 3e-06, "loss": -59.5225, "step": 3278 }, { "epoch": 0.29251973772246753, "grad_norm": 1490.9530029296875, "learning_rate": 3e-06, "loss": -76.7689, "step": 3279 }, { "epoch": 0.2926089477675186, "grad_norm": 1361.626953125, "learning_rate": 3e-06, "loss": -142.2738, "step": 3280 }, { "epoch": 0.2926981578125697, "grad_norm": 1284.0633544921875, "learning_rate": 3e-06, "loss": -90.7309, "step": 3281 }, { "epoch": 0.2927873678576208, "grad_norm": 1101.373291015625, "learning_rate": 3e-06, "loss": -46.592, "step": 3282 }, { "epoch": 0.29287657790267185, "grad_norm": 1388.462158203125, "learning_rate": 3e-06, "loss": -87.2955, "step": 3283 }, { "epoch": 0.29296578794772293, "grad_norm": 1650.6728515625, "learning_rate": 3e-06, "loss": -109.8586, "step": 3284 }, { "epoch": 0.293054997992774, "grad_norm": 1382.8204345703125, "learning_rate": 3e-06, "loss": -130.8093, "step": 3285 }, { "epoch": 0.29314420803782504, "grad_norm": 1356.60107421875, "learning_rate": 3e-06, "loss": -191.3394, "step": 3286 }, { "epoch": 0.2932334180828761, "grad_norm": 1137.9412841796875, "learning_rate": 3e-06, "loss": -139.0338, "step": 3287 }, { "epoch": 0.2933226281279272, "grad_norm": 2433.387451171875, "learning_rate": 3e-06, "loss": -102.1352, "step": 3288 }, { "completion_length": 132.58333587646484, "epoch": 0.2934118381729783, "grad_norm": 2851.475341796875, "learning_rate": 3e-06, "loss": 692.6053, "reward": 1.8479167222976685, "reward_std": 0.5779125392436981, "rewards/correctness_reward_func": 1.2083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13958332687616348, "step": 3289, "zero_std_ratio": 0.0 }, { "epoch": 0.29350104821802936, "grad_norm": 2359.300537109375, "learning_rate": 3e-06, "loss": 509.5677, "step": 3290 }, { "epoch": 0.29359025826308044, "grad_norm": 2929.178955078125, "learning_rate": 3e-06, "loss": 459.7925, "step": 3291 }, { "epoch": 0.2936794683081315, "grad_norm": 2546.384765625, "learning_rate": 3e-06, "loss": 418.7861, "step": 3292 }, { "epoch": 0.29376867835318254, "grad_norm": 2023.1873779296875, "learning_rate": 3e-06, "loss": 607.5456, "step": 3293 }, { "epoch": 0.2938578883982336, "grad_norm": 2562.637939453125, "learning_rate": 3e-06, "loss": 321.3831, "step": 3294 }, { "epoch": 0.2939470984432847, "grad_norm": 2365.257080078125, "learning_rate": 3e-06, "loss": 692.0815, "step": 3295 }, { "epoch": 0.2940363084883358, "grad_norm": 2739.76904296875, "learning_rate": 3e-06, "loss": 501.0413, "step": 3296 }, { "epoch": 0.29412551853338686, "grad_norm": 2638.45654296875, "learning_rate": 3e-06, "loss": 431.3708, "step": 3297 }, { "epoch": 0.29421472857843795, "grad_norm": 2472.531005859375, "learning_rate": 3e-06, "loss": 375.7653, "step": 3298 }, { "epoch": 0.294303938623489, "grad_norm": 1980.08203125, "learning_rate": 3e-06, "loss": 588.7766, "step": 3299 }, { "epoch": 0.29439314866854005, "grad_norm": 2077.862548828125, "learning_rate": 3e-06, "loss": 287.4859, "step": 3300 }, { "completion_length": 135.12500762939453, "epoch": 0.29448235871359113, "grad_norm": 1344.27001953125, "learning_rate": 3e-06, "loss": -25.4379, "reward": 2.2621041536331177, "reward_std": 0.5536149442195892, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14752083271741867, "step": 3301, "zero_std_ratio": 0.0 }, { "epoch": 0.2945715687586422, "grad_norm": 1298.08154296875, "learning_rate": 3e-06, "loss": 132.2727, "step": 3302 }, { "epoch": 0.2946607788036933, "grad_norm": 1458.47021484375, "learning_rate": 3e-06, "loss": 93.6915, "step": 3303 }, { "epoch": 0.29474998884874437, "grad_norm": 861.47705078125, "learning_rate": 3e-06, "loss": 46.3207, "step": 3304 }, { "epoch": 0.29483919889379545, "grad_norm": 986.2140502929688, "learning_rate": 3e-06, "loss": 38.3626, "step": 3305 }, { "epoch": 0.29492840893884653, "grad_norm": 1047.131103515625, "learning_rate": 3e-06, "loss": 100.2296, "step": 3306 }, { "epoch": 0.2950176189838976, "grad_norm": 1278.004638671875, "learning_rate": 3e-06, "loss": -33.0876, "step": 3307 }, { "epoch": 0.29510682902894864, "grad_norm": 1247.85546875, "learning_rate": 3e-06, "loss": 127.7417, "step": 3308 }, { "epoch": 0.2951960390739997, "grad_norm": 1768.7672119140625, "learning_rate": 3e-06, "loss": 76.4776, "step": 3309 }, { "epoch": 0.2952852491190508, "grad_norm": 1382.069091796875, "learning_rate": 3e-06, "loss": 40.8143, "step": 3310 }, { "epoch": 0.2953744591641019, "grad_norm": 1118.470703125, "learning_rate": 3e-06, "loss": 32.1421, "step": 3311 }, { "epoch": 0.29546366920915296, "grad_norm": 1096.5416259765625, "learning_rate": 3e-06, "loss": 91.3316, "step": 3312 }, { "completion_length": 123.5, "epoch": 0.29555287925420404, "grad_norm": 1611.606689453125, "learning_rate": 3e-06, "loss": -47.9054, "reward": 2.441854238510132, "reward_std": 0.20054445415735245, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19185416400432587, "step": 3313, "zero_std_ratio": 0.125 }, { "epoch": 0.2956420892992551, "grad_norm": 1572.578857421875, "learning_rate": 3e-06, "loss": -43.4335, "step": 3314 }, { "epoch": 0.29573129934430614, "grad_norm": 1769.008056640625, "learning_rate": 3e-06, "loss": -99.8678, "step": 3315 }, { "epoch": 0.2958205093893572, "grad_norm": 1092.047119140625, "learning_rate": 3e-06, "loss": -51.9967, "step": 3316 }, { "epoch": 0.2959097194344083, "grad_norm": 1475.814697265625, "learning_rate": 3e-06, "loss": -10.7728, "step": 3317 }, { "epoch": 0.2959989294794594, "grad_norm": 1677.0244140625, "learning_rate": 3e-06, "loss": -43.9897, "step": 3318 }, { "epoch": 0.29608813952451046, "grad_norm": 1983.81103515625, "learning_rate": 3e-06, "loss": -70.9193, "step": 3319 }, { "epoch": 0.29617734956956154, "grad_norm": 1700.870361328125, "learning_rate": 3e-06, "loss": -52.6147, "step": 3320 }, { "epoch": 0.2962665596146126, "grad_norm": 1310.3284912109375, "learning_rate": 3e-06, "loss": -114.1415, "step": 3321 }, { "epoch": 0.2963557696596637, "grad_norm": 970.1644287109375, "learning_rate": 3e-06, "loss": -56.8137, "step": 3322 }, { "epoch": 0.29644497970471473, "grad_norm": 1543.35888671875, "learning_rate": 3e-06, "loss": -27.2239, "step": 3323 }, { "epoch": 0.2965341897497658, "grad_norm": 1682.4708251953125, "learning_rate": 3e-06, "loss": -59.0936, "step": 3324 }, { "completion_length": 164.1041717529297, "epoch": 0.2966233997948169, "grad_norm": 2562.382080078125, "learning_rate": 3e-06, "loss": -137.0939, "reward": 2.0416667461395264, "reward_std": 0.612931102514267, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333320915699, "step": 3325, "zero_std_ratio": 0.0 }, { "epoch": 0.29671260983986797, "grad_norm": 1285.0123291015625, "learning_rate": 3e-06, "loss": -51.5861, "step": 3326 }, { "epoch": 0.29680181988491905, "grad_norm": 1550.627685546875, "learning_rate": 3e-06, "loss": 13.2072, "step": 3327 }, { "epoch": 0.29689102992997013, "grad_norm": 2237.448974609375, "learning_rate": 3e-06, "loss": -65.6572, "step": 3328 }, { "epoch": 0.2969802399750212, "grad_norm": 1952.9339599609375, "learning_rate": 3e-06, "loss": -72.0183, "step": 3329 }, { "epoch": 0.29706945002007223, "grad_norm": 5932.919921875, "learning_rate": 3e-06, "loss": -131.6846, "step": 3330 }, { "epoch": 0.2971586600651233, "grad_norm": 4180.40283203125, "learning_rate": 3e-06, "loss": -167.2603, "step": 3331 }, { "epoch": 0.2972478701101744, "grad_norm": 1364.23974609375, "learning_rate": 3e-06, "loss": -53.2719, "step": 3332 }, { "epoch": 0.2973370801552255, "grad_norm": 1608.9691162109375, "learning_rate": 3e-06, "loss": 2.6519, "step": 3333 }, { "epoch": 0.29742629020027656, "grad_norm": 2299.852783203125, "learning_rate": 3e-06, "loss": -68.5413, "step": 3334 }, { "epoch": 0.29751550024532764, "grad_norm": 1593.8436279296875, "learning_rate": 3e-06, "loss": -75.5035, "step": 3335 }, { "epoch": 0.2976047102903787, "grad_norm": 2549.036376953125, "learning_rate": 3e-06, "loss": -126.9194, "step": 3336 }, { "completion_length": 126.83333587646484, "epoch": 0.2976939203354298, "grad_norm": 2272.15185546875, "learning_rate": 3e-06, "loss": -57.5032, "reward": 1.988270878791809, "reward_std": 0.5164491832256317, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1445208340883255, "step": 3337, "zero_std_ratio": 0.0 }, { "epoch": 0.2977831303804808, "grad_norm": 2061.87841796875, "learning_rate": 3e-06, "loss": -135.5704, "step": 3338 }, { "epoch": 0.2978723404255319, "grad_norm": 2686.571533203125, "learning_rate": 3e-06, "loss": -172.322, "step": 3339 }, { "epoch": 0.297961550470583, "grad_norm": 4018.337890625, "learning_rate": 3e-06, "loss": -105.3649, "step": 3340 }, { "epoch": 0.29805076051563406, "grad_norm": 1975.170166015625, "learning_rate": 3e-06, "loss": -102.7281, "step": 3341 }, { "epoch": 0.29813997056068514, "grad_norm": 2352.061767578125, "learning_rate": 3e-06, "loss": -68.474, "step": 3342 }, { "epoch": 0.2982291806057362, "grad_norm": 2748.470947265625, "learning_rate": 3e-06, "loss": -95.8487, "step": 3343 }, { "epoch": 0.2983183906507873, "grad_norm": 1501.2760009765625, "learning_rate": 3e-06, "loss": -163.1516, "step": 3344 }, { "epoch": 0.2984076006958383, "grad_norm": 2639.54833984375, "learning_rate": 3e-06, "loss": -225.3082, "step": 3345 }, { "epoch": 0.2984968107408894, "grad_norm": 1705.81689453125, "learning_rate": 3e-06, "loss": -181.0758, "step": 3346 }, { "epoch": 0.2985860207859405, "grad_norm": 1626.8902587890625, "learning_rate": 3e-06, "loss": -144.4269, "step": 3347 }, { "epoch": 0.29867523083099157, "grad_norm": 2255.9111328125, "learning_rate": 3e-06, "loss": -129.8865, "step": 3348 }, { "completion_length": 131.20834350585938, "epoch": 0.29876444087604265, "grad_norm": 128.3455352783203, "learning_rate": 3e-06, "loss": -2.5903, "reward": 2.038541793823242, "reward_std": 0.193179689347744, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16354165971279144, "step": 3349, "zero_std_ratio": 0.0 }, { "epoch": 0.2988536509210937, "grad_norm": 183.58480834960938, "learning_rate": 3e-06, "loss": -1.5293, "step": 3350 }, { "epoch": 0.2989428609661448, "grad_norm": 225.32484436035156, "learning_rate": 3e-06, "loss": 1.4312, "step": 3351 }, { "epoch": 0.2990320710111959, "grad_norm": 220.1730194091797, "learning_rate": 3e-06, "loss": -6.048, "step": 3352 }, { "epoch": 0.2991212810562469, "grad_norm": 198.416748046875, "learning_rate": 3e-06, "loss": 6.2046, "step": 3353 }, { "epoch": 0.299210491101298, "grad_norm": 227.38864135742188, "learning_rate": 3e-06, "loss": -3.3866, "step": 3354 }, { "epoch": 0.2992997011463491, "grad_norm": 176.162841796875, "learning_rate": 3e-06, "loss": -3.7381, "step": 3355 }, { "epoch": 0.29938891119140015, "grad_norm": 224.11245727539062, "learning_rate": 3e-06, "loss": -3.5747, "step": 3356 }, { "epoch": 0.29947812123645123, "grad_norm": 189.82000732421875, "learning_rate": 3e-06, "loss": -1.7388, "step": 3357 }, { "epoch": 0.2995673312815023, "grad_norm": 248.49205017089844, "learning_rate": 3e-06, "loss": -7.9419, "step": 3358 }, { "epoch": 0.2996565413265534, "grad_norm": 207.15567016601562, "learning_rate": 3e-06, "loss": 4.2332, "step": 3359 }, { "epoch": 0.2997457513716044, "grad_norm": 347.9344787597656, "learning_rate": 3e-06, "loss": -7.4492, "step": 3360 }, { "completion_length": 134.1875, "epoch": 0.2998349614166555, "grad_norm": 459.2406921386719, "learning_rate": 3e-06, "loss": -9.2619, "reward": 2.0878958702087402, "reward_std": 0.18173200264573097, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17122916132211685, "step": 3361, "zero_std_ratio": 0.0 }, { "epoch": 0.2999241714617066, "grad_norm": 603.4501342773438, "learning_rate": 3e-06, "loss": 5.5084, "step": 3362 }, { "epoch": 0.30001338150675766, "grad_norm": 493.88165283203125, "learning_rate": 3e-06, "loss": -21.0153, "step": 3363 }, { "epoch": 0.30010259155180874, "grad_norm": 338.9836730957031, "learning_rate": 3e-06, "loss": 25.7138, "step": 3364 }, { "epoch": 0.3001918015968598, "grad_norm": 475.26458740234375, "learning_rate": 3e-06, "loss": 3.3465, "step": 3365 }, { "epoch": 0.3002810116419109, "grad_norm": 425.6787109375, "learning_rate": 3e-06, "loss": 13.5073, "step": 3366 }, { "epoch": 0.3003702216869619, "grad_norm": 447.2120361328125, "learning_rate": 3e-06, "loss": -10.0454, "step": 3367 }, { "epoch": 0.300459431732013, "grad_norm": 548.1171264648438, "learning_rate": 3e-06, "loss": -1.5524, "step": 3368 }, { "epoch": 0.3005486417770641, "grad_norm": 528.1126098632812, "learning_rate": 3e-06, "loss": -28.2928, "step": 3369 }, { "epoch": 0.30063785182211517, "grad_norm": 398.9341125488281, "learning_rate": 3e-06, "loss": 19.4493, "step": 3370 }, { "epoch": 0.30072706186716625, "grad_norm": 572.12939453125, "learning_rate": 3e-06, "loss": -2.8259, "step": 3371 }, { "epoch": 0.3008162719122173, "grad_norm": 489.2485656738281, "learning_rate": 3e-06, "loss": 9.2163, "step": 3372 }, { "completion_length": 127.58333969116211, "epoch": 0.3009054819572684, "grad_norm": 923.7178344726562, "learning_rate": 3e-06, "loss": -124.9218, "reward": 2.510520815849304, "reward_std": 0.3787711560726166, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1876041740179062, "step": 3373, "zero_std_ratio": 0.0 }, { "epoch": 0.3009946920023195, "grad_norm": 2734.735107421875, "learning_rate": 3e-06, "loss": -136.406, "step": 3374 }, { "epoch": 0.3010839020473705, "grad_norm": 1279.9725341796875, "learning_rate": 3e-06, "loss": -120.1009, "step": 3375 }, { "epoch": 0.3011731120924216, "grad_norm": 926.9482421875, "learning_rate": 3e-06, "loss": -105.4302, "step": 3376 }, { "epoch": 0.30126232213747267, "grad_norm": 1064.126220703125, "learning_rate": 3e-06, "loss": -83.6931, "step": 3377 }, { "epoch": 0.30135153218252375, "grad_norm": 2943.598388671875, "learning_rate": 3e-06, "loss": -40.5103, "step": 3378 }, { "epoch": 0.30144074222757483, "grad_norm": 1120.6453857421875, "learning_rate": 3e-06, "loss": -142.1838, "step": 3379 }, { "epoch": 0.3015299522726259, "grad_norm": 4952.03515625, "learning_rate": 3e-06, "loss": -158.188, "step": 3380 }, { "epoch": 0.301619162317677, "grad_norm": 1137.2315673828125, "learning_rate": 3e-06, "loss": -133.3203, "step": 3381 }, { "epoch": 0.301708372362728, "grad_norm": 1245.4500732421875, "learning_rate": 3e-06, "loss": -117.426, "step": 3382 }, { "epoch": 0.3017975824077791, "grad_norm": 785.490478515625, "learning_rate": 3e-06, "loss": -97.84, "step": 3383 }, { "epoch": 0.3018867924528302, "grad_norm": 1461.48486328125, "learning_rate": 3e-06, "loss": -57.0243, "step": 3384 }, { "completion_length": 129.75000381469727, "epoch": 0.30197600249788126, "grad_norm": 2557.474609375, "learning_rate": 3e-06, "loss": 254.0306, "reward": 2.310854196548462, "reward_std": 0.4048392176628113, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20668749511241913, "step": 3385, "zero_std_ratio": 0.125 }, { "epoch": 0.30206521254293234, "grad_norm": 2035.84716796875, "learning_rate": 3e-06, "loss": 170.739, "step": 3386 }, { "epoch": 0.3021544225879834, "grad_norm": 4204.1943359375, "learning_rate": 3e-06, "loss": 198.368, "step": 3387 }, { "epoch": 0.3022436326330345, "grad_norm": 1970.0904541015625, "learning_rate": 3e-06, "loss": 161.0625, "step": 3388 }, { "epoch": 0.3023328426780856, "grad_norm": 2223.108642578125, "learning_rate": 3e-06, "loss": 120.0307, "step": 3389 }, { "epoch": 0.3024220527231366, "grad_norm": 2544.989501953125, "learning_rate": 3e-06, "loss": 133.2601, "step": 3390 }, { "epoch": 0.3025112627681877, "grad_norm": 3480.48388671875, "learning_rate": 3e-06, "loss": 241.0414, "step": 3391 }, { "epoch": 0.30260047281323876, "grad_norm": 2016.6514892578125, "learning_rate": 3e-06, "loss": 144.9663, "step": 3392 }, { "epoch": 0.30268968285828984, "grad_norm": 3826.53857421875, "learning_rate": 3e-06, "loss": 189.4543, "step": 3393 }, { "epoch": 0.3027788929033409, "grad_norm": 1892.319091796875, "learning_rate": 3e-06, "loss": 142.2155, "step": 3394 }, { "epoch": 0.302868102948392, "grad_norm": 2486.595947265625, "learning_rate": 3e-06, "loss": 102.2165, "step": 3395 }, { "epoch": 0.3029573129934431, "grad_norm": 2480.132568359375, "learning_rate": 3e-06, "loss": 115.9007, "step": 3396 }, { "completion_length": 129.9166717529297, "epoch": 0.3030465230384941, "grad_norm": 1883.1416015625, "learning_rate": 3e-06, "loss": 246.619, "reward": 2.1547292470932007, "reward_std": 0.6037483513355255, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20681250095367432, "step": 3397, "zero_std_ratio": 0.0 }, { "epoch": 0.3031357330835452, "grad_norm": 1984.0673828125, "learning_rate": 3e-06, "loss": 157.039, "step": 3398 }, { "epoch": 0.30322494312859627, "grad_norm": 2339.0927734375, "learning_rate": 3e-06, "loss": 238.2178, "step": 3399 }, { "epoch": 0.30331415317364735, "grad_norm": 2265.627197265625, "learning_rate": 3e-06, "loss": 269.6314, "step": 3400 }, { "epoch": 0.30340336321869843, "grad_norm": 1702.5643310546875, "learning_rate": 3e-06, "loss": 205.2379, "step": 3401 }, { "epoch": 0.3034925732637495, "grad_norm": 1914.6392822265625, "learning_rate": 3e-06, "loss": 215.4068, "step": 3402 }, { "epoch": 0.3035817833088006, "grad_norm": 1603.1527099609375, "learning_rate": 3e-06, "loss": 214.743, "step": 3403 }, { "epoch": 0.30367099335385167, "grad_norm": 1865.1605224609375, "learning_rate": 3e-06, "loss": 129.1935, "step": 3404 }, { "epoch": 0.3037602033989027, "grad_norm": 2698.400634765625, "learning_rate": 3e-06, "loss": 183.5472, "step": 3405 }, { "epoch": 0.3038494134439538, "grad_norm": 2480.45458984375, "learning_rate": 3e-06, "loss": 231.2657, "step": 3406 }, { "epoch": 0.30393862348900486, "grad_norm": 3024.17138671875, "learning_rate": 3e-06, "loss": 162.3446, "step": 3407 }, { "epoch": 0.30402783353405594, "grad_norm": 1581.31103515625, "learning_rate": 3e-06, "loss": 181.2881, "step": 3408 }, { "completion_length": 137.43750762939453, "epoch": 0.304117043579107, "grad_norm": 1639.672119140625, "learning_rate": 3e-06, "loss": 91.632, "reward": 2.313562512397766, "reward_std": 0.5164721310138702, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15731249004602432, "step": 3409, "zero_std_ratio": 0.125 }, { "epoch": 0.3042062536241581, "grad_norm": 2700.233154296875, "learning_rate": 3e-06, "loss": 58.8832, "step": 3410 }, { "epoch": 0.3042954636692092, "grad_norm": 1606.640380859375, "learning_rate": 3e-06, "loss": 127.0183, "step": 3411 }, { "epoch": 0.3043846737142602, "grad_norm": 2296.01806640625, "learning_rate": 3e-06, "loss": -6.3656, "step": 3412 }, { "epoch": 0.3044738837593113, "grad_norm": 1910.6243896484375, "learning_rate": 3e-06, "loss": 81.2368, "step": 3413 }, { "epoch": 0.30456309380436236, "grad_norm": 1277.1546630859375, "learning_rate": 3e-06, "loss": 126.1999, "step": 3414 }, { "epoch": 0.30465230384941344, "grad_norm": 1482.3331298828125, "learning_rate": 3e-06, "loss": 70.915, "step": 3415 }, { "epoch": 0.3047415138944645, "grad_norm": 2044.9129638671875, "learning_rate": 3e-06, "loss": 56.9034, "step": 3416 }, { "epoch": 0.3048307239395156, "grad_norm": 2144.29248046875, "learning_rate": 3e-06, "loss": 90.1882, "step": 3417 }, { "epoch": 0.3049199339845667, "grad_norm": 1471.55517578125, "learning_rate": 3e-06, "loss": -24.1518, "step": 3418 }, { "epoch": 0.30500914402961776, "grad_norm": 2043.9342041015625, "learning_rate": 3e-06, "loss": 42.1343, "step": 3419 }, { "epoch": 0.3050983540746688, "grad_norm": 1137.6531982421875, "learning_rate": 3e-06, "loss": 95.3205, "step": 3420 }, { "completion_length": 138.97916793823242, "epoch": 0.30518756411971987, "grad_norm": 1632.7293701171875, "learning_rate": 3e-06, "loss": 53.4267, "reward": 2.007729232311249, "reward_std": 0.782759964466095, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1431458219885826, "step": 3421, "zero_std_ratio": 0.0 }, { "epoch": 0.30527677416477095, "grad_norm": 1780.9111328125, "learning_rate": 3e-06, "loss": -31.6388, "step": 3422 }, { "epoch": 0.30536598420982203, "grad_norm": 2004.45263671875, "learning_rate": 3e-06, "loss": -96.4925, "step": 3423 }, { "epoch": 0.3054551942548731, "grad_norm": 2178.379638671875, "learning_rate": 3e-06, "loss": -31.0931, "step": 3424 }, { "epoch": 0.3055444042999242, "grad_norm": 2379.232666015625, "learning_rate": 3e-06, "loss": -52.2415, "step": 3425 }, { "epoch": 0.30563361434497527, "grad_norm": 1761.1578369140625, "learning_rate": 3e-06, "loss": -20.97, "step": 3426 }, { "epoch": 0.3057228243900263, "grad_norm": 1574.2545166015625, "learning_rate": 3e-06, "loss": 43.9616, "step": 3427 }, { "epoch": 0.3058120344350774, "grad_norm": 1830.3680419921875, "learning_rate": 3e-06, "loss": -54.098, "step": 3428 }, { "epoch": 0.30590124448012845, "grad_norm": 2105.930908203125, "learning_rate": 3e-06, "loss": -113.2404, "step": 3429 }, { "epoch": 0.30599045452517953, "grad_norm": 2117.16162109375, "learning_rate": 3e-06, "loss": -42.252, "step": 3430 }, { "epoch": 0.3060796645702306, "grad_norm": 2408.48486328125, "learning_rate": 3e-06, "loss": -64.0465, "step": 3431 }, { "epoch": 0.3061688746152817, "grad_norm": 2292.602783203125, "learning_rate": 3e-06, "loss": -36.1359, "step": 3432 }, { "completion_length": 111.89583587646484, "epoch": 0.3062580846603328, "grad_norm": 2219.448974609375, "learning_rate": 3e-06, "loss": 97.0913, "reward": 2.126791834831238, "reward_std": 0.5767087936401367, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21012499928474426, "step": 3433, "zero_std_ratio": 0.0 }, { "epoch": 0.3063472947053838, "grad_norm": 1077.029541015625, "learning_rate": 3e-06, "loss": 178.3958, "step": 3434 }, { "epoch": 0.3064365047504349, "grad_norm": 1877.599365234375, "learning_rate": 3e-06, "loss": -6.635, "step": 3435 }, { "epoch": 0.30652571479548596, "grad_norm": 1314.9306640625, "learning_rate": 3e-06, "loss": 41.6684, "step": 3436 }, { "epoch": 0.30661492484053704, "grad_norm": 1241.1844482421875, "learning_rate": 3e-06, "loss": 99.2351, "step": 3437 }, { "epoch": 0.3067041348855881, "grad_norm": 1601.69384765625, "learning_rate": 3e-06, "loss": 145.26, "step": 3438 }, { "epoch": 0.3067933449306392, "grad_norm": 2270.629638671875, "learning_rate": 3e-06, "loss": 62.3046, "step": 3439 }, { "epoch": 0.3068825549756903, "grad_norm": 1364.313232421875, "learning_rate": 3e-06, "loss": 160.1761, "step": 3440 }, { "epoch": 0.30697176502074136, "grad_norm": 1874.877197265625, "learning_rate": 3e-06, "loss": -38.1233, "step": 3441 }, { "epoch": 0.3070609750657924, "grad_norm": 997.1654663085938, "learning_rate": 3e-06, "loss": 23.6143, "step": 3442 }, { "epoch": 0.30715018511084347, "grad_norm": 1273.2977294921875, "learning_rate": 3e-06, "loss": 68.5558, "step": 3443 }, { "epoch": 0.30723939515589455, "grad_norm": 1813.5419921875, "learning_rate": 3e-06, "loss": 101.6688, "step": 3444 }, { "completion_length": 149.56250762939453, "epoch": 0.3073286052009456, "grad_norm": 2822.899658203125, "learning_rate": 3e-06, "loss": -292.8889, "reward": 2.095687508583069, "reward_std": 0.5714960098266602, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09568749740719795, "step": 3445, "zero_std_ratio": 0.0 }, { "epoch": 0.3074178152459967, "grad_norm": 4109.23974609375, "learning_rate": 3e-06, "loss": -228.1821, "step": 3446 }, { "epoch": 0.3075070252910478, "grad_norm": 2918.7236328125, "learning_rate": 3e-06, "loss": -392.164, "step": 3447 }, { "epoch": 0.30759623533609887, "grad_norm": 2594.619384765625, "learning_rate": 3e-06, "loss": -260.4514, "step": 3448 }, { "epoch": 0.3076854453811499, "grad_norm": 2120.168212890625, "learning_rate": 3e-06, "loss": -259.6828, "step": 3449 }, { "epoch": 0.30777465542620097, "grad_norm": 2984.971435546875, "learning_rate": 3e-06, "loss": -470.2361, "step": 3450 }, { "epoch": 0.30786386547125205, "grad_norm": 2873.526123046875, "learning_rate": 3e-06, "loss": -313.9846, "step": 3451 }, { "epoch": 0.30795307551630313, "grad_norm": 3758.116455078125, "learning_rate": 3e-06, "loss": -238.0249, "step": 3452 }, { "epoch": 0.3080422855613542, "grad_norm": 2606.766845703125, "learning_rate": 3e-06, "loss": -435.909, "step": 3453 }, { "epoch": 0.3081314956064053, "grad_norm": 2542.0869140625, "learning_rate": 3e-06, "loss": -308.9927, "step": 3454 }, { "epoch": 0.3082207056514564, "grad_norm": 2566.150634765625, "learning_rate": 3e-06, "loss": -300.2436, "step": 3455 }, { "epoch": 0.30830991569650745, "grad_norm": 3085.616943359375, "learning_rate": 3e-06, "loss": -506.2858, "step": 3456 }, { "completion_length": 131.45833587646484, "epoch": 0.3083991257415585, "grad_norm": 1224.834228515625, "learning_rate": 3e-06, "loss": 76.0338, "reward": 2.374791741371155, "reward_std": 0.30504775047302246, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1664583384990692, "step": 3457, "zero_std_ratio": 0.0 }, { "epoch": 0.30848833578660956, "grad_norm": 1355.8123779296875, "learning_rate": 3e-06, "loss": 86.3256, "step": 3458 }, { "epoch": 0.30857754583166064, "grad_norm": 1006.8480224609375, "learning_rate": 3e-06, "loss": 80.0275, "step": 3459 }, { "epoch": 0.3086667558767117, "grad_norm": 1022.3683471679688, "learning_rate": 3e-06, "loss": 54.7983, "step": 3460 }, { "epoch": 0.3087559659217628, "grad_norm": 1576.0550537109375, "learning_rate": 3e-06, "loss": 122.0738, "step": 3461 }, { "epoch": 0.3088451759668139, "grad_norm": 833.067138671875, "learning_rate": 3e-06, "loss": 80.7155, "step": 3462 }, { "epoch": 0.30893438601186496, "grad_norm": 1065.4052734375, "learning_rate": 3e-06, "loss": 66.2053, "step": 3463 }, { "epoch": 0.309023596056916, "grad_norm": 966.0946044921875, "learning_rate": 3e-06, "loss": 65.1731, "step": 3464 }, { "epoch": 0.30911280610196706, "grad_norm": 680.0827026367188, "learning_rate": 3e-06, "loss": 63.317, "step": 3465 }, { "epoch": 0.30920201614701814, "grad_norm": 571.6697998046875, "learning_rate": 3e-06, "loss": 42.5732, "step": 3466 }, { "epoch": 0.3092912261920692, "grad_norm": 980.7891845703125, "learning_rate": 3e-06, "loss": 86.8468, "step": 3467 }, { "epoch": 0.3093804362371203, "grad_norm": 609.6798095703125, "learning_rate": 3e-06, "loss": 59.3307, "step": 3468 }, { "completion_length": 128.7291717529297, "epoch": 0.3094696462821714, "grad_norm": 438.8487854003906, "learning_rate": 3e-06, "loss": -1.369, "reward": 2.3499168157577515, "reward_std": 0.5051312148571014, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19366666674613953, "step": 3469, "zero_std_ratio": 0.0 }, { "epoch": 0.30955885632722246, "grad_norm": 369.682861328125, "learning_rate": 3e-06, "loss": -8.4492, "step": 3470 }, { "epoch": 0.30964806637227354, "grad_norm": 400.91033935546875, "learning_rate": 3e-06, "loss": -17.9311, "step": 3471 }, { "epoch": 0.30973727641732457, "grad_norm": 238.37318420410156, "learning_rate": 3e-06, "loss": -12.8926, "step": 3472 }, { "epoch": 0.30982648646237565, "grad_norm": 275.04156494140625, "learning_rate": 3e-06, "loss": -2.6769, "step": 3473 }, { "epoch": 0.30991569650742673, "grad_norm": 269.20855712890625, "learning_rate": 3e-06, "loss": 6.3232, "step": 3474 }, { "epoch": 0.3100049065524778, "grad_norm": 493.4793395996094, "learning_rate": 3e-06, "loss": -6.0249, "step": 3475 }, { "epoch": 0.3100941165975289, "grad_norm": 313.3173522949219, "learning_rate": 3e-06, "loss": -7.0084, "step": 3476 }, { "epoch": 0.31018332664257997, "grad_norm": 386.220703125, "learning_rate": 3e-06, "loss": -20.1692, "step": 3477 }, { "epoch": 0.31027253668763105, "grad_norm": 254.54696655273438, "learning_rate": 3e-06, "loss": -14.9858, "step": 3478 }, { "epoch": 0.3103617467326821, "grad_norm": 287.2984313964844, "learning_rate": 3e-06, "loss": -5.9454, "step": 3479 }, { "epoch": 0.31045095677773316, "grad_norm": 271.01824951171875, "learning_rate": 3e-06, "loss": 4.0798, "step": 3480 }, { "completion_length": 107.10416793823242, "epoch": 0.31054016682278424, "grad_norm": 1235.95849609375, "learning_rate": 3e-06, "loss": -24.7784, "reward": 2.398291826248169, "reward_std": 0.35086895525455475, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2420416623353958, "step": 3481, "zero_std_ratio": 0.0 }, { "epoch": 0.3106293768678353, "grad_norm": 1640.132568359375, "learning_rate": 3e-06, "loss": -12.9565, "step": 3482 }, { "epoch": 0.3107185869128864, "grad_norm": 1060.5028076171875, "learning_rate": 3e-06, "loss": -14.1932, "step": 3483 }, { "epoch": 0.3108077969579375, "grad_norm": 1488.4853515625, "learning_rate": 3e-06, "loss": -173.2142, "step": 3484 }, { "epoch": 0.31089700700298856, "grad_norm": 1269.6356201171875, "learning_rate": 3e-06, "loss": -68.3184, "step": 3485 }, { "epoch": 0.3109862170480396, "grad_norm": 1511.3739013671875, "learning_rate": 3e-06, "loss": -49.932, "step": 3486 }, { "epoch": 0.31107542709309066, "grad_norm": 1395.3077392578125, "learning_rate": 3e-06, "loss": -44.3433, "step": 3487 }, { "epoch": 0.31116463713814174, "grad_norm": 1362.9388427734375, "learning_rate": 3e-06, "loss": -36.2175, "step": 3488 }, { "epoch": 0.3112538471831928, "grad_norm": 1198.84375, "learning_rate": 3e-06, "loss": -27.1698, "step": 3489 }, { "epoch": 0.3113430572282439, "grad_norm": 1056.8677978515625, "learning_rate": 3e-06, "loss": -177.2978, "step": 3490 }, { "epoch": 0.311432267273295, "grad_norm": 922.377685546875, "learning_rate": 3e-06, "loss": -99.7835, "step": 3491 }, { "epoch": 0.31152147731834606, "grad_norm": 1295.142822265625, "learning_rate": 3e-06, "loss": -90.9417, "step": 3492 }, { "completion_length": 137.25000762939453, "epoch": 0.31161068736339714, "grad_norm": 1035.3687744140625, "learning_rate": 3e-06, "loss": -24.6821, "reward": 2.0234166383743286, "reward_std": 0.23504738509655, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1900833249092102, "step": 3493, "zero_std_ratio": 0.0 }, { "epoch": 0.31169989740844817, "grad_norm": 3030.28369140625, "learning_rate": 3e-06, "loss": 31.7563, "step": 3494 }, { "epoch": 0.31178910745349925, "grad_norm": 3439.691650390625, "learning_rate": 3e-06, "loss": 266.7087, "step": 3495 }, { "epoch": 0.31187831749855033, "grad_norm": 890.7906494140625, "learning_rate": 3e-06, "loss": 2.0345, "step": 3496 }, { "epoch": 0.3119675275436014, "grad_norm": 1953.463134765625, "learning_rate": 3e-06, "loss": 25.1341, "step": 3497 }, { "epoch": 0.3120567375886525, "grad_norm": 955.2403564453125, "learning_rate": 3e-06, "loss": 14.6134, "step": 3498 }, { "epoch": 0.31214594763370357, "grad_norm": 996.666015625, "learning_rate": 3e-06, "loss": -34.1477, "step": 3499 }, { "epoch": 0.31223515767875465, "grad_norm": 2663.505126953125, "learning_rate": 3e-06, "loss": 34.0487, "step": 3500 }, { "epoch": 0.3123243677238057, "grad_norm": 3484.478759765625, "learning_rate": 3e-06, "loss": 237.8413, "step": 3501 }, { "epoch": 0.31241357776885675, "grad_norm": 857.60546875, "learning_rate": 3e-06, "loss": -4.8499, "step": 3502 }, { "epoch": 0.31250278781390783, "grad_norm": 2055.64013671875, "learning_rate": 3e-06, "loss": 12.6596, "step": 3503 }, { "epoch": 0.3125919978589589, "grad_norm": 808.066162109375, "learning_rate": 3e-06, "loss": 1.5554, "step": 3504 }, { "completion_length": 127.10416793823242, "epoch": 0.31268120790401, "grad_norm": 1961.106201171875, "learning_rate": 3e-06, "loss": 242.6115, "reward": 2.231500029563904, "reward_std": 0.5459994077682495, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17941665649414062, "step": 3505, "zero_std_ratio": 0.0 }, { "epoch": 0.3127704179490611, "grad_norm": 2225.6005859375, "learning_rate": 3e-06, "loss": 132.0739, "step": 3506 }, { "epoch": 0.31285962799411215, "grad_norm": 1880.614990234375, "learning_rate": 3e-06, "loss": 225.9371, "step": 3507 }, { "epoch": 0.31294883803916324, "grad_norm": 1840.9091796875, "learning_rate": 3e-06, "loss": 170.3474, "step": 3508 }, { "epoch": 0.31303804808421426, "grad_norm": 2116.667724609375, "learning_rate": 3e-06, "loss": 147.4738, "step": 3509 }, { "epoch": 0.31312725812926534, "grad_norm": 2286.77197265625, "learning_rate": 3e-06, "loss": 162.1887, "step": 3510 }, { "epoch": 0.3132164681743164, "grad_norm": 2880.526123046875, "learning_rate": 3e-06, "loss": 153.8584, "step": 3511 }, { "epoch": 0.3133056782193675, "grad_norm": 1211.30859375, "learning_rate": 3e-06, "loss": 68.9388, "step": 3512 }, { "epoch": 0.3133948882644186, "grad_norm": 2469.4755859375, "learning_rate": 3e-06, "loss": 133.8359, "step": 3513 }, { "epoch": 0.31348409830946966, "grad_norm": 1633.5400390625, "learning_rate": 3e-06, "loss": 87.7971, "step": 3514 }, { "epoch": 0.31357330835452074, "grad_norm": 1542.455078125, "learning_rate": 3e-06, "loss": 65.0249, "step": 3515 }, { "epoch": 0.31366251839957177, "grad_norm": 1071.098388671875, "learning_rate": 3e-06, "loss": 66.6994, "step": 3516 }, { "completion_length": 132.3125, "epoch": 0.31375172844462285, "grad_norm": 482.2054748535156, "learning_rate": 3e-06, "loss": -62.5491, "reward": 2.2242918014526367, "reward_std": 0.4100498706102371, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15137500315904617, "step": 3517, "zero_std_ratio": 0.0 }, { "epoch": 0.3138409384896739, "grad_norm": 619.18896484375, "learning_rate": 3e-06, "loss": -57.3433, "step": 3518 }, { "epoch": 0.313930148534725, "grad_norm": 483.38934326171875, "learning_rate": 3e-06, "loss": -66.9188, "step": 3519 }, { "epoch": 0.3140193585797761, "grad_norm": 382.498046875, "learning_rate": 3e-06, "loss": -48.7591, "step": 3520 }, { "epoch": 0.31410856862482717, "grad_norm": 515.4923095703125, "learning_rate": 3e-06, "loss": -50.6039, "step": 3521 }, { "epoch": 0.31419777866987825, "grad_norm": 472.1094055175781, "learning_rate": 3e-06, "loss": -70.0483, "step": 3522 }, { "epoch": 0.3142869887149293, "grad_norm": 475.8892822265625, "learning_rate": 3e-06, "loss": -66.1357, "step": 3523 }, { "epoch": 0.31437619875998035, "grad_norm": 734.4429321289062, "learning_rate": 3e-06, "loss": -59.0726, "step": 3524 }, { "epoch": 0.31446540880503143, "grad_norm": 549.729736328125, "learning_rate": 3e-06, "loss": -72.0147, "step": 3525 }, { "epoch": 0.3145546188500825, "grad_norm": 485.8818359375, "learning_rate": 3e-06, "loss": -54.301, "step": 3526 }, { "epoch": 0.3146438288951336, "grad_norm": 513.7693481445312, "learning_rate": 3e-06, "loss": -56.7122, "step": 3527 }, { "epoch": 0.3147330389401847, "grad_norm": 496.35089111328125, "learning_rate": 3e-06, "loss": -78.5214, "step": 3528 }, { "completion_length": 132.0416717529297, "epoch": 0.31482224898523575, "grad_norm": 251.9969482421875, "learning_rate": 3e-06, "loss": -8.2888, "reward": 2.206416606903076, "reward_std": 0.35549503564834595, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15433333069086075, "step": 3529, "zero_std_ratio": 0.0 }, { "epoch": 0.31491145903028683, "grad_norm": 221.64627075195312, "learning_rate": 3e-06, "loss": -13.5528, "step": 3530 }, { "epoch": 0.31500066907533786, "grad_norm": 280.32391357421875, "learning_rate": 3e-06, "loss": -9.7781, "step": 3531 }, { "epoch": 0.31508987912038894, "grad_norm": 251.96041870117188, "learning_rate": 3e-06, "loss": -10.1227, "step": 3532 }, { "epoch": 0.31517908916544, "grad_norm": 230.5780792236328, "learning_rate": 3e-06, "loss": -8.8748, "step": 3533 }, { "epoch": 0.3152682992104911, "grad_norm": 362.037841796875, "learning_rate": 3e-06, "loss": -16.1895, "step": 3534 }, { "epoch": 0.3153575092555422, "grad_norm": 399.199462890625, "learning_rate": 3e-06, "loss": -15.0929, "step": 3535 }, { "epoch": 0.31544671930059326, "grad_norm": 402.5520324707031, "learning_rate": 3e-06, "loss": -18.6046, "step": 3536 }, { "epoch": 0.31553592934564434, "grad_norm": 319.6678466796875, "learning_rate": 3e-06, "loss": -16.9303, "step": 3537 }, { "epoch": 0.3156251393906954, "grad_norm": 396.6726379394531, "learning_rate": 3e-06, "loss": -16.0522, "step": 3538 }, { "epoch": 0.31571434943574644, "grad_norm": 237.65599060058594, "learning_rate": 3e-06, "loss": -12.8396, "step": 3539 }, { "epoch": 0.3158035594807975, "grad_norm": 549.5143432617188, "learning_rate": 3e-06, "loss": -27.5769, "step": 3540 }, { "completion_length": 111.72917175292969, "epoch": 0.3158927695258486, "grad_norm": 1066.9210205078125, "learning_rate": 3e-06, "loss": -13.1611, "reward": 2.358520805835724, "reward_std": 0.353410130366683, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24393751472234726, "step": 3541, "zero_std_ratio": 0.0 }, { "epoch": 0.3159819795708997, "grad_norm": 846.209716796875, "learning_rate": 3e-06, "loss": -36.9528, "step": 3542 }, { "epoch": 0.31607118961595077, "grad_norm": 1642.4786376953125, "learning_rate": 3e-06, "loss": -84.1564, "step": 3543 }, { "epoch": 0.31616039966100185, "grad_norm": 899.3390502929688, "learning_rate": 3e-06, "loss": -23.0899, "step": 3544 }, { "epoch": 0.3162496097060529, "grad_norm": 1540.0653076171875, "learning_rate": 3e-06, "loss": -129.7341, "step": 3545 }, { "epoch": 0.31633881975110395, "grad_norm": 832.514404296875, "learning_rate": 3e-06, "loss": -43.6316, "step": 3546 }, { "epoch": 0.31642802979615503, "grad_norm": 1379.0322265625, "learning_rate": 3e-06, "loss": -60.6527, "step": 3547 }, { "epoch": 0.3165172398412061, "grad_norm": 1262.822998046875, "learning_rate": 3e-06, "loss": -68.4766, "step": 3548 }, { "epoch": 0.3166064498862572, "grad_norm": 1682.234619140625, "learning_rate": 3e-06, "loss": -142.1812, "step": 3549 }, { "epoch": 0.31669565993130827, "grad_norm": 1589.224609375, "learning_rate": 3e-06, "loss": -65.7304, "step": 3550 }, { "epoch": 0.31678486997635935, "grad_norm": 1872.8958740234375, "learning_rate": 3e-06, "loss": -198.6722, "step": 3551 }, { "epoch": 0.31687408002141043, "grad_norm": 1707.17431640625, "learning_rate": 3e-06, "loss": -62.5591, "step": 3552 }, { "completion_length": 118.50000762939453, "epoch": 0.31696329006646146, "grad_norm": 2130.009521484375, "learning_rate": 3e-06, "loss": 143.1229, "reward": 2.5496459007263184, "reward_std": 0.34619903564453125, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2163124978542328, "step": 3553, "zero_std_ratio": 0.0 }, { "epoch": 0.31705250011151254, "grad_norm": 1761.5262451171875, "learning_rate": 3e-06, "loss": 161.0544, "step": 3554 }, { "epoch": 0.3171417101565636, "grad_norm": 1395.034912109375, "learning_rate": 3e-06, "loss": 85.8981, "step": 3555 }, { "epoch": 0.3172309202016147, "grad_norm": 1349.159912109375, "learning_rate": 3e-06, "loss": 164.1265, "step": 3556 }, { "epoch": 0.3173201302466658, "grad_norm": 2355.351806640625, "learning_rate": 3e-06, "loss": 233.1079, "step": 3557 }, { "epoch": 0.31740934029171686, "grad_norm": 1788.181396484375, "learning_rate": 3e-06, "loss": 110.2326, "step": 3558 }, { "epoch": 0.31749855033676794, "grad_norm": 1944.2767333984375, "learning_rate": 3e-06, "loss": 151.674, "step": 3559 }, { "epoch": 0.317587760381819, "grad_norm": 1940.034423828125, "learning_rate": 3e-06, "loss": 154.2493, "step": 3560 }, { "epoch": 0.31767697042687004, "grad_norm": 1463.426025390625, "learning_rate": 3e-06, "loss": 81.3784, "step": 3561 }, { "epoch": 0.3177661804719211, "grad_norm": 1291.3863525390625, "learning_rate": 3e-06, "loss": 154.0923, "step": 3562 }, { "epoch": 0.3178553905169722, "grad_norm": 1874.134033203125, "learning_rate": 3e-06, "loss": 208.9081, "step": 3563 }, { "epoch": 0.3179446005620233, "grad_norm": 1569.9703369140625, "learning_rate": 3e-06, "loss": 88.4686, "step": 3564 }, { "completion_length": 124.5, "epoch": 0.31803381060707436, "grad_norm": 2750.666015625, "learning_rate": 3e-06, "loss": 34.1139, "reward": 2.196979284286499, "reward_std": 0.6484965085983276, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19697917252779007, "step": 3565, "zero_std_ratio": 0.0 }, { "epoch": 0.31812302065212544, "grad_norm": 3971.495849609375, "learning_rate": 3e-06, "loss": 112.9225, "step": 3566 }, { "epoch": 0.3182122306971765, "grad_norm": 2179.958984375, "learning_rate": 3e-06, "loss": 164.682, "step": 3567 }, { "epoch": 0.31830144074222755, "grad_norm": 1972.8162841796875, "learning_rate": 3e-06, "loss": 229.5756, "step": 3568 }, { "epoch": 0.31839065078727863, "grad_norm": 2127.831787109375, "learning_rate": 3e-06, "loss": 17.1921, "step": 3569 }, { "epoch": 0.3184798608323297, "grad_norm": 3436.4765625, "learning_rate": 3e-06, "loss": 188.9201, "step": 3570 }, { "epoch": 0.3185690708773808, "grad_norm": 2922.649169921875, "learning_rate": 3e-06, "loss": 19.502, "step": 3571 }, { "epoch": 0.31865828092243187, "grad_norm": 2532.553466796875, "learning_rate": 3e-06, "loss": 102.4736, "step": 3572 }, { "epoch": 0.31874749096748295, "grad_norm": 1824.5203857421875, "learning_rate": 3e-06, "loss": 137.9859, "step": 3573 }, { "epoch": 0.31883670101253403, "grad_norm": 1672.5279541015625, "learning_rate": 3e-06, "loss": 198.011, "step": 3574 }, { "epoch": 0.3189259110575851, "grad_norm": 1954.422119140625, "learning_rate": 3e-06, "loss": -7.5728, "step": 3575 }, { "epoch": 0.31901512110263613, "grad_norm": 2867.286376953125, "learning_rate": 3e-06, "loss": 150.4029, "step": 3576 }, { "completion_length": 106.35416793823242, "epoch": 0.3191043311476872, "grad_norm": 1317.7806396484375, "learning_rate": 3e-06, "loss": 77.9951, "reward": 2.4563751220703125, "reward_std": 0.4861636757850647, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21679165959358215, "step": 3577, "zero_std_ratio": 0.125 }, { "epoch": 0.3191935411927383, "grad_norm": 1242.9266357421875, "learning_rate": 3e-06, "loss": 91.7167, "step": 3578 }, { "epoch": 0.3192827512377894, "grad_norm": 944.972900390625, "learning_rate": 3e-06, "loss": 51.7559, "step": 3579 }, { "epoch": 0.31937196128284046, "grad_norm": 1269.2489013671875, "learning_rate": 3e-06, "loss": 53.2938, "step": 3580 }, { "epoch": 0.31946117132789154, "grad_norm": 1029.042236328125, "learning_rate": 3e-06, "loss": 28.1905, "step": 3581 }, { "epoch": 0.3195503813729426, "grad_norm": 1117.2750244140625, "learning_rate": 3e-06, "loss": 25.4861, "step": 3582 }, { "epoch": 0.31963959141799364, "grad_norm": 875.5952758789062, "learning_rate": 3e-06, "loss": 44.3067, "step": 3583 }, { "epoch": 0.3197288014630447, "grad_norm": 946.8582153320312, "learning_rate": 3e-06, "loss": 53.4432, "step": 3584 }, { "epoch": 0.3198180115080958, "grad_norm": 687.897216796875, "learning_rate": 3e-06, "loss": 25.2881, "step": 3585 }, { "epoch": 0.3199072215531469, "grad_norm": 751.2490234375, "learning_rate": 3e-06, "loss": 10.1752, "step": 3586 }, { "epoch": 0.31999643159819796, "grad_norm": 535.4669189453125, "learning_rate": 3e-06, "loss": 2.3747, "step": 3587 }, { "epoch": 0.32008564164324904, "grad_norm": 582.8524780273438, "learning_rate": 3e-06, "loss": -12.9358, "step": 3588 }, { "completion_length": 123.02083587646484, "epoch": 0.3201748516883001, "grad_norm": 2492.42041015625, "learning_rate": 3e-06, "loss": -292.7274, "reward": 1.953458309173584, "reward_std": 0.9102485775947571, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19304165989160538, "step": 3589, "zero_std_ratio": 0.0 }, { "epoch": 0.3202640617333512, "grad_norm": 2429.146240234375, "learning_rate": 3e-06, "loss": -137.1821, "step": 3590 }, { "epoch": 0.3203532717784022, "grad_norm": 2870.54833984375, "learning_rate": 3e-06, "loss": -407.01, "step": 3591 }, { "epoch": 0.3204424818234533, "grad_norm": 2122.081787109375, "learning_rate": 3e-06, "loss": -184.8808, "step": 3592 }, { "epoch": 0.3205316918685044, "grad_norm": 2716.769287109375, "learning_rate": 3e-06, "loss": -214.2635, "step": 3593 }, { "epoch": 0.32062090191355547, "grad_norm": 2610.397216796875, "learning_rate": 3e-06, "loss": -253.8306, "step": 3594 }, { "epoch": 0.32071011195860655, "grad_norm": 2260.266357421875, "learning_rate": 3e-06, "loss": -295.899, "step": 3595 }, { "epoch": 0.3207993220036576, "grad_norm": 2561.358154296875, "learning_rate": 3e-06, "loss": -154.5181, "step": 3596 }, { "epoch": 0.3208885320487087, "grad_norm": 2768.01708984375, "learning_rate": 3e-06, "loss": -413.8638, "step": 3597 }, { "epoch": 0.32097774209375973, "grad_norm": 2066.959228515625, "learning_rate": 3e-06, "loss": -222.631, "step": 3598 }, { "epoch": 0.3210669521388108, "grad_norm": 3016.967041015625, "learning_rate": 3e-06, "loss": -260.3246, "step": 3599 }, { "epoch": 0.3211561621838619, "grad_norm": 2780.324462890625, "learning_rate": 3e-06, "loss": -304.6732, "step": 3600 }, { "completion_length": 107.06250381469727, "epoch": 0.321245372228913, "grad_norm": 282.5953369140625, "learning_rate": 3e-06, "loss": 5.1212, "reward": 2.5829583406448364, "reward_std": 0.2692784518003464, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21837499737739563, "step": 3601, "zero_std_ratio": 0.125 }, { "epoch": 0.32133458227396405, "grad_norm": 271.3232116699219, "learning_rate": 3e-06, "loss": -3.0769, "step": 3602 }, { "epoch": 0.32142379231901513, "grad_norm": 364.28497314453125, "learning_rate": 3e-06, "loss": -6.4581, "step": 3603 }, { "epoch": 0.3215130023640662, "grad_norm": 278.7883605957031, "learning_rate": 3e-06, "loss": -4.8135, "step": 3604 }, { "epoch": 0.3216022124091173, "grad_norm": 363.4386901855469, "learning_rate": 3e-06, "loss": -7.7434, "step": 3605 }, { "epoch": 0.3216914224541683, "grad_norm": 253.23947143554688, "learning_rate": 3e-06, "loss": -15.8899, "step": 3606 }, { "epoch": 0.3217806324992194, "grad_norm": 387.0167541503906, "learning_rate": 3e-06, "loss": 5.4161, "step": 3607 }, { "epoch": 0.3218698425442705, "grad_norm": 189.8441162109375, "learning_rate": 3e-06, "loss": -4.6316, "step": 3608 }, { "epoch": 0.32195905258932156, "grad_norm": 675.2334594726562, "learning_rate": 3e-06, "loss": -13.2652, "step": 3609 }, { "epoch": 0.32204826263437264, "grad_norm": 507.86236572265625, "learning_rate": 3e-06, "loss": -10.0185, "step": 3610 }, { "epoch": 0.3221374726794237, "grad_norm": 546.0225830078125, "learning_rate": 3e-06, "loss": -11.6543, "step": 3611 }, { "epoch": 0.3222266827244748, "grad_norm": 353.2464904785156, "learning_rate": 3e-06, "loss": -18.2837, "step": 3612 }, { "completion_length": 105.58333587646484, "epoch": 0.3223158927695258, "grad_norm": 2668.735595703125, "learning_rate": 3e-06, "loss": 13.8467, "reward": 2.601229190826416, "reward_std": 0.3478473722934723, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23664583265781403, "step": 3613, "zero_std_ratio": 0.0 }, { "epoch": 0.3224051028145769, "grad_norm": 1316.1551513671875, "learning_rate": 3e-06, "loss": -65.3535, "step": 3614 }, { "epoch": 0.322494312859628, "grad_norm": 3655.150146484375, "learning_rate": 3e-06, "loss": 78.3909, "step": 3615 }, { "epoch": 0.32258352290467907, "grad_norm": 1713.85791015625, "learning_rate": 3e-06, "loss": 40.3379, "step": 3616 }, { "epoch": 0.32267273294973015, "grad_norm": 1496.111328125, "learning_rate": 3e-06, "loss": 44.1579, "step": 3617 }, { "epoch": 0.3227619429947812, "grad_norm": 1891.6641845703125, "learning_rate": 3e-06, "loss": -38.0822, "step": 3618 }, { "epoch": 0.3228511530398323, "grad_norm": 2723.79638671875, "learning_rate": 3e-06, "loss": -30.5793, "step": 3619 }, { "epoch": 0.32294036308488333, "grad_norm": 1334.346923828125, "learning_rate": 3e-06, "loss": -80.2084, "step": 3620 }, { "epoch": 0.3230295731299344, "grad_norm": 2450.21240234375, "learning_rate": 3e-06, "loss": 66.9659, "step": 3621 }, { "epoch": 0.3231187831749855, "grad_norm": 1621.0714111328125, "learning_rate": 3e-06, "loss": 25.5895, "step": 3622 }, { "epoch": 0.32320799322003657, "grad_norm": 1799.2176513671875, "learning_rate": 3e-06, "loss": 45.7815, "step": 3623 }, { "epoch": 0.32329720326508765, "grad_norm": 1986.276611328125, "learning_rate": 3e-06, "loss": -44.4525, "step": 3624 }, { "completion_length": 112.66667175292969, "epoch": 0.32338641331013873, "grad_norm": 893.3452758789062, "learning_rate": 3e-06, "loss": -309.7397, "reward": 2.3738335371017456, "reward_std": 0.24283339828252792, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2071666643023491, "step": 3625, "zero_std_ratio": 0.0 }, { "epoch": 0.3234756233551898, "grad_norm": 2516.806640625, "learning_rate": 3e-06, "loss": -360.8942, "step": 3626 }, { "epoch": 0.3235648334002409, "grad_norm": 2866.770263671875, "learning_rate": 3e-06, "loss": -361.0575, "step": 3627 }, { "epoch": 0.3236540434452919, "grad_norm": 1074.0142822265625, "learning_rate": 3e-06, "loss": -253.4038, "step": 3628 }, { "epoch": 0.323743253490343, "grad_norm": 978.5372314453125, "learning_rate": 3e-06, "loss": -286.0543, "step": 3629 }, { "epoch": 0.3238324635353941, "grad_norm": 1195.3905029296875, "learning_rate": 3e-06, "loss": -308.0766, "step": 3630 }, { "epoch": 0.32392167358044516, "grad_norm": 881.34619140625, "learning_rate": 3e-06, "loss": -319.0973, "step": 3631 }, { "epoch": 0.32401088362549624, "grad_norm": 1869.920166015625, "learning_rate": 3e-06, "loss": -391.0098, "step": 3632 }, { "epoch": 0.3241000936705473, "grad_norm": 2161.52392578125, "learning_rate": 3e-06, "loss": -406.0872, "step": 3633 }, { "epoch": 0.3241893037155984, "grad_norm": 762.9923095703125, "learning_rate": 3e-06, "loss": -275.7999, "step": 3634 }, { "epoch": 0.3242785137606494, "grad_norm": 1074.247314453125, "learning_rate": 3e-06, "loss": -303.4648, "step": 3635 }, { "epoch": 0.3243677238057005, "grad_norm": 1251.16796875, "learning_rate": 3e-06, "loss": -326.3678, "step": 3636 }, { "completion_length": 128.14583587646484, "epoch": 0.3244569338507516, "grad_norm": 805.3264770507812, "learning_rate": 3e-06, "loss": -11.3518, "reward": 2.2009791135787964, "reward_std": 0.3784063160419464, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16972915828227997, "step": 3637, "zero_std_ratio": 0.0 }, { "epoch": 0.32454614389580266, "grad_norm": 602.3580322265625, "learning_rate": 3e-06, "loss": -85.3785, "step": 3638 }, { "epoch": 0.32463535394085374, "grad_norm": 1273.105224609375, "learning_rate": 3e-06, "loss": -54.7464, "step": 3639 }, { "epoch": 0.3247245639859048, "grad_norm": 1085.851318359375, "learning_rate": 3e-06, "loss": -35.6832, "step": 3640 }, { "epoch": 0.3248137740309559, "grad_norm": 1371.066162109375, "learning_rate": 3e-06, "loss": -36.3455, "step": 3641 }, { "epoch": 0.324902984076007, "grad_norm": 1235.7861328125, "learning_rate": 3e-06, "loss": -29.9585, "step": 3642 }, { "epoch": 0.324992194121058, "grad_norm": 759.6223754882812, "learning_rate": 3e-06, "loss": -20.5416, "step": 3643 }, { "epoch": 0.3250814041661091, "grad_norm": 593.486083984375, "learning_rate": 3e-06, "loss": -93.3678, "step": 3644 }, { "epoch": 0.32517061421116017, "grad_norm": 1073.51025390625, "learning_rate": 3e-06, "loss": -79.1663, "step": 3645 }, { "epoch": 0.32525982425621125, "grad_norm": 1178.8023681640625, "learning_rate": 3e-06, "loss": -60.3346, "step": 3646 }, { "epoch": 0.32534903430126233, "grad_norm": 1097.6136474609375, "learning_rate": 3e-06, "loss": -55.4358, "step": 3647 }, { "epoch": 0.3254382443463134, "grad_norm": 1049.3626708984375, "learning_rate": 3e-06, "loss": -52.2942, "step": 3648 }, { "completion_length": 109.33333587646484, "epoch": 0.3255274543913645, "grad_norm": 1148.6719970703125, "learning_rate": 3e-06, "loss": -57.3723, "reward": 2.640625, "reward_std": 0.23042766749858856, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.223958320915699, "step": 3649, "zero_std_ratio": 0.0 }, { "epoch": 0.3256166644364155, "grad_norm": 1095.4107666015625, "learning_rate": 3e-06, "loss": -18.2973, "step": 3650 }, { "epoch": 0.3257058744814666, "grad_norm": 1107.4505615234375, "learning_rate": 3e-06, "loss": -54.9771, "step": 3651 }, { "epoch": 0.3257950845265177, "grad_norm": 1003.48046875, "learning_rate": 3e-06, "loss": -52.7486, "step": 3652 }, { "epoch": 0.32588429457156876, "grad_norm": 933.0286254882812, "learning_rate": 3e-06, "loss": -36.98, "step": 3653 }, { "epoch": 0.32597350461661984, "grad_norm": 1066.2449951171875, "learning_rate": 3e-06, "loss": -40.3954, "step": 3654 }, { "epoch": 0.3260627146616709, "grad_norm": 1359.7176513671875, "learning_rate": 3e-06, "loss": -74.8014, "step": 3655 }, { "epoch": 0.326151924706722, "grad_norm": 1079.4840087890625, "learning_rate": 3e-06, "loss": -35.4425, "step": 3656 }, { "epoch": 0.3262411347517731, "grad_norm": 1084.4779052734375, "learning_rate": 3e-06, "loss": -65.3251, "step": 3657 }, { "epoch": 0.3263303447968241, "grad_norm": 988.546142578125, "learning_rate": 3e-06, "loss": -63.4372, "step": 3658 }, { "epoch": 0.3264195548418752, "grad_norm": 800.3999633789062, "learning_rate": 3e-06, "loss": -47.9004, "step": 3659 }, { "epoch": 0.32650876488692626, "grad_norm": 1109.8271484375, "learning_rate": 3e-06, "loss": -47.4971, "step": 3660 }, { "completion_length": 120.79167175292969, "epoch": 0.32659797493197734, "grad_norm": 1260.7034912109375, "learning_rate": 3e-06, "loss": -112.8493, "reward": 2.326666831970215, "reward_std": 0.4045807123184204, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1912500038743019, "step": 3661, "zero_std_ratio": 0.0 }, { "epoch": 0.3266871849770284, "grad_norm": 1566.5546875, "learning_rate": 3e-06, "loss": -94.4388, "step": 3662 }, { "epoch": 0.3267763950220795, "grad_norm": 2560.1044921875, "learning_rate": 3e-06, "loss": -2.2135, "step": 3663 }, { "epoch": 0.3268656050671306, "grad_norm": 2050.251220703125, "learning_rate": 3e-06, "loss": -142.9664, "step": 3664 }, { "epoch": 0.3269548151121816, "grad_norm": 2384.525390625, "learning_rate": 3e-06, "loss": 87.1691, "step": 3665 }, { "epoch": 0.3270440251572327, "grad_norm": 2650.506103515625, "learning_rate": 3e-06, "loss": 26.7629, "step": 3666 }, { "epoch": 0.32713323520228377, "grad_norm": 1382.559814453125, "learning_rate": 3e-06, "loss": -120.191, "step": 3667 }, { "epoch": 0.32722244524733485, "grad_norm": 1485.59716796875, "learning_rate": 3e-06, "loss": -108.6166, "step": 3668 }, { "epoch": 0.32731165529238593, "grad_norm": 2099.324462890625, "learning_rate": 3e-06, "loss": -36.5844, "step": 3669 }, { "epoch": 0.327400865337437, "grad_norm": 1942.8819580078125, "learning_rate": 3e-06, "loss": -164.9788, "step": 3670 }, { "epoch": 0.3274900753824881, "grad_norm": 2547.656005859375, "learning_rate": 3e-06, "loss": 63.7993, "step": 3671 }, { "epoch": 0.32757928542753917, "grad_norm": 3616.071044921875, "learning_rate": 3e-06, "loss": 27.0725, "step": 3672 }, { "completion_length": 110.72916793823242, "epoch": 0.3276684954725902, "grad_norm": 340.6033020019531, "learning_rate": 3e-06, "loss": -8.2579, "reward": 2.4626877307891846, "reward_std": 0.045908909291028976, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21268751472234726, "step": 3673, "zero_std_ratio": 0.0 }, { "epoch": 0.3277577055176413, "grad_norm": 219.06321716308594, "learning_rate": 3e-06, "loss": -5.0348, "step": 3674 }, { "epoch": 0.32784691556269235, "grad_norm": 389.8656005859375, "learning_rate": 3e-06, "loss": -10.281, "step": 3675 }, { "epoch": 0.32793612560774343, "grad_norm": 341.981201171875, "learning_rate": 3e-06, "loss": -1.416, "step": 3676 }, { "epoch": 0.3280253356527945, "grad_norm": 380.3072814941406, "learning_rate": 3e-06, "loss": -0.507, "step": 3677 }, { "epoch": 0.3281145456978456, "grad_norm": 260.8265075683594, "learning_rate": 3e-06, "loss": -18.1865, "step": 3678 }, { "epoch": 0.3282037557428967, "grad_norm": 302.20654296875, "learning_rate": 3e-06, "loss": -11.9098, "step": 3679 }, { "epoch": 0.3282929657879477, "grad_norm": 222.0885772705078, "learning_rate": 3e-06, "loss": -7.0888, "step": 3680 }, { "epoch": 0.3283821758329988, "grad_norm": 430.5523376464844, "learning_rate": 3e-06, "loss": -10.7264, "step": 3681 }, { "epoch": 0.32847138587804986, "grad_norm": 265.58538818359375, "learning_rate": 3e-06, "loss": -3.5917, "step": 3682 }, { "epoch": 0.32856059592310094, "grad_norm": 293.1582336425781, "learning_rate": 3e-06, "loss": -4.6839, "step": 3683 }, { "epoch": 0.328649805968152, "grad_norm": 232.859619140625, "learning_rate": 3e-06, "loss": -20.3161, "step": 3684 }, { "completion_length": 112.0625, "epoch": 0.3287390160132031, "grad_norm": 2203.77001953125, "learning_rate": 3e-06, "loss": 277.2263, "reward": 1.9876667261123657, "reward_std": 0.5348050594329834, "rewards/correctness_reward_func": 1.2500000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23766665905714035, "step": 3685, "zero_std_ratio": 0.0 }, { "epoch": 0.3288282260582542, "grad_norm": 2869.953125, "learning_rate": 3e-06, "loss": 58.6315, "step": 3686 }, { "epoch": 0.3289174361033052, "grad_norm": 4965.337890625, "learning_rate": 3e-06, "loss": 205.3616, "step": 3687 }, { "epoch": 0.3290066461483563, "grad_norm": 3435.413330078125, "learning_rate": 3e-06, "loss": 370.4272, "step": 3688 }, { "epoch": 0.32909585619340737, "grad_norm": 2505.3427734375, "learning_rate": 3e-06, "loss": 229.5794, "step": 3689 }, { "epoch": 0.32918506623845845, "grad_norm": 4843.75634765625, "learning_rate": 3e-06, "loss": 378.4928, "step": 3690 }, { "epoch": 0.3292742762835095, "grad_norm": 2651.68115234375, "learning_rate": 3e-06, "loss": 266.2636, "step": 3691 }, { "epoch": 0.3293634863285606, "grad_norm": 2857.37841796875, "learning_rate": 3e-06, "loss": 33.1587, "step": 3692 }, { "epoch": 0.3294526963736117, "grad_norm": 5819.806640625, "learning_rate": 3e-06, "loss": 169.4257, "step": 3693 }, { "epoch": 0.32954190641866277, "grad_norm": 3938.398681640625, "learning_rate": 3e-06, "loss": 342.3057, "step": 3694 }, { "epoch": 0.3296311164637138, "grad_norm": 2616.122802734375, "learning_rate": 3e-06, "loss": 214.5321, "step": 3695 }, { "epoch": 0.32972032650876487, "grad_norm": 4910.58837890625, "learning_rate": 3e-06, "loss": 348.5316, "step": 3696 }, { "completion_length": 113.60416793823242, "epoch": 0.32980953655381595, "grad_norm": 1411.974853515625, "learning_rate": 3e-06, "loss": 6.3254, "reward": 2.4781458377838135, "reward_std": 0.3388865441083908, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22814583778381348, "step": 3697, "zero_std_ratio": 0.0 }, { "epoch": 0.32989874659886703, "grad_norm": 1283.7645263671875, "learning_rate": 3e-06, "loss": -12.7356, "step": 3698 }, { "epoch": 0.3299879566439181, "grad_norm": 877.5414428710938, "learning_rate": 3e-06, "loss": -26.1698, "step": 3699 }, { "epoch": 0.3300771666889692, "grad_norm": 1045.9703369140625, "learning_rate": 3e-06, "loss": -22.601, "step": 3700 }, { "epoch": 0.3301663767340203, "grad_norm": 1150.7113037109375, "learning_rate": 3e-06, "loss": -7.7785, "step": 3701 }, { "epoch": 0.3302555867790713, "grad_norm": 1304.5067138671875, "learning_rate": 3e-06, "loss": 33.4538, "step": 3702 }, { "epoch": 0.3303447968241224, "grad_norm": 1145.140625, "learning_rate": 3e-06, "loss": -21.9474, "step": 3703 }, { "epoch": 0.33043400686917346, "grad_norm": 863.726806640625, "learning_rate": 3e-06, "loss": -33.3601, "step": 3704 }, { "epoch": 0.33052321691422454, "grad_norm": 844.621337890625, "learning_rate": 3e-06, "loss": -41.9311, "step": 3705 }, { "epoch": 0.3306124269592756, "grad_norm": 638.1948852539062, "learning_rate": 3e-06, "loss": -48.2286, "step": 3706 }, { "epoch": 0.3307016370043267, "grad_norm": 817.4290771484375, "learning_rate": 3e-06, "loss": -36.4349, "step": 3707 }, { "epoch": 0.3307908470493778, "grad_norm": 1100.6036376953125, "learning_rate": 3e-06, "loss": -9.9008, "step": 3708 }, { "completion_length": 110.20833587646484, "epoch": 0.33088005709442886, "grad_norm": 2032.963134765625, "learning_rate": 3e-06, "loss": -98.3232, "reward": 2.01939594745636, "reward_std": 0.5031762719154358, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2277291715145111, "step": 3709, "zero_std_ratio": 0.0 }, { "epoch": 0.3309692671394799, "grad_norm": 3561.154541015625, "learning_rate": 3e-06, "loss": -234.6138, "step": 3710 }, { "epoch": 0.33105847718453096, "grad_norm": 2032.5712890625, "learning_rate": 3e-06, "loss": -143.6856, "step": 3711 }, { "epoch": 0.33114768722958204, "grad_norm": 2917.2412109375, "learning_rate": 3e-06, "loss": -137.2449, "step": 3712 }, { "epoch": 0.3312368972746331, "grad_norm": 3507.35400390625, "learning_rate": 3e-06, "loss": -91.187, "step": 3713 }, { "epoch": 0.3313261073196842, "grad_norm": 2156.76123046875, "learning_rate": 3e-06, "loss": -124.7247, "step": 3714 }, { "epoch": 0.3314153173647353, "grad_norm": 1812.80615234375, "learning_rate": 3e-06, "loss": -106.3387, "step": 3715 }, { "epoch": 0.33150452740978636, "grad_norm": 4808.53369140625, "learning_rate": 3e-06, "loss": -234.4785, "step": 3716 }, { "epoch": 0.3315937374548374, "grad_norm": 2011.66015625, "learning_rate": 3e-06, "loss": -158.3969, "step": 3717 }, { "epoch": 0.33168294749988847, "grad_norm": 3321.2294921875, "learning_rate": 3e-06, "loss": -146.4013, "step": 3718 }, { "epoch": 0.33177215754493955, "grad_norm": 3284.157958984375, "learning_rate": 3e-06, "loss": -128.5277, "step": 3719 }, { "epoch": 0.33186136758999063, "grad_norm": 1800.2490234375, "learning_rate": 3e-06, "loss": -145.7202, "step": 3720 }, { "completion_length": 121.60417175292969, "epoch": 0.3319505776350417, "grad_norm": 1295.116943359375, "learning_rate": 3e-06, "loss": 244.8966, "reward": 2.2688333988189697, "reward_std": 0.5156040489673615, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20633333176374435, "step": 3721, "zero_std_ratio": 0.0 }, { "epoch": 0.3320397876800928, "grad_norm": 1259.49462890625, "learning_rate": 3e-06, "loss": 242.2528, "step": 3722 }, { "epoch": 0.33212899772514387, "grad_norm": 1396.940185546875, "learning_rate": 3e-06, "loss": 228.8479, "step": 3723 }, { "epoch": 0.33221820777019495, "grad_norm": 1548.0001220703125, "learning_rate": 3e-06, "loss": 247.426, "step": 3724 }, { "epoch": 0.332307417815246, "grad_norm": 1286.5562744140625, "learning_rate": 3e-06, "loss": 274.3082, "step": 3725 }, { "epoch": 0.33239662786029706, "grad_norm": 1436.1986083984375, "learning_rate": 3e-06, "loss": 272.1638, "step": 3726 }, { "epoch": 0.33248583790534814, "grad_norm": 1275.008544921875, "learning_rate": 3e-06, "loss": 232.4248, "step": 3727 }, { "epoch": 0.3325750479503992, "grad_norm": 1186.125, "learning_rate": 3e-06, "loss": 232.7789, "step": 3728 }, { "epoch": 0.3326642579954503, "grad_norm": 1786.677978515625, "learning_rate": 3e-06, "loss": 223.1063, "step": 3729 }, { "epoch": 0.3327534680405014, "grad_norm": 1449.0225830078125, "learning_rate": 3e-06, "loss": 236.4222, "step": 3730 }, { "epoch": 0.33284267808555246, "grad_norm": 1295.3238525390625, "learning_rate": 3e-06, "loss": 261.4824, "step": 3731 }, { "epoch": 0.3329318881306035, "grad_norm": 1632.6220703125, "learning_rate": 3e-06, "loss": 254.584, "step": 3732 }, { "completion_length": 121.56250381469727, "epoch": 0.33302109817565456, "grad_norm": 804.380859375, "learning_rate": 3e-06, "loss": 0.7745, "reward": 2.2756041288375854, "reward_std": 0.31197306513786316, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20268750190734863, "step": 3733, "zero_std_ratio": 0.0 }, { "epoch": 0.33311030822070564, "grad_norm": 646.4083251953125, "learning_rate": 3e-06, "loss": -11.4135, "step": 3734 }, { "epoch": 0.3331995182657567, "grad_norm": 408.9941101074219, "learning_rate": 3e-06, "loss": -9.116, "step": 3735 }, { "epoch": 0.3332887283108078, "grad_norm": 303.27032470703125, "learning_rate": 3e-06, "loss": -5.0903, "step": 3736 }, { "epoch": 0.3333779383558589, "grad_norm": 403.4747619628906, "learning_rate": 3e-06, "loss": -15.858, "step": 3737 }, { "epoch": 0.33346714840090996, "grad_norm": 872.9730224609375, "learning_rate": 3e-06, "loss": -26.6625, "step": 3738 }, { "epoch": 0.333556358445961, "grad_norm": 899.8195190429688, "learning_rate": 3e-06, "loss": -8.1656, "step": 3739 }, { "epoch": 0.33364556849101207, "grad_norm": 978.0716552734375, "learning_rate": 3e-06, "loss": -20.88, "step": 3740 }, { "epoch": 0.33373477853606315, "grad_norm": 603.8497924804688, "learning_rate": 3e-06, "loss": -14.6502, "step": 3741 }, { "epoch": 0.33382398858111423, "grad_norm": 281.9450378417969, "learning_rate": 3e-06, "loss": -6.9461, "step": 3742 }, { "epoch": 0.3339131986261653, "grad_norm": 689.4998779296875, "learning_rate": 3e-06, "loss": -25.6937, "step": 3743 }, { "epoch": 0.3340024086712164, "grad_norm": 1168.7984619140625, "learning_rate": 3e-06, "loss": -44.0861, "step": 3744 }, { "completion_length": 112.12500381469727, "epoch": 0.33409161871626747, "grad_norm": 3496.795654296875, "learning_rate": 3e-06, "loss": -68.1808, "reward": 2.1083958745002747, "reward_std": 0.4722538888454437, "rewards/correctness_reward_func": 1.4166666567325592, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21256250143051147, "step": 3745, "zero_std_ratio": 0.0 }, { "epoch": 0.33418082876131855, "grad_norm": 2577.34619140625, "learning_rate": 3e-06, "loss": -54.8034, "step": 3746 }, { "epoch": 0.3342700388063696, "grad_norm": 2912.902099609375, "learning_rate": 3e-06, "loss": -69.192, "step": 3747 }, { "epoch": 0.33435924885142065, "grad_norm": 3495.400146484375, "learning_rate": 3e-06, "loss": -110.2852, "step": 3748 }, { "epoch": 0.33444845889647173, "grad_norm": 3447.805908203125, "learning_rate": 3e-06, "loss": -161.3526, "step": 3749 }, { "epoch": 0.3345376689415228, "grad_norm": 2818.6171875, "learning_rate": 3e-06, "loss": -34.035, "step": 3750 }, { "epoch": 0.3346268789865739, "grad_norm": 2677.86083984375, "learning_rate": 3e-06, "loss": -96.4869, "step": 3751 }, { "epoch": 0.334716089031625, "grad_norm": 2588.042724609375, "learning_rate": 3e-06, "loss": -74.7067, "step": 3752 }, { "epoch": 0.33480529907667606, "grad_norm": 3059.37646484375, "learning_rate": 3e-06, "loss": -87.2842, "step": 3753 }, { "epoch": 0.3348945091217271, "grad_norm": 3030.9462890625, "learning_rate": 3e-06, "loss": -162.2216, "step": 3754 }, { "epoch": 0.33498371916677816, "grad_norm": 2825.66796875, "learning_rate": 3e-06, "loss": -196.5695, "step": 3755 }, { "epoch": 0.33507292921182924, "grad_norm": 2718.734375, "learning_rate": 3e-06, "loss": -57.8142, "step": 3756 }, { "completion_length": 104.58333587646484, "epoch": 0.3351621392568803, "grad_norm": 2376.456298828125, "learning_rate": 3e-06, "loss": -42.6298, "reward": 2.0352708101272583, "reward_std": 0.4816778749227524, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2331874966621399, "step": 3757, "zero_std_ratio": 0.0 }, { "epoch": 0.3352513493019314, "grad_norm": 4023.816162109375, "learning_rate": 3e-06, "loss": -37.3748, "step": 3758 }, { "epoch": 0.3353405593469825, "grad_norm": 3090.600830078125, "learning_rate": 3e-06, "loss": -4.9385, "step": 3759 }, { "epoch": 0.33542976939203356, "grad_norm": 1713.480712890625, "learning_rate": 3e-06, "loss": -0.4641, "step": 3760 }, { "epoch": 0.33551897943708464, "grad_norm": 3838.7734375, "learning_rate": 3e-06, "loss": -43.0298, "step": 3761 }, { "epoch": 0.33560818948213567, "grad_norm": 2250.1259765625, "learning_rate": 3e-06, "loss": -83.8458, "step": 3762 }, { "epoch": 0.33569739952718675, "grad_norm": 2242.5068359375, "learning_rate": 3e-06, "loss": -58.9135, "step": 3763 }, { "epoch": 0.3357866095722378, "grad_norm": 3397.069091796875, "learning_rate": 3e-06, "loss": -69.8761, "step": 3764 }, { "epoch": 0.3358758196172889, "grad_norm": 2095.59326171875, "learning_rate": 3e-06, "loss": -17.9272, "step": 3765 }, { "epoch": 0.33596502966234, "grad_norm": 2431.42578125, "learning_rate": 3e-06, "loss": -32.2177, "step": 3766 }, { "epoch": 0.33605423970739107, "grad_norm": 3581.637939453125, "learning_rate": 3e-06, "loss": -85.7642, "step": 3767 }, { "epoch": 0.33614344975244215, "grad_norm": 2016.685302734375, "learning_rate": 3e-06, "loss": -112.2871, "step": 3768 }, { "completion_length": 120.62500762939453, "epoch": 0.33623265979749317, "grad_norm": 484.28375244140625, "learning_rate": 3e-06, "loss": 14.0951, "reward": 2.6062084436416626, "reward_std": 0.22124376893043518, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18954166769981384, "step": 3769, "zero_std_ratio": 0.0 }, { "epoch": 0.33632186984254425, "grad_norm": 621.1134643554688, "learning_rate": 3e-06, "loss": 13.4584, "step": 3770 }, { "epoch": 0.33641107988759533, "grad_norm": 496.6928405761719, "learning_rate": 3e-06, "loss": 41.2466, "step": 3771 }, { "epoch": 0.3365002899326464, "grad_norm": 943.55615234375, "learning_rate": 3e-06, "loss": -11.2473, "step": 3772 }, { "epoch": 0.3365894999776975, "grad_norm": 518.924560546875, "learning_rate": 3e-06, "loss": 37.0233, "step": 3773 }, { "epoch": 0.3366787100227486, "grad_norm": 569.4124145507812, "learning_rate": 3e-06, "loss": 26.4821, "step": 3774 }, { "epoch": 0.33676792006779965, "grad_norm": 520.06396484375, "learning_rate": 3e-06, "loss": 13.0696, "step": 3775 }, { "epoch": 0.33685713011285073, "grad_norm": 961.465087890625, "learning_rate": 3e-06, "loss": 12.3902, "step": 3776 }, { "epoch": 0.33694634015790176, "grad_norm": 596.431884765625, "learning_rate": 3e-06, "loss": 35.3064, "step": 3777 }, { "epoch": 0.33703555020295284, "grad_norm": 1086.9384765625, "learning_rate": 3e-06, "loss": -14.9772, "step": 3778 }, { "epoch": 0.3371247602480039, "grad_norm": 388.6129150390625, "learning_rate": 3e-06, "loss": 30.892, "step": 3779 }, { "epoch": 0.337213970293055, "grad_norm": 626.19775390625, "learning_rate": 3e-06, "loss": 19.7802, "step": 3780 }, { "completion_length": 117.68750381469727, "epoch": 0.3373031803381061, "grad_norm": 3244.53515625, "learning_rate": 3e-06, "loss": -629.4629, "reward": 2.068187654018402, "reward_std": 0.5362763404846191, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20360417664051056, "step": 3781, "zero_std_ratio": 0.0 }, { "epoch": 0.33739239038315716, "grad_norm": 2580.144287109375, "learning_rate": 3e-06, "loss": -720.3294, "step": 3782 }, { "epoch": 0.33748160042820824, "grad_norm": 1935.1009521484375, "learning_rate": 3e-06, "loss": -699.9628, "step": 3783 }, { "epoch": 0.33757081047325926, "grad_norm": 2164.1318359375, "learning_rate": 3e-06, "loss": -694.0245, "step": 3784 }, { "epoch": 0.33766002051831034, "grad_norm": 1540.8697509765625, "learning_rate": 3e-06, "loss": -712.5054, "step": 3785 }, { "epoch": 0.3377492305633614, "grad_norm": 1660.828857421875, "learning_rate": 3e-06, "loss": -697.576, "step": 3786 }, { "epoch": 0.3378384406084125, "grad_norm": 4260.7197265625, "learning_rate": 3e-06, "loss": -663.6683, "step": 3787 }, { "epoch": 0.3379276506534636, "grad_norm": 3799.364501953125, "learning_rate": 3e-06, "loss": -747.9332, "step": 3788 }, { "epoch": 0.33801686069851467, "grad_norm": 1127.9512939453125, "learning_rate": 3e-06, "loss": -731.9249, "step": 3789 }, { "epoch": 0.33810607074356575, "grad_norm": 2499.471923828125, "learning_rate": 3e-06, "loss": -716.568, "step": 3790 }, { "epoch": 0.3381952807886168, "grad_norm": 1399.4041748046875, "learning_rate": 3e-06, "loss": -747.1489, "step": 3791 }, { "epoch": 0.33828449083366785, "grad_norm": 1374.1256103515625, "learning_rate": 3e-06, "loss": -735.219, "step": 3792 }, { "completion_length": 129.8958396911621, "epoch": 0.33837370087871893, "grad_norm": 1610.641845703125, "learning_rate": 3e-06, "loss": -154.8416, "reward": 1.9861875772476196, "reward_std": 0.4793417975306511, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16327083110809326, "step": 3793, "zero_std_ratio": 0.0 }, { "epoch": 0.33846291092377, "grad_norm": 3318.900146484375, "learning_rate": 3e-06, "loss": -115.1702, "step": 3794 }, { "epoch": 0.3385521209688211, "grad_norm": 1492.2672119140625, "learning_rate": 3e-06, "loss": -136.9132, "step": 3795 }, { "epoch": 0.33864133101387217, "grad_norm": 2145.712158203125, "learning_rate": 3e-06, "loss": -286.7365, "step": 3796 }, { "epoch": 0.33873054105892325, "grad_norm": 2136.60107421875, "learning_rate": 3e-06, "loss": -54.993, "step": 3797 }, { "epoch": 0.33881975110397433, "grad_norm": 2415.7568359375, "learning_rate": 3e-06, "loss": -219.7261, "step": 3798 }, { "epoch": 0.33890896114902536, "grad_norm": 1806.381591796875, "learning_rate": 3e-06, "loss": -159.3348, "step": 3799 }, { "epoch": 0.33899817119407644, "grad_norm": 2929.940185546875, "learning_rate": 3e-06, "loss": -125.3085, "step": 3800 }, { "epoch": 0.3390873812391275, "grad_norm": 1396.681884765625, "learning_rate": 3e-06, "loss": -145.2968, "step": 3801 }, { "epoch": 0.3391765912841786, "grad_norm": 2133.839111328125, "learning_rate": 3e-06, "loss": -301.6281, "step": 3802 }, { "epoch": 0.3392658013292297, "grad_norm": 1754.7598876953125, "learning_rate": 3e-06, "loss": -76.726, "step": 3803 }, { "epoch": 0.33935501137428076, "grad_norm": 1836.256103515625, "learning_rate": 3e-06, "loss": -235.7217, "step": 3804 }, { "completion_length": 111.10416793823242, "epoch": 0.33944422141933184, "grad_norm": 1979.6875, "learning_rate": 3e-06, "loss": 93.282, "reward": 2.1524165868759155, "reward_std": 0.640927255153656, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2253333255648613, "step": 3805, "zero_std_ratio": 0.0 }, { "epoch": 0.33953343146438286, "grad_norm": 1209.397216796875, "learning_rate": 3e-06, "loss": 0.2896, "step": 3806 }, { "epoch": 0.33962264150943394, "grad_norm": 1435.964599609375, "learning_rate": 3e-06, "loss": -21.7229, "step": 3807 }, { "epoch": 0.339711851554485, "grad_norm": 1624.9171142578125, "learning_rate": 3e-06, "loss": 8.4529, "step": 3808 }, { "epoch": 0.3398010615995361, "grad_norm": 1711.4871826171875, "learning_rate": 3e-06, "loss": -53.8415, "step": 3809 }, { "epoch": 0.3398902716445872, "grad_norm": 1156.0087890625, "learning_rate": 3e-06, "loss": -62.8926, "step": 3810 }, { "epoch": 0.33997948168963826, "grad_norm": 1725.9737548828125, "learning_rate": 3e-06, "loss": 78.5929, "step": 3811 }, { "epoch": 0.34006869173468934, "grad_norm": 1332.1077880859375, "learning_rate": 3e-06, "loss": -18.2293, "step": 3812 }, { "epoch": 0.3401579017797404, "grad_norm": 1520.8389892578125, "learning_rate": 3e-06, "loss": -36.9412, "step": 3813 }, { "epoch": 0.34024711182479145, "grad_norm": 1858.5863037109375, "learning_rate": 3e-06, "loss": -5.2105, "step": 3814 }, { "epoch": 0.34033632186984253, "grad_norm": 1879.468994140625, "learning_rate": 3e-06, "loss": -75.4713, "step": 3815 }, { "epoch": 0.3404255319148936, "grad_norm": 2325.705810546875, "learning_rate": 3e-06, "loss": -65.8209, "step": 3816 }, { "completion_length": 122.06250762939453, "epoch": 0.3405147419599447, "grad_norm": 1964.9132080078125, "learning_rate": 3e-06, "loss": -125.0619, "reward": 2.094208300113678, "reward_std": 0.7129809856414795, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17754165828227997, "step": 3817, "zero_std_ratio": 0.0 }, { "epoch": 0.34060395200499577, "grad_norm": 1518.7874755859375, "learning_rate": 3e-06, "loss": -98.8743, "step": 3818 }, { "epoch": 0.34069316205004685, "grad_norm": 2556.021240234375, "learning_rate": 3e-06, "loss": -89.804, "step": 3819 }, { "epoch": 0.34078237209509793, "grad_norm": 1146.2711181640625, "learning_rate": 3e-06, "loss": -136.6513, "step": 3820 }, { "epoch": 0.34087158214014895, "grad_norm": 1256.955078125, "learning_rate": 3e-06, "loss": -128.7605, "step": 3821 }, { "epoch": 0.34096079218520003, "grad_norm": 912.225830078125, "learning_rate": 3e-06, "loss": -142.5117, "step": 3822 }, { "epoch": 0.3410500022302511, "grad_norm": 1488.637939453125, "learning_rate": 3e-06, "loss": -152.1164, "step": 3823 }, { "epoch": 0.3411392122753022, "grad_norm": 1487.519287109375, "learning_rate": 3e-06, "loss": -117.3488, "step": 3824 }, { "epoch": 0.3412284223203533, "grad_norm": 3207.55419921875, "learning_rate": 3e-06, "loss": -123.062, "step": 3825 }, { "epoch": 0.34131763236540436, "grad_norm": 884.8870849609375, "learning_rate": 3e-06, "loss": -167.0282, "step": 3826 }, { "epoch": 0.34140684241045544, "grad_norm": 811.690185546875, "learning_rate": 3e-06, "loss": -150.6012, "step": 3827 }, { "epoch": 0.3414960524555065, "grad_norm": 957.6675415039062, "learning_rate": 3e-06, "loss": -162.0193, "step": 3828 }, { "completion_length": 114.72917175292969, "epoch": 0.34158526250055754, "grad_norm": 2639.091064453125, "learning_rate": 3e-06, "loss": -632.0696, "reward": 2.384958267211914, "reward_std": 0.6155047714710236, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21829166263341904, "step": 3829, "zero_std_ratio": 0.0 }, { "epoch": 0.3416744725456086, "grad_norm": 2420.7646484375, "learning_rate": 3e-06, "loss": -665.6334, "step": 3830 }, { "epoch": 0.3417636825906597, "grad_norm": 3467.790771484375, "learning_rate": 3e-06, "loss": -668.0657, "step": 3831 }, { "epoch": 0.3418528926357108, "grad_norm": 2691.39306640625, "learning_rate": 3e-06, "loss": -628.0853, "step": 3832 }, { "epoch": 0.34194210268076186, "grad_norm": 2812.41650390625, "learning_rate": 3e-06, "loss": -626.0587, "step": 3833 }, { "epoch": 0.34203131272581294, "grad_norm": 2414.3642578125, "learning_rate": 3e-06, "loss": -581.8898, "step": 3834 }, { "epoch": 0.342120522770864, "grad_norm": 2684.473876953125, "learning_rate": 3e-06, "loss": -653.4653, "step": 3835 }, { "epoch": 0.34220973281591505, "grad_norm": 1945.0899658203125, "learning_rate": 3e-06, "loss": -697.4971, "step": 3836 }, { "epoch": 0.3422989428609661, "grad_norm": 3359.0341796875, "learning_rate": 3e-06, "loss": -672.0356, "step": 3837 }, { "epoch": 0.3423881529060172, "grad_norm": 2580.909423828125, "learning_rate": 3e-06, "loss": -662.06, "step": 3838 }, { "epoch": 0.3424773629510683, "grad_norm": 2583.73828125, "learning_rate": 3e-06, "loss": -683.656, "step": 3839 }, { "epoch": 0.34256657299611937, "grad_norm": 2190.490234375, "learning_rate": 3e-06, "loss": -627.7733, "step": 3840 }, { "completion_length": 105.35416793823242, "epoch": 0.34265578304117045, "grad_norm": 1496.418212890625, "learning_rate": 3e-06, "loss": -22.3329, "reward": 2.5867291688919067, "reward_std": 0.34263482689857483, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22214582562446594, "step": 3841, "zero_std_ratio": 0.0 }, { "epoch": 0.3427449930862215, "grad_norm": 1106.179443359375, "learning_rate": 3e-06, "loss": -106.4433, "step": 3842 }, { "epoch": 0.3428342031312726, "grad_norm": 2934.532470703125, "learning_rate": 3e-06, "loss": -46.2536, "step": 3843 }, { "epoch": 0.34292341317632363, "grad_norm": 1777.9378662109375, "learning_rate": 3e-06, "loss": -91.55, "step": 3844 }, { "epoch": 0.3430126232213747, "grad_norm": 1910.0574951171875, "learning_rate": 3e-06, "loss": -59.8763, "step": 3845 }, { "epoch": 0.3431018332664258, "grad_norm": 1019.4939575195312, "learning_rate": 3e-06, "loss": -93.8046, "step": 3846 }, { "epoch": 0.3431910433114769, "grad_norm": 2129.9052734375, "learning_rate": 3e-06, "loss": -31.7549, "step": 3847 }, { "epoch": 0.34328025335652795, "grad_norm": 1151.435791015625, "learning_rate": 3e-06, "loss": -108.9817, "step": 3848 }, { "epoch": 0.34336946340157903, "grad_norm": 1752.5361328125, "learning_rate": 3e-06, "loss": -76.0305, "step": 3849 }, { "epoch": 0.3434586734466301, "grad_norm": 992.4119873046875, "learning_rate": 3e-06, "loss": -108.6331, "step": 3850 }, { "epoch": 0.34354788349168114, "grad_norm": 1257.0975341796875, "learning_rate": 3e-06, "loss": -91.4081, "step": 3851 }, { "epoch": 0.3436370935367322, "grad_norm": 866.9653930664062, "learning_rate": 3e-06, "loss": -93.5153, "step": 3852 }, { "completion_length": 103.58333587646484, "epoch": 0.3437263035817833, "grad_norm": 1760.1412353515625, "learning_rate": 3e-06, "loss": 95.909, "reward": 2.235270917415619, "reward_std": 0.39060652256011963, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23527083545923233, "step": 3853, "zero_std_ratio": 0.0 }, { "epoch": 0.3438155136268344, "grad_norm": 1233.7650146484375, "learning_rate": 3e-06, "loss": 70.5813, "step": 3854 }, { "epoch": 0.34390472367188546, "grad_norm": 1628.172607421875, "learning_rate": 3e-06, "loss": 90.9779, "step": 3855 }, { "epoch": 0.34399393371693654, "grad_norm": 1863.203857421875, "learning_rate": 3e-06, "loss": 102.9058, "step": 3856 }, { "epoch": 0.3440831437619876, "grad_norm": 1323.8583984375, "learning_rate": 3e-06, "loss": 54.9208, "step": 3857 }, { "epoch": 0.3441723538070387, "grad_norm": 1092.504150390625, "learning_rate": 3e-06, "loss": 11.4709, "step": 3858 }, { "epoch": 0.3442615638520897, "grad_norm": 1166.389404296875, "learning_rate": 3e-06, "loss": 67.0388, "step": 3859 }, { "epoch": 0.3443507738971408, "grad_norm": 1034.0667724609375, "learning_rate": 3e-06, "loss": 49.2914, "step": 3860 }, { "epoch": 0.3444399839421919, "grad_norm": 1360.8424072265625, "learning_rate": 3e-06, "loss": 58.5834, "step": 3861 }, { "epoch": 0.34452919398724297, "grad_norm": 1252.01123046875, "learning_rate": 3e-06, "loss": 62.5596, "step": 3862 }, { "epoch": 0.34461840403229405, "grad_norm": 924.7935180664062, "learning_rate": 3e-06, "loss": 30.2912, "step": 3863 }, { "epoch": 0.3447076140773451, "grad_norm": 866.155029296875, "learning_rate": 3e-06, "loss": 2.9605, "step": 3864 }, { "completion_length": 125.27083969116211, "epoch": 0.3447968241223962, "grad_norm": 1952.330078125, "learning_rate": 3e-06, "loss": 213.4637, "reward": 2.0809166431427, "reward_std": 0.7243660390377045, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17466666549444199, "step": 3865, "zero_std_ratio": 0.0 }, { "epoch": 0.34488603416744723, "grad_norm": 1912.045654296875, "learning_rate": 3e-06, "loss": 173.2764, "step": 3866 }, { "epoch": 0.3449752442124983, "grad_norm": 1792.7205810546875, "learning_rate": 3e-06, "loss": 154.4507, "step": 3867 }, { "epoch": 0.3450644542575494, "grad_norm": 1729.6405029296875, "learning_rate": 3e-06, "loss": 124.9202, "step": 3868 }, { "epoch": 0.34515366430260047, "grad_norm": 1594.2381591796875, "learning_rate": 3e-06, "loss": 110.1437, "step": 3869 }, { "epoch": 0.34524287434765155, "grad_norm": 2124.4677734375, "learning_rate": 3e-06, "loss": 75.9209, "step": 3870 }, { "epoch": 0.34533208439270263, "grad_norm": 1809.8023681640625, "learning_rate": 3e-06, "loss": 159.3822, "step": 3871 }, { "epoch": 0.3454212944377537, "grad_norm": 1331.769287109375, "learning_rate": 3e-06, "loss": 122.9425, "step": 3872 }, { "epoch": 0.34551050448280474, "grad_norm": 1428.9512939453125, "learning_rate": 3e-06, "loss": 107.6665, "step": 3873 }, { "epoch": 0.3455997145278558, "grad_norm": 1269.2869873046875, "learning_rate": 3e-06, "loss": 61.887, "step": 3874 }, { "epoch": 0.3456889245729069, "grad_norm": 1277.948486328125, "learning_rate": 3e-06, "loss": 59.1085, "step": 3875 }, { "epoch": 0.345778134617958, "grad_norm": 1226.925537109375, "learning_rate": 3e-06, "loss": 36.3568, "step": 3876 }, { "completion_length": 143.95833587646484, "epoch": 0.34586734466300906, "grad_norm": 388.56695556640625, "learning_rate": 3e-06, "loss": -31.111, "reward": 1.9702085256576538, "reward_std": 0.35788530111312866, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13687500171363354, "step": 3877, "zero_std_ratio": 0.0 }, { "epoch": 0.34595655470806014, "grad_norm": 495.5047912597656, "learning_rate": 3e-06, "loss": -8.8734, "step": 3878 }, { "epoch": 0.3460457647531112, "grad_norm": 301.70843505859375, "learning_rate": 3e-06, "loss": -3.1011, "step": 3879 }, { "epoch": 0.3461349747981623, "grad_norm": 277.6426086425781, "learning_rate": 3e-06, "loss": -11.0117, "step": 3880 }, { "epoch": 0.3462241848432133, "grad_norm": 267.3581848144531, "learning_rate": 3e-06, "loss": -12.9656, "step": 3881 }, { "epoch": 0.3463133948882644, "grad_norm": 231.85153198242188, "learning_rate": 3e-06, "loss": -13.1321, "step": 3882 }, { "epoch": 0.3464026049333155, "grad_norm": 270.12310791015625, "learning_rate": 3e-06, "loss": -34.7187, "step": 3883 }, { "epoch": 0.34649181497836656, "grad_norm": 273.5789489746094, "learning_rate": 3e-06, "loss": -16.4671, "step": 3884 }, { "epoch": 0.34658102502341764, "grad_norm": 227.5151824951172, "learning_rate": 3e-06, "loss": -5.5387, "step": 3885 }, { "epoch": 0.3466702350684687, "grad_norm": 273.5270690917969, "learning_rate": 3e-06, "loss": -12.0656, "step": 3886 }, { "epoch": 0.3467594451135198, "grad_norm": 235.01266479492188, "learning_rate": 3e-06, "loss": -16.6773, "step": 3887 }, { "epoch": 0.34684865515857083, "grad_norm": 241.6991424560547, "learning_rate": 3e-06, "loss": -17.6967, "step": 3888 }, { "completion_length": 105.14583587646484, "epoch": 0.3469378652036219, "grad_norm": 1062.1593017578125, "learning_rate": 3e-06, "loss": -28.0877, "reward": 2.0580002069473267, "reward_std": 0.21911855041980743, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22466665506362915, "step": 3889, "zero_std_ratio": 0.0 }, { "epoch": 0.347027075248673, "grad_norm": 615.6689453125, "learning_rate": 3e-06, "loss": -34.0835, "step": 3890 }, { "epoch": 0.34711628529372407, "grad_norm": 1141.142333984375, "learning_rate": 3e-06, "loss": 12.5009, "step": 3891 }, { "epoch": 0.34720549533877515, "grad_norm": 607.4088745117188, "learning_rate": 3e-06, "loss": -31.0659, "step": 3892 }, { "epoch": 0.34729470538382623, "grad_norm": 701.9598388671875, "learning_rate": 3e-06, "loss": -25.6905, "step": 3893 }, { "epoch": 0.3473839154288773, "grad_norm": 665.7315673828125, "learning_rate": 3e-06, "loss": -2.9312, "step": 3894 }, { "epoch": 0.3474731254739284, "grad_norm": 832.5386352539062, "learning_rate": 3e-06, "loss": -38.4036, "step": 3895 }, { "epoch": 0.3475623355189794, "grad_norm": 605.162353515625, "learning_rate": 3e-06, "loss": -40.7736, "step": 3896 }, { "epoch": 0.3476515455640305, "grad_norm": 1030.767822265625, "learning_rate": 3e-06, "loss": -3.1332, "step": 3897 }, { "epoch": 0.3477407556090816, "grad_norm": 682.4091796875, "learning_rate": 3e-06, "loss": -35.0785, "step": 3898 }, { "epoch": 0.34782996565413266, "grad_norm": 693.3421020507812, "learning_rate": 3e-06, "loss": -32.6496, "step": 3899 }, { "epoch": 0.34791917569918374, "grad_norm": 732.666015625, "learning_rate": 3e-06, "loss": -12.5821, "step": 3900 }, { "completion_length": 125.75000762939453, "epoch": 0.3480083857442348, "grad_norm": 1363.7186279296875, "learning_rate": 3e-06, "loss": 54.3206, "reward": 2.115416646003723, "reward_std": 0.501444935798645, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15708333253860474, "step": 3901, "zero_std_ratio": 0.0 }, { "epoch": 0.3480975957892859, "grad_norm": 1636.769287109375, "learning_rate": 3e-06, "loss": 53.8557, "step": 3902 }, { "epoch": 0.3481868058343369, "grad_norm": 1905.1029052734375, "learning_rate": 3e-06, "loss": 66.261, "step": 3903 }, { "epoch": 0.348276015879388, "grad_norm": 1813.3272705078125, "learning_rate": 3e-06, "loss": 10.1103, "step": 3904 }, { "epoch": 0.3483652259244391, "grad_norm": 1334.3360595703125, "learning_rate": 3e-06, "loss": 17.2302, "step": 3905 }, { "epoch": 0.34845443596949016, "grad_norm": 1741.0811767578125, "learning_rate": 3e-06, "loss": 88.232, "step": 3906 }, { "epoch": 0.34854364601454124, "grad_norm": 1352.8822021484375, "learning_rate": 3e-06, "loss": 41.8102, "step": 3907 }, { "epoch": 0.3486328560595923, "grad_norm": 1699.8956298828125, "learning_rate": 3e-06, "loss": 40.4133, "step": 3908 }, { "epoch": 0.3487220661046434, "grad_norm": 1906.539306640625, "learning_rate": 3e-06, "loss": 50.9925, "step": 3909 }, { "epoch": 0.3488112761496945, "grad_norm": 1767.1746826171875, "learning_rate": 3e-06, "loss": -7.3788, "step": 3910 }, { "epoch": 0.3489004861947455, "grad_norm": 1362.558837890625, "learning_rate": 3e-06, "loss": -4.0332, "step": 3911 }, { "epoch": 0.3489896962397966, "grad_norm": 2189.984130859375, "learning_rate": 3e-06, "loss": 64.9922, "step": 3912 }, { "completion_length": 114.50000381469727, "epoch": 0.34907890628484767, "grad_norm": 111.75199890136719, "learning_rate": 3e-06, "loss": -1.4047, "reward": 1.8137917518615723, "reward_std": 0.15144500136375427, "rewards/correctness_reward_func": 1.2083333134651184, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1887916624546051, "step": 3913, "zero_std_ratio": 0.0 }, { "epoch": 0.34916811632989875, "grad_norm": 130.38160705566406, "learning_rate": 3e-06, "loss": -4.1075, "step": 3914 }, { "epoch": 0.34925732637494983, "grad_norm": 81.8251724243164, "learning_rate": 3e-06, "loss": 1.803, "step": 3915 }, { "epoch": 0.3493465364200009, "grad_norm": 214.54583740234375, "learning_rate": 3e-06, "loss": -4.0775, "step": 3916 }, { "epoch": 0.349435746465052, "grad_norm": 136.1807403564453, "learning_rate": 3e-06, "loss": -4.4314, "step": 3917 }, { "epoch": 0.349524956510103, "grad_norm": 99.69410705566406, "learning_rate": 3e-06, "loss": -2.5351, "step": 3918 }, { "epoch": 0.3496141665551541, "grad_norm": 146.0958251953125, "learning_rate": 3e-06, "loss": -2.184, "step": 3919 }, { "epoch": 0.3497033766002052, "grad_norm": 104.46556854248047, "learning_rate": 3e-06, "loss": -5.918, "step": 3920 }, { "epoch": 0.34979258664525625, "grad_norm": 75.3154296875, "learning_rate": 3e-06, "loss": 0.8011, "step": 3921 }, { "epoch": 0.34988179669030733, "grad_norm": 122.06836700439453, "learning_rate": 3e-06, "loss": -5.2575, "step": 3922 }, { "epoch": 0.3499710067353584, "grad_norm": 147.12356567382812, "learning_rate": 3e-06, "loss": -5.7878, "step": 3923 }, { "epoch": 0.3500602167804095, "grad_norm": 115.91873931884766, "learning_rate": 3e-06, "loss": -3.5225, "step": 3924 }, { "completion_length": 135.9166717529297, "epoch": 0.3501494268254605, "grad_norm": 2373.406005859375, "learning_rate": 3e-06, "loss": -7.9388, "reward": 2.0555626153945923, "reward_std": 0.5804407000541687, "rewards/correctness_reward_func": 1.3750000298023224, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18056251108646393, "step": 3925, "zero_std_ratio": 0.0 }, { "epoch": 0.3502386368705116, "grad_norm": 5773.9423828125, "learning_rate": 3e-06, "loss": 56.9934, "step": 3926 }, { "epoch": 0.3503278469155627, "grad_norm": 2974.229736328125, "learning_rate": 3e-06, "loss": 98.3543, "step": 3927 }, { "epoch": 0.35041705696061376, "grad_norm": 3015.796630859375, "learning_rate": 3e-06, "loss": -90.0642, "step": 3928 }, { "epoch": 0.35050626700566484, "grad_norm": 3810.086181640625, "learning_rate": 3e-06, "loss": 89.4173, "step": 3929 }, { "epoch": 0.3505954770507159, "grad_norm": 6000.9912109375, "learning_rate": 3e-06, "loss": 105.2092, "step": 3930 }, { "epoch": 0.350684687095767, "grad_norm": 2607.024169921875, "learning_rate": 3e-06, "loss": -13.4248, "step": 3931 }, { "epoch": 0.3507738971408181, "grad_norm": 2689.4892578125, "learning_rate": 3e-06, "loss": 33.3661, "step": 3932 }, { "epoch": 0.3508631071858691, "grad_norm": 5004.39599609375, "learning_rate": 3e-06, "loss": 97.1288, "step": 3933 }, { "epoch": 0.3509523172309202, "grad_norm": 2357.776611328125, "learning_rate": 3e-06, "loss": -128.6457, "step": 3934 }, { "epoch": 0.35104152727597127, "grad_norm": 4137.27001953125, "learning_rate": 3e-06, "loss": 91.5591, "step": 3935 }, { "epoch": 0.35113073732102235, "grad_norm": 4145.02490234375, "learning_rate": 3e-06, "loss": 73.9578, "step": 3936 }, { "completion_length": 130.39583587646484, "epoch": 0.3512199473660734, "grad_norm": 1533.2069091796875, "learning_rate": 3e-06, "loss": -273.4802, "reward": 1.9726042747497559, "reward_std": 0.17483485862612724, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1913541555404663, "step": 3937, "zero_std_ratio": 0.0 }, { "epoch": 0.3513091574111245, "grad_norm": 1664.6263427734375, "learning_rate": 3e-06, "loss": -288.2635, "step": 3938 }, { "epoch": 0.3513983674561756, "grad_norm": 1596.6475830078125, "learning_rate": 3e-06, "loss": -312.1967, "step": 3939 }, { "epoch": 0.3514875775012266, "grad_norm": 1829.539794921875, "learning_rate": 3e-06, "loss": -323.4694, "step": 3940 }, { "epoch": 0.3515767875462777, "grad_norm": 1600.3477783203125, "learning_rate": 3e-06, "loss": -303.0234, "step": 3941 }, { "epoch": 0.35166599759132877, "grad_norm": 1815.720458984375, "learning_rate": 3e-06, "loss": -309.1255, "step": 3942 }, { "epoch": 0.35175520763637985, "grad_norm": 1387.8104248046875, "learning_rate": 3e-06, "loss": -324.101, "step": 3943 }, { "epoch": 0.35184441768143093, "grad_norm": 1369.0994873046875, "learning_rate": 3e-06, "loss": -347.9606, "step": 3944 }, { "epoch": 0.351933627726482, "grad_norm": 1932.86376953125, "learning_rate": 3e-06, "loss": -388.4294, "step": 3945 }, { "epoch": 0.3520228377715331, "grad_norm": 1609.04541015625, "learning_rate": 3e-06, "loss": -409.2695, "step": 3946 }, { "epoch": 0.3521120478165842, "grad_norm": 1499.1630859375, "learning_rate": 3e-06, "loss": -383.4575, "step": 3947 }, { "epoch": 0.3522012578616352, "grad_norm": 1589.2305908203125, "learning_rate": 3e-06, "loss": -400.8874, "step": 3948 }, { "completion_length": 112.43750381469727, "epoch": 0.3522904679066863, "grad_norm": 1852.2044677734375, "learning_rate": 3e-06, "loss": 366.9594, "reward": 2.1618751287460327, "reward_std": 0.4159102328121662, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21395833790302277, "step": 3949, "zero_std_ratio": 0.0 }, { "epoch": 0.35237967795173736, "grad_norm": 1595.02099609375, "learning_rate": 3e-06, "loss": 276.9512, "step": 3950 }, { "epoch": 0.35246888799678844, "grad_norm": 1651.42626953125, "learning_rate": 3e-06, "loss": 333.0921, "step": 3951 }, { "epoch": 0.3525580980418395, "grad_norm": 1683.9635009765625, "learning_rate": 3e-06, "loss": 372.0911, "step": 3952 }, { "epoch": 0.3526473080868906, "grad_norm": 1603.2061767578125, "learning_rate": 3e-06, "loss": 373.6106, "step": 3953 }, { "epoch": 0.3527365181319417, "grad_norm": 2586.568359375, "learning_rate": 3e-06, "loss": 327.2733, "step": 3954 }, { "epoch": 0.3528257281769927, "grad_norm": 1694.0765380859375, "learning_rate": 3e-06, "loss": 373.1467, "step": 3955 }, { "epoch": 0.3529149382220438, "grad_norm": 1690.65869140625, "learning_rate": 3e-06, "loss": 281.8227, "step": 3956 }, { "epoch": 0.35300414826709486, "grad_norm": 1673.522216796875, "learning_rate": 3e-06, "loss": 328.5616, "step": 3957 }, { "epoch": 0.35309335831214594, "grad_norm": 1510.2718505859375, "learning_rate": 3e-06, "loss": 363.3606, "step": 3958 }, { "epoch": 0.353182568357197, "grad_norm": 1428.447021484375, "learning_rate": 3e-06, "loss": 349.4862, "step": 3959 }, { "epoch": 0.3532717784022481, "grad_norm": 2153.787353515625, "learning_rate": 3e-06, "loss": 300.6429, "step": 3960 }, { "completion_length": 98.54167175292969, "epoch": 0.3533609884472992, "grad_norm": 1080.53759765625, "learning_rate": 3e-06, "loss": 47.9625, "reward": 2.6085416078567505, "reward_std": 0.3628959357738495, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24395833164453506, "step": 3961, "zero_std_ratio": 0.0 }, { "epoch": 0.35345019849235026, "grad_norm": 976.9656982421875, "learning_rate": 3e-06, "loss": 106.8893, "step": 3962 }, { "epoch": 0.3535394085374013, "grad_norm": 883.5478515625, "learning_rate": 3e-06, "loss": 95.4302, "step": 3963 }, { "epoch": 0.35362861858245237, "grad_norm": 916.159423828125, "learning_rate": 3e-06, "loss": 48.995, "step": 3964 }, { "epoch": 0.35371782862750345, "grad_norm": 1041.633056640625, "learning_rate": 3e-06, "loss": 105.0387, "step": 3965 }, { "epoch": 0.35380703867255453, "grad_norm": 745.586669921875, "learning_rate": 3e-06, "loss": 34.3167, "step": 3966 }, { "epoch": 0.3538962487176056, "grad_norm": 676.5847778320312, "learning_rate": 3e-06, "loss": 22.7108, "step": 3967 }, { "epoch": 0.3539854587626567, "grad_norm": 731.3202514648438, "learning_rate": 3e-06, "loss": 76.4485, "step": 3968 }, { "epoch": 0.35407466880770777, "grad_norm": 516.250244140625, "learning_rate": 3e-06, "loss": 64.9996, "step": 3969 }, { "epoch": 0.3541638788527588, "grad_norm": 641.28955078125, "learning_rate": 3e-06, "loss": 30.3455, "step": 3970 }, { "epoch": 0.3542530888978099, "grad_norm": 668.1777954101562, "learning_rate": 3e-06, "loss": 61.5066, "step": 3971 }, { "epoch": 0.35434229894286096, "grad_norm": 458.7284851074219, "learning_rate": 3e-06, "loss": 9.8505, "step": 3972 }, { "completion_length": 121.3125, "epoch": 0.35443150898791204, "grad_norm": 1823.4388427734375, "learning_rate": 3e-06, "loss": -144.8193, "reward": 1.979270875453949, "reward_std": 0.3714235872030258, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1876041628420353, "step": 3973, "zero_std_ratio": 0.0 }, { "epoch": 0.3545207190329631, "grad_norm": 623.92919921875, "learning_rate": 3e-06, "loss": -62.0512, "step": 3974 }, { "epoch": 0.3546099290780142, "grad_norm": 1062.03125, "learning_rate": 3e-06, "loss": -88.9177, "step": 3975 }, { "epoch": 0.3546991391230653, "grad_norm": 405.3869323730469, "learning_rate": 3e-06, "loss": -83.5694, "step": 3976 }, { "epoch": 0.35478834916811636, "grad_norm": 478.39959716796875, "learning_rate": 3e-06, "loss": -82.9814, "step": 3977 }, { "epoch": 0.3548775592131674, "grad_norm": 970.3162841796875, "learning_rate": 3e-06, "loss": -89.0366, "step": 3978 }, { "epoch": 0.35496676925821846, "grad_norm": 1531.9501953125, "learning_rate": 3e-06, "loss": -136.3281, "step": 3979 }, { "epoch": 0.35505597930326954, "grad_norm": 534.76171875, "learning_rate": 3e-06, "loss": -72.2801, "step": 3980 }, { "epoch": 0.3551451893483206, "grad_norm": 1064.50390625, "learning_rate": 3e-06, "loss": -94.2882, "step": 3981 }, { "epoch": 0.3552343993933717, "grad_norm": 319.95416259765625, "learning_rate": 3e-06, "loss": -88.7971, "step": 3982 }, { "epoch": 0.3553236094384228, "grad_norm": 505.2913818359375, "learning_rate": 3e-06, "loss": -93.3108, "step": 3983 }, { "epoch": 0.35541281948347386, "grad_norm": 770.2453002929688, "learning_rate": 3e-06, "loss": -92.7481, "step": 3984 }, { "completion_length": 100.91667175292969, "epoch": 0.3555020295285249, "grad_norm": 188.17056274414062, "learning_rate": 3e-06, "loss": -15.298, "reward": 2.190354287624359, "reward_std": 0.13247851561754942, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23202083259820938, "step": 3985, "zero_std_ratio": 0.0 }, { "epoch": 0.35559123957357597, "grad_norm": 246.90707397460938, "learning_rate": 3e-06, "loss": -21.1176, "step": 3986 }, { "epoch": 0.35568044961862705, "grad_norm": 194.64175415039062, "learning_rate": 3e-06, "loss": -26.2212, "step": 3987 }, { "epoch": 0.35576965966367813, "grad_norm": 172.0954132080078, "learning_rate": 3e-06, "loss": -17.1377, "step": 3988 }, { "epoch": 0.3558588697087292, "grad_norm": 286.969482421875, "learning_rate": 3e-06, "loss": -25.4642, "step": 3989 }, { "epoch": 0.3559480797537803, "grad_norm": 201.43844604492188, "learning_rate": 3e-06, "loss": -18.0935, "step": 3990 }, { "epoch": 0.35603728979883137, "grad_norm": 173.05914306640625, "learning_rate": 3e-06, "loss": -18.526, "step": 3991 }, { "epoch": 0.3561264998438824, "grad_norm": 231.81845092773438, "learning_rate": 3e-06, "loss": -24.9131, "step": 3992 }, { "epoch": 0.3562157098889335, "grad_norm": 189.8501434326172, "learning_rate": 3e-06, "loss": -31.0998, "step": 3993 }, { "epoch": 0.35630491993398455, "grad_norm": 188.58221435546875, "learning_rate": 3e-06, "loss": -22.941, "step": 3994 }, { "epoch": 0.35639412997903563, "grad_norm": 225.09400939941406, "learning_rate": 3e-06, "loss": -33.4812, "step": 3995 }, { "epoch": 0.3564833400240867, "grad_norm": 179.369140625, "learning_rate": 3e-06, "loss": -23.5121, "step": 3996 }, { "completion_length": 133.8541717529297, "epoch": 0.3565725500691378, "grad_norm": 1200.021240234375, "learning_rate": 3e-06, "loss": 74.9888, "reward": 2.041708469390869, "reward_std": 0.3618824928998947, "rewards/correctness_reward_func": 1.4166666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1354583315551281, "step": 3997, "zero_std_ratio": 0.0 }, { "epoch": 0.3566617601141889, "grad_norm": 2234.099853515625, "learning_rate": 3e-06, "loss": 25.0163, "step": 3998 }, { "epoch": 0.35675097015923996, "grad_norm": 991.7899780273438, "learning_rate": 3e-06, "loss": 9.3074, "step": 3999 }, { "epoch": 0.356840180204291, "grad_norm": 2450.069091796875, "learning_rate": 3e-06, "loss": -142.5053, "step": 4000 }, { "epoch": 0.35692939024934206, "grad_norm": 1018.7874145507812, "learning_rate": 3e-06, "loss": -263.4709, "step": 4001 }, { "epoch": 0.35701860029439314, "grad_norm": 1514.8592529296875, "learning_rate": 3e-06, "loss": -0.0563, "step": 4002 }, { "epoch": 0.3571078103394442, "grad_norm": 1402.239013671875, "learning_rate": 3e-06, "loss": 71.0604, "step": 4003 }, { "epoch": 0.3571970203844953, "grad_norm": 2501.983154296875, "learning_rate": 3e-06, "loss": 22.5289, "step": 4004 }, { "epoch": 0.3572862304295464, "grad_norm": 944.4086303710938, "learning_rate": 3e-06, "loss": 0.0536, "step": 4005 }, { "epoch": 0.35737544047459746, "grad_norm": 2534.032958984375, "learning_rate": 3e-06, "loss": -157.0007, "step": 4006 }, { "epoch": 0.3574646505196485, "grad_norm": 1158.6558837890625, "learning_rate": 3e-06, "loss": -277.1906, "step": 4007 }, { "epoch": 0.35755386056469957, "grad_norm": 1730.727294921875, "learning_rate": 3e-06, "loss": -13.6321, "step": 4008 }, { "completion_length": 107.45833587646484, "epoch": 0.35764307060975065, "grad_norm": 1198.1639404296875, "learning_rate": 3e-06, "loss": -29.7024, "reward": 2.4935208559036255, "reward_std": 0.39220699667930603, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2122708410024643, "step": 4009, "zero_std_ratio": 0.0 }, { "epoch": 0.3577322806548017, "grad_norm": 1874.5372314453125, "learning_rate": 3e-06, "loss": -60.4107, "step": 4010 }, { "epoch": 0.3578214906998528, "grad_norm": 1597.57861328125, "learning_rate": 3e-06, "loss": -29.3406, "step": 4011 }, { "epoch": 0.3579107007449039, "grad_norm": 1721.5269775390625, "learning_rate": 3e-06, "loss": -43.0304, "step": 4012 }, { "epoch": 0.35799991078995497, "grad_norm": 1254.4666748046875, "learning_rate": 3e-06, "loss": -70.5784, "step": 4013 }, { "epoch": 0.35808912083500605, "grad_norm": 1421.9080810546875, "learning_rate": 3e-06, "loss": -78.2865, "step": 4014 }, { "epoch": 0.35817833088005707, "grad_norm": 1116.732421875, "learning_rate": 3e-06, "loss": -35.7034, "step": 4015 }, { "epoch": 0.35826754092510815, "grad_norm": 1829.7552490234375, "learning_rate": 3e-06, "loss": -90.3673, "step": 4016 }, { "epoch": 0.35835675097015923, "grad_norm": 1632.64794921875, "learning_rate": 3e-06, "loss": -50.3379, "step": 4017 }, { "epoch": 0.3584459610152103, "grad_norm": 2161.5712890625, "learning_rate": 3e-06, "loss": -70.4291, "step": 4018 }, { "epoch": 0.3585351710602614, "grad_norm": 1959.2393798828125, "learning_rate": 3e-06, "loss": -101.9316, "step": 4019 }, { "epoch": 0.3586243811053125, "grad_norm": 1894.447265625, "learning_rate": 3e-06, "loss": -105.0586, "step": 4020 }, { "completion_length": 99.39583587646484, "epoch": 0.35871359115036355, "grad_norm": 1992.625732421875, "learning_rate": 3e-06, "loss": 28.5218, "reward": 2.2624791860580444, "reward_std": 0.6723311841487885, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23122917860746384, "step": 4021, "zero_std_ratio": 0.0 }, { "epoch": 0.3588028011954146, "grad_norm": 3796.889404296875, "learning_rate": 3e-06, "loss": 74.1355, "step": 4022 }, { "epoch": 0.35889201124046566, "grad_norm": 2086.572509765625, "learning_rate": 3e-06, "loss": 104.5997, "step": 4023 }, { "epoch": 0.35898122128551674, "grad_norm": 1337.6444091796875, "learning_rate": 3e-06, "loss": 20.1896, "step": 4024 }, { "epoch": 0.3590704313305678, "grad_norm": 1724.3558349609375, "learning_rate": 3e-06, "loss": 24.121, "step": 4025 }, { "epoch": 0.3591596413756189, "grad_norm": 1661.4539794921875, "learning_rate": 3e-06, "loss": -87.7674, "step": 4026 }, { "epoch": 0.35924885142067, "grad_norm": 1738.5406494140625, "learning_rate": 3e-06, "loss": 6.038, "step": 4027 }, { "epoch": 0.35933806146572106, "grad_norm": 2524.776123046875, "learning_rate": 3e-06, "loss": 27.9882, "step": 4028 }, { "epoch": 0.35942727151077214, "grad_norm": 1999.9293212890625, "learning_rate": 3e-06, "loss": 79.3562, "step": 4029 }, { "epoch": 0.35951648155582316, "grad_norm": 1058.39453125, "learning_rate": 3e-06, "loss": 6.9009, "step": 4030 }, { "epoch": 0.35960569160087424, "grad_norm": 2380.192626953125, "learning_rate": 3e-06, "loss": 0.5756, "step": 4031 }, { "epoch": 0.3596949016459253, "grad_norm": 1748.9957275390625, "learning_rate": 3e-06, "loss": -105.0523, "step": 4032 }, { "completion_length": 104.75000381469727, "epoch": 0.3597841116909764, "grad_norm": 1442.3868408203125, "learning_rate": 3e-06, "loss": 33.8626, "reward": 2.224375009536743, "reward_std": 0.2803745334967971, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24520834535360336, "step": 4033, "zero_std_ratio": 0.0 }, { "epoch": 0.3598733217360275, "grad_norm": 665.0608520507812, "learning_rate": 3e-06, "loss": -0.5576, "step": 4034 }, { "epoch": 0.35996253178107857, "grad_norm": 642.9202880859375, "learning_rate": 3e-06, "loss": -0.2554, "step": 4035 }, { "epoch": 0.36005174182612965, "grad_norm": 878.629638671875, "learning_rate": 3e-06, "loss": 25.2766, "step": 4036 }, { "epoch": 0.36014095187118067, "grad_norm": 861.7694702148438, "learning_rate": 3e-06, "loss": 29.4603, "step": 4037 }, { "epoch": 0.36023016191623175, "grad_norm": 260.9869384765625, "learning_rate": 3e-06, "loss": -3.6128, "step": 4038 }, { "epoch": 0.36031937196128283, "grad_norm": 1684.0743408203125, "learning_rate": 3e-06, "loss": 20.4014, "step": 4039 }, { "epoch": 0.3604085820063339, "grad_norm": 499.24822998046875, "learning_rate": 3e-06, "loss": -3.3176, "step": 4040 }, { "epoch": 0.360497792051385, "grad_norm": 481.536865234375, "learning_rate": 3e-06, "loss": -1.2369, "step": 4041 }, { "epoch": 0.36058700209643607, "grad_norm": 622.6273803710938, "learning_rate": 3e-06, "loss": 17.5933, "step": 4042 }, { "epoch": 0.36067621214148715, "grad_norm": 528.6300659179688, "learning_rate": 3e-06, "loss": 21.7594, "step": 4043 }, { "epoch": 0.36076542218653823, "grad_norm": 261.8156433105469, "learning_rate": 3e-06, "loss": -4.0295, "step": 4044 }, { "completion_length": 112.62500381469727, "epoch": 0.36085463223158926, "grad_norm": 1860.0218505859375, "learning_rate": 3e-06, "loss": 111.1417, "reward": 2.5006461143493652, "reward_std": 0.47322162613272667, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21939583867788315, "step": 4045, "zero_std_ratio": 0.0 }, { "epoch": 0.36094384227664034, "grad_norm": 2668.2890625, "learning_rate": 3e-06, "loss": 103.9209, "step": 4046 }, { "epoch": 0.3610330523216914, "grad_norm": 2340.26025390625, "learning_rate": 3e-06, "loss": 95.0352, "step": 4047 }, { "epoch": 0.3611222623667425, "grad_norm": 1857.202880859375, "learning_rate": 3e-06, "loss": 109.1596, "step": 4048 }, { "epoch": 0.3612114724117936, "grad_norm": 2125.943115234375, "learning_rate": 3e-06, "loss": 204.8602, "step": 4049 }, { "epoch": 0.36130068245684466, "grad_norm": 2596.594482421875, "learning_rate": 3e-06, "loss": 76.446, "step": 4050 }, { "epoch": 0.36138989250189574, "grad_norm": 2351.39697265625, "learning_rate": 3e-06, "loss": 97.2649, "step": 4051 }, { "epoch": 0.36147910254694676, "grad_norm": 2422.360107421875, "learning_rate": 3e-06, "loss": 92.9321, "step": 4052 }, { "epoch": 0.36156831259199784, "grad_norm": 2310.494140625, "learning_rate": 3e-06, "loss": 71.4649, "step": 4053 }, { "epoch": 0.3616575226370489, "grad_norm": 1808.0902099609375, "learning_rate": 3e-06, "loss": 97.1086, "step": 4054 }, { "epoch": 0.3617467326821, "grad_norm": 2899.19482421875, "learning_rate": 3e-06, "loss": 159.2642, "step": 4055 }, { "epoch": 0.3618359427271511, "grad_norm": 2288.69287109375, "learning_rate": 3e-06, "loss": 49.3393, "step": 4056 }, { "completion_length": 126.75000762939453, "epoch": 0.36192515277220216, "grad_norm": 1815.6575927734375, "learning_rate": 3e-06, "loss": 179.127, "reward": 2.318562626838684, "reward_std": 0.5790471583604813, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19356249272823334, "step": 4057, "zero_std_ratio": 0.0 }, { "epoch": 0.36201436281725324, "grad_norm": 2313.75830078125, "learning_rate": 3e-06, "loss": 91.6541, "step": 4058 }, { "epoch": 0.36210357286230427, "grad_norm": 2244.6650390625, "learning_rate": 3e-06, "loss": 231.7648, "step": 4059 }, { "epoch": 0.36219278290735535, "grad_norm": 1862.6575927734375, "learning_rate": 3e-06, "loss": 128.5912, "step": 4060 }, { "epoch": 0.36228199295240643, "grad_norm": 1577.962158203125, "learning_rate": 3e-06, "loss": 68.6859, "step": 4061 }, { "epoch": 0.3623712029974575, "grad_norm": 1171.620361328125, "learning_rate": 3e-06, "loss": 154.6711, "step": 4062 }, { "epoch": 0.3624604130425086, "grad_norm": 1688.853515625, "learning_rate": 3e-06, "loss": 164.3745, "step": 4063 }, { "epoch": 0.36254962308755967, "grad_norm": 1763.775390625, "learning_rate": 3e-06, "loss": 70.661, "step": 4064 }, { "epoch": 0.36263883313261075, "grad_norm": 2364.757080078125, "learning_rate": 3e-06, "loss": 206.6784, "step": 4065 }, { "epoch": 0.36272804317766183, "grad_norm": 1931.721435546875, "learning_rate": 3e-06, "loss": 106.8668, "step": 4066 }, { "epoch": 0.36281725322271285, "grad_norm": 1670.3575439453125, "learning_rate": 3e-06, "loss": 51.7907, "step": 4067 }, { "epoch": 0.36290646326776393, "grad_norm": 1417.4649658203125, "learning_rate": 3e-06, "loss": 141.0333, "step": 4068 }, { "completion_length": 118.60416793823242, "epoch": 0.362995673312815, "grad_norm": 1114.549072265625, "learning_rate": 3e-06, "loss": 109.3135, "reward": 2.3475000858306885, "reward_std": 0.3930952399969101, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20166666060686111, "step": 4069, "zero_std_ratio": 0.0 }, { "epoch": 0.3630848833578661, "grad_norm": 1923.74951171875, "learning_rate": 3e-06, "loss": 134.5127, "step": 4070 }, { "epoch": 0.3631740934029172, "grad_norm": 1075.20654296875, "learning_rate": 3e-06, "loss": 116.1331, "step": 4071 }, { "epoch": 0.36326330344796826, "grad_norm": 1750.5550537109375, "learning_rate": 3e-06, "loss": 38.3991, "step": 4072 }, { "epoch": 0.36335251349301934, "grad_norm": 2035.4764404296875, "learning_rate": 3e-06, "loss": 121.3921, "step": 4073 }, { "epoch": 0.36344172353807036, "grad_norm": 1746.701904296875, "learning_rate": 3e-06, "loss": 47.3718, "step": 4074 }, { "epoch": 0.36353093358312144, "grad_norm": 1150.864013671875, "learning_rate": 3e-06, "loss": 91.7032, "step": 4075 }, { "epoch": 0.3636201436281725, "grad_norm": 1636.14794921875, "learning_rate": 3e-06, "loss": 102.4581, "step": 4076 }, { "epoch": 0.3637093536732236, "grad_norm": 1101.1552734375, "learning_rate": 3e-06, "loss": 95.0299, "step": 4077 }, { "epoch": 0.3637985637182747, "grad_norm": 1751.56982421875, "learning_rate": 3e-06, "loss": 16.8594, "step": 4078 }, { "epoch": 0.36388777376332576, "grad_norm": 1321.927490234375, "learning_rate": 3e-06, "loss": 93.828, "step": 4079 }, { "epoch": 0.36397698380837684, "grad_norm": 1768.9239501953125, "learning_rate": 3e-06, "loss": 19.4123, "step": 4080 }, { "completion_length": 124.14583587646484, "epoch": 0.3640661938534279, "grad_norm": 2405.406982421875, "learning_rate": 3e-06, "loss": 112.4096, "reward": 2.2018333673477173, "reward_std": 0.34975601732730865, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2018333300948143, "step": 4081, "zero_std_ratio": 0.0 }, { "epoch": 0.36415540389847895, "grad_norm": 1257.9105224609375, "learning_rate": 3e-06, "loss": 131.3701, "step": 4082 }, { "epoch": 0.36424461394353, "grad_norm": 2129.0322265625, "learning_rate": 3e-06, "loss": 135.6432, "step": 4083 }, { "epoch": 0.3643338239885811, "grad_norm": 1093.75439453125, "learning_rate": 3e-06, "loss": 187.8958, "step": 4084 }, { "epoch": 0.3644230340336322, "grad_norm": 2406.638427734375, "learning_rate": 3e-06, "loss": 78.88, "step": 4085 }, { "epoch": 0.36451224407868327, "grad_norm": 1234.732177734375, "learning_rate": 3e-06, "loss": 103.9847, "step": 4086 }, { "epoch": 0.36460145412373435, "grad_norm": 1725.11328125, "learning_rate": 3e-06, "loss": 82.622, "step": 4087 }, { "epoch": 0.36469066416878543, "grad_norm": 1585.5115966796875, "learning_rate": 3e-06, "loss": 113.2464, "step": 4088 }, { "epoch": 0.36477987421383645, "grad_norm": 2204.511962890625, "learning_rate": 3e-06, "loss": 95.8539, "step": 4089 }, { "epoch": 0.36486908425888753, "grad_norm": 1244.7109375, "learning_rate": 3e-06, "loss": 166.0196, "step": 4090 }, { "epoch": 0.3649582943039386, "grad_norm": 2593.052734375, "learning_rate": 3e-06, "loss": 48.9979, "step": 4091 }, { "epoch": 0.3650475043489897, "grad_norm": 1707.4163818359375, "learning_rate": 3e-06, "loss": 75.9911, "step": 4092 }, { "completion_length": 129.87500381469727, "epoch": 0.3651367143940408, "grad_norm": 2491.021728515625, "learning_rate": 3e-06, "loss": -359.7729, "reward": 2.42870831489563, "reward_std": 0.6494415998458862, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18912500888109207, "step": 4093, "zero_std_ratio": 0.0 }, { "epoch": 0.36522592443909185, "grad_norm": 2131.9296875, "learning_rate": 3e-06, "loss": -296.6728, "step": 4094 }, { "epoch": 0.36531513448414293, "grad_norm": 2279.06396484375, "learning_rate": 3e-06, "loss": -269.2261, "step": 4095 }, { "epoch": 0.365404344529194, "grad_norm": 1496.7486572265625, "learning_rate": 3e-06, "loss": -125.9525, "step": 4096 }, { "epoch": 0.36549355457424504, "grad_norm": 3478.40625, "learning_rate": 3e-06, "loss": -236.9106, "step": 4097 }, { "epoch": 0.3655827646192961, "grad_norm": 3427.197509765625, "learning_rate": 3e-06, "loss": -397.3185, "step": 4098 }, { "epoch": 0.3656719746643472, "grad_norm": 2189.1806640625, "learning_rate": 3e-06, "loss": -387.5226, "step": 4099 }, { "epoch": 0.3657611847093983, "grad_norm": 2282.716796875, "learning_rate": 3e-06, "loss": -321.1008, "step": 4100 }, { "epoch": 0.36585039475444936, "grad_norm": 2178.079833984375, "learning_rate": 3e-06, "loss": -286.7885, "step": 4101 }, { "epoch": 0.36593960479950044, "grad_norm": 1816.7889404296875, "learning_rate": 3e-06, "loss": -135.5441, "step": 4102 }, { "epoch": 0.3660288148445515, "grad_norm": 4172.68994140625, "learning_rate": 3e-06, "loss": -287.5686, "step": 4103 }, { "epoch": 0.36611802488960254, "grad_norm": 16226.7822265625, "learning_rate": 3e-06, "loss": -492.7274, "step": 4104 }, { "completion_length": 138.5416717529297, "epoch": 0.3662072349346536, "grad_norm": 2049.502685546875, "learning_rate": 3e-06, "loss": 174.642, "reward": 2.0777708888053894, "reward_std": 0.6865398585796356, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16110417246818542, "step": 4105, "zero_std_ratio": 0.0 }, { "epoch": 0.3662964449797047, "grad_norm": 2934.732421875, "learning_rate": 3e-06, "loss": 305.0708, "step": 4106 }, { "epoch": 0.3663856550247558, "grad_norm": 3624.81005859375, "learning_rate": 3e-06, "loss": 274.5327, "step": 4107 }, { "epoch": 0.36647486506980687, "grad_norm": 3055.79931640625, "learning_rate": 3e-06, "loss": 261.0976, "step": 4108 }, { "epoch": 0.36656407511485795, "grad_norm": 1940.75, "learning_rate": 3e-06, "loss": 292.6276, "step": 4109 }, { "epoch": 0.366653285159909, "grad_norm": 3130.45849609375, "learning_rate": 3e-06, "loss": 268.0661, "step": 4110 }, { "epoch": 0.3667424952049601, "grad_norm": 2042.078125, "learning_rate": 3e-06, "loss": 168.7931, "step": 4111 }, { "epoch": 0.36683170525001113, "grad_norm": 3418.92724609375, "learning_rate": 3e-06, "loss": 263.8893, "step": 4112 }, { "epoch": 0.3669209152950622, "grad_norm": 3315.45458984375, "learning_rate": 3e-06, "loss": 232.0627, "step": 4113 }, { "epoch": 0.3670101253401133, "grad_norm": 2064.73876953125, "learning_rate": 3e-06, "loss": 214.6888, "step": 4114 }, { "epoch": 0.36709933538516437, "grad_norm": 2108.952880859375, "learning_rate": 3e-06, "loss": 249.6333, "step": 4115 }, { "epoch": 0.36718854543021545, "grad_norm": 2127.252685546875, "learning_rate": 3e-06, "loss": 200.7585, "step": 4116 }, { "completion_length": 107.02083587646484, "epoch": 0.36727775547526653, "grad_norm": 460.37249755859375, "learning_rate": 3e-06, "loss": 15.9995, "reward": 2.249562621116638, "reward_std": 0.41240447759628296, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21831249445676804, "step": 4117, "zero_std_ratio": 0.0 }, { "epoch": 0.3673669655203176, "grad_norm": 341.31829833984375, "learning_rate": 3e-06, "loss": 24.0603, "step": 4118 }, { "epoch": 0.36745617556536864, "grad_norm": 394.5201110839844, "learning_rate": 3e-06, "loss": 35.4696, "step": 4119 }, { "epoch": 0.3675453856104197, "grad_norm": 352.8374938964844, "learning_rate": 3e-06, "loss": 24.5691, "step": 4120 }, { "epoch": 0.3676345956554708, "grad_norm": 288.1681823730469, "learning_rate": 3e-06, "loss": 9.3665, "step": 4121 }, { "epoch": 0.3677238057005219, "grad_norm": 467.0898742675781, "learning_rate": 3e-06, "loss": 12.1091, "step": 4122 }, { "epoch": 0.36781301574557296, "grad_norm": 333.2739562988281, "learning_rate": 3e-06, "loss": 5.9978, "step": 4123 }, { "epoch": 0.36790222579062404, "grad_norm": 273.65277099609375, "learning_rate": 3e-06, "loss": 15.2441, "step": 4124 }, { "epoch": 0.3679914358356751, "grad_norm": 313.0566101074219, "learning_rate": 3e-06, "loss": 27.7277, "step": 4125 }, { "epoch": 0.36808064588072614, "grad_norm": 256.2476501464844, "learning_rate": 3e-06, "loss": 13.553, "step": 4126 }, { "epoch": 0.3681698559257772, "grad_norm": 241.54498291015625, "learning_rate": 3e-06, "loss": 2.4649, "step": 4127 }, { "epoch": 0.3682590659708283, "grad_norm": 303.197021484375, "learning_rate": 3e-06, "loss": 0.951, "step": 4128 }, { "completion_length": 124.60417175292969, "epoch": 0.3683482760158794, "grad_norm": 1029.1561279296875, "learning_rate": 3e-06, "loss": 90.1219, "reward": 2.110583484172821, "reward_std": 0.49745041131973267, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1626666635274887, "step": 4129, "zero_std_ratio": 0.0 }, { "epoch": 0.36843748606093046, "grad_norm": 955.6253662109375, "learning_rate": 3e-06, "loss": 72.0493, "step": 4130 }, { "epoch": 0.36852669610598154, "grad_norm": 931.212158203125, "learning_rate": 3e-06, "loss": 26.7699, "step": 4131 }, { "epoch": 0.3686159061510326, "grad_norm": 2004.7652587890625, "learning_rate": 3e-06, "loss": 61.1489, "step": 4132 }, { "epoch": 0.3687051161960837, "grad_norm": 1978.98095703125, "learning_rate": 3e-06, "loss": 54.9747, "step": 4133 }, { "epoch": 0.36879432624113473, "grad_norm": 714.897216796875, "learning_rate": 3e-06, "loss": 51.4766, "step": 4134 }, { "epoch": 0.3688835362861858, "grad_norm": 2049.15087890625, "learning_rate": 3e-06, "loss": 49.8238, "step": 4135 }, { "epoch": 0.3689727463312369, "grad_norm": 1023.445068359375, "learning_rate": 3e-06, "loss": 35.3613, "step": 4136 }, { "epoch": 0.36906195637628797, "grad_norm": 354.4919738769531, "learning_rate": 3e-06, "loss": 12.1702, "step": 4137 }, { "epoch": 0.36915116642133905, "grad_norm": 529.1526489257812, "learning_rate": 3e-06, "loss": 17.7458, "step": 4138 }, { "epoch": 0.36924037646639013, "grad_norm": 348.49822998046875, "learning_rate": 3e-06, "loss": 25.1575, "step": 4139 }, { "epoch": 0.3693295865114412, "grad_norm": 562.7550048828125, "learning_rate": 3e-06, "loss": 44.4098, "step": 4140 }, { "completion_length": 141.37500762939453, "epoch": 0.36941879655649223, "grad_norm": 3207.765625, "learning_rate": 3e-06, "loss": -887.4449, "reward": 2.3043750524520874, "reward_std": 0.5956731140613556, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13770833611488342, "step": 4141, "zero_std_ratio": 0.125 }, { "epoch": 0.3695080066015433, "grad_norm": 3255.30078125, "learning_rate": 3e-06, "loss": -912.122, "step": 4142 }, { "epoch": 0.3695972166465944, "grad_norm": 3195.828125, "learning_rate": 3e-06, "loss": -811.557, "step": 4143 }, { "epoch": 0.3696864266916455, "grad_norm": 2226.805908203125, "learning_rate": 3e-06, "loss": -829.1491, "step": 4144 }, { "epoch": 0.36977563673669656, "grad_norm": 2298.43017578125, "learning_rate": 3e-06, "loss": -795.5204, "step": 4145 }, { "epoch": 0.36986484678174764, "grad_norm": 3086.864013671875, "learning_rate": 3e-06, "loss": -882.7493, "step": 4146 }, { "epoch": 0.3699540568267987, "grad_norm": 2982.46826171875, "learning_rate": 3e-06, "loss": -915.4579, "step": 4147 }, { "epoch": 0.3700432668718498, "grad_norm": 3152.36865234375, "learning_rate": 3e-06, "loss": -951.7438, "step": 4148 }, { "epoch": 0.3701324769169008, "grad_norm": 2327.09033203125, "learning_rate": 3e-06, "loss": -872.5244, "step": 4149 }, { "epoch": 0.3702216869619519, "grad_norm": 1945.86474609375, "learning_rate": 3e-06, "loss": -901.1398, "step": 4150 }, { "epoch": 0.370310897007003, "grad_norm": 2727.462646484375, "learning_rate": 3e-06, "loss": -871.1791, "step": 4151 }, { "epoch": 0.37040010705205406, "grad_norm": 2856.957763671875, "learning_rate": 3e-06, "loss": -989.1044, "step": 4152 }, { "completion_length": 110.08333587646484, "epoch": 0.37048931709710514, "grad_norm": 147.50808715820312, "learning_rate": 3e-06, "loss": 0.2431, "reward": 2.4840625524520874, "reward_std": 0.11895603453740478, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20281249284744263, "step": 4153, "zero_std_ratio": 0.0 }, { "epoch": 0.3705785271421562, "grad_norm": 145.95028686523438, "learning_rate": 3e-06, "loss": 5.6026, "step": 4154 }, { "epoch": 0.3706677371872073, "grad_norm": 131.57688903808594, "learning_rate": 3e-06, "loss": 6.2424, "step": 4155 }, { "epoch": 0.3707569472322583, "grad_norm": 84.48395538330078, "learning_rate": 3e-06, "loss": 5.9066, "step": 4156 }, { "epoch": 0.3708461572773094, "grad_norm": 123.48330688476562, "learning_rate": 3e-06, "loss": 3.4525, "step": 4157 }, { "epoch": 0.3709353673223605, "grad_norm": 139.19007873535156, "learning_rate": 3e-06, "loss": -2.5766, "step": 4158 }, { "epoch": 0.37102457736741157, "grad_norm": 132.55308532714844, "learning_rate": 3e-06, "loss": -0.4067, "step": 4159 }, { "epoch": 0.37111378741246265, "grad_norm": 114.35975646972656, "learning_rate": 3e-06, "loss": 4.0513, "step": 4160 }, { "epoch": 0.37120299745751373, "grad_norm": 134.66842651367188, "learning_rate": 3e-06, "loss": 2.6168, "step": 4161 }, { "epoch": 0.3712922075025648, "grad_norm": 281.9593200683594, "learning_rate": 3e-06, "loss": 5.4427, "step": 4162 }, { "epoch": 0.3713814175476159, "grad_norm": 126.50686645507812, "learning_rate": 3e-06, "loss": 3.3902, "step": 4163 }, { "epoch": 0.3714706275926669, "grad_norm": 153.9324188232422, "learning_rate": 3e-06, "loss": -4.3268, "step": 4164 }, { "completion_length": 102.79167175292969, "epoch": 0.371559837637718, "grad_norm": 650.2644653320312, "learning_rate": 3e-06, "loss": 60.3234, "reward": 2.5276252031326294, "reward_std": 0.23809551447629929, "rewards/correctness_reward_func": 1.7916666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23595834523439407, "step": 4165, "zero_std_ratio": 0.0 }, { "epoch": 0.3716490476827691, "grad_norm": 766.36669921875, "learning_rate": 3e-06, "loss": 7.9096, "step": 4166 }, { "epoch": 0.37173825772782015, "grad_norm": 1424.0771484375, "learning_rate": 3e-06, "loss": 25.2122, "step": 4167 }, { "epoch": 0.37182746777287123, "grad_norm": 1251.6806640625, "learning_rate": 3e-06, "loss": 24.4775, "step": 4168 }, { "epoch": 0.3719166778179223, "grad_norm": 1003.8441162109375, "learning_rate": 3e-06, "loss": 43.9804, "step": 4169 }, { "epoch": 0.3720058878629734, "grad_norm": 904.416259765625, "learning_rate": 3e-06, "loss": 24.6363, "step": 4170 }, { "epoch": 0.3720950979080244, "grad_norm": 742.5004272460938, "learning_rate": 3e-06, "loss": 55.0547, "step": 4171 }, { "epoch": 0.3721843079530755, "grad_norm": 519.9896850585938, "learning_rate": 3e-06, "loss": -0.6226, "step": 4172 }, { "epoch": 0.3722735179981266, "grad_norm": 693.0836181640625, "learning_rate": 3e-06, "loss": 11.9299, "step": 4173 }, { "epoch": 0.37236272804317766, "grad_norm": 693.8762817382812, "learning_rate": 3e-06, "loss": 9.1782, "step": 4174 }, { "epoch": 0.37245193808822874, "grad_norm": 848.0070190429688, "learning_rate": 3e-06, "loss": 29.2588, "step": 4175 }, { "epoch": 0.3725411481332798, "grad_norm": 760.9902954101562, "learning_rate": 3e-06, "loss": 17.7304, "step": 4176 }, { "completion_length": 134.0, "epoch": 0.3726303581783309, "grad_norm": 241.20314025878906, "learning_rate": 3e-06, "loss": -0.9938, "reward": 1.9664167165756226, "reward_std": 0.269694022834301, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14349999837577343, "step": 4177, "zero_std_ratio": 0.0 }, { "epoch": 0.3727195682233819, "grad_norm": 221.4034423828125, "learning_rate": 3e-06, "loss": 0.8544, "step": 4178 }, { "epoch": 0.372808778268433, "grad_norm": 156.0101318359375, "learning_rate": 3e-06, "loss": -7.6062, "step": 4179 }, { "epoch": 0.3728979883134841, "grad_norm": 230.4752960205078, "learning_rate": 3e-06, "loss": -1.1486, "step": 4180 }, { "epoch": 0.37298719835853517, "grad_norm": 181.69650268554688, "learning_rate": 3e-06, "loss": -5.847, "step": 4181 }, { "epoch": 0.37307640840358625, "grad_norm": 171.9276123046875, "learning_rate": 3e-06, "loss": 3.0712, "step": 4182 }, { "epoch": 0.3731656184486373, "grad_norm": 211.27719116210938, "learning_rate": 3e-06, "loss": -2.026, "step": 4183 }, { "epoch": 0.3732548284936884, "grad_norm": 262.46307373046875, "learning_rate": 3e-06, "loss": -0.6228, "step": 4184 }, { "epoch": 0.3733440385387395, "grad_norm": 167.2457275390625, "learning_rate": 3e-06, "loss": -8.5308, "step": 4185 }, { "epoch": 0.3734332485837905, "grad_norm": 210.81787109375, "learning_rate": 3e-06, "loss": -3.4821, "step": 4186 }, { "epoch": 0.3735224586288416, "grad_norm": 193.9115447998047, "learning_rate": 3e-06, "loss": -7.9114, "step": 4187 }, { "epoch": 0.37361166867389267, "grad_norm": 236.89215087890625, "learning_rate": 3e-06, "loss": 1.0488, "step": 4188 }, { "completion_length": 124.43750381469727, "epoch": 0.37370087871894375, "grad_norm": 235.67262268066406, "learning_rate": 3e-06, "loss": -10.262, "reward": 2.343104362487793, "reward_std": 0.22925975546240807, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17643750831484795, "step": 4189, "zero_std_ratio": 0.0 }, { "epoch": 0.37379008876399483, "grad_norm": 170.79525756835938, "learning_rate": 3e-06, "loss": -4.4316, "step": 4190 }, { "epoch": 0.3738792988090459, "grad_norm": 139.02206420898438, "learning_rate": 3e-06, "loss": -3.4703, "step": 4191 }, { "epoch": 0.373968508854097, "grad_norm": 224.05685424804688, "learning_rate": 3e-06, "loss": -10.4546, "step": 4192 }, { "epoch": 0.374057718899148, "grad_norm": 202.5243682861328, "learning_rate": 3e-06, "loss": -17.2406, "step": 4193 }, { "epoch": 0.3741469289441991, "grad_norm": 412.2899169921875, "learning_rate": 3e-06, "loss": 1.5714, "step": 4194 }, { "epoch": 0.3742361389892502, "grad_norm": 236.7718505859375, "learning_rate": 3e-06, "loss": -14.027, "step": 4195 }, { "epoch": 0.37432534903430126, "grad_norm": 237.08128356933594, "learning_rate": 3e-06, "loss": -7.2211, "step": 4196 }, { "epoch": 0.37441455907935234, "grad_norm": 176.626220703125, "learning_rate": 3e-06, "loss": -6.0445, "step": 4197 }, { "epoch": 0.3745037691244034, "grad_norm": 294.6336364746094, "learning_rate": 3e-06, "loss": -15.3397, "step": 4198 }, { "epoch": 0.3745929791694545, "grad_norm": 202.54689025878906, "learning_rate": 3e-06, "loss": -21.6452, "step": 4199 }, { "epoch": 0.3746821892145056, "grad_norm": 427.1260681152344, "learning_rate": 3e-06, "loss": -0.4063, "step": 4200 }, { "completion_length": 126.70833969116211, "epoch": 0.3747713992595566, "grad_norm": 241.9331512451172, "learning_rate": 3e-06, "loss": -9.0842, "reward": 2.380625009536743, "reward_std": 0.3124370686709881, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18270833045244217, "step": 4201, "zero_std_ratio": 0.0 }, { "epoch": 0.3748606093046077, "grad_norm": 237.35166931152344, "learning_rate": 3e-06, "loss": -22.0071, "step": 4202 }, { "epoch": 0.37494981934965876, "grad_norm": 236.7279815673828, "learning_rate": 3e-06, "loss": -1.1559, "step": 4203 }, { "epoch": 0.37503902939470984, "grad_norm": 298.2912292480469, "learning_rate": 3e-06, "loss": -15.1957, "step": 4204 }, { "epoch": 0.3751282394397609, "grad_norm": 229.4846954345703, "learning_rate": 3e-06, "loss": -15.4783, "step": 4205 }, { "epoch": 0.375217449484812, "grad_norm": 313.5098571777344, "learning_rate": 3e-06, "loss": 4.857, "step": 4206 }, { "epoch": 0.3753066595298631, "grad_norm": 347.5099792480469, "learning_rate": 3e-06, "loss": -16.1393, "step": 4207 }, { "epoch": 0.3753958695749141, "grad_norm": 233.28878784179688, "learning_rate": 3e-06, "loss": -29.4661, "step": 4208 }, { "epoch": 0.3754850796199652, "grad_norm": 281.68072509765625, "learning_rate": 3e-06, "loss": -6.5767, "step": 4209 }, { "epoch": 0.37557428966501627, "grad_norm": 276.3356018066406, "learning_rate": 3e-06, "loss": -24.749, "step": 4210 }, { "epoch": 0.37566349971006735, "grad_norm": 242.5169677734375, "learning_rate": 3e-06, "loss": -23.1229, "step": 4211 }, { "epoch": 0.37575270975511843, "grad_norm": 273.2527160644531, "learning_rate": 3e-06, "loss": 2.3749, "step": 4212 }, { "completion_length": 121.64583587646484, "epoch": 0.3758419198001695, "grad_norm": 787.3429565429688, "learning_rate": 3e-06, "loss": 11.9565, "reward": 2.5361251831054688, "reward_std": 0.3409547358751297, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17154166847467422, "step": 4213, "zero_std_ratio": 0.0 }, { "epoch": 0.3759311298452206, "grad_norm": 622.6375732421875, "learning_rate": 3e-06, "loss": 25.3744, "step": 4214 }, { "epoch": 0.37602033989027167, "grad_norm": 449.7272644042969, "learning_rate": 3e-06, "loss": 15.1553, "step": 4215 }, { "epoch": 0.3761095499353227, "grad_norm": 714.5230712890625, "learning_rate": 3e-06, "loss": 10.3302, "step": 4216 }, { "epoch": 0.3761987599803738, "grad_norm": 454.1429748535156, "learning_rate": 3e-06, "loss": 11.9743, "step": 4217 }, { "epoch": 0.37628797002542486, "grad_norm": 572.553955078125, "learning_rate": 3e-06, "loss": 19.8313, "step": 4218 }, { "epoch": 0.37637718007047594, "grad_norm": 547.42626953125, "learning_rate": 3e-06, "loss": 10.3074, "step": 4219 }, { "epoch": 0.376466390115527, "grad_norm": 618.7186279296875, "learning_rate": 3e-06, "loss": 23.9382, "step": 4220 }, { "epoch": 0.3765556001605781, "grad_norm": 497.3319396972656, "learning_rate": 3e-06, "loss": 11.1986, "step": 4221 }, { "epoch": 0.3766448102056292, "grad_norm": 316.6004333496094, "learning_rate": 3e-06, "loss": 4.9879, "step": 4222 }, { "epoch": 0.3767340202506802, "grad_norm": 261.0172119140625, "learning_rate": 3e-06, "loss": 8.7952, "step": 4223 }, { "epoch": 0.3768232302957313, "grad_norm": 358.22381591796875, "learning_rate": 3e-06, "loss": 11.7231, "step": 4224 }, { "completion_length": 132.4791717529297, "epoch": 0.37691244034078236, "grad_norm": 1228.272216796875, "learning_rate": 3e-06, "loss": -39.569, "reward": 2.418583393096924, "reward_std": 0.36135293543338776, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16858333349227905, "step": 4225, "zero_std_ratio": 0.125 }, { "epoch": 0.37700165038583344, "grad_norm": 815.456298828125, "learning_rate": 3e-06, "loss": -59.287, "step": 4226 }, { "epoch": 0.3770908604308845, "grad_norm": 1235.8233642578125, "learning_rate": 3e-06, "loss": -11.05, "step": 4227 }, { "epoch": 0.3771800704759356, "grad_norm": 1294.6051025390625, "learning_rate": 3e-06, "loss": -32.0026, "step": 4228 }, { "epoch": 0.3772692805209867, "grad_norm": 1115.9564208984375, "learning_rate": 3e-06, "loss": -30.6909, "step": 4229 }, { "epoch": 0.37735849056603776, "grad_norm": 1002.8909912109375, "learning_rate": 3e-06, "loss": -64.4382, "step": 4230 }, { "epoch": 0.3774477006110888, "grad_norm": 1155.1558837890625, "learning_rate": 3e-06, "loss": -51.8689, "step": 4231 }, { "epoch": 0.37753691065613987, "grad_norm": 922.8177490234375, "learning_rate": 3e-06, "loss": -63.281, "step": 4232 }, { "epoch": 0.37762612070119095, "grad_norm": 1143.1485595703125, "learning_rate": 3e-06, "loss": -33.4023, "step": 4233 }, { "epoch": 0.37771533074624203, "grad_norm": 1408.3936767578125, "learning_rate": 3e-06, "loss": -44.1105, "step": 4234 }, { "epoch": 0.3778045407912931, "grad_norm": 1156.298095703125, "learning_rate": 3e-06, "loss": -33.6505, "step": 4235 }, { "epoch": 0.3778937508363442, "grad_norm": 1049.872314453125, "learning_rate": 3e-06, "loss": -68.2741, "step": 4236 }, { "completion_length": 108.0, "epoch": 0.37798296088139527, "grad_norm": 73.9151382446289, "learning_rate": 3e-06, "loss": -2.6458, "reward": 2.199937582015991, "reward_std": 0.12390623055398464, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24160416424274445, "step": 4237, "zero_std_ratio": 0.125 }, { "epoch": 0.3780721709264463, "grad_norm": 68.44878387451172, "learning_rate": 3e-06, "loss": -4.1362, "step": 4238 }, { "epoch": 0.3781613809714974, "grad_norm": 81.85137176513672, "learning_rate": 3e-06, "loss": -6.8214, "step": 4239 }, { "epoch": 0.37825059101654845, "grad_norm": 59.9229850769043, "learning_rate": 3e-06, "loss": -6.2128, "step": 4240 }, { "epoch": 0.37833980106159953, "grad_norm": 58.90769958496094, "learning_rate": 3e-06, "loss": -1.3736, "step": 4241 }, { "epoch": 0.3784290111066506, "grad_norm": 122.20264434814453, "learning_rate": 3e-06, "loss": 3.049, "step": 4242 }, { "epoch": 0.3785182211517017, "grad_norm": 75.69770050048828, "learning_rate": 3e-06, "loss": -3.5548, "step": 4243 }, { "epoch": 0.3786074311967528, "grad_norm": 76.24506378173828, "learning_rate": 3e-06, "loss": -4.7935, "step": 4244 }, { "epoch": 0.3786966412418038, "grad_norm": 134.84402465820312, "learning_rate": 3e-06, "loss": -7.7533, "step": 4245 }, { "epoch": 0.3787858512868549, "grad_norm": 55.79713821411133, "learning_rate": 3e-06, "loss": -7.1077, "step": 4246 }, { "epoch": 0.37887506133190596, "grad_norm": 56.5816764831543, "learning_rate": 3e-06, "loss": -2.0349, "step": 4247 }, { "epoch": 0.37896427137695704, "grad_norm": 118.09184265136719, "learning_rate": 3e-06, "loss": 1.7209, "step": 4248 }, { "completion_length": 114.54167175292969, "epoch": 0.3790534814220081, "grad_norm": 1257.9886474609375, "learning_rate": 3e-06, "loss": 1.1603, "reward": 2.234187602996826, "reward_std": 0.14598910976201296, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20293749868869781, "step": 4249, "zero_std_ratio": 0.0 }, { "epoch": 0.3791426914670592, "grad_norm": 649.6129150390625, "learning_rate": 3e-06, "loss": -31.8787, "step": 4250 }, { "epoch": 0.3792319015121103, "grad_norm": 1252.320556640625, "learning_rate": 3e-06, "loss": 32.6163, "step": 4251 }, { "epoch": 0.37932111155716136, "grad_norm": 487.7680969238281, "learning_rate": 3e-06, "loss": -10.0451, "step": 4252 }, { "epoch": 0.3794103216022124, "grad_norm": 744.9708862304688, "learning_rate": 3e-06, "loss": -26.9852, "step": 4253 }, { "epoch": 0.37949953164726347, "grad_norm": 1229.1878662109375, "learning_rate": 3e-06, "loss": -5.431, "step": 4254 }, { "epoch": 0.37958874169231455, "grad_norm": 848.5198974609375, "learning_rate": 3e-06, "loss": -4.7063, "step": 4255 }, { "epoch": 0.3796779517373656, "grad_norm": 381.228271484375, "learning_rate": 3e-06, "loss": -33.1477, "step": 4256 }, { "epoch": 0.3797671617824167, "grad_norm": 1288.538818359375, "learning_rate": 3e-06, "loss": 19.7839, "step": 4257 }, { "epoch": 0.3798563718274678, "grad_norm": 522.0806274414062, "learning_rate": 3e-06, "loss": -13.5798, "step": 4258 }, { "epoch": 0.37994558187251887, "grad_norm": 717.9677734375, "learning_rate": 3e-06, "loss": -32.576, "step": 4259 }, { "epoch": 0.3800347919175699, "grad_norm": 1090.452880859375, "learning_rate": 3e-06, "loss": -8.6865, "step": 4260 }, { "completion_length": 125.45833587646484, "epoch": 0.38012400196262097, "grad_norm": 2341.59228515625, "learning_rate": 3e-06, "loss": -188.4113, "reward": 2.4416667222976685, "reward_std": 0.2982505140826106, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16041666269302368, "step": 4261, "zero_std_ratio": 0.0 }, { "epoch": 0.38021321200767205, "grad_norm": 2093.097900390625, "learning_rate": 3e-06, "loss": -155.1409, "step": 4262 }, { "epoch": 0.38030242205272313, "grad_norm": 2212.36181640625, "learning_rate": 3e-06, "loss": -177.1702, "step": 4263 }, { "epoch": 0.3803916320977742, "grad_norm": 1850.092041015625, "learning_rate": 3e-06, "loss": -236.2749, "step": 4264 }, { "epoch": 0.3804808421428253, "grad_norm": 1421.4678955078125, "learning_rate": 3e-06, "loss": -146.1698, "step": 4265 }, { "epoch": 0.3805700521878764, "grad_norm": 2727.897705078125, "learning_rate": 3e-06, "loss": -305.1101, "step": 4266 }, { "epoch": 0.38065926223292745, "grad_norm": 2072.666015625, "learning_rate": 3e-06, "loss": -261.0359, "step": 4267 }, { "epoch": 0.3807484722779785, "grad_norm": 2168.50927734375, "learning_rate": 3e-06, "loss": -232.5397, "step": 4268 }, { "epoch": 0.38083768232302956, "grad_norm": 2648.869873046875, "learning_rate": 3e-06, "loss": -258.1508, "step": 4269 }, { "epoch": 0.38092689236808064, "grad_norm": 1851.0516357421875, "learning_rate": 3e-06, "loss": -305.0295, "step": 4270 }, { "epoch": 0.3810161024131317, "grad_norm": 1622.75146484375, "learning_rate": 3e-06, "loss": -202.0132, "step": 4271 }, { "epoch": 0.3811053124581828, "grad_norm": 1869.26220703125, "learning_rate": 3e-06, "loss": -384.5416, "step": 4272 }, { "completion_length": 150.6666717529297, "epoch": 0.3811945225032339, "grad_norm": 2437.776123046875, "learning_rate": 3e-06, "loss": -125.4092, "reward": 2.1955208778381348, "reward_std": 0.6789143979549408, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12260416522622108, "step": 4273, "zero_std_ratio": 0.0 }, { "epoch": 0.38128373254828496, "grad_norm": 2186.8740234375, "learning_rate": 3e-06, "loss": -31.3271, "step": 4274 }, { "epoch": 0.381372942593336, "grad_norm": 1779.6231689453125, "learning_rate": 3e-06, "loss": -19.8358, "step": 4275 }, { "epoch": 0.38146215263838706, "grad_norm": 3099.948974609375, "learning_rate": 3e-06, "loss": -57.4521, "step": 4276 }, { "epoch": 0.38155136268343814, "grad_norm": 2141.255126953125, "learning_rate": 3e-06, "loss": -74.5195, "step": 4277 }, { "epoch": 0.3816405727284892, "grad_norm": 2284.310791015625, "learning_rate": 3e-06, "loss": -104.3235, "step": 4278 }, { "epoch": 0.3817297827735403, "grad_norm": 2633.470947265625, "learning_rate": 3e-06, "loss": -150.423, "step": 4279 }, { "epoch": 0.3818189928185914, "grad_norm": 2595.467529296875, "learning_rate": 3e-06, "loss": -77.9972, "step": 4280 }, { "epoch": 0.38190820286364247, "grad_norm": 2049.65576171875, "learning_rate": 3e-06, "loss": -27.538, "step": 4281 }, { "epoch": 0.38199741290869355, "grad_norm": 2545.734130859375, "learning_rate": 3e-06, "loss": -75.7149, "step": 4282 }, { "epoch": 0.38208662295374457, "grad_norm": 2116.4599609375, "learning_rate": 3e-06, "loss": -90.5809, "step": 4283 }, { "epoch": 0.38217583299879565, "grad_norm": 2044.74951171875, "learning_rate": 3e-06, "loss": -97.3073, "step": 4284 }, { "completion_length": 95.16666793823242, "epoch": 0.38226504304384673, "grad_norm": 968.459716796875, "learning_rate": 3e-06, "loss": 241.0823, "reward": 2.6273125410079956, "reward_std": 0.16535173915326595, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2627291679382324, "step": 4285, "zero_std_ratio": 0.125 }, { "epoch": 0.3823542530888978, "grad_norm": 1634.25, "learning_rate": 3e-06, "loss": 177.6339, "step": 4286 }, { "epoch": 0.3824434631339489, "grad_norm": 972.9589233398438, "learning_rate": 3e-06, "loss": 192.0269, "step": 4287 }, { "epoch": 0.38253267317899997, "grad_norm": 969.876708984375, "learning_rate": 3e-06, "loss": 219.3505, "step": 4288 }, { "epoch": 0.38262188322405105, "grad_norm": 1102.2791748046875, "learning_rate": 3e-06, "loss": 231.0779, "step": 4289 }, { "epoch": 0.3827110932691021, "grad_norm": 940.2847900390625, "learning_rate": 3e-06, "loss": 210.9052, "step": 4290 }, { "epoch": 0.38280030331415316, "grad_norm": 994.2579956054688, "learning_rate": 3e-06, "loss": 235.5683, "step": 4291 }, { "epoch": 0.38288951335920424, "grad_norm": 1280.37646484375, "learning_rate": 3e-06, "loss": 171.5284, "step": 4292 }, { "epoch": 0.3829787234042553, "grad_norm": 935.1000366210938, "learning_rate": 3e-06, "loss": 178.2432, "step": 4293 }, { "epoch": 0.3830679334493064, "grad_norm": 1280.197509765625, "learning_rate": 3e-06, "loss": 189.7223, "step": 4294 }, { "epoch": 0.3831571434943575, "grad_norm": 944.9692993164062, "learning_rate": 3e-06, "loss": 217.3895, "step": 4295 }, { "epoch": 0.38324635353940856, "grad_norm": 1260.349853515625, "learning_rate": 3e-06, "loss": 180.5636, "step": 4296 }, { "completion_length": 130.43750381469727, "epoch": 0.38333556358445964, "grad_norm": 2645.08203125, "learning_rate": 3e-06, "loss": -39.3991, "reward": 2.288166642189026, "reward_std": 0.38392316456884146, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15275000035762787, "step": 4297, "zero_std_ratio": 0.0 }, { "epoch": 0.38342477362951066, "grad_norm": 2226.264892578125, "learning_rate": 3e-06, "loss": -104.2657, "step": 4298 }, { "epoch": 0.38351398367456174, "grad_norm": 2707.9423828125, "learning_rate": 3e-06, "loss": -34.5452, "step": 4299 }, { "epoch": 0.3836031937196128, "grad_norm": 2426.00537109375, "learning_rate": 3e-06, "loss": -44.6228, "step": 4300 }, { "epoch": 0.3836924037646639, "grad_norm": 2102.959228515625, "learning_rate": 3e-06, "loss": -59.4103, "step": 4301 }, { "epoch": 0.383781613809715, "grad_norm": 2361.084228515625, "learning_rate": 3e-06, "loss": -73.5066, "step": 4302 }, { "epoch": 0.38387082385476606, "grad_norm": 2108.65673828125, "learning_rate": 3e-06, "loss": -53.9765, "step": 4303 }, { "epoch": 0.38396003389981714, "grad_norm": 2600.673095703125, "learning_rate": 3e-06, "loss": -107.5761, "step": 4304 }, { "epoch": 0.38404924394486817, "grad_norm": 2482.641845703125, "learning_rate": 3e-06, "loss": -39.8455, "step": 4305 }, { "epoch": 0.38413845398991925, "grad_norm": 2694.06884765625, "learning_rate": 3e-06, "loss": -74.5549, "step": 4306 }, { "epoch": 0.38422766403497033, "grad_norm": 1989.16259765625, "learning_rate": 3e-06, "loss": -63.0356, "step": 4307 }, { "epoch": 0.3843168740800214, "grad_norm": 2709.6025390625, "learning_rate": 3e-06, "loss": -76.2588, "step": 4308 }, { "completion_length": 123.64583587646484, "epoch": 0.3844060841250725, "grad_norm": 1153.4588623046875, "learning_rate": 3e-06, "loss": 8.185, "reward": 2.1336459517478943, "reward_std": 0.49706225097179413, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17531249672174454, "step": 4309, "zero_std_ratio": 0.0 }, { "epoch": 0.38449529417012357, "grad_norm": 1167.989990234375, "learning_rate": 3e-06, "loss": -1.5971, "step": 4310 }, { "epoch": 0.38458450421517465, "grad_norm": 1224.7232666015625, "learning_rate": 3e-06, "loss": -37.6662, "step": 4311 }, { "epoch": 0.3846737142602257, "grad_norm": 885.1259155273438, "learning_rate": 3e-06, "loss": 21.3979, "step": 4312 }, { "epoch": 0.38476292430527675, "grad_norm": 1005.4548950195312, "learning_rate": 3e-06, "loss": 18.1406, "step": 4313 }, { "epoch": 0.38485213435032783, "grad_norm": 1271.1480712890625, "learning_rate": 3e-06, "loss": -17.9424, "step": 4314 }, { "epoch": 0.3849413443953789, "grad_norm": 1140.416259765625, "learning_rate": 3e-06, "loss": 1.1635, "step": 4315 }, { "epoch": 0.38503055444043, "grad_norm": 1540.6529541015625, "learning_rate": 3e-06, "loss": -1.6065, "step": 4316 }, { "epoch": 0.3851197644854811, "grad_norm": 1463.730712890625, "learning_rate": 3e-06, "loss": -52.7173, "step": 4317 }, { "epoch": 0.38520897453053216, "grad_norm": 1214.785400390625, "learning_rate": 3e-06, "loss": 17.7672, "step": 4318 }, { "epoch": 0.38529818457558324, "grad_norm": 802.9563598632812, "learning_rate": 3e-06, "loss": 21.311, "step": 4319 }, { "epoch": 0.38538739462063426, "grad_norm": 1599.6231689453125, "learning_rate": 3e-06, "loss": -45.965, "step": 4320 }, { "completion_length": 125.08333969116211, "epoch": 0.38547660466568534, "grad_norm": 2156.890380859375, "learning_rate": 3e-06, "loss": 247.9054, "reward": 1.900687575340271, "reward_std": 0.48099492862820625, "rewards/correctness_reward_func": 1.2083333432674408, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20277082920074463, "step": 4321, "zero_std_ratio": 0.0 }, { "epoch": 0.3855658147107364, "grad_norm": 2383.71240234375, "learning_rate": 3e-06, "loss": 255.115, "step": 4322 }, { "epoch": 0.3856550247557875, "grad_norm": 3035.939453125, "learning_rate": 3e-06, "loss": 61.1526, "step": 4323 }, { "epoch": 0.3857442348008386, "grad_norm": 2387.04833984375, "learning_rate": 3e-06, "loss": 308.3098, "step": 4324 }, { "epoch": 0.38583344484588966, "grad_norm": 2428.302734375, "learning_rate": 3e-06, "loss": 266.467, "step": 4325 }, { "epoch": 0.38592265489094074, "grad_norm": 3084.668212890625, "learning_rate": 3e-06, "loss": 256.6953, "step": 4326 }, { "epoch": 0.38601186493599177, "grad_norm": 1984.0787353515625, "learning_rate": 3e-06, "loss": 228.3725, "step": 4327 }, { "epoch": 0.38610107498104285, "grad_norm": 2522.44677734375, "learning_rate": 3e-06, "loss": 236.1969, "step": 4328 }, { "epoch": 0.3861902850260939, "grad_norm": 3426.984375, "learning_rate": 3e-06, "loss": 22.029, "step": 4329 }, { "epoch": 0.386279495071145, "grad_norm": 2492.844970703125, "learning_rate": 3e-06, "loss": 281.8287, "step": 4330 }, { "epoch": 0.3863687051161961, "grad_norm": 2329.809326171875, "learning_rate": 3e-06, "loss": 241.7872, "step": 4331 }, { "epoch": 0.38645791516124717, "grad_norm": 2958.973876953125, "learning_rate": 3e-06, "loss": 208.3751, "step": 4332 }, { "completion_length": 109.00000381469727, "epoch": 0.38654712520629825, "grad_norm": 1636.20654296875, "learning_rate": 3e-06, "loss": 104.2388, "reward": 2.3063541650772095, "reward_std": 0.5427626818418503, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22302082926034927, "step": 4333, "zero_std_ratio": 0.0 }, { "epoch": 0.38663633525134933, "grad_norm": 1425.84912109375, "learning_rate": 3e-06, "loss": 86.1505, "step": 4334 }, { "epoch": 0.38672554529640035, "grad_norm": 1081.7259521484375, "learning_rate": 3e-06, "loss": 27.1816, "step": 4335 }, { "epoch": 0.38681475534145143, "grad_norm": 2852.9072265625, "learning_rate": 3e-06, "loss": 83.3406, "step": 4336 }, { "epoch": 0.3869039653865025, "grad_norm": 1063.5379638671875, "learning_rate": 3e-06, "loss": 12.9931, "step": 4337 }, { "epoch": 0.3869931754315536, "grad_norm": 2035.1484375, "learning_rate": 3e-06, "loss": 38.7931, "step": 4338 }, { "epoch": 0.3870823854766047, "grad_norm": 1776.1805419921875, "learning_rate": 3e-06, "loss": 89.821, "step": 4339 }, { "epoch": 0.38717159552165575, "grad_norm": 1523.745849609375, "learning_rate": 3e-06, "loss": 66.8125, "step": 4340 }, { "epoch": 0.38726080556670683, "grad_norm": 889.780517578125, "learning_rate": 3e-06, "loss": 19.7219, "step": 4341 }, { "epoch": 0.38735001561175786, "grad_norm": 2178.412353515625, "learning_rate": 3e-06, "loss": 62.9779, "step": 4342 }, { "epoch": 0.38743922565680894, "grad_norm": 879.8733520507812, "learning_rate": 3e-06, "loss": 1.305, "step": 4343 }, { "epoch": 0.38752843570186, "grad_norm": 1527.49609375, "learning_rate": 3e-06, "loss": 26.53, "step": 4344 }, { "completion_length": 129.4375, "epoch": 0.3876176457469111, "grad_norm": 1646.77001953125, "learning_rate": 3e-06, "loss": 13.1459, "reward": 2.0968542098999023, "reward_std": 0.4767192304134369, "rewards/correctness_reward_func": 1.4166666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19060416519641876, "step": 4345, "zero_std_ratio": 0.0 }, { "epoch": 0.3877068557919622, "grad_norm": 2592.675048828125, "learning_rate": 3e-06, "loss": 6.1038, "step": 4346 }, { "epoch": 0.38779606583701326, "grad_norm": 2514.614501953125, "learning_rate": 3e-06, "loss": 50.0829, "step": 4347 }, { "epoch": 0.38788527588206434, "grad_norm": 2126.40283203125, "learning_rate": 3e-06, "loss": 121.7649, "step": 4348 }, { "epoch": 0.3879744859271154, "grad_norm": 3124.124267578125, "learning_rate": 3e-06, "loss": -202.426, "step": 4349 }, { "epoch": 0.38806369597216644, "grad_norm": 1829.81201171875, "learning_rate": 3e-06, "loss": 74.7906, "step": 4350 }, { "epoch": 0.3881529060172175, "grad_norm": 1718.4022216796875, "learning_rate": 3e-06, "loss": 5.5431, "step": 4351 }, { "epoch": 0.3882421160622686, "grad_norm": 2224.92041015625, "learning_rate": 3e-06, "loss": -12.48, "step": 4352 }, { "epoch": 0.3883313261073197, "grad_norm": 2015.194580078125, "learning_rate": 3e-06, "loss": 40.8911, "step": 4353 }, { "epoch": 0.38842053615237077, "grad_norm": 3536.88232421875, "learning_rate": 3e-06, "loss": 92.2398, "step": 4354 }, { "epoch": 0.38850974619742185, "grad_norm": 3572.492431640625, "learning_rate": 3e-06, "loss": -225.2687, "step": 4355 }, { "epoch": 0.3885989562424729, "grad_norm": 1616.7044677734375, "learning_rate": 3e-06, "loss": 59.1872, "step": 4356 }, { "completion_length": 104.72917175292969, "epoch": 0.38868816628752395, "grad_norm": 554.7708129882812, "learning_rate": 3e-06, "loss": -3.394, "reward": 2.646396040916443, "reward_std": 0.18972741812467575, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24014582484960556, "step": 4357, "zero_std_ratio": 0.0 }, { "epoch": 0.38877737633257503, "grad_norm": 460.69384765625, "learning_rate": 3e-06, "loss": -31.8041, "step": 4358 }, { "epoch": 0.3888665863776261, "grad_norm": 401.4151611328125, "learning_rate": 3e-06, "loss": -5.9124, "step": 4359 }, { "epoch": 0.3889557964226772, "grad_norm": 795.5250854492188, "learning_rate": 3e-06, "loss": -1.0795, "step": 4360 }, { "epoch": 0.38904500646772827, "grad_norm": 309.6639709472656, "learning_rate": 3e-06, "loss": -31.524, "step": 4361 }, { "epoch": 0.38913421651277935, "grad_norm": 433.15032958984375, "learning_rate": 3e-06, "loss": -25.7393, "step": 4362 }, { "epoch": 0.38922342655783043, "grad_norm": 508.97271728515625, "learning_rate": 3e-06, "loss": -12.2884, "step": 4363 }, { "epoch": 0.3893126366028815, "grad_norm": 434.8893127441406, "learning_rate": 3e-06, "loss": -37.6312, "step": 4364 }, { "epoch": 0.38940184664793254, "grad_norm": 345.7143859863281, "learning_rate": 3e-06, "loss": -13.5652, "step": 4365 }, { "epoch": 0.3894910566929836, "grad_norm": 703.9806518554688, "learning_rate": 3e-06, "loss": -18.6191, "step": 4366 }, { "epoch": 0.3895802667380347, "grad_norm": 226.0228271484375, "learning_rate": 3e-06, "loss": -36.8689, "step": 4367 }, { "epoch": 0.3896694767830858, "grad_norm": 372.95361328125, "learning_rate": 3e-06, "loss": -34.0147, "step": 4368 }, { "completion_length": 134.83333587646484, "epoch": 0.38975868682813686, "grad_norm": 1500.481201171875, "learning_rate": 3e-06, "loss": -136.7448, "reward": 2.1487709283828735, "reward_std": 0.5297794193029404, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16960416734218597, "step": 4369, "zero_std_ratio": 0.0 }, { "epoch": 0.38984789687318794, "grad_norm": 1374.9449462890625, "learning_rate": 3e-06, "loss": -69.0792, "step": 4370 }, { "epoch": 0.389937106918239, "grad_norm": 1892.0389404296875, "learning_rate": 3e-06, "loss": -73.4509, "step": 4371 }, { "epoch": 0.39002631696329004, "grad_norm": 1942.9813232421875, "learning_rate": 3e-06, "loss": -68.5719, "step": 4372 }, { "epoch": 0.3901155270083411, "grad_norm": 1781.539306640625, "learning_rate": 3e-06, "loss": -42.5413, "step": 4373 }, { "epoch": 0.3902047370533922, "grad_norm": 1654.6309814453125, "learning_rate": 3e-06, "loss": -31.4551, "step": 4374 }, { "epoch": 0.3902939470984433, "grad_norm": 1432.309814453125, "learning_rate": 3e-06, "loss": -162.5581, "step": 4375 }, { "epoch": 0.39038315714349436, "grad_norm": 1463.658447265625, "learning_rate": 3e-06, "loss": -79.0538, "step": 4376 }, { "epoch": 0.39047236718854544, "grad_norm": 2189.373046875, "learning_rate": 3e-06, "loss": -86.5963, "step": 4377 }, { "epoch": 0.3905615772335965, "grad_norm": 1866.843994140625, "learning_rate": 3e-06, "loss": -93.1451, "step": 4378 }, { "epoch": 0.39065078727864755, "grad_norm": 1741.803955078125, "learning_rate": 3e-06, "loss": -61.3421, "step": 4379 }, { "epoch": 0.39073999732369863, "grad_norm": 2030.5574951171875, "learning_rate": 3e-06, "loss": -35.6768, "step": 4380 }, { "completion_length": 132.70833587646484, "epoch": 0.3908292073687497, "grad_norm": 3410.898193359375, "learning_rate": 3e-06, "loss": -378.4021, "reward": 1.7994791865348816, "reward_std": 0.5843348354101181, "rewards/correctness_reward_func": 1.2083333730697632, "rewards/int_reward_func": 0.4166666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1744791716337204, "step": 4381, "zero_std_ratio": 0.0 }, { "epoch": 0.3909184174138008, "grad_norm": 4356.34228515625, "learning_rate": 3e-06, "loss": -535.3192, "step": 4382 }, { "epoch": 0.39100762745885187, "grad_norm": 3591.458984375, "learning_rate": 3e-06, "loss": -432.7177, "step": 4383 }, { "epoch": 0.39109683750390295, "grad_norm": 2477.579345703125, "learning_rate": 3e-06, "loss": -360.3688, "step": 4384 }, { "epoch": 0.39118604754895403, "grad_norm": 2796.083984375, "learning_rate": 3e-06, "loss": -395.1808, "step": 4385 }, { "epoch": 0.3912752575940051, "grad_norm": 2836.18505859375, "learning_rate": 3e-06, "loss": -435.3364, "step": 4386 }, { "epoch": 0.39136446763905613, "grad_norm": 3413.806640625, "learning_rate": 3e-06, "loss": -429.386, "step": 4387 }, { "epoch": 0.3914536776841072, "grad_norm": 3565.781005859375, "learning_rate": 3e-06, "loss": -616.5824, "step": 4388 }, { "epoch": 0.3915428877291583, "grad_norm": 2876.142578125, "learning_rate": 3e-06, "loss": -484.576, "step": 4389 }, { "epoch": 0.3916320977742094, "grad_norm": 2598.4521484375, "learning_rate": 3e-06, "loss": -410.789, "step": 4390 }, { "epoch": 0.39172130781926046, "grad_norm": 3809.568603515625, "learning_rate": 3e-06, "loss": -419.592, "step": 4391 }, { "epoch": 0.39181051786431154, "grad_norm": 2873.151123046875, "learning_rate": 3e-06, "loss": -485.0909, "step": 4392 }, { "completion_length": 95.95833587646484, "epoch": 0.3918997279093626, "grad_norm": 915.3318481445312, "learning_rate": 3e-06, "loss": 41.9751, "reward": 2.3233959674835205, "reward_std": 0.3844504952430725, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26089583337306976, "step": 4393, "zero_std_ratio": 0.125 }, { "epoch": 0.39198893795441364, "grad_norm": 676.8789672851562, "learning_rate": 3e-06, "loss": 4.8595, "step": 4394 }, { "epoch": 0.3920781479994647, "grad_norm": 780.9202270507812, "learning_rate": 3e-06, "loss": -29.6741, "step": 4395 }, { "epoch": 0.3921673580445158, "grad_norm": 770.776611328125, "learning_rate": 3e-06, "loss": 0.1452, "step": 4396 }, { "epoch": 0.3922565680895669, "grad_norm": 1450.8193359375, "learning_rate": 3e-06, "loss": -35.4223, "step": 4397 }, { "epoch": 0.39234577813461796, "grad_norm": 1034.5201416015625, "learning_rate": 3e-06, "loss": 7.853, "step": 4398 }, { "epoch": 0.39243498817966904, "grad_norm": 977.0890502929688, "learning_rate": 3e-06, "loss": 44.3763, "step": 4399 }, { "epoch": 0.3925241982247201, "grad_norm": 719.3131713867188, "learning_rate": 3e-06, "loss": 6.3568, "step": 4400 }, { "epoch": 0.3926134082697712, "grad_norm": 949.4197998046875, "learning_rate": 3e-06, "loss": -33.3205, "step": 4401 }, { "epoch": 0.3927026183148222, "grad_norm": 716.4667358398438, "learning_rate": 3e-06, "loss": -2.176, "step": 4402 }, { "epoch": 0.3927918283598733, "grad_norm": 1392.4869384765625, "learning_rate": 3e-06, "loss": -45.0552, "step": 4403 }, { "epoch": 0.3928810384049244, "grad_norm": 1166.02001953125, "learning_rate": 3e-06, "loss": -2.2053, "step": 4404 }, { "completion_length": 126.37500762939453, "epoch": 0.39297024844997547, "grad_norm": 2645.462646484375, "learning_rate": 3e-06, "loss": -92.8634, "reward": 2.0533958673477173, "reward_std": 0.7317273020744324, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1783958375453949, "step": 4405, "zero_std_ratio": 0.0 }, { "epoch": 0.39305945849502655, "grad_norm": 2143.677001953125, "learning_rate": 3e-06, "loss": 38.0845, "step": 4406 }, { "epoch": 0.39314866854007763, "grad_norm": 3394.27978515625, "learning_rate": 3e-06, "loss": -57.4195, "step": 4407 }, { "epoch": 0.3932378785851287, "grad_norm": 2765.41650390625, "learning_rate": 3e-06, "loss": 11.0512, "step": 4408 }, { "epoch": 0.39332708863017973, "grad_norm": 2432.927001953125, "learning_rate": 3e-06, "loss": -55.7422, "step": 4409 }, { "epoch": 0.3934162986752308, "grad_norm": 2931.593017578125, "learning_rate": 3e-06, "loss": -190.4823, "step": 4410 }, { "epoch": 0.3935055087202819, "grad_norm": 2393.96240234375, "learning_rate": 3e-06, "loss": -106.2685, "step": 4411 }, { "epoch": 0.393594718765333, "grad_norm": 2688.864501953125, "learning_rate": 3e-06, "loss": 10.698, "step": 4412 }, { "epoch": 0.39368392881038405, "grad_norm": 3332.27734375, "learning_rate": 3e-06, "loss": -85.0943, "step": 4413 }, { "epoch": 0.39377313885543513, "grad_norm": 2363.884033203125, "learning_rate": 3e-06, "loss": -6.6928, "step": 4414 }, { "epoch": 0.3938623489004862, "grad_norm": 2732.054443359375, "learning_rate": 3e-06, "loss": -73.5381, "step": 4415 }, { "epoch": 0.3939515589455373, "grad_norm": 3588.205078125, "learning_rate": 3e-06, "loss": -209.6246, "step": 4416 }, { "completion_length": 111.93750381469727, "epoch": 0.3940407689905883, "grad_norm": 1776.72900390625, "learning_rate": 3e-06, "loss": 125.7514, "reward": 2.386979341506958, "reward_std": 0.16185548156499863, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22031249105930328, "step": 4417, "zero_std_ratio": 0.0 }, { "epoch": 0.3941299790356394, "grad_norm": 1930.419677734375, "learning_rate": 3e-06, "loss": 189.4515, "step": 4418 }, { "epoch": 0.3942191890806905, "grad_norm": 1767.7796630859375, "learning_rate": 3e-06, "loss": 53.9774, "step": 4419 }, { "epoch": 0.39430839912574156, "grad_norm": 1809.4879150390625, "learning_rate": 3e-06, "loss": 90.2603, "step": 4420 }, { "epoch": 0.39439760917079264, "grad_norm": 1214.28564453125, "learning_rate": 3e-06, "loss": 84.1137, "step": 4421 }, { "epoch": 0.3944868192158437, "grad_norm": 1518.6868896484375, "learning_rate": 3e-06, "loss": -1.5612, "step": 4422 }, { "epoch": 0.3945760292608948, "grad_norm": 1371.1156005859375, "learning_rate": 3e-06, "loss": 112.9752, "step": 4423 }, { "epoch": 0.3946652393059458, "grad_norm": 1373.93603515625, "learning_rate": 3e-06, "loss": 179.0911, "step": 4424 }, { "epoch": 0.3947544493509969, "grad_norm": 1492.3758544921875, "learning_rate": 3e-06, "loss": 44.4087, "step": 4425 }, { "epoch": 0.394843659396048, "grad_norm": 1692.947265625, "learning_rate": 3e-06, "loss": 71.132, "step": 4426 }, { "epoch": 0.39493286944109907, "grad_norm": 1337.1668701171875, "learning_rate": 3e-06, "loss": 69.9097, "step": 4427 }, { "epoch": 0.39502207948615015, "grad_norm": 1510.5037841796875, "learning_rate": 3e-06, "loss": -7.2035, "step": 4428 }, { "completion_length": 123.85417175292969, "epoch": 0.3951112895312012, "grad_norm": 2464.857421875, "learning_rate": 3e-06, "loss": 93.8496, "reward": 2.5766459703445435, "reward_std": 0.2359820306301117, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2016458436846733, "step": 4429, "zero_std_ratio": 0.0 }, { "epoch": 0.3952004995762523, "grad_norm": 1783.1583251953125, "learning_rate": 3e-06, "loss": 66.2191, "step": 4430 }, { "epoch": 0.39528970962130333, "grad_norm": 3015.827392578125, "learning_rate": 3e-06, "loss": 98.559, "step": 4431 }, { "epoch": 0.3953789196663544, "grad_norm": 2580.365966796875, "learning_rate": 3e-06, "loss": 89.4454, "step": 4432 }, { "epoch": 0.3954681297114055, "grad_norm": 2133.212158203125, "learning_rate": 3e-06, "loss": 36.2475, "step": 4433 }, { "epoch": 0.39555733975645657, "grad_norm": 2386.112548828125, "learning_rate": 3e-06, "loss": 70.8541, "step": 4434 }, { "epoch": 0.39564654980150765, "grad_norm": 3322.854248046875, "learning_rate": 3e-06, "loss": 56.6651, "step": 4435 }, { "epoch": 0.39573575984655873, "grad_norm": 2065.8125, "learning_rate": 3e-06, "loss": 38.5488, "step": 4436 }, { "epoch": 0.3958249698916098, "grad_norm": 3036.602294921875, "learning_rate": 3e-06, "loss": 64.3112, "step": 4437 }, { "epoch": 0.3959141799366609, "grad_norm": 2572.306640625, "learning_rate": 3e-06, "loss": 32.6966, "step": 4438 }, { "epoch": 0.3960033899817119, "grad_norm": 2097.511474609375, "learning_rate": 3e-06, "loss": -5.9896, "step": 4439 }, { "epoch": 0.396092600026763, "grad_norm": 2346.47998046875, "learning_rate": 3e-06, "loss": 29.6087, "step": 4440 }, { "completion_length": 139.8541717529297, "epoch": 0.3961818100718141, "grad_norm": 2890.445068359375, "learning_rate": 3e-06, "loss": -196.3254, "reward": 1.5590626001358032, "reward_std": 0.5422908663749695, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16322916746139526, "step": 4441, "zero_std_ratio": 0.0 }, { "epoch": 0.39627102011686516, "grad_norm": 2755.874755859375, "learning_rate": 3e-06, "loss": -171.9438, "step": 4442 }, { "epoch": 0.39636023016191624, "grad_norm": 2615.59716796875, "learning_rate": 3e-06, "loss": -224.5904, "step": 4443 }, { "epoch": 0.3964494402069673, "grad_norm": 3186.255859375, "learning_rate": 3e-06, "loss": -371.7861, "step": 4444 }, { "epoch": 0.3965386502520184, "grad_norm": 2557.218017578125, "learning_rate": 3e-06, "loss": -147.9058, "step": 4445 }, { "epoch": 0.3966278602970694, "grad_norm": 3510.248046875, "learning_rate": 3e-06, "loss": -229.8155, "step": 4446 }, { "epoch": 0.3967170703421205, "grad_norm": 2978.868896484375, "learning_rate": 3e-06, "loss": -204.1951, "step": 4447 }, { "epoch": 0.3968062803871716, "grad_norm": 2577.03076171875, "learning_rate": 3e-06, "loss": -204.9646, "step": 4448 }, { "epoch": 0.39689549043222266, "grad_norm": 2439.06005859375, "learning_rate": 3e-06, "loss": -260.226, "step": 4449 }, { "epoch": 0.39698470047727374, "grad_norm": 3109.95166015625, "learning_rate": 3e-06, "loss": -418.0882, "step": 4450 }, { "epoch": 0.3970739105223248, "grad_norm": 2557.521240234375, "learning_rate": 3e-06, "loss": -182.9856, "step": 4451 }, { "epoch": 0.3971631205673759, "grad_norm": 6517.7744140625, "learning_rate": 3e-06, "loss": -278.5426, "step": 4452 }, { "completion_length": 86.8125, "epoch": 0.397252330612427, "grad_norm": 587.2297973632812, "learning_rate": 3e-06, "loss": -53.9669, "reward": 2.599083423614502, "reward_std": 0.15260164253413677, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2761666625738144, "step": 4453, "zero_std_ratio": 0.125 }, { "epoch": 0.397341540657478, "grad_norm": 647.9014892578125, "learning_rate": 3e-06, "loss": -68.1908, "step": 4454 }, { "epoch": 0.3974307507025291, "grad_norm": 774.140380859375, "learning_rate": 3e-06, "loss": -56.8643, "step": 4455 }, { "epoch": 0.39751996074758017, "grad_norm": 519.39599609375, "learning_rate": 3e-06, "loss": -49.1085, "step": 4456 }, { "epoch": 0.39760917079263125, "grad_norm": 293.1749267578125, "learning_rate": 3e-06, "loss": -62.9867, "step": 4457 }, { "epoch": 0.39769838083768233, "grad_norm": 962.1717529296875, "learning_rate": 3e-06, "loss": -11.5104, "step": 4458 }, { "epoch": 0.3977875908827334, "grad_norm": 642.9428100585938, "learning_rate": 3e-06, "loss": -71.677, "step": 4459 }, { "epoch": 0.3978768009277845, "grad_norm": 494.9519348144531, "learning_rate": 3e-06, "loss": -90.3451, "step": 4460 }, { "epoch": 0.3979660109728355, "grad_norm": 546.158203125, "learning_rate": 3e-06, "loss": -80.3851, "step": 4461 }, { "epoch": 0.3980552210178866, "grad_norm": 482.6031494140625, "learning_rate": 3e-06, "loss": -63.1697, "step": 4462 }, { "epoch": 0.3981444310629377, "grad_norm": 514.2749633789062, "learning_rate": 3e-06, "loss": -69.5602, "step": 4463 }, { "epoch": 0.39823364110798876, "grad_norm": 769.8934936523438, "learning_rate": 3e-06, "loss": -28.4791, "step": 4464 }, { "completion_length": 117.66667175292969, "epoch": 0.39832285115303984, "grad_norm": 2188.4208984375, "learning_rate": 3e-06, "loss": 51.9134, "reward": 2.0459792613983154, "reward_std": 0.4676392078399658, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22306250780820847, "step": 4465, "zero_std_ratio": 0.125 }, { "epoch": 0.3984120611980909, "grad_norm": 1697.3514404296875, "learning_rate": 3e-06, "loss": 131.0448, "step": 4466 }, { "epoch": 0.398501271243142, "grad_norm": 2349.888916015625, "learning_rate": 3e-06, "loss": 93.7523, "step": 4467 }, { "epoch": 0.3985904812881931, "grad_norm": 2104.506103515625, "learning_rate": 3e-06, "loss": 94.9501, "step": 4468 }, { "epoch": 0.3986796913332441, "grad_norm": 2242.360595703125, "learning_rate": 3e-06, "loss": 18.3909, "step": 4469 }, { "epoch": 0.3987689013782952, "grad_norm": 2549.342041015625, "learning_rate": 3e-06, "loss": 76.1693, "step": 4470 }, { "epoch": 0.39885811142334626, "grad_norm": 2465.906005859375, "learning_rate": 3e-06, "loss": 27.6701, "step": 4471 }, { "epoch": 0.39894732146839734, "grad_norm": 1705.8909912109375, "learning_rate": 3e-06, "loss": 108.5421, "step": 4472 }, { "epoch": 0.3990365315134484, "grad_norm": 2019.6395263671875, "learning_rate": 3e-06, "loss": 62.8409, "step": 4473 }, { "epoch": 0.3991257415584995, "grad_norm": 1470.4991455078125, "learning_rate": 3e-06, "loss": 70.8084, "step": 4474 }, { "epoch": 0.3992149516035506, "grad_norm": 2084.828369140625, "learning_rate": 3e-06, "loss": -24.9256, "step": 4475 }, { "epoch": 0.3993041616486016, "grad_norm": 1714.69775390625, "learning_rate": 3e-06, "loss": 30.2634, "step": 4476 }, { "completion_length": 108.06250381469727, "epoch": 0.3993933716936527, "grad_norm": 611.30419921875, "learning_rate": 3e-06, "loss": 78.2691, "reward": 2.2938334941864014, "reward_std": 0.3085622787475586, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23133332282304764, "step": 4477, "zero_std_ratio": 0.0 }, { "epoch": 0.39948258173870377, "grad_norm": 1216.0955810546875, "learning_rate": 3e-06, "loss": 64.1597, "step": 4478 }, { "epoch": 0.39957179178375485, "grad_norm": 720.3121948242188, "learning_rate": 3e-06, "loss": 26.3581, "step": 4479 }, { "epoch": 0.39966100182880593, "grad_norm": 1071.1658935546875, "learning_rate": 3e-06, "loss": 65.5913, "step": 4480 }, { "epoch": 0.399750211873857, "grad_norm": 752.7969360351562, "learning_rate": 3e-06, "loss": 4.5674, "step": 4481 }, { "epoch": 0.3998394219189081, "grad_norm": 705.0219116210938, "learning_rate": 3e-06, "loss": 38.2804, "step": 4482 }, { "epoch": 0.39992863196395917, "grad_norm": 755.8858032226562, "learning_rate": 3e-06, "loss": 66.7057, "step": 4483 }, { "epoch": 0.4000178420090102, "grad_norm": 948.241455078125, "learning_rate": 3e-06, "loss": 37.3275, "step": 4484 }, { "epoch": 0.4001070520540613, "grad_norm": 560.0237426757812, "learning_rate": 3e-06, "loss": 20.5239, "step": 4485 }, { "epoch": 0.40019626209911235, "grad_norm": 736.54052734375, "learning_rate": 3e-06, "loss": 50.8955, "step": 4486 }, { "epoch": 0.40028547214416343, "grad_norm": 763.338134765625, "learning_rate": 3e-06, "loss": -8.2893, "step": 4487 }, { "epoch": 0.4003746821892145, "grad_norm": 554.2036743164062, "learning_rate": 3e-06, "loss": 32.0063, "step": 4488 }, { "completion_length": 92.60417175292969, "epoch": 0.4004638922342656, "grad_norm": 1206.7291259765625, "learning_rate": 3e-06, "loss": -18.0873, "reward": 2.4997918605804443, "reward_std": 0.3818785697221756, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2706249803304672, "step": 4489, "zero_std_ratio": 0.125 }, { "epoch": 0.4005531022793167, "grad_norm": 2226.52294921875, "learning_rate": 3e-06, "loss": -52.5148, "step": 4490 }, { "epoch": 0.4006423123243677, "grad_norm": 712.6217651367188, "learning_rate": 3e-06, "loss": -49.2483, "step": 4491 }, { "epoch": 0.4007315223694188, "grad_norm": 981.7110595703125, "learning_rate": 3e-06, "loss": -36.3447, "step": 4492 }, { "epoch": 0.40082073241446986, "grad_norm": 713.4608764648438, "learning_rate": 3e-06, "loss": -66.1797, "step": 4493 }, { "epoch": 0.40090994245952094, "grad_norm": 711.169677734375, "learning_rate": 3e-06, "loss": -31.9974, "step": 4494 }, { "epoch": 0.400999152504572, "grad_norm": 1115.0733642578125, "learning_rate": 3e-06, "loss": -28.7112, "step": 4495 }, { "epoch": 0.4010883625496231, "grad_norm": 1556.9822998046875, "learning_rate": 3e-06, "loss": -58.9058, "step": 4496 }, { "epoch": 0.4011775725946742, "grad_norm": 732.631103515625, "learning_rate": 3e-06, "loss": -62.286, "step": 4497 }, { "epoch": 0.4012667826397252, "grad_norm": 878.0677490234375, "learning_rate": 3e-06, "loss": -47.8085, "step": 4498 }, { "epoch": 0.4013559926847763, "grad_norm": 778.924072265625, "learning_rate": 3e-06, "loss": -83.6162, "step": 4499 }, { "epoch": 0.40144520272982737, "grad_norm": 875.9097900390625, "learning_rate": 3e-06, "loss": -50.2953, "step": 4500 }, { "completion_length": 108.54166793823242, "epoch": 0.40153441277487845, "grad_norm": 3047.681640625, "learning_rate": 3e-06, "loss": 115.2349, "reward": 2.3728750944137573, "reward_std": 0.44737179577350616, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.4583333432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20620832592248917, "step": 4501, "zero_std_ratio": 0.0 }, { "epoch": 0.4016236228199295, "grad_norm": 2236.61669921875, "learning_rate": 3e-06, "loss": -16.8149, "step": 4502 }, { "epoch": 0.4017128328649806, "grad_norm": 2588.25634765625, "learning_rate": 3e-06, "loss": 62.3591, "step": 4503 }, { "epoch": 0.4018020429100317, "grad_norm": 3359.859130859375, "learning_rate": 3e-06, "loss": 32.871, "step": 4504 }, { "epoch": 0.40189125295508277, "grad_norm": 2448.682861328125, "learning_rate": 3e-06, "loss": 39.657, "step": 4505 }, { "epoch": 0.4019804630001338, "grad_norm": 2157.782470703125, "learning_rate": 3e-06, "loss": 32.42, "step": 4506 }, { "epoch": 0.40206967304518487, "grad_norm": 2658.2587890625, "learning_rate": 3e-06, "loss": 78.2154, "step": 4507 }, { "epoch": 0.40215888309023595, "grad_norm": 2020.3133544921875, "learning_rate": 3e-06, "loss": -25.6052, "step": 4508 }, { "epoch": 0.40224809313528703, "grad_norm": 2028.5576171875, "learning_rate": 3e-06, "loss": 29.4557, "step": 4509 }, { "epoch": 0.4023373031803381, "grad_norm": 3263.72119140625, "learning_rate": 3e-06, "loss": -11.1023, "step": 4510 }, { "epoch": 0.4024265132253892, "grad_norm": 2063.79296875, "learning_rate": 3e-06, "loss": -2.058, "step": 4511 }, { "epoch": 0.4025157232704403, "grad_norm": 2336.689697265625, "learning_rate": 3e-06, "loss": -14.5534, "step": 4512 }, { "completion_length": 109.37500381469727, "epoch": 0.4026049333154913, "grad_norm": 3415.278076171875, "learning_rate": 3e-06, "loss": 128.9906, "reward": 2.5421459674835205, "reward_std": 0.376520493067801, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21922917664051056, "step": 4513, "zero_std_ratio": 0.0 }, { "epoch": 0.4026941433605424, "grad_norm": 2345.03173828125, "learning_rate": 3e-06, "loss": 6.6794, "step": 4514 }, { "epoch": 0.40278335340559346, "grad_norm": 2155.927978515625, "learning_rate": 3e-06, "loss": 92.3101, "step": 4515 }, { "epoch": 0.40287256345064454, "grad_norm": 2858.669921875, "learning_rate": 3e-06, "loss": 30.3606, "step": 4516 }, { "epoch": 0.4029617734956956, "grad_norm": 2711.11376953125, "learning_rate": 3e-06, "loss": 59.2346, "step": 4517 }, { "epoch": 0.4030509835407467, "grad_norm": 2982.524658203125, "learning_rate": 3e-06, "loss": -50.1728, "step": 4518 }, { "epoch": 0.4031401935857978, "grad_norm": 3115.11181640625, "learning_rate": 3e-06, "loss": 89.0581, "step": 4519 }, { "epoch": 0.40322940363084886, "grad_norm": 2314.586181640625, "learning_rate": 3e-06, "loss": -4.3823, "step": 4520 }, { "epoch": 0.4033186136758999, "grad_norm": 2651.218505859375, "learning_rate": 3e-06, "loss": 67.7336, "step": 4521 }, { "epoch": 0.40340782372095096, "grad_norm": 2987.61181640625, "learning_rate": 3e-06, "loss": -6.0195, "step": 4522 }, { "epoch": 0.40349703376600204, "grad_norm": 2144.894775390625, "learning_rate": 3e-06, "loss": 29.1948, "step": 4523 }, { "epoch": 0.4035862438110531, "grad_norm": 1985.8211669921875, "learning_rate": 3e-06, "loss": -78.9802, "step": 4524 }, { "completion_length": 136.6666717529297, "epoch": 0.4036754538561042, "grad_norm": 2262.306884765625, "learning_rate": 3e-06, "loss": -135.8674, "reward": 2.117166757583618, "reward_std": 0.5028187781572342, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.179666668176651, "step": 4525, "zero_std_ratio": 0.0 }, { "epoch": 0.4037646639011553, "grad_norm": 2278.4541015625, "learning_rate": 3e-06, "loss": -36.372, "step": 4526 }, { "epoch": 0.40385387394620637, "grad_norm": 2979.447509765625, "learning_rate": 3e-06, "loss": -175.5953, "step": 4527 }, { "epoch": 0.4039430839912574, "grad_norm": 2280.955322265625, "learning_rate": 3e-06, "loss": -230.2135, "step": 4528 }, { "epoch": 0.40403229403630847, "grad_norm": 2556.7412109375, "learning_rate": 3e-06, "loss": -105.198, "step": 4529 }, { "epoch": 0.40412150408135955, "grad_norm": 2763.752197265625, "learning_rate": 3e-06, "loss": -116.017, "step": 4530 }, { "epoch": 0.40421071412641063, "grad_norm": 2322.650634765625, "learning_rate": 3e-06, "loss": -144.1313, "step": 4531 }, { "epoch": 0.4042999241714617, "grad_norm": 2360.745849609375, "learning_rate": 3e-06, "loss": -48.9794, "step": 4532 }, { "epoch": 0.4043891342165128, "grad_norm": 2772.17724609375, "learning_rate": 3e-06, "loss": -184.9366, "step": 4533 }, { "epoch": 0.40447834426156387, "grad_norm": 2252.931640625, "learning_rate": 3e-06, "loss": -248.6156, "step": 4534 }, { "epoch": 0.40456755430661495, "grad_norm": 3105.7373046875, "learning_rate": 3e-06, "loss": -120.0493, "step": 4535 }, { "epoch": 0.404656764351666, "grad_norm": 2613.921630859375, "learning_rate": 3e-06, "loss": -139.5313, "step": 4536 }, { "completion_length": 93.91666793823242, "epoch": 0.40474597439671706, "grad_norm": 492.1138916015625, "learning_rate": 3e-06, "loss": -3.1361, "reward": 2.596729278564453, "reward_std": 0.25618069618940353, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2633958235383034, "step": 4537, "zero_std_ratio": 0.0 }, { "epoch": 0.40483518444176814, "grad_norm": 285.7267761230469, "learning_rate": 3e-06, "loss": -4.738, "step": 4538 }, { "epoch": 0.4049243944868192, "grad_norm": 261.9408874511719, "learning_rate": 3e-06, "loss": -11.812, "step": 4539 }, { "epoch": 0.4050136045318703, "grad_norm": 317.0812683105469, "learning_rate": 3e-06, "loss": 6.6235, "step": 4540 }, { "epoch": 0.4051028145769214, "grad_norm": 295.71868896484375, "learning_rate": 3e-06, "loss": -2.7281, "step": 4541 }, { "epoch": 0.40519202462197246, "grad_norm": 237.23867797851562, "learning_rate": 3e-06, "loss": -10.0047, "step": 4542 }, { "epoch": 0.4052812346670235, "grad_norm": 205.29910278320312, "learning_rate": 3e-06, "loss": -10.0978, "step": 4543 }, { "epoch": 0.40537044471207456, "grad_norm": 220.62782287597656, "learning_rate": 3e-06, "loss": -7.3525, "step": 4544 }, { "epoch": 0.40545965475712564, "grad_norm": 327.15191650390625, "learning_rate": 3e-06, "loss": -15.3753, "step": 4545 }, { "epoch": 0.4055488648021767, "grad_norm": 282.9918212890625, "learning_rate": 3e-06, "loss": 3.29, "step": 4546 }, { "epoch": 0.4056380748472278, "grad_norm": 284.6883544921875, "learning_rate": 3e-06, "loss": -7.3828, "step": 4547 }, { "epoch": 0.4057272848922789, "grad_norm": 199.97779846191406, "learning_rate": 3e-06, "loss": -13.479, "step": 4548 }, { "completion_length": 140.87500762939453, "epoch": 0.40581649493732996, "grad_norm": 1743.8311767578125, "learning_rate": 3e-06, "loss": -42.0653, "reward": 2.153520941734314, "reward_std": 0.2934277607128024, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1535208448767662, "step": 4549, "zero_std_ratio": 0.0 }, { "epoch": 0.40590570498238104, "grad_norm": 1359.685302734375, "learning_rate": 3e-06, "loss": -44.6948, "step": 4550 }, { "epoch": 0.40599491502743207, "grad_norm": 2835.112060546875, "learning_rate": 3e-06, "loss": 96.6286, "step": 4551 }, { "epoch": 0.40608412507248315, "grad_norm": 2169.55517578125, "learning_rate": 3e-06, "loss": -72.9084, "step": 4552 }, { "epoch": 0.40617333511753423, "grad_norm": 1673.4661865234375, "learning_rate": 3e-06, "loss": -40.8202, "step": 4553 }, { "epoch": 0.4062625451625853, "grad_norm": 1953.1387939453125, "learning_rate": 3e-06, "loss": -17.5565, "step": 4554 }, { "epoch": 0.4063517552076364, "grad_norm": 1849.91845703125, "learning_rate": 3e-06, "loss": -58.3381, "step": 4555 }, { "epoch": 0.40644096525268747, "grad_norm": 1448.991943359375, "learning_rate": 3e-06, "loss": -46.4931, "step": 4556 }, { "epoch": 0.40653017529773855, "grad_norm": 2893.8212890625, "learning_rate": 3e-06, "loss": 92.7454, "step": 4557 }, { "epoch": 0.4066193853427896, "grad_norm": 2205.922119140625, "learning_rate": 3e-06, "loss": -84.5381, "step": 4558 }, { "epoch": 0.40670859538784065, "grad_norm": 1973.232666015625, "learning_rate": 3e-06, "loss": -58.4052, "step": 4559 }, { "epoch": 0.40679780543289173, "grad_norm": 2163.770751953125, "learning_rate": 3e-06, "loss": -24.5027, "step": 4560 }, { "completion_length": 120.68750381469727, "epoch": 0.4068870154779428, "grad_norm": 1010.835693359375, "learning_rate": 3e-06, "loss": -0.0495, "reward": 2.31793749332428, "reward_std": 0.4470832943916321, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2137708216905594, "step": 4561, "zero_std_ratio": 0.125 }, { "epoch": 0.4069762255229939, "grad_norm": 1807.03076171875, "learning_rate": 3e-06, "loss": -2.5692, "step": 4562 }, { "epoch": 0.407065435568045, "grad_norm": 1030.5826416015625, "learning_rate": 3e-06, "loss": -64.5969, "step": 4563 }, { "epoch": 0.40715464561309606, "grad_norm": 2388.715087890625, "learning_rate": 3e-06, "loss": -35.3867, "step": 4564 }, { "epoch": 0.4072438556581471, "grad_norm": 1347.58740234375, "learning_rate": 3e-06, "loss": -39.5709, "step": 4565 }, { "epoch": 0.40733306570319816, "grad_norm": 1562.1121826171875, "learning_rate": 3e-06, "loss": 3.0432, "step": 4566 }, { "epoch": 0.40742227574824924, "grad_norm": 1401.406982421875, "learning_rate": 3e-06, "loss": -8.1095, "step": 4567 }, { "epoch": 0.4075114857933003, "grad_norm": 1684.52294921875, "learning_rate": 3e-06, "loss": -16.8756, "step": 4568 }, { "epoch": 0.4076006958383514, "grad_norm": 949.5942993164062, "learning_rate": 3e-06, "loss": -78.7984, "step": 4569 }, { "epoch": 0.4076899058834025, "grad_norm": 2046.3505859375, "learning_rate": 3e-06, "loss": -49.0768, "step": 4570 }, { "epoch": 0.40777911592845356, "grad_norm": 1135.771484375, "learning_rate": 3e-06, "loss": -58.4028, "step": 4571 }, { "epoch": 0.40786832597350464, "grad_norm": 1541.1790771484375, "learning_rate": 3e-06, "loss": -22.0011, "step": 4572 }, { "completion_length": 133.50000381469727, "epoch": 0.40795753601855567, "grad_norm": 2201.4404296875, "learning_rate": 3e-06, "loss": 30.1736, "reward": 2.1441668272018433, "reward_std": 0.49582910537719727, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15458333492279053, "step": 4573, "zero_std_ratio": 0.0 }, { "epoch": 0.40804674606360675, "grad_norm": 2501.365478515625, "learning_rate": 3e-06, "loss": 62.1313, "step": 4574 }, { "epoch": 0.4081359561086578, "grad_norm": 2596.810791015625, "learning_rate": 3e-06, "loss": -12.3846, "step": 4575 }, { "epoch": 0.4082251661537089, "grad_norm": 2680.445556640625, "learning_rate": 3e-06, "loss": 79.133, "step": 4576 }, { "epoch": 0.40831437619876, "grad_norm": 2412.553955078125, "learning_rate": 3e-06, "loss": 41.4638, "step": 4577 }, { "epoch": 0.40840358624381107, "grad_norm": 2411.2314453125, "learning_rate": 3e-06, "loss": 122.2831, "step": 4578 }, { "epoch": 0.40849279628886215, "grad_norm": 2899.239013671875, "learning_rate": 3e-06, "loss": -3.9079, "step": 4579 }, { "epoch": 0.4085820063339132, "grad_norm": 3095.702880859375, "learning_rate": 3e-06, "loss": 34.3594, "step": 4580 }, { "epoch": 0.40867121637896425, "grad_norm": 1955.3992919921875, "learning_rate": 3e-06, "loss": -48.0532, "step": 4581 }, { "epoch": 0.40876042642401533, "grad_norm": 2275.324462890625, "learning_rate": 3e-06, "loss": 47.0231, "step": 4582 }, { "epoch": 0.4088496364690664, "grad_norm": 2572.98291015625, "learning_rate": 3e-06, "loss": 22.4895, "step": 4583 }, { "epoch": 0.4089388465141175, "grad_norm": 2650.92578125, "learning_rate": 3e-06, "loss": 100.6145, "step": 4584 }, { "completion_length": 107.70833587646484, "epoch": 0.4090280565591686, "grad_norm": 3150.210693359375, "learning_rate": 3e-06, "loss": 180.4281, "reward": 2.1793543100357056, "reward_std": 0.6565037667751312, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22102083265781403, "step": 4585, "zero_std_ratio": 0.0 }, { "epoch": 0.40911726660421965, "grad_norm": 2096.89306640625, "learning_rate": 3e-06, "loss": 172.0236, "step": 4586 }, { "epoch": 0.40920647664927073, "grad_norm": 2270.0009765625, "learning_rate": 3e-06, "loss": 167.4242, "step": 4587 }, { "epoch": 0.40929568669432176, "grad_norm": 2266.05810546875, "learning_rate": 3e-06, "loss": 62.7198, "step": 4588 }, { "epoch": 0.40938489673937284, "grad_norm": 1768.1646728515625, "learning_rate": 3e-06, "loss": 155.1805, "step": 4589 }, { "epoch": 0.4094741067844239, "grad_norm": 2094.54638671875, "learning_rate": 3e-06, "loss": 94.2917, "step": 4590 }, { "epoch": 0.409563316829475, "grad_norm": 2649.318115234375, "learning_rate": 3e-06, "loss": 163.9059, "step": 4591 }, { "epoch": 0.4096525268745261, "grad_norm": 3413.80419921875, "learning_rate": 3e-06, "loss": 140.4374, "step": 4592 }, { "epoch": 0.40974173691957716, "grad_norm": 2184.5390625, "learning_rate": 3e-06, "loss": 148.0765, "step": 4593 }, { "epoch": 0.40983094696462824, "grad_norm": 2550.187744140625, "learning_rate": 3e-06, "loss": 27.306, "step": 4594 }, { "epoch": 0.40992015700967926, "grad_norm": 2120.653564453125, "learning_rate": 3e-06, "loss": 132.799, "step": 4595 }, { "epoch": 0.41000936705473034, "grad_norm": 2176.01171875, "learning_rate": 3e-06, "loss": 62.546, "step": 4596 }, { "completion_length": 144.1875, "epoch": 0.4100985770997814, "grad_norm": 3152.858154296875, "learning_rate": 3e-06, "loss": -94.0272, "reward": 2.4576668739318848, "reward_std": 0.38699233531951904, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16600000113248825, "step": 4597, "zero_std_ratio": 0.0 }, { "epoch": 0.4101877871448325, "grad_norm": 2087.945068359375, "learning_rate": 3e-06, "loss": 160.0029, "step": 4598 }, { "epoch": 0.4102769971898836, "grad_norm": 4502.35498046875, "learning_rate": 3e-06, "loss": -21.8276, "step": 4599 }, { "epoch": 0.41036620723493467, "grad_norm": 3351.725341796875, "learning_rate": 3e-06, "loss": 23.4662, "step": 4600 }, { "epoch": 0.41045541727998575, "grad_norm": 4507.30029296875, "learning_rate": 3e-06, "loss": -86.0637, "step": 4601 }, { "epoch": 0.4105446273250368, "grad_norm": 2262.435302734375, "learning_rate": 3e-06, "loss": -74.1348, "step": 4602 }, { "epoch": 0.41063383737008785, "grad_norm": 3453.205078125, "learning_rate": 3e-06, "loss": -90.4243, "step": 4603 }, { "epoch": 0.41072304741513893, "grad_norm": 2649.433349609375, "learning_rate": 3e-06, "loss": 122.1819, "step": 4604 }, { "epoch": 0.41081225746019, "grad_norm": 4201.88818359375, "learning_rate": 3e-06, "loss": 8.0694, "step": 4605 }, { "epoch": 0.4109014675052411, "grad_norm": 3256.39208984375, "learning_rate": 3e-06, "loss": -25.0413, "step": 4606 }, { "epoch": 0.41099067755029217, "grad_norm": 4376.93310546875, "learning_rate": 3e-06, "loss": -62.4535, "step": 4607 }, { "epoch": 0.41107988759534325, "grad_norm": 2179.294189453125, "learning_rate": 3e-06, "loss": -77.5449, "step": 4608 }, { "completion_length": 120.54166793823242, "epoch": 0.41116909764039433, "grad_norm": 1042.729736328125, "learning_rate": 3e-06, "loss": -20.1056, "reward": 2.4948750734329224, "reward_std": 0.2811383157968521, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2032083421945572, "step": 4609, "zero_std_ratio": 0.0 }, { "epoch": 0.41125830768544536, "grad_norm": 1303.88720703125, "learning_rate": 3e-06, "loss": -84.2715, "step": 4610 }, { "epoch": 0.41134751773049644, "grad_norm": 1612.7059326171875, "learning_rate": 3e-06, "loss": -42.2012, "step": 4611 }, { "epoch": 0.4114367277755475, "grad_norm": 2013.770751953125, "learning_rate": 3e-06, "loss": -98.2331, "step": 4612 }, { "epoch": 0.4115259378205986, "grad_norm": 1233.87109375, "learning_rate": 3e-06, "loss": -21.5357, "step": 4613 }, { "epoch": 0.4116151478656497, "grad_norm": 1015.8805541992188, "learning_rate": 3e-06, "loss": -6.9059, "step": 4614 }, { "epoch": 0.41170435791070076, "grad_norm": 1349.1619873046875, "learning_rate": 3e-06, "loss": -38.3976, "step": 4615 }, { "epoch": 0.41179356795575184, "grad_norm": 2366.06201171875, "learning_rate": 3e-06, "loss": -108.8375, "step": 4616 }, { "epoch": 0.41188277800080286, "grad_norm": 1739.8282470703125, "learning_rate": 3e-06, "loss": -66.2551, "step": 4617 }, { "epoch": 0.41197198804585394, "grad_norm": 1470.2923583984375, "learning_rate": 3e-06, "loss": -120.778, "step": 4618 }, { "epoch": 0.412061198090905, "grad_norm": 1271.054931640625, "learning_rate": 3e-06, "loss": -49.9573, "step": 4619 }, { "epoch": 0.4121504081359561, "grad_norm": 1155.3302001953125, "learning_rate": 3e-06, "loss": -32.1803, "step": 4620 }, { "completion_length": 111.43750381469727, "epoch": 0.4122396181810072, "grad_norm": 1574.9420166015625, "learning_rate": 3e-06, "loss": -58.7778, "reward": 2.5045000314712524, "reward_std": 0.45437583327293396, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21283333748579025, "step": 4621, "zero_std_ratio": 0.0 }, { "epoch": 0.41232882822605826, "grad_norm": 1608.0845947265625, "learning_rate": 3e-06, "loss": -72.4866, "step": 4622 }, { "epoch": 0.41241803827110934, "grad_norm": 1154.161865234375, "learning_rate": 3e-06, "loss": -61.771, "step": 4623 }, { "epoch": 0.4125072483161604, "grad_norm": 1399.01904296875, "learning_rate": 3e-06, "loss": -58.3739, "step": 4624 }, { "epoch": 0.41259645836121145, "grad_norm": 1314.036865234375, "learning_rate": 3e-06, "loss": -53.6863, "step": 4625 }, { "epoch": 0.41268566840626253, "grad_norm": 1866.6964111328125, "learning_rate": 3e-06, "loss": -85.4003, "step": 4626 }, { "epoch": 0.4127748784513136, "grad_norm": 1679.0784912109375, "learning_rate": 3e-06, "loss": -94.5975, "step": 4627 }, { "epoch": 0.4128640884963647, "grad_norm": 1415.811279296875, "learning_rate": 3e-06, "loss": -93.1929, "step": 4628 }, { "epoch": 0.41295329854141577, "grad_norm": 1643.84765625, "learning_rate": 3e-06, "loss": -86.8478, "step": 4629 }, { "epoch": 0.41304250858646685, "grad_norm": 1463.5797119140625, "learning_rate": 3e-06, "loss": -93.2234, "step": 4630 }, { "epoch": 0.41313171863151793, "grad_norm": 1296.5030517578125, "learning_rate": 3e-06, "loss": -75.766, "step": 4631 }, { "epoch": 0.41322092867656895, "grad_norm": 1692.20361328125, "learning_rate": 3e-06, "loss": -135.8848, "step": 4632 }, { "completion_length": 124.72916793823242, "epoch": 0.41331013872162004, "grad_norm": 2467.01806640625, "learning_rate": 3e-06, "loss": 158.2784, "reward": 2.3108749389648438, "reward_std": 0.6133005619049072, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18587501347064972, "step": 4633, "zero_std_ratio": 0.0 }, { "epoch": 0.4133993487666711, "grad_norm": 3435.282470703125, "learning_rate": 3e-06, "loss": 41.9606, "step": 4634 }, { "epoch": 0.4134885588117222, "grad_norm": 3134.1943359375, "learning_rate": 3e-06, "loss": 152.0694, "step": 4635 }, { "epoch": 0.4135777688567733, "grad_norm": 3353.673095703125, "learning_rate": 3e-06, "loss": 36.5705, "step": 4636 }, { "epoch": 0.41366697890182436, "grad_norm": 2618.17822265625, "learning_rate": 3e-06, "loss": 12.4712, "step": 4637 }, { "epoch": 0.41375618894687544, "grad_norm": 4295.0185546875, "learning_rate": 3e-06, "loss": 13.6658, "step": 4638 }, { "epoch": 0.4138453989919265, "grad_norm": 2777.183837890625, "learning_rate": 3e-06, "loss": 133.8692, "step": 4639 }, { "epoch": 0.41393460903697754, "grad_norm": 2855.992919921875, "learning_rate": 3e-06, "loss": 23.0042, "step": 4640 }, { "epoch": 0.4140238190820286, "grad_norm": 2815.174072265625, "learning_rate": 3e-06, "loss": 146.3808, "step": 4641 }, { "epoch": 0.4141130291270797, "grad_norm": 3490.5478515625, "learning_rate": 3e-06, "loss": 20.3602, "step": 4642 }, { "epoch": 0.4142022391721308, "grad_norm": 2136.437255859375, "learning_rate": 3e-06, "loss": -5.1699, "step": 4643 }, { "epoch": 0.41429144921718186, "grad_norm": 2849.356689453125, "learning_rate": 3e-06, "loss": 1.8616, "step": 4644 }, { "completion_length": 110.70833587646484, "epoch": 0.41438065926223294, "grad_norm": 3896.034912109375, "learning_rate": 3e-06, "loss": -62.1564, "reward": 2.2017918825149536, "reward_std": 0.6816571354866028, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2122083306312561, "step": 4645, "zero_std_ratio": 0.0 }, { "epoch": 0.414469869307284, "grad_norm": 3007.28857421875, "learning_rate": 3e-06, "loss": -96.0256, "step": 4646 }, { "epoch": 0.41455907935233505, "grad_norm": 2997.415283203125, "learning_rate": 3e-06, "loss": -167.8648, "step": 4647 }, { "epoch": 0.4146482893973861, "grad_norm": 3653.358642578125, "learning_rate": 3e-06, "loss": -89.4959, "step": 4648 }, { "epoch": 0.4147374994424372, "grad_norm": 2074.160400390625, "learning_rate": 3e-06, "loss": -242.8365, "step": 4649 }, { "epoch": 0.4148267094874883, "grad_norm": 3840.6259765625, "learning_rate": 3e-06, "loss": -297.5809, "step": 4650 }, { "epoch": 0.41491591953253937, "grad_norm": 3828.221435546875, "learning_rate": 3e-06, "loss": -79.2413, "step": 4651 }, { "epoch": 0.41500512957759045, "grad_norm": 3546.709228515625, "learning_rate": 3e-06, "loss": -122.5833, "step": 4652 }, { "epoch": 0.41509433962264153, "grad_norm": 3058.851318359375, "learning_rate": 3e-06, "loss": -201.99, "step": 4653 }, { "epoch": 0.4151835496676926, "grad_norm": 3424.933349609375, "learning_rate": 3e-06, "loss": -125.0965, "step": 4654 }, { "epoch": 0.41527275971274363, "grad_norm": 2539.9248046875, "learning_rate": 3e-06, "loss": -270.1254, "step": 4655 }, { "epoch": 0.4153619697577947, "grad_norm": 3630.814208984375, "learning_rate": 3e-06, "loss": -330.5645, "step": 4656 }, { "completion_length": 105.35417175292969, "epoch": 0.4154511798028458, "grad_norm": 2519.49267578125, "learning_rate": 3e-06, "loss": 49.0135, "reward": 2.3322917222976685, "reward_std": 0.26117075234651566, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24895832687616348, "step": 4657, "zero_std_ratio": 0.0 }, { "epoch": 0.4155403898478969, "grad_norm": 1258.58642578125, "learning_rate": 3e-06, "loss": 184.7028, "step": 4658 }, { "epoch": 0.41562959989294795, "grad_norm": 1806.11181640625, "learning_rate": 3e-06, "loss": 95.5226, "step": 4659 }, { "epoch": 0.41571880993799903, "grad_norm": 1190.3297119140625, "learning_rate": 3e-06, "loss": 140.9631, "step": 4660 }, { "epoch": 0.4158080199830501, "grad_norm": 1557.1248779296875, "learning_rate": 3e-06, "loss": 122.0557, "step": 4661 }, { "epoch": 0.41589723002810114, "grad_norm": 1204.9892578125, "learning_rate": 3e-06, "loss": 86.8605, "step": 4662 }, { "epoch": 0.4159864400731522, "grad_norm": 2276.178466796875, "learning_rate": 3e-06, "loss": 34.4029, "step": 4663 }, { "epoch": 0.4160756501182033, "grad_norm": 1341.726318359375, "learning_rate": 3e-06, "loss": 171.0295, "step": 4664 }, { "epoch": 0.4161648601632544, "grad_norm": 1570.38623046875, "learning_rate": 3e-06, "loss": 72.0648, "step": 4665 }, { "epoch": 0.41625407020830546, "grad_norm": 1459.7984619140625, "learning_rate": 3e-06, "loss": 133.0911, "step": 4666 }, { "epoch": 0.41634328025335654, "grad_norm": 1461.5155029296875, "learning_rate": 3e-06, "loss": 113.0094, "step": 4667 }, { "epoch": 0.4164324902984076, "grad_norm": 1173.0648193359375, "learning_rate": 3e-06, "loss": 82.5442, "step": 4668 }, { "completion_length": 126.45833969116211, "epoch": 0.4165217003434587, "grad_norm": 3216.97119140625, "learning_rate": 3e-06, "loss": -133.8797, "reward": 2.2646875381469727, "reward_std": 0.6605730056762695, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20218748599290848, "step": 4669, "zero_std_ratio": 0.0 }, { "epoch": 0.4166109103885097, "grad_norm": 3038.15185546875, "learning_rate": 3e-06, "loss": -181.8719, "step": 4670 }, { "epoch": 0.4167001204335608, "grad_norm": 3462.04052734375, "learning_rate": 3e-06, "loss": -179.7917, "step": 4671 }, { "epoch": 0.4167893304786119, "grad_norm": 3619.353515625, "learning_rate": 3e-06, "loss": -253.4383, "step": 4672 }, { "epoch": 0.41687854052366297, "grad_norm": 2706.7646484375, "learning_rate": 3e-06, "loss": -183.2029, "step": 4673 }, { "epoch": 0.41696775056871405, "grad_norm": 2186.54052734375, "learning_rate": 3e-06, "loss": -153.3663, "step": 4674 }, { "epoch": 0.4170569606137651, "grad_norm": 3265.121826171875, "learning_rate": 3e-06, "loss": -190.1335, "step": 4675 }, { "epoch": 0.4171461706588162, "grad_norm": 4940.11767578125, "learning_rate": 3e-06, "loss": -230.5817, "step": 4676 }, { "epoch": 0.41723538070386723, "grad_norm": 2794.723388671875, "learning_rate": 3e-06, "loss": -225.531, "step": 4677 }, { "epoch": 0.4173245907489183, "grad_norm": 3386.083984375, "learning_rate": 3e-06, "loss": -339.9044, "step": 4678 }, { "epoch": 0.4174138007939694, "grad_norm": 2681.4990234375, "learning_rate": 3e-06, "loss": -256.3524, "step": 4679 }, { "epoch": 0.41750301083902047, "grad_norm": 2711.448974609375, "learning_rate": 3e-06, "loss": -184.7529, "step": 4680 }, { "completion_length": 137.27083587646484, "epoch": 0.41759222088407155, "grad_norm": 2070.4892578125, "learning_rate": 3e-06, "loss": 97.8704, "reward": 2.07693749666214, "reward_std": 0.25175633281469345, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1602708250284195, "step": 4681, "zero_std_ratio": 0.0 }, { "epoch": 0.41768143092912263, "grad_norm": 1705.652099609375, "learning_rate": 3e-06, "loss": 37.5747, "step": 4682 }, { "epoch": 0.4177706409741737, "grad_norm": 1692.2301025390625, "learning_rate": 3e-06, "loss": 31.9954, "step": 4683 }, { "epoch": 0.41785985101922474, "grad_norm": 2258.3994140625, "learning_rate": 3e-06, "loss": 145.4962, "step": 4684 }, { "epoch": 0.4179490610642758, "grad_norm": 4183.3828125, "learning_rate": 3e-06, "loss": 40.7482, "step": 4685 }, { "epoch": 0.4180382711093269, "grad_norm": 1590.640869140625, "learning_rate": 3e-06, "loss": 58.0558, "step": 4686 }, { "epoch": 0.418127481154378, "grad_norm": 2943.1494140625, "learning_rate": 3e-06, "loss": 91.5453, "step": 4687 }, { "epoch": 0.41821669119942906, "grad_norm": 1771.3564453125, "learning_rate": 3e-06, "loss": 8.4268, "step": 4688 }, { "epoch": 0.41830590124448014, "grad_norm": 2106.645263671875, "learning_rate": 3e-06, "loss": 19.6648, "step": 4689 }, { "epoch": 0.4183951112895312, "grad_norm": 2549.50244140625, "learning_rate": 3e-06, "loss": 129.8656, "step": 4690 }, { "epoch": 0.4184843213345823, "grad_norm": 5634.33447265625, "learning_rate": 3e-06, "loss": 48.6544, "step": 4691 }, { "epoch": 0.4185735313796333, "grad_norm": 1607.723388671875, "learning_rate": 3e-06, "loss": 49.7337, "step": 4692 }, { "completion_length": 118.20833587646484, "epoch": 0.4186627414246844, "grad_norm": 2767.89892578125, "learning_rate": 3e-06, "loss": -86.8339, "reward": 2.201979160308838, "reward_std": 0.5707839578390121, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2019791603088379, "step": 4693, "zero_std_ratio": 0.25 }, { "epoch": 0.4187519514697355, "grad_norm": 2204.860107421875, "learning_rate": 3e-06, "loss": 173.4777, "step": 4694 }, { "epoch": 0.41884116151478656, "grad_norm": 2602.115478515625, "learning_rate": 3e-06, "loss": 129.4576, "step": 4695 }, { "epoch": 0.41893037155983764, "grad_norm": 2858.0859375, "learning_rate": 3e-06, "loss": 239.7687, "step": 4696 }, { "epoch": 0.4190195816048887, "grad_norm": 2422.18408203125, "learning_rate": 3e-06, "loss": 119.028, "step": 4697 }, { "epoch": 0.4191087916499398, "grad_norm": 4855.3388671875, "learning_rate": 3e-06, "loss": 138.8455, "step": 4698 }, { "epoch": 0.41919800169499083, "grad_norm": 2886.328369140625, "learning_rate": 3e-06, "loss": -114.8922, "step": 4699 }, { "epoch": 0.4192872117400419, "grad_norm": 2362.24609375, "learning_rate": 3e-06, "loss": 146.5009, "step": 4700 }, { "epoch": 0.419376421785093, "grad_norm": 2583.86669921875, "learning_rate": 3e-06, "loss": 92.0824, "step": 4701 }, { "epoch": 0.41946563183014407, "grad_norm": 2945.39892578125, "learning_rate": 3e-06, "loss": 207.9769, "step": 4702 }, { "epoch": 0.41955484187519515, "grad_norm": 2893.637939453125, "learning_rate": 3e-06, "loss": 85.4845, "step": 4703 }, { "epoch": 0.41964405192024623, "grad_norm": 3180.474609375, "learning_rate": 3e-06, "loss": 104.1824, "step": 4704 }, { "completion_length": 114.52083587646484, "epoch": 0.4197332619652973, "grad_norm": 2654.566650390625, "learning_rate": 3e-06, "loss": 116.4145, "reward": 2.431208372116089, "reward_std": 0.39004985988140106, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2332916483283043, "step": 4705, "zero_std_ratio": 0.0 }, { "epoch": 0.4198224720103484, "grad_norm": 1360.7677001953125, "learning_rate": 3e-06, "loss": 79.0159, "step": 4706 }, { "epoch": 0.4199116820553994, "grad_norm": 2640.930908203125, "learning_rate": 3e-06, "loss": 47.6137, "step": 4707 }, { "epoch": 0.4200008921004505, "grad_norm": 1496.2322998046875, "learning_rate": 3e-06, "loss": 18.2993, "step": 4708 }, { "epoch": 0.4200901021455016, "grad_norm": 1407.573486328125, "learning_rate": 3e-06, "loss": 33.8828, "step": 4709 }, { "epoch": 0.42017931219055266, "grad_norm": 2438.75537109375, "learning_rate": 3e-06, "loss": 57.3319, "step": 4710 }, { "epoch": 0.42026852223560374, "grad_norm": 1612.858154296875, "learning_rate": 3e-06, "loss": 97.2706, "step": 4711 }, { "epoch": 0.4203577322806548, "grad_norm": 1375.151123046875, "learning_rate": 3e-06, "loss": 60.6773, "step": 4712 }, { "epoch": 0.4204469423257059, "grad_norm": 2661.447998046875, "learning_rate": 3e-06, "loss": 22.4794, "step": 4713 }, { "epoch": 0.4205361523707569, "grad_norm": 1412.3653564453125, "learning_rate": 3e-06, "loss": 1.5317, "step": 4714 }, { "epoch": 0.420625362415808, "grad_norm": 1386.42138671875, "learning_rate": 3e-06, "loss": 10.3295, "step": 4715 }, { "epoch": 0.4207145724608591, "grad_norm": 2478.667724609375, "learning_rate": 3e-06, "loss": 25.4252, "step": 4716 }, { "completion_length": 137.39583587646484, "epoch": 0.42080378250591016, "grad_norm": 3164.174560546875, "learning_rate": 3e-06, "loss": -168.0903, "reward": 1.98556250333786, "reward_std": 0.5670964270830154, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1626458391547203, "step": 4717, "zero_std_ratio": 0.0 }, { "epoch": 0.42089299255096124, "grad_norm": 3302.13134765625, "learning_rate": 3e-06, "loss": -100.1588, "step": 4718 }, { "epoch": 0.4209822025960123, "grad_norm": 2642.700439453125, "learning_rate": 3e-06, "loss": -164.3252, "step": 4719 }, { "epoch": 0.4210714126410634, "grad_norm": 3316.759033203125, "learning_rate": 3e-06, "loss": -139.6858, "step": 4720 }, { "epoch": 0.4211606226861145, "grad_norm": 2811.689208984375, "learning_rate": 3e-06, "loss": -286.0138, "step": 4721 }, { "epoch": 0.4212498327311655, "grad_norm": 3012.87841796875, "learning_rate": 3e-06, "loss": -167.418, "step": 4722 }, { "epoch": 0.4213390427762166, "grad_norm": 6204.53662109375, "learning_rate": 3e-06, "loss": -203.4082, "step": 4723 }, { "epoch": 0.42142825282126767, "grad_norm": 3371.279052734375, "learning_rate": 3e-06, "loss": -120.3425, "step": 4724 }, { "epoch": 0.42151746286631875, "grad_norm": 2584.768798828125, "learning_rate": 3e-06, "loss": -178.347, "step": 4725 }, { "epoch": 0.42160667291136983, "grad_norm": 3431.489501953125, "learning_rate": 3e-06, "loss": -161.7848, "step": 4726 }, { "epoch": 0.4216958829564209, "grad_norm": 2444.591064453125, "learning_rate": 3e-06, "loss": -308.1559, "step": 4727 }, { "epoch": 0.421785093001472, "grad_norm": 12274.5263671875, "learning_rate": 3e-06, "loss": -184.5765, "step": 4728 }, { "completion_length": 123.66667175292969, "epoch": 0.421874303046523, "grad_norm": 2644.42236328125, "learning_rate": 3e-06, "loss": 167.2355, "reward": 2.093583405017853, "reward_std": 0.5386003851890564, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18733333051204681, "step": 4729, "zero_std_ratio": 0.125 }, { "epoch": 0.4219635130915741, "grad_norm": 2395.18505859375, "learning_rate": 3e-06, "loss": 206.8639, "step": 4730 }, { "epoch": 0.4220527231366252, "grad_norm": 2714.57958984375, "learning_rate": 3e-06, "loss": 179.4842, "step": 4731 }, { "epoch": 0.42214193318167625, "grad_norm": 2909.9970703125, "learning_rate": 3e-06, "loss": 208.0405, "step": 4732 }, { "epoch": 0.42223114322672733, "grad_norm": 2459.055419921875, "learning_rate": 3e-06, "loss": 299.6983, "step": 4733 }, { "epoch": 0.4223203532717784, "grad_norm": 2175.963134765625, "learning_rate": 3e-06, "loss": 222.0926, "step": 4734 }, { "epoch": 0.4224095633168295, "grad_norm": 2971.178955078125, "learning_rate": 3e-06, "loss": 155.8842, "step": 4735 }, { "epoch": 0.4224987733618806, "grad_norm": 2151.519775390625, "learning_rate": 3e-06, "loss": 191.8081, "step": 4736 }, { "epoch": 0.4225879834069316, "grad_norm": 2700.96826171875, "learning_rate": 3e-06, "loss": 167.6277, "step": 4737 }, { "epoch": 0.4226771934519827, "grad_norm": 2947.5546875, "learning_rate": 3e-06, "loss": 193.6169, "step": 4738 }, { "epoch": 0.42276640349703376, "grad_norm": 2653.3935546875, "learning_rate": 3e-06, "loss": 267.1923, "step": 4739 }, { "epoch": 0.42285561354208484, "grad_norm": 2173.1728515625, "learning_rate": 3e-06, "loss": 194.755, "step": 4740 }, { "completion_length": 138.52083587646484, "epoch": 0.4229448235871359, "grad_norm": 3735.315185546875, "learning_rate": 3e-06, "loss": -102.6535, "reward": 1.8853334188461304, "reward_std": 0.6065454185009003, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1665833368897438, "step": 4741, "zero_std_ratio": 0.0 }, { "epoch": 0.423034033632187, "grad_norm": 2712.3330078125, "learning_rate": 3e-06, "loss": -186.7376, "step": 4742 }, { "epoch": 0.4231232436772381, "grad_norm": 3133.096435546875, "learning_rate": 3e-06, "loss": -237.1748, "step": 4743 }, { "epoch": 0.4232124537222891, "grad_norm": 2764.13232421875, "learning_rate": 3e-06, "loss": -91.3225, "step": 4744 }, { "epoch": 0.4233016637673402, "grad_norm": 2392.63671875, "learning_rate": 3e-06, "loss": -193.0827, "step": 4745 }, { "epoch": 0.42339087381239127, "grad_norm": 2520.42578125, "learning_rate": 3e-06, "loss": -200.9839, "step": 4746 }, { "epoch": 0.42348008385744235, "grad_norm": 3289.576171875, "learning_rate": 3e-06, "loss": -150.0153, "step": 4747 }, { "epoch": 0.4235692939024934, "grad_norm": 2884.5947265625, "learning_rate": 3e-06, "loss": -219.5681, "step": 4748 }, { "epoch": 0.4236585039475445, "grad_norm": 2677.20458984375, "learning_rate": 3e-06, "loss": -280.6207, "step": 4749 }, { "epoch": 0.4237477139925956, "grad_norm": 2322.284423828125, "learning_rate": 3e-06, "loss": -123.3083, "step": 4750 }, { "epoch": 0.4238369240376466, "grad_norm": 1918.373046875, "learning_rate": 3e-06, "loss": -228.8708, "step": 4751 }, { "epoch": 0.4239261340826977, "grad_norm": 2602.23876953125, "learning_rate": 3e-06, "loss": -239.5393, "step": 4752 }, { "completion_length": 124.16667175292969, "epoch": 0.4240153441277488, "grad_norm": 1854.790771484375, "learning_rate": 3e-06, "loss": 185.1028, "reward": 1.5921666622161865, "reward_std": 0.3244532197713852, "rewards/correctness_reward_func": 0.9166666567325592, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19633334130048752, "step": 4753, "zero_std_ratio": 0.0 }, { "epoch": 0.42410455417279985, "grad_norm": 1548.004150390625, "learning_rate": 3e-06, "loss": 153.5047, "step": 4754 }, { "epoch": 0.42419376421785093, "grad_norm": 1800.6510009765625, "learning_rate": 3e-06, "loss": 145.3145, "step": 4755 }, { "epoch": 0.424282974262902, "grad_norm": 1818.24462890625, "learning_rate": 3e-06, "loss": 292.989, "step": 4756 }, { "epoch": 0.4243721843079531, "grad_norm": 1307.919921875, "learning_rate": 3e-06, "loss": 221.8827, "step": 4757 }, { "epoch": 0.4244613943530042, "grad_norm": 2112.387451171875, "learning_rate": 3e-06, "loss": 228.3054, "step": 4758 }, { "epoch": 0.4245506043980552, "grad_norm": 2190.97705078125, "learning_rate": 3e-06, "loss": 163.6473, "step": 4759 }, { "epoch": 0.4246398144431063, "grad_norm": 1946.3955078125, "learning_rate": 3e-06, "loss": 122.0501, "step": 4760 }, { "epoch": 0.42472902448815736, "grad_norm": 2247.267578125, "learning_rate": 3e-06, "loss": 111.2868, "step": 4761 }, { "epoch": 0.42481823453320844, "grad_norm": 1802.09326171875, "learning_rate": 3e-06, "loss": 257.5065, "step": 4762 }, { "epoch": 0.4249074445782595, "grad_norm": 1595.8099365234375, "learning_rate": 3e-06, "loss": 195.7554, "step": 4763 }, { "epoch": 0.4249966546233106, "grad_norm": 2096.761962890625, "learning_rate": 3e-06, "loss": 187.7819, "step": 4764 }, { "completion_length": 106.3125, "epoch": 0.4250858646683617, "grad_norm": 2212.3076171875, "learning_rate": 3e-06, "loss": 129.1688, "reward": 2.3120625019073486, "reward_std": 0.2913160026073456, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24956248700618744, "step": 4765, "zero_std_ratio": 0.0 }, { "epoch": 0.4251750747134127, "grad_norm": 1714.5950927734375, "learning_rate": 3e-06, "loss": 153.6584, "step": 4766 }, { "epoch": 0.4252642847584638, "grad_norm": 2224.00146484375, "learning_rate": 3e-06, "loss": 121.3417, "step": 4767 }, { "epoch": 0.42535349480351486, "grad_norm": 1682.892333984375, "learning_rate": 3e-06, "loss": 180.9551, "step": 4768 }, { "epoch": 0.42544270484856594, "grad_norm": 2114.0224609375, "learning_rate": 3e-06, "loss": 145.2112, "step": 4769 }, { "epoch": 0.425531914893617, "grad_norm": 2470.04150390625, "learning_rate": 3e-06, "loss": 108.4827, "step": 4770 }, { "epoch": 0.4256211249386681, "grad_norm": 2337.825439453125, "learning_rate": 3e-06, "loss": 92.6054, "step": 4771 }, { "epoch": 0.4257103349837192, "grad_norm": 1925.2049560546875, "learning_rate": 3e-06, "loss": 111.0629, "step": 4772 }, { "epoch": 0.42579954502877027, "grad_norm": 1953.6134033203125, "learning_rate": 3e-06, "loss": 74.1994, "step": 4773 }, { "epoch": 0.4258887550738213, "grad_norm": 2311.61474609375, "learning_rate": 3e-06, "loss": 159.6213, "step": 4774 }, { "epoch": 0.42597796511887237, "grad_norm": 2442.84375, "learning_rate": 3e-06, "loss": 103.3555, "step": 4775 }, { "epoch": 0.42606717516392345, "grad_norm": 2680.85595703125, "learning_rate": 3e-06, "loss": 58.9689, "step": 4776 }, { "completion_length": 132.7291717529297, "epoch": 0.42615638520897453, "grad_norm": 1592.9537353515625, "learning_rate": 3e-06, "loss": -201.855, "reward": 2.270583391189575, "reward_std": 0.42313191294670105, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18725000321865082, "step": 4777, "zero_std_ratio": 0.0 }, { "epoch": 0.4262455952540256, "grad_norm": 1359.201904296875, "learning_rate": 3e-06, "loss": -120.8258, "step": 4778 }, { "epoch": 0.4263348052990767, "grad_norm": 1580.97119140625, "learning_rate": 3e-06, "loss": -127.4356, "step": 4779 }, { "epoch": 0.42642401534412777, "grad_norm": 1704.806396484375, "learning_rate": 3e-06, "loss": -70.5067, "step": 4780 }, { "epoch": 0.4265132253891788, "grad_norm": 1873.3330078125, "learning_rate": 3e-06, "loss": -105.0195, "step": 4781 }, { "epoch": 0.4266024354342299, "grad_norm": 2064.9775390625, "learning_rate": 3e-06, "loss": -151.4526, "step": 4782 }, { "epoch": 0.42669164547928096, "grad_norm": 1638.578369140625, "learning_rate": 3e-06, "loss": -200.3938, "step": 4783 }, { "epoch": 0.42678085552433204, "grad_norm": 1462.1710205078125, "learning_rate": 3e-06, "loss": -141.1695, "step": 4784 }, { "epoch": 0.4268700655693831, "grad_norm": 1574.3309326171875, "learning_rate": 3e-06, "loss": -142.3791, "step": 4785 }, { "epoch": 0.4269592756144342, "grad_norm": 1804.44677734375, "learning_rate": 3e-06, "loss": -88.088, "step": 4786 }, { "epoch": 0.4270484856594853, "grad_norm": 1467.924560546875, "learning_rate": 3e-06, "loss": -118.8514, "step": 4787 }, { "epoch": 0.42713769570453636, "grad_norm": 1873.3359375, "learning_rate": 3e-06, "loss": -183.4673, "step": 4788 }, { "completion_length": 133.64584350585938, "epoch": 0.4272269057495874, "grad_norm": 4360.11083984375, "learning_rate": 3e-06, "loss": 238.4407, "reward": 1.975125014781952, "reward_std": 0.6049195230007172, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1834583356976509, "step": 4789, "zero_std_ratio": 0.0 }, { "epoch": 0.42731611579463846, "grad_norm": 2505.157470703125, "learning_rate": 3e-06, "loss": 111.631, "step": 4790 }, { "epoch": 0.42740532583968954, "grad_norm": 3292.53466796875, "learning_rate": 3e-06, "loss": 140.6327, "step": 4791 }, { "epoch": 0.4274945358847406, "grad_norm": 2838.91845703125, "learning_rate": 3e-06, "loss": 81.2008, "step": 4792 }, { "epoch": 0.4275837459297917, "grad_norm": 2897.04443359375, "learning_rate": 3e-06, "loss": 155.9897, "step": 4793 }, { "epoch": 0.4276729559748428, "grad_norm": 2521.357421875, "learning_rate": 3e-06, "loss": 212.9114, "step": 4794 }, { "epoch": 0.42776216601989386, "grad_norm": 3036.285888671875, "learning_rate": 3e-06, "loss": 220.0389, "step": 4795 }, { "epoch": 0.4278513760649449, "grad_norm": 2433.4931640625, "learning_rate": 3e-06, "loss": 83.9858, "step": 4796 }, { "epoch": 0.42794058610999597, "grad_norm": 3767.9970703125, "learning_rate": 3e-06, "loss": 114.5484, "step": 4797 }, { "epoch": 0.42802979615504705, "grad_norm": 2735.781982421875, "learning_rate": 3e-06, "loss": 47.1155, "step": 4798 }, { "epoch": 0.42811900620009813, "grad_norm": 2806.32373046875, "learning_rate": 3e-06, "loss": 116.8537, "step": 4799 }, { "epoch": 0.4282082162451492, "grad_norm": 2857.369140625, "learning_rate": 3e-06, "loss": 165.0932, "step": 4800 }, { "completion_length": 105.95833587646484, "epoch": 0.4282974262902003, "grad_norm": 2020.14404296875, "learning_rate": 3e-06, "loss": -60.2094, "reward": 2.1610000133514404, "reward_std": 0.6872619390487671, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22350000590085983, "step": 4801, "zero_std_ratio": 0.0 }, { "epoch": 0.42838663633525137, "grad_norm": 2284.986572265625, "learning_rate": 3e-06, "loss": -118.4988, "step": 4802 }, { "epoch": 0.42847584638030245, "grad_norm": 1963.19921875, "learning_rate": 3e-06, "loss": 11.9475, "step": 4803 }, { "epoch": 0.4285650564253535, "grad_norm": 1838.6492919921875, "learning_rate": 3e-06, "loss": -27.6601, "step": 4804 }, { "epoch": 0.42865426647040455, "grad_norm": 1829.18408203125, "learning_rate": 3e-06, "loss": 22.4575, "step": 4805 }, { "epoch": 0.42874347651545563, "grad_norm": 1779.8443603515625, "learning_rate": 3e-06, "loss": 36.5018, "step": 4806 }, { "epoch": 0.4288326865605067, "grad_norm": 1936.9298095703125, "learning_rate": 3e-06, "loss": -77.3496, "step": 4807 }, { "epoch": 0.4289218966055578, "grad_norm": 1912.121826171875, "learning_rate": 3e-06, "loss": -136.9483, "step": 4808 }, { "epoch": 0.4290111066506089, "grad_norm": 1942.01806640625, "learning_rate": 3e-06, "loss": -4.6558, "step": 4809 }, { "epoch": 0.42910031669565996, "grad_norm": 1937.736572265625, "learning_rate": 3e-06, "loss": -51.0107, "step": 4810 }, { "epoch": 0.429189526740711, "grad_norm": 1700.6866455078125, "learning_rate": 3e-06, "loss": -1.3644, "step": 4811 }, { "epoch": 0.42927873678576206, "grad_norm": 1721.527099609375, "learning_rate": 3e-06, "loss": 22.1069, "step": 4812 }, { "completion_length": 119.50000381469727, "epoch": 0.42936794683081314, "grad_norm": 1245.38818359375, "learning_rate": 3e-06, "loss": -81.2712, "reward": 2.3725208044052124, "reward_std": 0.35795650258660316, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.205854170024395, "step": 4813, "zero_std_ratio": 0.0 }, { "epoch": 0.4294571568758642, "grad_norm": 1102.3568115234375, "learning_rate": 3e-06, "loss": -39.0536, "step": 4814 }, { "epoch": 0.4295463669209153, "grad_norm": 1275.9248046875, "learning_rate": 3e-06, "loss": 28.9221, "step": 4815 }, { "epoch": 0.4296355769659664, "grad_norm": 2102.848876953125, "learning_rate": 3e-06, "loss": 20.8182, "step": 4816 }, { "epoch": 0.42972478701101746, "grad_norm": 1293.2333984375, "learning_rate": 3e-06, "loss": 9.9023, "step": 4817 }, { "epoch": 0.4298139970560685, "grad_norm": 1469.6082763671875, "learning_rate": 3e-06, "loss": 4.9411, "step": 4818 }, { "epoch": 0.42990320710111957, "grad_norm": 1133.334716796875, "learning_rate": 3e-06, "loss": -88.0742, "step": 4819 }, { "epoch": 0.42999241714617065, "grad_norm": 1441.196533203125, "learning_rate": 3e-06, "loss": -48.0744, "step": 4820 }, { "epoch": 0.4300816271912217, "grad_norm": 1375.7498779296875, "learning_rate": 3e-06, "loss": 20.287, "step": 4821 }, { "epoch": 0.4301708372362728, "grad_norm": 1378.4139404296875, "learning_rate": 3e-06, "loss": 1.0811, "step": 4822 }, { "epoch": 0.4302600472813239, "grad_norm": 1326.36083984375, "learning_rate": 3e-06, "loss": 2.0677, "step": 4823 }, { "epoch": 0.43034925732637497, "grad_norm": 1634.8228759765625, "learning_rate": 3e-06, "loss": -2.0969, "step": 4824 }, { "completion_length": 130.85416793823242, "epoch": 0.43043846737142605, "grad_norm": 2792.92724609375, "learning_rate": 3e-06, "loss": 225.0658, "reward": 2.1767501831054688, "reward_std": 0.6465975046157837, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18716666847467422, "step": 4825, "zero_std_ratio": 0.0 }, { "epoch": 0.4305276774164771, "grad_norm": 2882.937255859375, "learning_rate": 3e-06, "loss": 129.7453, "step": 4826 }, { "epoch": 0.43061688746152815, "grad_norm": 2635.66796875, "learning_rate": 3e-06, "loss": 119.9416, "step": 4827 }, { "epoch": 0.43070609750657923, "grad_norm": 2341.3251953125, "learning_rate": 3e-06, "loss": 190.7699, "step": 4828 }, { "epoch": 0.4307953075516303, "grad_norm": 2153.9501953125, "learning_rate": 3e-06, "loss": 217.081, "step": 4829 }, { "epoch": 0.4308845175966814, "grad_norm": 2196.369873046875, "learning_rate": 3e-06, "loss": 178.0037, "step": 4830 }, { "epoch": 0.4309737276417325, "grad_norm": 3357.78271484375, "learning_rate": 3e-06, "loss": 200.5025, "step": 4831 }, { "epoch": 0.43106293768678355, "grad_norm": 4150.8935546875, "learning_rate": 3e-06, "loss": 121.5597, "step": 4832 }, { "epoch": 0.4311521477318346, "grad_norm": 2835.580810546875, "learning_rate": 3e-06, "loss": 101.8292, "step": 4833 }, { "epoch": 0.43124135777688566, "grad_norm": 2180.24462890625, "learning_rate": 3e-06, "loss": 161.3556, "step": 4834 }, { "epoch": 0.43133056782193674, "grad_norm": 2273.382568359375, "learning_rate": 3e-06, "loss": 190.901, "step": 4835 }, { "epoch": 0.4314197778669878, "grad_norm": 2384.73095703125, "learning_rate": 3e-06, "loss": 154.3409, "step": 4836 }, { "completion_length": 113.95833587646484, "epoch": 0.4315089879120389, "grad_norm": 1001.3514404296875, "learning_rate": 3e-06, "loss": -66.4703, "reward": 2.4173542261123657, "reward_std": 0.39125795662403107, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20902081578969955, "step": 4837, "zero_std_ratio": 0.0 }, { "epoch": 0.43159819795709, "grad_norm": 1048.712158203125, "learning_rate": 3e-06, "loss": -7.8578, "step": 4838 }, { "epoch": 0.43168740800214106, "grad_norm": 1285.9119873046875, "learning_rate": 3e-06, "loss": 1.3376, "step": 4839 }, { "epoch": 0.43177661804719214, "grad_norm": 1080.864501953125, "learning_rate": 3e-06, "loss": -15.6687, "step": 4840 }, { "epoch": 0.43186582809224316, "grad_norm": 781.4937133789062, "learning_rate": 3e-06, "loss": -22.375, "step": 4841 }, { "epoch": 0.43195503813729424, "grad_norm": 872.2640380859375, "learning_rate": 3e-06, "loss": -40.5698, "step": 4842 }, { "epoch": 0.4320442481823453, "grad_norm": 947.8171997070312, "learning_rate": 3e-06, "loss": -72.9156, "step": 4843 }, { "epoch": 0.4321334582273964, "grad_norm": 1033.6824951171875, "learning_rate": 3e-06, "loss": -21.7052, "step": 4844 }, { "epoch": 0.4322226682724475, "grad_norm": 855.9918823242188, "learning_rate": 3e-06, "loss": -17.0029, "step": 4845 }, { "epoch": 0.43231187831749857, "grad_norm": 1098.1748046875, "learning_rate": 3e-06, "loss": -39.1799, "step": 4846 }, { "epoch": 0.43240108836254965, "grad_norm": 768.6985473632812, "learning_rate": 3e-06, "loss": -42.4438, "step": 4847 }, { "epoch": 0.43249029840760067, "grad_norm": 970.845947265625, "learning_rate": 3e-06, "loss": -61.9566, "step": 4848 }, { "completion_length": 132.70833587646484, "epoch": 0.43257950845265175, "grad_norm": 1917.1629638671875, "learning_rate": 3e-06, "loss": 32.1103, "reward": 1.760562539100647, "reward_std": 0.5508699715137482, "rewards/correctness_reward_func": 1.1249999701976776, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1668124943971634, "step": 4849, "zero_std_ratio": 0.0 }, { "epoch": 0.43266871849770283, "grad_norm": 1720.861572265625, "learning_rate": 3e-06, "loss": -3.542, "step": 4850 }, { "epoch": 0.4327579285427539, "grad_norm": 1584.3790283203125, "learning_rate": 3e-06, "loss": 42.6637, "step": 4851 }, { "epoch": 0.432847138587805, "grad_norm": 1655.5177001953125, "learning_rate": 3e-06, "loss": 61.8116, "step": 4852 }, { "epoch": 0.43293634863285607, "grad_norm": 1394.019775390625, "learning_rate": 3e-06, "loss": 28.7491, "step": 4853 }, { "epoch": 0.43302555867790715, "grad_norm": 1541.73486328125, "learning_rate": 3e-06, "loss": 87.1405, "step": 4854 }, { "epoch": 0.43311476872295823, "grad_norm": 1758.8826904296875, "learning_rate": 3e-06, "loss": 16.6436, "step": 4855 }, { "epoch": 0.43320397876800926, "grad_norm": 1400.04638671875, "learning_rate": 3e-06, "loss": -19.8999, "step": 4856 }, { "epoch": 0.43329318881306034, "grad_norm": 1455.920166015625, "learning_rate": 3e-06, "loss": 11.8914, "step": 4857 }, { "epoch": 0.4333823988581114, "grad_norm": 1461.504150390625, "learning_rate": 3e-06, "loss": 23.733, "step": 4858 }, { "epoch": 0.4334716089031625, "grad_norm": 1183.6624755859375, "learning_rate": 3e-06, "loss": 5.0948, "step": 4859 }, { "epoch": 0.4335608189482136, "grad_norm": 1372.22265625, "learning_rate": 3e-06, "loss": 53.08, "step": 4860 }, { "completion_length": 128.18750762939453, "epoch": 0.43365002899326466, "grad_norm": 2610.615234375, "learning_rate": 3e-06, "loss": -190.8727, "reward": 2.094249963760376, "reward_std": 0.46839261054992676, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17758333310484886, "step": 4861, "zero_std_ratio": 0.0 }, { "epoch": 0.43373923903831574, "grad_norm": 2357.686767578125, "learning_rate": 3e-06, "loss": -84.2941, "step": 4862 }, { "epoch": 0.43382844908336676, "grad_norm": 3635.294189453125, "learning_rate": 3e-06, "loss": -120.0914, "step": 4863 }, { "epoch": 0.43391765912841784, "grad_norm": 2095.424560546875, "learning_rate": 3e-06, "loss": -125.8521, "step": 4864 }, { "epoch": 0.4340068691734689, "grad_norm": 2184.37548828125, "learning_rate": 3e-06, "loss": -193.8857, "step": 4865 }, { "epoch": 0.43409607921852, "grad_norm": 2022.6405029296875, "learning_rate": 3e-06, "loss": -164.6635, "step": 4866 }, { "epoch": 0.4341852892635711, "grad_norm": 1273.54736328125, "learning_rate": 3e-06, "loss": -202.7379, "step": 4867 }, { "epoch": 0.43427449930862216, "grad_norm": 2147.048828125, "learning_rate": 3e-06, "loss": -95.2552, "step": 4868 }, { "epoch": 0.43436370935367324, "grad_norm": 1961.6202392578125, "learning_rate": 3e-06, "loss": -146.9478, "step": 4869 }, { "epoch": 0.43445291939872427, "grad_norm": 1901.156494140625, "learning_rate": 3e-06, "loss": -130.3531, "step": 4870 }, { "epoch": 0.43454212944377535, "grad_norm": 1489.42041015625, "learning_rate": 3e-06, "loss": -198.9397, "step": 4871 }, { "epoch": 0.43463133948882643, "grad_norm": 2089.50927734375, "learning_rate": 3e-06, "loss": -172.8029, "step": 4872 }, { "completion_length": 135.20833587646484, "epoch": 0.4347205495338775, "grad_norm": 798.479736328125, "learning_rate": 3e-06, "loss": 9.3401, "reward": 2.2059375047683716, "reward_std": 0.3920647129416466, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17468750476837158, "step": 4873, "zero_std_ratio": 0.0 }, { "epoch": 0.4348097595789286, "grad_norm": 539.5579223632812, "learning_rate": 3e-06, "loss": 17.786, "step": 4874 }, { "epoch": 0.43489896962397967, "grad_norm": 568.3456420898438, "learning_rate": 3e-06, "loss": 4.1223, "step": 4875 }, { "epoch": 0.43498817966903075, "grad_norm": 879.725341796875, "learning_rate": 3e-06, "loss": -0.9839, "step": 4876 }, { "epoch": 0.43507738971408183, "grad_norm": 584.0048828125, "learning_rate": 3e-06, "loss": -9.3807, "step": 4877 }, { "epoch": 0.43516659975913285, "grad_norm": 486.4605712890625, "learning_rate": 3e-06, "loss": -5.1715, "step": 4878 }, { "epoch": 0.43525580980418394, "grad_norm": 787.57080078125, "learning_rate": 3e-06, "loss": 4.1706, "step": 4879 }, { "epoch": 0.435345019849235, "grad_norm": 519.5097045898438, "learning_rate": 3e-06, "loss": 12.0307, "step": 4880 }, { "epoch": 0.4354342298942861, "grad_norm": 500.0478515625, "learning_rate": 3e-06, "loss": 0.1486, "step": 4881 }, { "epoch": 0.4355234399393372, "grad_norm": 929.611572265625, "learning_rate": 3e-06, "loss": -10.7807, "step": 4882 }, { "epoch": 0.43561264998438826, "grad_norm": 399.7865295410156, "learning_rate": 3e-06, "loss": -14.896, "step": 4883 }, { "epoch": 0.43570186002943934, "grad_norm": 385.0538330078125, "learning_rate": 3e-06, "loss": -13.8193, "step": 4884 }, { "completion_length": 136.9166717529297, "epoch": 0.43579107007449036, "grad_norm": 1712.8873291015625, "learning_rate": 3e-06, "loss": -474.3248, "reward": 2.2843334078788757, "reward_std": 0.31879012286663055, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15933333337306976, "step": 4885, "zero_std_ratio": 0.0 }, { "epoch": 0.43588028011954144, "grad_norm": 2365.484375, "learning_rate": 3e-06, "loss": -408.279, "step": 4886 }, { "epoch": 0.4359694901645925, "grad_norm": 2289.011474609375, "learning_rate": 3e-06, "loss": -365.816, "step": 4887 }, { "epoch": 0.4360587002096436, "grad_norm": 2914.354736328125, "learning_rate": 3e-06, "loss": -444.0196, "step": 4888 }, { "epoch": 0.4361479102546947, "grad_norm": 2228.08203125, "learning_rate": 3e-06, "loss": -490.6989, "step": 4889 }, { "epoch": 0.43623712029974576, "grad_norm": 1949.876953125, "learning_rate": 3e-06, "loss": -406.9779, "step": 4890 }, { "epoch": 0.43632633034479684, "grad_norm": 2079.7431640625, "learning_rate": 3e-06, "loss": -486.6237, "step": 4891 }, { "epoch": 0.4364155403898479, "grad_norm": 2317.83154296875, "learning_rate": 3e-06, "loss": -432.0898, "step": 4892 }, { "epoch": 0.43650475043489895, "grad_norm": 2047.4058837890625, "learning_rate": 3e-06, "loss": -393.9342, "step": 4893 }, { "epoch": 0.43659396047995, "grad_norm": 2848.729248046875, "learning_rate": 3e-06, "loss": -473.8518, "step": 4894 }, { "epoch": 0.4366831705250011, "grad_norm": 2257.74560546875, "learning_rate": 3e-06, "loss": -525.7773, "step": 4895 }, { "epoch": 0.4367723805700522, "grad_norm": 2916.028564453125, "learning_rate": 3e-06, "loss": -436.6926, "step": 4896 }, { "completion_length": 110.56250381469727, "epoch": 0.43686159061510327, "grad_norm": 462.3417053222656, "learning_rate": 3e-06, "loss": 24.6718, "reward": 2.3465418815612793, "reward_std": 0.2508978545665741, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22154168784618378, "step": 4897, "zero_std_ratio": 0.0 }, { "epoch": 0.43695080066015435, "grad_norm": 228.47775268554688, "learning_rate": 3e-06, "loss": 0.2597, "step": 4898 }, { "epoch": 0.43704001070520543, "grad_norm": 194.57667541503906, "learning_rate": 3e-06, "loss": -7.4975, "step": 4899 }, { "epoch": 0.43712922075025645, "grad_norm": 182.21533203125, "learning_rate": 3e-06, "loss": 9.2837, "step": 4900 }, { "epoch": 0.43721843079530753, "grad_norm": 208.354736328125, "learning_rate": 3e-06, "loss": 0.6416, "step": 4901 }, { "epoch": 0.4373076408403586, "grad_norm": 217.32864379882812, "learning_rate": 3e-06, "loss": -1.5885, "step": 4902 }, { "epoch": 0.4373968508854097, "grad_norm": 446.7803649902344, "learning_rate": 3e-06, "loss": 20.484, "step": 4903 }, { "epoch": 0.4374860609304608, "grad_norm": 206.23187255859375, "learning_rate": 3e-06, "loss": -3.1168, "step": 4904 }, { "epoch": 0.43757527097551185, "grad_norm": 183.9177703857422, "learning_rate": 3e-06, "loss": -10.2896, "step": 4905 }, { "epoch": 0.43766448102056293, "grad_norm": 247.6621856689453, "learning_rate": 3e-06, "loss": 5.2115, "step": 4906 }, { "epoch": 0.437753691065614, "grad_norm": 215.6500244140625, "learning_rate": 3e-06, "loss": -2.343, "step": 4907 }, { "epoch": 0.43784290111066504, "grad_norm": 187.55506896972656, "learning_rate": 3e-06, "loss": -6.3174, "step": 4908 }, { "completion_length": 111.75000381469727, "epoch": 0.4379321111557161, "grad_norm": 932.2211303710938, "learning_rate": 3e-06, "loss": -32.9461, "reward": 2.4582293033599854, "reward_std": 0.39839141070842743, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.208229161798954, "step": 4909, "zero_std_ratio": 0.0 }, { "epoch": 0.4380213212007672, "grad_norm": 478.12646484375, "learning_rate": 3e-06, "loss": -22.2235, "step": 4910 }, { "epoch": 0.4381105312458183, "grad_norm": 1642.239501953125, "learning_rate": 3e-06, "loss": -36.4984, "step": 4911 }, { "epoch": 0.43819974129086936, "grad_norm": 499.28021240234375, "learning_rate": 3e-06, "loss": -25.3838, "step": 4912 }, { "epoch": 0.43828895133592044, "grad_norm": 811.1210327148438, "learning_rate": 3e-06, "loss": -26.3914, "step": 4913 }, { "epoch": 0.4383781613809715, "grad_norm": 503.590576171875, "learning_rate": 3e-06, "loss": -17.5022, "step": 4914 }, { "epoch": 0.43846737142602255, "grad_norm": 949.7000122070312, "learning_rate": 3e-06, "loss": -44.6905, "step": 4915 }, { "epoch": 0.4385565814710736, "grad_norm": 674.6856689453125, "learning_rate": 3e-06, "loss": -30.7684, "step": 4916 }, { "epoch": 0.4386457915161247, "grad_norm": 980.5775756835938, "learning_rate": 3e-06, "loss": -52.0799, "step": 4917 }, { "epoch": 0.4387350015611758, "grad_norm": 1017.7597045898438, "learning_rate": 3e-06, "loss": -41.6844, "step": 4918 }, { "epoch": 0.43882421160622687, "grad_norm": 1136.501708984375, "learning_rate": 3e-06, "loss": -49.9445, "step": 4919 }, { "epoch": 0.43891342165127795, "grad_norm": 800.0311889648438, "learning_rate": 3e-06, "loss": -35.5243, "step": 4920 }, { "completion_length": 133.52083587646484, "epoch": 0.439002631696329, "grad_norm": 3527.6337890625, "learning_rate": 3e-06, "loss": -213.315, "reward": 2.2666666507720947, "reward_std": 0.3952704668045044, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14166665822267532, "step": 4921, "zero_std_ratio": 0.125 }, { "epoch": 0.4390918417413801, "grad_norm": 3887.60595703125, "learning_rate": 3e-06, "loss": -272.1929, "step": 4922 }, { "epoch": 0.43918105178643113, "grad_norm": 5159.13037109375, "learning_rate": 3e-06, "loss": 86.9037, "step": 4923 }, { "epoch": 0.4392702618314822, "grad_norm": 5193.552734375, "learning_rate": 3e-06, "loss": -103.4785, "step": 4924 }, { "epoch": 0.4393594718765333, "grad_norm": 4378.69287109375, "learning_rate": 3e-06, "loss": 18.9231, "step": 4925 }, { "epoch": 0.43944868192158437, "grad_norm": 3221.58837890625, "learning_rate": 3e-06, "loss": 31.4523, "step": 4926 }, { "epoch": 0.43953789196663545, "grad_norm": 3224.938232421875, "learning_rate": 3e-06, "loss": -228.466, "step": 4927 }, { "epoch": 0.43962710201168653, "grad_norm": 3711.105712890625, "learning_rate": 3e-06, "loss": -255.0297, "step": 4928 }, { "epoch": 0.4397163120567376, "grad_norm": 4958.43505859375, "learning_rate": 3e-06, "loss": 61.5004, "step": 4929 }, { "epoch": 0.43980552210178864, "grad_norm": 4686.0810546875, "learning_rate": 3e-06, "loss": -118.6554, "step": 4930 }, { "epoch": 0.4398947321468397, "grad_norm": 3395.106201171875, "learning_rate": 3e-06, "loss": -1.1484, "step": 4931 }, { "epoch": 0.4399839421918908, "grad_norm": 3410.759765625, "learning_rate": 3e-06, "loss": 10.2204, "step": 4932 }, { "completion_length": 120.02083587646484, "epoch": 0.4400731522369419, "grad_norm": 604.1112060546875, "learning_rate": 3e-06, "loss": -15.1026, "reward": 2.202208459377289, "reward_std": 0.22792022675275803, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20220834016799927, "step": 4933, "zero_std_ratio": 0.0 }, { "epoch": 0.44016236228199296, "grad_norm": 326.7537841796875, "learning_rate": 3e-06, "loss": -19.0298, "step": 4934 }, { "epoch": 0.44025157232704404, "grad_norm": 250.4120330810547, "learning_rate": 3e-06, "loss": -24.5205, "step": 4935 }, { "epoch": 0.4403407823720951, "grad_norm": 395.6042785644531, "learning_rate": 3e-06, "loss": -24.2463, "step": 4936 }, { "epoch": 0.44042999241714614, "grad_norm": 490.93438720703125, "learning_rate": 3e-06, "loss": -45.514, "step": 4937 }, { "epoch": 0.4405192024621972, "grad_norm": 488.9787292480469, "learning_rate": 3e-06, "loss": -20.4931, "step": 4938 }, { "epoch": 0.4406084125072483, "grad_norm": 537.3056030273438, "learning_rate": 3e-06, "loss": -21.27, "step": 4939 }, { "epoch": 0.4406976225522994, "grad_norm": 461.5647277832031, "learning_rate": 3e-06, "loss": -26.4429, "step": 4940 }, { "epoch": 0.44078683259735046, "grad_norm": 342.1727294921875, "learning_rate": 3e-06, "loss": -29.6064, "step": 4941 }, { "epoch": 0.44087604264240154, "grad_norm": 466.01361083984375, "learning_rate": 3e-06, "loss": -34.4463, "step": 4942 }, { "epoch": 0.4409652526874526, "grad_norm": 486.9739685058594, "learning_rate": 3e-06, "loss": -55.8654, "step": 4943 }, { "epoch": 0.4410544627325037, "grad_norm": 548.7767944335938, "learning_rate": 3e-06, "loss": -25.9152, "step": 4944 }, { "completion_length": 141.5416717529297, "epoch": 0.44114367277755473, "grad_norm": 2532.386474609375, "learning_rate": 3e-06, "loss": -513.683, "reward": 2.013312578201294, "reward_std": 0.5771010220050812, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14872916415333748, "step": 4945, "zero_std_ratio": 0.0 }, { "epoch": 0.4412328828226058, "grad_norm": 7636.50390625, "learning_rate": 3e-06, "loss": -534.022, "step": 4946 }, { "epoch": 0.4413220928676569, "grad_norm": 2935.525634765625, "learning_rate": 3e-06, "loss": -611.0369, "step": 4947 }, { "epoch": 0.44141130291270797, "grad_norm": 3286.58544921875, "learning_rate": 3e-06, "loss": -537.5168, "step": 4948 }, { "epoch": 0.44150051295775905, "grad_norm": 2914.54052734375, "learning_rate": 3e-06, "loss": -514.6476, "step": 4949 }, { "epoch": 0.44158972300281013, "grad_norm": 3642.92041015625, "learning_rate": 3e-06, "loss": -398.1576, "step": 4950 }, { "epoch": 0.4416789330478612, "grad_norm": 2585.7119140625, "learning_rate": 3e-06, "loss": -522.0596, "step": 4951 }, { "epoch": 0.44176814309291224, "grad_norm": 3222.106689453125, "learning_rate": 3e-06, "loss": -547.4564, "step": 4952 }, { "epoch": 0.4418573531379633, "grad_norm": 3209.69873046875, "learning_rate": 3e-06, "loss": -629.0561, "step": 4953 }, { "epoch": 0.4419465631830144, "grad_norm": 3518.091796875, "learning_rate": 3e-06, "loss": -547.1844, "step": 4954 }, { "epoch": 0.4420357732280655, "grad_norm": 2543.301513671875, "learning_rate": 3e-06, "loss": -530.1975, "step": 4955 }, { "epoch": 0.44212498327311656, "grad_norm": 10371.1982421875, "learning_rate": 3e-06, "loss": -448.8305, "step": 4956 }, { "completion_length": 134.3333396911621, "epoch": 0.44221419331816764, "grad_norm": 2111.578857421875, "learning_rate": 3e-06, "loss": 24.1321, "reward": 2.5344998836517334, "reward_std": 0.26025331020355225, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20116667449474335, "step": 4957, "zero_std_ratio": 0.125 }, { "epoch": 0.4423034033632187, "grad_norm": 1751.8060302734375, "learning_rate": 3e-06, "loss": 12.3244, "step": 4958 }, { "epoch": 0.4423926134082698, "grad_norm": 2014.47998046875, "learning_rate": 3e-06, "loss": -38.7561, "step": 4959 }, { "epoch": 0.4424818234533208, "grad_norm": 1729.11083984375, "learning_rate": 3e-06, "loss": 13.8352, "step": 4960 }, { "epoch": 0.4425710334983719, "grad_norm": 1802.935546875, "learning_rate": 3e-06, "loss": -59.1178, "step": 4961 }, { "epoch": 0.442660243543423, "grad_norm": 1576.2713623046875, "learning_rate": 3e-06, "loss": -46.1488, "step": 4962 }, { "epoch": 0.44274945358847406, "grad_norm": 1939.35986328125, "learning_rate": 3e-06, "loss": 22.1796, "step": 4963 }, { "epoch": 0.44283866363352514, "grad_norm": 1826.6181640625, "learning_rate": 3e-06, "loss": 4.2715, "step": 4964 }, { "epoch": 0.4429278736785762, "grad_norm": 1660.52685546875, "learning_rate": 3e-06, "loss": -41.5383, "step": 4965 }, { "epoch": 0.4430170837236273, "grad_norm": 1500.8109130859375, "learning_rate": 3e-06, "loss": 3.0405, "step": 4966 }, { "epoch": 0.4431062937686783, "grad_norm": 1459.4525146484375, "learning_rate": 3e-06, "loss": -67.6826, "step": 4967 }, { "epoch": 0.4431955038137294, "grad_norm": 1657.65673828125, "learning_rate": 3e-06, "loss": -51.5723, "step": 4968 }, { "completion_length": 117.06250381469727, "epoch": 0.4432847138587805, "grad_norm": 2050.062255859375, "learning_rate": 3e-06, "loss": -365.8284, "reward": 1.9028334617614746, "reward_std": 0.366595596075058, "rewards/correctness_reward_func": 1.2083333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19450000673532486, "step": 4969, "zero_std_ratio": 0.0 }, { "epoch": 0.44337392390383157, "grad_norm": 2167.846923828125, "learning_rate": 3e-06, "loss": -329.9084, "step": 4970 }, { "epoch": 0.44346313394888265, "grad_norm": 1831.7462158203125, "learning_rate": 3e-06, "loss": -355.1531, "step": 4971 }, { "epoch": 0.44355234399393373, "grad_norm": 2638.92041015625, "learning_rate": 3e-06, "loss": -411.2054, "step": 4972 }, { "epoch": 0.4436415540389848, "grad_norm": 2043.7064208984375, "learning_rate": 3e-06, "loss": -364.5036, "step": 4973 }, { "epoch": 0.4437307640840359, "grad_norm": 2681.273681640625, "learning_rate": 3e-06, "loss": -350.5971, "step": 4974 }, { "epoch": 0.4438199741290869, "grad_norm": 1997.8013916015625, "learning_rate": 3e-06, "loss": -395.6633, "step": 4975 }, { "epoch": 0.443909184174138, "grad_norm": 3987.311279296875, "learning_rate": 3e-06, "loss": -359.5617, "step": 4976 }, { "epoch": 0.4439983942191891, "grad_norm": 1935.837890625, "learning_rate": 3e-06, "loss": -381.1286, "step": 4977 }, { "epoch": 0.44408760426424015, "grad_norm": 2417.363037109375, "learning_rate": 3e-06, "loss": -442.4392, "step": 4978 }, { "epoch": 0.44417681430929123, "grad_norm": 2208.86181640625, "learning_rate": 3e-06, "loss": -394.7316, "step": 4979 }, { "epoch": 0.4442660243543423, "grad_norm": 1990.800048828125, "learning_rate": 3e-06, "loss": -355.8077, "step": 4980 }, { "completion_length": 113.16667175292969, "epoch": 0.4443552343993934, "grad_norm": 2283.399658203125, "learning_rate": 3e-06, "loss": -123.2546, "reward": 2.3373334407806396, "reward_std": 0.4521911069750786, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2123333364725113, "step": 4981, "zero_std_ratio": 0.0 }, { "epoch": 0.4444444444444444, "grad_norm": 2096.0625, "learning_rate": 3e-06, "loss": -152.287, "step": 4982 }, { "epoch": 0.4445336544894955, "grad_norm": 1680.1812744140625, "learning_rate": 3e-06, "loss": -132.2925, "step": 4983 }, { "epoch": 0.4446228645345466, "grad_norm": 2131.746337890625, "learning_rate": 3e-06, "loss": -77.5604, "step": 4984 }, { "epoch": 0.44471207457959766, "grad_norm": 2567.088623046875, "learning_rate": 3e-06, "loss": -101.2305, "step": 4985 }, { "epoch": 0.44480128462464874, "grad_norm": 2307.953125, "learning_rate": 3e-06, "loss": -20.4885, "step": 4986 }, { "epoch": 0.4448904946696998, "grad_norm": 2095.7626953125, "learning_rate": 3e-06, "loss": -156.7293, "step": 4987 }, { "epoch": 0.4449797047147509, "grad_norm": 2344.3818359375, "learning_rate": 3e-06, "loss": -167.2778, "step": 4988 }, { "epoch": 0.445068914759802, "grad_norm": 2124.602294921875, "learning_rate": 3e-06, "loss": -145.1272, "step": 4989 }, { "epoch": 0.445158124804853, "grad_norm": 1749.4766845703125, "learning_rate": 3e-06, "loss": -109.2397, "step": 4990 }, { "epoch": 0.4452473348499041, "grad_norm": 2732.352294921875, "learning_rate": 3e-06, "loss": -127.8369, "step": 4991 }, { "epoch": 0.44533654489495517, "grad_norm": 3887.142333984375, "learning_rate": 3e-06, "loss": -19.6074, "step": 4992 }, { "completion_length": 117.68750381469727, "epoch": 0.44542575494000625, "grad_norm": 2400.797607421875, "learning_rate": 3e-06, "loss": -113.2892, "reward": 2.0996875762939453, "reward_std": 0.5116298198699951, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20385416597127914, "step": 4993, "zero_std_ratio": 0.0 }, { "epoch": 0.4455149649850573, "grad_norm": 2056.5302734375, "learning_rate": 3e-06, "loss": 51.3632, "step": 4994 }, { "epoch": 0.4456041750301084, "grad_norm": 2440.17236328125, "learning_rate": 3e-06, "loss": 124.6281, "step": 4995 }, { "epoch": 0.4456933850751595, "grad_norm": 1901.8880615234375, "learning_rate": 3e-06, "loss": 16.3196, "step": 4996 }, { "epoch": 0.4457825951202105, "grad_norm": 1567.1207275390625, "learning_rate": 3e-06, "loss": 141.5414, "step": 4997 }, { "epoch": 0.4458718051652616, "grad_norm": 1841.68603515625, "learning_rate": 3e-06, "loss": 259.5959, "step": 4998 }, { "epoch": 0.4459610152103127, "grad_norm": 2199.729248046875, "learning_rate": 3e-06, "loss": -131.7332, "step": 4999 }, { "epoch": 0.44605022525536375, "grad_norm": 2258.56103515625, "learning_rate": 3e-06, "loss": 28.2476, "step": 5000 } ], "logging_steps": 1, "max_steps": 112090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }