{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22302511262768188, "eval_steps": 1000, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 225.1666717529297, "epoch": 8.921004505107276e-05, "grad_norm": 62.695068359375, "learning_rate": 2.5e-07, "loss": 12.7202, "reward": 0.19306249171495438, "reward_std": 0.5882241576910019, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22360415756702423, "step": 1, "zero_std_ratio": 0.0 }, { "epoch": 0.00017842009010214551, "grad_norm": 59.6503791809082, "learning_rate": 5e-07, "loss": 13.0732, "step": 2 }, { "epoch": 0.00026763013515321824, "grad_norm": 69.65516662597656, "learning_rate": 7.5e-07, "loss": 12.9681, "step": 3 }, { "epoch": 0.00035684018020429103, "grad_norm": 57.81648635864258, "learning_rate": 1e-06, "loss": 8.1042, "step": 4 }, { "epoch": 0.00044605022525536376, "grad_norm": 57.6408576965332, "learning_rate": 1.25e-06, "loss": 8.6056, "step": 5 }, { "epoch": 0.0005352602703064365, "grad_norm": 58.459903717041016, "learning_rate": 1.5e-06, "loss": 10.4929, "step": 6 }, { "epoch": 0.0006244703153575092, "grad_norm": 62.41658020019531, "learning_rate": 1.7500000000000002e-06, "loss": 13.1206, "step": 7 }, { "epoch": 0.0007136803604085821, "grad_norm": 66.22370910644531, "learning_rate": 2e-06, "loss": 13.2007, "step": 8 }, { "epoch": 0.0008028904054596548, "grad_norm": 66.21946716308594, "learning_rate": 2.25e-06, "loss": 12.3522, "step": 9 }, { "epoch": 0.0008921004505107275, "grad_norm": 65.43058776855469, "learning_rate": 2.5e-06, "loss": 7.9566, "step": 10 }, { "epoch": 0.0009813104955618004, "grad_norm": 54.532962799072266, "learning_rate": 2.75e-06, "loss": 8.8616, "step": 11 }, { "epoch": 0.001070520540612873, "grad_norm": 56.53645706176758, "learning_rate": 3e-06, "loss": 10.3095, "step": 12 }, { "completion_length": 222.9791717529297, "epoch": 0.0011597305856639458, "grad_norm": 78.4708023071289, "learning_rate": 3e-06, "loss": -24.6031, "reward": 0.011666670441627502, "reward_std": 0.524684801697731, "rewards/correctness_reward_func": 0.1250000037252903, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550000049173832, "step": 13, "zero_std_ratio": 0.0 }, { "epoch": 0.0012489406307150184, "grad_norm": 88.30101776123047, "learning_rate": 3e-06, "loss": -18.3145, "step": 14 }, { "epoch": 0.0013381506757660913, "grad_norm": 101.97128295898438, "learning_rate": 3e-06, "loss": -7.4151, "step": 15 }, { "epoch": 0.0014273607208171641, "grad_norm": 91.58382415771484, "learning_rate": 3e-06, "loss": -8.9073, "step": 16 }, { "epoch": 0.0015165707658682367, "grad_norm": 90.10670471191406, "learning_rate": 3e-06, "loss": -13.5176, "step": 17 }, { "epoch": 0.0016057808109193096, "grad_norm": 80.67254638671875, "learning_rate": 3e-06, "loss": -17.2813, "step": 18 }, { "epoch": 0.0016949908559703822, "grad_norm": 75.51331329345703, "learning_rate": 3e-06, "loss": -24.6926, "step": 19 }, { "epoch": 0.001784200901021455, "grad_norm": 78.15167999267578, "learning_rate": 3e-06, "loss": -18.5973, "step": 20 }, { "epoch": 0.0018734109460725277, "grad_norm": 89.70745086669922, "learning_rate": 3e-06, "loss": -7.9364, "step": 21 }, { "epoch": 0.0019626209911236007, "grad_norm": 89.28164672851562, "learning_rate": 3e-06, "loss": -9.434, "step": 22 }, { "epoch": 0.002051831036174673, "grad_norm": 98.30489349365234, "learning_rate": 3e-06, "loss": -14.6494, "step": 23 }, { "epoch": 0.002141041081225746, "grad_norm": 92.3221206665039, "learning_rate": 3e-06, "loss": -17.5654, "step": 24 }, { "completion_length": 224.95834350585938, "epoch": 0.002230251126276819, "grad_norm": 77.60931396484375, "learning_rate": 3e-06, "loss": 12.7299, "reward": 0.08664583414793015, "reward_std": 0.529650554060936, "rewards/correctness_reward_func": 0.1666666679084301, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12168749794363976, "step": 25, "zero_std_ratio": 0.0 }, { "epoch": 0.0023194611713278916, "grad_norm": 55.48664474487305, "learning_rate": 3e-06, "loss": 11.3297, "step": 26 }, { "epoch": 0.0024086712163789645, "grad_norm": 64.88197326660156, "learning_rate": 3e-06, "loss": 7.6398, "step": 27 }, { "epoch": 0.002497881261430037, "grad_norm": 66.41521453857422, "learning_rate": 3e-06, "loss": 10.9742, "step": 28 }, { "epoch": 0.0025870913064811097, "grad_norm": 60.356266021728516, "learning_rate": 3e-06, "loss": 18.3629, "step": 29 }, { "epoch": 0.0026763013515321826, "grad_norm": 67.53816986083984, "learning_rate": 3e-06, "loss": 10.3122, "step": 30 }, { "epoch": 0.0027655113965832554, "grad_norm": 81.81299591064453, "learning_rate": 3e-06, "loss": 12.4031, "step": 31 }, { "epoch": 0.0028547214416343282, "grad_norm": 58.01384735107422, "learning_rate": 3e-06, "loss": 11.3115, "step": 32 }, { "epoch": 0.0029439314866854006, "grad_norm": 60.38798522949219, "learning_rate": 3e-06, "loss": 7.5438, "step": 33 }, { "epoch": 0.0030331415317364735, "grad_norm": 76.68485260009766, "learning_rate": 3e-06, "loss": 9.8314, "step": 34 }, { "epoch": 0.0031223515767875463, "grad_norm": 63.667381286621094, "learning_rate": 3e-06, "loss": 18.0907, "step": 35 }, { "epoch": 0.003211561621838619, "grad_norm": 64.93324279785156, "learning_rate": 3e-06, "loss": 9.6529, "step": 36 }, { "completion_length": 200.9166717529297, "epoch": 0.003300771666889692, "grad_norm": 55.603302001953125, "learning_rate": 3e-06, "loss": -2.1381, "reward": 0.20900000631809235, "reward_std": 0.5408279597759247, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1347499955445528, "step": 37, "zero_std_ratio": 0.0 }, { "epoch": 0.0033899817119407644, "grad_norm": 59.070777893066406, "learning_rate": 3e-06, "loss": -6.7825, "step": 38 }, { "epoch": 0.0034791917569918372, "grad_norm": 73.52457427978516, "learning_rate": 3e-06, "loss": -11.5592, "step": 39 }, { "epoch": 0.00356840180204291, "grad_norm": 68.8139419555664, "learning_rate": 3e-06, "loss": -3.9847, "step": 40 }, { "epoch": 0.003657611847093983, "grad_norm": 74.64259338378906, "learning_rate": 3e-06, "loss": -7.7023, "step": 41 }, { "epoch": 0.0037468218921450553, "grad_norm": 68.76261901855469, "learning_rate": 3e-06, "loss": -11.4536, "step": 42 }, { "epoch": 0.003836031937196128, "grad_norm": 57.10056686401367, "learning_rate": 3e-06, "loss": -3.0195, "step": 43 }, { "epoch": 0.003925241982247201, "grad_norm": 57.4798583984375, "learning_rate": 3e-06, "loss": -7.3677, "step": 44 }, { "epoch": 0.004014452027298274, "grad_norm": 62.251949310302734, "learning_rate": 3e-06, "loss": -12.4481, "step": 45 }, { "epoch": 0.004103662072349346, "grad_norm": 67.0556640625, "learning_rate": 3e-06, "loss": -4.2431, "step": 46 }, { "epoch": 0.0041928721174004195, "grad_norm": 79.22687530517578, "learning_rate": 3e-06, "loss": -8.9896, "step": 47 }, { "epoch": 0.004282082162451492, "grad_norm": 83.6895980834961, "learning_rate": 3e-06, "loss": -12.4645, "step": 48 }, { "completion_length": 186.0, "epoch": 0.004371292207502565, "grad_norm": 187.8282928466797, "learning_rate": 3e-06, "loss": -44.8522, "reward": 0.5453333556652069, "reward_std": 0.9364342093467712, "rewards/correctness_reward_func": 0.4583333358168602, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06924999551847577, "step": 49, "zero_std_ratio": 0.0 }, { "epoch": 0.004460502252553638, "grad_norm": 92.9270248413086, "learning_rate": 3e-06, "loss": -41.2773, "step": 50 }, { "epoch": 0.00454971229760471, "grad_norm": 212.18917846679688, "learning_rate": 3e-06, "loss": -42.3882, "step": 51 }, { "epoch": 0.004638922342655783, "grad_norm": 102.22235870361328, "learning_rate": 3e-06, "loss": -42.8879, "step": 52 }, { "epoch": 0.004728132387706856, "grad_norm": 79.1269302368164, "learning_rate": 3e-06, "loss": -44.708, "step": 53 }, { "epoch": 0.004817342432757929, "grad_norm": 94.53079986572266, "learning_rate": 3e-06, "loss": -41.656, "step": 54 }, { "epoch": 0.004906552477809001, "grad_norm": 91.7303695678711, "learning_rate": 3e-06, "loss": -45.3257, "step": 55 }, { "epoch": 0.004995762522860074, "grad_norm": 92.66773986816406, "learning_rate": 3e-06, "loss": -41.4113, "step": 56 }, { "epoch": 0.005084972567911147, "grad_norm": 123.76467895507812, "learning_rate": 3e-06, "loss": -43.4643, "step": 57 }, { "epoch": 0.0051741826129622194, "grad_norm": 109.21142578125, "learning_rate": 3e-06, "loss": -44.7136, "step": 58 }, { "epoch": 0.005263392658013293, "grad_norm": 83.24272155761719, "learning_rate": 3e-06, "loss": -45.7862, "step": 59 }, { "epoch": 0.005352602703064365, "grad_norm": 94.45966339111328, "learning_rate": 3e-06, "loss": -42.5492, "step": 60 }, { "completion_length": 225.625, "epoch": 0.0054418127481154375, "grad_norm": 125.85611724853516, "learning_rate": 3e-06, "loss": 28.3825, "reward": 0.31822918355464935, "reward_std": 0.9613562524318695, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.1145833320915699, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17135417833924294, "step": 61, "zero_std_ratio": 0.0 }, { "epoch": 0.005531022793166511, "grad_norm": 128.64669799804688, "learning_rate": 3e-06, "loss": 4.8596, "step": 62 }, { "epoch": 0.005620232838217583, "grad_norm": 287.9391784667969, "learning_rate": 3e-06, "loss": 20.5521, "step": 63 }, { "epoch": 0.0057094428832686565, "grad_norm": 111.01509857177734, "learning_rate": 3e-06, "loss": 16.6241, "step": 64 }, { "epoch": 0.005798652928319729, "grad_norm": 123.25679016113281, "learning_rate": 3e-06, "loss": 6.8919, "step": 65 }, { "epoch": 0.005887862973370801, "grad_norm": 115.68987274169922, "learning_rate": 3e-06, "loss": 19.3061, "step": 66 }, { "epoch": 0.0059770730184218746, "grad_norm": 128.9923553466797, "learning_rate": 3e-06, "loss": 27.4792, "step": 67 }, { "epoch": 0.006066283063472947, "grad_norm": 130.64230346679688, "learning_rate": 3e-06, "loss": 3.8702, "step": 68 }, { "epoch": 0.00615549310852402, "grad_norm": 169.2925262451172, "learning_rate": 3e-06, "loss": 19.2163, "step": 69 }, { "epoch": 0.006244703153575093, "grad_norm": 104.88905334472656, "learning_rate": 3e-06, "loss": 14.5854, "step": 70 }, { "epoch": 0.006333913198626165, "grad_norm": 134.32022094726562, "learning_rate": 3e-06, "loss": 5.6117, "step": 71 }, { "epoch": 0.006423123243677238, "grad_norm": 124.52132415771484, "learning_rate": 3e-06, "loss": 18.0908, "step": 72 }, { "completion_length": 203.81250762939453, "epoch": 0.006512333288728311, "grad_norm": 87.01981353759766, "learning_rate": 3e-06, "loss": 14.5961, "reward": 0.25443750619888306, "reward_std": 0.6893003582954407, "rewards/correctness_reward_func": 0.2916666641831398, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13097916916012764, "step": 73, "zero_std_ratio": 0.0 }, { "epoch": 0.006601543333779384, "grad_norm": 83.70246887207031, "learning_rate": 3e-06, "loss": 20.7203, "step": 74 }, { "epoch": 0.006690753378830456, "grad_norm": 80.23466491699219, "learning_rate": 3e-06, "loss": 30.3319, "step": 75 }, { "epoch": 0.006779963423881529, "grad_norm": 74.68209838867188, "learning_rate": 3e-06, "loss": 23.2, "step": 76 }, { "epoch": 0.006869173468932602, "grad_norm": 81.28849029541016, "learning_rate": 3e-06, "loss": 11.7216, "step": 77 }, { "epoch": 0.0069583835139836745, "grad_norm": 85.60411071777344, "learning_rate": 3e-06, "loss": 19.9348, "step": 78 }, { "epoch": 0.007047593559034747, "grad_norm": 95.26403045654297, "learning_rate": 3e-06, "loss": 13.5735, "step": 79 }, { "epoch": 0.00713680360408582, "grad_norm": 81.69352722167969, "learning_rate": 3e-06, "loss": 19.4906, "step": 80 }, { "epoch": 0.0072260136491368926, "grad_norm": 80.9581527709961, "learning_rate": 3e-06, "loss": 29.1989, "step": 81 }, { "epoch": 0.007315223694187966, "grad_norm": 87.37995147705078, "learning_rate": 3e-06, "loss": 23.4541, "step": 82 }, { "epoch": 0.007404433739239038, "grad_norm": 90.7470932006836, "learning_rate": 3e-06, "loss": 10.7907, "step": 83 }, { "epoch": 0.007493643784290111, "grad_norm": 352.26953125, "learning_rate": 3e-06, "loss": 18.1423, "step": 84 }, { "completion_length": 185.6875, "epoch": 0.007582853829341184, "grad_norm": 78.5768051147461, "learning_rate": 3e-06, "loss": -2.0743, "reward": 0.6054166778922081, "reward_std": 0.8349271714687347, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.019583335146307945, "step": 85, "zero_std_ratio": 0.0 }, { "epoch": 0.007672063874392256, "grad_norm": 125.65332794189453, "learning_rate": 3e-06, "loss": -18.0183, "step": 86 }, { "epoch": 0.00776127391944333, "grad_norm": 93.01673889160156, "learning_rate": 3e-06, "loss": -2.9219, "step": 87 }, { "epoch": 0.007850483964494403, "grad_norm": 85.38358306884766, "learning_rate": 3e-06, "loss": -7.3606, "step": 88 }, { "epoch": 0.007939694009545474, "grad_norm": 99.59243774414062, "learning_rate": 3e-06, "loss": -18.9376, "step": 89 }, { "epoch": 0.008028904054596548, "grad_norm": 96.83404541015625, "learning_rate": 3e-06, "loss": -7.9748, "step": 90 }, { "epoch": 0.008118114099647621, "grad_norm": 81.16954803466797, "learning_rate": 3e-06, "loss": -4.2134, "step": 91 }, { "epoch": 0.008207324144698692, "grad_norm": 123.15869140625, "learning_rate": 3e-06, "loss": -19.8823, "step": 92 }, { "epoch": 0.008296534189749766, "grad_norm": 93.05419158935547, "learning_rate": 3e-06, "loss": -4.5813, "step": 93 }, { "epoch": 0.008385744234800839, "grad_norm": 106.2331314086914, "learning_rate": 3e-06, "loss": -8.6969, "step": 94 }, { "epoch": 0.00847495427985191, "grad_norm": 99.65939331054688, "learning_rate": 3e-06, "loss": -21.3275, "step": 95 }, { "epoch": 0.008564164324902984, "grad_norm": 94.40375518798828, "learning_rate": 3e-06, "loss": -9.7937, "step": 96 }, { "completion_length": 217.25000762939453, "epoch": 0.008653374369954057, "grad_norm": 133.5598907470703, "learning_rate": 3e-06, "loss": -68.7329, "reward": 0.6968958526849747, "reward_std": 0.7409922480583191, "rewards/correctness_reward_func": 0.5833333283662796, "rewards/int_reward_func": 0.1979166641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08435416780412197, "step": 97, "zero_std_ratio": 0.0 }, { "epoch": 0.00874258441500513, "grad_norm": 136.60848999023438, "learning_rate": 3e-06, "loss": -74.5256, "step": 98 }, { "epoch": 0.008831794460056202, "grad_norm": 123.70120239257812, "learning_rate": 3e-06, "loss": -59.6774, "step": 99 }, { "epoch": 0.008921004505107275, "grad_norm": 150.22532653808594, "learning_rate": 3e-06, "loss": -69.5624, "step": 100 }, { "epoch": 0.009010214550158348, "grad_norm": 126.68507385253906, "learning_rate": 3e-06, "loss": -62.8973, "step": 101 }, { "epoch": 0.00909942459520942, "grad_norm": 105.47962951660156, "learning_rate": 3e-06, "loss": -61.6182, "step": 102 }, { "epoch": 0.009188634640260493, "grad_norm": 144.26048278808594, "learning_rate": 3e-06, "loss": -70.5109, "step": 103 }, { "epoch": 0.009277844685311567, "grad_norm": 141.22325134277344, "learning_rate": 3e-06, "loss": -76.5479, "step": 104 }, { "epoch": 0.009367054730362638, "grad_norm": 139.37173461914062, "learning_rate": 3e-06, "loss": -62.353, "step": 105 }, { "epoch": 0.009456264775413711, "grad_norm": 150.77801513671875, "learning_rate": 3e-06, "loss": -72.2384, "step": 106 }, { "epoch": 0.009545474820464785, "grad_norm": 138.2374267578125, "learning_rate": 3e-06, "loss": -65.3746, "step": 107 }, { "epoch": 0.009634684865515858, "grad_norm": 132.50453186035156, "learning_rate": 3e-06, "loss": -64.141, "step": 108 }, { "completion_length": 198.125, "epoch": 0.00972389491056693, "grad_norm": 187.5413055419922, "learning_rate": 3e-06, "loss": 44.1489, "reward": 0.7788957953453064, "reward_std": 0.7549726963043213, "rewards/correctness_reward_func": 0.6666666567325592, "rewards/int_reward_func": 0.1770833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06485416181385517, "step": 109, "zero_std_ratio": 0.0 }, { "epoch": 0.009813104955618003, "grad_norm": 138.89434814453125, "learning_rate": 3e-06, "loss": 51.6646, "step": 110 }, { "epoch": 0.009902315000669076, "grad_norm": 128.95484924316406, "learning_rate": 3e-06, "loss": 31.7993, "step": 111 }, { "epoch": 0.009991525045720148, "grad_norm": 126.7931900024414, "learning_rate": 3e-06, "loss": 38.5454, "step": 112 }, { "epoch": 0.01008073509077122, "grad_norm": 125.33599853515625, "learning_rate": 3e-06, "loss": 40.3822, "step": 113 }, { "epoch": 0.010169945135822294, "grad_norm": 139.41482543945312, "learning_rate": 3e-06, "loss": 32.5052, "step": 114 }, { "epoch": 0.010259155180873366, "grad_norm": 169.09432983398438, "learning_rate": 3e-06, "loss": 43.5542, "step": 115 }, { "epoch": 0.010348365225924439, "grad_norm": 133.872802734375, "learning_rate": 3e-06, "loss": 50.3469, "step": 116 }, { "epoch": 0.010437575270975512, "grad_norm": 125.77018737792969, "learning_rate": 3e-06, "loss": 31.112, "step": 117 }, { "epoch": 0.010526785316026585, "grad_norm": 128.32257080078125, "learning_rate": 3e-06, "loss": 36.629, "step": 118 }, { "epoch": 0.010615995361077657, "grad_norm": 124.38401794433594, "learning_rate": 3e-06, "loss": 39.8284, "step": 119 }, { "epoch": 0.01070520540612873, "grad_norm": 138.24668884277344, "learning_rate": 3e-06, "loss": 31.1433, "step": 120 }, { "completion_length": 191.6041717529297, "epoch": 0.010794415451179804, "grad_norm": 199.1646270751953, "learning_rate": 3e-06, "loss": 86.314, "reward": 1.0014583468437195, "reward_std": 0.8148851096630096, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.2291666641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10270833596587181, "step": 121, "zero_std_ratio": 0.0 }, { "epoch": 0.010883625496230875, "grad_norm": 195.7254638671875, "learning_rate": 3e-06, "loss": 76.1289, "step": 122 }, { "epoch": 0.010972835541281948, "grad_norm": 175.13900756835938, "learning_rate": 3e-06, "loss": 86.1448, "step": 123 }, { "epoch": 0.011062045586333022, "grad_norm": 182.21661376953125, "learning_rate": 3e-06, "loss": 90.0805, "step": 124 }, { "epoch": 0.011151255631384093, "grad_norm": 189.17214965820312, "learning_rate": 3e-06, "loss": 76.0951, "step": 125 }, { "epoch": 0.011240465676435166, "grad_norm": 195.55718994140625, "learning_rate": 3e-06, "loss": 89.5242, "step": 126 }, { "epoch": 0.01132967572148624, "grad_norm": 171.1396484375, "learning_rate": 3e-06, "loss": 82.3705, "step": 127 }, { "epoch": 0.011418885766537313, "grad_norm": 189.04995727539062, "learning_rate": 3e-06, "loss": 71.8677, "step": 128 }, { "epoch": 0.011508095811588384, "grad_norm": 162.9297332763672, "learning_rate": 3e-06, "loss": 81.2432, "step": 129 }, { "epoch": 0.011597305856639458, "grad_norm": 173.23104858398438, "learning_rate": 3e-06, "loss": 85.8069, "step": 130 }, { "epoch": 0.011686515901690531, "grad_norm": 162.6637420654297, "learning_rate": 3e-06, "loss": 69.8347, "step": 131 }, { "epoch": 0.011775725946741603, "grad_norm": 190.06675720214844, "learning_rate": 3e-06, "loss": 84.5222, "step": 132 }, { "completion_length": 207.33334350585938, "epoch": 0.011864935991792676, "grad_norm": 147.16957092285156, "learning_rate": 3e-06, "loss": -63.4697, "reward": 0.6791666746139526, "reward_std": 1.0425111949443817, "rewards/correctness_reward_func": 0.5833333134651184, "rewards/int_reward_func": 0.1770833358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08124999818392098, "step": 133, "zero_std_ratio": 0.0 }, { "epoch": 0.011954146036843749, "grad_norm": 154.6719970703125, "learning_rate": 3e-06, "loss": -70.5112, "step": 134 }, { "epoch": 0.01204335608189482, "grad_norm": 137.32408142089844, "learning_rate": 3e-06, "loss": -41.1322, "step": 135 }, { "epoch": 0.012132566126945894, "grad_norm": 126.37704467773438, "learning_rate": 3e-06, "loss": -53.2367, "step": 136 }, { "epoch": 0.012221776171996967, "grad_norm": 152.24891662597656, "learning_rate": 3e-06, "loss": -58.4567, "step": 137 }, { "epoch": 0.01231098621704804, "grad_norm": 116.28028106689453, "learning_rate": 3e-06, "loss": -46.2973, "step": 138 }, { "epoch": 0.012400196262099112, "grad_norm": 152.08795166015625, "learning_rate": 3e-06, "loss": -62.8325, "step": 139 }, { "epoch": 0.012489406307150185, "grad_norm": 146.10671997070312, "learning_rate": 3e-06, "loss": -71.6559, "step": 140 }, { "epoch": 0.012578616352201259, "grad_norm": 149.14556884765625, "learning_rate": 3e-06, "loss": -42.1534, "step": 141 }, { "epoch": 0.01266782639725233, "grad_norm": 151.06182861328125, "learning_rate": 3e-06, "loss": -55.6968, "step": 142 }, { "epoch": 0.012757036442303403, "grad_norm": 145.29530334472656, "learning_rate": 3e-06, "loss": -60.2759, "step": 143 }, { "epoch": 0.012846246487354477, "grad_norm": 124.00696563720703, "learning_rate": 3e-06, "loss": -48.5856, "step": 144 }, { "completion_length": 200.14583587646484, "epoch": 0.012935456532405548, "grad_norm": 104.97675323486328, "learning_rate": 3e-06, "loss": -22.2294, "reward": 0.21922918409109116, "reward_std": 0.6296879947185516, "rewards/correctness_reward_func": 0.2916666716337204, "rewards/int_reward_func": 0.062499999068677425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13493750616908073, "step": 145, "zero_std_ratio": 0.0 }, { "epoch": 0.013024666577456621, "grad_norm": 83.18937683105469, "learning_rate": 3e-06, "loss": -22.3765, "step": 146 }, { "epoch": 0.013113876622507695, "grad_norm": 96.22801971435547, "learning_rate": 3e-06, "loss": -22.5564, "step": 147 }, { "epoch": 0.013203086667558768, "grad_norm": 102.87374877929688, "learning_rate": 3e-06, "loss": -24.9001, "step": 148 }, { "epoch": 0.01329229671260984, "grad_norm": 110.96674346923828, "learning_rate": 3e-06, "loss": -18.8972, "step": 149 }, { "epoch": 0.013381506757660913, "grad_norm": 91.87604522705078, "learning_rate": 3e-06, "loss": -18.1615, "step": 150 }, { "epoch": 0.013470716802711986, "grad_norm": 88.4422836303711, "learning_rate": 3e-06, "loss": -23.0431, "step": 151 }, { "epoch": 0.013559926847763058, "grad_norm": 83.86327362060547, "learning_rate": 3e-06, "loss": -23.25, "step": 152 }, { "epoch": 0.01364913689281413, "grad_norm": 82.81922149658203, "learning_rate": 3e-06, "loss": -23.1331, "step": 153 }, { "epoch": 0.013738346937865204, "grad_norm": 104.8452377319336, "learning_rate": 3e-06, "loss": -26.8428, "step": 154 }, { "epoch": 0.013827556982916276, "grad_norm": 92.94257354736328, "learning_rate": 3e-06, "loss": -20.0667, "step": 155 }, { "epoch": 0.013916767027967349, "grad_norm": 84.95638275146484, "learning_rate": 3e-06, "loss": -19.1472, "step": 156 }, { "completion_length": 231.7916717529297, "epoch": 0.014005977073018422, "grad_norm": 139.96688842773438, "learning_rate": 3e-06, "loss": 21.1797, "reward": 0.6720625460147858, "reward_std": 0.9181468784809113, "rewards/correctness_reward_func": 0.5833333432674408, "rewards/int_reward_func": 0.2187500074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13002083078026772, "step": 157, "zero_std_ratio": 0.0 }, { "epoch": 0.014095187118069494, "grad_norm": 109.121337890625, "learning_rate": 3e-06, "loss": 10.5428, "step": 158 }, { "epoch": 0.014184397163120567, "grad_norm": 94.9039306640625, "learning_rate": 3e-06, "loss": 4.8126, "step": 159 }, { "epoch": 0.01427360720817164, "grad_norm": 109.7251968383789, "learning_rate": 3e-06, "loss": 5.2961, "step": 160 }, { "epoch": 0.014362817253222714, "grad_norm": 103.42703247070312, "learning_rate": 3e-06, "loss": 4.0648, "step": 161 }, { "epoch": 0.014452027298273785, "grad_norm": 127.93770599365234, "learning_rate": 3e-06, "loss": 7.0101, "step": 162 }, { "epoch": 0.014541237343324858, "grad_norm": 145.8150634765625, "learning_rate": 3e-06, "loss": 18.3559, "step": 163 }, { "epoch": 0.014630447388375932, "grad_norm": 116.2653579711914, "learning_rate": 3e-06, "loss": 8.3424, "step": 164 }, { "epoch": 0.014719657433427003, "grad_norm": 104.55130767822266, "learning_rate": 3e-06, "loss": 2.084, "step": 165 }, { "epoch": 0.014808867478478076, "grad_norm": 114.84294128417969, "learning_rate": 3e-06, "loss": 2.2571, "step": 166 }, { "epoch": 0.01489807752352915, "grad_norm": 99.8189468383789, "learning_rate": 3e-06, "loss": 0.9219, "step": 167 }, { "epoch": 0.014987287568580221, "grad_norm": 142.80715942382812, "learning_rate": 3e-06, "loss": 3.505, "step": 168 }, { "completion_length": 214.77083587646484, "epoch": 0.015076497613631295, "grad_norm": 114.3720703125, "learning_rate": 3e-06, "loss": -32.1329, "reward": 1.0511458218097687, "reward_std": 1.0028848350048065, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17802083492279053, "step": 169, "zero_std_ratio": 0.0 }, { "epoch": 0.015165707658682368, "grad_norm": 100.3597412109375, "learning_rate": 3e-06, "loss": -38.7511, "step": 170 }, { "epoch": 0.015254917703733441, "grad_norm": 108.30574035644531, "learning_rate": 3e-06, "loss": -46.3083, "step": 171 }, { "epoch": 0.015344127748784513, "grad_norm": 116.34545135498047, "learning_rate": 3e-06, "loss": -39.7363, "step": 172 }, { "epoch": 0.015433337793835586, "grad_norm": 113.52851104736328, "learning_rate": 3e-06, "loss": -34.5686, "step": 173 }, { "epoch": 0.01552254783888666, "grad_norm": 110.65509796142578, "learning_rate": 3e-06, "loss": -32.8796, "step": 174 }, { "epoch": 0.01561175788393773, "grad_norm": 107.06590270996094, "learning_rate": 3e-06, "loss": -32.6552, "step": 175 }, { "epoch": 0.015700967928988806, "grad_norm": 100.2861557006836, "learning_rate": 3e-06, "loss": -39.6106, "step": 176 }, { "epoch": 0.015790177974039876, "grad_norm": 107.69467163085938, "learning_rate": 3e-06, "loss": -46.9244, "step": 177 }, { "epoch": 0.01587938801909095, "grad_norm": 96.8420181274414, "learning_rate": 3e-06, "loss": -40.93, "step": 178 }, { "epoch": 0.015968598064142022, "grad_norm": 113.12389373779297, "learning_rate": 3e-06, "loss": -37.0258, "step": 179 }, { "epoch": 0.016057808109193095, "grad_norm": 116.10971069335938, "learning_rate": 3e-06, "loss": -34.9046, "step": 180 }, { "completion_length": 218.64583587646484, "epoch": 0.01614701815424417, "grad_norm": 307.6673889160156, "learning_rate": 3e-06, "loss": -29.0457, "reward": 1.4360832571983337, "reward_std": 1.0610616505146027, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15766665898263454, "step": 181, "zero_std_ratio": 0.0 }, { "epoch": 0.016236228199295242, "grad_norm": 125.99212646484375, "learning_rate": 3e-06, "loss": 1.016, "step": 182 }, { "epoch": 0.01632543824434631, "grad_norm": 112.21588897705078, "learning_rate": 3e-06, "loss": -17.115, "step": 183 }, { "epoch": 0.016414648289397385, "grad_norm": 118.06622314453125, "learning_rate": 3e-06, "loss": -0.4864, "step": 184 }, { "epoch": 0.016503858334448458, "grad_norm": 116.36631774902344, "learning_rate": 3e-06, "loss": 3.4437, "step": 185 }, { "epoch": 0.01659306837949953, "grad_norm": 124.60052490234375, "learning_rate": 3e-06, "loss": -27.5515, "step": 186 }, { "epoch": 0.016682278424550605, "grad_norm": 160.65628051757812, "learning_rate": 3e-06, "loss": -29.3863, "step": 187 }, { "epoch": 0.016771488469601678, "grad_norm": 127.9763412475586, "learning_rate": 3e-06, "loss": 0.7423, "step": 188 }, { "epoch": 0.01686069851465275, "grad_norm": 116.69316101074219, "learning_rate": 3e-06, "loss": -18.6518, "step": 189 }, { "epoch": 0.01694990855970382, "grad_norm": 114.2183609008789, "learning_rate": 3e-06, "loss": -0.87, "step": 190 }, { "epoch": 0.017039118604754894, "grad_norm": 126.1614761352539, "learning_rate": 3e-06, "loss": 2.8213, "step": 191 }, { "epoch": 0.017128328649805968, "grad_norm": 134.43527221679688, "learning_rate": 3e-06, "loss": -28.8518, "step": 192 }, { "completion_length": 247.08334350585938, "epoch": 0.01721753869485704, "grad_norm": 111.3412094116211, "learning_rate": 3e-06, "loss": -7.8814, "reward": 1.0157291293144226, "reward_std": 0.7945153564214706, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1926041767001152, "step": 193, "zero_std_ratio": 0.0 }, { "epoch": 0.017306748739908114, "grad_norm": 116.42599487304688, "learning_rate": 3e-06, "loss": -15.1245, "step": 194 }, { "epoch": 0.017395958784959187, "grad_norm": 136.37391662597656, "learning_rate": 3e-06, "loss": -9.2426, "step": 195 }, { "epoch": 0.01748516883001026, "grad_norm": 97.36872863769531, "learning_rate": 3e-06, "loss": -10.8671, "step": 196 }, { "epoch": 0.01757437887506133, "grad_norm": 125.0397720336914, "learning_rate": 3e-06, "loss": -7.0755, "step": 197 }, { "epoch": 0.017663588920112404, "grad_norm": 171.17971801757812, "learning_rate": 3e-06, "loss": -16.797, "step": 198 }, { "epoch": 0.017752798965163477, "grad_norm": 100.81266021728516, "learning_rate": 3e-06, "loss": -8.4577, "step": 199 }, { "epoch": 0.01784200901021455, "grad_norm": 127.79389953613281, "learning_rate": 3e-06, "loss": -16.3874, "step": 200 }, { "epoch": 0.017931219055265624, "grad_norm": 131.9748077392578, "learning_rate": 3e-06, "loss": -10.7973, "step": 201 }, { "epoch": 0.018020429100316697, "grad_norm": 100.95606231689453, "learning_rate": 3e-06, "loss": -12.0026, "step": 202 }, { "epoch": 0.018109639145367767, "grad_norm": 131.19261169433594, "learning_rate": 3e-06, "loss": -8.3155, "step": 203 }, { "epoch": 0.01819884919041884, "grad_norm": 164.74656677246094, "learning_rate": 3e-06, "loss": -18.9275, "step": 204 }, { "completion_length": 191.7291717529297, "epoch": 0.018288059235469913, "grad_norm": 150.95191955566406, "learning_rate": 3e-06, "loss": 50.6719, "reward": 1.3118958473205566, "reward_std": 0.8902758955955505, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03185417060740292, "step": 205, "zero_std_ratio": 0.0 }, { "epoch": 0.018377269280520987, "grad_norm": 128.34344482421875, "learning_rate": 3e-06, "loss": 32.3302, "step": 206 }, { "epoch": 0.01846647932557206, "grad_norm": 136.15789794921875, "learning_rate": 3e-06, "loss": 33.8857, "step": 207 }, { "epoch": 0.018555689370623133, "grad_norm": 140.50901794433594, "learning_rate": 3e-06, "loss": 28.924, "step": 208 }, { "epoch": 0.018644899415674206, "grad_norm": 168.0647430419922, "learning_rate": 3e-06, "loss": 31.3019, "step": 209 }, { "epoch": 0.018734109460725276, "grad_norm": 133.79208374023438, "learning_rate": 3e-06, "loss": 31.6401, "step": 210 }, { "epoch": 0.01882331950577635, "grad_norm": 161.34898376464844, "learning_rate": 3e-06, "loss": 49.2047, "step": 211 }, { "epoch": 0.018912529550827423, "grad_norm": 129.22007751464844, "learning_rate": 3e-06, "loss": 30.2, "step": 212 }, { "epoch": 0.019001739595878496, "grad_norm": 143.37449645996094, "learning_rate": 3e-06, "loss": 31.2762, "step": 213 }, { "epoch": 0.01909094964092957, "grad_norm": 140.57894897460938, "learning_rate": 3e-06, "loss": 26.7715, "step": 214 }, { "epoch": 0.019180159685980643, "grad_norm": 148.71348571777344, "learning_rate": 3e-06, "loss": 28.729, "step": 215 }, { "epoch": 0.019269369731031716, "grad_norm": 137.0448455810547, "learning_rate": 3e-06, "loss": 29.3048, "step": 216 }, { "completion_length": 241.0, "epoch": 0.019358579776082786, "grad_norm": 167.66650390625, "learning_rate": 3e-06, "loss": -43.2087, "reward": 1.6041667461395264, "reward_std": 0.9945478439331055, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1979166641831398, "step": 217, "zero_std_ratio": 0.0 }, { "epoch": 0.01944778982113386, "grad_norm": 168.0265350341797, "learning_rate": 3e-06, "loss": -45.6767, "step": 218 }, { "epoch": 0.019536999866184932, "grad_norm": 148.4340362548828, "learning_rate": 3e-06, "loss": -30.5571, "step": 219 }, { "epoch": 0.019626209911236005, "grad_norm": 139.6564178466797, "learning_rate": 3e-06, "loss": -44.7743, "step": 220 }, { "epoch": 0.01971541995628708, "grad_norm": 147.22129821777344, "learning_rate": 3e-06, "loss": -41.9365, "step": 221 }, { "epoch": 0.019804630001338152, "grad_norm": 190.81561279296875, "learning_rate": 3e-06, "loss": -48.2229, "step": 222 }, { "epoch": 0.019893840046389222, "grad_norm": 165.86917114257812, "learning_rate": 3e-06, "loss": -43.317, "step": 223 }, { "epoch": 0.019983050091440295, "grad_norm": 162.9475555419922, "learning_rate": 3e-06, "loss": -48.1878, "step": 224 }, { "epoch": 0.02007226013649137, "grad_norm": 179.08360290527344, "learning_rate": 3e-06, "loss": -33.3052, "step": 225 }, { "epoch": 0.02016147018154244, "grad_norm": 133.29290771484375, "learning_rate": 3e-06, "loss": -45.8993, "step": 226 }, { "epoch": 0.020250680226593515, "grad_norm": 155.86611938476562, "learning_rate": 3e-06, "loss": -43.9261, "step": 227 }, { "epoch": 0.020339890271644588, "grad_norm": 154.34974670410156, "learning_rate": 3e-06, "loss": -50.4381, "step": 228 }, { "completion_length": 239.43750762939453, "epoch": 0.02042910031669566, "grad_norm": 105.5196304321289, "learning_rate": 3e-06, "loss": 0.4059, "reward": 1.5316042304039001, "reward_std": 0.8583633303642273, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19756250455975533, "step": 229, "zero_std_ratio": 0.0 }, { "epoch": 0.02051831036174673, "grad_norm": 119.5712890625, "learning_rate": 3e-06, "loss": -3.2594, "step": 230 }, { "epoch": 0.020607520406797804, "grad_norm": 128.1366424560547, "learning_rate": 3e-06, "loss": -12.0605, "step": 231 }, { "epoch": 0.020696730451848878, "grad_norm": 126.55559539794922, "learning_rate": 3e-06, "loss": -15.4799, "step": 232 }, { "epoch": 0.02078594049689995, "grad_norm": 137.93882751464844, "learning_rate": 3e-06, "loss": -18.5312, "step": 233 }, { "epoch": 0.020875150541951024, "grad_norm": 108.0162124633789, "learning_rate": 3e-06, "loss": -11.7573, "step": 234 }, { "epoch": 0.020964360587002098, "grad_norm": 118.95193481445312, "learning_rate": 3e-06, "loss": -1.1434, "step": 235 }, { "epoch": 0.02105357063205317, "grad_norm": 126.50416564941406, "learning_rate": 3e-06, "loss": -3.7423, "step": 236 }, { "epoch": 0.02114278067710424, "grad_norm": 130.68190002441406, "learning_rate": 3e-06, "loss": -14.5207, "step": 237 }, { "epoch": 0.021231990722155314, "grad_norm": 129.162109375, "learning_rate": 3e-06, "loss": -16.3237, "step": 238 }, { "epoch": 0.021321200767206387, "grad_norm": 145.95396423339844, "learning_rate": 3e-06, "loss": -20.6294, "step": 239 }, { "epoch": 0.02141041081225746, "grad_norm": 107.8385009765625, "learning_rate": 3e-06, "loss": -14.0773, "step": 240 }, { "completion_length": 206.9791717529297, "epoch": 0.021499620857308534, "grad_norm": 102.38019561767578, "learning_rate": 3e-06, "loss": -32.3486, "reward": 1.035479187965393, "reward_std": 0.7589404881000519, "rewards/correctness_reward_func": 0.8750000149011612, "rewards/int_reward_func": 0.3020833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14160417020320892, "step": 241, "zero_std_ratio": 0.0 }, { "epoch": 0.021588830902359607, "grad_norm": 110.24679565429688, "learning_rate": 3e-06, "loss": -38.6199, "step": 242 }, { "epoch": 0.021678040947410677, "grad_norm": 118.22930145263672, "learning_rate": 3e-06, "loss": -52.9139, "step": 243 }, { "epoch": 0.02176725099246175, "grad_norm": 118.6080322265625, "learning_rate": 3e-06, "loss": -43.3805, "step": 244 }, { "epoch": 0.021856461037512823, "grad_norm": 106.9905776977539, "learning_rate": 3e-06, "loss": -36.7945, "step": 245 }, { "epoch": 0.021945671082563897, "grad_norm": 111.37010955810547, "learning_rate": 3e-06, "loss": -36.4452, "step": 246 }, { "epoch": 0.02203488112761497, "grad_norm": 104.93065643310547, "learning_rate": 3e-06, "loss": -34.2096, "step": 247 }, { "epoch": 0.022124091172666043, "grad_norm": 117.96737670898438, "learning_rate": 3e-06, "loss": -40.621, "step": 248 }, { "epoch": 0.022213301217717116, "grad_norm": 118.701904296875, "learning_rate": 3e-06, "loss": -54.4138, "step": 249 }, { "epoch": 0.022302511262768186, "grad_norm": 118.43307495117188, "learning_rate": 3e-06, "loss": -45.0393, "step": 250 }, { "epoch": 0.02239172130781926, "grad_norm": 114.41901397705078, "learning_rate": 3e-06, "loss": -37.6304, "step": 251 }, { "epoch": 0.022480931352870333, "grad_norm": 123.03970336914062, "learning_rate": 3e-06, "loss": -39.0638, "step": 252 }, { "completion_length": 211.20833587646484, "epoch": 0.022570141397921406, "grad_norm": 139.677734375, "learning_rate": 3e-06, "loss": -27.6756, "reward": 1.5570417046546936, "reward_std": 1.1208258867263794, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14087500423192978, "step": 253, "zero_std_ratio": 0.0 }, { "epoch": 0.02265935144297248, "grad_norm": 145.56021118164062, "learning_rate": 3e-06, "loss": -40.3289, "step": 254 }, { "epoch": 0.022748561488023553, "grad_norm": 138.8564453125, "learning_rate": 3e-06, "loss": -39.1766, "step": 255 }, { "epoch": 0.022837771533074626, "grad_norm": 229.50186157226562, "learning_rate": 3e-06, "loss": -43.9568, "step": 256 }, { "epoch": 0.022926981578125696, "grad_norm": 138.42791748046875, "learning_rate": 3e-06, "loss": -52.4297, "step": 257 }, { "epoch": 0.02301619162317677, "grad_norm": 147.58364868164062, "learning_rate": 3e-06, "loss": -53.5477, "step": 258 }, { "epoch": 0.023105401668227842, "grad_norm": 140.5048828125, "learning_rate": 3e-06, "loss": -28.1418, "step": 259 }, { "epoch": 0.023194611713278915, "grad_norm": 139.11508178710938, "learning_rate": 3e-06, "loss": -42.7612, "step": 260 }, { "epoch": 0.02328382175832999, "grad_norm": 146.18580627441406, "learning_rate": 3e-06, "loss": -39.909, "step": 261 }, { "epoch": 0.023373031803381062, "grad_norm": 264.3643493652344, "learning_rate": 3e-06, "loss": -46.2595, "step": 262 }, { "epoch": 0.023462241848432132, "grad_norm": 154.1084747314453, "learning_rate": 3e-06, "loss": -55.2424, "step": 263 }, { "epoch": 0.023551451893483205, "grad_norm": 156.28662109375, "learning_rate": 3e-06, "loss": -55.6531, "step": 264 }, { "completion_length": 227.14583587646484, "epoch": 0.02364066193853428, "grad_norm": 115.90379333496094, "learning_rate": 3e-06, "loss": -30.7492, "reward": 1.6066043376922607, "reward_std": 0.8875448107719421, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.3958333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": -0.09131250530481339, "step": 265, "zero_std_ratio": 0.0 }, { "epoch": 0.02372987198358535, "grad_norm": 113.86587524414062, "learning_rate": 3e-06, "loss": -29.6989, "step": 266 }, { "epoch": 0.023819082028636425, "grad_norm": 110.4273681640625, "learning_rate": 3e-06, "loss": -32.0614, "step": 267 }, { "epoch": 0.023908292073687498, "grad_norm": 111.84119415283203, "learning_rate": 3e-06, "loss": -31.9073, "step": 268 }, { "epoch": 0.02399750211873857, "grad_norm": 103.93081665039062, "learning_rate": 3e-06, "loss": -22.3506, "step": 269 }, { "epoch": 0.02408671216378964, "grad_norm": 120.32383728027344, "learning_rate": 3e-06, "loss": -28.5629, "step": 270 }, { "epoch": 0.024175922208840715, "grad_norm": 124.92536163330078, "learning_rate": 3e-06, "loss": -33.0776, "step": 271 }, { "epoch": 0.024265132253891788, "grad_norm": 119.54340362548828, "learning_rate": 3e-06, "loss": -31.6735, "step": 272 }, { "epoch": 0.02435434229894286, "grad_norm": 128.8444061279297, "learning_rate": 3e-06, "loss": -33.8033, "step": 273 }, { "epoch": 0.024443552343993934, "grad_norm": 123.08969116210938, "learning_rate": 3e-06, "loss": -34.4538, "step": 274 }, { "epoch": 0.024532762389045008, "grad_norm": 111.98983001708984, "learning_rate": 3e-06, "loss": -24.6449, "step": 275 }, { "epoch": 0.02462197243409608, "grad_norm": 123.31842041015625, "learning_rate": 3e-06, "loss": -31.1417, "step": 276 }, { "completion_length": 211.20833587646484, "epoch": 0.02471118247914715, "grad_norm": 83.42295837402344, "learning_rate": 3e-06, "loss": -49.3896, "reward": 1.6565834283828735, "reward_std": 0.7390342950820923, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13508333638310432, "step": 277, "zero_std_ratio": 0.0 }, { "epoch": 0.024800392524198224, "grad_norm": 78.70240783691406, "learning_rate": 3e-06, "loss": -60.2538, "step": 278 }, { "epoch": 0.024889602569249297, "grad_norm": 87.03772735595703, "learning_rate": 3e-06, "loss": -54.701, "step": 279 }, { "epoch": 0.02497881261430037, "grad_norm": 105.03215789794922, "learning_rate": 3e-06, "loss": -50.647, "step": 280 }, { "epoch": 0.025068022659351444, "grad_norm": 94.19722747802734, "learning_rate": 3e-06, "loss": -53.7356, "step": 281 }, { "epoch": 0.025157232704402517, "grad_norm": 71.46943664550781, "learning_rate": 3e-06, "loss": -54.5847, "step": 282 }, { "epoch": 0.025246442749453587, "grad_norm": 90.4788589477539, "learning_rate": 3e-06, "loss": -50.5539, "step": 283 }, { "epoch": 0.02533565279450466, "grad_norm": 74.81779479980469, "learning_rate": 3e-06, "loss": -61.3813, "step": 284 }, { "epoch": 0.025424862839555733, "grad_norm": 85.80409240722656, "learning_rate": 3e-06, "loss": -55.7379, "step": 285 }, { "epoch": 0.025514072884606807, "grad_norm": 135.24191284179688, "learning_rate": 3e-06, "loss": -52.1614, "step": 286 }, { "epoch": 0.02560328292965788, "grad_norm": 94.01042175292969, "learning_rate": 3e-06, "loss": -55.4857, "step": 287 }, { "epoch": 0.025692492974708953, "grad_norm": 72.32071685791016, "learning_rate": 3e-06, "loss": -56.3565, "step": 288 }, { "completion_length": 221.45833587646484, "epoch": 0.025781703019760027, "grad_norm": 225.52276611328125, "learning_rate": 3e-06, "loss": -67.7081, "reward": 1.960687518119812, "reward_std": 0.8211362063884735, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11222916468977928, "step": 289, "zero_std_ratio": 0.0 }, { "epoch": 0.025870913064811096, "grad_norm": 239.94651794433594, "learning_rate": 3e-06, "loss": -66.0587, "step": 290 }, { "epoch": 0.02596012310986217, "grad_norm": 173.2037353515625, "learning_rate": 3e-06, "loss": -59.416, "step": 291 }, { "epoch": 0.026049333154913243, "grad_norm": 228.50621032714844, "learning_rate": 3e-06, "loss": -70.6059, "step": 292 }, { "epoch": 0.026138543199964316, "grad_norm": 213.36802673339844, "learning_rate": 3e-06, "loss": -68.8733, "step": 293 }, { "epoch": 0.02622775324501539, "grad_norm": 389.8759460449219, "learning_rate": 3e-06, "loss": -108.725, "step": 294 }, { "epoch": 0.026316963290066463, "grad_norm": 241.96009826660156, "learning_rate": 3e-06, "loss": -73.0107, "step": 295 }, { "epoch": 0.026406173335117536, "grad_norm": 282.705322265625, "learning_rate": 3e-06, "loss": -71.4601, "step": 296 }, { "epoch": 0.026495383380168606, "grad_norm": 182.99859619140625, "learning_rate": 3e-06, "loss": -62.8503, "step": 297 }, { "epoch": 0.02658459342521968, "grad_norm": 237.8432159423828, "learning_rate": 3e-06, "loss": -76.095, "step": 298 }, { "epoch": 0.026673803470270752, "grad_norm": 224.10140991210938, "learning_rate": 3e-06, "loss": -71.9696, "step": 299 }, { "epoch": 0.026763013515321826, "grad_norm": 401.25421142578125, "learning_rate": 3e-06, "loss": -119.7468, "step": 300 }, { "completion_length": 181.00000762939453, "epoch": 0.0268522235603729, "grad_norm": 102.42095184326172, "learning_rate": 3e-06, "loss": 11.7663, "reward": 1.6927291750907898, "reward_std": 0.8399400115013123, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0468541644513607, "step": 301, "zero_std_ratio": 0.0 }, { "epoch": 0.026941433605423972, "grad_norm": 97.59688568115234, "learning_rate": 3e-06, "loss": 4.3671, "step": 302 }, { "epoch": 0.027030643650475042, "grad_norm": 119.19691467285156, "learning_rate": 3e-06, "loss": 3.0122, "step": 303 }, { "epoch": 0.027119853695526115, "grad_norm": 102.54327392578125, "learning_rate": 3e-06, "loss": 5.7653, "step": 304 }, { "epoch": 0.02720906374057719, "grad_norm": 127.24678802490234, "learning_rate": 3e-06, "loss": 9.0808, "step": 305 }, { "epoch": 0.02729827378562826, "grad_norm": 115.35128784179688, "learning_rate": 3e-06, "loss": 9.7375, "step": 306 }, { "epoch": 0.027387483830679335, "grad_norm": 109.96597290039062, "learning_rate": 3e-06, "loss": 10.9794, "step": 307 }, { "epoch": 0.02747669387573041, "grad_norm": 116.67013549804688, "learning_rate": 3e-06, "loss": 4.0605, "step": 308 }, { "epoch": 0.02756590392078148, "grad_norm": 100.0082015991211, "learning_rate": 3e-06, "loss": 2.0719, "step": 309 }, { "epoch": 0.02765511396583255, "grad_norm": 103.2455062866211, "learning_rate": 3e-06, "loss": 3.753, "step": 310 }, { "epoch": 0.027744324010883625, "grad_norm": 139.74317932128906, "learning_rate": 3e-06, "loss": 7.435, "step": 311 }, { "epoch": 0.027833534055934698, "grad_norm": 126.05006408691406, "learning_rate": 3e-06, "loss": 8.1927, "step": 312 }, { "completion_length": 187.87500762939453, "epoch": 0.02792274410098577, "grad_norm": 182.04180908203125, "learning_rate": 3e-06, "loss": -90.2214, "reward": 1.3959375023841858, "reward_std": 0.7920421957969666, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09364583343267441, "step": 313, "zero_std_ratio": 0.0 }, { "epoch": 0.028011954146036844, "grad_norm": 238.57090759277344, "learning_rate": 3e-06, "loss": -100.422, "step": 314 }, { "epoch": 0.028101164191087918, "grad_norm": 235.39544677734375, "learning_rate": 3e-06, "loss": -102.3354, "step": 315 }, { "epoch": 0.028190374236138988, "grad_norm": 223.8190460205078, "learning_rate": 3e-06, "loss": -109.9957, "step": 316 }, { "epoch": 0.02827958428119006, "grad_norm": 225.00672912597656, "learning_rate": 3e-06, "loss": -109.9375, "step": 317 }, { "epoch": 0.028368794326241134, "grad_norm": 247.57774353027344, "learning_rate": 3e-06, "loss": -125.4302, "step": 318 }, { "epoch": 0.028458004371292207, "grad_norm": 193.24212646484375, "learning_rate": 3e-06, "loss": -93.1797, "step": 319 }, { "epoch": 0.02854721441634328, "grad_norm": 264.4795227050781, "learning_rate": 3e-06, "loss": -104.7149, "step": 320 }, { "epoch": 0.028636424461394354, "grad_norm": 226.05810546875, "learning_rate": 3e-06, "loss": -107.5763, "step": 321 }, { "epoch": 0.028725634506445427, "grad_norm": 239.6378173828125, "learning_rate": 3e-06, "loss": -115.9954, "step": 322 }, { "epoch": 0.028814844551496497, "grad_norm": 240.8443145751953, "learning_rate": 3e-06, "loss": -117.6999, "step": 323 }, { "epoch": 0.02890405459654757, "grad_norm": 261.65643310546875, "learning_rate": 3e-06, "loss": -132.8027, "step": 324 }, { "completion_length": 189.8541717529297, "epoch": 0.028993264641598643, "grad_norm": 168.10452270507812, "learning_rate": 3e-06, "loss": 12.7588, "reward": 1.7772499918937683, "reward_std": 0.9346717596054077, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.045666664838790894, "step": 325, "zero_std_ratio": 0.0 }, { "epoch": 0.029082474686649717, "grad_norm": 182.863037109375, "learning_rate": 3e-06, "loss": 0.3851, "step": 326 }, { "epoch": 0.02917168473170079, "grad_norm": 214.54574584960938, "learning_rate": 3e-06, "loss": 17.6492, "step": 327 }, { "epoch": 0.029260894776751863, "grad_norm": 187.80931091308594, "learning_rate": 3e-06, "loss": 7.692, "step": 328 }, { "epoch": 0.029350104821802937, "grad_norm": 195.0843505859375, "learning_rate": 3e-06, "loss": 12.8044, "step": 329 }, { "epoch": 0.029439314866854006, "grad_norm": 168.82028198242188, "learning_rate": 3e-06, "loss": -7.5147, "step": 330 }, { "epoch": 0.02952852491190508, "grad_norm": 176.14859008789062, "learning_rate": 3e-06, "loss": 10.9965, "step": 331 }, { "epoch": 0.029617734956956153, "grad_norm": 202.02247619628906, "learning_rate": 3e-06, "loss": -1.8138, "step": 332 }, { "epoch": 0.029706945002007226, "grad_norm": 216.37252807617188, "learning_rate": 3e-06, "loss": 15.5179, "step": 333 }, { "epoch": 0.0297961550470583, "grad_norm": 200.23558044433594, "learning_rate": 3e-06, "loss": 5.0549, "step": 334 }, { "epoch": 0.029885365092109373, "grad_norm": 177.7020263671875, "learning_rate": 3e-06, "loss": 11.199, "step": 335 }, { "epoch": 0.029974575137160443, "grad_norm": 170.23106384277344, "learning_rate": 3e-06, "loss": -10.9367, "step": 336 }, { "completion_length": 224.9166717529297, "epoch": 0.030063785182211516, "grad_norm": 127.8658218383789, "learning_rate": 3e-06, "loss": -59.3587, "reward": 1.6053959131240845, "reward_std": 0.5731277614831924, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550208330154419, "step": 337, "zero_std_ratio": 0.0 }, { "epoch": 0.03015299522726259, "grad_norm": 133.41494750976562, "learning_rate": 3e-06, "loss": -53.4241, "step": 338 }, { "epoch": 0.030242205272313662, "grad_norm": 170.7308807373047, "learning_rate": 3e-06, "loss": -65.4722, "step": 339 }, { "epoch": 0.030331415317364736, "grad_norm": 172.28118896484375, "learning_rate": 3e-06, "loss": -53.201, "step": 340 }, { "epoch": 0.03042062536241581, "grad_norm": 118.70462799072266, "learning_rate": 3e-06, "loss": -50.8363, "step": 341 }, { "epoch": 0.030509835407466882, "grad_norm": 143.119384765625, "learning_rate": 3e-06, "loss": -60.764, "step": 342 }, { "epoch": 0.030599045452517952, "grad_norm": 143.7277374267578, "learning_rate": 3e-06, "loss": -62.8186, "step": 343 }, { "epoch": 0.030688255497569025, "grad_norm": 157.5625, "learning_rate": 3e-06, "loss": -57.4741, "step": 344 }, { "epoch": 0.0307774655426201, "grad_norm": 191.64804077148438, "learning_rate": 3e-06, "loss": -71.2662, "step": 345 }, { "epoch": 0.030866675587671172, "grad_norm": 206.0039520263672, "learning_rate": 3e-06, "loss": -56.9883, "step": 346 }, { "epoch": 0.030955885632722245, "grad_norm": 132.1703643798828, "learning_rate": 3e-06, "loss": -54.0822, "step": 347 }, { "epoch": 0.03104509567777332, "grad_norm": 144.338623046875, "learning_rate": 3e-06, "loss": -66.1545, "step": 348 }, { "completion_length": 161.1041717529297, "epoch": 0.03113430572282439, "grad_norm": 176.29396057128906, "learning_rate": 3e-06, "loss": 2.568, "reward": 1.8858751058578491, "reward_std": 0.5198497474193573, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021291667595505714, "step": 349, "zero_std_ratio": 0.0 }, { "epoch": 0.03122351576787546, "grad_norm": 162.1043701171875, "learning_rate": 3e-06, "loss": 11.6461, "step": 350 }, { "epoch": 0.03131272581292654, "grad_norm": 147.42918395996094, "learning_rate": 3e-06, "loss": 19.3425, "step": 351 }, { "epoch": 0.03140193585797761, "grad_norm": 146.65992736816406, "learning_rate": 3e-06, "loss": 4.1848, "step": 352 }, { "epoch": 0.03149114590302868, "grad_norm": 137.17474365234375, "learning_rate": 3e-06, "loss": 9.5202, "step": 353 }, { "epoch": 0.03158035594807975, "grad_norm": 176.24244689941406, "learning_rate": 3e-06, "loss": 24.3653, "step": 354 }, { "epoch": 0.031669565993130824, "grad_norm": 176.59144592285156, "learning_rate": 3e-06, "loss": 1.4274, "step": 355 }, { "epoch": 0.0317587760381819, "grad_norm": 161.0966339111328, "learning_rate": 3e-06, "loss": 10.7367, "step": 356 }, { "epoch": 0.03184798608323297, "grad_norm": 164.76675415039062, "learning_rate": 3e-06, "loss": 18.5651, "step": 357 }, { "epoch": 0.031937196128284044, "grad_norm": 227.15631103515625, "learning_rate": 3e-06, "loss": 2.7137, "step": 358 }, { "epoch": 0.03202640617333512, "grad_norm": 152.36514282226562, "learning_rate": 3e-06, "loss": 8.0471, "step": 359 }, { "epoch": 0.03211561621838619, "grad_norm": 162.82603454589844, "learning_rate": 3e-06, "loss": 24.2858, "step": 360 }, { "completion_length": 173.6666717529297, "epoch": 0.032204826263437264, "grad_norm": 112.11152648925781, "learning_rate": 3e-06, "loss": -47.6777, "reward": 1.9486668109893799, "reward_std": 0.614804282784462, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04091667756438255, "step": 361, "zero_std_ratio": 0.0 }, { "epoch": 0.03229403630848834, "grad_norm": 212.41893005371094, "learning_rate": 3e-06, "loss": -57.8799, "step": 362 }, { "epoch": 0.03238324635353941, "grad_norm": 129.3856201171875, "learning_rate": 3e-06, "loss": -42.8209, "step": 363 }, { "epoch": 0.032472456398590484, "grad_norm": 90.02410888671875, "learning_rate": 3e-06, "loss": -37.0157, "step": 364 }, { "epoch": 0.03256166644364156, "grad_norm": 122.2002944946289, "learning_rate": 3e-06, "loss": -42.0898, "step": 365 }, { "epoch": 0.03265087648869262, "grad_norm": 114.75045776367188, "learning_rate": 3e-06, "loss": -37.9465, "step": 366 }, { "epoch": 0.0327400865337437, "grad_norm": 117.86136627197266, "learning_rate": 3e-06, "loss": -50.0162, "step": 367 }, { "epoch": 0.03282929657879477, "grad_norm": 224.55755615234375, "learning_rate": 3e-06, "loss": -62.4077, "step": 368 }, { "epoch": 0.03291850662384584, "grad_norm": 145.33380126953125, "learning_rate": 3e-06, "loss": -45.7888, "step": 369 }, { "epoch": 0.033007716668896916, "grad_norm": 107.85284423828125, "learning_rate": 3e-06, "loss": -38.8828, "step": 370 }, { "epoch": 0.03309692671394799, "grad_norm": 143.64854431152344, "learning_rate": 3e-06, "loss": -44.4827, "step": 371 }, { "epoch": 0.03318613675899906, "grad_norm": 120.4244155883789, "learning_rate": 3e-06, "loss": -40.0586, "step": 372 }, { "completion_length": 170.3125, "epoch": 0.033275346804050136, "grad_norm": 199.8765869140625, "learning_rate": 3e-06, "loss": -16.0424, "reward": 1.5020000338554382, "reward_std": 0.6375356912612915, "rewards/correctness_reward_func": 1.0416666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012416664510965347, "step": 373, "zero_std_ratio": 0.0 }, { "epoch": 0.03336455684910121, "grad_norm": 186.22103881835938, "learning_rate": 3e-06, "loss": -12.8464, "step": 374 }, { "epoch": 0.03345376689415228, "grad_norm": 133.84971618652344, "learning_rate": 3e-06, "loss": -16.0581, "step": 375 }, { "epoch": 0.033542976939203356, "grad_norm": 124.62361145019531, "learning_rate": 3e-06, "loss": -20.0662, "step": 376 }, { "epoch": 0.03363218698425443, "grad_norm": 177.62574768066406, "learning_rate": 3e-06, "loss": -16.7656, "step": 377 }, { "epoch": 0.0337213970293055, "grad_norm": 162.92381286621094, "learning_rate": 3e-06, "loss": -19.1936, "step": 378 }, { "epoch": 0.033810607074356576, "grad_norm": 167.49449157714844, "learning_rate": 3e-06, "loss": -16.9396, "step": 379 }, { "epoch": 0.03389981711940764, "grad_norm": 180.7197723388672, "learning_rate": 3e-06, "loss": -14.2949, "step": 380 }, { "epoch": 0.033989027164458716, "grad_norm": 158.6161346435547, "learning_rate": 3e-06, "loss": -18.1083, "step": 381 }, { "epoch": 0.03407823720950979, "grad_norm": 136.7860870361328, "learning_rate": 3e-06, "loss": -21.9306, "step": 382 }, { "epoch": 0.03416744725456086, "grad_norm": 200.51185607910156, "learning_rate": 3e-06, "loss": -18.8755, "step": 383 }, { "epoch": 0.034256657299611935, "grad_norm": 174.00477600097656, "learning_rate": 3e-06, "loss": -21.6131, "step": 384 }, { "completion_length": 168.0416717529297, "epoch": 0.03434586734466301, "grad_norm": 270.6558837890625, "learning_rate": 3e-06, "loss": -108.9543, "reward": 1.5282500386238098, "reward_std": 0.8121029734611511, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03866666788235307, "step": 385, "zero_std_ratio": 0.0 }, { "epoch": 0.03443507738971408, "grad_norm": 316.0130615234375, "learning_rate": 3e-06, "loss": -121.254, "step": 386 }, { "epoch": 0.034524287434765155, "grad_norm": 257.7696228027344, "learning_rate": 3e-06, "loss": -102.8911, "step": 387 }, { "epoch": 0.03461349747981623, "grad_norm": 305.755126953125, "learning_rate": 3e-06, "loss": -114.5659, "step": 388 }, { "epoch": 0.0347027075248673, "grad_norm": 219.5818328857422, "learning_rate": 3e-06, "loss": -114.7208, "step": 389 }, { "epoch": 0.034791917569918375, "grad_norm": 255.7522430419922, "learning_rate": 3e-06, "loss": -109.7934, "step": 390 }, { "epoch": 0.03488112761496945, "grad_norm": 291.77642822265625, "learning_rate": 3e-06, "loss": -116.5826, "step": 391 }, { "epoch": 0.03497033766002052, "grad_norm": 333.5157165527344, "learning_rate": 3e-06, "loss": -132.3897, "step": 392 }, { "epoch": 0.03505954770507159, "grad_norm": 267.8763122558594, "learning_rate": 3e-06, "loss": -110.8159, "step": 393 }, { "epoch": 0.03514875775012266, "grad_norm": 312.3733215332031, "learning_rate": 3e-06, "loss": -127.8078, "step": 394 }, { "epoch": 0.035237967795173734, "grad_norm": 242.0186309814453, "learning_rate": 3e-06, "loss": -123.3703, "step": 395 }, { "epoch": 0.03532717784022481, "grad_norm": 297.62847900390625, "learning_rate": 3e-06, "loss": -120.3945, "step": 396 }, { "completion_length": 171.45833587646484, "epoch": 0.03541638788527588, "grad_norm": 192.1062774658203, "learning_rate": 3e-06, "loss": -81.6189, "reward": 1.9622292518615723, "reward_std": 0.8264816105365753, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0351458303630352, "step": 397, "zero_std_ratio": 0.0 }, { "epoch": 0.035505597930326954, "grad_norm": 202.8217010498047, "learning_rate": 3e-06, "loss": -67.341, "step": 398 }, { "epoch": 0.03559480797537803, "grad_norm": 218.82249450683594, "learning_rate": 3e-06, "loss": -65.3347, "step": 399 }, { "epoch": 0.0356840180204291, "grad_norm": 232.65196228027344, "learning_rate": 3e-06, "loss": -76.8989, "step": 400 }, { "epoch": 0.035773228065480174, "grad_norm": 198.04103088378906, "learning_rate": 3e-06, "loss": -74.488, "step": 401 }, { "epoch": 0.03586243811053125, "grad_norm": 211.86273193359375, "learning_rate": 3e-06, "loss": -75.6096, "step": 402 }, { "epoch": 0.03595164815558232, "grad_norm": 218.77589416503906, "learning_rate": 3e-06, "loss": -87.9174, "step": 403 }, { "epoch": 0.036040858200633394, "grad_norm": 243.4962615966797, "learning_rate": 3e-06, "loss": -70.1026, "step": 404 }, { "epoch": 0.03613006824568447, "grad_norm": 242.30494689941406, "learning_rate": 3e-06, "loss": -71.4579, "step": 405 }, { "epoch": 0.03621927829073553, "grad_norm": 274.28948974609375, "learning_rate": 3e-06, "loss": -83.8044, "step": 406 }, { "epoch": 0.03630848833578661, "grad_norm": 257.0942077636719, "learning_rate": 3e-06, "loss": -82.445, "step": 407 }, { "epoch": 0.03639769838083768, "grad_norm": 255.2320556640625, "learning_rate": 3e-06, "loss": -81.9402, "step": 408 }, { "completion_length": 125.70833587646484, "epoch": 0.03648690842588875, "grad_norm": 185.7193145751953, "learning_rate": 3e-06, "loss": 46.4742, "reward": 1.8683959245681763, "reward_std": 0.8172085583209991, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10797916725277901, "step": 409, "zero_std_ratio": 0.0 }, { "epoch": 0.03657611847093983, "grad_norm": 218.23338317871094, "learning_rate": 3e-06, "loss": 29.3517, "step": 410 }, { "epoch": 0.0366653285159909, "grad_norm": 180.90330505371094, "learning_rate": 3e-06, "loss": 55.3162, "step": 411 }, { "epoch": 0.03675453856104197, "grad_norm": 216.37953186035156, "learning_rate": 3e-06, "loss": 50.2096, "step": 412 }, { "epoch": 0.036843748606093046, "grad_norm": 198.8724822998047, "learning_rate": 3e-06, "loss": 51.636, "step": 413 }, { "epoch": 0.03693295865114412, "grad_norm": 184.89627075195312, "learning_rate": 3e-06, "loss": 44.8369, "step": 414 }, { "epoch": 0.03702216869619519, "grad_norm": 167.6713104248047, "learning_rate": 3e-06, "loss": 44.1546, "step": 415 }, { "epoch": 0.037111378741246266, "grad_norm": 192.13140869140625, "learning_rate": 3e-06, "loss": 27.4686, "step": 416 }, { "epoch": 0.03720058878629734, "grad_norm": 177.4408721923828, "learning_rate": 3e-06, "loss": 53.37, "step": 417 }, { "epoch": 0.03728979883134841, "grad_norm": 223.81668090820312, "learning_rate": 3e-06, "loss": 45.2759, "step": 418 }, { "epoch": 0.037379008876399486, "grad_norm": 207.5684356689453, "learning_rate": 3e-06, "loss": 46.924, "step": 419 }, { "epoch": 0.03746821892145055, "grad_norm": 180.81484985351562, "learning_rate": 3e-06, "loss": 42.0928, "step": 420 }, { "completion_length": 153.89583587646484, "epoch": 0.037557428966501626, "grad_norm": 247.00067138671875, "learning_rate": 3e-06, "loss": 14.4654, "reward": 1.776770830154419, "reward_std": 0.6972799003124237, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07885416969656944, "step": 421, "zero_std_ratio": 0.0 }, { "epoch": 0.0376466390115527, "grad_norm": 244.66824340820312, "learning_rate": 3e-06, "loss": 34.9601, "step": 422 }, { "epoch": 0.03773584905660377, "grad_norm": 276.21539306640625, "learning_rate": 3e-06, "loss": 14.6261, "step": 423 }, { "epoch": 0.037825059101654845, "grad_norm": 288.96246337890625, "learning_rate": 3e-06, "loss": 41.9368, "step": 424 }, { "epoch": 0.03791426914670592, "grad_norm": 303.6945495605469, "learning_rate": 3e-06, "loss": 23.9119, "step": 425 }, { "epoch": 0.03800347919175699, "grad_norm": 274.27142333984375, "learning_rate": 3e-06, "loss": 27.399, "step": 426 }, { "epoch": 0.038092689236808065, "grad_norm": 233.245361328125, "learning_rate": 3e-06, "loss": 10.175, "step": 427 }, { "epoch": 0.03818189928185914, "grad_norm": 256.8597412109375, "learning_rate": 3e-06, "loss": 31.9908, "step": 428 }, { "epoch": 0.03827110932691021, "grad_norm": 270.4859619140625, "learning_rate": 3e-06, "loss": 10.7867, "step": 429 }, { "epoch": 0.038360319371961285, "grad_norm": 301.17181396484375, "learning_rate": 3e-06, "loss": 40.6524, "step": 430 }, { "epoch": 0.03844952941701236, "grad_norm": 303.94488525390625, "learning_rate": 3e-06, "loss": 21.1706, "step": 431 }, { "epoch": 0.03853873946206343, "grad_norm": 258.3034973144531, "learning_rate": 3e-06, "loss": 22.7622, "step": 432 }, { "completion_length": 107.54166793823242, "epoch": 0.0386279495071145, "grad_norm": 134.02154541015625, "learning_rate": 3e-06, "loss": -22.4999, "reward": 2.2951666712760925, "reward_std": 0.38126008585095406, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21183334290981293, "step": 433, "zero_std_ratio": 0.0 }, { "epoch": 0.03871715955216557, "grad_norm": 191.98023986816406, "learning_rate": 3e-06, "loss": -14.0492, "step": 434 }, { "epoch": 0.038806369597216644, "grad_norm": 154.30328369140625, "learning_rate": 3e-06, "loss": -10.9444, "step": 435 }, { "epoch": 0.03889557964226772, "grad_norm": 134.01214599609375, "learning_rate": 3e-06, "loss": -17.7574, "step": 436 }, { "epoch": 0.03898478968731879, "grad_norm": 132.3379364013672, "learning_rate": 3e-06, "loss": -18.487, "step": 437 }, { "epoch": 0.039073999732369864, "grad_norm": 146.31573486328125, "learning_rate": 3e-06, "loss": -12.7164, "step": 438 }, { "epoch": 0.03916320977742094, "grad_norm": 136.05592346191406, "learning_rate": 3e-06, "loss": -23.1439, "step": 439 }, { "epoch": 0.03925241982247201, "grad_norm": 138.1117706298828, "learning_rate": 3e-06, "loss": -15.7255, "step": 440 }, { "epoch": 0.039341629867523084, "grad_norm": 166.34922790527344, "learning_rate": 3e-06, "loss": -12.6375, "step": 441 }, { "epoch": 0.03943083991257416, "grad_norm": 132.994140625, "learning_rate": 3e-06, "loss": -20.1431, "step": 442 }, { "epoch": 0.03952004995762523, "grad_norm": 129.54771423339844, "learning_rate": 3e-06, "loss": -19.3478, "step": 443 }, { "epoch": 0.039609260002676304, "grad_norm": 152.91607666015625, "learning_rate": 3e-06, "loss": -14.3018, "step": 444 }, { "completion_length": 160.7916717529297, "epoch": 0.03969847004772738, "grad_norm": 165.9615020751953, "learning_rate": 3e-06, "loss": 38.8687, "reward": 1.6081042885780334, "reward_std": 0.45602357387542725, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0768541656434536, "step": 445, "zero_std_ratio": 0.0 }, { "epoch": 0.039787680092778444, "grad_norm": 165.71353149414062, "learning_rate": 3e-06, "loss": 32.8818, "step": 446 }, { "epoch": 0.03987689013782952, "grad_norm": 185.27174377441406, "learning_rate": 3e-06, "loss": 29.8206, "step": 447 }, { "epoch": 0.03996610018288059, "grad_norm": 163.79954528808594, "learning_rate": 3e-06, "loss": 28.2399, "step": 448 }, { "epoch": 0.04005531022793166, "grad_norm": 167.2331085205078, "learning_rate": 3e-06, "loss": 42.2504, "step": 449 }, { "epoch": 0.04014452027298274, "grad_norm": 157.44320678710938, "learning_rate": 3e-06, "loss": 47.0631, "step": 450 }, { "epoch": 0.04023373031803381, "grad_norm": 167.7976837158203, "learning_rate": 3e-06, "loss": 37.7299, "step": 451 }, { "epoch": 0.04032294036308488, "grad_norm": 171.96420288085938, "learning_rate": 3e-06, "loss": 31.7018, "step": 452 }, { "epoch": 0.040412150408135956, "grad_norm": 164.95046997070312, "learning_rate": 3e-06, "loss": 28.9306, "step": 453 }, { "epoch": 0.04050136045318703, "grad_norm": 146.903076171875, "learning_rate": 3e-06, "loss": 26.551, "step": 454 }, { "epoch": 0.0405905704982381, "grad_norm": 182.6881561279297, "learning_rate": 3e-06, "loss": 41.0788, "step": 455 }, { "epoch": 0.040679780543289176, "grad_norm": 147.5907440185547, "learning_rate": 3e-06, "loss": 44.0529, "step": 456 }, { "completion_length": 143.8541717529297, "epoch": 0.04076899058834025, "grad_norm": 252.46615600585938, "learning_rate": 3e-06, "loss": -60.641, "reward": 1.8890208005905151, "reward_std": 0.4024546667933464, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09735416248440742, "step": 457, "zero_std_ratio": 0.0 }, { "epoch": 0.04085820063339132, "grad_norm": 223.07171630859375, "learning_rate": 3e-06, "loss": -42.0093, "step": 458 }, { "epoch": 0.040947410678442396, "grad_norm": 237.3083953857422, "learning_rate": 3e-06, "loss": -57.9657, "step": 459 }, { "epoch": 0.04103662072349346, "grad_norm": 225.29269409179688, "learning_rate": 3e-06, "loss": -41.8661, "step": 460 }, { "epoch": 0.041125830768544536, "grad_norm": 210.0297088623047, "learning_rate": 3e-06, "loss": -47.2305, "step": 461 }, { "epoch": 0.04121504081359561, "grad_norm": 262.04473876953125, "learning_rate": 3e-06, "loss": -50.9342, "step": 462 }, { "epoch": 0.04130425085864668, "grad_norm": 252.53802490234375, "learning_rate": 3e-06, "loss": -62.6208, "step": 463 }, { "epoch": 0.041393460903697755, "grad_norm": 221.40121459960938, "learning_rate": 3e-06, "loss": -43.7282, "step": 464 }, { "epoch": 0.04148267094874883, "grad_norm": 231.59335327148438, "learning_rate": 3e-06, "loss": -60.5224, "step": 465 }, { "epoch": 0.0415718809937999, "grad_norm": 207.73471069335938, "learning_rate": 3e-06, "loss": -44.5223, "step": 466 }, { "epoch": 0.041661091038850975, "grad_norm": 217.08779907226562, "learning_rate": 3e-06, "loss": -50.6021, "step": 467 }, { "epoch": 0.04175030108390205, "grad_norm": 254.29269409179688, "learning_rate": 3e-06, "loss": -55.4887, "step": 468 }, { "completion_length": 122.50000381469727, "epoch": 0.04183951112895312, "grad_norm": 147.28335571289062, "learning_rate": 3e-06, "loss": 12.9864, "reward": 2.310395896434784, "reward_std": 0.44204503297805786, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.16456249356269836, "step": 469, "zero_std_ratio": 0.0 }, { "epoch": 0.041928721174004195, "grad_norm": 161.9978485107422, "learning_rate": 3e-06, "loss": 34.7546, "step": 470 }, { "epoch": 0.04201793121905527, "grad_norm": 165.3116455078125, "learning_rate": 3e-06, "loss": 17.4188, "step": 471 }, { "epoch": 0.04210714126410634, "grad_norm": 142.81861877441406, "learning_rate": 3e-06, "loss": 18.3006, "step": 472 }, { "epoch": 0.04219635130915741, "grad_norm": 168.01116943359375, "learning_rate": 3e-06, "loss": 17.6413, "step": 473 }, { "epoch": 0.04228556135420848, "grad_norm": 207.03326416015625, "learning_rate": 3e-06, "loss": 15.9259, "step": 474 }, { "epoch": 0.042374771399259555, "grad_norm": 141.62599182128906, "learning_rate": 3e-06, "loss": 12.4355, "step": 475 }, { "epoch": 0.04246398144431063, "grad_norm": 180.9537353515625, "learning_rate": 3e-06, "loss": 32.9299, "step": 476 }, { "epoch": 0.0425531914893617, "grad_norm": 163.92254638671875, "learning_rate": 3e-06, "loss": 17.1089, "step": 477 }, { "epoch": 0.042642401534412774, "grad_norm": 145.9250030517578, "learning_rate": 3e-06, "loss": 17.069, "step": 478 }, { "epoch": 0.04273161157946385, "grad_norm": 152.68319702148438, "learning_rate": 3e-06, "loss": 16.6025, "step": 479 }, { "epoch": 0.04282082162451492, "grad_norm": 222.65997314453125, "learning_rate": 3e-06, "loss": 14.2435, "step": 480 }, { "completion_length": 116.66666793823242, "epoch": 0.042910031669565994, "grad_norm": 166.38722229003906, "learning_rate": 3e-06, "loss": 13.4616, "reward": 1.5636458992958069, "reward_std": 0.6559399664402008, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1469791643321514, "step": 481, "zero_std_ratio": 0.0 }, { "epoch": 0.04299924171461707, "grad_norm": 222.22879028320312, "learning_rate": 3e-06, "loss": 18.8121, "step": 482 }, { "epoch": 0.04308845175966814, "grad_norm": 221.17059326171875, "learning_rate": 3e-06, "loss": 21.4877, "step": 483 }, { "epoch": 0.043177661804719214, "grad_norm": 142.53189086914062, "learning_rate": 3e-06, "loss": 16.8492, "step": 484 }, { "epoch": 0.04326687184977029, "grad_norm": 170.13198852539062, "learning_rate": 3e-06, "loss": 20.7898, "step": 485 }, { "epoch": 0.043356081894821354, "grad_norm": 161.23110961914062, "learning_rate": 3e-06, "loss": 12.8851, "step": 486 }, { "epoch": 0.04344529193987243, "grad_norm": 175.66587829589844, "learning_rate": 3e-06, "loss": 10.8873, "step": 487 }, { "epoch": 0.0435345019849235, "grad_norm": 195.75050354003906, "learning_rate": 3e-06, "loss": 15.6694, "step": 488 }, { "epoch": 0.04362371202997457, "grad_norm": 190.2042236328125, "learning_rate": 3e-06, "loss": 19.0926, "step": 489 }, { "epoch": 0.04371292207502565, "grad_norm": 146.10504150390625, "learning_rate": 3e-06, "loss": 13.8907, "step": 490 }, { "epoch": 0.04380213212007672, "grad_norm": 149.26614379882812, "learning_rate": 3e-06, "loss": 17.0683, "step": 491 }, { "epoch": 0.04389134216512779, "grad_norm": 178.4593963623047, "learning_rate": 3e-06, "loss": 10.2799, "step": 492 }, { "completion_length": 137.50000762939453, "epoch": 0.043980552210178867, "grad_norm": 87.93815612792969, "learning_rate": 3e-06, "loss": -35.7257, "reward": 2.5712709426879883, "reward_std": 0.18458116799592972, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11293749511241913, "step": 493, "zero_std_ratio": 0.0 }, { "epoch": 0.04406976225522994, "grad_norm": 78.96405029296875, "learning_rate": 3e-06, "loss": -37.1011, "step": 494 }, { "epoch": 0.04415897230028101, "grad_norm": 113.70293426513672, "learning_rate": 3e-06, "loss": -46.3151, "step": 495 }, { "epoch": 0.044248182345332086, "grad_norm": 90.45478820800781, "learning_rate": 3e-06, "loss": -43.5143, "step": 496 }, { "epoch": 0.04433739239038316, "grad_norm": 106.42904663085938, "learning_rate": 3e-06, "loss": -42.7944, "step": 497 }, { "epoch": 0.04442660243543423, "grad_norm": 106.20608520507812, "learning_rate": 3e-06, "loss": -48.1815, "step": 498 }, { "epoch": 0.0445158124804853, "grad_norm": 93.41876220703125, "learning_rate": 3e-06, "loss": -37.7992, "step": 499 }, { "epoch": 0.04460502252553637, "grad_norm": 91.40050506591797, "learning_rate": 3e-06, "loss": -38.651, "step": 500 }, { "epoch": 0.044694232570587446, "grad_norm": 116.4251480102539, "learning_rate": 3e-06, "loss": -49.2276, "step": 501 }, { "epoch": 0.04478344261563852, "grad_norm": 92.2903060913086, "learning_rate": 3e-06, "loss": -46.591, "step": 502 }, { "epoch": 0.04487265266068959, "grad_norm": 110.7293472290039, "learning_rate": 3e-06, "loss": -45.7648, "step": 503 }, { "epoch": 0.044961862705740666, "grad_norm": 108.03923797607422, "learning_rate": 3e-06, "loss": -51.9054, "step": 504 }, { "completion_length": 131.14583587646484, "epoch": 0.04505107275079174, "grad_norm": 452.2289733886719, "learning_rate": 3e-06, "loss": 29.1255, "reward": 1.9235833883285522, "reward_std": 0.813209742307663, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16316666454076767, "step": 505, "zero_std_ratio": 0.0 }, { "epoch": 0.04514028279584281, "grad_norm": 284.8124694824219, "learning_rate": 3e-06, "loss": 31.7163, "step": 506 }, { "epoch": 0.045229492840893885, "grad_norm": 361.7442626953125, "learning_rate": 3e-06, "loss": 27.2282, "step": 507 }, { "epoch": 0.04531870288594496, "grad_norm": 283.879638671875, "learning_rate": 3e-06, "loss": 42.2817, "step": 508 }, { "epoch": 0.04540791293099603, "grad_norm": 316.19000244140625, "learning_rate": 3e-06, "loss": 26.7891, "step": 509 }, { "epoch": 0.045497122976047105, "grad_norm": 370.62652587890625, "learning_rate": 3e-06, "loss": 34.1636, "step": 510 }, { "epoch": 0.04558633302109818, "grad_norm": 273.1391296386719, "learning_rate": 3e-06, "loss": 27.6705, "step": 511 }, { "epoch": 0.04567554306614925, "grad_norm": 307.9808044433594, "learning_rate": 3e-06, "loss": 28.9577, "step": 512 }, { "epoch": 0.04576475311120032, "grad_norm": 374.3335876464844, "learning_rate": 3e-06, "loss": 22.2669, "step": 513 }, { "epoch": 0.04585396315625139, "grad_norm": 395.0052795410156, "learning_rate": 3e-06, "loss": 39.239, "step": 514 }, { "epoch": 0.045943173201302465, "grad_norm": 539.7128295898438, "learning_rate": 3e-06, "loss": 25.6475, "step": 515 }, { "epoch": 0.04603238324635354, "grad_norm": 348.81170654296875, "learning_rate": 3e-06, "loss": 31.7762, "step": 516 }, { "completion_length": 115.83333587646484, "epoch": 0.04612159329140461, "grad_norm": 177.44480895996094, "learning_rate": 3e-06, "loss": -44.7672, "reward": 2.343208432197571, "reward_std": 0.4359329864382744, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18695833534002304, "step": 517, "zero_std_ratio": 0.0 }, { "epoch": 0.046210803336455684, "grad_norm": 199.65908813476562, "learning_rate": 3e-06, "loss": -51.4622, "step": 518 }, { "epoch": 0.04630001338150676, "grad_norm": 175.2034149169922, "learning_rate": 3e-06, "loss": -55.1536, "step": 519 }, { "epoch": 0.04638922342655783, "grad_norm": 160.91688537597656, "learning_rate": 3e-06, "loss": -37.3164, "step": 520 }, { "epoch": 0.046478433471608904, "grad_norm": 165.5592498779297, "learning_rate": 3e-06, "loss": -42.9147, "step": 521 }, { "epoch": 0.04656764351665998, "grad_norm": 154.5955047607422, "learning_rate": 3e-06, "loss": -48.9731, "step": 522 }, { "epoch": 0.04665685356171105, "grad_norm": 202.06838989257812, "learning_rate": 3e-06, "loss": -46.1992, "step": 523 }, { "epoch": 0.046746063606762124, "grad_norm": 216.6766357421875, "learning_rate": 3e-06, "loss": -54.2271, "step": 524 }, { "epoch": 0.0468352736518132, "grad_norm": 212.4103240966797, "learning_rate": 3e-06, "loss": -59.7351, "step": 525 }, { "epoch": 0.046924483696864264, "grad_norm": 160.86546325683594, "learning_rate": 3e-06, "loss": -40.5866, "step": 526 }, { "epoch": 0.04701369374191534, "grad_norm": 171.24478149414062, "learning_rate": 3e-06, "loss": -47.2665, "step": 527 }, { "epoch": 0.04710290378696641, "grad_norm": 165.52357482910156, "learning_rate": 3e-06, "loss": -53.477, "step": 528 }, { "completion_length": 119.77083587646484, "epoch": 0.047192113832017483, "grad_norm": 382.3139953613281, "learning_rate": 3e-06, "loss": 86.4678, "reward": 1.9181458950042725, "reward_std": 0.6776820421218872, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1681458279490471, "step": 529, "zero_std_ratio": 0.0 }, { "epoch": 0.04728132387706856, "grad_norm": 264.3744812011719, "learning_rate": 3e-06, "loss": 87.2703, "step": 530 }, { "epoch": 0.04737053392211963, "grad_norm": 273.3477783203125, "learning_rate": 3e-06, "loss": 74.9615, "step": 531 }, { "epoch": 0.0474597439671707, "grad_norm": 326.87078857421875, "learning_rate": 3e-06, "loss": 90.7838, "step": 532 }, { "epoch": 0.04754895401222178, "grad_norm": 294.74041748046875, "learning_rate": 3e-06, "loss": 102.1572, "step": 533 }, { "epoch": 0.04763816405727285, "grad_norm": 312.48626708984375, "learning_rate": 3e-06, "loss": 93.4423, "step": 534 }, { "epoch": 0.04772737410232392, "grad_norm": 383.2833557128906, "learning_rate": 3e-06, "loss": 86.6238, "step": 535 }, { "epoch": 0.047816584147374996, "grad_norm": 316.42926025390625, "learning_rate": 3e-06, "loss": 86.6783, "step": 536 }, { "epoch": 0.04790579419242607, "grad_norm": 268.37506103515625, "learning_rate": 3e-06, "loss": 72.0206, "step": 537 }, { "epoch": 0.04799500423747714, "grad_norm": 337.4726867675781, "learning_rate": 3e-06, "loss": 87.3427, "step": 538 }, { "epoch": 0.04808421428252821, "grad_norm": 284.79827880859375, "learning_rate": 3e-06, "loss": 98.66, "step": 539 }, { "epoch": 0.04817342432757928, "grad_norm": 321.74609375, "learning_rate": 3e-06, "loss": 87.6142, "step": 540 }, { "completion_length": 125.37500762939453, "epoch": 0.048262634372630356, "grad_norm": 187.67727661132812, "learning_rate": 3e-06, "loss": -1.5514, "reward": 1.9883333444595337, "reward_std": 0.6124217808246613, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1341666616499424, "step": 541, "zero_std_ratio": 0.0 }, { "epoch": 0.04835184441768143, "grad_norm": 202.66070556640625, "learning_rate": 3e-06, "loss": 1.9568, "step": 542 }, { "epoch": 0.0484410544627325, "grad_norm": 180.55126953125, "learning_rate": 3e-06, "loss": -6.8013, "step": 543 }, { "epoch": 0.048530264507783576, "grad_norm": 161.08514404296875, "learning_rate": 3e-06, "loss": -3.4725, "step": 544 }, { "epoch": 0.04861947455283465, "grad_norm": 220.28076171875, "learning_rate": 3e-06, "loss": 0.0972, "step": 545 }, { "epoch": 0.04870868459788572, "grad_norm": 321.00994873046875, "learning_rate": 3e-06, "loss": -10.5751, "step": 546 }, { "epoch": 0.048797894642936795, "grad_norm": 197.3623046875, "learning_rate": 3e-06, "loss": -2.6929, "step": 547 }, { "epoch": 0.04888710468798787, "grad_norm": 213.94691467285156, "learning_rate": 3e-06, "loss": 1.3407, "step": 548 }, { "epoch": 0.04897631473303894, "grad_norm": 254.2111053466797, "learning_rate": 3e-06, "loss": -7.3544, "step": 549 }, { "epoch": 0.049065524778090015, "grad_norm": 155.93460083007812, "learning_rate": 3e-06, "loss": -5.7191, "step": 550 }, { "epoch": 0.04915473482314109, "grad_norm": 175.17147827148438, "learning_rate": 3e-06, "loss": -2.0149, "step": 551 }, { "epoch": 0.04924394486819216, "grad_norm": 220.0244140625, "learning_rate": 3e-06, "loss": -12.1544, "step": 552 }, { "completion_length": 138.62500762939453, "epoch": 0.04933315491324323, "grad_norm": 255.7532501220703, "learning_rate": 3e-06, "loss": -42.4297, "reward": 2.4352500438690186, "reward_std": 0.32932066917419434, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12275000661611557, "step": 553, "zero_std_ratio": 0.0 }, { "epoch": 0.0494223649582943, "grad_norm": 263.04913330078125, "learning_rate": 3e-06, "loss": -27.7121, "step": 554 }, { "epoch": 0.049511575003345375, "grad_norm": 187.12600708007812, "learning_rate": 3e-06, "loss": -32.293, "step": 555 }, { "epoch": 0.04960078504839645, "grad_norm": 233.73802185058594, "learning_rate": 3e-06, "loss": -34.2649, "step": 556 }, { "epoch": 0.04968999509344752, "grad_norm": 238.03567504882812, "learning_rate": 3e-06, "loss": -48.0373, "step": 557 }, { "epoch": 0.049779205138498595, "grad_norm": 211.17440795898438, "learning_rate": 3e-06, "loss": -34.5469, "step": 558 }, { "epoch": 0.04986841518354967, "grad_norm": 209.6473388671875, "learning_rate": 3e-06, "loss": -45.0123, "step": 559 }, { "epoch": 0.04995762522860074, "grad_norm": 219.1716766357422, "learning_rate": 3e-06, "loss": -29.3907, "step": 560 }, { "epoch": 0.050046835273651814, "grad_norm": 194.5946502685547, "learning_rate": 3e-06, "loss": -33.3813, "step": 561 }, { "epoch": 0.05013604531870289, "grad_norm": 230.82928466796875, "learning_rate": 3e-06, "loss": -37.409, "step": 562 }, { "epoch": 0.05022525536375396, "grad_norm": 268.3168640136719, "learning_rate": 3e-06, "loss": -50.4616, "step": 563 }, { "epoch": 0.050314465408805034, "grad_norm": 225.2816925048828, "learning_rate": 3e-06, "loss": -36.9552, "step": 564 }, { "completion_length": 138.81250381469727, "epoch": 0.05040367545385611, "grad_norm": 236.8852996826172, "learning_rate": 3e-06, "loss": 45.2003, "reward": 1.8692501783370972, "reward_std": 0.7652427852153778, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15049999579787254, "step": 565, "zero_std_ratio": 0.0 }, { "epoch": 0.050492885498907174, "grad_norm": 258.247802734375, "learning_rate": 3e-06, "loss": 49.6383, "step": 566 }, { "epoch": 0.05058209554395825, "grad_norm": 318.1617126464844, "learning_rate": 3e-06, "loss": 58.9489, "step": 567 }, { "epoch": 0.05067130558900932, "grad_norm": 226.23045349121094, "learning_rate": 3e-06, "loss": 44.3005, "step": 568 }, { "epoch": 0.050760515634060394, "grad_norm": 303.47760009765625, "learning_rate": 3e-06, "loss": 28.8653, "step": 569 }, { "epoch": 0.05084972567911147, "grad_norm": 248.53013610839844, "learning_rate": 3e-06, "loss": 45.7073, "step": 570 }, { "epoch": 0.05093893572416254, "grad_norm": 228.76365661621094, "learning_rate": 3e-06, "loss": 40.9719, "step": 571 }, { "epoch": 0.05102814576921361, "grad_norm": 236.98915100097656, "learning_rate": 3e-06, "loss": 44.3296, "step": 572 }, { "epoch": 0.05111735581426469, "grad_norm": 318.7423400878906, "learning_rate": 3e-06, "loss": 54.9659, "step": 573 }, { "epoch": 0.05120656585931576, "grad_norm": 226.2831268310547, "learning_rate": 3e-06, "loss": 40.0793, "step": 574 }, { "epoch": 0.05129577590436683, "grad_norm": 321.7300109863281, "learning_rate": 3e-06, "loss": 25.4049, "step": 575 }, { "epoch": 0.051384985949417906, "grad_norm": 225.1118927001953, "learning_rate": 3e-06, "loss": 42.5627, "step": 576 }, { "completion_length": 114.56250381469727, "epoch": 0.05147419599446898, "grad_norm": 151.06048583984375, "learning_rate": 3e-06, "loss": 33.113, "reward": 2.1812918186187744, "reward_std": 0.530484139919281, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16045832633972168, "step": 577, "zero_std_ratio": 0.0 }, { "epoch": 0.05156340603952005, "grad_norm": 182.15431213378906, "learning_rate": 3e-06, "loss": 19.2473, "step": 578 }, { "epoch": 0.05165261608457112, "grad_norm": 169.3698272705078, "learning_rate": 3e-06, "loss": 25.8161, "step": 579 }, { "epoch": 0.05174182612962219, "grad_norm": 155.8734588623047, "learning_rate": 3e-06, "loss": 19.9518, "step": 580 }, { "epoch": 0.051831036174673266, "grad_norm": 177.54641723632812, "learning_rate": 3e-06, "loss": 23.4249, "step": 581 }, { "epoch": 0.05192024621972434, "grad_norm": 143.13719177246094, "learning_rate": 3e-06, "loss": 7.9228, "step": 582 }, { "epoch": 0.05200945626477541, "grad_norm": 146.39906311035156, "learning_rate": 3e-06, "loss": 30.8804, "step": 583 }, { "epoch": 0.052098666309826486, "grad_norm": 166.2614288330078, "learning_rate": 3e-06, "loss": 17.3062, "step": 584 }, { "epoch": 0.05218787635487756, "grad_norm": 158.1491241455078, "learning_rate": 3e-06, "loss": 23.1792, "step": 585 }, { "epoch": 0.05227708639992863, "grad_norm": 124.15723419189453, "learning_rate": 3e-06, "loss": 18.8908, "step": 586 }, { "epoch": 0.052366296444979706, "grad_norm": 155.26602172851562, "learning_rate": 3e-06, "loss": 21.968, "step": 587 }, { "epoch": 0.05245550649003078, "grad_norm": 155.0635528564453, "learning_rate": 3e-06, "loss": 6.3548, "step": 588 }, { "completion_length": 161.45833587646484, "epoch": 0.05254471653508185, "grad_norm": 317.9678649902344, "learning_rate": 3e-06, "loss": 6.4115, "reward": 1.8582292199134827, "reward_std": 0.48372724652290344, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03531250241212547, "step": 589, "zero_std_ratio": 0.0 }, { "epoch": 0.052633926580132925, "grad_norm": 267.894287109375, "learning_rate": 3e-06, "loss": 6.0494, "step": 590 }, { "epoch": 0.052723136625184, "grad_norm": 197.32470703125, "learning_rate": 3e-06, "loss": 8.4095, "step": 591 }, { "epoch": 0.05281234667023507, "grad_norm": 208.84291076660156, "learning_rate": 3e-06, "loss": 2.183, "step": 592 }, { "epoch": 0.05290155671528614, "grad_norm": 213.50672912597656, "learning_rate": 3e-06, "loss": -5.6327, "step": 593 }, { "epoch": 0.05299076676033721, "grad_norm": 264.34210205078125, "learning_rate": 3e-06, "loss": 0.0463, "step": 594 }, { "epoch": 0.053079976805388285, "grad_norm": 269.38372802734375, "learning_rate": 3e-06, "loss": 3.4921, "step": 595 }, { "epoch": 0.05316918685043936, "grad_norm": 233.82005310058594, "learning_rate": 3e-06, "loss": 3.9645, "step": 596 }, { "epoch": 0.05325839689549043, "grad_norm": 174.2704620361328, "learning_rate": 3e-06, "loss": 6.6277, "step": 597 }, { "epoch": 0.053347606940541505, "grad_norm": 197.27203369140625, "learning_rate": 3e-06, "loss": -0.034, "step": 598 }, { "epoch": 0.05343681698559258, "grad_norm": 195.1741943359375, "learning_rate": 3e-06, "loss": -7.7007, "step": 599 }, { "epoch": 0.05352602703064365, "grad_norm": 218.35403442382812, "learning_rate": 3e-06, "loss": -2.732, "step": 600 }, { "completion_length": 163.20833587646484, "epoch": 0.053615237075694724, "grad_norm": 89.40003204345703, "learning_rate": 3e-06, "loss": -15.5412, "reward": 1.958250105381012, "reward_std": 0.20268601924180984, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06241666525602341, "step": 601, "zero_std_ratio": 0.0 }, { "epoch": 0.0537044471207458, "grad_norm": 85.92318725585938, "learning_rate": 3e-06, "loss": -19.9741, "step": 602 }, { "epoch": 0.05379365716579687, "grad_norm": 89.12326049804688, "learning_rate": 3e-06, "loss": -28.9968, "step": 603 }, { "epoch": 0.053882867210847944, "grad_norm": 109.64755249023438, "learning_rate": 3e-06, "loss": -21.6802, "step": 604 }, { "epoch": 0.05397207725589902, "grad_norm": 99.89476776123047, "learning_rate": 3e-06, "loss": -16.4453, "step": 605 }, { "epoch": 0.054061287300950084, "grad_norm": 140.71066284179688, "learning_rate": 3e-06, "loss": -24.1026, "step": 606 }, { "epoch": 0.05415049734600116, "grad_norm": 80.75763702392578, "learning_rate": 3e-06, "loss": -16.613, "step": 607 }, { "epoch": 0.05423970739105223, "grad_norm": 85.42610168457031, "learning_rate": 3e-06, "loss": -21.2449, "step": 608 }, { "epoch": 0.054328917436103304, "grad_norm": 93.39994812011719, "learning_rate": 3e-06, "loss": -30.1845, "step": 609 }, { "epoch": 0.05441812748115438, "grad_norm": 96.1513671875, "learning_rate": 3e-06, "loss": -22.8781, "step": 610 }, { "epoch": 0.05450733752620545, "grad_norm": 98.65193176269531, "learning_rate": 3e-06, "loss": -17.6772, "step": 611 }, { "epoch": 0.05459654757125652, "grad_norm": 130.4186248779297, "learning_rate": 3e-06, "loss": -25.9751, "step": 612 }, { "completion_length": 129.4375, "epoch": 0.0546857576163076, "grad_norm": 178.97259521484375, "learning_rate": 3e-06, "loss": -41.7741, "reward": 1.9304792881011963, "reward_std": 0.4192664921283722, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14922916889190674, "step": 613, "zero_std_ratio": 0.0 }, { "epoch": 0.05477496766135867, "grad_norm": 159.00680541992188, "learning_rate": 3e-06, "loss": -36.4024, "step": 614 }, { "epoch": 0.05486417770640974, "grad_norm": 154.65304565429688, "learning_rate": 3e-06, "loss": -35.2622, "step": 615 }, { "epoch": 0.05495338775146082, "grad_norm": 239.50408935546875, "learning_rate": 3e-06, "loss": -34.9883, "step": 616 }, { "epoch": 0.05504259779651189, "grad_norm": 191.45263671875, "learning_rate": 3e-06, "loss": -30.4894, "step": 617 }, { "epoch": 0.05513180784156296, "grad_norm": 172.67025756835938, "learning_rate": 3e-06, "loss": -40.9277, "step": 618 }, { "epoch": 0.05522101788661403, "grad_norm": 180.4842071533203, "learning_rate": 3e-06, "loss": -44.2844, "step": 619 }, { "epoch": 0.0553102279316651, "grad_norm": 175.10528564453125, "learning_rate": 3e-06, "loss": -39.7696, "step": 620 }, { "epoch": 0.055399437976716176, "grad_norm": 224.33847045898438, "learning_rate": 3e-06, "loss": -38.3443, "step": 621 }, { "epoch": 0.05548864802176725, "grad_norm": 196.98231506347656, "learning_rate": 3e-06, "loss": -36.4789, "step": 622 }, { "epoch": 0.05557785806681832, "grad_norm": 205.22146606445312, "learning_rate": 3e-06, "loss": -32.5847, "step": 623 }, { "epoch": 0.055667068111869396, "grad_norm": 189.9784393310547, "learning_rate": 3e-06, "loss": -43.7861, "step": 624 }, { "completion_length": 135.1666717529297, "epoch": 0.05575627815692047, "grad_norm": 190.61593627929688, "learning_rate": 3e-06, "loss": -23.515, "reward": 1.7549793124198914, "reward_std": 0.5575149804353714, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14039582759141922, "step": 625, "zero_std_ratio": 0.0 }, { "epoch": 0.05584548820197154, "grad_norm": 263.34173583984375, "learning_rate": 3e-06, "loss": -26.4535, "step": 626 }, { "epoch": 0.055934698247022616, "grad_norm": 203.12969970703125, "learning_rate": 3e-06, "loss": -20.2943, "step": 627 }, { "epoch": 0.05602390829207369, "grad_norm": 186.3466033935547, "learning_rate": 3e-06, "loss": -22.5875, "step": 628 }, { "epoch": 0.05611311833712476, "grad_norm": 206.43478393554688, "learning_rate": 3e-06, "loss": -14.8606, "step": 629 }, { "epoch": 0.056202328382175835, "grad_norm": 230.95394897460938, "learning_rate": 3e-06, "loss": -11.2027, "step": 630 }, { "epoch": 0.05629153842722691, "grad_norm": 208.40184020996094, "learning_rate": 3e-06, "loss": -24.9903, "step": 631 }, { "epoch": 0.056380748472277975, "grad_norm": 283.0361328125, "learning_rate": 3e-06, "loss": -28.8647, "step": 632 }, { "epoch": 0.05646995851732905, "grad_norm": 243.43634033203125, "learning_rate": 3e-06, "loss": -20.9854, "step": 633 }, { "epoch": 0.05655916856238012, "grad_norm": 196.8306121826172, "learning_rate": 3e-06, "loss": -25.1397, "step": 634 }, { "epoch": 0.056648378607431195, "grad_norm": 204.37130737304688, "learning_rate": 3e-06, "loss": -17.3989, "step": 635 }, { "epoch": 0.05673758865248227, "grad_norm": 222.9701385498047, "learning_rate": 3e-06, "loss": -14.9143, "step": 636 }, { "completion_length": 117.18750381469727, "epoch": 0.05682679869753334, "grad_norm": 152.56382751464844, "learning_rate": 3e-06, "loss": -53.4148, "reward": 2.4275625944137573, "reward_std": 0.3748088702559471, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18797916173934937, "step": 637, "zero_std_ratio": 0.0 }, { "epoch": 0.056916008742584415, "grad_norm": 187.37356567382812, "learning_rate": 3e-06, "loss": -49.109, "step": 638 }, { "epoch": 0.05700521878763549, "grad_norm": 188.10548400878906, "learning_rate": 3e-06, "loss": -49.7143, "step": 639 }, { "epoch": 0.05709442883268656, "grad_norm": 152.7540283203125, "learning_rate": 3e-06, "loss": -46.4195, "step": 640 }, { "epoch": 0.057183638877737634, "grad_norm": 191.8466796875, "learning_rate": 3e-06, "loss": -54.4961, "step": 641 }, { "epoch": 0.05727284892278871, "grad_norm": 160.72772216796875, "learning_rate": 3e-06, "loss": -46.0513, "step": 642 }, { "epoch": 0.05736205896783978, "grad_norm": 163.2805938720703, "learning_rate": 3e-06, "loss": -55.7387, "step": 643 }, { "epoch": 0.057451269012890854, "grad_norm": 189.55470275878906, "learning_rate": 3e-06, "loss": -51.4447, "step": 644 }, { "epoch": 0.05754047905794193, "grad_norm": 182.82583618164062, "learning_rate": 3e-06, "loss": -51.1561, "step": 645 }, { "epoch": 0.057629689102992994, "grad_norm": 177.5033721923828, "learning_rate": 3e-06, "loss": -49.5066, "step": 646 }, { "epoch": 0.05771889914804407, "grad_norm": 208.1852569580078, "learning_rate": 3e-06, "loss": -59.1063, "step": 647 }, { "epoch": 0.05780810919309514, "grad_norm": 185.0785369873047, "learning_rate": 3e-06, "loss": -49.7641, "step": 648 }, { "completion_length": 127.75000762939453, "epoch": 0.057897319238146214, "grad_norm": 332.4126892089844, "learning_rate": 3e-06, "loss": 24.9465, "reward": 2.10916668176651, "reward_std": 0.7774414718151093, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15083333104848862, "step": 649, "zero_std_ratio": 0.0 }, { "epoch": 0.05798652928319729, "grad_norm": 274.673828125, "learning_rate": 3e-06, "loss": 19.6904, "step": 650 }, { "epoch": 0.05807573932824836, "grad_norm": 225.66183471679688, "learning_rate": 3e-06, "loss": 17.079, "step": 651 }, { "epoch": 0.058164949373299434, "grad_norm": 236.45230102539062, "learning_rate": 3e-06, "loss": 16.4363, "step": 652 }, { "epoch": 0.05825415941835051, "grad_norm": 304.5491943359375, "learning_rate": 3e-06, "loss": 33.7894, "step": 653 }, { "epoch": 0.05834336946340158, "grad_norm": 294.2102966308594, "learning_rate": 3e-06, "loss": 12.1637, "step": 654 }, { "epoch": 0.05843257950845265, "grad_norm": 307.0853271484375, "learning_rate": 3e-06, "loss": 23.9968, "step": 655 }, { "epoch": 0.05852178955350373, "grad_norm": 267.52960205078125, "learning_rate": 3e-06, "loss": 18.2069, "step": 656 }, { "epoch": 0.0586109995985548, "grad_norm": 219.6736602783203, "learning_rate": 3e-06, "loss": 16.1471, "step": 657 }, { "epoch": 0.05870020964360587, "grad_norm": 312.0810852050781, "learning_rate": 3e-06, "loss": 15.3629, "step": 658 }, { "epoch": 0.05878941968865694, "grad_norm": 355.7622985839844, "learning_rate": 3e-06, "loss": 32.1285, "step": 659 }, { "epoch": 0.05887862973370801, "grad_norm": 243.60047912597656, "learning_rate": 3e-06, "loss": 10.9719, "step": 660 }, { "completion_length": 128.37500381469727, "epoch": 0.058967839778759086, "grad_norm": 160.17491149902344, "learning_rate": 3e-06, "loss": -4.7968, "reward": 2.220729112625122, "reward_std": 0.37828393280506134, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13739583641290665, "step": 661, "zero_std_ratio": 0.0 }, { "epoch": 0.05905704982381016, "grad_norm": 213.04843139648438, "learning_rate": 3e-06, "loss": -17.5158, "step": 662 }, { "epoch": 0.05914625986886123, "grad_norm": 360.1976318359375, "learning_rate": 3e-06, "loss": -5.8696, "step": 663 }, { "epoch": 0.059235469913912306, "grad_norm": 165.9031524658203, "learning_rate": 3e-06, "loss": 9.1976, "step": 664 }, { "epoch": 0.05932467995896338, "grad_norm": 185.02491760253906, "learning_rate": 3e-06, "loss": 0.8362, "step": 665 }, { "epoch": 0.05941389000401445, "grad_norm": 186.4868927001953, "learning_rate": 3e-06, "loss": -4.8959, "step": 666 }, { "epoch": 0.059503100049065526, "grad_norm": 172.38906860351562, "learning_rate": 3e-06, "loss": -8.3078, "step": 667 }, { "epoch": 0.0595923100941166, "grad_norm": 205.17637634277344, "learning_rate": 3e-06, "loss": -19.4471, "step": 668 }, { "epoch": 0.05968152013916767, "grad_norm": 412.4108581542969, "learning_rate": 3e-06, "loss": -9.0223, "step": 669 }, { "epoch": 0.059770730184218746, "grad_norm": 165.48020935058594, "learning_rate": 3e-06, "loss": 5.0711, "step": 670 }, { "epoch": 0.05985994022926982, "grad_norm": 185.8058624267578, "learning_rate": 3e-06, "loss": -3.3781, "step": 671 }, { "epoch": 0.059949150274320885, "grad_norm": 214.62686157226562, "learning_rate": 3e-06, "loss": -9.8208, "step": 672 }, { "completion_length": 110.20833587646484, "epoch": 0.06003836031937196, "grad_norm": 223.97824096679688, "learning_rate": 3e-06, "loss": 17.6495, "reward": 1.9757083654403687, "reward_std": 0.7511122822761536, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16320832818746567, "step": 673, "zero_std_ratio": 0.0 }, { "epoch": 0.06012757036442303, "grad_norm": 180.975341796875, "learning_rate": 3e-06, "loss": -3.9399, "step": 674 }, { "epoch": 0.060216780409474105, "grad_norm": 222.6065216064453, "learning_rate": 3e-06, "loss": 10.3809, "step": 675 }, { "epoch": 0.06030599045452518, "grad_norm": 192.075439453125, "learning_rate": 3e-06, "loss": 14.9998, "step": 676 }, { "epoch": 0.06039520049957625, "grad_norm": 200.7101593017578, "learning_rate": 3e-06, "loss": 2.2906, "step": 677 }, { "epoch": 0.060484410544627325, "grad_norm": 200.1593017578125, "learning_rate": 3e-06, "loss": 13.137, "step": 678 }, { "epoch": 0.0605736205896784, "grad_norm": 185.83168029785156, "learning_rate": 3e-06, "loss": 16.2497, "step": 679 }, { "epoch": 0.06066283063472947, "grad_norm": 208.6911163330078, "learning_rate": 3e-06, "loss": -5.308, "step": 680 }, { "epoch": 0.060752040679780545, "grad_norm": 231.81312561035156, "learning_rate": 3e-06, "loss": 8.1785, "step": 681 }, { "epoch": 0.06084125072483162, "grad_norm": 187.45535278320312, "learning_rate": 3e-06, "loss": 12.5542, "step": 682 }, { "epoch": 0.06093046076988269, "grad_norm": 182.04257202148438, "learning_rate": 3e-06, "loss": -1.0645, "step": 683 }, { "epoch": 0.061019670814933764, "grad_norm": 186.71937561035156, "learning_rate": 3e-06, "loss": 10.3416, "step": 684 }, { "completion_length": 130.4791717529297, "epoch": 0.06110888085998484, "grad_norm": 169.67178344726562, "learning_rate": 3e-06, "loss": 32.9847, "reward": 2.2736042737960815, "reward_std": 0.5084125399589539, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14860416948795319, "step": 685, "zero_std_ratio": 0.0 }, { "epoch": 0.061198090905035904, "grad_norm": 231.2995147705078, "learning_rate": 3e-06, "loss": 18.3002, "step": 686 }, { "epoch": 0.06128730095008698, "grad_norm": 162.75083923339844, "learning_rate": 3e-06, "loss": 29.4583, "step": 687 }, { "epoch": 0.06137651099513805, "grad_norm": 209.54966735839844, "learning_rate": 3e-06, "loss": 42.4661, "step": 688 }, { "epoch": 0.061465721040189124, "grad_norm": 396.1800537109375, "learning_rate": 3e-06, "loss": 42.6963, "step": 689 }, { "epoch": 0.0615549310852402, "grad_norm": 144.02049255371094, "learning_rate": 3e-06, "loss": 23.2087, "step": 690 }, { "epoch": 0.06164414113029127, "grad_norm": 179.72213745117188, "learning_rate": 3e-06, "loss": 28.8186, "step": 691 }, { "epoch": 0.061733351175342344, "grad_norm": 143.9263153076172, "learning_rate": 3e-06, "loss": 16.2638, "step": 692 }, { "epoch": 0.06182256122039342, "grad_norm": 148.50750732421875, "learning_rate": 3e-06, "loss": 27.5098, "step": 693 }, { "epoch": 0.06191177126544449, "grad_norm": 178.09515380859375, "learning_rate": 3e-06, "loss": 38.1567, "step": 694 }, { "epoch": 0.06200098131049556, "grad_norm": 319.7694396972656, "learning_rate": 3e-06, "loss": 36.9073, "step": 695 }, { "epoch": 0.06209019135554664, "grad_norm": 137.5644989013672, "learning_rate": 3e-06, "loss": 19.9691, "step": 696 }, { "completion_length": 138.20833587646484, "epoch": 0.06217940140059771, "grad_norm": 243.18804931640625, "learning_rate": 3e-06, "loss": 63.9782, "reward": 1.6311666369438171, "reward_std": 0.6314830482006073, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13116667047142982, "step": 697, "zero_std_ratio": 0.0 }, { "epoch": 0.06226861144564878, "grad_norm": 248.19979858398438, "learning_rate": 3e-06, "loss": 62.0708, "step": 698 }, { "epoch": 0.06235782149069985, "grad_norm": 192.6903839111328, "learning_rate": 3e-06, "loss": 44.2788, "step": 699 }, { "epoch": 0.06244703153575092, "grad_norm": 187.61729431152344, "learning_rate": 3e-06, "loss": 45.635, "step": 700 }, { "epoch": 0.062536241580802, "grad_norm": 270.56439208984375, "learning_rate": 3e-06, "loss": 61.8371, "step": 701 }, { "epoch": 0.06262545162585308, "grad_norm": 186.34654235839844, "learning_rate": 3e-06, "loss": 35.0754, "step": 702 }, { "epoch": 0.06271466167090414, "grad_norm": 193.72666931152344, "learning_rate": 3e-06, "loss": 56.4102, "step": 703 }, { "epoch": 0.06280387171595522, "grad_norm": 213.4738006591797, "learning_rate": 3e-06, "loss": 55.6052, "step": 704 }, { "epoch": 0.06289308176100629, "grad_norm": 178.9663543701172, "learning_rate": 3e-06, "loss": 38.9399, "step": 705 }, { "epoch": 0.06298229180605736, "grad_norm": 155.2235870361328, "learning_rate": 3e-06, "loss": 40.2563, "step": 706 }, { "epoch": 0.06307150185110844, "grad_norm": 190.83424377441406, "learning_rate": 3e-06, "loss": 54.1227, "step": 707 }, { "epoch": 0.0631607118961595, "grad_norm": 151.27175903320312, "learning_rate": 3e-06, "loss": 29.2768, "step": 708 }, { "completion_length": 177.89584350585938, "epoch": 0.06324992194121058, "grad_norm": 383.90155029296875, "learning_rate": 3e-06, "loss": -57.4567, "reward": 1.6143542528152466, "reward_std": 1.1507561206817627, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.041437502950429916, "step": 709, "zero_std_ratio": 0.0 }, { "epoch": 0.06333913198626165, "grad_norm": 228.4014892578125, "learning_rate": 3e-06, "loss": -22.536, "step": 710 }, { "epoch": 0.06342834203131273, "grad_norm": 247.7998809814453, "learning_rate": 3e-06, "loss": -34.7459, "step": 711 }, { "epoch": 0.0635175520763638, "grad_norm": 256.2286376953125, "learning_rate": 3e-06, "loss": -30.4245, "step": 712 }, { "epoch": 0.06360676212141488, "grad_norm": 254.9169158935547, "learning_rate": 3e-06, "loss": -38.2843, "step": 713 }, { "epoch": 0.06369597216646594, "grad_norm": 321.43609619140625, "learning_rate": 3e-06, "loss": -35.5213, "step": 714 }, { "epoch": 0.06378518221151702, "grad_norm": 349.0517272949219, "learning_rate": 3e-06, "loss": -55.9526, "step": 715 }, { "epoch": 0.06387439225656809, "grad_norm": 209.25282287597656, "learning_rate": 3e-06, "loss": -22.3323, "step": 716 }, { "epoch": 0.06396360230161917, "grad_norm": 247.7156219482422, "learning_rate": 3e-06, "loss": -35.9537, "step": 717 }, { "epoch": 0.06405281234667023, "grad_norm": 274.3576965332031, "learning_rate": 3e-06, "loss": -33.4144, "step": 718 }, { "epoch": 0.0641420223917213, "grad_norm": 287.4893798828125, "learning_rate": 3e-06, "loss": -40.4848, "step": 719 }, { "epoch": 0.06423123243677238, "grad_norm": 375.7614440917969, "learning_rate": 3e-06, "loss": -39.3635, "step": 720 }, { "completion_length": 109.22916793823242, "epoch": 0.06432044248182345, "grad_norm": 123.53705596923828, "learning_rate": 3e-06, "loss": -30.919, "reward": 2.3711042404174805, "reward_std": 0.4830681085586548, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1940208300948143, "step": 721, "zero_std_ratio": 0.0 }, { "epoch": 0.06440965252687453, "grad_norm": 114.69866943359375, "learning_rate": 3e-06, "loss": -29.2645, "step": 722 }, { "epoch": 0.0644988625719256, "grad_norm": 118.97370910644531, "learning_rate": 3e-06, "loss": -25.0985, "step": 723 }, { "epoch": 0.06458807261697667, "grad_norm": 121.6566162109375, "learning_rate": 3e-06, "loss": -29.0591, "step": 724 }, { "epoch": 0.06467728266202774, "grad_norm": 182.92691040039062, "learning_rate": 3e-06, "loss": -28.0901, "step": 725 }, { "epoch": 0.06476649270707882, "grad_norm": 156.50718688964844, "learning_rate": 3e-06, "loss": -30.7831, "step": 726 }, { "epoch": 0.06485570275212989, "grad_norm": 132.53089904785156, "learning_rate": 3e-06, "loss": -33.3058, "step": 727 }, { "epoch": 0.06494491279718097, "grad_norm": 112.21791076660156, "learning_rate": 3e-06, "loss": -31.8015, "step": 728 }, { "epoch": 0.06503412284223203, "grad_norm": 130.97052001953125, "learning_rate": 3e-06, "loss": -27.7953, "step": 729 }, { "epoch": 0.06512333288728311, "grad_norm": 128.9853515625, "learning_rate": 3e-06, "loss": -32.2601, "step": 730 }, { "epoch": 0.06521254293233418, "grad_norm": 146.0636444091797, "learning_rate": 3e-06, "loss": -31.9068, "step": 731 }, { "epoch": 0.06530175297738525, "grad_norm": 148.94358825683594, "learning_rate": 3e-06, "loss": -35.98, "step": 732 }, { "completion_length": 133.77083587646484, "epoch": 0.06539096302243633, "grad_norm": 200.87583923339844, "learning_rate": 3e-06, "loss": 5.8156, "reward": 1.7448542714118958, "reward_std": 0.7897588908672333, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1302708424627781, "step": 733, "zero_std_ratio": 0.0 }, { "epoch": 0.0654801730674874, "grad_norm": 254.33050537109375, "learning_rate": 3e-06, "loss": 39.6914, "step": 734 }, { "epoch": 0.06556938311253847, "grad_norm": 207.40354919433594, "learning_rate": 3e-06, "loss": 18.8055, "step": 735 }, { "epoch": 0.06565859315758954, "grad_norm": 255.23114013671875, "learning_rate": 3e-06, "loss": 12.5947, "step": 736 }, { "epoch": 0.06574780320264062, "grad_norm": 183.82200622558594, "learning_rate": 3e-06, "loss": 11.0044, "step": 737 }, { "epoch": 0.06583701324769169, "grad_norm": 226.2420654296875, "learning_rate": 3e-06, "loss": 0.4847, "step": 738 }, { "epoch": 0.06592622329274277, "grad_norm": 233.6065673828125, "learning_rate": 3e-06, "loss": 4.5426, "step": 739 }, { "epoch": 0.06601543333779383, "grad_norm": 264.205078125, "learning_rate": 3e-06, "loss": 39.3686, "step": 740 }, { "epoch": 0.06610464338284491, "grad_norm": 241.85284423828125, "learning_rate": 3e-06, "loss": 17.7268, "step": 741 }, { "epoch": 0.06619385342789598, "grad_norm": 221.0516357421875, "learning_rate": 3e-06, "loss": 13.2982, "step": 742 }, { "epoch": 0.06628306347294706, "grad_norm": 212.37222290039062, "learning_rate": 3e-06, "loss": 7.7467, "step": 743 }, { "epoch": 0.06637227351799813, "grad_norm": 263.42919921875, "learning_rate": 3e-06, "loss": -2.1754, "step": 744 }, { "completion_length": 125.64583587646484, "epoch": 0.0664614835630492, "grad_norm": 193.34835815429688, "learning_rate": 3e-06, "loss": 40.2546, "reward": 1.7169584035873413, "reward_std": 0.4941745698451996, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15445833653211594, "step": 745, "zero_std_ratio": 0.0 }, { "epoch": 0.06655069360810027, "grad_norm": 212.19464111328125, "learning_rate": 3e-06, "loss": 41.4745, "step": 746 }, { "epoch": 0.06663990365315134, "grad_norm": 237.55845642089844, "learning_rate": 3e-06, "loss": 47.5379, "step": 747 }, { "epoch": 0.06672911369820242, "grad_norm": 185.4857635498047, "learning_rate": 3e-06, "loss": 43.0404, "step": 748 }, { "epoch": 0.06681832374325349, "grad_norm": 158.21678161621094, "learning_rate": 3e-06, "loss": 32.8558, "step": 749 }, { "epoch": 0.06690753378830457, "grad_norm": 212.29397583007812, "learning_rate": 3e-06, "loss": 41.9557, "step": 750 }, { "epoch": 0.06699674383335563, "grad_norm": 181.2362060546875, "learning_rate": 3e-06, "loss": 38.4185, "step": 751 }, { "epoch": 0.06708595387840671, "grad_norm": 186.73841857910156, "learning_rate": 3e-06, "loss": 37.1992, "step": 752 }, { "epoch": 0.06717516392345778, "grad_norm": 182.0499267578125, "learning_rate": 3e-06, "loss": 42.4944, "step": 753 }, { "epoch": 0.06726437396850886, "grad_norm": 161.4265899658203, "learning_rate": 3e-06, "loss": 40.1143, "step": 754 }, { "epoch": 0.06735358401355993, "grad_norm": 145.66175842285156, "learning_rate": 3e-06, "loss": 29.066, "step": 755 }, { "epoch": 0.067442794058611, "grad_norm": 188.43362426757812, "learning_rate": 3e-06, "loss": 34.8647, "step": 756 }, { "completion_length": 115.64583587646484, "epoch": 0.06753200410366207, "grad_norm": 176.6951446533203, "learning_rate": 3e-06, "loss": 23.7273, "reward": 1.9728541374206543, "reward_std": 0.22583025321364403, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17077083885669708, "step": 757, "zero_std_ratio": 0.0 }, { "epoch": 0.06762121414871315, "grad_norm": 134.009521484375, "learning_rate": 3e-06, "loss": 29.2333, "step": 758 }, { "epoch": 0.06771042419376422, "grad_norm": 131.3091278076172, "learning_rate": 3e-06, "loss": 29.9313, "step": 759 }, { "epoch": 0.06779963423881528, "grad_norm": 128.05767822265625, "learning_rate": 3e-06, "loss": 22.7093, "step": 760 }, { "epoch": 0.06788884428386636, "grad_norm": 115.9305419921875, "learning_rate": 3e-06, "loss": 23.3578, "step": 761 }, { "epoch": 0.06797805432891743, "grad_norm": 127.96971893310547, "learning_rate": 3e-06, "loss": 26.8069, "step": 762 }, { "epoch": 0.06806726437396851, "grad_norm": 172.38279724121094, "learning_rate": 3e-06, "loss": 20.0156, "step": 763 }, { "epoch": 0.06815647441901958, "grad_norm": 122.19217681884766, "learning_rate": 3e-06, "loss": 23.256, "step": 764 }, { "epoch": 0.06824568446407066, "grad_norm": 98.1166763305664, "learning_rate": 3e-06, "loss": 25.2087, "step": 765 }, { "epoch": 0.06833489450912172, "grad_norm": 104.1299819946289, "learning_rate": 3e-06, "loss": 18.9453, "step": 766 }, { "epoch": 0.0684241045541728, "grad_norm": 108.13124084472656, "learning_rate": 3e-06, "loss": 18.3841, "step": 767 }, { "epoch": 0.06851331459922387, "grad_norm": 107.33203887939453, "learning_rate": 3e-06, "loss": 21.8827, "step": 768 }, { "completion_length": 140.7291717529297, "epoch": 0.06860252464427495, "grad_norm": 176.08241271972656, "learning_rate": 3e-06, "loss": 1.9837, "reward": 2.0843957662582397, "reward_std": 0.670438677072525, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11564583331346512, "step": 769, "zero_std_ratio": 0.0 }, { "epoch": 0.06869173468932602, "grad_norm": 184.2529754638672, "learning_rate": 3e-06, "loss": -9.1612, "step": 770 }, { "epoch": 0.0687809447343771, "grad_norm": 124.57987213134766, "learning_rate": 3e-06, "loss": -1.167, "step": 771 }, { "epoch": 0.06887015477942816, "grad_norm": 146.37869262695312, "learning_rate": 3e-06, "loss": 7.9908, "step": 772 }, { "epoch": 0.06895936482447923, "grad_norm": 159.84788513183594, "learning_rate": 3e-06, "loss": -7.8619, "step": 773 }, { "epoch": 0.06904857486953031, "grad_norm": 165.3255157470703, "learning_rate": 3e-06, "loss": -9.9255, "step": 774 }, { "epoch": 0.06913778491458138, "grad_norm": 147.72352600097656, "learning_rate": 3e-06, "loss": 0.4622, "step": 775 }, { "epoch": 0.06922699495963246, "grad_norm": 161.2108917236328, "learning_rate": 3e-06, "loss": -9.3763, "step": 776 }, { "epoch": 0.06931620500468352, "grad_norm": 117.9613265991211, "learning_rate": 3e-06, "loss": -2.9584, "step": 777 }, { "epoch": 0.0694054150497346, "grad_norm": 127.0103988647461, "learning_rate": 3e-06, "loss": 4.5212, "step": 778 }, { "epoch": 0.06949462509478567, "grad_norm": 161.12701416015625, "learning_rate": 3e-06, "loss": -7.3502, "step": 779 }, { "epoch": 0.06958383513983675, "grad_norm": 147.03277587890625, "learning_rate": 3e-06, "loss": -12.0696, "step": 780 }, { "completion_length": 146.75000762939453, "epoch": 0.06967304518488782, "grad_norm": 93.77994537353516, "learning_rate": 3e-06, "loss": -9.5819, "reward": 2.0504584312438965, "reward_std": 0.46765226125717163, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.12337498925626278, "step": 781, "zero_std_ratio": 0.0 }, { "epoch": 0.0697622552299389, "grad_norm": 109.34754180908203, "learning_rate": 3e-06, "loss": -14.1501, "step": 782 }, { "epoch": 0.06985146527498996, "grad_norm": 83.16600799560547, "learning_rate": 3e-06, "loss": -17.3924, "step": 783 }, { "epoch": 0.06994067532004104, "grad_norm": 83.6520767211914, "learning_rate": 3e-06, "loss": -3.3917, "step": 784 }, { "epoch": 0.07002988536509211, "grad_norm": 111.59048461914062, "learning_rate": 3e-06, "loss": -18.6001, "step": 785 }, { "epoch": 0.07011909541014318, "grad_norm": 81.26487731933594, "learning_rate": 3e-06, "loss": -4.8149, "step": 786 }, { "epoch": 0.07020830545519426, "grad_norm": 88.5013198852539, "learning_rate": 3e-06, "loss": -10.8751, "step": 787 }, { "epoch": 0.07029751550024532, "grad_norm": 106.22066497802734, "learning_rate": 3e-06, "loss": -14.8712, "step": 788 }, { "epoch": 0.0703867255452964, "grad_norm": 95.8133544921875, "learning_rate": 3e-06, "loss": -17.8064, "step": 789 }, { "epoch": 0.07047593559034747, "grad_norm": 105.98171997070312, "learning_rate": 3e-06, "loss": -4.9442, "step": 790 }, { "epoch": 0.07056514563539855, "grad_norm": 108.74724578857422, "learning_rate": 3e-06, "loss": -20.1345, "step": 791 }, { "epoch": 0.07065435568044962, "grad_norm": 217.0005340576172, "learning_rate": 3e-06, "loss": -5.632, "step": 792 }, { "completion_length": 137.1041717529297, "epoch": 0.0707435657255007, "grad_norm": 186.0244140625, "learning_rate": 3e-06, "loss": 20.4605, "reward": 1.8869168162345886, "reward_std": 0.7256337702274323, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13691667094826698, "step": 793, "zero_std_ratio": 0.0 }, { "epoch": 0.07083277577055176, "grad_norm": 139.76405334472656, "learning_rate": 3e-06, "loss": 14.9958, "step": 794 }, { "epoch": 0.07092198581560284, "grad_norm": 155.77352905273438, "learning_rate": 3e-06, "loss": 24.0522, "step": 795 }, { "epoch": 0.07101119586065391, "grad_norm": 147.90013122558594, "learning_rate": 3e-06, "loss": 30.8391, "step": 796 }, { "epoch": 0.07110040590570499, "grad_norm": 154.99143981933594, "learning_rate": 3e-06, "loss": 24.957, "step": 797 }, { "epoch": 0.07118961595075605, "grad_norm": 154.12411499023438, "learning_rate": 3e-06, "loss": 32.2635, "step": 798 }, { "epoch": 0.07127882599580712, "grad_norm": 142.52955627441406, "learning_rate": 3e-06, "loss": 19.5332, "step": 799 }, { "epoch": 0.0713680360408582, "grad_norm": 140.1791229248047, "learning_rate": 3e-06, "loss": 13.5796, "step": 800 }, { "epoch": 0.07145724608590927, "grad_norm": 144.1186981201172, "learning_rate": 3e-06, "loss": 21.398, "step": 801 }, { "epoch": 0.07154645613096035, "grad_norm": 139.25230407714844, "learning_rate": 3e-06, "loss": 28.6905, "step": 802 }, { "epoch": 0.07163566617601141, "grad_norm": 151.95538330078125, "learning_rate": 3e-06, "loss": 21.6279, "step": 803 }, { "epoch": 0.0717248762210625, "grad_norm": 143.84974670410156, "learning_rate": 3e-06, "loss": 29.0759, "step": 804 }, { "completion_length": 136.22916793823242, "epoch": 0.07181408626611356, "grad_norm": 92.62751770019531, "learning_rate": 3e-06, "loss": -21.4484, "reward": 2.414271116256714, "reward_std": 0.33920496702194214, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12260417267680168, "step": 805, "zero_std_ratio": 0.0 }, { "epoch": 0.07190329631116464, "grad_norm": 60.42007827758789, "learning_rate": 3e-06, "loss": -12.7491, "step": 806 }, { "epoch": 0.07199250635621571, "grad_norm": 94.1617660522461, "learning_rate": 3e-06, "loss": -14.5131, "step": 807 }, { "epoch": 0.07208171640126679, "grad_norm": 68.65794372558594, "learning_rate": 3e-06, "loss": -14.1192, "step": 808 }, { "epoch": 0.07217092644631785, "grad_norm": 79.4446792602539, "learning_rate": 3e-06, "loss": -12.4833, "step": 809 }, { "epoch": 0.07226013649136893, "grad_norm": 82.45279693603516, "learning_rate": 3e-06, "loss": -8.9581, "step": 810 }, { "epoch": 0.07234934653642, "grad_norm": 101.833984375, "learning_rate": 3e-06, "loss": -22.4901, "step": 811 }, { "epoch": 0.07243855658147107, "grad_norm": 78.60984802246094, "learning_rate": 3e-06, "loss": -12.9701, "step": 812 }, { "epoch": 0.07252776662652215, "grad_norm": 100.63545989990234, "learning_rate": 3e-06, "loss": -15.4353, "step": 813 }, { "epoch": 0.07261697667157321, "grad_norm": 66.36518096923828, "learning_rate": 3e-06, "loss": -14.7871, "step": 814 }, { "epoch": 0.0727061867166243, "grad_norm": 73.31499481201172, "learning_rate": 3e-06, "loss": -13.5328, "step": 815 }, { "epoch": 0.07279539676167536, "grad_norm": 81.8609848022461, "learning_rate": 3e-06, "loss": -10.202, "step": 816 }, { "completion_length": 176.0416717529297, "epoch": 0.07288460680672644, "grad_norm": 85.71199035644531, "learning_rate": 3e-06, "loss": -2.3775, "reward": 1.308291733264923, "reward_std": 0.4305167943239212, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016624998301267624, "step": 817, "zero_std_ratio": 0.0 }, { "epoch": 0.0729738168517775, "grad_norm": 113.99066162109375, "learning_rate": 3e-06, "loss": 3.9682, "step": 818 }, { "epoch": 0.07306302689682859, "grad_norm": 90.42432403564453, "learning_rate": 3e-06, "loss": -8.968, "step": 819 }, { "epoch": 0.07315223694187965, "grad_norm": 86.66998291015625, "learning_rate": 3e-06, "loss": 5.9931, "step": 820 }, { "epoch": 0.07324144698693073, "grad_norm": 112.87352752685547, "learning_rate": 3e-06, "loss": 3.7934, "step": 821 }, { "epoch": 0.0733306570319818, "grad_norm": 116.59276580810547, "learning_rate": 3e-06, "loss": -12.3296, "step": 822 }, { "epoch": 0.07341986707703288, "grad_norm": 95.53129577636719, "learning_rate": 3e-06, "loss": -3.2342, "step": 823 }, { "epoch": 0.07350907712208395, "grad_norm": 134.6486053466797, "learning_rate": 3e-06, "loss": 2.4981, "step": 824 }, { "epoch": 0.07359828716713501, "grad_norm": 80.79833221435547, "learning_rate": 3e-06, "loss": -9.3798, "step": 825 }, { "epoch": 0.07368749721218609, "grad_norm": 107.38970184326172, "learning_rate": 3e-06, "loss": 4.8459, "step": 826 }, { "epoch": 0.07377670725723716, "grad_norm": 91.76937866210938, "learning_rate": 3e-06, "loss": 2.6928, "step": 827 }, { "epoch": 0.07386591730228824, "grad_norm": 112.15656280517578, "learning_rate": 3e-06, "loss": -13.4598, "step": 828 }, { "completion_length": 161.2916717529297, "epoch": 0.0739551273473393, "grad_norm": 225.7854461669922, "learning_rate": 3e-06, "loss": -7.2711, "reward": 1.8568333387374878, "reward_std": 0.38829553686082363, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0755833312869072, "step": 829, "zero_std_ratio": 0.0 }, { "epoch": 0.07404433739239039, "grad_norm": 203.64381408691406, "learning_rate": 3e-06, "loss": -24.0722, "step": 830 }, { "epoch": 0.07413354743744145, "grad_norm": 470.9283752441406, "learning_rate": 3e-06, "loss": 7.9581, "step": 831 }, { "epoch": 0.07422275748249253, "grad_norm": 261.8198547363281, "learning_rate": 3e-06, "loss": -13.8961, "step": 832 }, { "epoch": 0.0743119675275436, "grad_norm": 238.60263061523438, "learning_rate": 3e-06, "loss": -17.5778, "step": 833 }, { "epoch": 0.07440117757259468, "grad_norm": 251.20684814453125, "learning_rate": 3e-06, "loss": -13.8026, "step": 834 }, { "epoch": 0.07449038761764575, "grad_norm": 233.15805053710938, "learning_rate": 3e-06, "loss": -9.8419, "step": 835 }, { "epoch": 0.07457959766269683, "grad_norm": 188.42831420898438, "learning_rate": 3e-06, "loss": -27.1095, "step": 836 }, { "epoch": 0.07466880770774789, "grad_norm": 330.0888671875, "learning_rate": 3e-06, "loss": 2.663, "step": 837 }, { "epoch": 0.07475801775279897, "grad_norm": 187.619873046875, "learning_rate": 3e-06, "loss": -17.3052, "step": 838 }, { "epoch": 0.07484722779785004, "grad_norm": 273.087646484375, "learning_rate": 3e-06, "loss": -20.1137, "step": 839 }, { "epoch": 0.0749364378429011, "grad_norm": 231.94540405273438, "learning_rate": 3e-06, "loss": -17.6819, "step": 840 }, { "completion_length": 124.5625, "epoch": 0.07502564788795218, "grad_norm": 169.8292236328125, "learning_rate": 3e-06, "loss": -14.9169, "reward": 2.1858333349227905, "reward_std": 0.5357859879732132, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17541665583848953, "step": 841, "zero_std_ratio": 0.0 }, { "epoch": 0.07511485793300325, "grad_norm": 151.44129943847656, "learning_rate": 3e-06, "loss": -38.7775, "step": 842 }, { "epoch": 0.07520406797805433, "grad_norm": 141.34671020507812, "learning_rate": 3e-06, "loss": -32.3555, "step": 843 }, { "epoch": 0.0752932780231054, "grad_norm": 117.83955383300781, "learning_rate": 3e-06, "loss": -35.7298, "step": 844 }, { "epoch": 0.07538248806815648, "grad_norm": 113.38582611083984, "learning_rate": 3e-06, "loss": -36.8355, "step": 845 }, { "epoch": 0.07547169811320754, "grad_norm": 147.53521728515625, "learning_rate": 3e-06, "loss": -34.7305, "step": 846 }, { "epoch": 0.07556090815825862, "grad_norm": 167.8444061279297, "learning_rate": 3e-06, "loss": -17.6609, "step": 847 }, { "epoch": 0.07565011820330969, "grad_norm": 177.19976806640625, "learning_rate": 3e-06, "loss": -42.6765, "step": 848 }, { "epoch": 0.07573932824836077, "grad_norm": 207.4672393798828, "learning_rate": 3e-06, "loss": -36.7629, "step": 849 }, { "epoch": 0.07582853829341184, "grad_norm": 124.84293365478516, "learning_rate": 3e-06, "loss": -39.1349, "step": 850 }, { "epoch": 0.07591774833846292, "grad_norm": 134.89764404296875, "learning_rate": 3e-06, "loss": -41.2224, "step": 851 }, { "epoch": 0.07600695838351398, "grad_norm": 161.6527862548828, "learning_rate": 3e-06, "loss": -39.203, "step": 852 }, { "completion_length": 147.7291717529297, "epoch": 0.07609616842856505, "grad_norm": 371.6798095703125, "learning_rate": 3e-06, "loss": 88.7083, "reward": 2.0986876487731934, "reward_std": 0.5909168422222137, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09868749976158142, "step": 853, "zero_std_ratio": 0.0 }, { "epoch": 0.07618537847361613, "grad_norm": 385.34136962890625, "learning_rate": 3e-06, "loss": 86.0723, "step": 854 }, { "epoch": 0.0762745885186672, "grad_norm": 360.78021240234375, "learning_rate": 3e-06, "loss": 71.1204, "step": 855 }, { "epoch": 0.07636379856371828, "grad_norm": 293.267333984375, "learning_rate": 3e-06, "loss": 66.9494, "step": 856 }, { "epoch": 0.07645300860876934, "grad_norm": 440.7154846191406, "learning_rate": 3e-06, "loss": 88.5771, "step": 857 }, { "epoch": 0.07654221865382042, "grad_norm": 327.457275390625, "learning_rate": 3e-06, "loss": 58.8516, "step": 858 }, { "epoch": 0.07663142869887149, "grad_norm": 371.9436340332031, "learning_rate": 3e-06, "loss": 85.2973, "step": 859 }, { "epoch": 0.07672063874392257, "grad_norm": 389.5568542480469, "learning_rate": 3e-06, "loss": 79.8231, "step": 860 }, { "epoch": 0.07680984878897364, "grad_norm": 321.9656066894531, "learning_rate": 3e-06, "loss": 63.4841, "step": 861 }, { "epoch": 0.07689905883402472, "grad_norm": 284.66876220703125, "learning_rate": 3e-06, "loss": 59.0441, "step": 862 }, { "epoch": 0.07698826887907578, "grad_norm": 410.6514587402344, "learning_rate": 3e-06, "loss": 76.3094, "step": 863 }, { "epoch": 0.07707747892412686, "grad_norm": 284.0197448730469, "learning_rate": 3e-06, "loss": 49.468, "step": 864 }, { "completion_length": 144.1041717529297, "epoch": 0.07716668896917793, "grad_norm": 226.17822265625, "learning_rate": 3e-06, "loss": 56.9406, "reward": 2.379916787147522, "reward_std": 0.430880606174469, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08824999909847975, "step": 865, "zero_std_ratio": 0.0 }, { "epoch": 0.077255899014229, "grad_norm": 186.55560302734375, "learning_rate": 3e-06, "loss": 52.0502, "step": 866 }, { "epoch": 0.07734510905928008, "grad_norm": 184.80516052246094, "learning_rate": 3e-06, "loss": 46.5618, "step": 867 }, { "epoch": 0.07743431910433114, "grad_norm": 178.5349884033203, "learning_rate": 3e-06, "loss": 46.2696, "step": 868 }, { "epoch": 0.07752352914938222, "grad_norm": 148.83154296875, "learning_rate": 3e-06, "loss": 29.464, "step": 869 }, { "epoch": 0.07761273919443329, "grad_norm": 161.14889526367188, "learning_rate": 3e-06, "loss": 51.0483, "step": 870 }, { "epoch": 0.07770194923948437, "grad_norm": 192.32308959960938, "learning_rate": 3e-06, "loss": 47.9675, "step": 871 }, { "epoch": 0.07779115928453544, "grad_norm": 152.79583740234375, "learning_rate": 3e-06, "loss": 43.9552, "step": 872 }, { "epoch": 0.07788036932958652, "grad_norm": 151.7612762451172, "learning_rate": 3e-06, "loss": 38.7329, "step": 873 }, { "epoch": 0.07796957937463758, "grad_norm": 133.1282196044922, "learning_rate": 3e-06, "loss": 38.5692, "step": 874 }, { "epoch": 0.07805878941968866, "grad_norm": 103.07962036132812, "learning_rate": 3e-06, "loss": 24.2915, "step": 875 }, { "epoch": 0.07814799946473973, "grad_norm": 129.3807373046875, "learning_rate": 3e-06, "loss": 43.1067, "step": 876 }, { "completion_length": 127.54166793823242, "epoch": 0.07823720950979081, "grad_norm": 125.33885192871094, "learning_rate": 3e-06, "loss": -23.6161, "reward": 1.667020857334137, "reward_std": 0.5597978234291077, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17743750661611557, "step": 877, "zero_std_ratio": 0.0 }, { "epoch": 0.07832641955484188, "grad_norm": 138.50013732910156, "learning_rate": 3e-06, "loss": -16.1757, "step": 878 }, { "epoch": 0.07841562959989294, "grad_norm": 130.26280212402344, "learning_rate": 3e-06, "loss": -21.8816, "step": 879 }, { "epoch": 0.07850483964494402, "grad_norm": 141.026123046875, "learning_rate": 3e-06, "loss": -22.0761, "step": 880 }, { "epoch": 0.07859404968999509, "grad_norm": 126.53893280029297, "learning_rate": 3e-06, "loss": -23.4112, "step": 881 }, { "epoch": 0.07868325973504617, "grad_norm": 153.45120239257812, "learning_rate": 3e-06, "loss": -17.7169, "step": 882 }, { "epoch": 0.07877246978009723, "grad_norm": 122.84283447265625, "learning_rate": 3e-06, "loss": -24.8704, "step": 883 }, { "epoch": 0.07886167982514831, "grad_norm": 157.95201110839844, "learning_rate": 3e-06, "loss": -16.2334, "step": 884 }, { "epoch": 0.07895088987019938, "grad_norm": 136.01124572753906, "learning_rate": 3e-06, "loss": -23.3998, "step": 885 }, { "epoch": 0.07904009991525046, "grad_norm": 135.98423767089844, "learning_rate": 3e-06, "loss": -23.6646, "step": 886 }, { "epoch": 0.07912930996030153, "grad_norm": 131.24002075195312, "learning_rate": 3e-06, "loss": -25.7454, "step": 887 }, { "epoch": 0.07921852000535261, "grad_norm": 124.26398468017578, "learning_rate": 3e-06, "loss": -20.446, "step": 888 }, { "completion_length": 153.58333587646484, "epoch": 0.07930773005040367, "grad_norm": 240.47964477539062, "learning_rate": 3e-06, "loss": -52.9796, "reward": 1.8959583044052124, "reward_std": 0.6871494352817535, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10429166257381439, "step": 889, "zero_std_ratio": 0.0 }, { "epoch": 0.07939694009545475, "grad_norm": 205.4910430908203, "learning_rate": 3e-06, "loss": -52.7249, "step": 890 }, { "epoch": 0.07948615014050582, "grad_norm": 242.2780303955078, "learning_rate": 3e-06, "loss": -67.3242, "step": 891 }, { "epoch": 0.07957536018555689, "grad_norm": 262.0589599609375, "learning_rate": 3e-06, "loss": -66.9661, "step": 892 }, { "epoch": 0.07966457023060797, "grad_norm": 186.11415100097656, "learning_rate": 3e-06, "loss": -68.4566, "step": 893 }, { "epoch": 0.07975378027565903, "grad_norm": 254.95228576660156, "learning_rate": 3e-06, "loss": -66.0288, "step": 894 }, { "epoch": 0.07984299032071011, "grad_norm": 270.2388000488281, "learning_rate": 3e-06, "loss": -59.1402, "step": 895 }, { "epoch": 0.07993220036576118, "grad_norm": 232.1254119873047, "learning_rate": 3e-06, "loss": -58.3748, "step": 896 }, { "epoch": 0.08002141041081226, "grad_norm": 423.2415466308594, "learning_rate": 3e-06, "loss": -74.5377, "step": 897 }, { "epoch": 0.08011062045586333, "grad_norm": 289.6065673828125, "learning_rate": 3e-06, "loss": -76.1074, "step": 898 }, { "epoch": 0.0801998305009144, "grad_norm": 212.4766845703125, "learning_rate": 3e-06, "loss": -74.2601, "step": 899 }, { "epoch": 0.08028904054596547, "grad_norm": 286.6225891113281, "learning_rate": 3e-06, "loss": -74.3892, "step": 900 }, { "completion_length": 119.62500381469727, "epoch": 0.08037825059101655, "grad_norm": 123.23504638671875, "learning_rate": 3e-06, "loss": -22.0192, "reward": 2.5415626764297485, "reward_std": 0.190566536039114, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1457291655242443, "step": 901, "zero_std_ratio": 0.0 }, { "epoch": 0.08046746063606762, "grad_norm": 134.62196350097656, "learning_rate": 3e-06, "loss": -26.971, "step": 902 }, { "epoch": 0.0805566706811187, "grad_norm": 211.32479858398438, "learning_rate": 3e-06, "loss": -31.3724, "step": 903 }, { "epoch": 0.08064588072616977, "grad_norm": 160.55055236816406, "learning_rate": 3e-06, "loss": -22.228, "step": 904 }, { "epoch": 0.08073509077122083, "grad_norm": 125.40478515625, "learning_rate": 3e-06, "loss": -21.2257, "step": 905 }, { "epoch": 0.08082430081627191, "grad_norm": 111.1106948852539, "learning_rate": 3e-06, "loss": -22.3095, "step": 906 }, { "epoch": 0.08091351086132298, "grad_norm": 122.1114501953125, "learning_rate": 3e-06, "loss": -24.7909, "step": 907 }, { "epoch": 0.08100272090637406, "grad_norm": 156.01158142089844, "learning_rate": 3e-06, "loss": -31.0448, "step": 908 }, { "epoch": 0.08109193095142513, "grad_norm": 158.0888214111328, "learning_rate": 3e-06, "loss": -35.1506, "step": 909 }, { "epoch": 0.0811811409964762, "grad_norm": 156.11680603027344, "learning_rate": 3e-06, "loss": -26.2504, "step": 910 }, { "epoch": 0.08127035104152727, "grad_norm": 136.36370849609375, "learning_rate": 3e-06, "loss": -24.5191, "step": 911 }, { "epoch": 0.08135956108657835, "grad_norm": 138.4123077392578, "learning_rate": 3e-06, "loss": -25.2287, "step": 912 }, { "completion_length": 123.97917175292969, "epoch": 0.08144877113162942, "grad_norm": 69.13970184326172, "learning_rate": 3e-06, "loss": -4.2059, "reward": 2.349874973297119, "reward_std": 0.39924251288175583, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14154166728258133, "step": 913, "zero_std_ratio": 0.0 }, { "epoch": 0.0815379811766805, "grad_norm": 109.77488708496094, "learning_rate": 3e-06, "loss": -6.6804, "step": 914 }, { "epoch": 0.08162719122173157, "grad_norm": 108.82147216796875, "learning_rate": 3e-06, "loss": 1.8191, "step": 915 }, { "epoch": 0.08171640126678265, "grad_norm": 88.40335083007812, "learning_rate": 3e-06, "loss": -5.8692, "step": 916 }, { "epoch": 0.08180561131183371, "grad_norm": 76.1854019165039, "learning_rate": 3e-06, "loss": -1.3803, "step": 917 }, { "epoch": 0.08189482135688479, "grad_norm": 94.09133911132812, "learning_rate": 3e-06, "loss": -2.3375, "step": 918 }, { "epoch": 0.08198403140193586, "grad_norm": 84.88536071777344, "learning_rate": 3e-06, "loss": -5.6229, "step": 919 }, { "epoch": 0.08207324144698692, "grad_norm": 92.1208267211914, "learning_rate": 3e-06, "loss": -7.5509, "step": 920 }, { "epoch": 0.082162451492038, "grad_norm": 89.02661895751953, "learning_rate": 3e-06, "loss": 0.5948, "step": 921 }, { "epoch": 0.08225166153708907, "grad_norm": 95.09249114990234, "learning_rate": 3e-06, "loss": -6.4904, "step": 922 }, { "epoch": 0.08234087158214015, "grad_norm": 83.8741683959961, "learning_rate": 3e-06, "loss": -2.875, "step": 923 }, { "epoch": 0.08243008162719122, "grad_norm": 129.45420837402344, "learning_rate": 3e-06, "loss": -3.085, "step": 924 }, { "completion_length": 115.27083587646484, "epoch": 0.0825192916722423, "grad_norm": 250.4253387451172, "learning_rate": 3e-06, "loss": 9.4605, "reward": 2.1223334074020386, "reward_std": 0.7158277630805969, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.205666683614254, "step": 925, "zero_std_ratio": 0.0 }, { "epoch": 0.08260850171729336, "grad_norm": 318.8369140625, "learning_rate": 3e-06, "loss": 16.4358, "step": 926 }, { "epoch": 0.08269771176234444, "grad_norm": 314.372314453125, "learning_rate": 3e-06, "loss": 5.4103, "step": 927 }, { "epoch": 0.08278692180739551, "grad_norm": 255.00933837890625, "learning_rate": 3e-06, "loss": -3.1256, "step": 928 }, { "epoch": 0.08287613185244659, "grad_norm": 357.3619384765625, "learning_rate": 3e-06, "loss": 3.4132, "step": 929 }, { "epoch": 0.08296534189749766, "grad_norm": 409.3254089355469, "learning_rate": 3e-06, "loss": 23.0602, "step": 930 }, { "epoch": 0.08305455194254874, "grad_norm": 270.6861877441406, "learning_rate": 3e-06, "loss": 8.4527, "step": 931 }, { "epoch": 0.0831437619875998, "grad_norm": 507.520263671875, "learning_rate": 3e-06, "loss": 14.712, "step": 932 }, { "epoch": 0.08323297203265087, "grad_norm": 281.0194091796875, "learning_rate": 3e-06, "loss": 4.5989, "step": 933 }, { "epoch": 0.08332218207770195, "grad_norm": 275.3479309082031, "learning_rate": 3e-06, "loss": -5.1609, "step": 934 }, { "epoch": 0.08341139212275302, "grad_norm": 358.3206481933594, "learning_rate": 3e-06, "loss": 3.3467, "step": 935 }, { "epoch": 0.0835006021678041, "grad_norm": 403.45440673828125, "learning_rate": 3e-06, "loss": 20.7584, "step": 936 }, { "completion_length": 131.7291717529297, "epoch": 0.08358981221285516, "grad_norm": 112.89104461669922, "learning_rate": 3e-06, "loss": -12.4613, "reward": 1.9657083749771118, "reward_std": 0.33454202115535736, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13237499818205833, "step": 937, "zero_std_ratio": 0.0 }, { "epoch": 0.08367902225790624, "grad_norm": 158.0906524658203, "learning_rate": 3e-06, "loss": -6.7935, "step": 938 }, { "epoch": 0.08376823230295731, "grad_norm": 127.69352722167969, "learning_rate": 3e-06, "loss": -15.3734, "step": 939 }, { "epoch": 0.08385744234800839, "grad_norm": 207.05262756347656, "learning_rate": 3e-06, "loss": -25.0891, "step": 940 }, { "epoch": 0.08394665239305946, "grad_norm": 546.4678344726562, "learning_rate": 3e-06, "loss": -29.0194, "step": 941 }, { "epoch": 0.08403586243811054, "grad_norm": 141.02198791503906, "learning_rate": 3e-06, "loss": -10.7265, "step": 942 }, { "epoch": 0.0841250724831616, "grad_norm": 137.6843719482422, "learning_rate": 3e-06, "loss": -13.3029, "step": 943 }, { "epoch": 0.08421428252821268, "grad_norm": 211.74227905273438, "learning_rate": 3e-06, "loss": -8.8958, "step": 944 }, { "epoch": 0.08430349257326375, "grad_norm": 123.87110900878906, "learning_rate": 3e-06, "loss": -16.9913, "step": 945 }, { "epoch": 0.08439270261831482, "grad_norm": 206.8551025390625, "learning_rate": 3e-06, "loss": -26.9321, "step": 946 }, { "epoch": 0.0844819126633659, "grad_norm": 193.33346557617188, "learning_rate": 3e-06, "loss": -30.975, "step": 947 }, { "epoch": 0.08457112270841696, "grad_norm": 147.73297119140625, "learning_rate": 3e-06, "loss": -13.515, "step": 948 }, { "completion_length": 159.95833587646484, "epoch": 0.08466033275346804, "grad_norm": 186.3380889892578, "learning_rate": 3e-06, "loss": 9.8972, "reward": 1.8234166502952576, "reward_std": 0.42612800002098083, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09425000101327896, "step": 949, "zero_std_ratio": 0.0 }, { "epoch": 0.08474954279851911, "grad_norm": 178.0104522705078, "learning_rate": 3e-06, "loss": 10.3947, "step": 950 }, { "epoch": 0.08483875284357019, "grad_norm": 160.924560546875, "learning_rate": 3e-06, "loss": 8.1323, "step": 951 }, { "epoch": 0.08492796288862126, "grad_norm": 144.38978576660156, "learning_rate": 3e-06, "loss": 5.1531, "step": 952 }, { "epoch": 0.08501717293367234, "grad_norm": 174.2298126220703, "learning_rate": 3e-06, "loss": 5.4548, "step": 953 }, { "epoch": 0.0851063829787234, "grad_norm": 164.93479919433594, "learning_rate": 3e-06, "loss": 10.7603, "step": 954 }, { "epoch": 0.08519559302377448, "grad_norm": 198.3860626220703, "learning_rate": 3e-06, "loss": 9.1507, "step": 955 }, { "epoch": 0.08528480306882555, "grad_norm": 160.76519775390625, "learning_rate": 3e-06, "loss": 9.4591, "step": 956 }, { "epoch": 0.08537401311387663, "grad_norm": 170.39776611328125, "learning_rate": 3e-06, "loss": 7.0709, "step": 957 }, { "epoch": 0.0854632231589277, "grad_norm": 145.32798767089844, "learning_rate": 3e-06, "loss": 4.1507, "step": 958 }, { "epoch": 0.08555243320397876, "grad_norm": 170.50514221191406, "learning_rate": 3e-06, "loss": 4.4292, "step": 959 }, { "epoch": 0.08564164324902984, "grad_norm": 197.32290649414062, "learning_rate": 3e-06, "loss": 10.1245, "step": 960 }, { "completion_length": 149.68750762939453, "epoch": 0.08573085329408091, "grad_norm": 530.301025390625, "learning_rate": 3e-06, "loss": 6.2918, "reward": 2.106416702270508, "reward_std": 0.5645134299993515, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08558332687243819, "step": 961, "zero_std_ratio": 0.0 }, { "epoch": 0.08582006333913199, "grad_norm": 593.3743286132812, "learning_rate": 3e-06, "loss": -27.7628, "step": 962 }, { "epoch": 0.08590927338418305, "grad_norm": 378.6949157714844, "learning_rate": 3e-06, "loss": -25.6701, "step": 963 }, { "epoch": 0.08599848342923413, "grad_norm": 402.416748046875, "learning_rate": 3e-06, "loss": 10.4982, "step": 964 }, { "epoch": 0.0860876934742852, "grad_norm": 443.3346862792969, "learning_rate": 3e-06, "loss": -39.2448, "step": 965 }, { "epoch": 0.08617690351933628, "grad_norm": 401.20574951171875, "learning_rate": 3e-06, "loss": -68.5925, "step": 966 }, { "epoch": 0.08626611356438735, "grad_norm": 553.38720703125, "learning_rate": 3e-06, "loss": 1.8602, "step": 967 }, { "epoch": 0.08635532360943843, "grad_norm": 628.0134887695312, "learning_rate": 3e-06, "loss": -36.1923, "step": 968 }, { "epoch": 0.0864445336544895, "grad_norm": 380.9430847167969, "learning_rate": 3e-06, "loss": -32.3079, "step": 969 }, { "epoch": 0.08653374369954057, "grad_norm": 385.8163146972656, "learning_rate": 3e-06, "loss": 3.3819, "step": 970 }, { "epoch": 0.08662295374459164, "grad_norm": 432.78118896484375, "learning_rate": 3e-06, "loss": -46.8594, "step": 971 }, { "epoch": 0.08671216378964271, "grad_norm": 439.5821533203125, "learning_rate": 3e-06, "loss": -77.4002, "step": 972 }, { "completion_length": 121.68750381469727, "epoch": 0.08680137383469379, "grad_norm": 35.27005386352539, "learning_rate": 3e-06, "loss": 3.8068, "reward": 2.284437596797943, "reward_std": 0.15820645913481712, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15943749248981476, "step": 973, "zero_std_ratio": 0.0 }, { "epoch": 0.08689058387974485, "grad_norm": 68.60123443603516, "learning_rate": 3e-06, "loss": 5.8697, "step": 974 }, { "epoch": 0.08697979392479593, "grad_norm": 69.07678985595703, "learning_rate": 3e-06, "loss": 2.0005, "step": 975 }, { "epoch": 0.087069003969847, "grad_norm": 39.30900955200195, "learning_rate": 3e-06, "loss": 6.0659, "step": 976 }, { "epoch": 0.08715821401489808, "grad_norm": 77.51853942871094, "learning_rate": 3e-06, "loss": 5.9335, "step": 977 }, { "epoch": 0.08724742405994915, "grad_norm": 60.07703399658203, "learning_rate": 3e-06, "loss": 3.043, "step": 978 }, { "epoch": 0.08733663410500023, "grad_norm": 39.25843811035156, "learning_rate": 3e-06, "loss": 3.6489, "step": 979 }, { "epoch": 0.0874258441500513, "grad_norm": 46.68893051147461, "learning_rate": 3e-06, "loss": 5.0957, "step": 980 }, { "epoch": 0.08751505419510237, "grad_norm": 55.4852180480957, "learning_rate": 3e-06, "loss": 0.9956, "step": 981 }, { "epoch": 0.08760426424015344, "grad_norm": 51.21168518066406, "learning_rate": 3e-06, "loss": 5.5624, "step": 982 }, { "epoch": 0.08769347428520452, "grad_norm": 64.15937805175781, "learning_rate": 3e-06, "loss": 4.0323, "step": 983 }, { "epoch": 0.08778268433025559, "grad_norm": 65.25579833984375, "learning_rate": 3e-06, "loss": 1.7638, "step": 984 }, { "completion_length": 138.68750762939453, "epoch": 0.08787189437530665, "grad_norm": 277.1241760253906, "learning_rate": 3e-06, "loss": 10.9365, "reward": 2.176750063896179, "reward_std": 0.40325865149497986, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11425000056624413, "step": 985, "zero_std_ratio": 0.0 }, { "epoch": 0.08796110442035773, "grad_norm": 363.5508117675781, "learning_rate": 3e-06, "loss": -2.6393, "step": 986 }, { "epoch": 0.0880503144654088, "grad_norm": 337.4767150878906, "learning_rate": 3e-06, "loss": 9.4007, "step": 987 }, { "epoch": 0.08813952451045988, "grad_norm": 292.9395751953125, "learning_rate": 3e-06, "loss": -2.1309, "step": 988 }, { "epoch": 0.08822873455551095, "grad_norm": 246.76112365722656, "learning_rate": 3e-06, "loss": -9.132, "step": 989 }, { "epoch": 0.08831794460056203, "grad_norm": 267.3565368652344, "learning_rate": 3e-06, "loss": -11.7626, "step": 990 }, { "epoch": 0.08840715464561309, "grad_norm": 257.4312438964844, "learning_rate": 3e-06, "loss": 5.7132, "step": 991 }, { "epoch": 0.08849636469066417, "grad_norm": 317.8547058105469, "learning_rate": 3e-06, "loss": -9.7682, "step": 992 }, { "epoch": 0.08858557473571524, "grad_norm": 260.3039855957031, "learning_rate": 3e-06, "loss": 2.4633, "step": 993 }, { "epoch": 0.08867478478076632, "grad_norm": 261.14697265625, "learning_rate": 3e-06, "loss": -9.4669, "step": 994 }, { "epoch": 0.08876399482581739, "grad_norm": 181.8609161376953, "learning_rate": 3e-06, "loss": -13.1009, "step": 995 }, { "epoch": 0.08885320487086847, "grad_norm": 236.33563232421875, "learning_rate": 3e-06, "loss": -15.7579, "step": 996 }, { "completion_length": 168.9791717529297, "epoch": 0.08894241491591953, "grad_norm": 502.4021911621094, "learning_rate": 3e-06, "loss": -65.14, "reward": 1.9697707891464233, "reward_std": 0.505499929189682, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06352083757519722, "step": 997, "zero_std_ratio": 0.0 }, { "epoch": 0.0890316249609706, "grad_norm": 564.5300903320312, "learning_rate": 3e-06, "loss": -45.4159, "step": 998 }, { "epoch": 0.08912083500602168, "grad_norm": 453.83160400390625, "learning_rate": 3e-06, "loss": -31.2339, "step": 999 }, { "epoch": 0.08921004505107274, "grad_norm": 449.2571716308594, "learning_rate": 3e-06, "loss": -71.5707, "step": 1000 }, { "epoch": 0.08929925509612383, "grad_norm": 639.439208984375, "learning_rate": 3e-06, "loss": -2.7696, "step": 1001 }, { "epoch": 0.08938846514117489, "grad_norm": 494.1471862792969, "learning_rate": 3e-06, "loss": -8.904, "step": 1002 }, { "epoch": 0.08947767518622597, "grad_norm": 483.2057800292969, "learning_rate": 3e-06, "loss": -68.9682, "step": 1003 }, { "epoch": 0.08956688523127704, "grad_norm": 587.0855712890625, "learning_rate": 3e-06, "loss": -55.2685, "step": 1004 }, { "epoch": 0.08965609527632812, "grad_norm": 493.4284362792969, "learning_rate": 3e-06, "loss": -40.0821, "step": 1005 }, { "epoch": 0.08974530532137918, "grad_norm": 458.56134033203125, "learning_rate": 3e-06, "loss": -81.0491, "step": 1006 }, { "epoch": 0.08983451536643026, "grad_norm": 604.5118408203125, "learning_rate": 3e-06, "loss": -12.8738, "step": 1007 }, { "epoch": 0.08992372541148133, "grad_norm": 968.2217407226562, "learning_rate": 3e-06, "loss": -16.2694, "step": 1008 }, { "completion_length": 135.1666717529297, "epoch": 0.09001293545653241, "grad_norm": 199.69468688964844, "learning_rate": 3e-06, "loss": -0.1011, "reward": 1.9899166822433472, "reward_std": 0.5430259108543396, "rewards/correctness_reward_func": 1.4166666567325592, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13575000129640102, "step": 1009, "zero_std_ratio": 0.125 }, { "epoch": 0.09010214550158348, "grad_norm": 165.88571166992188, "learning_rate": 3e-06, "loss": -2.8061, "step": 1010 }, { "epoch": 0.09019135554663456, "grad_norm": 199.79214477539062, "learning_rate": 3e-06, "loss": 15.1017, "step": 1011 }, { "epoch": 0.09028056559168562, "grad_norm": 176.049072265625, "learning_rate": 3e-06, "loss": 11.1242, "step": 1012 }, { "epoch": 0.09036977563673669, "grad_norm": 195.1369171142578, "learning_rate": 3e-06, "loss": 5.1197, "step": 1013 }, { "epoch": 0.09045898568178777, "grad_norm": 233.49134826660156, "learning_rate": 3e-06, "loss": -0.4895, "step": 1014 }, { "epoch": 0.09054819572683884, "grad_norm": 211.94871520996094, "learning_rate": 3e-06, "loss": -2.5614, "step": 1015 }, { "epoch": 0.09063740577188992, "grad_norm": 156.52188110351562, "learning_rate": 3e-06, "loss": -2.6152, "step": 1016 }, { "epoch": 0.09072661581694098, "grad_norm": 162.7987823486328, "learning_rate": 3e-06, "loss": 13.4839, "step": 1017 }, { "epoch": 0.09081582586199206, "grad_norm": 194.3466033935547, "learning_rate": 3e-06, "loss": 9.1157, "step": 1018 }, { "epoch": 0.09090503590704313, "grad_norm": 219.33090209960938, "learning_rate": 3e-06, "loss": 1.8747, "step": 1019 }, { "epoch": 0.09099424595209421, "grad_norm": 237.62643432617188, "learning_rate": 3e-06, "loss": -3.0257, "step": 1020 }, { "completion_length": 145.125, "epoch": 0.09108345599714528, "grad_norm": 347.146484375, "learning_rate": 3e-06, "loss": -57.3183, "reward": 2.3361042737960815, "reward_std": 0.1686728447675705, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12777083739638329, "step": 1021, "zero_std_ratio": 0.0 }, { "epoch": 0.09117266604219636, "grad_norm": 307.16357421875, "learning_rate": 3e-06, "loss": -59.7916, "step": 1022 }, { "epoch": 0.09126187608724742, "grad_norm": 239.71444702148438, "learning_rate": 3e-06, "loss": -68.856, "step": 1023 }, { "epoch": 0.0913510861322985, "grad_norm": 293.3377380371094, "learning_rate": 3e-06, "loss": -52.4881, "step": 1024 }, { "epoch": 0.09144029617734957, "grad_norm": 355.85980224609375, "learning_rate": 3e-06, "loss": -60.7326, "step": 1025 }, { "epoch": 0.09152950622240064, "grad_norm": 443.2692565917969, "learning_rate": 3e-06, "loss": -50.5898, "step": 1026 }, { "epoch": 0.09161871626745172, "grad_norm": 392.3480224609375, "learning_rate": 3e-06, "loss": -63.6762, "step": 1027 }, { "epoch": 0.09170792631250278, "grad_norm": 337.5072021484375, "learning_rate": 3e-06, "loss": -65.8663, "step": 1028 }, { "epoch": 0.09179713635755386, "grad_norm": 255.06619262695312, "learning_rate": 3e-06, "loss": -74.5813, "step": 1029 }, { "epoch": 0.09188634640260493, "grad_norm": 303.1604919433594, "learning_rate": 3e-06, "loss": -60.9415, "step": 1030 }, { "epoch": 0.09197555644765601, "grad_norm": 339.84039306640625, "learning_rate": 3e-06, "loss": -68.1991, "step": 1031 }, { "epoch": 0.09206476649270708, "grad_norm": 868.9529418945312, "learning_rate": 3e-06, "loss": -65.8396, "step": 1032 }, { "completion_length": 140.9375, "epoch": 0.09215397653775816, "grad_norm": 402.7937316894531, "learning_rate": 3e-06, "loss": -15.3165, "reward": 2.3517916202545166, "reward_std": 0.60741326212883, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13304166495800018, "step": 1033, "zero_std_ratio": 0.0 }, { "epoch": 0.09224318658280922, "grad_norm": 365.8189697265625, "learning_rate": 3e-06, "loss": 13.667, "step": 1034 }, { "epoch": 0.0923323966278603, "grad_norm": 427.4906921386719, "learning_rate": 3e-06, "loss": -26.0149, "step": 1035 }, { "epoch": 0.09242160667291137, "grad_norm": 378.3236999511719, "learning_rate": 3e-06, "loss": -5.2592, "step": 1036 }, { "epoch": 0.09251081671796245, "grad_norm": 524.1071166992188, "learning_rate": 3e-06, "loss": 20.8697, "step": 1037 }, { "epoch": 0.09260002676301352, "grad_norm": 393.7771301269531, "learning_rate": 3e-06, "loss": -12.4555, "step": 1038 }, { "epoch": 0.09268923680806458, "grad_norm": 588.3152465820312, "learning_rate": 3e-06, "loss": -15.8752, "step": 1039 }, { "epoch": 0.09277844685311566, "grad_norm": 354.5204162597656, "learning_rate": 3e-06, "loss": 13.4668, "step": 1040 }, { "epoch": 0.09286765689816673, "grad_norm": 511.9700927734375, "learning_rate": 3e-06, "loss": -31.7805, "step": 1041 }, { "epoch": 0.09295686694321781, "grad_norm": 518.6276245117188, "learning_rate": 3e-06, "loss": -7.4808, "step": 1042 }, { "epoch": 0.09304607698826887, "grad_norm": 377.42205810546875, "learning_rate": 3e-06, "loss": 20.1054, "step": 1043 }, { "epoch": 0.09313528703331996, "grad_norm": 447.12945556640625, "learning_rate": 3e-06, "loss": -15.5087, "step": 1044 }, { "completion_length": 138.9166717529297, "epoch": 0.09322449707837102, "grad_norm": 993.244873046875, "learning_rate": 3e-06, "loss": 45.6857, "reward": 2.4459375143051147, "reward_std": 0.3905292749404907, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13343750312924385, "step": 1045, "zero_std_ratio": 0.0 }, { "epoch": 0.0933137071234221, "grad_norm": 672.6893310546875, "learning_rate": 3e-06, "loss": 38.2007, "step": 1046 }, { "epoch": 0.09340291716847317, "grad_norm": 513.650390625, "learning_rate": 3e-06, "loss": 54.7761, "step": 1047 }, { "epoch": 0.09349212721352425, "grad_norm": 527.8473510742188, "learning_rate": 3e-06, "loss": 32.3591, "step": 1048 }, { "epoch": 0.09358133725857531, "grad_norm": 506.8796081542969, "learning_rate": 3e-06, "loss": 46.2954, "step": 1049 }, { "epoch": 0.0936705473036264, "grad_norm": 496.89971923828125, "learning_rate": 3e-06, "loss": 49.5723, "step": 1050 }, { "epoch": 0.09375975734867746, "grad_norm": 896.3591918945312, "learning_rate": 3e-06, "loss": 37.4608, "step": 1051 }, { "epoch": 0.09384896739372853, "grad_norm": 463.4916076660156, "learning_rate": 3e-06, "loss": 32.8312, "step": 1052 }, { "epoch": 0.09393817743877961, "grad_norm": 536.8302612304688, "learning_rate": 3e-06, "loss": 51.5421, "step": 1053 }, { "epoch": 0.09402738748383067, "grad_norm": 560.4818115234375, "learning_rate": 3e-06, "loss": 25.2527, "step": 1054 }, { "epoch": 0.09411659752888175, "grad_norm": 527.0779418945312, "learning_rate": 3e-06, "loss": 38.5143, "step": 1055 }, { "epoch": 0.09420580757393282, "grad_norm": 404.8234558105469, "learning_rate": 3e-06, "loss": 44.4837, "step": 1056 }, { "completion_length": 153.75, "epoch": 0.0942950176189839, "grad_norm": 320.6294250488281, "learning_rate": 3e-06, "loss": -28.9113, "reward": 1.784208357334137, "reward_std": 0.5137874186038971, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06545832939445972, "step": 1057, "zero_std_ratio": 0.0 }, { "epoch": 0.09438422766403497, "grad_norm": 405.81268310546875, "learning_rate": 3e-06, "loss": -18.9334, "step": 1058 }, { "epoch": 0.09447343770908605, "grad_norm": 371.4535217285156, "learning_rate": 3e-06, "loss": -41.9034, "step": 1059 }, { "epoch": 0.09456264775413711, "grad_norm": 301.6942443847656, "learning_rate": 3e-06, "loss": -23.3866, "step": 1060 }, { "epoch": 0.0946518577991882, "grad_norm": 365.9558410644531, "learning_rate": 3e-06, "loss": -14.9156, "step": 1061 }, { "epoch": 0.09474106784423926, "grad_norm": 377.2384948730469, "learning_rate": 3e-06, "loss": -36.7945, "step": 1062 }, { "epoch": 0.09483027788929034, "grad_norm": 319.00262451171875, "learning_rate": 3e-06, "loss": -31.6703, "step": 1063 }, { "epoch": 0.0949194879343414, "grad_norm": 374.5344543457031, "learning_rate": 3e-06, "loss": -20.8262, "step": 1064 }, { "epoch": 0.09500869797939247, "grad_norm": 309.783447265625, "learning_rate": 3e-06, "loss": -44.4576, "step": 1065 }, { "epoch": 0.09509790802444355, "grad_norm": 300.767578125, "learning_rate": 3e-06, "loss": -27.8658, "step": 1066 }, { "epoch": 0.09518711806949462, "grad_norm": 382.4290466308594, "learning_rate": 3e-06, "loss": -21.0255, "step": 1067 }, { "epoch": 0.0952763281145457, "grad_norm": 380.5618591308594, "learning_rate": 3e-06, "loss": -40.5874, "step": 1068 }, { "completion_length": 133.43750762939453, "epoch": 0.09536553815959677, "grad_norm": 338.35491943359375, "learning_rate": 3e-06, "loss": -59.0288, "reward": 1.9337083101272583, "reward_std": 0.28059020824730396, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1316249892115593, "step": 1069, "zero_std_ratio": 0.125 }, { "epoch": 0.09545474820464785, "grad_norm": 294.65740966796875, "learning_rate": 3e-06, "loss": -50.4247, "step": 1070 }, { "epoch": 0.09554395824969891, "grad_norm": 245.4026641845703, "learning_rate": 3e-06, "loss": -41.1492, "step": 1071 }, { "epoch": 0.09563316829474999, "grad_norm": 274.2242126464844, "learning_rate": 3e-06, "loss": -37.5643, "step": 1072 }, { "epoch": 0.09572237833980106, "grad_norm": 272.3935546875, "learning_rate": 3e-06, "loss": -48.5536, "step": 1073 }, { "epoch": 0.09581158838485214, "grad_norm": 411.1688232421875, "learning_rate": 3e-06, "loss": -63.078, "step": 1074 }, { "epoch": 0.0959007984299032, "grad_norm": 350.3167724609375, "learning_rate": 3e-06, "loss": -68.731, "step": 1075 }, { "epoch": 0.09599000847495429, "grad_norm": 308.54302978515625, "learning_rate": 3e-06, "loss": -59.1035, "step": 1076 }, { "epoch": 0.09607921852000535, "grad_norm": 272.44537353515625, "learning_rate": 3e-06, "loss": -49.064, "step": 1077 }, { "epoch": 0.09616842856505642, "grad_norm": 349.4591064453125, "learning_rate": 3e-06, "loss": -45.6581, "step": 1078 }, { "epoch": 0.0962576386101075, "grad_norm": 404.52557373046875, "learning_rate": 3e-06, "loss": -57.2088, "step": 1079 }, { "epoch": 0.09634684865515857, "grad_norm": 418.94580078125, "learning_rate": 3e-06, "loss": -74.9134, "step": 1080 }, { "completion_length": 141.1458396911621, "epoch": 0.09643605870020965, "grad_norm": 733.7314453125, "learning_rate": 3e-06, "loss": -79.7336, "reward": 1.6318541765213013, "reward_std": 0.570192813873291, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11102083884179592, "step": 1081, "zero_std_ratio": 0.0 }, { "epoch": 0.09652526874526071, "grad_norm": 607.6722412109375, "learning_rate": 3e-06, "loss": -140.3347, "step": 1082 }, { "epoch": 0.09661447879031179, "grad_norm": 667.3759155273438, "learning_rate": 3e-06, "loss": -95.2953, "step": 1083 }, { "epoch": 0.09670368883536286, "grad_norm": 1115.2962646484375, "learning_rate": 3e-06, "loss": -92.1048, "step": 1084 }, { "epoch": 0.09679289888041394, "grad_norm": 624.0264282226562, "learning_rate": 3e-06, "loss": -155.7364, "step": 1085 }, { "epoch": 0.096882108925465, "grad_norm": 1003.4823608398438, "learning_rate": 3e-06, "loss": -133.9341, "step": 1086 }, { "epoch": 0.09697131897051608, "grad_norm": 735.7012329101562, "learning_rate": 3e-06, "loss": -100.9845, "step": 1087 }, { "epoch": 0.09706052901556715, "grad_norm": 606.860107421875, "learning_rate": 3e-06, "loss": -162.1261, "step": 1088 }, { "epoch": 0.09714973906061823, "grad_norm": 645.7361450195312, "learning_rate": 3e-06, "loss": -116.5875, "step": 1089 }, { "epoch": 0.0972389491056693, "grad_norm": 852.1995239257812, "learning_rate": 3e-06, "loss": -110.4707, "step": 1090 }, { "epoch": 0.09732815915072038, "grad_norm": 728.5518188476562, "learning_rate": 3e-06, "loss": -176.5155, "step": 1091 }, { "epoch": 0.09741736919577144, "grad_norm": 802.650634765625, "learning_rate": 3e-06, "loss": -157.6771, "step": 1092 }, { "completion_length": 134.12500381469727, "epoch": 0.09750657924082251, "grad_norm": 373.08319091796875, "learning_rate": 3e-06, "loss": 6.5518, "reward": 2.1240209341049194, "reward_std": 0.493463397026062, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15527082234621048, "step": 1093, "zero_std_ratio": 0.0 }, { "epoch": 0.09759578928587359, "grad_norm": 483.0643005371094, "learning_rate": 3e-06, "loss": 9.2641, "step": 1094 }, { "epoch": 0.09768499933092466, "grad_norm": 392.19927978515625, "learning_rate": 3e-06, "loss": -14.7997, "step": 1095 }, { "epoch": 0.09777420937597574, "grad_norm": 344.8969421386719, "learning_rate": 3e-06, "loss": 3.3186, "step": 1096 }, { "epoch": 0.0978634194210268, "grad_norm": 425.2146911621094, "learning_rate": 3e-06, "loss": -9.1396, "step": 1097 }, { "epoch": 0.09795262946607788, "grad_norm": 405.4781188964844, "learning_rate": 3e-06, "loss": -33.2548, "step": 1098 }, { "epoch": 0.09804183951112895, "grad_norm": 381.5223083496094, "learning_rate": 3e-06, "loss": 6.69, "step": 1099 }, { "epoch": 0.09813104955618003, "grad_norm": 597.418701171875, "learning_rate": 3e-06, "loss": 8.1762, "step": 1100 }, { "epoch": 0.0982202596012311, "grad_norm": 354.68548583984375, "learning_rate": 3e-06, "loss": -21.8024, "step": 1101 }, { "epoch": 0.09830946964628218, "grad_norm": 368.8730773925781, "learning_rate": 3e-06, "loss": 2.2751, "step": 1102 }, { "epoch": 0.09839867969133324, "grad_norm": 407.3045959472656, "learning_rate": 3e-06, "loss": -13.4736, "step": 1103 }, { "epoch": 0.09848788973638432, "grad_norm": 474.1021728515625, "learning_rate": 3e-06, "loss": -40.1604, "step": 1104 }, { "completion_length": 144.56250762939453, "epoch": 0.09857709978143539, "grad_norm": 509.6014099121094, "learning_rate": 3e-06, "loss": -99.5888, "reward": 1.8781040906906128, "reward_std": 0.4346280097961426, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12810416892170906, "step": 1105, "zero_std_ratio": 0.0 }, { "epoch": 0.09866630982648646, "grad_norm": 533.192626953125, "learning_rate": 3e-06, "loss": -97.968, "step": 1106 }, { "epoch": 0.09875551987153754, "grad_norm": 571.8197631835938, "learning_rate": 3e-06, "loss": -106.1057, "step": 1107 }, { "epoch": 0.0988447299165886, "grad_norm": 525.690673828125, "learning_rate": 3e-06, "loss": -75.4768, "step": 1108 }, { "epoch": 0.09893393996163968, "grad_norm": 478.89398193359375, "learning_rate": 3e-06, "loss": -110.4997, "step": 1109 }, { "epoch": 0.09902315000669075, "grad_norm": 812.3494262695312, "learning_rate": 3e-06, "loss": -112.8618, "step": 1110 }, { "epoch": 0.09911236005174183, "grad_norm": 603.4788818359375, "learning_rate": 3e-06, "loss": -106.561, "step": 1111 }, { "epoch": 0.0992015700967929, "grad_norm": 672.5433959960938, "learning_rate": 3e-06, "loss": -104.4456, "step": 1112 }, { "epoch": 0.09929078014184398, "grad_norm": 582.0712890625, "learning_rate": 3e-06, "loss": -113.8237, "step": 1113 }, { "epoch": 0.09937999018689504, "grad_norm": 536.4866333007812, "learning_rate": 3e-06, "loss": -83.1406, "step": 1114 }, { "epoch": 0.09946920023194612, "grad_norm": 531.5173950195312, "learning_rate": 3e-06, "loss": -121.5239, "step": 1115 }, { "epoch": 0.09955841027699719, "grad_norm": 898.2598266601562, "learning_rate": 3e-06, "loss": -127.2398, "step": 1116 }, { "completion_length": 113.00000381469727, "epoch": 0.09964762032204827, "grad_norm": 492.4721984863281, "learning_rate": 3e-06, "loss": -70.4249, "reward": 2.4547500610351562, "reward_std": 0.45376959443092346, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16308333724737167, "step": 1117, "zero_std_ratio": 0.0 }, { "epoch": 0.09973683036709934, "grad_norm": 393.4570617675781, "learning_rate": 3e-06, "loss": -52.5425, "step": 1118 }, { "epoch": 0.0998260404121504, "grad_norm": 808.8583984375, "learning_rate": 3e-06, "loss": -32.6061, "step": 1119 }, { "epoch": 0.09991525045720148, "grad_norm": 576.81640625, "learning_rate": 3e-06, "loss": -55.5892, "step": 1120 }, { "epoch": 0.10000446050225255, "grad_norm": 898.8334350585938, "learning_rate": 3e-06, "loss": -54.6342, "step": 1121 }, { "epoch": 0.10009367054730363, "grad_norm": 438.3951110839844, "learning_rate": 3e-06, "loss": -43.4653, "step": 1122 }, { "epoch": 0.1001828805923547, "grad_norm": 708.05908203125, "learning_rate": 3e-06, "loss": -77.0374, "step": 1123 }, { "epoch": 0.10027209063740578, "grad_norm": 425.1826477050781, "learning_rate": 3e-06, "loss": -59.1794, "step": 1124 }, { "epoch": 0.10036130068245684, "grad_norm": 833.095947265625, "learning_rate": 3e-06, "loss": -43.4435, "step": 1125 }, { "epoch": 0.10045051072750792, "grad_norm": 588.0106811523438, "learning_rate": 3e-06, "loss": -62.997, "step": 1126 }, { "epoch": 0.10053972077255899, "grad_norm": 713.5040283203125, "learning_rate": 3e-06, "loss": -71.8443, "step": 1127 }, { "epoch": 0.10062893081761007, "grad_norm": 584.412353515625, "learning_rate": 3e-06, "loss": -53.4193, "step": 1128 }, { "completion_length": 136.2291717529297, "epoch": 0.10071814086266113, "grad_norm": 665.6400146484375, "learning_rate": 3e-06, "loss": 41.7356, "reward": 2.1188125014305115, "reward_std": 0.2311352714896202, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11881250143051147, "step": 1129, "zero_std_ratio": 0.0 }, { "epoch": 0.10080735090771221, "grad_norm": 394.85174560546875, "learning_rate": 3e-06, "loss": 37.6043, "step": 1130 }, { "epoch": 0.10089656095276328, "grad_norm": 596.6718139648438, "learning_rate": 3e-06, "loss": 20.8374, "step": 1131 }, { "epoch": 0.10098577099781435, "grad_norm": 396.1944274902344, "learning_rate": 3e-06, "loss": 40.8713, "step": 1132 }, { "epoch": 0.10107498104286543, "grad_norm": 1378.33544921875, "learning_rate": 3e-06, "loss": -18.756, "step": 1133 }, { "epoch": 0.1011641910879165, "grad_norm": 572.9339599609375, "learning_rate": 3e-06, "loss": 40.0698, "step": 1134 }, { "epoch": 0.10125340113296757, "grad_norm": 500.0650939941406, "learning_rate": 3e-06, "loss": 33.2779, "step": 1135 }, { "epoch": 0.10134261117801864, "grad_norm": 423.95916748046875, "learning_rate": 3e-06, "loss": 37.4019, "step": 1136 }, { "epoch": 0.10143182122306972, "grad_norm": 354.7851867675781, "learning_rate": 3e-06, "loss": 14.2, "step": 1137 }, { "epoch": 0.10152103126812079, "grad_norm": 435.8161315917969, "learning_rate": 3e-06, "loss": 35.5527, "step": 1138 }, { "epoch": 0.10161024131317187, "grad_norm": 1665.0882568359375, "learning_rate": 3e-06, "loss": -32.409, "step": 1139 }, { "epoch": 0.10169945135822293, "grad_norm": 427.7786865234375, "learning_rate": 3e-06, "loss": 38.618, "step": 1140 }, { "completion_length": 152.77084350585938, "epoch": 0.10178866140327401, "grad_norm": 739.2212524414062, "learning_rate": 3e-06, "loss": -77.8406, "reward": 2.052833318710327, "reward_std": 0.4878626614809036, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06325000338256359, "step": 1141, "zero_std_ratio": 0.0 }, { "epoch": 0.10187787144832508, "grad_norm": 798.5391845703125, "learning_rate": 3e-06, "loss": -79.5304, "step": 1142 }, { "epoch": 0.10196708149337616, "grad_norm": 723.3178100585938, "learning_rate": 3e-06, "loss": -79.9193, "step": 1143 }, { "epoch": 0.10205629153842723, "grad_norm": 678.71484375, "learning_rate": 3e-06, "loss": -48.8422, "step": 1144 }, { "epoch": 0.1021455015834783, "grad_norm": 898.7178955078125, "learning_rate": 3e-06, "loss": -78.0983, "step": 1145 }, { "epoch": 0.10223471162852937, "grad_norm": 719.983154296875, "learning_rate": 3e-06, "loss": -108.0717, "step": 1146 }, { "epoch": 0.10232392167358044, "grad_norm": 728.4072265625, "learning_rate": 3e-06, "loss": -84.2578, "step": 1147 }, { "epoch": 0.10241313171863152, "grad_norm": 851.9353637695312, "learning_rate": 3e-06, "loss": -89.1878, "step": 1148 }, { "epoch": 0.10250234176368259, "grad_norm": 711.1327514648438, "learning_rate": 3e-06, "loss": -87.8992, "step": 1149 }, { "epoch": 0.10259155180873367, "grad_norm": 785.3473510742188, "learning_rate": 3e-06, "loss": -57.3662, "step": 1150 }, { "epoch": 0.10268076185378473, "grad_norm": 926.0802612304688, "learning_rate": 3e-06, "loss": -92.5817, "step": 1151 }, { "epoch": 0.10276997189883581, "grad_norm": 769.6296997070312, "learning_rate": 3e-06, "loss": -116.0675, "step": 1152 }, { "completion_length": 117.62500381469727, "epoch": 0.10285918194388688, "grad_norm": 841.6162109375, "learning_rate": 3e-06, "loss": 84.3578, "reward": 2.0038751363754272, "reward_std": 0.5935890823602676, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1809583306312561, "step": 1153, "zero_std_ratio": 0.0 }, { "epoch": 0.10294839198893796, "grad_norm": 819.4031982421875, "learning_rate": 3e-06, "loss": 47.4932, "step": 1154 }, { "epoch": 0.10303760203398903, "grad_norm": 983.420654296875, "learning_rate": 3e-06, "loss": 114.4865, "step": 1155 }, { "epoch": 0.1031268120790401, "grad_norm": 818.0223388671875, "learning_rate": 3e-06, "loss": 49.3857, "step": 1156 }, { "epoch": 0.10321602212409117, "grad_norm": 825.228759765625, "learning_rate": 3e-06, "loss": 135.8829, "step": 1157 }, { "epoch": 0.10330523216914224, "grad_norm": 814.0230102539062, "learning_rate": 3e-06, "loss": 58.7408, "step": 1158 }, { "epoch": 0.10339444221419332, "grad_norm": 933.5665283203125, "learning_rate": 3e-06, "loss": 79.1231, "step": 1159 }, { "epoch": 0.10348365225924439, "grad_norm": 787.4599609375, "learning_rate": 3e-06, "loss": 38.9872, "step": 1160 }, { "epoch": 0.10357286230429547, "grad_norm": 807.5901489257812, "learning_rate": 3e-06, "loss": 103.1935, "step": 1161 }, { "epoch": 0.10366207234934653, "grad_norm": 903.9038696289062, "learning_rate": 3e-06, "loss": 35.8445, "step": 1162 }, { "epoch": 0.10375128239439761, "grad_norm": 849.3576049804688, "learning_rate": 3e-06, "loss": 121.1534, "step": 1163 }, { "epoch": 0.10384049243944868, "grad_norm": 772.4011840820312, "learning_rate": 3e-06, "loss": 43.3309, "step": 1164 }, { "completion_length": 129.95834350585938, "epoch": 0.10392970248449976, "grad_norm": 639.35693359375, "learning_rate": 3e-06, "loss": -5.3389, "reward": 2.4029585123062134, "reward_std": 0.3480468839406967, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15295833349227905, "step": 1165, "zero_std_ratio": 0.0 }, { "epoch": 0.10401891252955082, "grad_norm": 668.5325927734375, "learning_rate": 3e-06, "loss": -32.1924, "step": 1166 }, { "epoch": 0.1041081225746019, "grad_norm": 512.8074340820312, "learning_rate": 3e-06, "loss": -10.3926, "step": 1167 }, { "epoch": 0.10419733261965297, "grad_norm": 600.4286499023438, "learning_rate": 3e-06, "loss": -19.6917, "step": 1168 }, { "epoch": 0.10428654266470405, "grad_norm": 520.0072631835938, "learning_rate": 3e-06, "loss": 10.3206, "step": 1169 }, { "epoch": 0.10437575270975512, "grad_norm": 477.1594543457031, "learning_rate": 3e-06, "loss": 7.4605, "step": 1170 }, { "epoch": 0.10446496275480618, "grad_norm": 677.6570434570312, "learning_rate": 3e-06, "loss": -7.1203, "step": 1171 }, { "epoch": 0.10455417279985726, "grad_norm": 595.1732177734375, "learning_rate": 3e-06, "loss": -36.923, "step": 1172 }, { "epoch": 0.10464338284490833, "grad_norm": 493.15252685546875, "learning_rate": 3e-06, "loss": -13.682, "step": 1173 }, { "epoch": 0.10473259288995941, "grad_norm": 761.3983154296875, "learning_rate": 3e-06, "loss": -24.4599, "step": 1174 }, { "epoch": 0.10482180293501048, "grad_norm": 635.04150390625, "learning_rate": 3e-06, "loss": 1.3747, "step": 1175 }, { "epoch": 0.10491101298006156, "grad_norm": 515.3302001953125, "learning_rate": 3e-06, "loss": 1.0323, "step": 1176 }, { "completion_length": 112.1875, "epoch": 0.10500022302511262, "grad_norm": 469.7004699707031, "learning_rate": 3e-06, "loss": -0.444, "reward": 2.1683751344680786, "reward_std": 0.5464861989021301, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22045832872390747, "step": 1177, "zero_std_ratio": 0.0 }, { "epoch": 0.1050894330701637, "grad_norm": 548.61572265625, "learning_rate": 3e-06, "loss": 23.8497, "step": 1178 }, { "epoch": 0.10517864311521477, "grad_norm": 470.7399597167969, "learning_rate": 3e-06, "loss": -7.3893, "step": 1179 }, { "epoch": 0.10526785316026585, "grad_norm": 548.130859375, "learning_rate": 3e-06, "loss": -8.4577, "step": 1180 }, { "epoch": 0.10535706320531692, "grad_norm": 747.939453125, "learning_rate": 3e-06, "loss": 25.5028, "step": 1181 }, { "epoch": 0.105446273250368, "grad_norm": 383.6826171875, "learning_rate": 3e-06, "loss": -1.7932, "step": 1182 }, { "epoch": 0.10553548329541906, "grad_norm": 433.5135498046875, "learning_rate": 3e-06, "loss": -3.3989, "step": 1183 }, { "epoch": 0.10562469334047014, "grad_norm": 548.0432739257812, "learning_rate": 3e-06, "loss": 17.3918, "step": 1184 }, { "epoch": 0.10571390338552121, "grad_norm": 412.3025207519531, "learning_rate": 3e-06, "loss": -13.2844, "step": 1185 }, { "epoch": 0.10580311343057228, "grad_norm": 390.11663818359375, "learning_rate": 3e-06, "loss": -16.4993, "step": 1186 }, { "epoch": 0.10589232347562336, "grad_norm": 766.78857421875, "learning_rate": 3e-06, "loss": 14.6288, "step": 1187 }, { "epoch": 0.10598153352067442, "grad_norm": 359.71173095703125, "learning_rate": 3e-06, "loss": -8.8216, "step": 1188 }, { "completion_length": 139.31250381469727, "epoch": 0.1060707435657255, "grad_norm": 1056.218994140625, "learning_rate": 3e-06, "loss": -19.8215, "reward": 2.124916732311249, "reward_std": 0.4207773655653, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1353333331644535, "step": 1189, "zero_std_ratio": 0.0 }, { "epoch": 0.10615995361077657, "grad_norm": 1287.11962890625, "learning_rate": 3e-06, "loss": -46.0924, "step": 1190 }, { "epoch": 0.10624916365582765, "grad_norm": 951.5126953125, "learning_rate": 3e-06, "loss": 40.7922, "step": 1191 }, { "epoch": 0.10633837370087872, "grad_norm": 918.715087890625, "learning_rate": 3e-06, "loss": -68.4776, "step": 1192 }, { "epoch": 0.1064275837459298, "grad_norm": 1028.908447265625, "learning_rate": 3e-06, "loss": -24.5844, "step": 1193 }, { "epoch": 0.10651679379098086, "grad_norm": 998.2527465820312, "learning_rate": 3e-06, "loss": -34.4867, "step": 1194 }, { "epoch": 0.10660600383603194, "grad_norm": 1073.2415771484375, "learning_rate": 3e-06, "loss": -28.3823, "step": 1195 }, { "epoch": 0.10669521388108301, "grad_norm": 980.68701171875, "learning_rate": 3e-06, "loss": -57.3113, "step": 1196 }, { "epoch": 0.10678442392613409, "grad_norm": 914.2239990234375, "learning_rate": 3e-06, "loss": 36.5887, "step": 1197 }, { "epoch": 0.10687363397118516, "grad_norm": 907.9400634765625, "learning_rate": 3e-06, "loss": -67.0974, "step": 1198 }, { "epoch": 0.10696284401623622, "grad_norm": 944.126220703125, "learning_rate": 3e-06, "loss": -33.4221, "step": 1199 }, { "epoch": 0.1070520540612873, "grad_norm": 1056.388671875, "learning_rate": 3e-06, "loss": -47.9315, "step": 1200 }, { "completion_length": 123.27083587646484, "epoch": 0.10714126410633837, "grad_norm": 662.4260864257812, "learning_rate": 3e-06, "loss": 8.406, "reward": 2.4690834283828735, "reward_std": 0.40174539387226105, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17741665989160538, "step": 1201, "zero_std_ratio": 0.0 }, { "epoch": 0.10723047415138945, "grad_norm": 471.24163818359375, "learning_rate": 3e-06, "loss": -1.3977, "step": 1202 }, { "epoch": 0.10731968419644052, "grad_norm": 460.9804382324219, "learning_rate": 3e-06, "loss": -2.6373, "step": 1203 }, { "epoch": 0.1074088942414916, "grad_norm": 426.29827880859375, "learning_rate": 3e-06, "loss": 13.5829, "step": 1204 }, { "epoch": 0.10749810428654266, "grad_norm": 461.0411071777344, "learning_rate": 3e-06, "loss": -8.1952, "step": 1205 }, { "epoch": 0.10758731433159374, "grad_norm": 441.34466552734375, "learning_rate": 3e-06, "loss": -2.8955, "step": 1206 }, { "epoch": 0.10767652437664481, "grad_norm": 569.8471069335938, "learning_rate": 3e-06, "loss": 3.5701, "step": 1207 }, { "epoch": 0.10776573442169589, "grad_norm": 437.7737121582031, "learning_rate": 3e-06, "loss": -7.5943, "step": 1208 }, { "epoch": 0.10785494446674695, "grad_norm": 465.7676696777344, "learning_rate": 3e-06, "loss": -8.1374, "step": 1209 }, { "epoch": 0.10794415451179804, "grad_norm": 412.93511962890625, "learning_rate": 3e-06, "loss": 7.502, "step": 1210 }, { "epoch": 0.1080333645568491, "grad_norm": 481.81951904296875, "learning_rate": 3e-06, "loss": -8.5634, "step": 1211 }, { "epoch": 0.10812257460190017, "grad_norm": 459.6242980957031, "learning_rate": 3e-06, "loss": -3.8228, "step": 1212 }, { "completion_length": 137.0833396911621, "epoch": 0.10821178464695125, "grad_norm": 554.8159790039062, "learning_rate": 3e-06, "loss": 62.0314, "reward": 1.7515416145324707, "reward_std": 0.3914555162191391, "rewards/correctness_reward_func": 1.1249999701976776, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13695833832025528, "step": 1213, "zero_std_ratio": 0.0 }, { "epoch": 0.10830099469200231, "grad_norm": 598.4298706054688, "learning_rate": 3e-06, "loss": 49.5125, "step": 1214 }, { "epoch": 0.1083902047370534, "grad_norm": 621.552490234375, "learning_rate": 3e-06, "loss": 63.9503, "step": 1215 }, { "epoch": 0.10847941478210446, "grad_norm": 622.46875, "learning_rate": 3e-06, "loss": 76.0874, "step": 1216 }, { "epoch": 0.10856862482715554, "grad_norm": 547.4149780273438, "learning_rate": 3e-06, "loss": 76.761, "step": 1217 }, { "epoch": 0.10865783487220661, "grad_norm": 464.8030700683594, "learning_rate": 3e-06, "loss": 56.6468, "step": 1218 }, { "epoch": 0.10874704491725769, "grad_norm": 483.95391845703125, "learning_rate": 3e-06, "loss": 48.4273, "step": 1219 }, { "epoch": 0.10883625496230875, "grad_norm": 417.3274841308594, "learning_rate": 3e-06, "loss": 40.6516, "step": 1220 }, { "epoch": 0.10892546500735983, "grad_norm": 611.0287475585938, "learning_rate": 3e-06, "loss": 51.9757, "step": 1221 }, { "epoch": 0.1090146750524109, "grad_norm": 534.5418090820312, "learning_rate": 3e-06, "loss": 55.8806, "step": 1222 }, { "epoch": 0.10910388509746198, "grad_norm": 455.1130676269531, "learning_rate": 3e-06, "loss": 57.7598, "step": 1223 }, { "epoch": 0.10919309514251305, "grad_norm": 375.9675598144531, "learning_rate": 3e-06, "loss": 39.7899, "step": 1224 }, { "completion_length": 130.31250762939453, "epoch": 0.10928230518756411, "grad_norm": 515.6392211914062, "learning_rate": 3e-06, "loss": 2.711, "reward": 2.421500086784363, "reward_std": 0.37992818653583527, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12983334064483643, "step": 1225, "zero_std_ratio": 0.125 }, { "epoch": 0.1093715152326152, "grad_norm": 517.3201904296875, "learning_rate": 3e-06, "loss": 2.5469, "step": 1226 }, { "epoch": 0.10946072527766626, "grad_norm": 476.81585693359375, "learning_rate": 3e-06, "loss": -3.3801, "step": 1227 }, { "epoch": 0.10954993532271734, "grad_norm": 389.1577453613281, "learning_rate": 3e-06, "loss": 2.5568, "step": 1228 }, { "epoch": 0.1096391453677684, "grad_norm": 368.0989685058594, "learning_rate": 3e-06, "loss": 13.2772, "step": 1229 }, { "epoch": 0.10972835541281949, "grad_norm": 471.196044921875, "learning_rate": 3e-06, "loss": -11.5976, "step": 1230 }, { "epoch": 0.10981756545787055, "grad_norm": 1111.9873046875, "learning_rate": 3e-06, "loss": -0.3085, "step": 1231 }, { "epoch": 0.10990677550292163, "grad_norm": 546.0790405273438, "learning_rate": 3e-06, "loss": 1.0671, "step": 1232 }, { "epoch": 0.1099959855479727, "grad_norm": 400.2040710449219, "learning_rate": 3e-06, "loss": -7.1922, "step": 1233 }, { "epoch": 0.11008519559302378, "grad_norm": 368.6521301269531, "learning_rate": 3e-06, "loss": -0.5552, "step": 1234 }, { "epoch": 0.11017440563807485, "grad_norm": 350.799560546875, "learning_rate": 3e-06, "loss": 10.4996, "step": 1235 }, { "epoch": 0.11026361568312593, "grad_norm": 538.6735229492188, "learning_rate": 3e-06, "loss": -15.7319, "step": 1236 }, { "completion_length": 113.14583587646484, "epoch": 0.11035282572817699, "grad_norm": 272.7051696777344, "learning_rate": 3e-06, "loss": 5.5426, "reward": 2.4681875705718994, "reward_std": 0.30294275283813477, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18693750351667404, "step": 1237, "zero_std_ratio": 0.0 }, { "epoch": 0.11044203577322806, "grad_norm": 704.2574462890625, "learning_rate": 3e-06, "loss": -12.312, "step": 1238 }, { "epoch": 0.11053124581827914, "grad_norm": 246.17315673828125, "learning_rate": 3e-06, "loss": 13.9124, "step": 1239 }, { "epoch": 0.1106204558633302, "grad_norm": 290.6758117675781, "learning_rate": 3e-06, "loss": -2.8205, "step": 1240 }, { "epoch": 0.11070966590838129, "grad_norm": 250.3959197998047, "learning_rate": 3e-06, "loss": -0.9022, "step": 1241 }, { "epoch": 0.11079887595343235, "grad_norm": 219.3252410888672, "learning_rate": 3e-06, "loss": -11.9642, "step": 1242 }, { "epoch": 0.11088808599848343, "grad_norm": 222.90179443359375, "learning_rate": 3e-06, "loss": 2.4097, "step": 1243 }, { "epoch": 0.1109772960435345, "grad_norm": 186.19627380371094, "learning_rate": 3e-06, "loss": -12.2427, "step": 1244 }, { "epoch": 0.11106650608858558, "grad_norm": 269.6827087402344, "learning_rate": 3e-06, "loss": 12.3411, "step": 1245 }, { "epoch": 0.11115571613363665, "grad_norm": 237.4910430908203, "learning_rate": 3e-06, "loss": -5.9803, "step": 1246 }, { "epoch": 0.11124492617868773, "grad_norm": 219.88262939453125, "learning_rate": 3e-06, "loss": -2.1975, "step": 1247 }, { "epoch": 0.11133413622373879, "grad_norm": 188.75503540039062, "learning_rate": 3e-06, "loss": -11.695, "step": 1248 }, { "completion_length": 138.06250381469727, "epoch": 0.11142334626878987, "grad_norm": 550.1611328125, "learning_rate": 3e-06, "loss": 8.1996, "reward": 2.1077709197998047, "reward_std": 0.6573592722415924, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12860416434705257, "step": 1249, "zero_std_ratio": 0.0 }, { "epoch": 0.11151255631384094, "grad_norm": 739.56591796875, "learning_rate": 3e-06, "loss": 97.7263, "step": 1250 }, { "epoch": 0.111601766358892, "grad_norm": 684.5728759765625, "learning_rate": 3e-06, "loss": 37.5632, "step": 1251 }, { "epoch": 0.11169097640394308, "grad_norm": 567.6437377929688, "learning_rate": 3e-06, "loss": 10.2892, "step": 1252 }, { "epoch": 0.11178018644899415, "grad_norm": 876.806640625, "learning_rate": 3e-06, "loss": -13.9692, "step": 1253 }, { "epoch": 0.11186939649404523, "grad_norm": 592.7362670898438, "learning_rate": 3e-06, "loss": 10.123, "step": 1254 }, { "epoch": 0.1119586065390963, "grad_norm": 509.6954345703125, "learning_rate": 3e-06, "loss": 4.082, "step": 1255 }, { "epoch": 0.11204781658414738, "grad_norm": 781.4515380859375, "learning_rate": 3e-06, "loss": 90.9954, "step": 1256 }, { "epoch": 0.11213702662919844, "grad_norm": 621.8211669921875, "learning_rate": 3e-06, "loss": 32.1321, "step": 1257 }, { "epoch": 0.11222623667424952, "grad_norm": 594.2699584960938, "learning_rate": 3e-06, "loss": 7.6137, "step": 1258 }, { "epoch": 0.11231544671930059, "grad_norm": 815.78857421875, "learning_rate": 3e-06, "loss": -16.6171, "step": 1259 }, { "epoch": 0.11240465676435167, "grad_norm": 552.5240478515625, "learning_rate": 3e-06, "loss": 8.0781, "step": 1260 }, { "completion_length": 130.56250381469727, "epoch": 0.11249386680940274, "grad_norm": 737.4078979492188, "learning_rate": 3e-06, "loss": 59.8285, "reward": 2.060583472251892, "reward_std": 0.43678246438503265, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15433333814144135, "step": 1261, "zero_std_ratio": 0.0 }, { "epoch": 0.11258307685445382, "grad_norm": 522.5460205078125, "learning_rate": 3e-06, "loss": 33.9124, "step": 1262 }, { "epoch": 0.11267228689950488, "grad_norm": 680.2973022460938, "learning_rate": 3e-06, "loss": 63.2628, "step": 1263 }, { "epoch": 0.11276149694455595, "grad_norm": 631.2474365234375, "learning_rate": 3e-06, "loss": 63.7001, "step": 1264 }, { "epoch": 0.11285070698960703, "grad_norm": 618.1221313476562, "learning_rate": 3e-06, "loss": 51.6552, "step": 1265 }, { "epoch": 0.1129399170346581, "grad_norm": 677.8065185546875, "learning_rate": 3e-06, "loss": 56.3779, "step": 1266 }, { "epoch": 0.11302912707970918, "grad_norm": 694.61865234375, "learning_rate": 3e-06, "loss": 47.5276, "step": 1267 }, { "epoch": 0.11311833712476024, "grad_norm": 319.64862060546875, "learning_rate": 3e-06, "loss": 24.7961, "step": 1268 }, { "epoch": 0.11320754716981132, "grad_norm": 711.6901245117188, "learning_rate": 3e-06, "loss": 41.8441, "step": 1269 }, { "epoch": 0.11329675721486239, "grad_norm": 506.5478515625, "learning_rate": 3e-06, "loss": 50.1798, "step": 1270 }, { "epoch": 0.11338596725991347, "grad_norm": 537.6961669921875, "learning_rate": 3e-06, "loss": 32.1026, "step": 1271 }, { "epoch": 0.11347517730496454, "grad_norm": 495.77386474609375, "learning_rate": 3e-06, "loss": 34.678, "step": 1272 }, { "completion_length": 152.4583396911621, "epoch": 0.11356438735001562, "grad_norm": 540.7517700195312, "learning_rate": 3e-06, "loss": -25.9926, "reward": 1.7821251153945923, "reward_std": 0.4489431008696556, "rewards/correctness_reward_func": 1.1666666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12587500177323818, "step": 1273, "zero_std_ratio": 0.0 }, { "epoch": 0.11365359739506668, "grad_norm": 533.8948974609375, "learning_rate": 3e-06, "loss": -10.4261, "step": 1274 }, { "epoch": 0.11374280744011776, "grad_norm": 407.5013122558594, "learning_rate": 3e-06, "loss": -1.9956, "step": 1275 }, { "epoch": 0.11383201748516883, "grad_norm": 435.1839599609375, "learning_rate": 3e-06, "loss": -14.8616, "step": 1276 }, { "epoch": 0.11392122753021991, "grad_norm": 501.2892761230469, "learning_rate": 3e-06, "loss": 3.4901, "step": 1277 }, { "epoch": 0.11401043757527098, "grad_norm": 419.8990173339844, "learning_rate": 3e-06, "loss": -6.3263, "step": 1278 }, { "epoch": 0.11409964762032204, "grad_norm": 508.3893737792969, "learning_rate": 3e-06, "loss": -31.5865, "step": 1279 }, { "epoch": 0.11418885766537312, "grad_norm": 393.2361755371094, "learning_rate": 3e-06, "loss": -9.3518, "step": 1280 }, { "epoch": 0.11427806771042419, "grad_norm": 370.1325378417969, "learning_rate": 3e-06, "loss": -5.1139, "step": 1281 }, { "epoch": 0.11436727775547527, "grad_norm": 382.4687805175781, "learning_rate": 3e-06, "loss": -17.6292, "step": 1282 }, { "epoch": 0.11445648780052634, "grad_norm": 424.23553466796875, "learning_rate": 3e-06, "loss": -1.1541, "step": 1283 }, { "epoch": 0.11454569784557742, "grad_norm": 366.4822998046875, "learning_rate": 3e-06, "loss": -10.3862, "step": 1284 }, { "completion_length": 150.7916717529297, "epoch": 0.11463490789062848, "grad_norm": 64.43208312988281, "learning_rate": 3e-06, "loss": -3.6021, "reward": 2.220250129699707, "reward_std": 0.1756780087016523, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13691666349768639, "step": 1285, "zero_std_ratio": 0.125 }, { "epoch": 0.11472411793567956, "grad_norm": 146.30032348632812, "learning_rate": 3e-06, "loss": -5.2668, "step": 1286 }, { "epoch": 0.11481332798073063, "grad_norm": 63.97313690185547, "learning_rate": 3e-06, "loss": 1.0993, "step": 1287 }, { "epoch": 0.11490253802578171, "grad_norm": 62.98701858520508, "learning_rate": 3e-06, "loss": 0.0564, "step": 1288 }, { "epoch": 0.11499174807083277, "grad_norm": 70.9837417602539, "learning_rate": 3e-06, "loss": -1.8786, "step": 1289 }, { "epoch": 0.11508095811588386, "grad_norm": 127.5965576171875, "learning_rate": 3e-06, "loss": 2.6502, "step": 1290 }, { "epoch": 0.11517016816093492, "grad_norm": 109.8124008178711, "learning_rate": 3e-06, "loss": -4.5564, "step": 1291 }, { "epoch": 0.11525937820598599, "grad_norm": 165.30950927734375, "learning_rate": 3e-06, "loss": -6.4512, "step": 1292 }, { "epoch": 0.11534858825103707, "grad_norm": 52.5026741027832, "learning_rate": 3e-06, "loss": 0.4806, "step": 1293 }, { "epoch": 0.11543779829608813, "grad_norm": 116.0346908569336, "learning_rate": 3e-06, "loss": -0.2058, "step": 1294 }, { "epoch": 0.11552700834113921, "grad_norm": 83.33243560791016, "learning_rate": 3e-06, "loss": -2.1249, "step": 1295 }, { "epoch": 0.11561621838619028, "grad_norm": 105.78126525878906, "learning_rate": 3e-06, "loss": 2.4684, "step": 1296 }, { "completion_length": 132.31250381469727, "epoch": 0.11570542843124136, "grad_norm": 134.2357635498047, "learning_rate": 3e-06, "loss": -3.0816, "reward": 1.8828958868980408, "reward_std": 0.34478074312210083, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1537291705608368, "step": 1297, "zero_std_ratio": 0.0 }, { "epoch": 0.11579463847629243, "grad_norm": 103.42467498779297, "learning_rate": 3e-06, "loss": -8.8164, "step": 1298 }, { "epoch": 0.11588384852134351, "grad_norm": 182.32949829101562, "learning_rate": 3e-06, "loss": -11.4319, "step": 1299 }, { "epoch": 0.11597305856639457, "grad_norm": 208.2166748046875, "learning_rate": 3e-06, "loss": -10.9038, "step": 1300 }, { "epoch": 0.11606226861144565, "grad_norm": 130.95465087890625, "learning_rate": 3e-06, "loss": -10.8062, "step": 1301 }, { "epoch": 0.11615147865649672, "grad_norm": 119.11116790771484, "learning_rate": 3e-06, "loss": -9.441, "step": 1302 }, { "epoch": 0.1162406887015478, "grad_norm": 140.15191650390625, "learning_rate": 3e-06, "loss": -5.4138, "step": 1303 }, { "epoch": 0.11632989874659887, "grad_norm": 133.99444580078125, "learning_rate": 3e-06, "loss": -11.1186, "step": 1304 }, { "epoch": 0.11641910879164993, "grad_norm": 186.12327575683594, "learning_rate": 3e-06, "loss": -13.2229, "step": 1305 }, { "epoch": 0.11650831883670101, "grad_norm": 237.92056274414062, "learning_rate": 3e-06, "loss": -14.0843, "step": 1306 }, { "epoch": 0.11659752888175208, "grad_norm": 166.06137084960938, "learning_rate": 3e-06, "loss": -14.3065, "step": 1307 }, { "epoch": 0.11668673892680316, "grad_norm": 162.08094787597656, "learning_rate": 3e-06, "loss": -13.5576, "step": 1308 }, { "completion_length": 153.43750762939453, "epoch": 0.11677594897185423, "grad_norm": 816.9434814453125, "learning_rate": 3e-06, "loss": 65.7541, "reward": 2.333416700363159, "reward_std": 0.44685766100883484, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1042499989271164, "step": 1309, "zero_std_ratio": 0.125 }, { "epoch": 0.1168651590169053, "grad_norm": 713.5402221679688, "learning_rate": 3e-06, "loss": 86.7146, "step": 1310 }, { "epoch": 0.11695436906195637, "grad_norm": 1150.6640625, "learning_rate": 3e-06, "loss": 87.0367, "step": 1311 }, { "epoch": 0.11704357910700745, "grad_norm": 696.25537109375, "learning_rate": 3e-06, "loss": 58.2521, "step": 1312 }, { "epoch": 0.11713278915205852, "grad_norm": 869.322998046875, "learning_rate": 3e-06, "loss": 60.5598, "step": 1313 }, { "epoch": 0.1172219991971096, "grad_norm": 908.8890991210938, "learning_rate": 3e-06, "loss": 108.3106, "step": 1314 }, { "epoch": 0.11731120924216067, "grad_norm": 816.9879760742188, "learning_rate": 3e-06, "loss": 61.6733, "step": 1315 }, { "epoch": 0.11740041928721175, "grad_norm": 776.9385375976562, "learning_rate": 3e-06, "loss": 78.5087, "step": 1316 }, { "epoch": 0.11748962933226281, "grad_norm": 1054.038330078125, "learning_rate": 3e-06, "loss": 77.8217, "step": 1317 }, { "epoch": 0.11757883937731388, "grad_norm": 593.8987426757812, "learning_rate": 3e-06, "loss": 50.989, "step": 1318 }, { "epoch": 0.11766804942236496, "grad_norm": 748.6307373046875, "learning_rate": 3e-06, "loss": 44.6558, "step": 1319 }, { "epoch": 0.11775725946741603, "grad_norm": 1137.890625, "learning_rate": 3e-06, "loss": 86.6074, "step": 1320 }, { "completion_length": 116.97917175292969, "epoch": 0.1178464695124671, "grad_norm": 59.700103759765625, "learning_rate": 3e-06, "loss": -3.8916, "reward": 2.350229024887085, "reward_std": 0.22116341721266508, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1835625022649765, "step": 1321, "zero_std_ratio": 0.0 }, { "epoch": 0.11793567955751817, "grad_norm": 52.00520706176758, "learning_rate": 3e-06, "loss": -4.6376, "step": 1322 }, { "epoch": 0.11802488960256925, "grad_norm": 61.93866729736328, "learning_rate": 3e-06, "loss": -5.3856, "step": 1323 }, { "epoch": 0.11811409964762032, "grad_norm": 75.79090118408203, "learning_rate": 3e-06, "loss": -10.1807, "step": 1324 }, { "epoch": 0.1182033096926714, "grad_norm": 57.52518081665039, "learning_rate": 3e-06, "loss": -6.9506, "step": 1325 }, { "epoch": 0.11829251973772247, "grad_norm": 54.51726531982422, "learning_rate": 3e-06, "loss": -6.9744, "step": 1326 }, { "epoch": 0.11838172978277355, "grad_norm": 54.5262336730957, "learning_rate": 3e-06, "loss": -4.3897, "step": 1327 }, { "epoch": 0.11847093982782461, "grad_norm": 46.70759201049805, "learning_rate": 3e-06, "loss": -5.4283, "step": 1328 }, { "epoch": 0.11856014987287569, "grad_norm": 84.02616882324219, "learning_rate": 3e-06, "loss": -6.4225, "step": 1329 }, { "epoch": 0.11864935991792676, "grad_norm": 84.2325668334961, "learning_rate": 3e-06, "loss": -11.725, "step": 1330 }, { "epoch": 0.11873856996297782, "grad_norm": 61.59962844848633, "learning_rate": 3e-06, "loss": -8.3019, "step": 1331 }, { "epoch": 0.1188277800080289, "grad_norm": 47.83831024169922, "learning_rate": 3e-06, "loss": -8.035, "step": 1332 }, { "completion_length": 118.75000381469727, "epoch": 0.11891699005307997, "grad_norm": 255.8539276123047, "learning_rate": 3e-06, "loss": -9.5405, "reward": 2.0334584712982178, "reward_std": 0.5094788670539856, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14804166555404663, "step": 1333, "zero_std_ratio": 0.0 }, { "epoch": 0.11900620009813105, "grad_norm": 288.6775207519531, "learning_rate": 3e-06, "loss": 0.6784, "step": 1334 }, { "epoch": 0.11909541014318212, "grad_norm": 197.23643493652344, "learning_rate": 3e-06, "loss": 2.0711, "step": 1335 }, { "epoch": 0.1191846201882332, "grad_norm": 383.6775207519531, "learning_rate": 3e-06, "loss": 7.2175, "step": 1336 }, { "epoch": 0.11927383023328426, "grad_norm": 509.5005187988281, "learning_rate": 3e-06, "loss": 24.5104, "step": 1337 }, { "epoch": 0.11936304027833534, "grad_norm": 453.14947509765625, "learning_rate": 3e-06, "loss": 30.6734, "step": 1338 }, { "epoch": 0.11945225032338641, "grad_norm": 275.91925048828125, "learning_rate": 3e-06, "loss": -11.3542, "step": 1339 }, { "epoch": 0.11954146036843749, "grad_norm": 288.1315612792969, "learning_rate": 3e-06, "loss": -1.6763, "step": 1340 }, { "epoch": 0.11963067041348856, "grad_norm": 190.97845458984375, "learning_rate": 3e-06, "loss": 0.7165, "step": 1341 }, { "epoch": 0.11971988045853964, "grad_norm": 380.86102294921875, "learning_rate": 3e-06, "loss": 4.2454, "step": 1342 }, { "epoch": 0.1198090905035907, "grad_norm": 470.7318420410156, "learning_rate": 3e-06, "loss": 18.4597, "step": 1343 }, { "epoch": 0.11989830054864177, "grad_norm": 439.4569091796875, "learning_rate": 3e-06, "loss": 25.8113, "step": 1344 }, { "completion_length": 138.70834350585938, "epoch": 0.11998751059369285, "grad_norm": 1121.697021484375, "learning_rate": 3e-06, "loss": -1.833, "reward": 1.5940208435058594, "reward_std": 0.1865759715437889, "rewards/correctness_reward_func": 1.0416666865348816, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13568750768899918, "step": 1345, "zero_std_ratio": 0.0 }, { "epoch": 0.12007672063874392, "grad_norm": 361.7314758300781, "learning_rate": 3e-06, "loss": -5.4719, "step": 1346 }, { "epoch": 0.120165930683795, "grad_norm": 722.0169067382812, "learning_rate": 3e-06, "loss": 26.6872, "step": 1347 }, { "epoch": 0.12025514072884606, "grad_norm": 521.7423706054688, "learning_rate": 3e-06, "loss": 12.9836, "step": 1348 }, { "epoch": 0.12034435077389714, "grad_norm": 270.7301025390625, "learning_rate": 3e-06, "loss": 10.6828, "step": 1349 }, { "epoch": 0.12043356081894821, "grad_norm": 339.52813720703125, "learning_rate": 3e-06, "loss": 19.5909, "step": 1350 }, { "epoch": 0.12052277086399929, "grad_norm": 288.1322326660156, "learning_rate": 3e-06, "loss": -2.7182, "step": 1351 }, { "epoch": 0.12061198090905036, "grad_norm": 404.0463562011719, "learning_rate": 3e-06, "loss": -6.4436, "step": 1352 }, { "epoch": 0.12070119095410144, "grad_norm": 480.42559814453125, "learning_rate": 3e-06, "loss": 27.9973, "step": 1353 }, { "epoch": 0.1207904009991525, "grad_norm": 454.6216735839844, "learning_rate": 3e-06, "loss": 10.5823, "step": 1354 }, { "epoch": 0.12087961104420358, "grad_norm": 292.9732971191406, "learning_rate": 3e-06, "loss": 8.2813, "step": 1355 }, { "epoch": 0.12096882108925465, "grad_norm": 302.1980285644531, "learning_rate": 3e-06, "loss": 18.4478, "step": 1356 }, { "completion_length": 114.14583587646484, "epoch": 0.12105803113430573, "grad_norm": 789.7766723632812, "learning_rate": 3e-06, "loss": -361.7816, "reward": 2.2812918424606323, "reward_std": 0.7483960092067719, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1771249994635582, "step": 1357, "zero_std_ratio": 0.0 }, { "epoch": 0.1211472411793568, "grad_norm": 1342.53662109375, "learning_rate": 3e-06, "loss": -332.2092, "step": 1358 }, { "epoch": 0.12123645122440786, "grad_norm": 852.587158203125, "learning_rate": 3e-06, "loss": -354.9008, "step": 1359 }, { "epoch": 0.12132566126945894, "grad_norm": 1136.1865234375, "learning_rate": 3e-06, "loss": -338.6482, "step": 1360 }, { "epoch": 0.12141487131451001, "grad_norm": 1198.8544921875, "learning_rate": 3e-06, "loss": -362.0203, "step": 1361 }, { "epoch": 0.12150408135956109, "grad_norm": 767.9398193359375, "learning_rate": 3e-06, "loss": -368.6167, "step": 1362 }, { "epoch": 0.12159329140461216, "grad_norm": 798.4600830078125, "learning_rate": 3e-06, "loss": -382.5811, "step": 1363 }, { "epoch": 0.12168250144966324, "grad_norm": 905.6390991210938, "learning_rate": 3e-06, "loss": -359.7239, "step": 1364 }, { "epoch": 0.1217717114947143, "grad_norm": 813.3554077148438, "learning_rate": 3e-06, "loss": -387.0817, "step": 1365 }, { "epoch": 0.12186092153976538, "grad_norm": 1077.1717529296875, "learning_rate": 3e-06, "loss": -374.2281, "step": 1366 }, { "epoch": 0.12195013158481645, "grad_norm": 1666.1490478515625, "learning_rate": 3e-06, "loss": -393.2914, "step": 1367 }, { "epoch": 0.12203934162986753, "grad_norm": 792.9961547851562, "learning_rate": 3e-06, "loss": -407.0092, "step": 1368 }, { "completion_length": 143.4791717529297, "epoch": 0.1221285516749186, "grad_norm": 346.3370056152344, "learning_rate": 3e-06, "loss": -0.7849, "reward": 1.621000051498413, "reward_std": 0.5591593682765961, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12099999561905861, "step": 1369, "zero_std_ratio": 0.0 }, { "epoch": 0.12221776171996968, "grad_norm": 331.146240234375, "learning_rate": 3e-06, "loss": 22.5103, "step": 1370 }, { "epoch": 0.12230697176502074, "grad_norm": 303.2864990234375, "learning_rate": 3e-06, "loss": -13.1863, "step": 1371 }, { "epoch": 0.12239618181007181, "grad_norm": 314.0926513671875, "learning_rate": 3e-06, "loss": 7.2222, "step": 1372 }, { "epoch": 0.12248539185512289, "grad_norm": 282.6340637207031, "learning_rate": 3e-06, "loss": 3.6854, "step": 1373 }, { "epoch": 0.12257460190017395, "grad_norm": 327.83856201171875, "learning_rate": 3e-06, "loss": 6.0168, "step": 1374 }, { "epoch": 0.12266381194522503, "grad_norm": 409.41033935546875, "learning_rate": 3e-06, "loss": -3.9087, "step": 1375 }, { "epoch": 0.1227530219902761, "grad_norm": 341.80078125, "learning_rate": 3e-06, "loss": 17.7844, "step": 1376 }, { "epoch": 0.12284223203532718, "grad_norm": 320.9837646484375, "learning_rate": 3e-06, "loss": -18.6977, "step": 1377 }, { "epoch": 0.12293144208037825, "grad_norm": 309.8314208984375, "learning_rate": 3e-06, "loss": 4.6537, "step": 1378 }, { "epoch": 0.12302065212542933, "grad_norm": 286.1745910644531, "learning_rate": 3e-06, "loss": -0.5282, "step": 1379 }, { "epoch": 0.1231098621704804, "grad_norm": 367.7530517578125, "learning_rate": 3e-06, "loss": 2.5712, "step": 1380 }, { "completion_length": 140.4375, "epoch": 0.12319907221553147, "grad_norm": 441.84033203125, "learning_rate": 3e-06, "loss": -35.237, "reward": 2.2606041431427, "reward_std": 0.5069623440504074, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09393750131130219, "step": 1381, "zero_std_ratio": 0.0 }, { "epoch": 0.12328828226058254, "grad_norm": 424.677001953125, "learning_rate": 3e-06, "loss": -30.367, "step": 1382 }, { "epoch": 0.12337749230563362, "grad_norm": 515.3092041015625, "learning_rate": 3e-06, "loss": -114.4816, "step": 1383 }, { "epoch": 0.12346670235068469, "grad_norm": 880.9327392578125, "learning_rate": 3e-06, "loss": -72.5326, "step": 1384 }, { "epoch": 0.12355591239573575, "grad_norm": 943.7388305664062, "learning_rate": 3e-06, "loss": -106.8934, "step": 1385 }, { "epoch": 0.12364512244078683, "grad_norm": 729.3348999023438, "learning_rate": 3e-06, "loss": -63.3284, "step": 1386 }, { "epoch": 0.1237343324858379, "grad_norm": 696.5101928710938, "learning_rate": 3e-06, "loss": -46.2135, "step": 1387 }, { "epoch": 0.12382354253088898, "grad_norm": 893.1636352539062, "learning_rate": 3e-06, "loss": -44.905, "step": 1388 }, { "epoch": 0.12391275257594005, "grad_norm": 477.4311218261719, "learning_rate": 3e-06, "loss": -128.0508, "step": 1389 }, { "epoch": 0.12400196262099113, "grad_norm": 746.9646606445312, "learning_rate": 3e-06, "loss": -93.2043, "step": 1390 }, { "epoch": 0.1240911726660422, "grad_norm": 955.401123046875, "learning_rate": 3e-06, "loss": -134.5935, "step": 1391 }, { "epoch": 0.12418038271109327, "grad_norm": 859.1612548828125, "learning_rate": 3e-06, "loss": -86.3171, "step": 1392 }, { "completion_length": 117.87500381469727, "epoch": 0.12426959275614434, "grad_norm": 664.340576171875, "learning_rate": 3e-06, "loss": -104.9476, "reward": 2.5831665992736816, "reward_std": 0.2565724179148674, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.176916666328907, "step": 1393, "zero_std_ratio": 0.0 }, { "epoch": 0.12435880280119542, "grad_norm": 669.9559936523438, "learning_rate": 3e-06, "loss": -139.0697, "step": 1394 }, { "epoch": 0.12444801284624649, "grad_norm": 674.5648803710938, "learning_rate": 3e-06, "loss": -111.2749, "step": 1395 }, { "epoch": 0.12453722289129757, "grad_norm": 601.0291748046875, "learning_rate": 3e-06, "loss": -83.8575, "step": 1396 }, { "epoch": 0.12462643293634863, "grad_norm": 679.1978759765625, "learning_rate": 3e-06, "loss": -110.6151, "step": 1397 }, { "epoch": 0.1247156429813997, "grad_norm": 724.3170776367188, "learning_rate": 3e-06, "loss": -124.7426, "step": 1398 }, { "epoch": 0.12480485302645078, "grad_norm": 555.1895141601562, "learning_rate": 3e-06, "loss": -126.9873, "step": 1399 }, { "epoch": 0.12489406307150185, "grad_norm": 672.0289916992188, "learning_rate": 3e-06, "loss": -156.5453, "step": 1400 }, { "epoch": 0.12498327311655293, "grad_norm": 560.0007934570312, "learning_rate": 3e-06, "loss": -129.3565, "step": 1401 }, { "epoch": 0.125072483161604, "grad_norm": 564.6636962890625, "learning_rate": 3e-06, "loss": -98.3201, "step": 1402 }, { "epoch": 0.12516169320665507, "grad_norm": 705.2073974609375, "learning_rate": 3e-06, "loss": -137.3138, "step": 1403 }, { "epoch": 0.12525090325170615, "grad_norm": 738.72119140625, "learning_rate": 3e-06, "loss": -152.8548, "step": 1404 }, { "completion_length": 116.75000381469727, "epoch": 0.1253401132967572, "grad_norm": 767.5428466796875, "learning_rate": 3e-06, "loss": -51.0529, "reward": 2.3569791316986084, "reward_std": 0.4473089873790741, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2007291615009308, "step": 1405, "zero_std_ratio": 0.0 }, { "epoch": 0.12542932334180829, "grad_norm": 738.8226928710938, "learning_rate": 3e-06, "loss": -36.4735, "step": 1406 }, { "epoch": 0.12551853338685937, "grad_norm": 532.7732543945312, "learning_rate": 3e-06, "loss": -3.3106, "step": 1407 }, { "epoch": 0.12560774343191045, "grad_norm": 1219.233154296875, "learning_rate": 3e-06, "loss": -10.491, "step": 1408 }, { "epoch": 0.1256969534769615, "grad_norm": 611.2919311523438, "learning_rate": 3e-06, "loss": 5.0358, "step": 1409 }, { "epoch": 0.12578616352201258, "grad_norm": 1330.2183837890625, "learning_rate": 3e-06, "loss": -54.0219, "step": 1410 }, { "epoch": 0.12587537356706366, "grad_norm": 634.2547607421875, "learning_rate": 3e-06, "loss": -61.707, "step": 1411 }, { "epoch": 0.1259645836121147, "grad_norm": 830.1190185546875, "learning_rate": 3e-06, "loss": -44.7945, "step": 1412 }, { "epoch": 0.1260537936571658, "grad_norm": 656.9666748046875, "learning_rate": 3e-06, "loss": -5.745, "step": 1413 }, { "epoch": 0.12614300370221687, "grad_norm": 1234.1181640625, "learning_rate": 3e-06, "loss": -22.9599, "step": 1414 }, { "epoch": 0.12623221374726795, "grad_norm": 810.6359252929688, "learning_rate": 3e-06, "loss": -5.0873, "step": 1415 }, { "epoch": 0.126321423792319, "grad_norm": 1392.664306640625, "learning_rate": 3e-06, "loss": -63.0076, "step": 1416 }, { "completion_length": 107.02083587646484, "epoch": 0.12641063383737008, "grad_norm": 535.7810668945312, "learning_rate": 3e-06, "loss": -19.2557, "reward": 2.204854369163513, "reward_std": 0.5064078867435455, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22568749636411667, "step": 1417, "zero_std_ratio": 0.0 }, { "epoch": 0.12649984388242116, "grad_norm": 547.939453125, "learning_rate": 3e-06, "loss": -52.9769, "step": 1418 }, { "epoch": 0.12658905392747224, "grad_norm": 1266.0494384765625, "learning_rate": 3e-06, "loss": -50.3919, "step": 1419 }, { "epoch": 0.1266782639725233, "grad_norm": 959.639404296875, "learning_rate": 3e-06, "loss": -47.5673, "step": 1420 }, { "epoch": 0.12676747401757438, "grad_norm": 924.2545776367188, "learning_rate": 3e-06, "loss": -11.8101, "step": 1421 }, { "epoch": 0.12685668406262546, "grad_norm": 674.6265869140625, "learning_rate": 3e-06, "loss": -62.9894, "step": 1422 }, { "epoch": 0.12694589410767654, "grad_norm": 611.5636596679688, "learning_rate": 3e-06, "loss": -24.2492, "step": 1423 }, { "epoch": 0.1270351041527276, "grad_norm": 573.57470703125, "learning_rate": 3e-06, "loss": -60.3758, "step": 1424 }, { "epoch": 0.12712431419777867, "grad_norm": 850.60693359375, "learning_rate": 3e-06, "loss": -51.1853, "step": 1425 }, { "epoch": 0.12721352424282975, "grad_norm": 1190.2738037109375, "learning_rate": 3e-06, "loss": -64.8067, "step": 1426 }, { "epoch": 0.1273027342878808, "grad_norm": 884.0977783203125, "learning_rate": 3e-06, "loss": -18.4246, "step": 1427 }, { "epoch": 0.12739194433293188, "grad_norm": 682.3296508789062, "learning_rate": 3e-06, "loss": -72.1404, "step": 1428 }, { "completion_length": 136.56250762939453, "epoch": 0.12748115437798296, "grad_norm": 1359.529052734375, "learning_rate": 3e-06, "loss": 208.4725, "reward": 2.0545417070388794, "reward_std": 0.6211664974689484, "rewards/correctness_reward_func": 1.4583333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12745832651853561, "step": 1429, "zero_std_ratio": 0.0 }, { "epoch": 0.12757036442303404, "grad_norm": 1307.88232421875, "learning_rate": 3e-06, "loss": 254.5796, "step": 1430 }, { "epoch": 0.1276595744680851, "grad_norm": 1878.1898193359375, "learning_rate": 3e-06, "loss": -34.1773, "step": 1431 }, { "epoch": 0.12774878451313618, "grad_norm": 1812.6014404296875, "learning_rate": 3e-06, "loss": 302.3062, "step": 1432 }, { "epoch": 0.12783799455818726, "grad_norm": 2044.9403076171875, "learning_rate": 3e-06, "loss": 44.0528, "step": 1433 }, { "epoch": 0.12792720460323834, "grad_norm": 1851.6116943359375, "learning_rate": 3e-06, "loss": 49.5851, "step": 1434 }, { "epoch": 0.1280164146482894, "grad_norm": 2113.39794921875, "learning_rate": 3e-06, "loss": 196.8499, "step": 1435 }, { "epoch": 0.12810562469334047, "grad_norm": 1287.9775390625, "learning_rate": 3e-06, "loss": 235.1401, "step": 1436 }, { "epoch": 0.12819483473839155, "grad_norm": 1830.1588134765625, "learning_rate": 3e-06, "loss": -49.5711, "step": 1437 }, { "epoch": 0.1282840447834426, "grad_norm": 1435.050048828125, "learning_rate": 3e-06, "loss": 289.4097, "step": 1438 }, { "epoch": 0.12837325482849368, "grad_norm": 1800.0789794921875, "learning_rate": 3e-06, "loss": 30.6825, "step": 1439 }, { "epoch": 0.12846246487354476, "grad_norm": 2387.952880859375, "learning_rate": 3e-06, "loss": 45.8289, "step": 1440 }, { "completion_length": 116.02083587646484, "epoch": 0.12855167491859584, "grad_norm": 1155.3507080078125, "learning_rate": 3e-06, "loss": 265.5505, "reward": 2.0338125824928284, "reward_std": 0.45823561400175095, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2004791647195816, "step": 1441, "zero_std_ratio": 0.0 }, { "epoch": 0.1286408849636469, "grad_norm": 1068.2740478515625, "learning_rate": 3e-06, "loss": 231.8086, "step": 1442 }, { "epoch": 0.12873009500869798, "grad_norm": 1055.8021240234375, "learning_rate": 3e-06, "loss": 265.7319, "step": 1443 }, { "epoch": 0.12881930505374906, "grad_norm": 654.5939331054688, "learning_rate": 3e-06, "loss": 225.5054, "step": 1444 }, { "epoch": 0.12890851509880014, "grad_norm": 853.0970458984375, "learning_rate": 3e-06, "loss": 232.0057, "step": 1445 }, { "epoch": 0.1289977251438512, "grad_norm": 1135.107666015625, "learning_rate": 3e-06, "loss": 221.9774, "step": 1446 }, { "epoch": 0.12908693518890227, "grad_norm": 908.7825927734375, "learning_rate": 3e-06, "loss": 245.362, "step": 1447 }, { "epoch": 0.12917614523395335, "grad_norm": 1166.5682373046875, "learning_rate": 3e-06, "loss": 206.0756, "step": 1448 }, { "epoch": 0.12926535527900443, "grad_norm": 866.4660034179688, "learning_rate": 3e-06, "loss": 237.2835, "step": 1449 }, { "epoch": 0.12935456532405548, "grad_norm": 635.88623046875, "learning_rate": 3e-06, "loss": 216.354, "step": 1450 }, { "epoch": 0.12944377536910656, "grad_norm": 740.2254028320312, "learning_rate": 3e-06, "loss": 211.3737, "step": 1451 }, { "epoch": 0.12953298541415764, "grad_norm": 841.6773681640625, "learning_rate": 3e-06, "loss": 204.9694, "step": 1452 }, { "completion_length": 106.0, "epoch": 0.1296221954592087, "grad_norm": 81.25074768066406, "learning_rate": 3e-06, "loss": -11.2076, "reward": 2.7076042890548706, "reward_std": 0.04550948552787304, "rewards/correctness_reward_func": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20760416984558105, "step": 1453, "zero_std_ratio": 0.0 }, { "epoch": 0.12971140550425977, "grad_norm": 86.57603454589844, "learning_rate": 3e-06, "loss": -11.9418, "step": 1454 }, { "epoch": 0.12980061554931085, "grad_norm": 72.47956848144531, "learning_rate": 3e-06, "loss": -10.8762, "step": 1455 }, { "epoch": 0.12988982559436194, "grad_norm": 87.24005889892578, "learning_rate": 3e-06, "loss": -14.0014, "step": 1456 }, { "epoch": 0.129979035639413, "grad_norm": 70.69519805908203, "learning_rate": 3e-06, "loss": -11.64, "step": 1457 }, { "epoch": 0.13006824568446407, "grad_norm": 67.05101776123047, "learning_rate": 3e-06, "loss": -11.5829, "step": 1458 }, { "epoch": 0.13015745572951515, "grad_norm": 80.47762298583984, "learning_rate": 3e-06, "loss": -11.7077, "step": 1459 }, { "epoch": 0.13024666577456623, "grad_norm": 80.73660278320312, "learning_rate": 3e-06, "loss": -13.0816, "step": 1460 }, { "epoch": 0.13033587581961728, "grad_norm": 82.17903900146484, "learning_rate": 3e-06, "loss": -11.8801, "step": 1461 }, { "epoch": 0.13042508586466836, "grad_norm": 96.77212524414062, "learning_rate": 3e-06, "loss": -15.6827, "step": 1462 }, { "epoch": 0.13051429590971944, "grad_norm": 65.8167724609375, "learning_rate": 3e-06, "loss": -12.5171, "step": 1463 }, { "epoch": 0.1306035059547705, "grad_norm": 70.0600814819336, "learning_rate": 3e-06, "loss": -12.4502, "step": 1464 }, { "completion_length": 121.4375, "epoch": 0.13069271599982157, "grad_norm": 697.7041625976562, "learning_rate": 3e-06, "loss": -71.914, "reward": 2.5268125534057617, "reward_std": 0.3090652823448181, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17264583706855774, "step": 1465, "zero_std_ratio": 0.0 }, { "epoch": 0.13078192604487265, "grad_norm": 969.583740234375, "learning_rate": 3e-06, "loss": -96.3899, "step": 1466 }, { "epoch": 0.13087113608992373, "grad_norm": 1708.633056640625, "learning_rate": 3e-06, "loss": -93.2228, "step": 1467 }, { "epoch": 0.1309603461349748, "grad_norm": 865.3252563476562, "learning_rate": 3e-06, "loss": -123.6049, "step": 1468 }, { "epoch": 0.13104955618002587, "grad_norm": 816.3767700195312, "learning_rate": 3e-06, "loss": -128.9213, "step": 1469 }, { "epoch": 0.13113876622507695, "grad_norm": 763.7314453125, "learning_rate": 3e-06, "loss": -72.8294, "step": 1470 }, { "epoch": 0.13122797627012803, "grad_norm": 745.7823486328125, "learning_rate": 3e-06, "loss": -96.6334, "step": 1471 }, { "epoch": 0.13131718631517908, "grad_norm": 955.34521484375, "learning_rate": 3e-06, "loss": -129.8123, "step": 1472 }, { "epoch": 0.13140639636023016, "grad_norm": 862.6156616210938, "learning_rate": 3e-06, "loss": -139.8425, "step": 1473 }, { "epoch": 0.13149560640528124, "grad_norm": 670.8921508789062, "learning_rate": 3e-06, "loss": -151.7562, "step": 1474 }, { "epoch": 0.13158481645033232, "grad_norm": 995.1624145507812, "learning_rate": 3e-06, "loss": -160.9276, "step": 1475 }, { "epoch": 0.13167402649538337, "grad_norm": 596.0221557617188, "learning_rate": 3e-06, "loss": -98.9438, "step": 1476 }, { "completion_length": 117.20833587646484, "epoch": 0.13176323654043445, "grad_norm": 1281.7259521484375, "learning_rate": 3e-06, "loss": 66.3909, "reward": 2.37918758392334, "reward_std": 0.5254138112068176, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21252083033323288, "step": 1477, "zero_std_ratio": 0.125 }, { "epoch": 0.13185244658548553, "grad_norm": 1832.419189453125, "learning_rate": 3e-06, "loss": 152.6254, "step": 1478 }, { "epoch": 0.13194165663053659, "grad_norm": 2156.6796875, "learning_rate": 3e-06, "loss": 64.6652, "step": 1479 }, { "epoch": 0.13203086667558767, "grad_norm": 1099.5675048828125, "learning_rate": 3e-06, "loss": 74.1128, "step": 1480 }, { "epoch": 0.13212007672063875, "grad_norm": 1159.567138671875, "learning_rate": 3e-06, "loss": 153.6367, "step": 1481 }, { "epoch": 0.13220928676568983, "grad_norm": 1700.0208740234375, "learning_rate": 3e-06, "loss": 110.7726, "step": 1482 }, { "epoch": 0.13229849681074088, "grad_norm": 1182.3519287109375, "learning_rate": 3e-06, "loss": 63.6161, "step": 1483 }, { "epoch": 0.13238770685579196, "grad_norm": 1845.9815673828125, "learning_rate": 3e-06, "loss": 150.0304, "step": 1484 }, { "epoch": 0.13247691690084304, "grad_norm": 2293.36083984375, "learning_rate": 3e-06, "loss": 40.1143, "step": 1485 }, { "epoch": 0.13256612694589412, "grad_norm": 1194.2135009765625, "learning_rate": 3e-06, "loss": 61.8674, "step": 1486 }, { "epoch": 0.13265533699094517, "grad_norm": 1036.80615234375, "learning_rate": 3e-06, "loss": 138.8943, "step": 1487 }, { "epoch": 0.13274454703599625, "grad_norm": 1460.7186279296875, "learning_rate": 3e-06, "loss": 89.2599, "step": 1488 }, { "completion_length": 111.79167175292969, "epoch": 0.13283375708104733, "grad_norm": 765.440185546875, "learning_rate": 3e-06, "loss": 34.1591, "reward": 2.5814167261123657, "reward_std": 0.36231209337711334, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21683332324028015, "step": 1489, "zero_std_ratio": 0.0 }, { "epoch": 0.1329229671260984, "grad_norm": 890.7492065429688, "learning_rate": 3e-06, "loss": 39.8823, "step": 1490 }, { "epoch": 0.13301217717114946, "grad_norm": 581.3748779296875, "learning_rate": 3e-06, "loss": 19.2035, "step": 1491 }, { "epoch": 0.13310138721620055, "grad_norm": 697.298828125, "learning_rate": 3e-06, "loss": 23.3238, "step": 1492 }, { "epoch": 0.13319059726125163, "grad_norm": 535.7828369140625, "learning_rate": 3e-06, "loss": 35.7368, "step": 1493 }, { "epoch": 0.13327980730630268, "grad_norm": 719.7819213867188, "learning_rate": 3e-06, "loss": 5.173, "step": 1494 }, { "epoch": 0.13336901735135376, "grad_norm": 803.5640869140625, "learning_rate": 3e-06, "loss": 25.9877, "step": 1495 }, { "epoch": 0.13345822739640484, "grad_norm": 1086.519287109375, "learning_rate": 3e-06, "loss": 30.5472, "step": 1496 }, { "epoch": 0.13354743744145592, "grad_norm": 955.2864379882812, "learning_rate": 3e-06, "loss": 14.5545, "step": 1497 }, { "epoch": 0.13363664748650697, "grad_norm": 729.1488037109375, "learning_rate": 3e-06, "loss": 17.2955, "step": 1498 }, { "epoch": 0.13372585753155805, "grad_norm": 532.2217407226562, "learning_rate": 3e-06, "loss": 28.6444, "step": 1499 }, { "epoch": 0.13381506757660913, "grad_norm": 681.295166015625, "learning_rate": 3e-06, "loss": -3.4288, "step": 1500 }, { "completion_length": 109.10417175292969, "epoch": 0.1339042776216602, "grad_norm": 874.5466918945312, "learning_rate": 3e-06, "loss": -251.7181, "reward": 2.537354350090027, "reward_std": 0.38086244463920593, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21443749219179153, "step": 1501, "zero_std_ratio": 0.0 }, { "epoch": 0.13399348766671126, "grad_norm": 794.2526245117188, "learning_rate": 3e-06, "loss": -232.9978, "step": 1502 }, { "epoch": 0.13408269771176234, "grad_norm": 898.3654174804688, "learning_rate": 3e-06, "loss": -258.013, "step": 1503 }, { "epoch": 0.13417190775681342, "grad_norm": 950.7967529296875, "learning_rate": 3e-06, "loss": -268.5499, "step": 1504 }, { "epoch": 0.13426111780186448, "grad_norm": 883.3775634765625, "learning_rate": 3e-06, "loss": -260.9937, "step": 1505 }, { "epoch": 0.13435032784691556, "grad_norm": 841.09716796875, "learning_rate": 3e-06, "loss": -201.0721, "step": 1506 }, { "epoch": 0.13443953789196664, "grad_norm": 833.3287353515625, "learning_rate": 3e-06, "loss": -267.1441, "step": 1507 }, { "epoch": 0.13452874793701772, "grad_norm": 800.0440673828125, "learning_rate": 3e-06, "loss": -246.4116, "step": 1508 }, { "epoch": 0.13461795798206877, "grad_norm": 894.3017578125, "learning_rate": 3e-06, "loss": -274.9581, "step": 1509 }, { "epoch": 0.13470716802711985, "grad_norm": 1036.5513916015625, "learning_rate": 3e-06, "loss": -290.6367, "step": 1510 }, { "epoch": 0.13479637807217093, "grad_norm": 768.5258178710938, "learning_rate": 3e-06, "loss": -286.0197, "step": 1511 }, { "epoch": 0.134885588117222, "grad_norm": 835.9033203125, "learning_rate": 3e-06, "loss": -223.6343, "step": 1512 }, { "completion_length": 135.37500381469727, "epoch": 0.13497479816227306, "grad_norm": 1273.8145751953125, "learning_rate": 3e-06, "loss": -118.7228, "reward": 2.0659791231155396, "reward_std": 0.6296385675668716, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14931250736117363, "step": 1513, "zero_std_ratio": 0.0 }, { "epoch": 0.13506400820732414, "grad_norm": 1386.9256591796875, "learning_rate": 3e-06, "loss": -3.7572, "step": 1514 }, { "epoch": 0.13515321825237522, "grad_norm": 1552.3677978515625, "learning_rate": 3e-06, "loss": -51.4518, "step": 1515 }, { "epoch": 0.1352424282974263, "grad_norm": 1948.14111328125, "learning_rate": 3e-06, "loss": -100.8674, "step": 1516 }, { "epoch": 0.13533163834247736, "grad_norm": 1424.5458984375, "learning_rate": 3e-06, "loss": -47.4962, "step": 1517 }, { "epoch": 0.13542084838752844, "grad_norm": 1114.11181640625, "learning_rate": 3e-06, "loss": -90.961, "step": 1518 }, { "epoch": 0.13551005843257952, "grad_norm": 1333.2547607421875, "learning_rate": 3e-06, "loss": -132.5085, "step": 1519 }, { "epoch": 0.13559926847763057, "grad_norm": 1323.2939453125, "learning_rate": 3e-06, "loss": -16.2315, "step": 1520 }, { "epoch": 0.13568847852268165, "grad_norm": 1497.0128173828125, "learning_rate": 3e-06, "loss": -72.5231, "step": 1521 }, { "epoch": 0.13577768856773273, "grad_norm": 1749.5069580078125, "learning_rate": 3e-06, "loss": -125.3563, "step": 1522 }, { "epoch": 0.1358668986127838, "grad_norm": 1598.31787109375, "learning_rate": 3e-06, "loss": -74.3843, "step": 1523 }, { "epoch": 0.13595610865783486, "grad_norm": 1268.6849365234375, "learning_rate": 3e-06, "loss": -109.1455, "step": 1524 }, { "completion_length": 134.52083587646484, "epoch": 0.13604531870288594, "grad_norm": 1321.5076904296875, "learning_rate": 3e-06, "loss": 120.2129, "reward": 2.0626251697540283, "reward_std": 0.5234281718730927, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1563750095665455, "step": 1525, "zero_std_ratio": 0.0 }, { "epoch": 0.13613452874793702, "grad_norm": 1222.2694091796875, "learning_rate": 3e-06, "loss": 130.8681, "step": 1526 }, { "epoch": 0.1362237387929881, "grad_norm": 1433.10498046875, "learning_rate": 3e-06, "loss": 153.9208, "step": 1527 }, { "epoch": 0.13631294883803916, "grad_norm": 1158.427001953125, "learning_rate": 3e-06, "loss": 113.2684, "step": 1528 }, { "epoch": 0.13640215888309024, "grad_norm": 1139.9339599609375, "learning_rate": 3e-06, "loss": 76.0094, "step": 1529 }, { "epoch": 0.13649136892814132, "grad_norm": 1212.093017578125, "learning_rate": 3e-06, "loss": 174.7632, "step": 1530 }, { "epoch": 0.13658057897319237, "grad_norm": 1497.928955078125, "learning_rate": 3e-06, "loss": 110.0752, "step": 1531 }, { "epoch": 0.13666978901824345, "grad_norm": 1279.407958984375, "learning_rate": 3e-06, "loss": 125.8692, "step": 1532 }, { "epoch": 0.13675899906329453, "grad_norm": 1465.5513916015625, "learning_rate": 3e-06, "loss": 142.0573, "step": 1533 }, { "epoch": 0.1368482091083456, "grad_norm": 1114.577880859375, "learning_rate": 3e-06, "loss": 97.0018, "step": 1534 }, { "epoch": 0.13693741915339666, "grad_norm": 1208.373779296875, "learning_rate": 3e-06, "loss": 63.0144, "step": 1535 }, { "epoch": 0.13702662919844774, "grad_norm": 1987.0606689453125, "learning_rate": 3e-06, "loss": 161.4095, "step": 1536 }, { "completion_length": 107.4375, "epoch": 0.13711583924349882, "grad_norm": 1091.0252685546875, "learning_rate": 3e-06, "loss": -297.7603, "reward": 2.2568334341049194, "reward_std": 0.5629529803991318, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.21516666561365128, "step": 1537, "zero_std_ratio": 0.0 }, { "epoch": 0.1372050492885499, "grad_norm": 1411.4197998046875, "learning_rate": 3e-06, "loss": -316.0208, "step": 1538 }, { "epoch": 0.13729425933360095, "grad_norm": 990.6038818359375, "learning_rate": 3e-06, "loss": -275.7917, "step": 1539 }, { "epoch": 0.13738346937865203, "grad_norm": 1163.958740234375, "learning_rate": 3e-06, "loss": -357.1367, "step": 1540 }, { "epoch": 0.13747267942370311, "grad_norm": 1167.002685546875, "learning_rate": 3e-06, "loss": -276.4684, "step": 1541 }, { "epoch": 0.1375618894687542, "grad_norm": 1276.5869140625, "learning_rate": 3e-06, "loss": -369.6291, "step": 1542 }, { "epoch": 0.13765109951380525, "grad_norm": 1164.8736572265625, "learning_rate": 3e-06, "loss": -308.2538, "step": 1543 }, { "epoch": 0.13774030955885633, "grad_norm": 1285.190673828125, "learning_rate": 3e-06, "loss": -326.7666, "step": 1544 }, { "epoch": 0.1378295196039074, "grad_norm": 1247.821533203125, "learning_rate": 3e-06, "loss": -290.6486, "step": 1545 }, { "epoch": 0.13791872964895846, "grad_norm": 1194.13232421875, "learning_rate": 3e-06, "loss": -382.5776, "step": 1546 }, { "epoch": 0.13800793969400954, "grad_norm": 1206.8460693359375, "learning_rate": 3e-06, "loss": -302.577, "step": 1547 }, { "epoch": 0.13809714973906062, "grad_norm": 1196.979248046875, "learning_rate": 3e-06, "loss": -390.5432, "step": 1548 }, { "completion_length": 109.41667175292969, "epoch": 0.1381863597841117, "grad_norm": 326.03936767578125, "learning_rate": 3e-06, "loss": 45.8013, "reward": 2.609562635421753, "reward_std": 0.17681674100458622, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20331250876188278, "step": 1549, "zero_std_ratio": 0.0 }, { "epoch": 0.13827556982916275, "grad_norm": 402.5303649902344, "learning_rate": 3e-06, "loss": 45.5476, "step": 1550 }, { "epoch": 0.13836477987421383, "grad_norm": 445.0027160644531, "learning_rate": 3e-06, "loss": 33.9855, "step": 1551 }, { "epoch": 0.1384539899192649, "grad_norm": 304.2209167480469, "learning_rate": 3e-06, "loss": 74.906, "step": 1552 }, { "epoch": 0.138543199964316, "grad_norm": 268.453369140625, "learning_rate": 3e-06, "loss": 40.646, "step": 1553 }, { "epoch": 0.13863241000936705, "grad_norm": 386.8885498046875, "learning_rate": 3e-06, "loss": 9.6944, "step": 1554 }, { "epoch": 0.13872162005441813, "grad_norm": 319.0711364746094, "learning_rate": 3e-06, "loss": 41.8327, "step": 1555 }, { "epoch": 0.1388108300994692, "grad_norm": 390.6893615722656, "learning_rate": 3e-06, "loss": 42.7597, "step": 1556 }, { "epoch": 0.13890004014452026, "grad_norm": 446.2587890625, "learning_rate": 3e-06, "loss": 28.6514, "step": 1557 }, { "epoch": 0.13898925018957134, "grad_norm": 413.8212585449219, "learning_rate": 3e-06, "loss": 69.6053, "step": 1558 }, { "epoch": 0.13907846023462242, "grad_norm": 313.9552001953125, "learning_rate": 3e-06, "loss": 34.7553, "step": 1559 }, { "epoch": 0.1391676702796735, "grad_norm": 311.7652587890625, "learning_rate": 3e-06, "loss": 8.1029, "step": 1560 }, { "completion_length": 125.66667175292969, "epoch": 0.13925688032472455, "grad_norm": 1157.206787109375, "learning_rate": 3e-06, "loss": -11.8294, "reward": 2.3268543481826782, "reward_std": 0.42937734723091125, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1601875051856041, "step": 1561, "zero_std_ratio": 0.0 }, { "epoch": 0.13934609036977563, "grad_norm": 2671.814453125, "learning_rate": 3e-06, "loss": 55.7136, "step": 1562 }, { "epoch": 0.1394353004148267, "grad_norm": 1623.3624267578125, "learning_rate": 3e-06, "loss": -81.8166, "step": 1563 }, { "epoch": 0.1395245104598778, "grad_norm": 931.291748046875, "learning_rate": 3e-06, "loss": -37.318, "step": 1564 }, { "epoch": 0.13961372050492885, "grad_norm": 1049.9766845703125, "learning_rate": 3e-06, "loss": -43.7951, "step": 1565 }, { "epoch": 0.13970293054997993, "grad_norm": 1135.0767822265625, "learning_rate": 3e-06, "loss": -29.7987, "step": 1566 }, { "epoch": 0.139792140595031, "grad_norm": 1037.486083984375, "learning_rate": 3e-06, "loss": -17.2992, "step": 1567 }, { "epoch": 0.13988135064008209, "grad_norm": 2672.843017578125, "learning_rate": 3e-06, "loss": 22.6063, "step": 1568 }, { "epoch": 0.13997056068513314, "grad_norm": 1275.9603271484375, "learning_rate": 3e-06, "loss": -81.5765, "step": 1569 }, { "epoch": 0.14005977073018422, "grad_norm": 1064.6248779296875, "learning_rate": 3e-06, "loss": -37.3489, "step": 1570 }, { "epoch": 0.1401489807752353, "grad_norm": 1112.7813720703125, "learning_rate": 3e-06, "loss": -46.93, "step": 1571 }, { "epoch": 0.14023819082028635, "grad_norm": 986.8911743164062, "learning_rate": 3e-06, "loss": -37.3976, "step": 1572 }, { "completion_length": 108.14583587646484, "epoch": 0.14032740086533743, "grad_norm": 828.357421875, "learning_rate": 3e-06, "loss": 38.523, "reward": 2.4648125171661377, "reward_std": 0.5029634684324265, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2252291589975357, "step": 1573, "zero_std_ratio": 0.0 }, { "epoch": 0.1404166109103885, "grad_norm": 874.63916015625, "learning_rate": 3e-06, "loss": 17.8019, "step": 1574 }, { "epoch": 0.1405058209554396, "grad_norm": 922.2013549804688, "learning_rate": 3e-06, "loss": 5.0649, "step": 1575 }, { "epoch": 0.14059503100049064, "grad_norm": 740.4825439453125, "learning_rate": 3e-06, "loss": -33.9352, "step": 1576 }, { "epoch": 0.14068424104554172, "grad_norm": 935.9121704101562, "learning_rate": 3e-06, "loss": 22.481, "step": 1577 }, { "epoch": 0.1407734510905928, "grad_norm": 982.98876953125, "learning_rate": 3e-06, "loss": 2.7561, "step": 1578 }, { "epoch": 0.14086266113564389, "grad_norm": 804.1883544921875, "learning_rate": 3e-06, "loss": 32.3521, "step": 1579 }, { "epoch": 0.14095187118069494, "grad_norm": 822.947265625, "learning_rate": 3e-06, "loss": -1.7136, "step": 1580 }, { "epoch": 0.14104108122574602, "grad_norm": 923.2521362304688, "learning_rate": 3e-06, "loss": -5.8192, "step": 1581 }, { "epoch": 0.1411302912707971, "grad_norm": 978.51708984375, "learning_rate": 3e-06, "loss": -40.5703, "step": 1582 }, { "epoch": 0.14121950131584818, "grad_norm": 856.7340698242188, "learning_rate": 3e-06, "loss": 10.946, "step": 1583 }, { "epoch": 0.14130871136089923, "grad_norm": 784.1134643554688, "learning_rate": 3e-06, "loss": -15.1785, "step": 1584 }, { "completion_length": 103.85416793823242, "epoch": 0.1413979214059503, "grad_norm": 757.7247924804688, "learning_rate": 3e-06, "loss": -115.5896, "reward": 2.35756254196167, "reward_std": 0.28461553901433945, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22214583307504654, "step": 1585, "zero_std_ratio": 0.0 }, { "epoch": 0.1414871314510014, "grad_norm": 904.9032592773438, "learning_rate": 3e-06, "loss": -102.5051, "step": 1586 }, { "epoch": 0.14157634149605244, "grad_norm": 940.2787475585938, "learning_rate": 3e-06, "loss": -99.6384, "step": 1587 }, { "epoch": 0.14166555154110352, "grad_norm": 1431.8402099609375, "learning_rate": 3e-06, "loss": -109.1234, "step": 1588 }, { "epoch": 0.1417547615861546, "grad_norm": 1079.49072265625, "learning_rate": 3e-06, "loss": -93.5136, "step": 1589 }, { "epoch": 0.14184397163120568, "grad_norm": 957.7713623046875, "learning_rate": 3e-06, "loss": -114.6748, "step": 1590 }, { "epoch": 0.14193318167625674, "grad_norm": 796.9354858398438, "learning_rate": 3e-06, "loss": -118.4429, "step": 1591 }, { "epoch": 0.14202239172130782, "grad_norm": 874.4432983398438, "learning_rate": 3e-06, "loss": -114.6303, "step": 1592 }, { "epoch": 0.1421116017663589, "grad_norm": 1131.59521484375, "learning_rate": 3e-06, "loss": -116.7429, "step": 1593 }, { "epoch": 0.14220081181140998, "grad_norm": 1635.96044921875, "learning_rate": 3e-06, "loss": -132.6347, "step": 1594 }, { "epoch": 0.14229002185646103, "grad_norm": 1009.574951171875, "learning_rate": 3e-06, "loss": -113.8876, "step": 1595 }, { "epoch": 0.1423792319015121, "grad_norm": 1217.6295166015625, "learning_rate": 3e-06, "loss": -117.312, "step": 1596 }, { "completion_length": 113.77083587646484, "epoch": 0.1424684419465632, "grad_norm": 306.18682861328125, "learning_rate": 3e-06, "loss": 10.6702, "reward": 2.6016459465026855, "reward_std": 0.2558625042438507, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1849791705608368, "step": 1597, "zero_std_ratio": 0.0 }, { "epoch": 0.14255765199161424, "grad_norm": 373.6190490722656, "learning_rate": 3e-06, "loss": 13.2399, "step": 1598 }, { "epoch": 0.14264686203666532, "grad_norm": 604.7830200195312, "learning_rate": 3e-06, "loss": 1.0527, "step": 1599 }, { "epoch": 0.1427360720817164, "grad_norm": 464.7652893066406, "learning_rate": 3e-06, "loss": 7.1036, "step": 1600 }, { "epoch": 0.14282528212676748, "grad_norm": 711.72607421875, "learning_rate": 3e-06, "loss": -19.469, "step": 1601 }, { "epoch": 0.14291449217181854, "grad_norm": 902.3880004882812, "learning_rate": 3e-06, "loss": -15.7111, "step": 1602 }, { "epoch": 0.14300370221686962, "grad_norm": 393.73492431640625, "learning_rate": 3e-06, "loss": 9.011, "step": 1603 }, { "epoch": 0.1430929122619207, "grad_norm": 426.3011779785156, "learning_rate": 3e-06, "loss": 9.2521, "step": 1604 }, { "epoch": 0.14318212230697178, "grad_norm": 622.9773559570312, "learning_rate": 3e-06, "loss": -0.7366, "step": 1605 }, { "epoch": 0.14327133235202283, "grad_norm": 402.3107604980469, "learning_rate": 3e-06, "loss": 4.3138, "step": 1606 }, { "epoch": 0.1433605423970739, "grad_norm": 829.762451171875, "learning_rate": 3e-06, "loss": -23.0001, "step": 1607 }, { "epoch": 0.143449752442125, "grad_norm": 1001.1775512695312, "learning_rate": 3e-06, "loss": -21.5458, "step": 1608 }, { "completion_length": 118.16667175292969, "epoch": 0.14353896248717607, "grad_norm": 1849.2244873046875, "learning_rate": 3e-06, "loss": -563.4475, "reward": 2.283812642097473, "reward_std": 0.7450865209102631, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2108958289027214, "step": 1609, "zero_std_ratio": 0.0 }, { "epoch": 0.14362817253222712, "grad_norm": 1622.8909912109375, "learning_rate": 3e-06, "loss": -383.3872, "step": 1610 }, { "epoch": 0.1437173825772782, "grad_norm": 1626.39599609375, "learning_rate": 3e-06, "loss": -388.6346, "step": 1611 }, { "epoch": 0.14380659262232928, "grad_norm": 1461.013427734375, "learning_rate": 3e-06, "loss": -516.7939, "step": 1612 }, { "epoch": 0.14389580266738033, "grad_norm": 1593.8126220703125, "learning_rate": 3e-06, "loss": -459.248, "step": 1613 }, { "epoch": 0.14398501271243141, "grad_norm": 1501.4661865234375, "learning_rate": 3e-06, "loss": -361.4013, "step": 1614 }, { "epoch": 0.1440742227574825, "grad_norm": 1959.175537109375, "learning_rate": 3e-06, "loss": -599.3701, "step": 1615 }, { "epoch": 0.14416343280253358, "grad_norm": 1524.2763671875, "learning_rate": 3e-06, "loss": -406.2586, "step": 1616 }, { "epoch": 0.14425264284758463, "grad_norm": 2186.95947265625, "learning_rate": 3e-06, "loss": -424.5395, "step": 1617 }, { "epoch": 0.1443418528926357, "grad_norm": 1751.9039306640625, "learning_rate": 3e-06, "loss": -537.6578, "step": 1618 }, { "epoch": 0.1444310629376868, "grad_norm": 1491.2742919921875, "learning_rate": 3e-06, "loss": -481.118, "step": 1619 }, { "epoch": 0.14452027298273787, "grad_norm": 1423.106689453125, "learning_rate": 3e-06, "loss": -393.4251, "step": 1620 }, { "completion_length": 123.56250381469727, "epoch": 0.14460948302778892, "grad_norm": 1347.345703125, "learning_rate": 3e-06, "loss": -484.3067, "reward": 2.2743124961853027, "reward_std": 0.4947032183408737, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19097916781902313, "step": 1621, "zero_std_ratio": 0.0 }, { "epoch": 0.14469869307284, "grad_norm": 1622.2376708984375, "learning_rate": 3e-06, "loss": -543.3698, "step": 1622 }, { "epoch": 0.14478790311789108, "grad_norm": 1387.509521484375, "learning_rate": 3e-06, "loss": -551.8273, "step": 1623 }, { "epoch": 0.14487711316294213, "grad_norm": 1271.8485107421875, "learning_rate": 3e-06, "loss": -501.6958, "step": 1624 }, { "epoch": 0.14496632320799321, "grad_norm": 1232.3084716796875, "learning_rate": 3e-06, "loss": -512.8577, "step": 1625 }, { "epoch": 0.1450555332530443, "grad_norm": 1346.7532958984375, "learning_rate": 3e-06, "loss": -458.5431, "step": 1626 }, { "epoch": 0.14514474329809537, "grad_norm": 1523.1427001953125, "learning_rate": 3e-06, "loss": -518.0624, "step": 1627 }, { "epoch": 0.14523395334314643, "grad_norm": 1862.5703125, "learning_rate": 3e-06, "loss": -572.3401, "step": 1628 }, { "epoch": 0.1453231633881975, "grad_norm": 1202.2999267578125, "learning_rate": 3e-06, "loss": -586.3882, "step": 1629 }, { "epoch": 0.1454123734332486, "grad_norm": 1310.3470458984375, "learning_rate": 3e-06, "loss": -542.1135, "step": 1630 }, { "epoch": 0.14550158347829967, "grad_norm": 1226.2174072265625, "learning_rate": 3e-06, "loss": -556.5128, "step": 1631 }, { "epoch": 0.14559079352335072, "grad_norm": 1245.717041015625, "learning_rate": 3e-06, "loss": -500.3018, "step": 1632 }, { "completion_length": 161.08333587646484, "epoch": 0.1456800035684018, "grad_norm": 1047.9556884765625, "learning_rate": 3e-06, "loss": 62.7677, "reward": 1.9896875023841858, "reward_std": 0.36980947852134705, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08343750238418579, "step": 1633, "zero_std_ratio": 0.0 }, { "epoch": 0.14576921361345288, "grad_norm": 1155.3780517578125, "learning_rate": 3e-06, "loss": 45.3963, "step": 1634 }, { "epoch": 0.14585842365850396, "grad_norm": 1296.7313232421875, "learning_rate": 3e-06, "loss": 47.438, "step": 1635 }, { "epoch": 0.145947633703555, "grad_norm": 1112.166015625, "learning_rate": 3e-06, "loss": -102.1916, "step": 1636 }, { "epoch": 0.1460368437486061, "grad_norm": 979.8106689453125, "learning_rate": 3e-06, "loss": 49.4149, "step": 1637 }, { "epoch": 0.14612605379365717, "grad_norm": 1428.285888671875, "learning_rate": 3e-06, "loss": -44.5905, "step": 1638 }, { "epoch": 0.14621526383870823, "grad_norm": 1178.86572265625, "learning_rate": 3e-06, "loss": 48.5944, "step": 1639 }, { "epoch": 0.1463044738837593, "grad_norm": 1211.369384765625, "learning_rate": 3e-06, "loss": 43.942, "step": 1640 }, { "epoch": 0.1463936839288104, "grad_norm": 1153.5445556640625, "learning_rate": 3e-06, "loss": 34.518, "step": 1641 }, { "epoch": 0.14648289397386147, "grad_norm": 1077.5166015625, "learning_rate": 3e-06, "loss": -116.0781, "step": 1642 }, { "epoch": 0.14657210401891252, "grad_norm": 1041.2066650390625, "learning_rate": 3e-06, "loss": 40.4303, "step": 1643 }, { "epoch": 0.1466613140639636, "grad_norm": 1450.99609375, "learning_rate": 3e-06, "loss": -44.6822, "step": 1644 }, { "completion_length": 127.64583969116211, "epoch": 0.14675052410901468, "grad_norm": 2347.143310546875, "learning_rate": 3e-06, "loss": 384.8905, "reward": 1.9467709064483643, "reward_std": 0.565439760684967, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15510417520999908, "step": 1645, "zero_std_ratio": 0.0 }, { "epoch": 0.14683973415406576, "grad_norm": 1446.45458984375, "learning_rate": 3e-06, "loss": 170.8209, "step": 1646 }, { "epoch": 0.1469289441991168, "grad_norm": 1259.9295654296875, "learning_rate": 3e-06, "loss": 182.3417, "step": 1647 }, { "epoch": 0.1470181542441679, "grad_norm": 1511.966796875, "learning_rate": 3e-06, "loss": 90.1429, "step": 1648 }, { "epoch": 0.14710736428921897, "grad_norm": 1601.6256103515625, "learning_rate": 3e-06, "loss": 212.7898, "step": 1649 }, { "epoch": 0.14719657433427003, "grad_norm": 1477.3070068359375, "learning_rate": 3e-06, "loss": 107.817, "step": 1650 }, { "epoch": 0.1472857843793211, "grad_norm": 4009.107666015625, "learning_rate": 3e-06, "loss": 349.6491, "step": 1651 }, { "epoch": 0.14737499442437219, "grad_norm": 1204.2269287109375, "learning_rate": 3e-06, "loss": 160.3832, "step": 1652 }, { "epoch": 0.14746420446942327, "grad_norm": 1213.4332275390625, "learning_rate": 3e-06, "loss": 168.7216, "step": 1653 }, { "epoch": 0.14755341451447432, "grad_norm": 1349.8916015625, "learning_rate": 3e-06, "loss": 70.4885, "step": 1654 }, { "epoch": 0.1476426245595254, "grad_norm": 1503.792236328125, "learning_rate": 3e-06, "loss": 177.7434, "step": 1655 }, { "epoch": 0.14773183460457648, "grad_norm": 1194.224365234375, "learning_rate": 3e-06, "loss": 81.2416, "step": 1656 }, { "completion_length": 123.64583587646484, "epoch": 0.14782104464962756, "grad_norm": 1002.905517578125, "learning_rate": 3e-06, "loss": -0.0244, "reward": 2.277187466621399, "reward_std": 0.6942009925842285, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19385415315628052, "step": 1657, "zero_std_ratio": 0.0 }, { "epoch": 0.1479102546946786, "grad_norm": 1149.109619140625, "learning_rate": 3e-06, "loss": 41.6179, "step": 1658 }, { "epoch": 0.1479994647397297, "grad_norm": 1113.2333984375, "learning_rate": 3e-06, "loss": 78.8026, "step": 1659 }, { "epoch": 0.14808867478478077, "grad_norm": 1029.0260009765625, "learning_rate": 3e-06, "loss": 47.1482, "step": 1660 }, { "epoch": 0.14817788482983185, "grad_norm": 1232.3424072265625, "learning_rate": 3e-06, "loss": -10.8712, "step": 1661 }, { "epoch": 0.1482670948748829, "grad_norm": 1049.901123046875, "learning_rate": 3e-06, "loss": 100.9177, "step": 1662 }, { "epoch": 0.14835630491993398, "grad_norm": 904.3103637695312, "learning_rate": 3e-06, "loss": -14.9282, "step": 1663 }, { "epoch": 0.14844551496498506, "grad_norm": 1009.290771484375, "learning_rate": 3e-06, "loss": 28.7193, "step": 1664 }, { "epoch": 0.14853472501003612, "grad_norm": 1085.2960205078125, "learning_rate": 3e-06, "loss": 64.3064, "step": 1665 }, { "epoch": 0.1486239350550872, "grad_norm": 885.9617919921875, "learning_rate": 3e-06, "loss": 34.0377, "step": 1666 }, { "epoch": 0.14871314510013828, "grad_norm": 1138.4622802734375, "learning_rate": 3e-06, "loss": -29.1467, "step": 1667 }, { "epoch": 0.14880235514518936, "grad_norm": 1003.6993408203125, "learning_rate": 3e-06, "loss": 87.8599, "step": 1668 }, { "completion_length": 124.60417175292969, "epoch": 0.1488915651902404, "grad_norm": 148.32191467285156, "learning_rate": 3e-06, "loss": -30.0848, "reward": 2.4312500953674316, "reward_std": 0.03657746687531471, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18124999850988388, "step": 1669, "zero_std_ratio": 0.0 }, { "epoch": 0.1489807752352915, "grad_norm": 148.70933532714844, "learning_rate": 3e-06, "loss": -23.8765, "step": 1670 }, { "epoch": 0.14906998528034257, "grad_norm": 151.1920623779297, "learning_rate": 3e-06, "loss": -26.6285, "step": 1671 }, { "epoch": 0.14915919532539365, "grad_norm": 136.03228759765625, "learning_rate": 3e-06, "loss": -21.6761, "step": 1672 }, { "epoch": 0.1492484053704447, "grad_norm": 177.11851501464844, "learning_rate": 3e-06, "loss": -24.8027, "step": 1673 }, { "epoch": 0.14933761541549578, "grad_norm": 122.48726654052734, "learning_rate": 3e-06, "loss": -24.5533, "step": 1674 }, { "epoch": 0.14942682546054686, "grad_norm": 145.15347290039062, "learning_rate": 3e-06, "loss": -32.7347, "step": 1675 }, { "epoch": 0.14951603550559794, "grad_norm": 147.5530242919922, "learning_rate": 3e-06, "loss": -26.6998, "step": 1676 }, { "epoch": 0.149605245550649, "grad_norm": 186.55825805664062, "learning_rate": 3e-06, "loss": -29.4036, "step": 1677 }, { "epoch": 0.14969445559570008, "grad_norm": 165.18231201171875, "learning_rate": 3e-06, "loss": -26.6914, "step": 1678 }, { "epoch": 0.14978366564075116, "grad_norm": 170.1136016845703, "learning_rate": 3e-06, "loss": -30.6937, "step": 1679 }, { "epoch": 0.1498728756858022, "grad_norm": 131.10781860351562, "learning_rate": 3e-06, "loss": -29.9625, "step": 1680 }, { "completion_length": 149.37500762939453, "epoch": 0.1499620857308533, "grad_norm": 796.8214111328125, "learning_rate": 3e-06, "loss": -24.992, "reward": 2.1661041378974915, "reward_std": 0.49460718035697937, "rewards/correctness_reward_func": 1.5833333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08277083188295364, "step": 1681, "zero_std_ratio": 0.125 }, { "epoch": 0.15005129577590437, "grad_norm": 995.2013549804688, "learning_rate": 3e-06, "loss": -59.4395, "step": 1682 }, { "epoch": 0.15014050582095545, "grad_norm": 1657.536376953125, "learning_rate": 3e-06, "loss": -37.3866, "step": 1683 }, { "epoch": 0.1502297158660065, "grad_norm": 1062.8961181640625, "learning_rate": 3e-06, "loss": -45.4256, "step": 1684 }, { "epoch": 0.15031892591105758, "grad_norm": 1196.541259765625, "learning_rate": 3e-06, "loss": -21.982, "step": 1685 }, { "epoch": 0.15040813595610866, "grad_norm": 1009.1981811523438, "learning_rate": 3e-06, "loss": 1.7535, "step": 1686 }, { "epoch": 0.15049734600115974, "grad_norm": 897.9784545898438, "learning_rate": 3e-06, "loss": -18.5637, "step": 1687 }, { "epoch": 0.1505865560462108, "grad_norm": 1156.031494140625, "learning_rate": 3e-06, "loss": -74.5365, "step": 1688 }, { "epoch": 0.15067576609126188, "grad_norm": 962.795166015625, "learning_rate": 3e-06, "loss": -42.1126, "step": 1689 }, { "epoch": 0.15076497613631296, "grad_norm": 1172.703125, "learning_rate": 3e-06, "loss": -52.0831, "step": 1690 }, { "epoch": 0.150854186181364, "grad_norm": 1299.485595703125, "learning_rate": 3e-06, "loss": -18.9543, "step": 1691 }, { "epoch": 0.1509433962264151, "grad_norm": 1174.2447509765625, "learning_rate": 3e-06, "loss": -0.5707, "step": 1692 }, { "completion_length": 115.62500381469727, "epoch": 0.15103260627146617, "grad_norm": 932.6162109375, "learning_rate": 3e-06, "loss": -99.4483, "reward": 2.3194793462753296, "reward_std": 0.6202397346496582, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17364583909511566, "step": 1693, "zero_std_ratio": 0.0 }, { "epoch": 0.15112181631651725, "grad_norm": 1099.0394287109375, "learning_rate": 3e-06, "loss": -107.816, "step": 1694 }, { "epoch": 0.1512110263615683, "grad_norm": 919.58740234375, "learning_rate": 3e-06, "loss": -80.3083, "step": 1695 }, { "epoch": 0.15130023640661938, "grad_norm": 1122.83349609375, "learning_rate": 3e-06, "loss": -19.2087, "step": 1696 }, { "epoch": 0.15138944645167046, "grad_norm": 1361.2589111328125, "learning_rate": 3e-06, "loss": -164.0316, "step": 1697 }, { "epoch": 0.15147865649672154, "grad_norm": 1414.1632080078125, "learning_rate": 3e-06, "loss": -114.2859, "step": 1698 }, { "epoch": 0.1515678665417726, "grad_norm": 968.30126953125, "learning_rate": 3e-06, "loss": -110.0306, "step": 1699 }, { "epoch": 0.15165707658682367, "grad_norm": 1144.3707275390625, "learning_rate": 3e-06, "loss": -118.6632, "step": 1700 }, { "epoch": 0.15174628663187475, "grad_norm": 971.5307006835938, "learning_rate": 3e-06, "loss": -92.3818, "step": 1701 }, { "epoch": 0.15183549667692584, "grad_norm": 917.9314575195312, "learning_rate": 3e-06, "loss": -20.9716, "step": 1702 }, { "epoch": 0.1519247067219769, "grad_norm": 1136.1475830078125, "learning_rate": 3e-06, "loss": -178.9898, "step": 1703 }, { "epoch": 0.15201391676702797, "grad_norm": 1357.263916015625, "learning_rate": 3e-06, "loss": -127.0251, "step": 1704 }, { "completion_length": 123.16667175292969, "epoch": 0.15210312681207905, "grad_norm": 881.5326538085938, "learning_rate": 3e-06, "loss": 93.0848, "reward": 2.370583415031433, "reward_std": 0.38110819458961487, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16224999725818634, "step": 1705, "zero_std_ratio": 0.125 }, { "epoch": 0.1521923368571301, "grad_norm": 744.7655029296875, "learning_rate": 3e-06, "loss": 42.1843, "step": 1706 }, { "epoch": 0.15228154690218118, "grad_norm": 918.53125, "learning_rate": 3e-06, "loss": 65.0944, "step": 1707 }, { "epoch": 0.15237075694723226, "grad_norm": 670.804443359375, "learning_rate": 3e-06, "loss": 58.4442, "step": 1708 }, { "epoch": 0.15245996699228334, "grad_norm": 811.63134765625, "learning_rate": 3e-06, "loss": 75.2296, "step": 1709 }, { "epoch": 0.1525491770373344, "grad_norm": 890.50537109375, "learning_rate": 3e-06, "loss": 76.3318, "step": 1710 }, { "epoch": 0.15263838708238547, "grad_norm": 802.3552856445312, "learning_rate": 3e-06, "loss": 74.1002, "step": 1711 }, { "epoch": 0.15272759712743655, "grad_norm": 622.7321166992188, "learning_rate": 3e-06, "loss": 32.5934, "step": 1712 }, { "epoch": 0.15281680717248763, "grad_norm": 636.275390625, "learning_rate": 3e-06, "loss": 44.7401, "step": 1713 }, { "epoch": 0.1529060172175387, "grad_norm": 480.5521240234375, "learning_rate": 3e-06, "loss": 44.6325, "step": 1714 }, { "epoch": 0.15299522726258977, "grad_norm": 526.2603759765625, "learning_rate": 3e-06, "loss": 49.214, "step": 1715 }, { "epoch": 0.15308443730764085, "grad_norm": 582.4856567382812, "learning_rate": 3e-06, "loss": 46.1537, "step": 1716 }, { "completion_length": 133.3333396911621, "epoch": 0.1531736473526919, "grad_norm": 535.0222778320312, "learning_rate": 3e-06, "loss": -6.0903, "reward": 2.4653126001358032, "reward_std": 0.3633167892694473, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14239583536982536, "step": 1717, "zero_std_ratio": 0.0 }, { "epoch": 0.15326285739774298, "grad_norm": 436.4328308105469, "learning_rate": 3e-06, "loss": -2.8542, "step": 1718 }, { "epoch": 0.15335206744279406, "grad_norm": 1064.8994140625, "learning_rate": 3e-06, "loss": -67.8062, "step": 1719 }, { "epoch": 0.15344127748784514, "grad_norm": 577.8828735351562, "learning_rate": 3e-06, "loss": 43.8438, "step": 1720 }, { "epoch": 0.1535304875328962, "grad_norm": 503.5760803222656, "learning_rate": 3e-06, "loss": 6.5897, "step": 1721 }, { "epoch": 0.15361969757794727, "grad_norm": 328.094970703125, "learning_rate": 3e-06, "loss": 12.6563, "step": 1722 }, { "epoch": 0.15370890762299835, "grad_norm": 471.933349609375, "learning_rate": 3e-06, "loss": -12.6933, "step": 1723 }, { "epoch": 0.15379811766804943, "grad_norm": 385.73516845703125, "learning_rate": 3e-06, "loss": -1.7539, "step": 1724 }, { "epoch": 0.15388732771310049, "grad_norm": 986.2044067382812, "learning_rate": 3e-06, "loss": -54.8942, "step": 1725 }, { "epoch": 0.15397653775815157, "grad_norm": 443.2298583984375, "learning_rate": 3e-06, "loss": 31.631, "step": 1726 }, { "epoch": 0.15406574780320265, "grad_norm": 531.0643920898438, "learning_rate": 3e-06, "loss": 1.8442, "step": 1727 }, { "epoch": 0.15415495784825373, "grad_norm": 283.9168701171875, "learning_rate": 3e-06, "loss": 12.0514, "step": 1728 }, { "completion_length": 146.81250762939453, "epoch": 0.15424416789330478, "grad_norm": 249.7176971435547, "learning_rate": 3e-06, "loss": 8.7164, "reward": 2.526937484741211, "reward_std": 0.22450600564479828, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11027083173394203, "step": 1729, "zero_std_ratio": 0.0 }, { "epoch": 0.15433337793835586, "grad_norm": 415.48431396484375, "learning_rate": 3e-06, "loss": 37.7652, "step": 1730 }, { "epoch": 0.15442258798340694, "grad_norm": 312.5050354003906, "learning_rate": 3e-06, "loss": 13.5347, "step": 1731 }, { "epoch": 0.154511798028458, "grad_norm": 272.3165588378906, "learning_rate": 3e-06, "loss": 17.2606, "step": 1732 }, { "epoch": 0.15460100807350907, "grad_norm": 345.10382080078125, "learning_rate": 3e-06, "loss": 4.1699, "step": 1733 }, { "epoch": 0.15469021811856015, "grad_norm": 221.7129364013672, "learning_rate": 3e-06, "loss": 9.5225, "step": 1734 }, { "epoch": 0.15477942816361123, "grad_norm": 285.18927001953125, "learning_rate": 3e-06, "loss": 5.8041, "step": 1735 }, { "epoch": 0.15486863820866228, "grad_norm": 374.7469482421875, "learning_rate": 3e-06, "loss": 29.7232, "step": 1736 }, { "epoch": 0.15495784825371337, "grad_norm": 265.818603515625, "learning_rate": 3e-06, "loss": 8.8928, "step": 1737 }, { "epoch": 0.15504705829876445, "grad_norm": 227.65782165527344, "learning_rate": 3e-06, "loss": 12.3003, "step": 1738 }, { "epoch": 0.15513626834381553, "grad_norm": 232.1507110595703, "learning_rate": 3e-06, "loss": 3.8034, "step": 1739 }, { "epoch": 0.15522547838886658, "grad_norm": 177.14718627929688, "learning_rate": 3e-06, "loss": 5.6435, "step": 1740 }, { "completion_length": 162.33333587646484, "epoch": 0.15531468843391766, "grad_norm": 994.4199829101562, "learning_rate": 3e-06, "loss": 58.5382, "reward": 1.847833514213562, "reward_std": 0.5970688164234161, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06658333353698254, "step": 1741, "zero_std_ratio": 0.0 }, { "epoch": 0.15540389847896874, "grad_norm": 1349.40283203125, "learning_rate": 3e-06, "loss": 54.8541, "step": 1742 }, { "epoch": 0.1554931085240198, "grad_norm": 1288.0029296875, "learning_rate": 3e-06, "loss": 70.4871, "step": 1743 }, { "epoch": 0.15558231856907087, "grad_norm": 1065.3143310546875, "learning_rate": 3e-06, "loss": 80.8355, "step": 1744 }, { "epoch": 0.15567152861412195, "grad_norm": 1030.62939453125, "learning_rate": 3e-06, "loss": 58.8967, "step": 1745 }, { "epoch": 0.15576073865917303, "grad_norm": 676.4586791992188, "learning_rate": 3e-06, "loss": 32.0503, "step": 1746 }, { "epoch": 0.15584994870422408, "grad_norm": 754.9002685546875, "learning_rate": 3e-06, "loss": 46.5822, "step": 1747 }, { "epoch": 0.15593915874927516, "grad_norm": 771.5171508789062, "learning_rate": 3e-06, "loss": 53.0914, "step": 1748 }, { "epoch": 0.15602836879432624, "grad_norm": 624.7284545898438, "learning_rate": 3e-06, "loss": 54.3331, "step": 1749 }, { "epoch": 0.15611757883937732, "grad_norm": 721.7028198242188, "learning_rate": 3e-06, "loss": 50.9435, "step": 1750 }, { "epoch": 0.15620678888442838, "grad_norm": 981.5281982421875, "learning_rate": 3e-06, "loss": 41.9982, "step": 1751 }, { "epoch": 0.15629599892947946, "grad_norm": 476.7873840332031, "learning_rate": 3e-06, "loss": 23.7931, "step": 1752 }, { "completion_length": 157.45833587646484, "epoch": 0.15638520897453054, "grad_norm": 1073.9844970703125, "learning_rate": 3e-06, "loss": -17.6326, "reward": 1.8527084589004517, "reward_std": 0.6355842351913452, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10270833969116211, "step": 1753, "zero_std_ratio": 0.0 }, { "epoch": 0.15647441901958162, "grad_norm": 1561.85595703125, "learning_rate": 3e-06, "loss": -52.6263, "step": 1754 }, { "epoch": 0.15656362906463267, "grad_norm": 979.494140625, "learning_rate": 3e-06, "loss": -94.7826, "step": 1755 }, { "epoch": 0.15665283910968375, "grad_norm": 868.07177734375, "learning_rate": 3e-06, "loss": -37.3175, "step": 1756 }, { "epoch": 0.15674204915473483, "grad_norm": 891.9551391601562, "learning_rate": 3e-06, "loss": -5.4981, "step": 1757 }, { "epoch": 0.15683125919978588, "grad_norm": 816.58740234375, "learning_rate": 3e-06, "loss": 12.5619, "step": 1758 }, { "epoch": 0.15692046924483696, "grad_norm": 836.6050415039062, "learning_rate": 3e-06, "loss": -22.0006, "step": 1759 }, { "epoch": 0.15700967928988804, "grad_norm": 1234.0372314453125, "learning_rate": 3e-06, "loss": -45.7349, "step": 1760 }, { "epoch": 0.15709888933493912, "grad_norm": 931.4359130859375, "learning_rate": 3e-06, "loss": -87.1049, "step": 1761 }, { "epoch": 0.15718809937999018, "grad_norm": 1074.96435546875, "learning_rate": 3e-06, "loss": -39.8578, "step": 1762 }, { "epoch": 0.15727730942504126, "grad_norm": 807.4719848632812, "learning_rate": 3e-06, "loss": -15.7909, "step": 1763 }, { "epoch": 0.15736651947009234, "grad_norm": 701.7471313476562, "learning_rate": 3e-06, "loss": 2.0572, "step": 1764 }, { "completion_length": 156.6041717529297, "epoch": 0.15745572951514342, "grad_norm": 802.6090087890625, "learning_rate": 3e-06, "loss": -34.1906, "reward": 2.0988959670066833, "reward_std": 0.31426893174648285, "rewards/correctness_reward_func": 1.5416666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0780624970793724, "step": 1765, "zero_std_ratio": 0.0 }, { "epoch": 0.15754493956019447, "grad_norm": 714.1509399414062, "learning_rate": 3e-06, "loss": -53.6734, "step": 1766 }, { "epoch": 0.15763414960524555, "grad_norm": 691.9019165039062, "learning_rate": 3e-06, "loss": -43.5893, "step": 1767 }, { "epoch": 0.15772335965029663, "grad_norm": 890.5856323242188, "learning_rate": 3e-06, "loss": 10.3226, "step": 1768 }, { "epoch": 0.1578125696953477, "grad_norm": 834.9835205078125, "learning_rate": 3e-06, "loss": 0.0275, "step": 1769 }, { "epoch": 0.15790177974039876, "grad_norm": 480.59625244140625, "learning_rate": 3e-06, "loss": -26.698, "step": 1770 }, { "epoch": 0.15799098978544984, "grad_norm": 842.9605102539062, "learning_rate": 3e-06, "loss": -37.832, "step": 1771 }, { "epoch": 0.15808019983050092, "grad_norm": 766.251220703125, "learning_rate": 3e-06, "loss": -56.4611, "step": 1772 }, { "epoch": 0.15816940987555198, "grad_norm": 809.439453125, "learning_rate": 3e-06, "loss": -51.1308, "step": 1773 }, { "epoch": 0.15825861992060306, "grad_norm": 959.1751098632812, "learning_rate": 3e-06, "loss": 11.5874, "step": 1774 }, { "epoch": 0.15834782996565414, "grad_norm": 868.1351318359375, "learning_rate": 3e-06, "loss": -4.9394, "step": 1775 }, { "epoch": 0.15843704001070522, "grad_norm": 572.6174926757812, "learning_rate": 3e-06, "loss": -29.5914, "step": 1776 }, { "completion_length": 121.64583587646484, "epoch": 0.15852625005575627, "grad_norm": 828.8180541992188, "learning_rate": 3e-06, "loss": 69.2331, "reward": 2.4088125228881836, "reward_std": 0.2123552095144987, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1588124930858612, "step": 1777, "zero_std_ratio": 0.0 }, { "epoch": 0.15861546010080735, "grad_norm": 726.0809326171875, "learning_rate": 3e-06, "loss": 61.3269, "step": 1778 }, { "epoch": 0.15870467014585843, "grad_norm": 640.3132934570312, "learning_rate": 3e-06, "loss": 62.9404, "step": 1779 }, { "epoch": 0.1587938801909095, "grad_norm": 819.57958984375, "learning_rate": 3e-06, "loss": 75.4607, "step": 1780 }, { "epoch": 0.15888309023596056, "grad_norm": 799.89453125, "learning_rate": 3e-06, "loss": 72.5996, "step": 1781 }, { "epoch": 0.15897230028101164, "grad_norm": 847.7570190429688, "learning_rate": 3e-06, "loss": 96.3295, "step": 1782 }, { "epoch": 0.15906151032606272, "grad_norm": 610.136474609375, "learning_rate": 3e-06, "loss": 52.1926, "step": 1783 }, { "epoch": 0.15915072037111377, "grad_norm": 457.82208251953125, "learning_rate": 3e-06, "loss": 46.0544, "step": 1784 }, { "epoch": 0.15923993041616485, "grad_norm": 419.1843566894531, "learning_rate": 3e-06, "loss": 43.62, "step": 1785 }, { "epoch": 0.15932914046121593, "grad_norm": 517.3756103515625, "learning_rate": 3e-06, "loss": 47.5319, "step": 1786 }, { "epoch": 0.15941835050626701, "grad_norm": 427.8025207519531, "learning_rate": 3e-06, "loss": 43.6472, "step": 1787 }, { "epoch": 0.15950756055131807, "grad_norm": 639.5134887695312, "learning_rate": 3e-06, "loss": 60.5944, "step": 1788 }, { "completion_length": 152.4791717529297, "epoch": 0.15959677059636915, "grad_norm": 1021.4591064453125, "learning_rate": 3e-06, "loss": -505.314, "reward": 2.077250123023987, "reward_std": 0.42116162180900574, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11891666986048222, "step": 1789, "zero_std_ratio": 0.125 }, { "epoch": 0.15968598064142023, "grad_norm": 1055.1492919921875, "learning_rate": 3e-06, "loss": -514.8371, "step": 1790 }, { "epoch": 0.1597751906864713, "grad_norm": 1102.408447265625, "learning_rate": 3e-06, "loss": -510.6017, "step": 1791 }, { "epoch": 0.15986440073152236, "grad_norm": 826.0501098632812, "learning_rate": 3e-06, "loss": -504.185, "step": 1792 }, { "epoch": 0.15995361077657344, "grad_norm": 924.1912841796875, "learning_rate": 3e-06, "loss": -528.0456, "step": 1793 }, { "epoch": 0.16004282082162452, "grad_norm": 1201.7523193359375, "learning_rate": 3e-06, "loss": -485.692, "step": 1794 }, { "epoch": 0.1601320308666756, "grad_norm": 1054.49560546875, "learning_rate": 3e-06, "loss": -517.5522, "step": 1795 }, { "epoch": 0.16022124091172665, "grad_norm": 1037.0382080078125, "learning_rate": 3e-06, "loss": -533.0304, "step": 1796 }, { "epoch": 0.16031045095677773, "grad_norm": 1121.322509765625, "learning_rate": 3e-06, "loss": -528.4745, "step": 1797 }, { "epoch": 0.1603996610018288, "grad_norm": 869.9371948242188, "learning_rate": 3e-06, "loss": -530.1343, "step": 1798 }, { "epoch": 0.16048887104687987, "grad_norm": 993.0130004882812, "learning_rate": 3e-06, "loss": -561.1622, "step": 1799 }, { "epoch": 0.16057808109193095, "grad_norm": 1424.33056640625, "learning_rate": 3e-06, "loss": -523.6362, "step": 1800 }, { "completion_length": 132.87500762939453, "epoch": 0.16066729113698203, "grad_norm": 516.1659545898438, "learning_rate": 3e-06, "loss": -39.2026, "reward": 2.2690415382385254, "reward_std": 0.5185650140047073, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14404166117310524, "step": 1801, "zero_std_ratio": 0.125 }, { "epoch": 0.1607565011820331, "grad_norm": 393.27093505859375, "learning_rate": 3e-06, "loss": -18.2417, "step": 1802 }, { "epoch": 0.16084571122708416, "grad_norm": 274.265625, "learning_rate": 3e-06, "loss": -7.7819, "step": 1803 }, { "epoch": 0.16093492127213524, "grad_norm": 303.0129699707031, "learning_rate": 3e-06, "loss": -12.3068, "step": 1804 }, { "epoch": 0.16102413131718632, "grad_norm": 194.52011108398438, "learning_rate": 3e-06, "loss": -16.6636, "step": 1805 }, { "epoch": 0.1611133413622374, "grad_norm": 361.7630920410156, "learning_rate": 3e-06, "loss": -64.1357, "step": 1806 }, { "epoch": 0.16120255140728845, "grad_norm": 475.3822326660156, "learning_rate": 3e-06, "loss": -49.0127, "step": 1807 }, { "epoch": 0.16129176145233953, "grad_norm": 457.8487548828125, "learning_rate": 3e-06, "loss": -26.1585, "step": 1808 }, { "epoch": 0.1613809714973906, "grad_norm": 345.77685546875, "learning_rate": 3e-06, "loss": -9.8069, "step": 1809 }, { "epoch": 0.16147018154244167, "grad_norm": 430.1940002441406, "learning_rate": 3e-06, "loss": -17.8251, "step": 1810 }, { "epoch": 0.16155939158749275, "grad_norm": 268.2777099609375, "learning_rate": 3e-06, "loss": -21.4181, "step": 1811 }, { "epoch": 0.16164860163254383, "grad_norm": 360.336181640625, "learning_rate": 3e-06, "loss": -69.655, "step": 1812 }, { "completion_length": 123.20833969116211, "epoch": 0.1617378116775949, "grad_norm": 752.1405029296875, "learning_rate": 3e-06, "loss": -52.6682, "reward": 2.5705000162124634, "reward_std": 0.14856409095227718, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15383333712816238, "step": 1813, "zero_std_ratio": 0.0 }, { "epoch": 0.16182702172264596, "grad_norm": 750.3876342773438, "learning_rate": 3e-06, "loss": -51.5828, "step": 1814 }, { "epoch": 0.16191623176769704, "grad_norm": 721.2176513671875, "learning_rate": 3e-06, "loss": -79.6287, "step": 1815 }, { "epoch": 0.16200544181274812, "grad_norm": 722.4852905273438, "learning_rate": 3e-06, "loss": -70.3175, "step": 1816 }, { "epoch": 0.1620946518577992, "grad_norm": 797.6476440429688, "learning_rate": 3e-06, "loss": -80.2729, "step": 1817 }, { "epoch": 0.16218386190285025, "grad_norm": 883.1192016601562, "learning_rate": 3e-06, "loss": -71.263, "step": 1818 }, { "epoch": 0.16227307194790133, "grad_norm": 1220.3101806640625, "learning_rate": 3e-06, "loss": -69.1647, "step": 1819 }, { "epoch": 0.1623622819929524, "grad_norm": 1046.2666015625, "learning_rate": 3e-06, "loss": -72.9542, "step": 1820 }, { "epoch": 0.1624514920380035, "grad_norm": 762.8731079101562, "learning_rate": 3e-06, "loss": -107.0888, "step": 1821 }, { "epoch": 0.16254070208305454, "grad_norm": 989.8717651367188, "learning_rate": 3e-06, "loss": -105.7402, "step": 1822 }, { "epoch": 0.16262991212810562, "grad_norm": 810.9016723632812, "learning_rate": 3e-06, "loss": -113.7269, "step": 1823 }, { "epoch": 0.1627191221731567, "grad_norm": 794.078369140625, "learning_rate": 3e-06, "loss": -101.5978, "step": 1824 }, { "completion_length": 154.45833587646484, "epoch": 0.16280833221820776, "grad_norm": 1521.7867431640625, "learning_rate": 3e-06, "loss": 198.4336, "reward": 1.999895989894867, "reward_std": 0.5427521467208862, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09364583343267441, "step": 1825, "zero_std_ratio": 0.0 }, { "epoch": 0.16289754226325884, "grad_norm": 1178.328369140625, "learning_rate": 3e-06, "loss": 186.775, "step": 1826 }, { "epoch": 0.16298675230830992, "grad_norm": 1188.1805419921875, "learning_rate": 3e-06, "loss": 262.2266, "step": 1827 }, { "epoch": 0.163075962353361, "grad_norm": 1261.284423828125, "learning_rate": 3e-06, "loss": 232.0647, "step": 1828 }, { "epoch": 0.16316517239841205, "grad_norm": 1211.0538330078125, "learning_rate": 3e-06, "loss": 245.9915, "step": 1829 }, { "epoch": 0.16325438244346313, "grad_norm": 1118.0828857421875, "learning_rate": 3e-06, "loss": 269.5143, "step": 1830 }, { "epoch": 0.1633435924885142, "grad_norm": 1425.6029052734375, "learning_rate": 3e-06, "loss": 198.2645, "step": 1831 }, { "epoch": 0.1634328025335653, "grad_norm": 969.9822998046875, "learning_rate": 3e-06, "loss": 184.7289, "step": 1832 }, { "epoch": 0.16352201257861634, "grad_norm": 1341.745849609375, "learning_rate": 3e-06, "loss": 260.7448, "step": 1833 }, { "epoch": 0.16361122262366742, "grad_norm": 1239.0595703125, "learning_rate": 3e-06, "loss": 221.661, "step": 1834 }, { "epoch": 0.1637004326687185, "grad_norm": 1348.814453125, "learning_rate": 3e-06, "loss": 242.47, "step": 1835 }, { "epoch": 0.16378964271376958, "grad_norm": 1185.1468505859375, "learning_rate": 3e-06, "loss": 263.2233, "step": 1836 }, { "completion_length": 119.66666793823242, "epoch": 0.16387885275882064, "grad_norm": 527.2208251953125, "learning_rate": 3e-06, "loss": 34.5997, "reward": 2.168979287147522, "reward_std": 0.27117825858294964, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17939583212137222, "step": 1837, "zero_std_ratio": 0.0 }, { "epoch": 0.16396806280387172, "grad_norm": 730.4968872070312, "learning_rate": 3e-06, "loss": 45.4926, "step": 1838 }, { "epoch": 0.1640572728489228, "grad_norm": 559.9202270507812, "learning_rate": 3e-06, "loss": 43.0753, "step": 1839 }, { "epoch": 0.16414648289397385, "grad_norm": 586.70849609375, "learning_rate": 3e-06, "loss": 45.5974, "step": 1840 }, { "epoch": 0.16423569293902493, "grad_norm": 731.3250732421875, "learning_rate": 3e-06, "loss": 33.5908, "step": 1841 }, { "epoch": 0.164324902984076, "grad_norm": 1114.0931396484375, "learning_rate": 3e-06, "loss": 25.0561, "step": 1842 }, { "epoch": 0.1644141130291271, "grad_norm": 509.1001281738281, "learning_rate": 3e-06, "loss": 28.6634, "step": 1843 }, { "epoch": 0.16450332307417814, "grad_norm": 1331.419677734375, "learning_rate": 3e-06, "loss": 46.7599, "step": 1844 }, { "epoch": 0.16459253311922922, "grad_norm": 434.1399230957031, "learning_rate": 3e-06, "loss": 38.0593, "step": 1845 }, { "epoch": 0.1646817431642803, "grad_norm": 653.2589111328125, "learning_rate": 3e-06, "loss": 41.9517, "step": 1846 }, { "epoch": 0.16477095320933138, "grad_norm": 662.3316040039062, "learning_rate": 3e-06, "loss": 32.4191, "step": 1847 }, { "epoch": 0.16486016325438244, "grad_norm": 946.3890380859375, "learning_rate": 3e-06, "loss": 13.5773, "step": 1848 }, { "completion_length": 113.20833587646484, "epoch": 0.16494937329943352, "grad_norm": 607.1404418945312, "learning_rate": 3e-06, "loss": -34.7088, "reward": 2.585312604904175, "reward_std": 0.15480694454163313, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21031249314546585, "step": 1849, "zero_std_ratio": 0.0 }, { "epoch": 0.1650385833444846, "grad_norm": 658.881103515625, "learning_rate": 3e-06, "loss": 4.1338, "step": 1850 }, { "epoch": 0.16512779338953565, "grad_norm": 527.572998046875, "learning_rate": 3e-06, "loss": -27.4651, "step": 1851 }, { "epoch": 0.16521700343458673, "grad_norm": 710.48828125, "learning_rate": 3e-06, "loss": -0.9042, "step": 1852 }, { "epoch": 0.1653062134796378, "grad_norm": 775.1302490234375, "learning_rate": 3e-06, "loss": -12.5198, "step": 1853 }, { "epoch": 0.1653954235246889, "grad_norm": 557.5072021484375, "learning_rate": 3e-06, "loss": -32.8515, "step": 1854 }, { "epoch": 0.16548463356973994, "grad_norm": 621.2504272460938, "learning_rate": 3e-06, "loss": -41.5699, "step": 1855 }, { "epoch": 0.16557384361479102, "grad_norm": 654.5400390625, "learning_rate": 3e-06, "loss": 2.162, "step": 1856 }, { "epoch": 0.1656630536598421, "grad_norm": 577.8197631835938, "learning_rate": 3e-06, "loss": -35.462, "step": 1857 }, { "epoch": 0.16575226370489318, "grad_norm": 820.9921264648438, "learning_rate": 3e-06, "loss": -11.7584, "step": 1858 }, { "epoch": 0.16584147374994423, "grad_norm": 765.3721313476562, "learning_rate": 3e-06, "loss": -20.577, "step": 1859 }, { "epoch": 0.16593068379499532, "grad_norm": 568.7098388671875, "learning_rate": 3e-06, "loss": -30.3173, "step": 1860 }, { "completion_length": 128.0208396911621, "epoch": 0.1660198938400464, "grad_norm": 562.7349243164062, "learning_rate": 3e-06, "loss": -0.1389, "reward": 2.4199376106262207, "reward_std": 0.4903542250394821, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14910416305065155, "step": 1861, "zero_std_ratio": 0.0 }, { "epoch": 0.16610910388509748, "grad_norm": 1208.946533203125, "learning_rate": 3e-06, "loss": 22.0607, "step": 1862 }, { "epoch": 0.16619831393014853, "grad_norm": 641.0849609375, "learning_rate": 3e-06, "loss": 15.9128, "step": 1863 }, { "epoch": 0.1662875239751996, "grad_norm": 610.1945190429688, "learning_rate": 3e-06, "loss": 45.7852, "step": 1864 }, { "epoch": 0.1663767340202507, "grad_norm": 591.8941650390625, "learning_rate": 3e-06, "loss": 22.7616, "step": 1865 }, { "epoch": 0.16646594406530174, "grad_norm": 466.8150939941406, "learning_rate": 3e-06, "loss": 26.8555, "step": 1866 }, { "epoch": 0.16655515411035282, "grad_norm": 823.4788818359375, "learning_rate": 3e-06, "loss": -7.2144, "step": 1867 }, { "epoch": 0.1666443641554039, "grad_norm": 1031.0870361328125, "learning_rate": 3e-06, "loss": 6.1348, "step": 1868 }, { "epoch": 0.16673357420045498, "grad_norm": 640.3084716796875, "learning_rate": 3e-06, "loss": 7.8973, "step": 1869 }, { "epoch": 0.16682278424550603, "grad_norm": 587.38916015625, "learning_rate": 3e-06, "loss": 38.7696, "step": 1870 }, { "epoch": 0.16691199429055711, "grad_norm": 470.4759216308594, "learning_rate": 3e-06, "loss": 10.2833, "step": 1871 }, { "epoch": 0.1670012043356082, "grad_norm": 400.15093994140625, "learning_rate": 3e-06, "loss": 16.9143, "step": 1872 }, { "completion_length": 121.5, "epoch": 0.16709041438065927, "grad_norm": 1434.84765625, "learning_rate": 3e-06, "loss": 127.8875, "reward": 2.1267499923706055, "reward_std": 0.28236258029937744, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16841667145490646, "step": 1873, "zero_std_ratio": 0.0 }, { "epoch": 0.16717962442571033, "grad_norm": 1496.07373046875, "learning_rate": 3e-06, "loss": 88.3829, "step": 1874 }, { "epoch": 0.1672688344707614, "grad_norm": 1312.2996826171875, "learning_rate": 3e-06, "loss": 84.1219, "step": 1875 }, { "epoch": 0.1673580445158125, "grad_norm": 1233.7979736328125, "learning_rate": 3e-06, "loss": 70.5536, "step": 1876 }, { "epoch": 0.16744725456086354, "grad_norm": 896.58837890625, "learning_rate": 3e-06, "loss": 35.9209, "step": 1877 }, { "epoch": 0.16753646460591462, "grad_norm": 1668.3927001953125, "learning_rate": 3e-06, "loss": 82.6518, "step": 1878 }, { "epoch": 0.1676256746509657, "grad_norm": 1143.520263671875, "learning_rate": 3e-06, "loss": 97.7647, "step": 1879 }, { "epoch": 0.16771488469601678, "grad_norm": 964.7223510742188, "learning_rate": 3e-06, "loss": 55.6952, "step": 1880 }, { "epoch": 0.16780409474106783, "grad_norm": 1036.915283203125, "learning_rate": 3e-06, "loss": 61.6463, "step": 1881 }, { "epoch": 0.1678933047861189, "grad_norm": 889.5004272460938, "learning_rate": 3e-06, "loss": 39.2985, "step": 1882 }, { "epoch": 0.16798251483117, "grad_norm": 711.8017578125, "learning_rate": 3e-06, "loss": 21.9326, "step": 1883 }, { "epoch": 0.16807172487622107, "grad_norm": 1177.8270263671875, "learning_rate": 3e-06, "loss": 45.9468, "step": 1884 }, { "completion_length": 116.22916793823242, "epoch": 0.16816093492127213, "grad_norm": 313.5957336425781, "learning_rate": 3e-06, "loss": 13.1803, "reward": 2.5328749418258667, "reward_std": 0.35810738801956177, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19954168051481247, "step": 1885, "zero_std_ratio": 0.0 }, { "epoch": 0.1682501449663232, "grad_norm": 333.3410949707031, "learning_rate": 3e-06, "loss": 14.3001, "step": 1886 }, { "epoch": 0.1683393550113743, "grad_norm": 330.1903076171875, "learning_rate": 3e-06, "loss": 13.6081, "step": 1887 }, { "epoch": 0.16842856505642537, "grad_norm": 313.8829345703125, "learning_rate": 3e-06, "loss": 21.504, "step": 1888 }, { "epoch": 0.16851777510147642, "grad_norm": 282.533203125, "learning_rate": 3e-06, "loss": 10.1669, "step": 1889 }, { "epoch": 0.1686069851465275, "grad_norm": 203.84523010253906, "learning_rate": 3e-06, "loss": 15.1211, "step": 1890 }, { "epoch": 0.16869619519157858, "grad_norm": 297.6282653808594, "learning_rate": 3e-06, "loss": 6.1928, "step": 1891 }, { "epoch": 0.16878540523662963, "grad_norm": 246.8819122314453, "learning_rate": 3e-06, "loss": 10.9529, "step": 1892 }, { "epoch": 0.1688746152816807, "grad_norm": 225.11219787597656, "learning_rate": 3e-06, "loss": 6.8239, "step": 1893 }, { "epoch": 0.1689638253267318, "grad_norm": 194.10739135742188, "learning_rate": 3e-06, "loss": 14.7224, "step": 1894 }, { "epoch": 0.16905303537178287, "grad_norm": 191.75978088378906, "learning_rate": 3e-06, "loss": 3.1039, "step": 1895 }, { "epoch": 0.16914224541683393, "grad_norm": 121.65093994140625, "learning_rate": 3e-06, "loss": 11.2102, "step": 1896 }, { "completion_length": 157.00000762939453, "epoch": 0.169231455461885, "grad_norm": 1251.7188720703125, "learning_rate": 3e-06, "loss": -10.517, "reward": 2.062812566757202, "reward_std": 0.7439534962177277, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11489583179354668, "step": 1897, "zero_std_ratio": 0.0 }, { "epoch": 0.16932066550693609, "grad_norm": 1017.1513671875, "learning_rate": 3e-06, "loss": -8.0786, "step": 1898 }, { "epoch": 0.16940987555198717, "grad_norm": 937.3341674804688, "learning_rate": 3e-06, "loss": -53.4306, "step": 1899 }, { "epoch": 0.16949908559703822, "grad_norm": 1166.1446533203125, "learning_rate": 3e-06, "loss": -58.4623, "step": 1900 }, { "epoch": 0.1695882956420893, "grad_norm": 627.0881958007812, "learning_rate": 3e-06, "loss": -60.1449, "step": 1901 }, { "epoch": 0.16967750568714038, "grad_norm": 1087.1383056640625, "learning_rate": 3e-06, "loss": -14.86, "step": 1902 }, { "epoch": 0.16976671573219143, "grad_norm": 1112.518798828125, "learning_rate": 3e-06, "loss": -13.7934, "step": 1903 }, { "epoch": 0.1698559257772425, "grad_norm": 1089.9168701171875, "learning_rate": 3e-06, "loss": -17.8976, "step": 1904 }, { "epoch": 0.1699451358222936, "grad_norm": 765.9096069335938, "learning_rate": 3e-06, "loss": -62.3282, "step": 1905 }, { "epoch": 0.17003434586734467, "grad_norm": 1779.7637939453125, "learning_rate": 3e-06, "loss": -69.9882, "step": 1906 }, { "epoch": 0.17012355591239572, "grad_norm": 627.7999877929688, "learning_rate": 3e-06, "loss": -67.9675, "step": 1907 }, { "epoch": 0.1702127659574468, "grad_norm": 1196.003173828125, "learning_rate": 3e-06, "loss": -24.7852, "step": 1908 }, { "completion_length": 139.52083587646484, "epoch": 0.17030197600249788, "grad_norm": 137.36314392089844, "learning_rate": 3e-06, "loss": -6.2761, "reward": 2.261833429336548, "reward_std": 0.2501897104084492, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1368333362042904, "step": 1909, "zero_std_ratio": 0.0 }, { "epoch": 0.17039118604754896, "grad_norm": 145.57534790039062, "learning_rate": 3e-06, "loss": -16.6773, "step": 1910 }, { "epoch": 0.17048039609260002, "grad_norm": 566.4564208984375, "learning_rate": 3e-06, "loss": -29.7986, "step": 1911 }, { "epoch": 0.1705696061376511, "grad_norm": 186.74420166015625, "learning_rate": 3e-06, "loss": -10.6402, "step": 1912 }, { "epoch": 0.17065881618270218, "grad_norm": 155.60208129882812, "learning_rate": 3e-06, "loss": -14.5853, "step": 1913 }, { "epoch": 0.17074802622775326, "grad_norm": 186.169921875, "learning_rate": 3e-06, "loss": -7.9219, "step": 1914 }, { "epoch": 0.1708372362728043, "grad_norm": 170.4557342529297, "learning_rate": 3e-06, "loss": -9.1424, "step": 1915 }, { "epoch": 0.1709264463178554, "grad_norm": 173.5465087890625, "learning_rate": 3e-06, "loss": -16.6805, "step": 1916 }, { "epoch": 0.17101565636290647, "grad_norm": 975.5637817382812, "learning_rate": 3e-06, "loss": -46.2821, "step": 1917 }, { "epoch": 0.17110486640795752, "grad_norm": 277.77490234375, "learning_rate": 3e-06, "loss": -12.9822, "step": 1918 }, { "epoch": 0.1711940764530086, "grad_norm": 203.00672912597656, "learning_rate": 3e-06, "loss": -17.6713, "step": 1919 }, { "epoch": 0.17128328649805968, "grad_norm": 243.0919647216797, "learning_rate": 3e-06, "loss": -11.2019, "step": 1920 }, { "completion_length": 124.27083969116211, "epoch": 0.17137249654311076, "grad_norm": 2301.88134765625, "learning_rate": 3e-06, "loss": -253.2357, "reward": 2.5085625648498535, "reward_std": 0.26843154430389404, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13356249406933784, "step": 1921, "zero_std_ratio": 0.0 }, { "epoch": 0.17146170658816182, "grad_norm": 1669.822021484375, "learning_rate": 3e-06, "loss": -404.5188, "step": 1922 }, { "epoch": 0.1715509166332129, "grad_norm": 1267.5242919921875, "learning_rate": 3e-06, "loss": -460.9462, "step": 1923 }, { "epoch": 0.17164012667826398, "grad_norm": 1655.473388671875, "learning_rate": 3e-06, "loss": -412.0498, "step": 1924 }, { "epoch": 0.17172933672331506, "grad_norm": 1569.4832763671875, "learning_rate": 3e-06, "loss": -305.7101, "step": 1925 }, { "epoch": 0.1718185467683661, "grad_norm": 1087.016845703125, "learning_rate": 3e-06, "loss": -636.9792, "step": 1926 }, { "epoch": 0.1719077568134172, "grad_norm": 2180.221435546875, "learning_rate": 3e-06, "loss": -309.4447, "step": 1927 }, { "epoch": 0.17199696685846827, "grad_norm": 1617.933349609375, "learning_rate": 3e-06, "loss": -462.1522, "step": 1928 }, { "epoch": 0.17208617690351935, "grad_norm": 985.1787109375, "learning_rate": 3e-06, "loss": -494.9189, "step": 1929 }, { "epoch": 0.1721753869485704, "grad_norm": 1417.375732421875, "learning_rate": 3e-06, "loss": -479.894, "step": 1930 }, { "epoch": 0.17226459699362148, "grad_norm": 1510.1646728515625, "learning_rate": 3e-06, "loss": -378.276, "step": 1931 }, { "epoch": 0.17235380703867256, "grad_norm": 1007.6100463867188, "learning_rate": 3e-06, "loss": -662.9573, "step": 1932 }, { "completion_length": 144.41666793823242, "epoch": 0.17244301708372362, "grad_norm": 136.9275360107422, "learning_rate": 3e-06, "loss": -8.754, "reward": 2.550520896911621, "reward_std": 0.2396542876958847, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13385416939854622, "step": 1933, "zero_std_ratio": 0.0 }, { "epoch": 0.1725322271287747, "grad_norm": 83.86649322509766, "learning_rate": 3e-06, "loss": 0.1167, "step": 1934 }, { "epoch": 0.17262143717382578, "grad_norm": 77.8895263671875, "learning_rate": 3e-06, "loss": -1.4578, "step": 1935 }, { "epoch": 0.17271064721887686, "grad_norm": 122.21768951416016, "learning_rate": 3e-06, "loss": -4.8938, "step": 1936 }, { "epoch": 0.1727998572639279, "grad_norm": 86.19290161132812, "learning_rate": 3e-06, "loss": -0.7627, "step": 1937 }, { "epoch": 0.172889067308979, "grad_norm": 82.05410766601562, "learning_rate": 3e-06, "loss": -3.3958, "step": 1938 }, { "epoch": 0.17297827735403007, "grad_norm": 225.0465850830078, "learning_rate": 3e-06, "loss": -13.7727, "step": 1939 }, { "epoch": 0.17306748739908115, "grad_norm": 90.8927001953125, "learning_rate": 3e-06, "loss": -1.4379, "step": 1940 }, { "epoch": 0.1731566974441322, "grad_norm": 108.59156799316406, "learning_rate": 3e-06, "loss": -4.1789, "step": 1941 }, { "epoch": 0.17324590748918328, "grad_norm": 116.51862335205078, "learning_rate": 3e-06, "loss": -6.4501, "step": 1942 }, { "epoch": 0.17333511753423436, "grad_norm": 92.47396850585938, "learning_rate": 3e-06, "loss": -2.9944, "step": 1943 }, { "epoch": 0.17342432757928541, "grad_norm": 119.22129821777344, "learning_rate": 3e-06, "loss": -6.243, "step": 1944 }, { "completion_length": 145.5416717529297, "epoch": 0.1735135376243365, "grad_norm": 2110.79443359375, "learning_rate": 3e-06, "loss": 94.0773, "reward": 1.6517499685287476, "reward_std": 0.7779462337493896, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13091666996479034, "step": 1945, "zero_std_ratio": 0.125 }, { "epoch": 0.17360274766938757, "grad_norm": 1997.6201171875, "learning_rate": 3e-06, "loss": 99.2755, "step": 1946 }, { "epoch": 0.17369195771443866, "grad_norm": 2617.997802734375, "learning_rate": 3e-06, "loss": -170.4776, "step": 1947 }, { "epoch": 0.1737811677594897, "grad_norm": 2001.4349365234375, "learning_rate": 3e-06, "loss": -48.466, "step": 1948 }, { "epoch": 0.1738703778045408, "grad_norm": 3665.568359375, "learning_rate": 3e-06, "loss": 56.4173, "step": 1949 }, { "epoch": 0.17395958784959187, "grad_norm": 3150.109619140625, "learning_rate": 3e-06, "loss": -52.6612, "step": 1950 }, { "epoch": 0.17404879789464295, "grad_norm": 2255.739013671875, "learning_rate": 3e-06, "loss": 86.9101, "step": 1951 }, { "epoch": 0.174138007939694, "grad_norm": 2205.787353515625, "learning_rate": 3e-06, "loss": 99.2739, "step": 1952 }, { "epoch": 0.17422721798474508, "grad_norm": 2441.709716796875, "learning_rate": 3e-06, "loss": -200.8932, "step": 1953 }, { "epoch": 0.17431642802979616, "grad_norm": 2390.3076171875, "learning_rate": 3e-06, "loss": -55.9708, "step": 1954 }, { "epoch": 0.17440563807484724, "grad_norm": 3095.658203125, "learning_rate": 3e-06, "loss": 39.4443, "step": 1955 }, { "epoch": 0.1744948481198983, "grad_norm": 2838.15966796875, "learning_rate": 3e-06, "loss": -70.563, "step": 1956 }, { "completion_length": 136.4791717529297, "epoch": 0.17458405816494937, "grad_norm": 706.7903442382812, "learning_rate": 3e-06, "loss": -24.1702, "reward": 2.388875126838684, "reward_std": 0.29309016466140747, "rewards/correctness_reward_func": 1.7916666269302368, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11804166808724403, "step": 1957, "zero_std_ratio": 0.125 }, { "epoch": 0.17467326821000045, "grad_norm": 1125.595458984375, "learning_rate": 3e-06, "loss": 43.8321, "step": 1958 }, { "epoch": 0.1747624782550515, "grad_norm": 891.260009765625, "learning_rate": 3e-06, "loss": -32.6333, "step": 1959 }, { "epoch": 0.1748516883001026, "grad_norm": 1020.5130615234375, "learning_rate": 3e-06, "loss": 101.2067, "step": 1960 }, { "epoch": 0.17494089834515367, "grad_norm": 619.5036010742188, "learning_rate": 3e-06, "loss": 33.6314, "step": 1961 }, { "epoch": 0.17503010839020475, "grad_norm": 1052.5194091796875, "learning_rate": 3e-06, "loss": 52.0245, "step": 1962 }, { "epoch": 0.1751193184352558, "grad_norm": 637.0113525390625, "learning_rate": 3e-06, "loss": -27.3594, "step": 1963 }, { "epoch": 0.17520852848030688, "grad_norm": 1216.4146728515625, "learning_rate": 3e-06, "loss": 31.9169, "step": 1964 }, { "epoch": 0.17529773852535796, "grad_norm": 799.7539672851562, "learning_rate": 3e-06, "loss": -45.1417, "step": 1965 }, { "epoch": 0.17538694857040904, "grad_norm": 1073.3214111328125, "learning_rate": 3e-06, "loss": 79.6225, "step": 1966 }, { "epoch": 0.1754761586154601, "grad_norm": 606.6988525390625, "learning_rate": 3e-06, "loss": 22.0892, "step": 1967 }, { "epoch": 0.17556536866051117, "grad_norm": 1075.4769287109375, "learning_rate": 3e-06, "loss": 34.4998, "step": 1968 }, { "completion_length": 127.02083587646484, "epoch": 0.17565457870556225, "grad_norm": 1479.780517578125, "learning_rate": 3e-06, "loss": 22.8573, "reward": 2.3632084131240845, "reward_std": 0.3230869174003601, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.154874999076128, "step": 1969, "zero_std_ratio": 0.0 }, { "epoch": 0.1757437887506133, "grad_norm": 770.041748046875, "learning_rate": 3e-06, "loss": 36.1886, "step": 1970 }, { "epoch": 0.17583299879566439, "grad_norm": 634.0138549804688, "learning_rate": 3e-06, "loss": 4.3242, "step": 1971 }, { "epoch": 0.17592220884071547, "grad_norm": 1109.409912109375, "learning_rate": 3e-06, "loss": -24.1577, "step": 1972 }, { "epoch": 0.17601141888576655, "grad_norm": 950.0133056640625, "learning_rate": 3e-06, "loss": -30.5734, "step": 1973 }, { "epoch": 0.1761006289308176, "grad_norm": 1003.349853515625, "learning_rate": 3e-06, "loss": -125.0075, "step": 1974 }, { "epoch": 0.17618983897586868, "grad_norm": 1581.892578125, "learning_rate": 3e-06, "loss": 7.7013, "step": 1975 }, { "epoch": 0.17627904902091976, "grad_norm": 858.9105224609375, "learning_rate": 3e-06, "loss": 28.9444, "step": 1976 }, { "epoch": 0.17636825906597084, "grad_norm": 506.2606201171875, "learning_rate": 3e-06, "loss": -3.128, "step": 1977 }, { "epoch": 0.1764574691110219, "grad_norm": 1361.7171630859375, "learning_rate": 3e-06, "loss": -22.6255, "step": 1978 }, { "epoch": 0.17654667915607297, "grad_norm": 945.2322387695312, "learning_rate": 3e-06, "loss": -31.6284, "step": 1979 }, { "epoch": 0.17663588920112405, "grad_norm": 1066.5302734375, "learning_rate": 3e-06, "loss": -122.7752, "step": 1980 }, { "completion_length": 129.52083587646484, "epoch": 0.17672509924617513, "grad_norm": 522.1423950195312, "learning_rate": 3e-06, "loss": -6.6804, "reward": 2.387354254722595, "reward_std": 0.42842997610569, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14777083322405815, "step": 1981, "zero_std_ratio": 0.0 }, { "epoch": 0.17681430929122618, "grad_norm": 1231.280517578125, "learning_rate": 3e-06, "loss": 31.9041, "step": 1982 }, { "epoch": 0.17690351933627727, "grad_norm": 663.7846069335938, "learning_rate": 3e-06, "loss": 16.5326, "step": 1983 }, { "epoch": 0.17699272938132835, "grad_norm": 530.1618041992188, "learning_rate": 3e-06, "loss": 25.7806, "step": 1984 }, { "epoch": 0.1770819394263794, "grad_norm": 703.8350830078125, "learning_rate": 3e-06, "loss": 3.6452, "step": 1985 }, { "epoch": 0.17717114947143048, "grad_norm": 1063.3558349609375, "learning_rate": 3e-06, "loss": 37.3771, "step": 1986 }, { "epoch": 0.17726035951648156, "grad_norm": 429.0755920410156, "learning_rate": 3e-06, "loss": -9.0188, "step": 1987 }, { "epoch": 0.17734956956153264, "grad_norm": 1193.805419921875, "learning_rate": 3e-06, "loss": 24.0102, "step": 1988 }, { "epoch": 0.1774387796065837, "grad_norm": 550.2122802734375, "learning_rate": 3e-06, "loss": 5.9939, "step": 1989 }, { "epoch": 0.17752798965163477, "grad_norm": 480.5219421386719, "learning_rate": 3e-06, "loss": 20.8503, "step": 1990 }, { "epoch": 0.17761719969668585, "grad_norm": 359.2476806640625, "learning_rate": 3e-06, "loss": 1.5336, "step": 1991 }, { "epoch": 0.17770640974173693, "grad_norm": 754.3380737304688, "learning_rate": 3e-06, "loss": 29.4998, "step": 1992 }, { "completion_length": 150.93750762939453, "epoch": 0.17779561978678798, "grad_norm": 1932.5921630859375, "learning_rate": 3e-06, "loss": -288.8813, "reward": 2.269333302974701, "reward_std": 0.5021546930074692, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1443333402276039, "step": 1993, "zero_std_ratio": 0.0 }, { "epoch": 0.17788482983183906, "grad_norm": 2357.911865234375, "learning_rate": 3e-06, "loss": -134.0228, "step": 1994 }, { "epoch": 0.17797403987689014, "grad_norm": 1959.8599853515625, "learning_rate": 3e-06, "loss": -81.068, "step": 1995 }, { "epoch": 0.1780632499219412, "grad_norm": 1494.47412109375, "learning_rate": 3e-06, "loss": -185.6545, "step": 1996 }, { "epoch": 0.17815245996699228, "grad_norm": 1894.7733154296875, "learning_rate": 3e-06, "loss": -266.9818, "step": 1997 }, { "epoch": 0.17824167001204336, "grad_norm": 2350.27490234375, "learning_rate": 3e-06, "loss": -234.3397, "step": 1998 }, { "epoch": 0.17833088005709444, "grad_norm": 2210.189453125, "learning_rate": 3e-06, "loss": -288.7056, "step": 1999 }, { "epoch": 0.1784200901021455, "grad_norm": 1879.7398681640625, "learning_rate": 3e-06, "loss": -138.2824, "step": 2000 }, { "epoch": 0.17850930014719657, "grad_norm": 2152.613525390625, "learning_rate": 3e-06, "loss": -97.7832, "step": 2001 }, { "epoch": 0.17859851019224765, "grad_norm": 1610.9349365234375, "learning_rate": 3e-06, "loss": -200.0292, "step": 2002 }, { "epoch": 0.17868772023729873, "grad_norm": 1528.112060546875, "learning_rate": 3e-06, "loss": -275.8097, "step": 2003 }, { "epoch": 0.17877693028234978, "grad_norm": 2053.736328125, "learning_rate": 3e-06, "loss": -258.0316, "step": 2004 }, { "completion_length": 141.9375, "epoch": 0.17886614032740086, "grad_norm": 193.6829833984375, "learning_rate": 3e-06, "loss": 20.2948, "reward": 2.5581459999084473, "reward_std": 0.2487168163061142, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15189583599567413, "step": 2005, "zero_std_ratio": 0.0 }, { "epoch": 0.17895535037245194, "grad_norm": 155.00283813476562, "learning_rate": 3e-06, "loss": 17.0246, "step": 2006 }, { "epoch": 0.17904456041750302, "grad_norm": 259.50830078125, "learning_rate": 3e-06, "loss": 22.8199, "step": 2007 }, { "epoch": 0.17913377046255408, "grad_norm": 266.588134765625, "learning_rate": 3e-06, "loss": 24.9054, "step": 2008 }, { "epoch": 0.17922298050760516, "grad_norm": 651.2266845703125, "learning_rate": 3e-06, "loss": 49.409, "step": 2009 }, { "epoch": 0.17931219055265624, "grad_norm": 211.16615295410156, "learning_rate": 3e-06, "loss": 24.0546, "step": 2010 }, { "epoch": 0.1794014005977073, "grad_norm": 190.15692138671875, "learning_rate": 3e-06, "loss": 18.0055, "step": 2011 }, { "epoch": 0.17949061064275837, "grad_norm": 160.15274047851562, "learning_rate": 3e-06, "loss": 15.9223, "step": 2012 }, { "epoch": 0.17957982068780945, "grad_norm": 181.6917724609375, "learning_rate": 3e-06, "loss": 21.005, "step": 2013 }, { "epoch": 0.17966903073286053, "grad_norm": 256.7626953125, "learning_rate": 3e-06, "loss": 20.4393, "step": 2014 }, { "epoch": 0.17975824077791158, "grad_norm": 425.4783630371094, "learning_rate": 3e-06, "loss": 37.1911, "step": 2015 }, { "epoch": 0.17984745082296266, "grad_norm": 152.9344482421875, "learning_rate": 3e-06, "loss": 20.1633, "step": 2016 }, { "completion_length": 141.95833587646484, "epoch": 0.17993666086801374, "grad_norm": 913.6690063476562, "learning_rate": 3e-06, "loss": -38.0152, "reward": 2.531229257583618, "reward_std": 0.2668761610984802, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.124979168176651, "step": 2017, "zero_std_ratio": 0.25 }, { "epoch": 0.18002587091306482, "grad_norm": 955.5236206054688, "learning_rate": 3e-06, "loss": -14.7569, "step": 2018 }, { "epoch": 0.18011508095811588, "grad_norm": 767.3821411132812, "learning_rate": 3e-06, "loss": -35.5258, "step": 2019 }, { "epoch": 0.18020429100316696, "grad_norm": 1576.59521484375, "learning_rate": 3e-06, "loss": -63.1452, "step": 2020 }, { "epoch": 0.18029350104821804, "grad_norm": 411.6076965332031, "learning_rate": 3e-06, "loss": -3.7112, "step": 2021 }, { "epoch": 0.18038271109326912, "grad_norm": 621.1591186523438, "learning_rate": 3e-06, "loss": -5.7912, "step": 2022 }, { "epoch": 0.18047192113832017, "grad_norm": 1037.3421630859375, "learning_rate": 3e-06, "loss": -40.5176, "step": 2023 }, { "epoch": 0.18056113118337125, "grad_norm": 730.2128295898438, "learning_rate": 3e-06, "loss": -14.0285, "step": 2024 }, { "epoch": 0.18065034122842233, "grad_norm": 1322.21826171875, "learning_rate": 3e-06, "loss": -39.4187, "step": 2025 }, { "epoch": 0.18073955127347338, "grad_norm": 1288.70361328125, "learning_rate": 3e-06, "loss": -74.1179, "step": 2026 }, { "epoch": 0.18082876131852446, "grad_norm": 477.3416442871094, "learning_rate": 3e-06, "loss": -7.2589, "step": 2027 }, { "epoch": 0.18091797136357554, "grad_norm": 718.8214721679688, "learning_rate": 3e-06, "loss": -11.7694, "step": 2028 }, { "completion_length": 149.4791717529297, "epoch": 0.18100718140862662, "grad_norm": 1311.2017822265625, "learning_rate": 3e-06, "loss": -69.5996, "reward": 2.306437611579895, "reward_std": 0.4377765115350485, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11893749982118607, "step": 2029, "zero_std_ratio": 0.125 }, { "epoch": 0.18109639145367767, "grad_norm": 1927.8277587890625, "learning_rate": 3e-06, "loss": -19.9164, "step": 2030 }, { "epoch": 0.18118560149872875, "grad_norm": 2200.863525390625, "learning_rate": 3e-06, "loss": 19.4123, "step": 2031 }, { "epoch": 0.18127481154377983, "grad_norm": 1609.12255859375, "learning_rate": 3e-06, "loss": 132.9534, "step": 2032 }, { "epoch": 0.18136402158883091, "grad_norm": 1760.5648193359375, "learning_rate": 3e-06, "loss": 74.6029, "step": 2033 }, { "epoch": 0.18145323163388197, "grad_norm": 1445.6087646484375, "learning_rate": 3e-06, "loss": 91.8306, "step": 2034 }, { "epoch": 0.18154244167893305, "grad_norm": 1266.7987060546875, "learning_rate": 3e-06, "loss": -78.4848, "step": 2035 }, { "epoch": 0.18163165172398413, "grad_norm": 2177.941650390625, "learning_rate": 3e-06, "loss": -23.6046, "step": 2036 }, { "epoch": 0.18172086176903518, "grad_norm": 1841.123046875, "learning_rate": 3e-06, "loss": 7.9887, "step": 2037 }, { "epoch": 0.18181007181408626, "grad_norm": 2021.9366455078125, "learning_rate": 3e-06, "loss": 115.5819, "step": 2038 }, { "epoch": 0.18189928185913734, "grad_norm": 1844.681640625, "learning_rate": 3e-06, "loss": 60.9947, "step": 2039 }, { "epoch": 0.18198849190418842, "grad_norm": 1540.6129150390625, "learning_rate": 3e-06, "loss": 94.1448, "step": 2040 }, { "completion_length": 122.04167175292969, "epoch": 0.18207770194923947, "grad_norm": 431.05072021484375, "learning_rate": 3e-06, "loss": -45.7214, "reward": 2.455124855041504, "reward_std": 0.2788470536470413, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18429165333509445, "step": 2041, "zero_std_ratio": 0.125 }, { "epoch": 0.18216691199429055, "grad_norm": 363.893798828125, "learning_rate": 3e-06, "loss": -43.1347, "step": 2042 }, { "epoch": 0.18225612203934163, "grad_norm": 588.0111083984375, "learning_rate": 3e-06, "loss": -30.2104, "step": 2043 }, { "epoch": 0.18234533208439271, "grad_norm": 432.35784912109375, "learning_rate": 3e-06, "loss": -37.9299, "step": 2044 }, { "epoch": 0.18243454212944377, "grad_norm": 515.4808349609375, "learning_rate": 3e-06, "loss": -40.889, "step": 2045 }, { "epoch": 0.18252375217449485, "grad_norm": 1244.8402099609375, "learning_rate": 3e-06, "loss": -30.4561, "step": 2046 }, { "epoch": 0.18261296221954593, "grad_norm": 758.7728271484375, "learning_rate": 3e-06, "loss": -51.244, "step": 2047 }, { "epoch": 0.182702172264597, "grad_norm": 447.59246826171875, "learning_rate": 3e-06, "loss": -48.6483, "step": 2048 }, { "epoch": 0.18279138230964806, "grad_norm": 542.2373046875, "learning_rate": 3e-06, "loss": -40.3981, "step": 2049 }, { "epoch": 0.18288059235469914, "grad_norm": 358.4046325683594, "learning_rate": 3e-06, "loss": -45.3628, "step": 2050 }, { "epoch": 0.18296980239975022, "grad_norm": 600.2781982421875, "learning_rate": 3e-06, "loss": -51.8527, "step": 2051 }, { "epoch": 0.18305901244480127, "grad_norm": 435.05706787109375, "learning_rate": 3e-06, "loss": -38.3583, "step": 2052 }, { "completion_length": 161.66666793823242, "epoch": 0.18314822248985235, "grad_norm": 1581.74951171875, "learning_rate": 3e-06, "loss": 29.2414, "reward": 2.3151042461395264, "reward_std": 0.6183367669582367, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0546875037252903, "step": 2053, "zero_std_ratio": 0.0 }, { "epoch": 0.18323743253490343, "grad_norm": 1009.0924072265625, "learning_rate": 3e-06, "loss": 39.8569, "step": 2054 }, { "epoch": 0.1833266425799545, "grad_norm": 1006.994873046875, "learning_rate": 3e-06, "loss": -25.3739, "step": 2055 }, { "epoch": 0.18341585262500557, "grad_norm": 1858.0333251953125, "learning_rate": 3e-06, "loss": -156.3526, "step": 2056 }, { "epoch": 0.18350506267005665, "grad_norm": 1687.8575439453125, "learning_rate": 3e-06, "loss": -102.6945, "step": 2057 }, { "epoch": 0.18359427271510773, "grad_norm": 3122.237548828125, "learning_rate": 3e-06, "loss": -448.7975, "step": 2058 }, { "epoch": 0.1836834827601588, "grad_norm": 2170.154541015625, "learning_rate": 3e-06, "loss": -11.5904, "step": 2059 }, { "epoch": 0.18377269280520986, "grad_norm": 1241.9334716796875, "learning_rate": 3e-06, "loss": 14.9715, "step": 2060 }, { "epoch": 0.18386190285026094, "grad_norm": 1282.6116943359375, "learning_rate": 3e-06, "loss": -53.5031, "step": 2061 }, { "epoch": 0.18395111289531202, "grad_norm": 2399.23193359375, "learning_rate": 3e-06, "loss": -207.4761, "step": 2062 }, { "epoch": 0.18404032294036307, "grad_norm": 3210.094482421875, "learning_rate": 3e-06, "loss": -169.572, "step": 2063 }, { "epoch": 0.18412953298541415, "grad_norm": 3275.695556640625, "learning_rate": 3e-06, "loss": -540.9293, "step": 2064 }, { "completion_length": 127.64583969116211, "epoch": 0.18421874303046523, "grad_norm": 1122.905517578125, "learning_rate": 3e-06, "loss": 10.8385, "reward": 2.301645874977112, "reward_std": 0.4431246966123581, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14539583399891853, "step": 2065, "zero_std_ratio": 0.0 }, { "epoch": 0.1843079530755163, "grad_norm": 1122.392578125, "learning_rate": 3e-06, "loss": 49.955, "step": 2066 }, { "epoch": 0.18439716312056736, "grad_norm": 1480.3697509765625, "learning_rate": 3e-06, "loss": 58.7913, "step": 2067 }, { "epoch": 0.18448637316561844, "grad_norm": 1758.8323974609375, "learning_rate": 3e-06, "loss": 20.1775, "step": 2068 }, { "epoch": 0.18457558321066952, "grad_norm": 1807.8433837890625, "learning_rate": 3e-06, "loss": 6.628, "step": 2069 }, { "epoch": 0.1846647932557206, "grad_norm": 1256.463623046875, "learning_rate": 3e-06, "loss": -21.4105, "step": 2070 }, { "epoch": 0.18475400330077166, "grad_norm": 1165.8466796875, "learning_rate": 3e-06, "loss": 9.5585, "step": 2071 }, { "epoch": 0.18484321334582274, "grad_norm": 1214.4351806640625, "learning_rate": 3e-06, "loss": 27.7133, "step": 2072 }, { "epoch": 0.18493242339087382, "grad_norm": 1408.9798583984375, "learning_rate": 3e-06, "loss": 41.0192, "step": 2073 }, { "epoch": 0.1850216334359249, "grad_norm": 2391.484130859375, "learning_rate": 3e-06, "loss": 11.1923, "step": 2074 }, { "epoch": 0.18511084348097595, "grad_norm": 1309.9111328125, "learning_rate": 3e-06, "loss": -8.1838, "step": 2075 }, { "epoch": 0.18520005352602703, "grad_norm": 1430.7237548828125, "learning_rate": 3e-06, "loss": -34.23, "step": 2076 }, { "completion_length": 127.29167175292969, "epoch": 0.1852892635710781, "grad_norm": 760.6821899414062, "learning_rate": 3e-06, "loss": 28.2148, "reward": 2.2379584312438965, "reward_std": 0.45385661721229553, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13379166647791862, "step": 2077, "zero_std_ratio": 0.0 }, { "epoch": 0.18537847361612916, "grad_norm": 1331.1378173828125, "learning_rate": 3e-06, "loss": 147.6262, "step": 2078 }, { "epoch": 0.18546768366118024, "grad_norm": 1488.8958740234375, "learning_rate": 3e-06, "loss": 170.4627, "step": 2079 }, { "epoch": 0.18555689370623132, "grad_norm": 1463.0443115234375, "learning_rate": 3e-06, "loss": 122.8418, "step": 2080 }, { "epoch": 0.1856461037512824, "grad_norm": 1204.493408203125, "learning_rate": 3e-06, "loss": 72.0359, "step": 2081 }, { "epoch": 0.18573531379633346, "grad_norm": 1609.120849609375, "learning_rate": 3e-06, "loss": 160.9981, "step": 2082 }, { "epoch": 0.18582452384138454, "grad_norm": 735.0900268554688, "learning_rate": 3e-06, "loss": 23.1434, "step": 2083 }, { "epoch": 0.18591373388643562, "grad_norm": 1322.2744140625, "learning_rate": 3e-06, "loss": 134.6119, "step": 2084 }, { "epoch": 0.1860029439314867, "grad_norm": 1587.6624755859375, "learning_rate": 3e-06, "loss": 135.2965, "step": 2085 }, { "epoch": 0.18609215397653775, "grad_norm": 1264.227294921875, "learning_rate": 3e-06, "loss": 90.2438, "step": 2086 }, { "epoch": 0.18618136402158883, "grad_norm": 769.9957275390625, "learning_rate": 3e-06, "loss": 61.2736, "step": 2087 }, { "epoch": 0.1862705740666399, "grad_norm": 1437.9520263671875, "learning_rate": 3e-06, "loss": 127.8669, "step": 2088 }, { "completion_length": 124.0, "epoch": 0.18635978411169096, "grad_norm": 2092.324462890625, "learning_rate": 3e-06, "loss": -305.053, "reward": 2.428458333015442, "reward_std": 0.4015738368034363, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1784583330154419, "step": 2089, "zero_std_ratio": 0.0 }, { "epoch": 0.18644899415674204, "grad_norm": 2426.65625, "learning_rate": 3e-06, "loss": -232.8155, "step": 2090 }, { "epoch": 0.18653820420179312, "grad_norm": 2513.125244140625, "learning_rate": 3e-06, "loss": -294.7675, "step": 2091 }, { "epoch": 0.1866274142468442, "grad_norm": 1736.9534912109375, "learning_rate": 3e-06, "loss": -373.3613, "step": 2092 }, { "epoch": 0.18671662429189526, "grad_norm": 1675.3985595703125, "learning_rate": 3e-06, "loss": -312.3737, "step": 2093 }, { "epoch": 0.18680583433694634, "grad_norm": 1661.304443359375, "learning_rate": 3e-06, "loss": -274.9287, "step": 2094 }, { "epoch": 0.18689504438199742, "grad_norm": 2010.2904052734375, "learning_rate": 3e-06, "loss": -307.0745, "step": 2095 }, { "epoch": 0.1869842544270485, "grad_norm": 2005.97119140625, "learning_rate": 3e-06, "loss": -248.6287, "step": 2096 }, { "epoch": 0.18707346447209955, "grad_norm": 2148.4443359375, "learning_rate": 3e-06, "loss": -324.0838, "step": 2097 }, { "epoch": 0.18716267451715063, "grad_norm": 1827.4686279296875, "learning_rate": 3e-06, "loss": -405.5588, "step": 2098 }, { "epoch": 0.1872518845622017, "grad_norm": 1557.4544677734375, "learning_rate": 3e-06, "loss": -350.384, "step": 2099 }, { "epoch": 0.1873410946072528, "grad_norm": 1810.8126220703125, "learning_rate": 3e-06, "loss": -321.4857, "step": 2100 }, { "completion_length": 140.9791717529297, "epoch": 0.18743030465230384, "grad_norm": 237.09336853027344, "learning_rate": 3e-06, "loss": -24.8648, "reward": 2.407604217529297, "reward_std": 0.22888919711112976, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.157604169100523, "step": 2101, "zero_std_ratio": 0.0 }, { "epoch": 0.18751951469735492, "grad_norm": 318.93609619140625, "learning_rate": 3e-06, "loss": -20.7236, "step": 2102 }, { "epoch": 0.187608724742406, "grad_norm": 286.1955261230469, "learning_rate": 3e-06, "loss": -24.869, "step": 2103 }, { "epoch": 0.18769793478745705, "grad_norm": 392.14410400390625, "learning_rate": 3e-06, "loss": -31.8759, "step": 2104 }, { "epoch": 0.18778714483250813, "grad_norm": 287.25006103515625, "learning_rate": 3e-06, "loss": -25.5882, "step": 2105 }, { "epoch": 0.18787635487755922, "grad_norm": 278.5559387207031, "learning_rate": 3e-06, "loss": -25.7843, "step": 2106 }, { "epoch": 0.1879655649226103, "grad_norm": 339.7559509277344, "learning_rate": 3e-06, "loss": -35.2184, "step": 2107 }, { "epoch": 0.18805477496766135, "grad_norm": 455.73504638671875, "learning_rate": 3e-06, "loss": -32.7528, "step": 2108 }, { "epoch": 0.18814398501271243, "grad_norm": 384.77130126953125, "learning_rate": 3e-06, "loss": -36.0195, "step": 2109 }, { "epoch": 0.1882331950577635, "grad_norm": 390.36846923828125, "learning_rate": 3e-06, "loss": -44.4518, "step": 2110 }, { "epoch": 0.1883224051028146, "grad_norm": 437.9986877441406, "learning_rate": 3e-06, "loss": -26.0731, "step": 2111 }, { "epoch": 0.18841161514786564, "grad_norm": 393.3838806152344, "learning_rate": 3e-06, "loss": -35.506, "step": 2112 }, { "completion_length": 129.43750762939453, "epoch": 0.18850082519291672, "grad_norm": 1305.8514404296875, "learning_rate": 3e-06, "loss": 110.8168, "reward": 2.460416793823242, "reward_std": 0.30926184356212616, "rewards/correctness_reward_func": 1.8333333730697632, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13749999552965164, "step": 2113, "zero_std_ratio": 0.0 }, { "epoch": 0.1885900352379678, "grad_norm": 1914.8585205078125, "learning_rate": 3e-06, "loss": 42.632, "step": 2114 }, { "epoch": 0.18867924528301888, "grad_norm": 1306.41162109375, "learning_rate": 3e-06, "loss": 60.4664, "step": 2115 }, { "epoch": 0.18876845532806993, "grad_norm": 1066.9373779296875, "learning_rate": 3e-06, "loss": 128.6848, "step": 2116 }, { "epoch": 0.18885766537312101, "grad_norm": 1232.74267578125, "learning_rate": 3e-06, "loss": 68.6546, "step": 2117 }, { "epoch": 0.1889468754181721, "grad_norm": 851.8841552734375, "learning_rate": 3e-06, "loss": 92.6301, "step": 2118 }, { "epoch": 0.18903608546322315, "grad_norm": 1269.6170654296875, "learning_rate": 3e-06, "loss": 82.624, "step": 2119 }, { "epoch": 0.18912529550827423, "grad_norm": 1477.7921142578125, "learning_rate": 3e-06, "loss": 46.4268, "step": 2120 }, { "epoch": 0.1892145055533253, "grad_norm": 1204.431640625, "learning_rate": 3e-06, "loss": 47.8916, "step": 2121 }, { "epoch": 0.1893037155983764, "grad_norm": 1141.2989501953125, "learning_rate": 3e-06, "loss": 116.479, "step": 2122 }, { "epoch": 0.18939292564342744, "grad_norm": 1084.2598876953125, "learning_rate": 3e-06, "loss": 64.8248, "step": 2123 }, { "epoch": 0.18948213568847852, "grad_norm": 913.5030517578125, "learning_rate": 3e-06, "loss": 87.6114, "step": 2124 }, { "completion_length": 133.2291717529297, "epoch": 0.1895713457335296, "grad_norm": 1788.35546875, "learning_rate": 3e-06, "loss": 181.0141, "reward": 2.338604211807251, "reward_std": 0.5955996513366699, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2136041596531868, "step": 2125, "zero_std_ratio": 0.125 }, { "epoch": 0.18966055577858068, "grad_norm": 1528.2685546875, "learning_rate": 3e-06, "loss": 107.8506, "step": 2126 }, { "epoch": 0.18974976582363173, "grad_norm": 2611.96630859375, "learning_rate": 3e-06, "loss": 53.3311, "step": 2127 }, { "epoch": 0.1898389758686828, "grad_norm": 1537.9984130859375, "learning_rate": 3e-06, "loss": 112.4603, "step": 2128 }, { "epoch": 0.1899281859137339, "grad_norm": 1148.2452392578125, "learning_rate": 3e-06, "loss": 115.4492, "step": 2129 }, { "epoch": 0.19001739595878495, "grad_norm": 1439.117919921875, "learning_rate": 3e-06, "loss": 107.7209, "step": 2130 }, { "epoch": 0.19010660600383603, "grad_norm": 1633.4248046875, "learning_rate": 3e-06, "loss": 161.3715, "step": 2131 }, { "epoch": 0.1901958160488871, "grad_norm": 1571.0533447265625, "learning_rate": 3e-06, "loss": 95.7953, "step": 2132 }, { "epoch": 0.1902850260939382, "grad_norm": 1700.539794921875, "learning_rate": 3e-06, "loss": 31.5747, "step": 2133 }, { "epoch": 0.19037423613898924, "grad_norm": 1770.425537109375, "learning_rate": 3e-06, "loss": 96.7928, "step": 2134 }, { "epoch": 0.19046344618404032, "grad_norm": 1083.177734375, "learning_rate": 3e-06, "loss": 106.8439, "step": 2135 }, { "epoch": 0.1905526562290914, "grad_norm": 1709.0933837890625, "learning_rate": 3e-06, "loss": 81.4831, "step": 2136 }, { "completion_length": 145.9166717529297, "epoch": 0.19064186627414248, "grad_norm": 1901.0673828125, "learning_rate": 3e-06, "loss": 74.4176, "reward": 2.2969167232513428, "reward_std": 0.5523957759141922, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1406666710972786, "step": 2137, "zero_std_ratio": 0.0 }, { "epoch": 0.19073107631919353, "grad_norm": 2433.760986328125, "learning_rate": 3e-06, "loss": 28.9411, "step": 2138 }, { "epoch": 0.1908202863642446, "grad_norm": 3526.50732421875, "learning_rate": 3e-06, "loss": -3.2852, "step": 2139 }, { "epoch": 0.1909094964092957, "grad_norm": 2794.43310546875, "learning_rate": 3e-06, "loss": 67.5947, "step": 2140 }, { "epoch": 0.19099870645434677, "grad_norm": 2891.499755859375, "learning_rate": 3e-06, "loss": 192.5647, "step": 2141 }, { "epoch": 0.19108791649939783, "grad_norm": 2748.095947265625, "learning_rate": 3e-06, "loss": 100.963, "step": 2142 }, { "epoch": 0.1911771265444489, "grad_norm": 1738.9039306640625, "learning_rate": 3e-06, "loss": 40.8662, "step": 2143 }, { "epoch": 0.19126633658949999, "grad_norm": 2699.06103515625, "learning_rate": 3e-06, "loss": 17.0858, "step": 2144 }, { "epoch": 0.19135554663455104, "grad_norm": 2892.302490234375, "learning_rate": 3e-06, "loss": -27.5113, "step": 2145 }, { "epoch": 0.19144475667960212, "grad_norm": 2559.01318359375, "learning_rate": 3e-06, "loss": 20.1366, "step": 2146 }, { "epoch": 0.1915339667246532, "grad_norm": 3452.719482421875, "learning_rate": 3e-06, "loss": 160.6933, "step": 2147 }, { "epoch": 0.19162317676970428, "grad_norm": 2362.385986328125, "learning_rate": 3e-06, "loss": 78.5965, "step": 2148 }, { "completion_length": 120.25000381469727, "epoch": 0.19171238681475533, "grad_norm": 1334.63623046875, "learning_rate": 3e-06, "loss": 6.4367, "reward": 2.3084168434143066, "reward_std": 0.29630675725638866, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1730000004172325, "step": 2149, "zero_std_ratio": 0.0 }, { "epoch": 0.1918015968598064, "grad_norm": 760.7504272460938, "learning_rate": 3e-06, "loss": -88.3566, "step": 2150 }, { "epoch": 0.1918908069048575, "grad_norm": 1294.0516357421875, "learning_rate": 3e-06, "loss": -41.7255, "step": 2151 }, { "epoch": 0.19198001694990857, "grad_norm": 1805.128662109375, "learning_rate": 3e-06, "loss": -19.25, "step": 2152 }, { "epoch": 0.19206922699495962, "grad_norm": 1069.1192626953125, "learning_rate": 3e-06, "loss": -69.7657, "step": 2153 }, { "epoch": 0.1921584370400107, "grad_norm": 2599.32080078125, "learning_rate": 3e-06, "loss": -44.1199, "step": 2154 }, { "epoch": 0.19224764708506178, "grad_norm": 1054.9842529296875, "learning_rate": 3e-06, "loss": -7.7095, "step": 2155 }, { "epoch": 0.19233685713011284, "grad_norm": 818.435302734375, "learning_rate": 3e-06, "loss": -102.3675, "step": 2156 }, { "epoch": 0.19242606717516392, "grad_norm": 970.5858154296875, "learning_rate": 3e-06, "loss": -53.6351, "step": 2157 }, { "epoch": 0.192515277220215, "grad_norm": 865.392822265625, "learning_rate": 3e-06, "loss": -21.3515, "step": 2158 }, { "epoch": 0.19260448726526608, "grad_norm": 981.1688232421875, "learning_rate": 3e-06, "loss": -87.6568, "step": 2159 }, { "epoch": 0.19269369731031713, "grad_norm": 975.263916015625, "learning_rate": 3e-06, "loss": -57.958, "step": 2160 }, { "completion_length": 136.45833587646484, "epoch": 0.1927829073553682, "grad_norm": 1038.02294921875, "learning_rate": 3e-06, "loss": 3.3305, "reward": 2.5663751363754272, "reward_std": 0.25874343514442444, "rewards/correctness_reward_func": 1.9166666269302368, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1497083343565464, "step": 2161, "zero_std_ratio": 0.0 }, { "epoch": 0.1928721174004193, "grad_norm": 803.9536743164062, "learning_rate": 3e-06, "loss": -15.4779, "step": 2162 }, { "epoch": 0.19296132744547037, "grad_norm": 1474.9486083984375, "learning_rate": 3e-06, "loss": 2.4125, "step": 2163 }, { "epoch": 0.19305053749052142, "grad_norm": 1137.7777099609375, "learning_rate": 3e-06, "loss": -10.7272, "step": 2164 }, { "epoch": 0.1931397475355725, "grad_norm": 1287.4840087890625, "learning_rate": 3e-06, "loss": -36.5552, "step": 2165 }, { "epoch": 0.19322895758062358, "grad_norm": 750.6218872070312, "learning_rate": 3e-06, "loss": 28.6985, "step": 2166 }, { "epoch": 0.19331816762567466, "grad_norm": 1729.5426025390625, "learning_rate": 3e-06, "loss": -13.0334, "step": 2167 }, { "epoch": 0.19340737767072572, "grad_norm": 919.8224487304688, "learning_rate": 3e-06, "loss": -28.3112, "step": 2168 }, { "epoch": 0.1934965877157768, "grad_norm": 1243.5130615234375, "learning_rate": 3e-06, "loss": -7.4271, "step": 2169 }, { "epoch": 0.19358579776082788, "grad_norm": 897.3583374023438, "learning_rate": 3e-06, "loss": -19.3195, "step": 2170 }, { "epoch": 0.19367500780587893, "grad_norm": 1540.0894775390625, "learning_rate": 3e-06, "loss": -66.3459, "step": 2171 }, { "epoch": 0.19376421785093, "grad_norm": 910.990966796875, "learning_rate": 3e-06, "loss": 19.7238, "step": 2172 }, { "completion_length": 116.02083587646484, "epoch": 0.1938534278959811, "grad_norm": 1810.232666015625, "learning_rate": 3e-06, "loss": 39.429, "reward": 2.171999931335449, "reward_std": 0.5019665211439133, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2240833342075348, "step": 2173, "zero_std_ratio": 0.0 }, { "epoch": 0.19394263794103217, "grad_norm": 1911.38525390625, "learning_rate": 3e-06, "loss": -56.2197, "step": 2174 }, { "epoch": 0.19403184798608322, "grad_norm": 1706.150146484375, "learning_rate": 3e-06, "loss": 29.7264, "step": 2175 }, { "epoch": 0.1941210580311343, "grad_norm": 2385.930419921875, "learning_rate": 3e-06, "loss": 32.8588, "step": 2176 }, { "epoch": 0.19421026807618538, "grad_norm": 1925.345947265625, "learning_rate": 3e-06, "loss": 30.2097, "step": 2177 }, { "epoch": 0.19429947812123646, "grad_norm": 1719.54052734375, "learning_rate": 3e-06, "loss": 141.0592, "step": 2178 }, { "epoch": 0.19438868816628752, "grad_norm": 1638.2069091796875, "learning_rate": 3e-06, "loss": 20.0159, "step": 2179 }, { "epoch": 0.1944778982113386, "grad_norm": 2645.457763671875, "learning_rate": 3e-06, "loss": -39.4086, "step": 2180 }, { "epoch": 0.19456710825638968, "grad_norm": 1837.9119873046875, "learning_rate": 3e-06, "loss": 13.8234, "step": 2181 }, { "epoch": 0.19465631830144076, "grad_norm": 1472.4017333984375, "learning_rate": 3e-06, "loss": 23.4693, "step": 2182 }, { "epoch": 0.1947455283464918, "grad_norm": 2043.978759765625, "learning_rate": 3e-06, "loss": 6.3195, "step": 2183 }, { "epoch": 0.1948347383915429, "grad_norm": 2784.806640625, "learning_rate": 3e-06, "loss": 114.7516, "step": 2184 }, { "completion_length": 148.5, "epoch": 0.19492394843659397, "grad_norm": 2347.893798828125, "learning_rate": 3e-06, "loss": 152.3218, "reward": 2.072833240032196, "reward_std": 0.3999442011117935, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.176999993622303, "step": 2185, "zero_std_ratio": 0.125 }, { "epoch": 0.19501315848164502, "grad_norm": 1482.416015625, "learning_rate": 3e-06, "loss": 80.5951, "step": 2186 }, { "epoch": 0.1951023685266961, "grad_norm": 1534.2724609375, "learning_rate": 3e-06, "loss": 75.3633, "step": 2187 }, { "epoch": 0.19519157857174718, "grad_norm": 1607.8106689453125, "learning_rate": 3e-06, "loss": 60.7631, "step": 2188 }, { "epoch": 0.19528078861679826, "grad_norm": 2337.1337890625, "learning_rate": 3e-06, "loss": -126.4198, "step": 2189 }, { "epoch": 0.19536999866184931, "grad_norm": 1377.0809326171875, "learning_rate": 3e-06, "loss": 133.5221, "step": 2190 }, { "epoch": 0.1954592087069004, "grad_norm": 1643.5670166015625, "learning_rate": 3e-06, "loss": 132.6571, "step": 2191 }, { "epoch": 0.19554841875195147, "grad_norm": 1409.347412109375, "learning_rate": 3e-06, "loss": 71.3489, "step": 2192 }, { "epoch": 0.19563762879700256, "grad_norm": 1953.61328125, "learning_rate": 3e-06, "loss": 63.8237, "step": 2193 }, { "epoch": 0.1957268388420536, "grad_norm": 1273.4095458984375, "learning_rate": 3e-06, "loss": 52.83, "step": 2194 }, { "epoch": 0.1958160488871047, "grad_norm": 2235.899169921875, "learning_rate": 3e-06, "loss": -132.9868, "step": 2195 }, { "epoch": 0.19590525893215577, "grad_norm": 2546.96435546875, "learning_rate": 3e-06, "loss": 114.6427, "step": 2196 }, { "completion_length": 124.45833587646484, "epoch": 0.19599446897720682, "grad_norm": 1299.908203125, "learning_rate": 3e-06, "loss": 25.437, "reward": 1.7506250143051147, "reward_std": 0.45970311760902405, "rewards/correctness_reward_func": 1.1250000298023224, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17770832777023315, "step": 2197, "zero_std_ratio": 0.125 }, { "epoch": 0.1960836790222579, "grad_norm": 1442.3870849609375, "learning_rate": 3e-06, "loss": 86.5591, "step": 2198 }, { "epoch": 0.19617288906730898, "grad_norm": 855.531982421875, "learning_rate": 3e-06, "loss": 76.0765, "step": 2199 }, { "epoch": 0.19626209911236006, "grad_norm": 870.2618408203125, "learning_rate": 3e-06, "loss": 80.9394, "step": 2200 }, { "epoch": 0.1963513091574111, "grad_norm": 1157.236572265625, "learning_rate": 3e-06, "loss": 86.6789, "step": 2201 }, { "epoch": 0.1964405192024622, "grad_norm": 1135.1353759765625, "learning_rate": 3e-06, "loss": 36.164, "step": 2202 }, { "epoch": 0.19652972924751327, "grad_norm": 1008.8060913085938, "learning_rate": 3e-06, "loss": 10.2971, "step": 2203 }, { "epoch": 0.19661893929256435, "grad_norm": 1482.61083984375, "learning_rate": 3e-06, "loss": 68.7021, "step": 2204 }, { "epoch": 0.1967081493376154, "grad_norm": 798.7431640625, "learning_rate": 3e-06, "loss": 60.4609, "step": 2205 }, { "epoch": 0.1967973593826665, "grad_norm": 971.9412231445312, "learning_rate": 3e-06, "loss": 71.1141, "step": 2206 }, { "epoch": 0.19688656942771757, "grad_norm": 1165.7591552734375, "learning_rate": 3e-06, "loss": 81.0864, "step": 2207 }, { "epoch": 0.19697577947276865, "grad_norm": 1213.063720703125, "learning_rate": 3e-06, "loss": 22.4711, "step": 2208 }, { "completion_length": 160.8541717529297, "epoch": 0.1970649895178197, "grad_norm": 1580.6722412109375, "learning_rate": 3e-06, "loss": -274.6722, "reward": 1.804187536239624, "reward_std": 0.7367371022701263, "rewards/correctness_reward_func": 1.2916666269302368, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09585416316986084, "step": 2209, "zero_std_ratio": 0.0 }, { "epoch": 0.19715419956287078, "grad_norm": 1305.8641357421875, "learning_rate": 3e-06, "loss": -286.2726, "step": 2210 }, { "epoch": 0.19724340960792186, "grad_norm": 3519.257080078125, "learning_rate": 3e-06, "loss": -514.0822, "step": 2211 }, { "epoch": 0.1973326196529729, "grad_norm": 4824.2421875, "learning_rate": 3e-06, "loss": -367.1955, "step": 2212 }, { "epoch": 0.197421829698024, "grad_norm": 2674.205322265625, "learning_rate": 3e-06, "loss": -301.9088, "step": 2213 }, { "epoch": 0.19751103974307507, "grad_norm": 1980.4049072265625, "learning_rate": 3e-06, "loss": -304.9702, "step": 2214 }, { "epoch": 0.19760024978812615, "grad_norm": 2130.482666015625, "learning_rate": 3e-06, "loss": -305.0632, "step": 2215 }, { "epoch": 0.1976894598331772, "grad_norm": 1712.863525390625, "learning_rate": 3e-06, "loss": -309.973, "step": 2216 }, { "epoch": 0.19777866987822829, "grad_norm": 2488.321533203125, "learning_rate": 3e-06, "loss": -574.7097, "step": 2217 }, { "epoch": 0.19786787992327937, "grad_norm": 2911.133544921875, "learning_rate": 3e-06, "loss": -474.2518, "step": 2218 }, { "epoch": 0.19795708996833045, "grad_norm": 4416.62841796875, "learning_rate": 3e-06, "loss": -422.1913, "step": 2219 }, { "epoch": 0.1980463000133815, "grad_norm": 4086.730224609375, "learning_rate": 3e-06, "loss": -435.4988, "step": 2220 }, { "completion_length": 102.9375, "epoch": 0.19813551005843258, "grad_norm": 803.6179809570312, "learning_rate": 3e-06, "loss": -18.5715, "reward": 2.2100417613983154, "reward_std": 0.26370862126350403, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23087499290704727, "step": 2221, "zero_std_ratio": 0.0 }, { "epoch": 0.19822472010348366, "grad_norm": 617.244384765625, "learning_rate": 3e-06, "loss": -24.6303, "step": 2222 }, { "epoch": 0.1983139301485347, "grad_norm": 502.6885986328125, "learning_rate": 3e-06, "loss": -31.1001, "step": 2223 }, { "epoch": 0.1984031401935858, "grad_norm": 698.3114013671875, "learning_rate": 3e-06, "loss": -35.3177, "step": 2224 }, { "epoch": 0.19849235023863687, "grad_norm": 618.0604858398438, "learning_rate": 3e-06, "loss": -30.079, "step": 2225 }, { "epoch": 0.19858156028368795, "grad_norm": 484.8530578613281, "learning_rate": 3e-06, "loss": -37.8852, "step": 2226 }, { "epoch": 0.198670770328739, "grad_norm": 948.900634765625, "learning_rate": 3e-06, "loss": -15.1709, "step": 2227 }, { "epoch": 0.19875998037379008, "grad_norm": 609.5656127929688, "learning_rate": 3e-06, "loss": -37.6612, "step": 2228 }, { "epoch": 0.19884919041884117, "grad_norm": 672.0021362304688, "learning_rate": 3e-06, "loss": -49.1828, "step": 2229 }, { "epoch": 0.19893840046389225, "grad_norm": 888.1821899414062, "learning_rate": 3e-06, "loss": -55.2511, "step": 2230 }, { "epoch": 0.1990276105089433, "grad_norm": 811.3577880859375, "learning_rate": 3e-06, "loss": -45.7303, "step": 2231 }, { "epoch": 0.19911682055399438, "grad_norm": 717.1616821289062, "learning_rate": 3e-06, "loss": -51.4052, "step": 2232 }, { "completion_length": 140.27084350585938, "epoch": 0.19920603059904546, "grad_norm": 2046.7083740234375, "learning_rate": 3e-06, "loss": 109.0772, "reward": 2.342104196548462, "reward_std": 0.5653499662876129, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17543750256299973, "step": 2233, "zero_std_ratio": 0.0 }, { "epoch": 0.19929524064409654, "grad_norm": 2079.919189453125, "learning_rate": 3e-06, "loss": 253.4095, "step": 2234 }, { "epoch": 0.1993844506891476, "grad_norm": 2711.693115234375, "learning_rate": 3e-06, "loss": 172.1419, "step": 2235 }, { "epoch": 0.19947366073419867, "grad_norm": 2554.647705078125, "learning_rate": 3e-06, "loss": 212.6262, "step": 2236 }, { "epoch": 0.19956287077924975, "grad_norm": 2302.18212890625, "learning_rate": 3e-06, "loss": 216.2501, "step": 2237 }, { "epoch": 0.1996520808243008, "grad_norm": 2337.755126953125, "learning_rate": 3e-06, "loss": 82.4594, "step": 2238 }, { "epoch": 0.19974129086935188, "grad_norm": 2143.60791015625, "learning_rate": 3e-06, "loss": 100.3759, "step": 2239 }, { "epoch": 0.19983050091440296, "grad_norm": 2368.834228515625, "learning_rate": 3e-06, "loss": 246.018, "step": 2240 }, { "epoch": 0.19991971095945404, "grad_norm": 2243.505615234375, "learning_rate": 3e-06, "loss": 138.0225, "step": 2241 }, { "epoch": 0.2000089210045051, "grad_norm": 2373.282470703125, "learning_rate": 3e-06, "loss": 180.4512, "step": 2242 }, { "epoch": 0.20009813104955618, "grad_norm": 2514.03955078125, "learning_rate": 3e-06, "loss": 167.8582, "step": 2243 }, { "epoch": 0.20018734109460726, "grad_norm": 1966.1226806640625, "learning_rate": 3e-06, "loss": 61.633, "step": 2244 }, { "completion_length": 111.00000381469727, "epoch": 0.20027655113965834, "grad_norm": 1399.266845703125, "learning_rate": 3e-06, "loss": -45.4763, "reward": 2.3000833988189697, "reward_std": 0.5667559206485748, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19591666013002396, "step": 2245, "zero_std_ratio": 0.0 }, { "epoch": 0.2003657611847094, "grad_norm": 1209.40625, "learning_rate": 3e-06, "loss": -77.9653, "step": 2246 }, { "epoch": 0.20045497122976047, "grad_norm": 1058.274658203125, "learning_rate": 3e-06, "loss": -114.5617, "step": 2247 }, { "epoch": 0.20054418127481155, "grad_norm": 1116.7257080078125, "learning_rate": 3e-06, "loss": -100.1849, "step": 2248 }, { "epoch": 0.2006333913198626, "grad_norm": 2506.532958984375, "learning_rate": 3e-06, "loss": -81.4074, "step": 2249 }, { "epoch": 0.20072260136491368, "grad_norm": 1058.7152099609375, "learning_rate": 3e-06, "loss": -129.5974, "step": 2250 }, { "epoch": 0.20081181140996476, "grad_norm": 1346.8084716796875, "learning_rate": 3e-06, "loss": -49.5752, "step": 2251 }, { "epoch": 0.20090102145501584, "grad_norm": 1400.63525390625, "learning_rate": 3e-06, "loss": -80.931, "step": 2252 }, { "epoch": 0.2009902315000669, "grad_norm": 979.59228515625, "learning_rate": 3e-06, "loss": -125.4819, "step": 2253 }, { "epoch": 0.20107944154511798, "grad_norm": 1141.697265625, "learning_rate": 3e-06, "loss": -114.8348, "step": 2254 }, { "epoch": 0.20116865159016906, "grad_norm": 1260.422607421875, "learning_rate": 3e-06, "loss": -92.5143, "step": 2255 }, { "epoch": 0.20125786163522014, "grad_norm": 1216.84228515625, "learning_rate": 3e-06, "loss": -142.977, "step": 2256 }, { "completion_length": 123.4375, "epoch": 0.2013470716802712, "grad_norm": 1664.2445068359375, "learning_rate": 3e-06, "loss": -129.5342, "reward": 2.551270842552185, "reward_std": 0.28455013036727905, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.18668748438358307, "step": 2257, "zero_std_ratio": 0.0 }, { "epoch": 0.20143628172532227, "grad_norm": 2019.86083984375, "learning_rate": 3e-06, "loss": -188.5191, "step": 2258 }, { "epoch": 0.20152549177037335, "grad_norm": 1595.3583984375, "learning_rate": 3e-06, "loss": -125.4679, "step": 2259 }, { "epoch": 0.20161470181542443, "grad_norm": 2631.081787109375, "learning_rate": 3e-06, "loss": -152.7189, "step": 2260 }, { "epoch": 0.20170391186047548, "grad_norm": 1726.4815673828125, "learning_rate": 3e-06, "loss": -96.2797, "step": 2261 }, { "epoch": 0.20179312190552656, "grad_norm": 1873.742431640625, "learning_rate": 3e-06, "loss": -135.3711, "step": 2262 }, { "epoch": 0.20188233195057764, "grad_norm": 1621.1622314453125, "learning_rate": 3e-06, "loss": -140.641, "step": 2263 }, { "epoch": 0.2019715419956287, "grad_norm": 1722.911865234375, "learning_rate": 3e-06, "loss": -196.7486, "step": 2264 }, { "epoch": 0.20206075204067978, "grad_norm": 1306.2991943359375, "learning_rate": 3e-06, "loss": -128.2448, "step": 2265 }, { "epoch": 0.20214996208573086, "grad_norm": 2126.202392578125, "learning_rate": 3e-06, "loss": -168.1409, "step": 2266 }, { "epoch": 0.20223917213078194, "grad_norm": 1632.1204833984375, "learning_rate": 3e-06, "loss": -104.4657, "step": 2267 }, { "epoch": 0.202328382175833, "grad_norm": 1830.1990966796875, "learning_rate": 3e-06, "loss": -141.1356, "step": 2268 }, { "completion_length": 123.02083587646484, "epoch": 0.20241759222088407, "grad_norm": 1051.958251953125, "learning_rate": 3e-06, "loss": 63.9873, "reward": 2.0415000915527344, "reward_std": 0.3290978819131851, "rewards/correctness_reward_func": 1.3750000298023224, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1977500021457672, "step": 2269, "zero_std_ratio": 0.0 }, { "epoch": 0.20250680226593515, "grad_norm": 1236.19775390625, "learning_rate": 3e-06, "loss": 99.0337, "step": 2270 }, { "epoch": 0.20259601231098623, "grad_norm": 937.3912353515625, "learning_rate": 3e-06, "loss": 65.646, "step": 2271 }, { "epoch": 0.20268522235603728, "grad_norm": 1138.8841552734375, "learning_rate": 3e-06, "loss": 50.2982, "step": 2272 }, { "epoch": 0.20277443240108836, "grad_norm": 1543.039794921875, "learning_rate": 3e-06, "loss": 105.3749, "step": 2273 }, { "epoch": 0.20286364244613944, "grad_norm": 913.1903076171875, "learning_rate": 3e-06, "loss": 74.6702, "step": 2274 }, { "epoch": 0.20295285249119052, "grad_norm": 916.6128540039062, "learning_rate": 3e-06, "loss": 57.9423, "step": 2275 }, { "epoch": 0.20304206253624157, "grad_norm": 1266.6317138671875, "learning_rate": 3e-06, "loss": 95.4216, "step": 2276 }, { "epoch": 0.20313127258129265, "grad_norm": 932.1137084960938, "learning_rate": 3e-06, "loss": 61.0586, "step": 2277 }, { "epoch": 0.20322048262634373, "grad_norm": 1065.530029296875, "learning_rate": 3e-06, "loss": 39.2222, "step": 2278 }, { "epoch": 0.2033096926713948, "grad_norm": 1048.569580078125, "learning_rate": 3e-06, "loss": 95.8649, "step": 2279 }, { "epoch": 0.20339890271644587, "grad_norm": 975.1697387695312, "learning_rate": 3e-06, "loss": 64.6243, "step": 2280 }, { "completion_length": 123.35417175292969, "epoch": 0.20348811276149695, "grad_norm": 2359.048095703125, "learning_rate": 3e-06, "loss": -129.0191, "reward": 1.7423540949821472, "reward_std": 0.6932721436023712, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1902708262205124, "step": 2281, "zero_std_ratio": 0.0 }, { "epoch": 0.20357732280654803, "grad_norm": 2037.848388671875, "learning_rate": 3e-06, "loss": 32.7358, "step": 2282 }, { "epoch": 0.20366653285159908, "grad_norm": 2450.302978515625, "learning_rate": 3e-06, "loss": -88.7673, "step": 2283 }, { "epoch": 0.20375574289665016, "grad_norm": 1783.2620849609375, "learning_rate": 3e-06, "loss": -99.9251, "step": 2284 }, { "epoch": 0.20384495294170124, "grad_norm": 1748.13330078125, "learning_rate": 3e-06, "loss": -53.0798, "step": 2285 }, { "epoch": 0.20393416298675232, "grad_norm": 2280.80810546875, "learning_rate": 3e-06, "loss": -79.2135, "step": 2286 }, { "epoch": 0.20402337303180337, "grad_norm": 2504.2412109375, "learning_rate": 3e-06, "loss": -153.4697, "step": 2287 }, { "epoch": 0.20411258307685445, "grad_norm": 2130.119873046875, "learning_rate": 3e-06, "loss": 18.6636, "step": 2288 }, { "epoch": 0.20420179312190553, "grad_norm": 2430.63818359375, "learning_rate": 3e-06, "loss": -116.2015, "step": 2289 }, { "epoch": 0.2042910031669566, "grad_norm": 2750.778076171875, "learning_rate": 3e-06, "loss": -129.5705, "step": 2290 }, { "epoch": 0.20438021321200767, "grad_norm": 1790.766845703125, "learning_rate": 3e-06, "loss": -95.7831, "step": 2291 }, { "epoch": 0.20446942325705875, "grad_norm": 2436.076171875, "learning_rate": 3e-06, "loss": -110.9333, "step": 2292 }, { "completion_length": 126.64583969116211, "epoch": 0.20455863330210983, "grad_norm": 1839.115966796875, "learning_rate": 3e-06, "loss": -97.4442, "reward": 2.2966668605804443, "reward_std": 0.4922372102737427, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17166666314005852, "step": 2293, "zero_std_ratio": 0.0 }, { "epoch": 0.20464784334716088, "grad_norm": 1397.8768310546875, "learning_rate": 3e-06, "loss": -101.0346, "step": 2294 }, { "epoch": 0.20473705339221196, "grad_norm": 1341.2484130859375, "learning_rate": 3e-06, "loss": -247.112, "step": 2295 }, { "epoch": 0.20482626343726304, "grad_norm": 2028.7335205078125, "learning_rate": 3e-06, "loss": -72.8786, "step": 2296 }, { "epoch": 0.20491547348231412, "grad_norm": 1479.264892578125, "learning_rate": 3e-06, "loss": -213.6321, "step": 2297 }, { "epoch": 0.20500468352736517, "grad_norm": 1442.75927734375, "learning_rate": 3e-06, "loss": -161.4184, "step": 2298 }, { "epoch": 0.20509389357241625, "grad_norm": 1903.4906005859375, "learning_rate": 3e-06, "loss": -115.0706, "step": 2299 }, { "epoch": 0.20518310361746733, "grad_norm": 1373.6041259765625, "learning_rate": 3e-06, "loss": -128.6296, "step": 2300 }, { "epoch": 0.2052723136625184, "grad_norm": 1395.6331787109375, "learning_rate": 3e-06, "loss": -256.6596, "step": 2301 }, { "epoch": 0.20536152370756947, "grad_norm": 1820.6612548828125, "learning_rate": 3e-06, "loss": -86.9157, "step": 2302 }, { "epoch": 0.20545073375262055, "grad_norm": 1319.0960693359375, "learning_rate": 3e-06, "loss": -233.4003, "step": 2303 }, { "epoch": 0.20553994379767163, "grad_norm": 1100.8870849609375, "learning_rate": 3e-06, "loss": -176.9607, "step": 2304 }, { "completion_length": 130.87500381469727, "epoch": 0.20562915384272268, "grad_norm": 790.4862670898438, "learning_rate": 3e-06, "loss": 48.6722, "reward": 2.4920207262039185, "reward_std": 0.28309868834912777, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16910415515303612, "step": 2305, "zero_std_ratio": 0.0 }, { "epoch": 0.20571836388777376, "grad_norm": 699.5379028320312, "learning_rate": 3e-06, "loss": 23.5835, "step": 2306 }, { "epoch": 0.20580757393282484, "grad_norm": 664.294921875, "learning_rate": 3e-06, "loss": 62.6984, "step": 2307 }, { "epoch": 0.20589678397787592, "grad_norm": 814.1132202148438, "learning_rate": 3e-06, "loss": 58.8113, "step": 2308 }, { "epoch": 0.20598599402292697, "grad_norm": 621.1661376953125, "learning_rate": 3e-06, "loss": 17.8389, "step": 2309 }, { "epoch": 0.20607520406797805, "grad_norm": 754.7105102539062, "learning_rate": 3e-06, "loss": 43.741, "step": 2310 }, { "epoch": 0.20616441411302913, "grad_norm": 765.5684814453125, "learning_rate": 3e-06, "loss": 33.7028, "step": 2311 }, { "epoch": 0.2062536241580802, "grad_norm": 469.3419189453125, "learning_rate": 3e-06, "loss": 9.6402, "step": 2312 }, { "epoch": 0.20634283420313126, "grad_norm": 575.3997192382812, "learning_rate": 3e-06, "loss": 43.3943, "step": 2313 }, { "epoch": 0.20643204424818234, "grad_norm": 636.8104858398438, "learning_rate": 3e-06, "loss": 39.065, "step": 2314 }, { "epoch": 0.20652125429323342, "grad_norm": 401.0857849121094, "learning_rate": 3e-06, "loss": 3.814, "step": 2315 }, { "epoch": 0.20661046433828448, "grad_norm": 442.88726806640625, "learning_rate": 3e-06, "loss": 20.3367, "step": 2316 }, { "completion_length": 109.10417175292969, "epoch": 0.20669967438333556, "grad_norm": 1531.75146484375, "learning_rate": 3e-06, "loss": 87.3807, "reward": 2.5790417194366455, "reward_std": 0.30080675333738327, "rewards/correctness_reward_func": 1.875, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20404167473316193, "step": 2317, "zero_std_ratio": 0.125 }, { "epoch": 0.20678888442838664, "grad_norm": 1411.746826171875, "learning_rate": 3e-06, "loss": 71.0098, "step": 2318 }, { "epoch": 0.20687809447343772, "grad_norm": 1122.78759765625, "learning_rate": 3e-06, "loss": 74.4398, "step": 2319 }, { "epoch": 0.20696730451848877, "grad_norm": 1717.5146484375, "learning_rate": 3e-06, "loss": 74.3571, "step": 2320 }, { "epoch": 0.20705651456353985, "grad_norm": 1425.1131591796875, "learning_rate": 3e-06, "loss": 7.1953, "step": 2321 }, { "epoch": 0.20714572460859093, "grad_norm": 1329.4161376953125, "learning_rate": 3e-06, "loss": 84.9138, "step": 2322 }, { "epoch": 0.207234934653642, "grad_norm": 881.3815307617188, "learning_rate": 3e-06, "loss": 59.1147, "step": 2323 }, { "epoch": 0.20732414469869306, "grad_norm": 1501.871826171875, "learning_rate": 3e-06, "loss": 47.1585, "step": 2324 }, { "epoch": 0.20741335474374414, "grad_norm": 786.2328491210938, "learning_rate": 3e-06, "loss": 46.2053, "step": 2325 }, { "epoch": 0.20750256478879522, "grad_norm": 744.2485961914062, "learning_rate": 3e-06, "loss": 39.6594, "step": 2326 }, { "epoch": 0.2075917748338463, "grad_norm": 752.8430786132812, "learning_rate": 3e-06, "loss": 11.047, "step": 2327 }, { "epoch": 0.20768098487889736, "grad_norm": 740.0889892578125, "learning_rate": 3e-06, "loss": 47.685, "step": 2328 }, { "completion_length": 122.72916793823242, "epoch": 0.20777019492394844, "grad_norm": 159.8445587158203, "learning_rate": 3e-06, "loss": -1.327, "reward": 2.3375418186187744, "reward_std": 0.14845435763709247, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17087500542402267, "step": 2329, "zero_std_ratio": 0.0 }, { "epoch": 0.20785940496899952, "grad_norm": 233.4280548095703, "learning_rate": 3e-06, "loss": 13.7035, "step": 2330 }, { "epoch": 0.20794861501405057, "grad_norm": 190.21124267578125, "learning_rate": 3e-06, "loss": 2.0237, "step": 2331 }, { "epoch": 0.20803782505910165, "grad_norm": 202.0067901611328, "learning_rate": 3e-06, "loss": 7.6713, "step": 2332 }, { "epoch": 0.20812703510415273, "grad_norm": 118.27416229248047, "learning_rate": 3e-06, "loss": 0.5948, "step": 2333 }, { "epoch": 0.2082162451492038, "grad_norm": 205.14324951171875, "learning_rate": 3e-06, "loss": 1.5616, "step": 2334 }, { "epoch": 0.20830545519425486, "grad_norm": 115.84490203857422, "learning_rate": 3e-06, "loss": -3.7101, "step": 2335 }, { "epoch": 0.20839466523930594, "grad_norm": 152.4305877685547, "learning_rate": 3e-06, "loss": 6.609, "step": 2336 }, { "epoch": 0.20848387528435702, "grad_norm": 93.83056640625, "learning_rate": 3e-06, "loss": -2.1607, "step": 2337 }, { "epoch": 0.2085730853294081, "grad_norm": 186.54685974121094, "learning_rate": 3e-06, "loss": 2.1806, "step": 2338 }, { "epoch": 0.20866229537445916, "grad_norm": 103.79989624023438, "learning_rate": 3e-06, "loss": -1.846, "step": 2339 }, { "epoch": 0.20875150541951024, "grad_norm": 98.66133117675781, "learning_rate": 3e-06, "loss": -3.5675, "step": 2340 }, { "completion_length": 128.33333587646484, "epoch": 0.20884071546456132, "grad_norm": 508.50653076171875, "learning_rate": 3e-06, "loss": 18.0618, "reward": 2.2047917246818542, "reward_std": 0.27334894239902496, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16312500834465027, "step": 2341, "zero_std_ratio": 0.0 }, { "epoch": 0.20892992550961237, "grad_norm": 535.1369018554688, "learning_rate": 3e-06, "loss": 25.3646, "step": 2342 }, { "epoch": 0.20901913555466345, "grad_norm": 366.54559326171875, "learning_rate": 3e-06, "loss": 4.7151, "step": 2343 }, { "epoch": 0.20910834559971453, "grad_norm": 501.6902160644531, "learning_rate": 3e-06, "loss": -1.087, "step": 2344 }, { "epoch": 0.2091975556447656, "grad_norm": 443.63360595703125, "learning_rate": 3e-06, "loss": 8.3783, "step": 2345 }, { "epoch": 0.20928676568981666, "grad_norm": 368.2359313964844, "learning_rate": 3e-06, "loss": 2.4934, "step": 2346 }, { "epoch": 0.20937597573486774, "grad_norm": 362.5126647949219, "learning_rate": 3e-06, "loss": 8.2788, "step": 2347 }, { "epoch": 0.20946518577991882, "grad_norm": 331.4849548339844, "learning_rate": 3e-06, "loss": 11.4291, "step": 2348 }, { "epoch": 0.2095543958249699, "grad_norm": 373.8135070800781, "learning_rate": 3e-06, "loss": -2.6253, "step": 2349 }, { "epoch": 0.20964360587002095, "grad_norm": 359.12152099609375, "learning_rate": 3e-06, "loss": -9.531, "step": 2350 }, { "epoch": 0.20973281591507204, "grad_norm": 387.0042724609375, "learning_rate": 3e-06, "loss": -0.5337, "step": 2351 }, { "epoch": 0.20982202596012312, "grad_norm": 312.2693786621094, "learning_rate": 3e-06, "loss": -4.1708, "step": 2352 }, { "completion_length": 140.35416793823242, "epoch": 0.2099112360051742, "grad_norm": 3304.60009765625, "learning_rate": 3e-06, "loss": -79.5805, "reward": 2.270458459854126, "reward_std": 0.6086882501840591, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11420833691954613, "step": 2353, "zero_std_ratio": 0.0 }, { "epoch": 0.21000044605022525, "grad_norm": 555.783203125, "learning_rate": 3e-06, "loss": -26.9599, "step": 2354 }, { "epoch": 0.21008965609527633, "grad_norm": 310.6244201660156, "learning_rate": 3e-06, "loss": -20.3898, "step": 2355 }, { "epoch": 0.2101788661403274, "grad_norm": 442.4192810058594, "learning_rate": 3e-06, "loss": -6.1292, "step": 2356 }, { "epoch": 0.21026807618537846, "grad_norm": 380.87841796875, "learning_rate": 3e-06, "loss": -28.3523, "step": 2357 }, { "epoch": 0.21035728623042954, "grad_norm": 546.93310546875, "learning_rate": 3e-06, "loss": -13.9403, "step": 2358 }, { "epoch": 0.21044649627548062, "grad_norm": 1282.0760498046875, "learning_rate": 3e-06, "loss": -75.0629, "step": 2359 }, { "epoch": 0.2105357063205317, "grad_norm": 411.43829345703125, "learning_rate": 3e-06, "loss": -26.5976, "step": 2360 }, { "epoch": 0.21062491636558275, "grad_norm": 287.0208435058594, "learning_rate": 3e-06, "loss": -20.6849, "step": 2361 }, { "epoch": 0.21071412641063383, "grad_norm": 571.956787109375, "learning_rate": 3e-06, "loss": -7.8208, "step": 2362 }, { "epoch": 0.21080333645568491, "grad_norm": 343.234130859375, "learning_rate": 3e-06, "loss": -29.511, "step": 2363 }, { "epoch": 0.210892546500736, "grad_norm": 552.2923583984375, "learning_rate": 3e-06, "loss": -17.5949, "step": 2364 }, { "completion_length": 126.52083587646484, "epoch": 0.21098175654578705, "grad_norm": 243.8163299560547, "learning_rate": 3e-06, "loss": 4.4713, "reward": 2.246416687965393, "reward_std": 0.34428833425045013, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15266667306423187, "step": 2365, "zero_std_ratio": 0.0 }, { "epoch": 0.21107096659083813, "grad_norm": 106.08045959472656, "learning_rate": 3e-06, "loss": 2.346, "step": 2366 }, { "epoch": 0.2111601766358892, "grad_norm": 143.98031616210938, "learning_rate": 3e-06, "loss": 3.9441, "step": 2367 }, { "epoch": 0.2112493866809403, "grad_norm": 106.59970092773438, "learning_rate": 3e-06, "loss": 0.7764, "step": 2368 }, { "epoch": 0.21133859672599134, "grad_norm": 135.23463439941406, "learning_rate": 3e-06, "loss": 5.1002, "step": 2369 }, { "epoch": 0.21142780677104242, "grad_norm": 195.45408630371094, "learning_rate": 3e-06, "loss": 14.522, "step": 2370 }, { "epoch": 0.2115170168160935, "grad_norm": 742.7892456054688, "learning_rate": 3e-06, "loss": 1.9066, "step": 2371 }, { "epoch": 0.21160622686114455, "grad_norm": 128.09744262695312, "learning_rate": 3e-06, "loss": 1.677, "step": 2372 }, { "epoch": 0.21169543690619563, "grad_norm": 118.60245513916016, "learning_rate": 3e-06, "loss": 1.7248, "step": 2373 }, { "epoch": 0.2117846469512467, "grad_norm": 97.40567779541016, "learning_rate": 3e-06, "loss": 0.1819, "step": 2374 }, { "epoch": 0.2118738569962978, "grad_norm": 134.66766357421875, "learning_rate": 3e-06, "loss": 2.7604, "step": 2375 }, { "epoch": 0.21196306704134885, "grad_norm": 491.87567138671875, "learning_rate": 3e-06, "loss": 12.4578, "step": 2376 }, { "completion_length": 113.10416793823242, "epoch": 0.21205227708639993, "grad_norm": 465.9616394042969, "learning_rate": 3e-06, "loss": -32.9042, "reward": 2.663749933242798, "reward_std": 0.13261950528249145, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20541664958000183, "step": 2377, "zero_std_ratio": 0.125 }, { "epoch": 0.212141487131451, "grad_norm": 558.760009765625, "learning_rate": 3e-06, "loss": -9.7375, "step": 2378 }, { "epoch": 0.2122306971765021, "grad_norm": 437.1868591308594, "learning_rate": 3e-06, "loss": -20.5382, "step": 2379 }, { "epoch": 0.21231990722155314, "grad_norm": 571.3277587890625, "learning_rate": 3e-06, "loss": -9.5875, "step": 2380 }, { "epoch": 0.21240911726660422, "grad_norm": 460.2364501953125, "learning_rate": 3e-06, "loss": -7.0278, "step": 2381 }, { "epoch": 0.2124983273116553, "grad_norm": 801.8722534179688, "learning_rate": 3e-06, "loss": -42.3189, "step": 2382 }, { "epoch": 0.21258753735670635, "grad_norm": 486.27691650390625, "learning_rate": 3e-06, "loss": -38.3283, "step": 2383 }, { "epoch": 0.21267674740175743, "grad_norm": 633.953125, "learning_rate": 3e-06, "loss": -15.9907, "step": 2384 }, { "epoch": 0.2127659574468085, "grad_norm": 500.3168640136719, "learning_rate": 3e-06, "loss": -26.8408, "step": 2385 }, { "epoch": 0.2128551674918596, "grad_norm": 779.6222534179688, "learning_rate": 3e-06, "loss": -14.4924, "step": 2386 }, { "epoch": 0.21294437753691065, "grad_norm": 469.1849670410156, "learning_rate": 3e-06, "loss": -10.9935, "step": 2387 }, { "epoch": 0.21303358758196173, "grad_norm": 550.7216796875, "learning_rate": 3e-06, "loss": -53.6791, "step": 2388 }, { "completion_length": 135.33333587646484, "epoch": 0.2131227976270128, "grad_norm": 290.79766845703125, "learning_rate": 3e-06, "loss": 9.7784, "reward": 2.2966458797454834, "reward_std": 0.6616353988647461, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12997916713356972, "step": 2389, "zero_std_ratio": 0.0 }, { "epoch": 0.21321200767206389, "grad_norm": 348.85821533203125, "learning_rate": 3e-06, "loss": 29.5501, "step": 2390 }, { "epoch": 0.21330121771711494, "grad_norm": 424.7781982421875, "learning_rate": 3e-06, "loss": 18.5572, "step": 2391 }, { "epoch": 0.21339042776216602, "grad_norm": 288.4269104003906, "learning_rate": 3e-06, "loss": 11.6147, "step": 2392 }, { "epoch": 0.2134796378072171, "grad_norm": 311.3384704589844, "learning_rate": 3e-06, "loss": 14.8, "step": 2393 }, { "epoch": 0.21356884785226818, "grad_norm": 581.3242797851562, "learning_rate": 3e-06, "loss": 26.3673, "step": 2394 }, { "epoch": 0.21365805789731923, "grad_norm": 243.1928253173828, "learning_rate": 3e-06, "loss": 8.8072, "step": 2395 }, { "epoch": 0.2137472679423703, "grad_norm": 464.5150451660156, "learning_rate": 3e-06, "loss": 27.1507, "step": 2396 }, { "epoch": 0.2138364779874214, "grad_norm": 379.3248596191406, "learning_rate": 3e-06, "loss": 12.6021, "step": 2397 }, { "epoch": 0.21392568803247244, "grad_norm": 306.65716552734375, "learning_rate": 3e-06, "loss": 9.6934, "step": 2398 }, { "epoch": 0.21401489807752352, "grad_norm": 254.15750122070312, "learning_rate": 3e-06, "loss": 9.9346, "step": 2399 }, { "epoch": 0.2141041081225746, "grad_norm": 396.16680908203125, "learning_rate": 3e-06, "loss": 19.5421, "step": 2400 }, { "completion_length": 153.3541717529297, "epoch": 0.21419331816762568, "grad_norm": 837.0029296875, "learning_rate": 3e-06, "loss": 17.6087, "reward": 2.1600000858306885, "reward_std": 0.44815221428871155, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08708333410322666, "step": 2401, "zero_std_ratio": 0.0 }, { "epoch": 0.21428252821267674, "grad_norm": 772.85107421875, "learning_rate": 3e-06, "loss": 8.4741, "step": 2402 }, { "epoch": 0.21437173825772782, "grad_norm": 622.2056884765625, "learning_rate": 3e-06, "loss": 23.1122, "step": 2403 }, { "epoch": 0.2144609483027789, "grad_norm": 931.6195678710938, "learning_rate": 3e-06, "loss": -2.0691, "step": 2404 }, { "epoch": 0.21455015834782998, "grad_norm": 945.195556640625, "learning_rate": 3e-06, "loss": 6.8255, "step": 2405 }, { "epoch": 0.21463936839288103, "grad_norm": 665.4218139648438, "learning_rate": 3e-06, "loss": -10.6024, "step": 2406 }, { "epoch": 0.2147285784379321, "grad_norm": 668.084228515625, "learning_rate": 3e-06, "loss": 12.1126, "step": 2407 }, { "epoch": 0.2148177884829832, "grad_norm": 681.3531494140625, "learning_rate": 3e-06, "loss": 3.2916, "step": 2408 }, { "epoch": 0.21490699852803424, "grad_norm": 429.6943664550781, "learning_rate": 3e-06, "loss": 18.1306, "step": 2409 }, { "epoch": 0.21499620857308532, "grad_norm": 625.8978271484375, "learning_rate": 3e-06, "loss": -4.3924, "step": 2410 }, { "epoch": 0.2150854186181364, "grad_norm": 1044.302978515625, "learning_rate": 3e-06, "loss": -1.7148, "step": 2411 }, { "epoch": 0.21517462866318748, "grad_norm": 526.33447265625, "learning_rate": 3e-06, "loss": -14.9186, "step": 2412 }, { "completion_length": 122.52083969116211, "epoch": 0.21526383870823854, "grad_norm": 1148.2313232421875, "learning_rate": 3e-06, "loss": -46.4753, "reward": 2.430062413215637, "reward_std": 0.4773147590458393, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18006250262260437, "step": 2413, "zero_std_ratio": 0.0 }, { "epoch": 0.21535304875328962, "grad_norm": 882.3959350585938, "learning_rate": 3e-06, "loss": -44.2003, "step": 2414 }, { "epoch": 0.2154422587983407, "grad_norm": 794.4973754882812, "learning_rate": 3e-06, "loss": -10.3375, "step": 2415 }, { "epoch": 0.21553146884339178, "grad_norm": 613.2156982421875, "learning_rate": 3e-06, "loss": -8.6654, "step": 2416 }, { "epoch": 0.21562067888844283, "grad_norm": 739.1935424804688, "learning_rate": 3e-06, "loss": -18.7665, "step": 2417 }, { "epoch": 0.2157098889334939, "grad_norm": 800.4462280273438, "learning_rate": 3e-06, "loss": -48.7075, "step": 2418 }, { "epoch": 0.215799098978545, "grad_norm": 938.549560546875, "learning_rate": 3e-06, "loss": -51.1103, "step": 2419 }, { "epoch": 0.21588830902359607, "grad_norm": 581.0545654296875, "learning_rate": 3e-06, "loss": -46.1839, "step": 2420 }, { "epoch": 0.21597751906864712, "grad_norm": 803.5621337890625, "learning_rate": 3e-06, "loss": -25.6564, "step": 2421 }, { "epoch": 0.2160667291136982, "grad_norm": 815.6629638671875, "learning_rate": 3e-06, "loss": -20.3486, "step": 2422 }, { "epoch": 0.21615593915874928, "grad_norm": 742.8468627929688, "learning_rate": 3e-06, "loss": -30.5293, "step": 2423 }, { "epoch": 0.21624514920380034, "grad_norm": 989.3768310546875, "learning_rate": 3e-06, "loss": -69.2167, "step": 2424 }, { "completion_length": 128.58333587646484, "epoch": 0.21633435924885142, "grad_norm": 279.8984069824219, "learning_rate": 3e-06, "loss": 14.9508, "reward": 1.7630833983421326, "reward_std": 0.2766040712594986, "rewards/correctness_reward_func": 1.1666666567325592, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.117249995470047, "step": 2425, "zero_std_ratio": 0.0 }, { "epoch": 0.2164235692939025, "grad_norm": 536.0889282226562, "learning_rate": 3e-06, "loss": -6.0752, "step": 2426 }, { "epoch": 0.21651277933895358, "grad_norm": 348.53533935546875, "learning_rate": 3e-06, "loss": 17.0228, "step": 2427 }, { "epoch": 0.21660198938400463, "grad_norm": 385.8553466796875, "learning_rate": 3e-06, "loss": 12.1111, "step": 2428 }, { "epoch": 0.2166911994290557, "grad_norm": 267.1854553222656, "learning_rate": 3e-06, "loss": 0.8226, "step": 2429 }, { "epoch": 0.2167804094741068, "grad_norm": 316.3680725097656, "learning_rate": 3e-06, "loss": 9.285, "step": 2430 }, { "epoch": 0.21686961951915787, "grad_norm": 280.73785400390625, "learning_rate": 3e-06, "loss": 13.0207, "step": 2431 }, { "epoch": 0.21695882956420892, "grad_norm": 346.0469970703125, "learning_rate": 3e-06, "loss": -5.4154, "step": 2432 }, { "epoch": 0.21704803960926, "grad_norm": 264.33837890625, "learning_rate": 3e-06, "loss": 11.8802, "step": 2433 }, { "epoch": 0.21713724965431108, "grad_norm": 304.8049011230469, "learning_rate": 3e-06, "loss": 3.9282, "step": 2434 }, { "epoch": 0.21722645969936213, "grad_norm": 163.74391174316406, "learning_rate": 3e-06, "loss": -4.001, "step": 2435 }, { "epoch": 0.21731566974441321, "grad_norm": 224.05996704101562, "learning_rate": 3e-06, "loss": 1.9239, "step": 2436 }, { "completion_length": 148.2916717529297, "epoch": 0.2174048797894643, "grad_norm": 737.8390502929688, "learning_rate": 3e-06, "loss": -72.0648, "reward": 2.1894376277923584, "reward_std": 0.16566037200391293, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1061041597276926, "step": 2437, "zero_std_ratio": 0.0 }, { "epoch": 0.21749408983451538, "grad_norm": 1339.2056884765625, "learning_rate": 3e-06, "loss": -92.2614, "step": 2438 }, { "epoch": 0.21758329987956643, "grad_norm": 112.15784454345703, "learning_rate": 3e-06, "loss": 2.3648, "step": 2439 }, { "epoch": 0.2176725099246175, "grad_norm": 210.3965606689453, "learning_rate": 3e-06, "loss": 1.6453, "step": 2440 }, { "epoch": 0.2177617199696686, "grad_norm": 2733.96630859375, "learning_rate": 3e-06, "loss": 86.5019, "step": 2441 }, { "epoch": 0.21785093001471967, "grad_norm": 440.70416259765625, "learning_rate": 3e-06, "loss": -15.824, "step": 2442 }, { "epoch": 0.21794014005977072, "grad_norm": 733.9408569335938, "learning_rate": 3e-06, "loss": -80.8303, "step": 2443 }, { "epoch": 0.2180293501048218, "grad_norm": 1372.98046875, "learning_rate": 3e-06, "loss": -108.3653, "step": 2444 }, { "epoch": 0.21811856014987288, "grad_norm": 158.97706604003906, "learning_rate": 3e-06, "loss": 0.8506, "step": 2445 }, { "epoch": 0.21820777019492396, "grad_norm": 297.0166320800781, "learning_rate": 3e-06, "loss": -1.8569, "step": 2446 }, { "epoch": 0.218296980239975, "grad_norm": 2724.94775390625, "learning_rate": 3e-06, "loss": 114.5594, "step": 2447 }, { "epoch": 0.2183861902850261, "grad_norm": 631.491943359375, "learning_rate": 3e-06, "loss": -27.9172, "step": 2448 }, { "completion_length": 117.02083969116211, "epoch": 0.21847540033007717, "grad_norm": 72.98030090332031, "learning_rate": 3e-06, "loss": -7.2195, "reward": 2.3603543043136597, "reward_std": 0.15350967459380627, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19368749111890793, "step": 2449, "zero_std_ratio": 0.125 }, { "epoch": 0.21856461037512823, "grad_norm": 69.76261901855469, "learning_rate": 3e-06, "loss": -6.6095, "step": 2450 }, { "epoch": 0.2186538204201793, "grad_norm": 119.00989532470703, "learning_rate": 3e-06, "loss": -3.7843, "step": 2451 }, { "epoch": 0.2187430304652304, "grad_norm": 81.48381805419922, "learning_rate": 3e-06, "loss": -6.6727, "step": 2452 }, { "epoch": 0.21883224051028147, "grad_norm": 96.27825927734375, "learning_rate": 3e-06, "loss": -7.5212, "step": 2453 }, { "epoch": 0.21892145055533252, "grad_norm": 74.60059356689453, "learning_rate": 3e-06, "loss": -8.8148, "step": 2454 }, { "epoch": 0.2190106606003836, "grad_norm": 78.35423278808594, "learning_rate": 3e-06, "loss": -8.4421, "step": 2455 }, { "epoch": 0.21909987064543468, "grad_norm": 144.65966796875, "learning_rate": 3e-06, "loss": -7.9481, "step": 2456 }, { "epoch": 0.21918908069048576, "grad_norm": 93.7354965209961, "learning_rate": 3e-06, "loss": -5.0046, "step": 2457 }, { "epoch": 0.2192782907355368, "grad_norm": 78.02239227294922, "learning_rate": 3e-06, "loss": -8.622, "step": 2458 }, { "epoch": 0.2193675007805879, "grad_norm": 82.25161743164062, "learning_rate": 3e-06, "loss": -9.508, "step": 2459 }, { "epoch": 0.21945671082563897, "grad_norm": 77.15231323242188, "learning_rate": 3e-06, "loss": -10.3714, "step": 2460 }, { "completion_length": 115.97916793823242, "epoch": 0.21954592087069005, "grad_norm": 39.886009216308594, "learning_rate": 3e-06, "loss": -4.294, "reward": 2.4212708473205566, "reward_std": 0.040800848975777626, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18168748915195465, "step": 2461, "zero_std_ratio": 0.0 }, { "epoch": 0.2196351309157411, "grad_norm": 45.25791549682617, "learning_rate": 3e-06, "loss": -2.4001, "step": 2462 }, { "epoch": 0.21972434096079219, "grad_norm": 47.455623626708984, "learning_rate": 3e-06, "loss": 0.16, "step": 2463 }, { "epoch": 0.21981355100584327, "grad_norm": 56.16099166870117, "learning_rate": 3e-06, "loss": -5.4067, "step": 2464 }, { "epoch": 0.21990276105089432, "grad_norm": 58.30009841918945, "learning_rate": 3e-06, "loss": -6.231, "step": 2465 }, { "epoch": 0.2199919710959454, "grad_norm": 55.359840393066406, "learning_rate": 3e-06, "loss": -2.543, "step": 2466 }, { "epoch": 0.22008118114099648, "grad_norm": 50.81299591064453, "learning_rate": 3e-06, "loss": -5.613, "step": 2467 }, { "epoch": 0.22017039118604756, "grad_norm": 54.18115234375, "learning_rate": 3e-06, "loss": -3.8278, "step": 2468 }, { "epoch": 0.2202596012310986, "grad_norm": 70.59385681152344, "learning_rate": 3e-06, "loss": -0.9699, "step": 2469 }, { "epoch": 0.2203488112761497, "grad_norm": 68.53688049316406, "learning_rate": 3e-06, "loss": -7.7968, "step": 2470 }, { "epoch": 0.22043802132120077, "grad_norm": 72.17977142333984, "learning_rate": 3e-06, "loss": -8.891, "step": 2471 }, { "epoch": 0.22052723136625185, "grad_norm": 76.02491760253906, "learning_rate": 3e-06, "loss": -4.5234, "step": 2472 }, { "completion_length": 120.66667175292969, "epoch": 0.2206164414113029, "grad_norm": 217.54934692382812, "learning_rate": 3e-06, "loss": -3.0888, "reward": 2.4115419387817383, "reward_std": 0.12016614899039268, "rewards/correctness_reward_func": 1.7083333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2032083421945572, "step": 2473, "zero_std_ratio": 0.0 }, { "epoch": 0.22070565145635399, "grad_norm": 235.37335205078125, "learning_rate": 3e-06, "loss": 5.7274, "step": 2474 }, { "epoch": 0.22079486150140507, "grad_norm": 458.1533508300781, "learning_rate": 3e-06, "loss": -5.1241, "step": 2475 }, { "epoch": 0.22088407154645612, "grad_norm": 265.38818359375, "learning_rate": 3e-06, "loss": 9.2465, "step": 2476 }, { "epoch": 0.2209732815915072, "grad_norm": 494.6804504394531, "learning_rate": 3e-06, "loss": -4.1893, "step": 2477 }, { "epoch": 0.22106249163655828, "grad_norm": 272.2371826171875, "learning_rate": 3e-06, "loss": 9.0183, "step": 2478 }, { "epoch": 0.22115170168160936, "grad_norm": 322.984130859375, "learning_rate": 3e-06, "loss": -1.4007, "step": 2479 }, { "epoch": 0.2212409117266604, "grad_norm": 377.4932556152344, "learning_rate": 3e-06, "loss": 6.0472, "step": 2480 }, { "epoch": 0.2213301217717115, "grad_norm": 765.150634765625, "learning_rate": 3e-06, "loss": -13.7039, "step": 2481 }, { "epoch": 0.22141933181676257, "grad_norm": 315.2571716308594, "learning_rate": 3e-06, "loss": 10.4126, "step": 2482 }, { "epoch": 0.22150854186181365, "grad_norm": 702.4007568359375, "learning_rate": 3e-06, "loss": -10.1122, "step": 2483 }, { "epoch": 0.2215977519068647, "grad_norm": 322.5043029785156, "learning_rate": 3e-06, "loss": 7.4748, "step": 2484 }, { "completion_length": 136.4791717529297, "epoch": 0.22168696195191578, "grad_norm": 666.5755004882812, "learning_rate": 3e-06, "loss": 3.2721, "reward": 2.0126249194145203, "reward_std": 0.5076068788766861, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14804167300462723, "step": 2485, "zero_std_ratio": 0.0 }, { "epoch": 0.22177617199696686, "grad_norm": 1157.80908203125, "learning_rate": 3e-06, "loss": 35.3796, "step": 2486 }, { "epoch": 0.22186538204201794, "grad_norm": 964.0335693359375, "learning_rate": 3e-06, "loss": 27.2501, "step": 2487 }, { "epoch": 0.221954592087069, "grad_norm": 1680.4920654296875, "learning_rate": 3e-06, "loss": 11.387, "step": 2488 }, { "epoch": 0.22204380213212008, "grad_norm": 1438.4658203125, "learning_rate": 3e-06, "loss": 46.2915, "step": 2489 }, { "epoch": 0.22213301217717116, "grad_norm": 1312.2196044921875, "learning_rate": 3e-06, "loss": 53.6756, "step": 2490 }, { "epoch": 0.2222222222222222, "grad_norm": 598.6546630859375, "learning_rate": 3e-06, "loss": -0.5956, "step": 2491 }, { "epoch": 0.2223114322672733, "grad_norm": 1167.168212890625, "learning_rate": 3e-06, "loss": 18.1678, "step": 2492 }, { "epoch": 0.22240064231232437, "grad_norm": 714.756103515625, "learning_rate": 3e-06, "loss": 9.6702, "step": 2493 }, { "epoch": 0.22248985235737545, "grad_norm": 964.5927124023438, "learning_rate": 3e-06, "loss": -3.789, "step": 2494 }, { "epoch": 0.2225790624024265, "grad_norm": 930.6976928710938, "learning_rate": 3e-06, "loss": 25.0124, "step": 2495 }, { "epoch": 0.22266827244747758, "grad_norm": 747.74365234375, "learning_rate": 3e-06, "loss": 36.8743, "step": 2496 }, { "completion_length": 115.33333587646484, "epoch": 0.22275748249252866, "grad_norm": 283.8410339355469, "learning_rate": 3e-06, "loss": -6.8542, "reward": 2.154166579246521, "reward_std": 0.4607701599597931, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21666666120290756, "step": 2497, "zero_std_ratio": 0.0 }, { "epoch": 0.22284669253757974, "grad_norm": 285.5289611816406, "learning_rate": 3e-06, "loss": -18.1384, "step": 2498 }, { "epoch": 0.2229359025826308, "grad_norm": 412.27838134765625, "learning_rate": 3e-06, "loss": -1.6925, "step": 2499 }, { "epoch": 0.22302511262768188, "grad_norm": 322.4535827636719, "learning_rate": 3e-06, "loss": -7.0813, "step": 2500 } ], "logging_steps": 1, "max_steps": 112090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }