{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08921004505107274, "eval_steps": 1000, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 225.1666717529297, "epoch": 8.921004505107276e-05, "grad_norm": 62.695068359375, "learning_rate": 2.5e-07, "loss": 12.7202, "reward": 0.19306249171495438, "reward_std": 0.5882241576910019, "rewards/correctness_reward_func": 0.3333333432674408, "rewards/int_reward_func": 0.0833333358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22360415756702423, "step": 1, "zero_std_ratio": 0.0 }, { "epoch": 0.00017842009010214551, "grad_norm": 59.6503791809082, "learning_rate": 5e-07, "loss": 13.0732, "step": 2 }, { "epoch": 0.00026763013515321824, "grad_norm": 69.65516662597656, "learning_rate": 7.5e-07, "loss": 12.9681, "step": 3 }, { "epoch": 0.00035684018020429103, "grad_norm": 57.81648635864258, "learning_rate": 1e-06, "loss": 8.1042, "step": 4 }, { "epoch": 0.00044605022525536376, "grad_norm": 57.6408576965332, "learning_rate": 1.25e-06, "loss": 8.6056, "step": 5 }, { "epoch": 0.0005352602703064365, "grad_norm": 58.459903717041016, "learning_rate": 1.5e-06, "loss": 10.4929, "step": 6 }, { "epoch": 0.0006244703153575092, "grad_norm": 62.41658020019531, "learning_rate": 1.7500000000000002e-06, "loss": 13.1206, "step": 7 }, { "epoch": 0.0007136803604085821, "grad_norm": 66.22370910644531, "learning_rate": 2e-06, "loss": 13.2007, "step": 8 }, { "epoch": 0.0008028904054596548, "grad_norm": 66.21946716308594, "learning_rate": 2.25e-06, "loss": 12.3522, "step": 9 }, { "epoch": 0.0008921004505107275, "grad_norm": 65.43058776855469, "learning_rate": 2.5e-06, "loss": 7.9566, "step": 10 }, { "epoch": 0.0009813104955618004, "grad_norm": 54.532962799072266, "learning_rate": 2.75e-06, "loss": 8.8616, "step": 11 }, { "epoch": 0.001070520540612873, "grad_norm": 56.53645706176758, "learning_rate": 3e-06, "loss": 10.3095, "step": 12 }, { "completion_length": 222.9791717529297, "epoch": 0.0011597305856639458, "grad_norm": 78.4708023071289, "learning_rate": 3e-06, "loss": -24.6031, "reward": 0.011666670441627502, "reward_std": 0.524684801697731, "rewards/correctness_reward_func": 0.1250000037252903, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550000049173832, "step": 13, "zero_std_ratio": 0.0 }, { "epoch": 0.0012489406307150184, "grad_norm": 88.30101776123047, "learning_rate": 3e-06, "loss": -18.3145, "step": 14 }, { "epoch": 0.0013381506757660913, "grad_norm": 101.97128295898438, "learning_rate": 3e-06, "loss": -7.4151, "step": 15 }, { "epoch": 0.0014273607208171641, "grad_norm": 91.58382415771484, "learning_rate": 3e-06, "loss": -8.9073, "step": 16 }, { "epoch": 0.0015165707658682367, "grad_norm": 90.10670471191406, "learning_rate": 3e-06, "loss": -13.5176, "step": 17 }, { "epoch": 0.0016057808109193096, "grad_norm": 80.67254638671875, "learning_rate": 3e-06, "loss": -17.2813, "step": 18 }, { "epoch": 0.0016949908559703822, "grad_norm": 75.51331329345703, "learning_rate": 3e-06, "loss": -24.6926, "step": 19 }, { "epoch": 0.001784200901021455, "grad_norm": 78.15167999267578, "learning_rate": 3e-06, "loss": -18.5973, "step": 20 }, { "epoch": 0.0018734109460725277, "grad_norm": 89.70745086669922, "learning_rate": 3e-06, "loss": -7.9364, "step": 21 }, { "epoch": 0.0019626209911236007, "grad_norm": 89.28164672851562, "learning_rate": 3e-06, "loss": -9.434, "step": 22 }, { "epoch": 0.002051831036174673, "grad_norm": 98.30489349365234, "learning_rate": 3e-06, "loss": -14.6494, "step": 23 }, { "epoch": 0.002141041081225746, "grad_norm": 92.3221206665039, "learning_rate": 3e-06, "loss": -17.5654, "step": 24 }, { "completion_length": 224.95834350585938, "epoch": 0.002230251126276819, "grad_norm": 77.60931396484375, "learning_rate": 3e-06, "loss": 12.7299, "reward": 0.08664583414793015, "reward_std": 0.529650554060936, "rewards/correctness_reward_func": 0.1666666679084301, "rewards/int_reward_func": 0.041666666977107525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12168749794363976, "step": 25, "zero_std_ratio": 0.0 }, { "epoch": 0.0023194611713278916, "grad_norm": 55.48664474487305, "learning_rate": 3e-06, "loss": 11.3297, "step": 26 }, { "epoch": 0.0024086712163789645, "grad_norm": 64.88197326660156, "learning_rate": 3e-06, "loss": 7.6398, "step": 27 }, { "epoch": 0.002497881261430037, "grad_norm": 66.41521453857422, "learning_rate": 3e-06, "loss": 10.9742, "step": 28 }, { "epoch": 0.0025870913064811097, "grad_norm": 60.356266021728516, "learning_rate": 3e-06, "loss": 18.3629, "step": 29 }, { "epoch": 0.0026763013515321826, "grad_norm": 67.53816986083984, "learning_rate": 3e-06, "loss": 10.3122, "step": 30 }, { "epoch": 0.0027655113965832554, "grad_norm": 81.81299591064453, "learning_rate": 3e-06, "loss": 12.4031, "step": 31 }, { "epoch": 0.0028547214416343282, "grad_norm": 58.01384735107422, "learning_rate": 3e-06, "loss": 11.3115, "step": 32 }, { "epoch": 0.0029439314866854006, "grad_norm": 60.38798522949219, "learning_rate": 3e-06, "loss": 7.5438, "step": 33 }, { "epoch": 0.0030331415317364735, "grad_norm": 76.68485260009766, "learning_rate": 3e-06, "loss": 9.8314, "step": 34 }, { "epoch": 0.0031223515767875463, "grad_norm": 63.667381286621094, "learning_rate": 3e-06, "loss": 18.0907, "step": 35 }, { "epoch": 0.003211561621838619, "grad_norm": 64.93324279785156, "learning_rate": 3e-06, "loss": 9.6529, "step": 36 }, { "completion_length": 200.9166717529297, "epoch": 0.003300771666889692, "grad_norm": 55.603302001953125, "learning_rate": 3e-06, "loss": -2.1381, "reward": 0.20900000631809235, "reward_std": 0.5408279597759247, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1347499955445528, "step": 37, "zero_std_ratio": 0.0 }, { "epoch": 0.0033899817119407644, "grad_norm": 59.070777893066406, "learning_rate": 3e-06, "loss": -6.7825, "step": 38 }, { "epoch": 0.0034791917569918372, "grad_norm": 73.52457427978516, "learning_rate": 3e-06, "loss": -11.5592, "step": 39 }, { "epoch": 0.00356840180204291, "grad_norm": 68.8139419555664, "learning_rate": 3e-06, "loss": -3.9847, "step": 40 }, { "epoch": 0.003657611847093983, "grad_norm": 74.64259338378906, "learning_rate": 3e-06, "loss": -7.7023, "step": 41 }, { "epoch": 0.0037468218921450553, "grad_norm": 68.76261901855469, "learning_rate": 3e-06, "loss": -11.4536, "step": 42 }, { "epoch": 0.003836031937196128, "grad_norm": 57.10056686401367, "learning_rate": 3e-06, "loss": -3.0195, "step": 43 }, { "epoch": 0.003925241982247201, "grad_norm": 57.4798583984375, "learning_rate": 3e-06, "loss": -7.3677, "step": 44 }, { "epoch": 0.004014452027298274, "grad_norm": 62.251949310302734, "learning_rate": 3e-06, "loss": -12.4481, "step": 45 }, { "epoch": 0.004103662072349346, "grad_norm": 67.0556640625, "learning_rate": 3e-06, "loss": -4.2431, "step": 46 }, { "epoch": 0.0041928721174004195, "grad_norm": 79.22687530517578, "learning_rate": 3e-06, "loss": -8.9896, "step": 47 }, { "epoch": 0.004282082162451492, "grad_norm": 83.6895980834961, "learning_rate": 3e-06, "loss": -12.4645, "step": 48 }, { "completion_length": 186.0, "epoch": 0.004371292207502565, "grad_norm": 187.8282928466797, "learning_rate": 3e-06, "loss": -44.8522, "reward": 0.5453333556652069, "reward_std": 0.9364342093467712, "rewards/correctness_reward_func": 0.4583333358168602, "rewards/int_reward_func": 0.15625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06924999551847577, "step": 49, "zero_std_ratio": 0.0 }, { "epoch": 0.004460502252553638, "grad_norm": 92.9270248413086, "learning_rate": 3e-06, "loss": -41.2773, "step": 50 }, { "epoch": 0.00454971229760471, "grad_norm": 212.18917846679688, "learning_rate": 3e-06, "loss": -42.3882, "step": 51 }, { "epoch": 0.004638922342655783, "grad_norm": 102.22235870361328, "learning_rate": 3e-06, "loss": -42.8879, "step": 52 }, { "epoch": 0.004728132387706856, "grad_norm": 79.1269302368164, "learning_rate": 3e-06, "loss": -44.708, "step": 53 }, { "epoch": 0.004817342432757929, "grad_norm": 94.53079986572266, "learning_rate": 3e-06, "loss": -41.656, "step": 54 }, { "epoch": 0.004906552477809001, "grad_norm": 91.7303695678711, "learning_rate": 3e-06, "loss": -45.3257, "step": 55 }, { "epoch": 0.004995762522860074, "grad_norm": 92.66773986816406, "learning_rate": 3e-06, "loss": -41.4113, "step": 56 }, { "epoch": 0.005084972567911147, "grad_norm": 123.76467895507812, "learning_rate": 3e-06, "loss": -43.4643, "step": 57 }, { "epoch": 0.0051741826129622194, "grad_norm": 109.21142578125, "learning_rate": 3e-06, "loss": -44.7136, "step": 58 }, { "epoch": 0.005263392658013293, "grad_norm": 83.24272155761719, "learning_rate": 3e-06, "loss": -45.7862, "step": 59 }, { "epoch": 0.005352602703064365, "grad_norm": 94.45966339111328, "learning_rate": 3e-06, "loss": -42.5492, "step": 60 }, { "completion_length": 225.625, "epoch": 0.0054418127481154375, "grad_norm": 125.85611724853516, "learning_rate": 3e-06, "loss": 28.3825, "reward": 0.31822918355464935, "reward_std": 0.9613562524318695, "rewards/correctness_reward_func": 0.375, "rewards/int_reward_func": 0.1145833320915699, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17135417833924294, "step": 61, "zero_std_ratio": 0.0 }, { "epoch": 0.005531022793166511, "grad_norm": 128.64669799804688, "learning_rate": 3e-06, "loss": 4.8596, "step": 62 }, { "epoch": 0.005620232838217583, "grad_norm": 287.9391784667969, "learning_rate": 3e-06, "loss": 20.5521, "step": 63 }, { "epoch": 0.0057094428832686565, "grad_norm": 111.01509857177734, "learning_rate": 3e-06, "loss": 16.6241, "step": 64 }, { "epoch": 0.005798652928319729, "grad_norm": 123.25679016113281, "learning_rate": 3e-06, "loss": 6.8919, "step": 65 }, { "epoch": 0.005887862973370801, "grad_norm": 115.68987274169922, "learning_rate": 3e-06, "loss": 19.3061, "step": 66 }, { "epoch": 0.0059770730184218746, "grad_norm": 128.9923553466797, "learning_rate": 3e-06, "loss": 27.4792, "step": 67 }, { "epoch": 0.006066283063472947, "grad_norm": 130.64230346679688, "learning_rate": 3e-06, "loss": 3.8702, "step": 68 }, { "epoch": 0.00615549310852402, "grad_norm": 169.2925262451172, "learning_rate": 3e-06, "loss": 19.2163, "step": 69 }, { "epoch": 0.006244703153575093, "grad_norm": 104.88905334472656, "learning_rate": 3e-06, "loss": 14.5854, "step": 70 }, { "epoch": 0.006333913198626165, "grad_norm": 134.32022094726562, "learning_rate": 3e-06, "loss": 5.6117, "step": 71 }, { "epoch": 0.006423123243677238, "grad_norm": 124.52132415771484, "learning_rate": 3e-06, "loss": 18.0908, "step": 72 }, { "completion_length": 203.81250762939453, "epoch": 0.006512333288728311, "grad_norm": 87.01981353759766, "learning_rate": 3e-06, "loss": 14.5961, "reward": 0.25443750619888306, "reward_std": 0.6893003582954407, "rewards/correctness_reward_func": 0.2916666641831398, "rewards/int_reward_func": 0.09374999813735485, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13097916916012764, "step": 73, "zero_std_ratio": 0.0 }, { "epoch": 0.006601543333779384, "grad_norm": 83.70246887207031, "learning_rate": 3e-06, "loss": 20.7203, "step": 74 }, { "epoch": 0.006690753378830456, "grad_norm": 80.23466491699219, "learning_rate": 3e-06, "loss": 30.3319, "step": 75 }, { "epoch": 0.006779963423881529, "grad_norm": 74.68209838867188, "learning_rate": 3e-06, "loss": 23.2, "step": 76 }, { "epoch": 0.006869173468932602, "grad_norm": 81.28849029541016, "learning_rate": 3e-06, "loss": 11.7216, "step": 77 }, { "epoch": 0.0069583835139836745, "grad_norm": 85.60411071777344, "learning_rate": 3e-06, "loss": 19.9348, "step": 78 }, { "epoch": 0.007047593559034747, "grad_norm": 95.26403045654297, "learning_rate": 3e-06, "loss": 13.5735, "step": 79 }, { "epoch": 0.00713680360408582, "grad_norm": 81.69352722167969, "learning_rate": 3e-06, "loss": 19.4906, "step": 80 }, { "epoch": 0.0072260136491368926, "grad_norm": 80.9581527709961, "learning_rate": 3e-06, "loss": 29.1989, "step": 81 }, { "epoch": 0.007315223694187966, "grad_norm": 87.37995147705078, "learning_rate": 3e-06, "loss": 23.4541, "step": 82 }, { "epoch": 0.007404433739239038, "grad_norm": 90.7470932006836, "learning_rate": 3e-06, "loss": 10.7907, "step": 83 }, { "epoch": 0.007493643784290111, "grad_norm": 352.26953125, "learning_rate": 3e-06, "loss": 18.1423, "step": 84 }, { "completion_length": 185.6875, "epoch": 0.007582853829341184, "grad_norm": 78.5768051147461, "learning_rate": 3e-06, "loss": -2.0743, "reward": 0.6054166778922081, "reward_std": 0.8349271714687347, "rewards/correctness_reward_func": 0.5, "rewards/int_reward_func": 0.125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.019583335146307945, "step": 85, "zero_std_ratio": 0.0 }, { "epoch": 0.007672063874392256, "grad_norm": 125.65332794189453, "learning_rate": 3e-06, "loss": -18.0183, "step": 86 }, { "epoch": 0.00776127391944333, "grad_norm": 93.01673889160156, "learning_rate": 3e-06, "loss": -2.9219, "step": 87 }, { "epoch": 0.007850483964494403, "grad_norm": 85.38358306884766, "learning_rate": 3e-06, "loss": -7.3606, "step": 88 }, { "epoch": 0.007939694009545474, "grad_norm": 99.59243774414062, "learning_rate": 3e-06, "loss": -18.9376, "step": 89 }, { "epoch": 0.008028904054596548, "grad_norm": 96.83404541015625, "learning_rate": 3e-06, "loss": -7.9748, "step": 90 }, { "epoch": 0.008118114099647621, "grad_norm": 81.16954803466797, "learning_rate": 3e-06, "loss": -4.2134, "step": 91 }, { "epoch": 0.008207324144698692, "grad_norm": 123.15869140625, "learning_rate": 3e-06, "loss": -19.8823, "step": 92 }, { "epoch": 0.008296534189749766, "grad_norm": 93.05419158935547, "learning_rate": 3e-06, "loss": -4.5813, "step": 93 }, { "epoch": 0.008385744234800839, "grad_norm": 106.2331314086914, "learning_rate": 3e-06, "loss": -8.6969, "step": 94 }, { "epoch": 0.00847495427985191, "grad_norm": 99.65939331054688, "learning_rate": 3e-06, "loss": -21.3275, "step": 95 }, { "epoch": 0.008564164324902984, "grad_norm": 94.40375518798828, "learning_rate": 3e-06, "loss": -9.7937, "step": 96 }, { "completion_length": 217.25000762939453, "epoch": 0.008653374369954057, "grad_norm": 133.5598907470703, "learning_rate": 3e-06, "loss": -68.7329, "reward": 0.6968958526849747, "reward_std": 0.7409922480583191, "rewards/correctness_reward_func": 0.5833333283662796, "rewards/int_reward_func": 0.1979166641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08435416780412197, "step": 97, "zero_std_ratio": 0.0 }, { "epoch": 0.00874258441500513, "grad_norm": 136.60848999023438, "learning_rate": 3e-06, "loss": -74.5256, "step": 98 }, { "epoch": 0.008831794460056202, "grad_norm": 123.70120239257812, "learning_rate": 3e-06, "loss": -59.6774, "step": 99 }, { "epoch": 0.008921004505107275, "grad_norm": 150.22532653808594, "learning_rate": 3e-06, "loss": -69.5624, "step": 100 }, { "epoch": 0.009010214550158348, "grad_norm": 126.68507385253906, "learning_rate": 3e-06, "loss": -62.8973, "step": 101 }, { "epoch": 0.00909942459520942, "grad_norm": 105.47962951660156, "learning_rate": 3e-06, "loss": -61.6182, "step": 102 }, { "epoch": 0.009188634640260493, "grad_norm": 144.26048278808594, "learning_rate": 3e-06, "loss": -70.5109, "step": 103 }, { "epoch": 0.009277844685311567, "grad_norm": 141.22325134277344, "learning_rate": 3e-06, "loss": -76.5479, "step": 104 }, { "epoch": 0.009367054730362638, "grad_norm": 139.37173461914062, "learning_rate": 3e-06, "loss": -62.353, "step": 105 }, { "epoch": 0.009456264775413711, "grad_norm": 150.77801513671875, "learning_rate": 3e-06, "loss": -72.2384, "step": 106 }, { "epoch": 0.009545474820464785, "grad_norm": 138.2374267578125, "learning_rate": 3e-06, "loss": -65.3746, "step": 107 }, { "epoch": 0.009634684865515858, "grad_norm": 132.50453186035156, "learning_rate": 3e-06, "loss": -64.141, "step": 108 }, { "completion_length": 198.125, "epoch": 0.00972389491056693, "grad_norm": 187.5413055419922, "learning_rate": 3e-06, "loss": 44.1489, "reward": 0.7788957953453064, "reward_std": 0.7549726963043213, "rewards/correctness_reward_func": 0.6666666567325592, "rewards/int_reward_func": 0.1770833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06485416181385517, "step": 109, "zero_std_ratio": 0.0 }, { "epoch": 0.009813104955618003, "grad_norm": 138.89434814453125, "learning_rate": 3e-06, "loss": 51.6646, "step": 110 }, { "epoch": 0.009902315000669076, "grad_norm": 128.95484924316406, "learning_rate": 3e-06, "loss": 31.7993, "step": 111 }, { "epoch": 0.009991525045720148, "grad_norm": 126.7931900024414, "learning_rate": 3e-06, "loss": 38.5454, "step": 112 }, { "epoch": 0.01008073509077122, "grad_norm": 125.33599853515625, "learning_rate": 3e-06, "loss": 40.3822, "step": 113 }, { "epoch": 0.010169945135822294, "grad_norm": 139.41482543945312, "learning_rate": 3e-06, "loss": 32.5052, "step": 114 }, { "epoch": 0.010259155180873366, "grad_norm": 169.09432983398438, "learning_rate": 3e-06, "loss": 43.5542, "step": 115 }, { "epoch": 0.010348365225924439, "grad_norm": 133.872802734375, "learning_rate": 3e-06, "loss": 50.3469, "step": 116 }, { "epoch": 0.010437575270975512, "grad_norm": 125.77018737792969, "learning_rate": 3e-06, "loss": 31.112, "step": 117 }, { "epoch": 0.010526785316026585, "grad_norm": 128.32257080078125, "learning_rate": 3e-06, "loss": 36.629, "step": 118 }, { "epoch": 0.010615995361077657, "grad_norm": 124.38401794433594, "learning_rate": 3e-06, "loss": 39.8284, "step": 119 }, { "epoch": 0.01070520540612873, "grad_norm": 138.24668884277344, "learning_rate": 3e-06, "loss": 31.1433, "step": 120 }, { "completion_length": 191.6041717529297, "epoch": 0.010794415451179804, "grad_norm": 199.1646270751953, "learning_rate": 3e-06, "loss": 86.314, "reward": 1.0014583468437195, "reward_std": 0.8148851096630096, "rewards/correctness_reward_func": 0.875, "rewards/int_reward_func": 0.2291666641831398, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10270833596587181, "step": 121, "zero_std_ratio": 0.0 }, { "epoch": 0.010883625496230875, "grad_norm": 195.7254638671875, "learning_rate": 3e-06, "loss": 76.1289, "step": 122 }, { "epoch": 0.010972835541281948, "grad_norm": 175.13900756835938, "learning_rate": 3e-06, "loss": 86.1448, "step": 123 }, { "epoch": 0.011062045586333022, "grad_norm": 182.21661376953125, "learning_rate": 3e-06, "loss": 90.0805, "step": 124 }, { "epoch": 0.011151255631384093, "grad_norm": 189.17214965820312, "learning_rate": 3e-06, "loss": 76.0951, "step": 125 }, { "epoch": 0.011240465676435166, "grad_norm": 195.55718994140625, "learning_rate": 3e-06, "loss": 89.5242, "step": 126 }, { "epoch": 0.01132967572148624, "grad_norm": 171.1396484375, "learning_rate": 3e-06, "loss": 82.3705, "step": 127 }, { "epoch": 0.011418885766537313, "grad_norm": 189.04995727539062, "learning_rate": 3e-06, "loss": 71.8677, "step": 128 }, { "epoch": 0.011508095811588384, "grad_norm": 162.9297332763672, "learning_rate": 3e-06, "loss": 81.2432, "step": 129 }, { "epoch": 0.011597305856639458, "grad_norm": 173.23104858398438, "learning_rate": 3e-06, "loss": 85.8069, "step": 130 }, { "epoch": 0.011686515901690531, "grad_norm": 162.6637420654297, "learning_rate": 3e-06, "loss": 69.8347, "step": 131 }, { "epoch": 0.011775725946741603, "grad_norm": 190.06675720214844, "learning_rate": 3e-06, "loss": 84.5222, "step": 132 }, { "completion_length": 207.33334350585938, "epoch": 0.011864935991792676, "grad_norm": 147.16957092285156, "learning_rate": 3e-06, "loss": -63.4697, "reward": 0.6791666746139526, "reward_std": 1.0425111949443817, "rewards/correctness_reward_func": 0.5833333134651184, "rewards/int_reward_func": 0.1770833358168602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08124999818392098, "step": 133, "zero_std_ratio": 0.0 }, { "epoch": 0.011954146036843749, "grad_norm": 154.6719970703125, "learning_rate": 3e-06, "loss": -70.5112, "step": 134 }, { "epoch": 0.01204335608189482, "grad_norm": 137.32408142089844, "learning_rate": 3e-06, "loss": -41.1322, "step": 135 }, { "epoch": 0.012132566126945894, "grad_norm": 126.37704467773438, "learning_rate": 3e-06, "loss": -53.2367, "step": 136 }, { "epoch": 0.012221776171996967, "grad_norm": 152.24891662597656, "learning_rate": 3e-06, "loss": -58.4567, "step": 137 }, { "epoch": 0.01231098621704804, "grad_norm": 116.28028106689453, "learning_rate": 3e-06, "loss": -46.2973, "step": 138 }, { "epoch": 0.012400196262099112, "grad_norm": 152.08795166015625, "learning_rate": 3e-06, "loss": -62.8325, "step": 139 }, { "epoch": 0.012489406307150185, "grad_norm": 146.10671997070312, "learning_rate": 3e-06, "loss": -71.6559, "step": 140 }, { "epoch": 0.012578616352201259, "grad_norm": 149.14556884765625, "learning_rate": 3e-06, "loss": -42.1534, "step": 141 }, { "epoch": 0.01266782639725233, "grad_norm": 151.06182861328125, "learning_rate": 3e-06, "loss": -55.6968, "step": 142 }, { "epoch": 0.012757036442303403, "grad_norm": 145.29530334472656, "learning_rate": 3e-06, "loss": -60.2759, "step": 143 }, { "epoch": 0.012846246487354477, "grad_norm": 124.00696563720703, "learning_rate": 3e-06, "loss": -48.5856, "step": 144 }, { "completion_length": 200.14583587646484, "epoch": 0.012935456532405548, "grad_norm": 104.97675323486328, "learning_rate": 3e-06, "loss": -22.2294, "reward": 0.21922918409109116, "reward_std": 0.6296879947185516, "rewards/correctness_reward_func": 0.2916666716337204, "rewards/int_reward_func": 0.062499999068677425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13493750616908073, "step": 145, "zero_std_ratio": 0.0 }, { "epoch": 0.013024666577456621, "grad_norm": 83.18937683105469, "learning_rate": 3e-06, "loss": -22.3765, "step": 146 }, { "epoch": 0.013113876622507695, "grad_norm": 96.22801971435547, "learning_rate": 3e-06, "loss": -22.5564, "step": 147 }, { "epoch": 0.013203086667558768, "grad_norm": 102.87374877929688, "learning_rate": 3e-06, "loss": -24.9001, "step": 148 }, { "epoch": 0.01329229671260984, "grad_norm": 110.96674346923828, "learning_rate": 3e-06, "loss": -18.8972, "step": 149 }, { "epoch": 0.013381506757660913, "grad_norm": 91.87604522705078, "learning_rate": 3e-06, "loss": -18.1615, "step": 150 }, { "epoch": 0.013470716802711986, "grad_norm": 88.4422836303711, "learning_rate": 3e-06, "loss": -23.0431, "step": 151 }, { "epoch": 0.013559926847763058, "grad_norm": 83.86327362060547, "learning_rate": 3e-06, "loss": -23.25, "step": 152 }, { "epoch": 0.01364913689281413, "grad_norm": 82.81922149658203, "learning_rate": 3e-06, "loss": -23.1331, "step": 153 }, { "epoch": 0.013738346937865204, "grad_norm": 104.8452377319336, "learning_rate": 3e-06, "loss": -26.8428, "step": 154 }, { "epoch": 0.013827556982916276, "grad_norm": 92.94257354736328, "learning_rate": 3e-06, "loss": -20.0667, "step": 155 }, { "epoch": 0.013916767027967349, "grad_norm": 84.95638275146484, "learning_rate": 3e-06, "loss": -19.1472, "step": 156 }, { "completion_length": 231.7916717529297, "epoch": 0.014005977073018422, "grad_norm": 139.96688842773438, "learning_rate": 3e-06, "loss": 21.1797, "reward": 0.6720625460147858, "reward_std": 0.9181468784809113, "rewards/correctness_reward_func": 0.5833333432674408, "rewards/int_reward_func": 0.2187500074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13002083078026772, "step": 157, "zero_std_ratio": 0.0 }, { "epoch": 0.014095187118069494, "grad_norm": 109.121337890625, "learning_rate": 3e-06, "loss": 10.5428, "step": 158 }, { "epoch": 0.014184397163120567, "grad_norm": 94.9039306640625, "learning_rate": 3e-06, "loss": 4.8126, "step": 159 }, { "epoch": 0.01427360720817164, "grad_norm": 109.7251968383789, "learning_rate": 3e-06, "loss": 5.2961, "step": 160 }, { "epoch": 0.014362817253222714, "grad_norm": 103.42703247070312, "learning_rate": 3e-06, "loss": 4.0648, "step": 161 }, { "epoch": 0.014452027298273785, "grad_norm": 127.93770599365234, "learning_rate": 3e-06, "loss": 7.0101, "step": 162 }, { "epoch": 0.014541237343324858, "grad_norm": 145.8150634765625, "learning_rate": 3e-06, "loss": 18.3559, "step": 163 }, { "epoch": 0.014630447388375932, "grad_norm": 116.2653579711914, "learning_rate": 3e-06, "loss": 8.3424, "step": 164 }, { "epoch": 0.014719657433427003, "grad_norm": 104.55130767822266, "learning_rate": 3e-06, "loss": 2.084, "step": 165 }, { "epoch": 0.014808867478478076, "grad_norm": 114.84294128417969, "learning_rate": 3e-06, "loss": 2.2571, "step": 166 }, { "epoch": 0.01489807752352915, "grad_norm": 99.8189468383789, "learning_rate": 3e-06, "loss": 0.9219, "step": 167 }, { "epoch": 0.014987287568580221, "grad_norm": 142.80715942382812, "learning_rate": 3e-06, "loss": 3.505, "step": 168 }, { "completion_length": 214.77083587646484, "epoch": 0.015076497613631295, "grad_norm": 114.3720703125, "learning_rate": 3e-06, "loss": -32.1329, "reward": 1.0511458218097687, "reward_std": 1.0028848350048065, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.3125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17802083492279053, "step": 169, "zero_std_ratio": 0.0 }, { "epoch": 0.015165707658682368, "grad_norm": 100.3597412109375, "learning_rate": 3e-06, "loss": -38.7511, "step": 170 }, { "epoch": 0.015254917703733441, "grad_norm": 108.30574035644531, "learning_rate": 3e-06, "loss": -46.3083, "step": 171 }, { "epoch": 0.015344127748784513, "grad_norm": 116.34545135498047, "learning_rate": 3e-06, "loss": -39.7363, "step": 172 }, { "epoch": 0.015433337793835586, "grad_norm": 113.52851104736328, "learning_rate": 3e-06, "loss": -34.5686, "step": 173 }, { "epoch": 0.01552254783888666, "grad_norm": 110.65509796142578, "learning_rate": 3e-06, "loss": -32.8796, "step": 174 }, { "epoch": 0.01561175788393773, "grad_norm": 107.06590270996094, "learning_rate": 3e-06, "loss": -32.6552, "step": 175 }, { "epoch": 0.015700967928988806, "grad_norm": 100.2861557006836, "learning_rate": 3e-06, "loss": -39.6106, "step": 176 }, { "epoch": 0.015790177974039876, "grad_norm": 107.69467163085938, "learning_rate": 3e-06, "loss": -46.9244, "step": 177 }, { "epoch": 0.01587938801909095, "grad_norm": 96.8420181274414, "learning_rate": 3e-06, "loss": -40.93, "step": 178 }, { "epoch": 0.015968598064142022, "grad_norm": 113.12389373779297, "learning_rate": 3e-06, "loss": -37.0258, "step": 179 }, { "epoch": 0.016057808109193095, "grad_norm": 116.10971069335938, "learning_rate": 3e-06, "loss": -34.9046, "step": 180 }, { "completion_length": 218.64583587646484, "epoch": 0.01614701815424417, "grad_norm": 307.6673889160156, "learning_rate": 3e-06, "loss": -29.0457, "reward": 1.4360832571983337, "reward_std": 1.0610616505146027, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15766665898263454, "step": 181, "zero_std_ratio": 0.0 }, { "epoch": 0.016236228199295242, "grad_norm": 125.99212646484375, "learning_rate": 3e-06, "loss": 1.016, "step": 182 }, { "epoch": 0.01632543824434631, "grad_norm": 112.21588897705078, "learning_rate": 3e-06, "loss": -17.115, "step": 183 }, { "epoch": 0.016414648289397385, "grad_norm": 118.06622314453125, "learning_rate": 3e-06, "loss": -0.4864, "step": 184 }, { "epoch": 0.016503858334448458, "grad_norm": 116.36631774902344, "learning_rate": 3e-06, "loss": 3.4437, "step": 185 }, { "epoch": 0.01659306837949953, "grad_norm": 124.60052490234375, "learning_rate": 3e-06, "loss": -27.5515, "step": 186 }, { "epoch": 0.016682278424550605, "grad_norm": 160.65628051757812, "learning_rate": 3e-06, "loss": -29.3863, "step": 187 }, { "epoch": 0.016771488469601678, "grad_norm": 127.9763412475586, "learning_rate": 3e-06, "loss": 0.7423, "step": 188 }, { "epoch": 0.01686069851465275, "grad_norm": 116.69316101074219, "learning_rate": 3e-06, "loss": -18.6518, "step": 189 }, { "epoch": 0.01694990855970382, "grad_norm": 114.2183609008789, "learning_rate": 3e-06, "loss": -0.87, "step": 190 }, { "epoch": 0.017039118604754894, "grad_norm": 126.1614761352539, "learning_rate": 3e-06, "loss": 2.8213, "step": 191 }, { "epoch": 0.017128328649805968, "grad_norm": 134.43527221679688, "learning_rate": 3e-06, "loss": -28.8518, "step": 192 }, { "completion_length": 247.08334350585938, "epoch": 0.01721753869485704, "grad_norm": 111.3412094116211, "learning_rate": 3e-06, "loss": -7.8814, "reward": 1.0157291293144226, "reward_std": 0.7945153564214706, "rewards/correctness_reward_func": 0.9166666865348816, "rewards/int_reward_func": 0.2916666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1926041767001152, "step": 193, "zero_std_ratio": 0.0 }, { "epoch": 0.017306748739908114, "grad_norm": 116.42599487304688, "learning_rate": 3e-06, "loss": -15.1245, "step": 194 }, { "epoch": 0.017395958784959187, "grad_norm": 136.37391662597656, "learning_rate": 3e-06, "loss": -9.2426, "step": 195 }, { "epoch": 0.01748516883001026, "grad_norm": 97.36872863769531, "learning_rate": 3e-06, "loss": -10.8671, "step": 196 }, { "epoch": 0.01757437887506133, "grad_norm": 125.0397720336914, "learning_rate": 3e-06, "loss": -7.0755, "step": 197 }, { "epoch": 0.017663588920112404, "grad_norm": 171.17971801757812, "learning_rate": 3e-06, "loss": -16.797, "step": 198 }, { "epoch": 0.017752798965163477, "grad_norm": 100.81266021728516, "learning_rate": 3e-06, "loss": -8.4577, "step": 199 }, { "epoch": 0.01784200901021455, "grad_norm": 127.79389953613281, "learning_rate": 3e-06, "loss": -16.3874, "step": 200 }, { "epoch": 0.017931219055265624, "grad_norm": 131.9748077392578, "learning_rate": 3e-06, "loss": -10.7973, "step": 201 }, { "epoch": 0.018020429100316697, "grad_norm": 100.95606231689453, "learning_rate": 3e-06, "loss": -12.0026, "step": 202 }, { "epoch": 0.018109639145367767, "grad_norm": 131.19261169433594, "learning_rate": 3e-06, "loss": -8.3155, "step": 203 }, { "epoch": 0.01819884919041884, "grad_norm": 164.74656677246094, "learning_rate": 3e-06, "loss": -18.9275, "step": 204 }, { "completion_length": 191.7291717529297, "epoch": 0.018288059235469913, "grad_norm": 150.95191955566406, "learning_rate": 3e-06, "loss": 50.6719, "reward": 1.3118958473205566, "reward_std": 0.8902758955955505, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.34375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03185417060740292, "step": 205, "zero_std_ratio": 0.0 }, { "epoch": 0.018377269280520987, "grad_norm": 128.34344482421875, "learning_rate": 3e-06, "loss": 32.3302, "step": 206 }, { "epoch": 0.01846647932557206, "grad_norm": 136.15789794921875, "learning_rate": 3e-06, "loss": 33.8857, "step": 207 }, { "epoch": 0.018555689370623133, "grad_norm": 140.50901794433594, "learning_rate": 3e-06, "loss": 28.924, "step": 208 }, { "epoch": 0.018644899415674206, "grad_norm": 168.0647430419922, "learning_rate": 3e-06, "loss": 31.3019, "step": 209 }, { "epoch": 0.018734109460725276, "grad_norm": 133.79208374023438, "learning_rate": 3e-06, "loss": 31.6401, "step": 210 }, { "epoch": 0.01882331950577635, "grad_norm": 161.34898376464844, "learning_rate": 3e-06, "loss": 49.2047, "step": 211 }, { "epoch": 0.018912529550827423, "grad_norm": 129.22007751464844, "learning_rate": 3e-06, "loss": 30.2, "step": 212 }, { "epoch": 0.019001739595878496, "grad_norm": 143.37449645996094, "learning_rate": 3e-06, "loss": 31.2762, "step": 213 }, { "epoch": 0.01909094964092957, "grad_norm": 140.57894897460938, "learning_rate": 3e-06, "loss": 26.7715, "step": 214 }, { "epoch": 0.019180159685980643, "grad_norm": 148.71348571777344, "learning_rate": 3e-06, "loss": 28.729, "step": 215 }, { "epoch": 0.019269369731031716, "grad_norm": 137.0448455810547, "learning_rate": 3e-06, "loss": 29.3048, "step": 216 }, { "completion_length": 241.0, "epoch": 0.019358579776082786, "grad_norm": 167.66650390625, "learning_rate": 3e-06, "loss": -43.2087, "reward": 1.6041667461395264, "reward_std": 0.9945478439331055, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1979166641831398, "step": 217, "zero_std_ratio": 0.0 }, { "epoch": 0.01944778982113386, "grad_norm": 168.0265350341797, "learning_rate": 3e-06, "loss": -45.6767, "step": 218 }, { "epoch": 0.019536999866184932, "grad_norm": 148.4340362548828, "learning_rate": 3e-06, "loss": -30.5571, "step": 219 }, { "epoch": 0.019626209911236005, "grad_norm": 139.6564178466797, "learning_rate": 3e-06, "loss": -44.7743, "step": 220 }, { "epoch": 0.01971541995628708, "grad_norm": 147.22129821777344, "learning_rate": 3e-06, "loss": -41.9365, "step": 221 }, { "epoch": 0.019804630001338152, "grad_norm": 190.81561279296875, "learning_rate": 3e-06, "loss": -48.2229, "step": 222 }, { "epoch": 0.019893840046389222, "grad_norm": 165.86917114257812, "learning_rate": 3e-06, "loss": -43.317, "step": 223 }, { "epoch": 0.019983050091440295, "grad_norm": 162.9475555419922, "learning_rate": 3e-06, "loss": -48.1878, "step": 224 }, { "epoch": 0.02007226013649137, "grad_norm": 179.08360290527344, "learning_rate": 3e-06, "loss": -33.3052, "step": 225 }, { "epoch": 0.02016147018154244, "grad_norm": 133.29290771484375, "learning_rate": 3e-06, "loss": -45.8993, "step": 226 }, { "epoch": 0.020250680226593515, "grad_norm": 155.86611938476562, "learning_rate": 3e-06, "loss": -43.9261, "step": 227 }, { "epoch": 0.020339890271644588, "grad_norm": 154.34974670410156, "learning_rate": 3e-06, "loss": -50.4381, "step": 228 }, { "completion_length": 239.43750762939453, "epoch": 0.02042910031669566, "grad_norm": 105.5196304321289, "learning_rate": 3e-06, "loss": 0.4059, "reward": 1.5316042304039001, "reward_std": 0.8583633303642273, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.3541666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19756250455975533, "step": 229, "zero_std_ratio": 0.0 }, { "epoch": 0.02051831036174673, "grad_norm": 119.5712890625, "learning_rate": 3e-06, "loss": -3.2594, "step": 230 }, { "epoch": 0.020607520406797804, "grad_norm": 128.1366424560547, "learning_rate": 3e-06, "loss": -12.0605, "step": 231 }, { "epoch": 0.020696730451848878, "grad_norm": 126.55559539794922, "learning_rate": 3e-06, "loss": -15.4799, "step": 232 }, { "epoch": 0.02078594049689995, "grad_norm": 137.93882751464844, "learning_rate": 3e-06, "loss": -18.5312, "step": 233 }, { "epoch": 0.020875150541951024, "grad_norm": 108.0162124633789, "learning_rate": 3e-06, "loss": -11.7573, "step": 234 }, { "epoch": 0.020964360587002098, "grad_norm": 118.95193481445312, "learning_rate": 3e-06, "loss": -1.1434, "step": 235 }, { "epoch": 0.02105357063205317, "grad_norm": 126.50416564941406, "learning_rate": 3e-06, "loss": -3.7423, "step": 236 }, { "epoch": 0.02114278067710424, "grad_norm": 130.68190002441406, "learning_rate": 3e-06, "loss": -14.5207, "step": 237 }, { "epoch": 0.021231990722155314, "grad_norm": 129.162109375, "learning_rate": 3e-06, "loss": -16.3237, "step": 238 }, { "epoch": 0.021321200767206387, "grad_norm": 145.95396423339844, "learning_rate": 3e-06, "loss": -20.6294, "step": 239 }, { "epoch": 0.02141041081225746, "grad_norm": 107.8385009765625, "learning_rate": 3e-06, "loss": -14.0773, "step": 240 }, { "completion_length": 206.9791717529297, "epoch": 0.021499620857308534, "grad_norm": 102.38019561767578, "learning_rate": 3e-06, "loss": -32.3486, "reward": 1.035479187965393, "reward_std": 0.7589404881000519, "rewards/correctness_reward_func": 0.8750000149011612, "rewards/int_reward_func": 0.3020833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14160417020320892, "step": 241, "zero_std_ratio": 0.0 }, { "epoch": 0.021588830902359607, "grad_norm": 110.24679565429688, "learning_rate": 3e-06, "loss": -38.6199, "step": 242 }, { "epoch": 0.021678040947410677, "grad_norm": 118.22930145263672, "learning_rate": 3e-06, "loss": -52.9139, "step": 243 }, { "epoch": 0.02176725099246175, "grad_norm": 118.6080322265625, "learning_rate": 3e-06, "loss": -43.3805, "step": 244 }, { "epoch": 0.021856461037512823, "grad_norm": 106.9905776977539, "learning_rate": 3e-06, "loss": -36.7945, "step": 245 }, { "epoch": 0.021945671082563897, "grad_norm": 111.37010955810547, "learning_rate": 3e-06, "loss": -36.4452, "step": 246 }, { "epoch": 0.02203488112761497, "grad_norm": 104.93065643310547, "learning_rate": 3e-06, "loss": -34.2096, "step": 247 }, { "epoch": 0.022124091172666043, "grad_norm": 117.96737670898438, "learning_rate": 3e-06, "loss": -40.621, "step": 248 }, { "epoch": 0.022213301217717116, "grad_norm": 118.701904296875, "learning_rate": 3e-06, "loss": -54.4138, "step": 249 }, { "epoch": 0.022302511262768186, "grad_norm": 118.43307495117188, "learning_rate": 3e-06, "loss": -45.0393, "step": 250 }, { "epoch": 0.02239172130781926, "grad_norm": 114.41901397705078, "learning_rate": 3e-06, "loss": -37.6304, "step": 251 }, { "epoch": 0.022480931352870333, "grad_norm": 123.03970336914062, "learning_rate": 3e-06, "loss": -39.0638, "step": 252 }, { "completion_length": 211.20833587646484, "epoch": 0.022570141397921406, "grad_norm": 139.677734375, "learning_rate": 3e-06, "loss": -27.6756, "reward": 1.5570417046546936, "reward_std": 1.1208258867263794, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14087500423192978, "step": 253, "zero_std_ratio": 0.0 }, { "epoch": 0.02265935144297248, "grad_norm": 145.56021118164062, "learning_rate": 3e-06, "loss": -40.3289, "step": 254 }, { "epoch": 0.022748561488023553, "grad_norm": 138.8564453125, "learning_rate": 3e-06, "loss": -39.1766, "step": 255 }, { "epoch": 0.022837771533074626, "grad_norm": 229.50186157226562, "learning_rate": 3e-06, "loss": -43.9568, "step": 256 }, { "epoch": 0.022926981578125696, "grad_norm": 138.42791748046875, "learning_rate": 3e-06, "loss": -52.4297, "step": 257 }, { "epoch": 0.02301619162317677, "grad_norm": 147.58364868164062, "learning_rate": 3e-06, "loss": -53.5477, "step": 258 }, { "epoch": 0.023105401668227842, "grad_norm": 140.5048828125, "learning_rate": 3e-06, "loss": -28.1418, "step": 259 }, { "epoch": 0.023194611713278915, "grad_norm": 139.11508178710938, "learning_rate": 3e-06, "loss": -42.7612, "step": 260 }, { "epoch": 0.02328382175832999, "grad_norm": 146.18580627441406, "learning_rate": 3e-06, "loss": -39.909, "step": 261 }, { "epoch": 0.023373031803381062, "grad_norm": 264.3643493652344, "learning_rate": 3e-06, "loss": -46.2595, "step": 262 }, { "epoch": 0.023462241848432132, "grad_norm": 154.1084747314453, "learning_rate": 3e-06, "loss": -55.2424, "step": 263 }, { "epoch": 0.023551451893483205, "grad_norm": 156.28662109375, "learning_rate": 3e-06, "loss": -55.6531, "step": 264 }, { "completion_length": 227.14583587646484, "epoch": 0.02364066193853428, "grad_norm": 115.90379333496094, "learning_rate": 3e-06, "loss": -30.7492, "reward": 1.6066043376922607, "reward_std": 0.8875448107719421, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.3958333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": -0.09131250530481339, "step": 265, "zero_std_ratio": 0.0 }, { "epoch": 0.02372987198358535, "grad_norm": 113.86587524414062, "learning_rate": 3e-06, "loss": -29.6989, "step": 266 }, { "epoch": 0.023819082028636425, "grad_norm": 110.4273681640625, "learning_rate": 3e-06, "loss": -32.0614, "step": 267 }, { "epoch": 0.023908292073687498, "grad_norm": 111.84119415283203, "learning_rate": 3e-06, "loss": -31.9073, "step": 268 }, { "epoch": 0.02399750211873857, "grad_norm": 103.93081665039062, "learning_rate": 3e-06, "loss": -22.3506, "step": 269 }, { "epoch": 0.02408671216378964, "grad_norm": 120.32383728027344, "learning_rate": 3e-06, "loss": -28.5629, "step": 270 }, { "epoch": 0.024175922208840715, "grad_norm": 124.92536163330078, "learning_rate": 3e-06, "loss": -33.0776, "step": 271 }, { "epoch": 0.024265132253891788, "grad_norm": 119.54340362548828, "learning_rate": 3e-06, "loss": -31.6735, "step": 272 }, { "epoch": 0.02435434229894286, "grad_norm": 128.8444061279297, "learning_rate": 3e-06, "loss": -33.8033, "step": 273 }, { "epoch": 0.024443552343993934, "grad_norm": 123.08969116210938, "learning_rate": 3e-06, "loss": -34.4538, "step": 274 }, { "epoch": 0.024532762389045008, "grad_norm": 111.98983001708984, "learning_rate": 3e-06, "loss": -24.6449, "step": 275 }, { "epoch": 0.02462197243409608, "grad_norm": 123.31842041015625, "learning_rate": 3e-06, "loss": -31.1417, "step": 276 }, { "completion_length": 211.20833587646484, "epoch": 0.02471118247914715, "grad_norm": 83.42295837402344, "learning_rate": 3e-06, "loss": -49.3896, "reward": 1.6565834283828735, "reward_std": 0.7390342950820923, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13508333638310432, "step": 277, "zero_std_ratio": 0.0 }, { "epoch": 0.024800392524198224, "grad_norm": 78.70240783691406, "learning_rate": 3e-06, "loss": -60.2538, "step": 278 }, { "epoch": 0.024889602569249297, "grad_norm": 87.03772735595703, "learning_rate": 3e-06, "loss": -54.701, "step": 279 }, { "epoch": 0.02497881261430037, "grad_norm": 105.03215789794922, "learning_rate": 3e-06, "loss": -50.647, "step": 280 }, { "epoch": 0.025068022659351444, "grad_norm": 94.19722747802734, "learning_rate": 3e-06, "loss": -53.7356, "step": 281 }, { "epoch": 0.025157232704402517, "grad_norm": 71.46943664550781, "learning_rate": 3e-06, "loss": -54.5847, "step": 282 }, { "epoch": 0.025246442749453587, "grad_norm": 90.4788589477539, "learning_rate": 3e-06, "loss": -50.5539, "step": 283 }, { "epoch": 0.02533565279450466, "grad_norm": 74.81779479980469, "learning_rate": 3e-06, "loss": -61.3813, "step": 284 }, { "epoch": 0.025424862839555733, "grad_norm": 85.80409240722656, "learning_rate": 3e-06, "loss": -55.7379, "step": 285 }, { "epoch": 0.025514072884606807, "grad_norm": 135.24191284179688, "learning_rate": 3e-06, "loss": -52.1614, "step": 286 }, { "epoch": 0.02560328292965788, "grad_norm": 94.01042175292969, "learning_rate": 3e-06, "loss": -55.4857, "step": 287 }, { "epoch": 0.025692492974708953, "grad_norm": 72.32071685791016, "learning_rate": 3e-06, "loss": -56.3565, "step": 288 }, { "completion_length": 221.45833587646484, "epoch": 0.025781703019760027, "grad_norm": 225.52276611328125, "learning_rate": 3e-06, "loss": -67.7081, "reward": 1.960687518119812, "reward_std": 0.8211362063884735, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11222916468977928, "step": 289, "zero_std_ratio": 0.0 }, { "epoch": 0.025870913064811096, "grad_norm": 239.94651794433594, "learning_rate": 3e-06, "loss": -66.0587, "step": 290 }, { "epoch": 0.02596012310986217, "grad_norm": 173.2037353515625, "learning_rate": 3e-06, "loss": -59.416, "step": 291 }, { "epoch": 0.026049333154913243, "grad_norm": 228.50621032714844, "learning_rate": 3e-06, "loss": -70.6059, "step": 292 }, { "epoch": 0.026138543199964316, "grad_norm": 213.36802673339844, "learning_rate": 3e-06, "loss": -68.8733, "step": 293 }, { "epoch": 0.02622775324501539, "grad_norm": 389.8759460449219, "learning_rate": 3e-06, "loss": -108.725, "step": 294 }, { "epoch": 0.026316963290066463, "grad_norm": 241.96009826660156, "learning_rate": 3e-06, "loss": -73.0107, "step": 295 }, { "epoch": 0.026406173335117536, "grad_norm": 282.705322265625, "learning_rate": 3e-06, "loss": -71.4601, "step": 296 }, { "epoch": 0.026495383380168606, "grad_norm": 182.99859619140625, "learning_rate": 3e-06, "loss": -62.8503, "step": 297 }, { "epoch": 0.02658459342521968, "grad_norm": 237.8432159423828, "learning_rate": 3e-06, "loss": -76.095, "step": 298 }, { "epoch": 0.026673803470270752, "grad_norm": 224.10140991210938, "learning_rate": 3e-06, "loss": -71.9696, "step": 299 }, { "epoch": 0.026763013515321826, "grad_norm": 401.25421142578125, "learning_rate": 3e-06, "loss": -119.7468, "step": 300 }, { "completion_length": 181.00000762939453, "epoch": 0.0268522235603729, "grad_norm": 102.42095184326172, "learning_rate": 3e-06, "loss": 11.7663, "reward": 1.6927291750907898, "reward_std": 0.8399400115013123, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0468541644513607, "step": 301, "zero_std_ratio": 0.0 }, { "epoch": 0.026941433605423972, "grad_norm": 97.59688568115234, "learning_rate": 3e-06, "loss": 4.3671, "step": 302 }, { "epoch": 0.027030643650475042, "grad_norm": 119.19691467285156, "learning_rate": 3e-06, "loss": 3.0122, "step": 303 }, { "epoch": 0.027119853695526115, "grad_norm": 102.54327392578125, "learning_rate": 3e-06, "loss": 5.7653, "step": 304 }, { "epoch": 0.02720906374057719, "grad_norm": 127.24678802490234, "learning_rate": 3e-06, "loss": 9.0808, "step": 305 }, { "epoch": 0.02729827378562826, "grad_norm": 115.35128784179688, "learning_rate": 3e-06, "loss": 9.7375, "step": 306 }, { "epoch": 0.027387483830679335, "grad_norm": 109.96597290039062, "learning_rate": 3e-06, "loss": 10.9794, "step": 307 }, { "epoch": 0.02747669387573041, "grad_norm": 116.67013549804688, "learning_rate": 3e-06, "loss": 4.0605, "step": 308 }, { "epoch": 0.02756590392078148, "grad_norm": 100.0082015991211, "learning_rate": 3e-06, "loss": 2.0719, "step": 309 }, { "epoch": 0.02765511396583255, "grad_norm": 103.2455062866211, "learning_rate": 3e-06, "loss": 3.753, "step": 310 }, { "epoch": 0.027744324010883625, "grad_norm": 139.74317932128906, "learning_rate": 3e-06, "loss": 7.435, "step": 311 }, { "epoch": 0.027833534055934698, "grad_norm": 126.05006408691406, "learning_rate": 3e-06, "loss": 8.1927, "step": 312 }, { "completion_length": 187.87500762939453, "epoch": 0.02792274410098577, "grad_norm": 182.04180908203125, "learning_rate": 3e-06, "loss": -90.2214, "reward": 1.3959375023841858, "reward_std": 0.7920421957969666, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09364583343267441, "step": 313, "zero_std_ratio": 0.0 }, { "epoch": 0.028011954146036844, "grad_norm": 238.57090759277344, "learning_rate": 3e-06, "loss": -100.422, "step": 314 }, { "epoch": 0.028101164191087918, "grad_norm": 235.39544677734375, "learning_rate": 3e-06, "loss": -102.3354, "step": 315 }, { "epoch": 0.028190374236138988, "grad_norm": 223.8190460205078, "learning_rate": 3e-06, "loss": -109.9957, "step": 316 }, { "epoch": 0.02827958428119006, "grad_norm": 225.00672912597656, "learning_rate": 3e-06, "loss": -109.9375, "step": 317 }, { "epoch": 0.028368794326241134, "grad_norm": 247.57774353027344, "learning_rate": 3e-06, "loss": -125.4302, "step": 318 }, { "epoch": 0.028458004371292207, "grad_norm": 193.24212646484375, "learning_rate": 3e-06, "loss": -93.1797, "step": 319 }, { "epoch": 0.02854721441634328, "grad_norm": 264.4795227050781, "learning_rate": 3e-06, "loss": -104.7149, "step": 320 }, { "epoch": 0.028636424461394354, "grad_norm": 226.05810546875, "learning_rate": 3e-06, "loss": -107.5763, "step": 321 }, { "epoch": 0.028725634506445427, "grad_norm": 239.6378173828125, "learning_rate": 3e-06, "loss": -115.9954, "step": 322 }, { "epoch": 0.028814844551496497, "grad_norm": 240.8443145751953, "learning_rate": 3e-06, "loss": -117.6999, "step": 323 }, { "epoch": 0.02890405459654757, "grad_norm": 261.65643310546875, "learning_rate": 3e-06, "loss": -132.8027, "step": 324 }, { "completion_length": 189.8541717529297, "epoch": 0.028993264641598643, "grad_norm": 168.10452270507812, "learning_rate": 3e-06, "loss": 12.7588, "reward": 1.7772499918937683, "reward_std": 0.9346717596054077, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4479166567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.045666664838790894, "step": 325, "zero_std_ratio": 0.0 }, { "epoch": 0.029082474686649717, "grad_norm": 182.863037109375, "learning_rate": 3e-06, "loss": 0.3851, "step": 326 }, { "epoch": 0.02917168473170079, "grad_norm": 214.54574584960938, "learning_rate": 3e-06, "loss": 17.6492, "step": 327 }, { "epoch": 0.029260894776751863, "grad_norm": 187.80931091308594, "learning_rate": 3e-06, "loss": 7.692, "step": 328 }, { "epoch": 0.029350104821802937, "grad_norm": 195.0843505859375, "learning_rate": 3e-06, "loss": 12.8044, "step": 329 }, { "epoch": 0.029439314866854006, "grad_norm": 168.82028198242188, "learning_rate": 3e-06, "loss": -7.5147, "step": 330 }, { "epoch": 0.02952852491190508, "grad_norm": 176.14859008789062, "learning_rate": 3e-06, "loss": 10.9965, "step": 331 }, { "epoch": 0.029617734956956153, "grad_norm": 202.02247619628906, "learning_rate": 3e-06, "loss": -1.8138, "step": 332 }, { "epoch": 0.029706945002007226, "grad_norm": 216.37252807617188, "learning_rate": 3e-06, "loss": 15.5179, "step": 333 }, { "epoch": 0.0297961550470583, "grad_norm": 200.23558044433594, "learning_rate": 3e-06, "loss": 5.0549, "step": 334 }, { "epoch": 0.029885365092109373, "grad_norm": 177.7020263671875, "learning_rate": 3e-06, "loss": 11.199, "step": 335 }, { "epoch": 0.029974575137160443, "grad_norm": 170.23106384277344, "learning_rate": 3e-06, "loss": -10.9367, "step": 336 }, { "completion_length": 224.9166717529297, "epoch": 0.030063785182211516, "grad_norm": 127.8658218383789, "learning_rate": 3e-06, "loss": -59.3587, "reward": 1.6053959131240845, "reward_std": 0.5731277614831924, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1550208330154419, "step": 337, "zero_std_ratio": 0.0 }, { "epoch": 0.03015299522726259, "grad_norm": 133.41494750976562, "learning_rate": 3e-06, "loss": -53.4241, "step": 338 }, { "epoch": 0.030242205272313662, "grad_norm": 170.7308807373047, "learning_rate": 3e-06, "loss": -65.4722, "step": 339 }, { "epoch": 0.030331415317364736, "grad_norm": 172.28118896484375, "learning_rate": 3e-06, "loss": -53.201, "step": 340 }, { "epoch": 0.03042062536241581, "grad_norm": 118.70462799072266, "learning_rate": 3e-06, "loss": -50.8363, "step": 341 }, { "epoch": 0.030509835407466882, "grad_norm": 143.119384765625, "learning_rate": 3e-06, "loss": -60.764, "step": 342 }, { "epoch": 0.030599045452517952, "grad_norm": 143.7277374267578, "learning_rate": 3e-06, "loss": -62.8186, "step": 343 }, { "epoch": 0.030688255497569025, "grad_norm": 157.5625, "learning_rate": 3e-06, "loss": -57.4741, "step": 344 }, { "epoch": 0.0307774655426201, "grad_norm": 191.64804077148438, "learning_rate": 3e-06, "loss": -71.2662, "step": 345 }, { "epoch": 0.030866675587671172, "grad_norm": 206.0039520263672, "learning_rate": 3e-06, "loss": -56.9883, "step": 346 }, { "epoch": 0.030955885632722245, "grad_norm": 132.1703643798828, "learning_rate": 3e-06, "loss": -54.0822, "step": 347 }, { "epoch": 0.03104509567777332, "grad_norm": 144.338623046875, "learning_rate": 3e-06, "loss": -66.1545, "step": 348 }, { "completion_length": 161.1041717529297, "epoch": 0.03113430572282439, "grad_norm": 176.29396057128906, "learning_rate": 3e-06, "loss": 2.568, "reward": 1.8858751058578491, "reward_std": 0.5198497474193573, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021291667595505714, "step": 349, "zero_std_ratio": 0.0 }, { "epoch": 0.03122351576787546, "grad_norm": 162.1043701171875, "learning_rate": 3e-06, "loss": 11.6461, "step": 350 }, { "epoch": 0.03131272581292654, "grad_norm": 147.42918395996094, "learning_rate": 3e-06, "loss": 19.3425, "step": 351 }, { "epoch": 0.03140193585797761, "grad_norm": 146.65992736816406, "learning_rate": 3e-06, "loss": 4.1848, "step": 352 }, { "epoch": 0.03149114590302868, "grad_norm": 137.17474365234375, "learning_rate": 3e-06, "loss": 9.5202, "step": 353 }, { "epoch": 0.03158035594807975, "grad_norm": 176.24244689941406, "learning_rate": 3e-06, "loss": 24.3653, "step": 354 }, { "epoch": 0.031669565993130824, "grad_norm": 176.59144592285156, "learning_rate": 3e-06, "loss": 1.4274, "step": 355 }, { "epoch": 0.0317587760381819, "grad_norm": 161.0966339111328, "learning_rate": 3e-06, "loss": 10.7367, "step": 356 }, { "epoch": 0.03184798608323297, "grad_norm": 164.76675415039062, "learning_rate": 3e-06, "loss": 18.5651, "step": 357 }, { "epoch": 0.031937196128284044, "grad_norm": 227.15631103515625, "learning_rate": 3e-06, "loss": 2.7137, "step": 358 }, { "epoch": 0.03202640617333512, "grad_norm": 152.36514282226562, "learning_rate": 3e-06, "loss": 8.0471, "step": 359 }, { "epoch": 0.03211561621838619, "grad_norm": 162.82603454589844, "learning_rate": 3e-06, "loss": 24.2858, "step": 360 }, { "completion_length": 173.6666717529297, "epoch": 0.032204826263437264, "grad_norm": 112.11152648925781, "learning_rate": 3e-06, "loss": -47.6777, "reward": 1.9486668109893799, "reward_std": 0.614804282784462, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04091667756438255, "step": 361, "zero_std_ratio": 0.0 }, { "epoch": 0.03229403630848834, "grad_norm": 212.41893005371094, "learning_rate": 3e-06, "loss": -57.8799, "step": 362 }, { "epoch": 0.03238324635353941, "grad_norm": 129.3856201171875, "learning_rate": 3e-06, "loss": -42.8209, "step": 363 }, { "epoch": 0.032472456398590484, "grad_norm": 90.02410888671875, "learning_rate": 3e-06, "loss": -37.0157, "step": 364 }, { "epoch": 0.03256166644364156, "grad_norm": 122.2002944946289, "learning_rate": 3e-06, "loss": -42.0898, "step": 365 }, { "epoch": 0.03265087648869262, "grad_norm": 114.75045776367188, "learning_rate": 3e-06, "loss": -37.9465, "step": 366 }, { "epoch": 0.0327400865337437, "grad_norm": 117.86136627197266, "learning_rate": 3e-06, "loss": -50.0162, "step": 367 }, { "epoch": 0.03282929657879477, "grad_norm": 224.55755615234375, "learning_rate": 3e-06, "loss": -62.4077, "step": 368 }, { "epoch": 0.03291850662384584, "grad_norm": 145.33380126953125, "learning_rate": 3e-06, "loss": -45.7888, "step": 369 }, { "epoch": 0.033007716668896916, "grad_norm": 107.85284423828125, "learning_rate": 3e-06, "loss": -38.8828, "step": 370 }, { "epoch": 0.03309692671394799, "grad_norm": 143.64854431152344, "learning_rate": 3e-06, "loss": -44.4827, "step": 371 }, { "epoch": 0.03318613675899906, "grad_norm": 120.4244155883789, "learning_rate": 3e-06, "loss": -40.0586, "step": 372 }, { "completion_length": 170.3125, "epoch": 0.033275346804050136, "grad_norm": 199.8765869140625, "learning_rate": 3e-06, "loss": -16.0424, "reward": 1.5020000338554382, "reward_std": 0.6375356912612915, "rewards/correctness_reward_func": 1.0416666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012416664510965347, "step": 373, "zero_std_ratio": 0.0 }, { "epoch": 0.03336455684910121, "grad_norm": 186.22103881835938, "learning_rate": 3e-06, "loss": -12.8464, "step": 374 }, { "epoch": 0.03345376689415228, "grad_norm": 133.84971618652344, "learning_rate": 3e-06, "loss": -16.0581, "step": 375 }, { "epoch": 0.033542976939203356, "grad_norm": 124.62361145019531, "learning_rate": 3e-06, "loss": -20.0662, "step": 376 }, { "epoch": 0.03363218698425443, "grad_norm": 177.62574768066406, "learning_rate": 3e-06, "loss": -16.7656, "step": 377 }, { "epoch": 0.0337213970293055, "grad_norm": 162.92381286621094, "learning_rate": 3e-06, "loss": -19.1936, "step": 378 }, { "epoch": 0.033810607074356576, "grad_norm": 167.49449157714844, "learning_rate": 3e-06, "loss": -16.9396, "step": 379 }, { "epoch": 0.03389981711940764, "grad_norm": 180.7197723388672, "learning_rate": 3e-06, "loss": -14.2949, "step": 380 }, { "epoch": 0.033989027164458716, "grad_norm": 158.6161346435547, "learning_rate": 3e-06, "loss": -18.1083, "step": 381 }, { "epoch": 0.03407823720950979, "grad_norm": 136.7860870361328, "learning_rate": 3e-06, "loss": -21.9306, "step": 382 }, { "epoch": 0.03416744725456086, "grad_norm": 200.51185607910156, "learning_rate": 3e-06, "loss": -18.8755, "step": 383 }, { "epoch": 0.034256657299611935, "grad_norm": 174.00477600097656, "learning_rate": 3e-06, "loss": -21.6131, "step": 384 }, { "completion_length": 168.0416717529297, "epoch": 0.03434586734466301, "grad_norm": 270.6558837890625, "learning_rate": 3e-06, "loss": -108.9543, "reward": 1.5282500386238098, "reward_std": 0.8121029734611511, "rewards/correctness_reward_func": 1.0416666567325592, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03866666788235307, "step": 385, "zero_std_ratio": 0.0 }, { "epoch": 0.03443507738971408, "grad_norm": 316.0130615234375, "learning_rate": 3e-06, "loss": -121.254, "step": 386 }, { "epoch": 0.034524287434765155, "grad_norm": 257.7696228027344, "learning_rate": 3e-06, "loss": -102.8911, "step": 387 }, { "epoch": 0.03461349747981623, "grad_norm": 305.755126953125, "learning_rate": 3e-06, "loss": -114.5659, "step": 388 }, { "epoch": 0.0347027075248673, "grad_norm": 219.5818328857422, "learning_rate": 3e-06, "loss": -114.7208, "step": 389 }, { "epoch": 0.034791917569918375, "grad_norm": 255.7522430419922, "learning_rate": 3e-06, "loss": -109.7934, "step": 390 }, { "epoch": 0.03488112761496945, "grad_norm": 291.77642822265625, "learning_rate": 3e-06, "loss": -116.5826, "step": 391 }, { "epoch": 0.03497033766002052, "grad_norm": 333.5157165527344, "learning_rate": 3e-06, "loss": -132.3897, "step": 392 }, { "epoch": 0.03505954770507159, "grad_norm": 267.8763122558594, "learning_rate": 3e-06, "loss": -110.8159, "step": 393 }, { "epoch": 0.03514875775012266, "grad_norm": 312.3733215332031, "learning_rate": 3e-06, "loss": -127.8078, "step": 394 }, { "epoch": 0.035237967795173734, "grad_norm": 242.0186309814453, "learning_rate": 3e-06, "loss": -123.3703, "step": 395 }, { "epoch": 0.03532717784022481, "grad_norm": 297.62847900390625, "learning_rate": 3e-06, "loss": -120.3945, "step": 396 }, { "completion_length": 171.45833587646484, "epoch": 0.03541638788527588, "grad_norm": 192.1062774658203, "learning_rate": 3e-06, "loss": -81.6189, "reward": 1.9622292518615723, "reward_std": 0.8264816105365753, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0351458303630352, "step": 397, "zero_std_ratio": 0.0 }, { "epoch": 0.035505597930326954, "grad_norm": 202.8217010498047, "learning_rate": 3e-06, "loss": -67.341, "step": 398 }, { "epoch": 0.03559480797537803, "grad_norm": 218.82249450683594, "learning_rate": 3e-06, "loss": -65.3347, "step": 399 }, { "epoch": 0.0356840180204291, "grad_norm": 232.65196228027344, "learning_rate": 3e-06, "loss": -76.8989, "step": 400 }, { "epoch": 0.035773228065480174, "grad_norm": 198.04103088378906, "learning_rate": 3e-06, "loss": -74.488, "step": 401 }, { "epoch": 0.03586243811053125, "grad_norm": 211.86273193359375, "learning_rate": 3e-06, "loss": -75.6096, "step": 402 }, { "epoch": 0.03595164815558232, "grad_norm": 218.77589416503906, "learning_rate": 3e-06, "loss": -87.9174, "step": 403 }, { "epoch": 0.036040858200633394, "grad_norm": 243.4962615966797, "learning_rate": 3e-06, "loss": -70.1026, "step": 404 }, { "epoch": 0.03613006824568447, "grad_norm": 242.30494689941406, "learning_rate": 3e-06, "loss": -71.4579, "step": 405 }, { "epoch": 0.03621927829073553, "grad_norm": 274.28948974609375, "learning_rate": 3e-06, "loss": -83.8044, "step": 406 }, { "epoch": 0.03630848833578661, "grad_norm": 257.0942077636719, "learning_rate": 3e-06, "loss": -82.445, "step": 407 }, { "epoch": 0.03639769838083768, "grad_norm": 255.2320556640625, "learning_rate": 3e-06, "loss": -81.9402, "step": 408 }, { "completion_length": 125.70833587646484, "epoch": 0.03648690842588875, "grad_norm": 185.7193145751953, "learning_rate": 3e-06, "loss": 46.4742, "reward": 1.8683959245681763, "reward_std": 0.8172085583209991, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10797916725277901, "step": 409, "zero_std_ratio": 0.0 }, { "epoch": 0.03657611847093983, "grad_norm": 218.23338317871094, "learning_rate": 3e-06, "loss": 29.3517, "step": 410 }, { "epoch": 0.0366653285159909, "grad_norm": 180.90330505371094, "learning_rate": 3e-06, "loss": 55.3162, "step": 411 }, { "epoch": 0.03675453856104197, "grad_norm": 216.37953186035156, "learning_rate": 3e-06, "loss": 50.2096, "step": 412 }, { "epoch": 0.036843748606093046, "grad_norm": 198.8724822998047, "learning_rate": 3e-06, "loss": 51.636, "step": 413 }, { "epoch": 0.03693295865114412, "grad_norm": 184.89627075195312, "learning_rate": 3e-06, "loss": 44.8369, "step": 414 }, { "epoch": 0.03702216869619519, "grad_norm": 167.6713104248047, "learning_rate": 3e-06, "loss": 44.1546, "step": 415 }, { "epoch": 0.037111378741246266, "grad_norm": 192.13140869140625, "learning_rate": 3e-06, "loss": 27.4686, "step": 416 }, { "epoch": 0.03720058878629734, "grad_norm": 177.4408721923828, "learning_rate": 3e-06, "loss": 53.37, "step": 417 }, { "epoch": 0.03728979883134841, "grad_norm": 223.81668090820312, "learning_rate": 3e-06, "loss": 45.2759, "step": 418 }, { "epoch": 0.037379008876399486, "grad_norm": 207.5684356689453, "learning_rate": 3e-06, "loss": 46.924, "step": 419 }, { "epoch": 0.03746821892145055, "grad_norm": 180.81484985351562, "learning_rate": 3e-06, "loss": 42.0928, "step": 420 }, { "completion_length": 153.89583587646484, "epoch": 0.037557428966501626, "grad_norm": 247.00067138671875, "learning_rate": 3e-06, "loss": 14.4654, "reward": 1.776770830154419, "reward_std": 0.6972799003124237, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07885416969656944, "step": 421, "zero_std_ratio": 0.0 }, { "epoch": 0.0376466390115527, "grad_norm": 244.66824340820312, "learning_rate": 3e-06, "loss": 34.9601, "step": 422 }, { "epoch": 0.03773584905660377, "grad_norm": 276.21539306640625, "learning_rate": 3e-06, "loss": 14.6261, "step": 423 }, { "epoch": 0.037825059101654845, "grad_norm": 288.96246337890625, "learning_rate": 3e-06, "loss": 41.9368, "step": 424 }, { "epoch": 0.03791426914670592, "grad_norm": 303.6945495605469, "learning_rate": 3e-06, "loss": 23.9119, "step": 425 }, { "epoch": 0.03800347919175699, "grad_norm": 274.27142333984375, "learning_rate": 3e-06, "loss": 27.399, "step": 426 }, { "epoch": 0.038092689236808065, "grad_norm": 233.245361328125, "learning_rate": 3e-06, "loss": 10.175, "step": 427 }, { "epoch": 0.03818189928185914, "grad_norm": 256.8597412109375, "learning_rate": 3e-06, "loss": 31.9908, "step": 428 }, { "epoch": 0.03827110932691021, "grad_norm": 270.4859619140625, "learning_rate": 3e-06, "loss": 10.7867, "step": 429 }, { "epoch": 0.038360319371961285, "grad_norm": 301.17181396484375, "learning_rate": 3e-06, "loss": 40.6524, "step": 430 }, { "epoch": 0.03844952941701236, "grad_norm": 303.94488525390625, "learning_rate": 3e-06, "loss": 21.1706, "step": 431 }, { "epoch": 0.03853873946206343, "grad_norm": 258.3034973144531, "learning_rate": 3e-06, "loss": 22.7622, "step": 432 }, { "completion_length": 107.54166793823242, "epoch": 0.0386279495071145, "grad_norm": 134.02154541015625, "learning_rate": 3e-06, "loss": -22.4999, "reward": 2.2951666712760925, "reward_std": 0.38126008585095406, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21183334290981293, "step": 433, "zero_std_ratio": 0.0 }, { "epoch": 0.03871715955216557, "grad_norm": 191.98023986816406, "learning_rate": 3e-06, "loss": -14.0492, "step": 434 }, { "epoch": 0.038806369597216644, "grad_norm": 154.30328369140625, "learning_rate": 3e-06, "loss": -10.9444, "step": 435 }, { "epoch": 0.03889557964226772, "grad_norm": 134.01214599609375, "learning_rate": 3e-06, "loss": -17.7574, "step": 436 }, { "epoch": 0.03898478968731879, "grad_norm": 132.3379364013672, "learning_rate": 3e-06, "loss": -18.487, "step": 437 }, { "epoch": 0.039073999732369864, "grad_norm": 146.31573486328125, "learning_rate": 3e-06, "loss": -12.7164, "step": 438 }, { "epoch": 0.03916320977742094, "grad_norm": 136.05592346191406, "learning_rate": 3e-06, "loss": -23.1439, "step": 439 }, { "epoch": 0.03925241982247201, "grad_norm": 138.1117706298828, "learning_rate": 3e-06, "loss": -15.7255, "step": 440 }, { "epoch": 0.039341629867523084, "grad_norm": 166.34922790527344, "learning_rate": 3e-06, "loss": -12.6375, "step": 441 }, { "epoch": 0.03943083991257416, "grad_norm": 132.994140625, "learning_rate": 3e-06, "loss": -20.1431, "step": 442 }, { "epoch": 0.03952004995762523, "grad_norm": 129.54771423339844, "learning_rate": 3e-06, "loss": -19.3478, "step": 443 }, { "epoch": 0.039609260002676304, "grad_norm": 152.91607666015625, "learning_rate": 3e-06, "loss": -14.3018, "step": 444 }, { "completion_length": 160.7916717529297, "epoch": 0.03969847004772738, "grad_norm": 165.9615020751953, "learning_rate": 3e-06, "loss": 38.8687, "reward": 1.6081042885780334, "reward_std": 0.45602357387542725, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0768541656434536, "step": 445, "zero_std_ratio": 0.0 }, { "epoch": 0.039787680092778444, "grad_norm": 165.71353149414062, "learning_rate": 3e-06, "loss": 32.8818, "step": 446 }, { "epoch": 0.03987689013782952, "grad_norm": 185.27174377441406, "learning_rate": 3e-06, "loss": 29.8206, "step": 447 }, { "epoch": 0.03996610018288059, "grad_norm": 163.79954528808594, "learning_rate": 3e-06, "loss": 28.2399, "step": 448 }, { "epoch": 0.04005531022793166, "grad_norm": 167.2331085205078, "learning_rate": 3e-06, "loss": 42.2504, "step": 449 }, { "epoch": 0.04014452027298274, "grad_norm": 157.44320678710938, "learning_rate": 3e-06, "loss": 47.0631, "step": 450 }, { "epoch": 0.04023373031803381, "grad_norm": 167.7976837158203, "learning_rate": 3e-06, "loss": 37.7299, "step": 451 }, { "epoch": 0.04032294036308488, "grad_norm": 171.96420288085938, "learning_rate": 3e-06, "loss": 31.7018, "step": 452 }, { "epoch": 0.040412150408135956, "grad_norm": 164.95046997070312, "learning_rate": 3e-06, "loss": 28.9306, "step": 453 }, { "epoch": 0.04050136045318703, "grad_norm": 146.903076171875, "learning_rate": 3e-06, "loss": 26.551, "step": 454 }, { "epoch": 0.0405905704982381, "grad_norm": 182.6881561279297, "learning_rate": 3e-06, "loss": 41.0788, "step": 455 }, { "epoch": 0.040679780543289176, "grad_norm": 147.5907440185547, "learning_rate": 3e-06, "loss": 44.0529, "step": 456 }, { "completion_length": 143.8541717529297, "epoch": 0.04076899058834025, "grad_norm": 252.46615600585938, "learning_rate": 3e-06, "loss": -60.641, "reward": 1.8890208005905151, "reward_std": 0.4024546667933464, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09735416248440742, "step": 457, "zero_std_ratio": 0.0 }, { "epoch": 0.04085820063339132, "grad_norm": 223.07171630859375, "learning_rate": 3e-06, "loss": -42.0093, "step": 458 }, { "epoch": 0.040947410678442396, "grad_norm": 237.3083953857422, "learning_rate": 3e-06, "loss": -57.9657, "step": 459 }, { "epoch": 0.04103662072349346, "grad_norm": 225.29269409179688, "learning_rate": 3e-06, "loss": -41.8661, "step": 460 }, { "epoch": 0.041125830768544536, "grad_norm": 210.0297088623047, "learning_rate": 3e-06, "loss": -47.2305, "step": 461 }, { "epoch": 0.04121504081359561, "grad_norm": 262.04473876953125, "learning_rate": 3e-06, "loss": -50.9342, "step": 462 }, { "epoch": 0.04130425085864668, "grad_norm": 252.53802490234375, "learning_rate": 3e-06, "loss": -62.6208, "step": 463 }, { "epoch": 0.041393460903697755, "grad_norm": 221.40121459960938, "learning_rate": 3e-06, "loss": -43.7282, "step": 464 }, { "epoch": 0.04148267094874883, "grad_norm": 231.59335327148438, "learning_rate": 3e-06, "loss": -60.5224, "step": 465 }, { "epoch": 0.0415718809937999, "grad_norm": 207.73471069335938, "learning_rate": 3e-06, "loss": -44.5223, "step": 466 }, { "epoch": 0.041661091038850975, "grad_norm": 217.08779907226562, "learning_rate": 3e-06, "loss": -50.6021, "step": 467 }, { "epoch": 0.04175030108390205, "grad_norm": 254.29269409179688, "learning_rate": 3e-06, "loss": -55.4887, "step": 468 }, { "completion_length": 122.50000381469727, "epoch": 0.04183951112895312, "grad_norm": 147.28335571289062, "learning_rate": 3e-06, "loss": 12.9864, "reward": 2.310395896434784, "reward_std": 0.44204503297805786, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.16456249356269836, "step": 469, "zero_std_ratio": 0.0 }, { "epoch": 0.041928721174004195, "grad_norm": 161.9978485107422, "learning_rate": 3e-06, "loss": 34.7546, "step": 470 }, { "epoch": 0.04201793121905527, "grad_norm": 165.3116455078125, "learning_rate": 3e-06, "loss": 17.4188, "step": 471 }, { "epoch": 0.04210714126410634, "grad_norm": 142.81861877441406, "learning_rate": 3e-06, "loss": 18.3006, "step": 472 }, { "epoch": 0.04219635130915741, "grad_norm": 168.01116943359375, "learning_rate": 3e-06, "loss": 17.6413, "step": 473 }, { "epoch": 0.04228556135420848, "grad_norm": 207.03326416015625, "learning_rate": 3e-06, "loss": 15.9259, "step": 474 }, { "epoch": 0.042374771399259555, "grad_norm": 141.62599182128906, "learning_rate": 3e-06, "loss": 12.4355, "step": 475 }, { "epoch": 0.04246398144431063, "grad_norm": 180.9537353515625, "learning_rate": 3e-06, "loss": 32.9299, "step": 476 }, { "epoch": 0.0425531914893617, "grad_norm": 163.92254638671875, "learning_rate": 3e-06, "loss": 17.1089, "step": 477 }, { "epoch": 0.042642401534412774, "grad_norm": 145.9250030517578, "learning_rate": 3e-06, "loss": 17.069, "step": 478 }, { "epoch": 0.04273161157946385, "grad_norm": 152.68319702148438, "learning_rate": 3e-06, "loss": 16.6025, "step": 479 }, { "epoch": 0.04282082162451492, "grad_norm": 222.65997314453125, "learning_rate": 3e-06, "loss": 14.2435, "step": 480 }, { "completion_length": 116.66666793823242, "epoch": 0.042910031669565994, "grad_norm": 166.38722229003906, "learning_rate": 3e-06, "loss": 13.4616, "reward": 1.5636458992958069, "reward_std": 0.6559399664402008, "rewards/correctness_reward_func": 0.9999999701976776, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1469791643321514, "step": 481, "zero_std_ratio": 0.0 }, { "epoch": 0.04299924171461707, "grad_norm": 222.22879028320312, "learning_rate": 3e-06, "loss": 18.8121, "step": 482 }, { "epoch": 0.04308845175966814, "grad_norm": 221.17059326171875, "learning_rate": 3e-06, "loss": 21.4877, "step": 483 }, { "epoch": 0.043177661804719214, "grad_norm": 142.53189086914062, "learning_rate": 3e-06, "loss": 16.8492, "step": 484 }, { "epoch": 0.04326687184977029, "grad_norm": 170.13198852539062, "learning_rate": 3e-06, "loss": 20.7898, "step": 485 }, { "epoch": 0.043356081894821354, "grad_norm": 161.23110961914062, "learning_rate": 3e-06, "loss": 12.8851, "step": 486 }, { "epoch": 0.04344529193987243, "grad_norm": 175.66587829589844, "learning_rate": 3e-06, "loss": 10.8873, "step": 487 }, { "epoch": 0.0435345019849235, "grad_norm": 195.75050354003906, "learning_rate": 3e-06, "loss": 15.6694, "step": 488 }, { "epoch": 0.04362371202997457, "grad_norm": 190.2042236328125, "learning_rate": 3e-06, "loss": 19.0926, "step": 489 }, { "epoch": 0.04371292207502565, "grad_norm": 146.10504150390625, "learning_rate": 3e-06, "loss": 13.8907, "step": 490 }, { "epoch": 0.04380213212007672, "grad_norm": 149.26614379882812, "learning_rate": 3e-06, "loss": 17.0683, "step": 491 }, { "epoch": 0.04389134216512779, "grad_norm": 178.4593963623047, "learning_rate": 3e-06, "loss": 10.2799, "step": 492 }, { "completion_length": 137.50000762939453, "epoch": 0.043980552210178867, "grad_norm": 87.93815612792969, "learning_rate": 3e-06, "loss": -35.7257, "reward": 2.5712709426879883, "reward_std": 0.18458116799592972, "rewards/correctness_reward_func": 1.9583333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11293749511241913, "step": 493, "zero_std_ratio": 0.0 }, { "epoch": 0.04406976225522994, "grad_norm": 78.96405029296875, "learning_rate": 3e-06, "loss": -37.1011, "step": 494 }, { "epoch": 0.04415897230028101, "grad_norm": 113.70293426513672, "learning_rate": 3e-06, "loss": -46.3151, "step": 495 }, { "epoch": 0.044248182345332086, "grad_norm": 90.45478820800781, "learning_rate": 3e-06, "loss": -43.5143, "step": 496 }, { "epoch": 0.04433739239038316, "grad_norm": 106.42904663085938, "learning_rate": 3e-06, "loss": -42.7944, "step": 497 }, { "epoch": 0.04442660243543423, "grad_norm": 106.20608520507812, "learning_rate": 3e-06, "loss": -48.1815, "step": 498 }, { "epoch": 0.0445158124804853, "grad_norm": 93.41876220703125, "learning_rate": 3e-06, "loss": -37.7992, "step": 499 }, { "epoch": 0.04460502252553637, "grad_norm": 91.40050506591797, "learning_rate": 3e-06, "loss": -38.651, "step": 500 }, { "epoch": 0.044694232570587446, "grad_norm": 116.4251480102539, "learning_rate": 3e-06, "loss": -49.2276, "step": 501 }, { "epoch": 0.04478344261563852, "grad_norm": 92.2903060913086, "learning_rate": 3e-06, "loss": -46.591, "step": 502 }, { "epoch": 0.04487265266068959, "grad_norm": 110.7293472290039, "learning_rate": 3e-06, "loss": -45.7648, "step": 503 }, { "epoch": 0.044961862705740666, "grad_norm": 108.03923797607422, "learning_rate": 3e-06, "loss": -51.9054, "step": 504 }, { "completion_length": 131.14583587646484, "epoch": 0.04505107275079174, "grad_norm": 452.2289733886719, "learning_rate": 3e-06, "loss": 29.1255, "reward": 1.9235833883285522, "reward_std": 0.813209742307663, "rewards/correctness_reward_func": 1.3333333730697632, "rewards/int_reward_func": 0.4270833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16316666454076767, "step": 505, "zero_std_ratio": 0.0 }, { "epoch": 0.04514028279584281, "grad_norm": 284.8124694824219, "learning_rate": 3e-06, "loss": 31.7163, "step": 506 }, { "epoch": 0.045229492840893885, "grad_norm": 361.7442626953125, "learning_rate": 3e-06, "loss": 27.2282, "step": 507 }, { "epoch": 0.04531870288594496, "grad_norm": 283.879638671875, "learning_rate": 3e-06, "loss": 42.2817, "step": 508 }, { "epoch": 0.04540791293099603, "grad_norm": 316.19000244140625, "learning_rate": 3e-06, "loss": 26.7891, "step": 509 }, { "epoch": 0.045497122976047105, "grad_norm": 370.62652587890625, "learning_rate": 3e-06, "loss": 34.1636, "step": 510 }, { "epoch": 0.04558633302109818, "grad_norm": 273.1391296386719, "learning_rate": 3e-06, "loss": 27.6705, "step": 511 }, { "epoch": 0.04567554306614925, "grad_norm": 307.9808044433594, "learning_rate": 3e-06, "loss": 28.9577, "step": 512 }, { "epoch": 0.04576475311120032, "grad_norm": 374.3335876464844, "learning_rate": 3e-06, "loss": 22.2669, "step": 513 }, { "epoch": 0.04585396315625139, "grad_norm": 395.0052795410156, "learning_rate": 3e-06, "loss": 39.239, "step": 514 }, { "epoch": 0.045943173201302465, "grad_norm": 539.7128295898438, "learning_rate": 3e-06, "loss": 25.6475, "step": 515 }, { "epoch": 0.04603238324635354, "grad_norm": 348.81170654296875, "learning_rate": 3e-06, "loss": 31.7762, "step": 516 }, { "completion_length": 115.83333587646484, "epoch": 0.04612159329140461, "grad_norm": 177.44480895996094, "learning_rate": 3e-06, "loss": -44.7672, "reward": 2.343208432197571, "reward_std": 0.4359329864382744, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18695833534002304, "step": 517, "zero_std_ratio": 0.0 }, { "epoch": 0.046210803336455684, "grad_norm": 199.65908813476562, "learning_rate": 3e-06, "loss": -51.4622, "step": 518 }, { "epoch": 0.04630001338150676, "grad_norm": 175.2034149169922, "learning_rate": 3e-06, "loss": -55.1536, "step": 519 }, { "epoch": 0.04638922342655783, "grad_norm": 160.91688537597656, "learning_rate": 3e-06, "loss": -37.3164, "step": 520 }, { "epoch": 0.046478433471608904, "grad_norm": 165.5592498779297, "learning_rate": 3e-06, "loss": -42.9147, "step": 521 }, { "epoch": 0.04656764351665998, "grad_norm": 154.5955047607422, "learning_rate": 3e-06, "loss": -48.9731, "step": 522 }, { "epoch": 0.04665685356171105, "grad_norm": 202.06838989257812, "learning_rate": 3e-06, "loss": -46.1992, "step": 523 }, { "epoch": 0.046746063606762124, "grad_norm": 216.6766357421875, "learning_rate": 3e-06, "loss": -54.2271, "step": 524 }, { "epoch": 0.0468352736518132, "grad_norm": 212.4103240966797, "learning_rate": 3e-06, "loss": -59.7351, "step": 525 }, { "epoch": 0.046924483696864264, "grad_norm": 160.86546325683594, "learning_rate": 3e-06, "loss": -40.5866, "step": 526 }, { "epoch": 0.04701369374191534, "grad_norm": 171.24478149414062, "learning_rate": 3e-06, "loss": -47.2665, "step": 527 }, { "epoch": 0.04710290378696641, "grad_norm": 165.52357482910156, "learning_rate": 3e-06, "loss": -53.477, "step": 528 }, { "completion_length": 119.77083587646484, "epoch": 0.047192113832017483, "grad_norm": 382.3139953613281, "learning_rate": 3e-06, "loss": 86.4678, "reward": 1.9181458950042725, "reward_std": 0.6776820421218872, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1681458279490471, "step": 529, "zero_std_ratio": 0.0 }, { "epoch": 0.04728132387706856, "grad_norm": 264.3744812011719, "learning_rate": 3e-06, "loss": 87.2703, "step": 530 }, { "epoch": 0.04737053392211963, "grad_norm": 273.3477783203125, "learning_rate": 3e-06, "loss": 74.9615, "step": 531 }, { "epoch": 0.0474597439671707, "grad_norm": 326.87078857421875, "learning_rate": 3e-06, "loss": 90.7838, "step": 532 }, { "epoch": 0.04754895401222178, "grad_norm": 294.74041748046875, "learning_rate": 3e-06, "loss": 102.1572, "step": 533 }, { "epoch": 0.04763816405727285, "grad_norm": 312.48626708984375, "learning_rate": 3e-06, "loss": 93.4423, "step": 534 }, { "epoch": 0.04772737410232392, "grad_norm": 383.2833557128906, "learning_rate": 3e-06, "loss": 86.6238, "step": 535 }, { "epoch": 0.047816584147374996, "grad_norm": 316.42926025390625, "learning_rate": 3e-06, "loss": 86.6783, "step": 536 }, { "epoch": 0.04790579419242607, "grad_norm": 268.37506103515625, "learning_rate": 3e-06, "loss": 72.0206, "step": 537 }, { "epoch": 0.04799500423747714, "grad_norm": 337.4726867675781, "learning_rate": 3e-06, "loss": 87.3427, "step": 538 }, { "epoch": 0.04808421428252821, "grad_norm": 284.79827880859375, "learning_rate": 3e-06, "loss": 98.66, "step": 539 }, { "epoch": 0.04817342432757928, "grad_norm": 321.74609375, "learning_rate": 3e-06, "loss": 87.6142, "step": 540 }, { "completion_length": 125.37500762939453, "epoch": 0.048262634372630356, "grad_norm": 187.67727661132812, "learning_rate": 3e-06, "loss": -1.5514, "reward": 1.9883333444595337, "reward_std": 0.6124217808246613, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1341666616499424, "step": 541, "zero_std_ratio": 0.0 }, { "epoch": 0.04835184441768143, "grad_norm": 202.66070556640625, "learning_rate": 3e-06, "loss": 1.9568, "step": 542 }, { "epoch": 0.0484410544627325, "grad_norm": 180.55126953125, "learning_rate": 3e-06, "loss": -6.8013, "step": 543 }, { "epoch": 0.048530264507783576, "grad_norm": 161.08514404296875, "learning_rate": 3e-06, "loss": -3.4725, "step": 544 }, { "epoch": 0.04861947455283465, "grad_norm": 220.28076171875, "learning_rate": 3e-06, "loss": 0.0972, "step": 545 }, { "epoch": 0.04870868459788572, "grad_norm": 321.00994873046875, "learning_rate": 3e-06, "loss": -10.5751, "step": 546 }, { "epoch": 0.048797894642936795, "grad_norm": 197.3623046875, "learning_rate": 3e-06, "loss": -2.6929, "step": 547 }, { "epoch": 0.04888710468798787, "grad_norm": 213.94691467285156, "learning_rate": 3e-06, "loss": 1.3407, "step": 548 }, { "epoch": 0.04897631473303894, "grad_norm": 254.2111053466797, "learning_rate": 3e-06, "loss": -7.3544, "step": 549 }, { "epoch": 0.049065524778090015, "grad_norm": 155.93460083007812, "learning_rate": 3e-06, "loss": -5.7191, "step": 550 }, { "epoch": 0.04915473482314109, "grad_norm": 175.17147827148438, "learning_rate": 3e-06, "loss": -2.0149, "step": 551 }, { "epoch": 0.04924394486819216, "grad_norm": 220.0244140625, "learning_rate": 3e-06, "loss": -12.1544, "step": 552 }, { "completion_length": 138.62500762939453, "epoch": 0.04933315491324323, "grad_norm": 255.7532501220703, "learning_rate": 3e-06, "loss": -42.4297, "reward": 2.4352500438690186, "reward_std": 0.32932066917419434, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12275000661611557, "step": 553, "zero_std_ratio": 0.0 }, { "epoch": 0.0494223649582943, "grad_norm": 263.04913330078125, "learning_rate": 3e-06, "loss": -27.7121, "step": 554 }, { "epoch": 0.049511575003345375, "grad_norm": 187.12600708007812, "learning_rate": 3e-06, "loss": -32.293, "step": 555 }, { "epoch": 0.04960078504839645, "grad_norm": 233.73802185058594, "learning_rate": 3e-06, "loss": -34.2649, "step": 556 }, { "epoch": 0.04968999509344752, "grad_norm": 238.03567504882812, "learning_rate": 3e-06, "loss": -48.0373, "step": 557 }, { "epoch": 0.049779205138498595, "grad_norm": 211.17440795898438, "learning_rate": 3e-06, "loss": -34.5469, "step": 558 }, { "epoch": 0.04986841518354967, "grad_norm": 209.6473388671875, "learning_rate": 3e-06, "loss": -45.0123, "step": 559 }, { "epoch": 0.04995762522860074, "grad_norm": 219.1716766357422, "learning_rate": 3e-06, "loss": -29.3907, "step": 560 }, { "epoch": 0.050046835273651814, "grad_norm": 194.5946502685547, "learning_rate": 3e-06, "loss": -33.3813, "step": 561 }, { "epoch": 0.05013604531870289, "grad_norm": 230.82928466796875, "learning_rate": 3e-06, "loss": -37.409, "step": 562 }, { "epoch": 0.05022525536375396, "grad_norm": 268.3168640136719, "learning_rate": 3e-06, "loss": -50.4616, "step": 563 }, { "epoch": 0.050314465408805034, "grad_norm": 225.2816925048828, "learning_rate": 3e-06, "loss": -36.9552, "step": 564 }, { "completion_length": 138.81250381469727, "epoch": 0.05040367545385611, "grad_norm": 236.8852996826172, "learning_rate": 3e-06, "loss": 45.2003, "reward": 1.8692501783370972, "reward_std": 0.7652427852153778, "rewards/correctness_reward_func": 1.25, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15049999579787254, "step": 565, "zero_std_ratio": 0.0 }, { "epoch": 0.050492885498907174, "grad_norm": 258.247802734375, "learning_rate": 3e-06, "loss": 49.6383, "step": 566 }, { "epoch": 0.05058209554395825, "grad_norm": 318.1617126464844, "learning_rate": 3e-06, "loss": 58.9489, "step": 567 }, { "epoch": 0.05067130558900932, "grad_norm": 226.23045349121094, "learning_rate": 3e-06, "loss": 44.3005, "step": 568 }, { "epoch": 0.050760515634060394, "grad_norm": 303.47760009765625, "learning_rate": 3e-06, "loss": 28.8653, "step": 569 }, { "epoch": 0.05084972567911147, "grad_norm": 248.53013610839844, "learning_rate": 3e-06, "loss": 45.7073, "step": 570 }, { "epoch": 0.05093893572416254, "grad_norm": 228.76365661621094, "learning_rate": 3e-06, "loss": 40.9719, "step": 571 }, { "epoch": 0.05102814576921361, "grad_norm": 236.98915100097656, "learning_rate": 3e-06, "loss": 44.3296, "step": 572 }, { "epoch": 0.05111735581426469, "grad_norm": 318.7423400878906, "learning_rate": 3e-06, "loss": 54.9659, "step": 573 }, { "epoch": 0.05120656585931576, "grad_norm": 226.2831268310547, "learning_rate": 3e-06, "loss": 40.0793, "step": 574 }, { "epoch": 0.05129577590436683, "grad_norm": 321.7300109863281, "learning_rate": 3e-06, "loss": 25.4049, "step": 575 }, { "epoch": 0.051384985949417906, "grad_norm": 225.1118927001953, "learning_rate": 3e-06, "loss": 42.5627, "step": 576 }, { "completion_length": 114.56250381469727, "epoch": 0.05147419599446898, "grad_norm": 151.06048583984375, "learning_rate": 3e-06, "loss": 33.113, "reward": 2.1812918186187744, "reward_std": 0.530484139919281, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16045832633972168, "step": 577, "zero_std_ratio": 0.0 }, { "epoch": 0.05156340603952005, "grad_norm": 182.15431213378906, "learning_rate": 3e-06, "loss": 19.2473, "step": 578 }, { "epoch": 0.05165261608457112, "grad_norm": 169.3698272705078, "learning_rate": 3e-06, "loss": 25.8161, "step": 579 }, { "epoch": 0.05174182612962219, "grad_norm": 155.8734588623047, "learning_rate": 3e-06, "loss": 19.9518, "step": 580 }, { "epoch": 0.051831036174673266, "grad_norm": 177.54641723632812, "learning_rate": 3e-06, "loss": 23.4249, "step": 581 }, { "epoch": 0.05192024621972434, "grad_norm": 143.13719177246094, "learning_rate": 3e-06, "loss": 7.9228, "step": 582 }, { "epoch": 0.05200945626477541, "grad_norm": 146.39906311035156, "learning_rate": 3e-06, "loss": 30.8804, "step": 583 }, { "epoch": 0.052098666309826486, "grad_norm": 166.2614288330078, "learning_rate": 3e-06, "loss": 17.3062, "step": 584 }, { "epoch": 0.05218787635487756, "grad_norm": 158.1491241455078, "learning_rate": 3e-06, "loss": 23.1792, "step": 585 }, { "epoch": 0.05227708639992863, "grad_norm": 124.15723419189453, "learning_rate": 3e-06, "loss": 18.8908, "step": 586 }, { "epoch": 0.052366296444979706, "grad_norm": 155.26602172851562, "learning_rate": 3e-06, "loss": 21.968, "step": 587 }, { "epoch": 0.05245550649003078, "grad_norm": 155.0635528564453, "learning_rate": 3e-06, "loss": 6.3548, "step": 588 }, { "completion_length": 161.45833587646484, "epoch": 0.05254471653508185, "grad_norm": 317.9678649902344, "learning_rate": 3e-06, "loss": 6.4115, "reward": 1.8582292199134827, "reward_std": 0.48372724652290344, "rewards/correctness_reward_func": 1.3333333134651184, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03531250241212547, "step": 589, "zero_std_ratio": 0.0 }, { "epoch": 0.052633926580132925, "grad_norm": 267.894287109375, "learning_rate": 3e-06, "loss": 6.0494, "step": 590 }, { "epoch": 0.052723136625184, "grad_norm": 197.32470703125, "learning_rate": 3e-06, "loss": 8.4095, "step": 591 }, { "epoch": 0.05281234667023507, "grad_norm": 208.84291076660156, "learning_rate": 3e-06, "loss": 2.183, "step": 592 }, { "epoch": 0.05290155671528614, "grad_norm": 213.50672912597656, "learning_rate": 3e-06, "loss": -5.6327, "step": 593 }, { "epoch": 0.05299076676033721, "grad_norm": 264.34210205078125, "learning_rate": 3e-06, "loss": 0.0463, "step": 594 }, { "epoch": 0.053079976805388285, "grad_norm": 269.38372802734375, "learning_rate": 3e-06, "loss": 3.4921, "step": 595 }, { "epoch": 0.05316918685043936, "grad_norm": 233.82005310058594, "learning_rate": 3e-06, "loss": 3.9645, "step": 596 }, { "epoch": 0.05325839689549043, "grad_norm": 174.2704620361328, "learning_rate": 3e-06, "loss": 6.6277, "step": 597 }, { "epoch": 0.053347606940541505, "grad_norm": 197.27203369140625, "learning_rate": 3e-06, "loss": -0.034, "step": 598 }, { "epoch": 0.05343681698559258, "grad_norm": 195.1741943359375, "learning_rate": 3e-06, "loss": -7.7007, "step": 599 }, { "epoch": 0.05352602703064365, "grad_norm": 218.35403442382812, "learning_rate": 3e-06, "loss": -2.732, "step": 600 }, { "completion_length": 163.20833587646484, "epoch": 0.053615237075694724, "grad_norm": 89.40003204345703, "learning_rate": 3e-06, "loss": -15.5412, "reward": 1.958250105381012, "reward_std": 0.20268601924180984, "rewards/correctness_reward_func": 1.4583333134651184, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06241666525602341, "step": 601, "zero_std_ratio": 0.0 }, { "epoch": 0.0537044471207458, "grad_norm": 85.92318725585938, "learning_rate": 3e-06, "loss": -19.9741, "step": 602 }, { "epoch": 0.05379365716579687, "grad_norm": 89.12326049804688, "learning_rate": 3e-06, "loss": -28.9968, "step": 603 }, { "epoch": 0.053882867210847944, "grad_norm": 109.64755249023438, "learning_rate": 3e-06, "loss": -21.6802, "step": 604 }, { "epoch": 0.05397207725589902, "grad_norm": 99.89476776123047, "learning_rate": 3e-06, "loss": -16.4453, "step": 605 }, { "epoch": 0.054061287300950084, "grad_norm": 140.71066284179688, "learning_rate": 3e-06, "loss": -24.1026, "step": 606 }, { "epoch": 0.05415049734600116, "grad_norm": 80.75763702392578, "learning_rate": 3e-06, "loss": -16.613, "step": 607 }, { "epoch": 0.05423970739105223, "grad_norm": 85.42610168457031, "learning_rate": 3e-06, "loss": -21.2449, "step": 608 }, { "epoch": 0.054328917436103304, "grad_norm": 93.39994812011719, "learning_rate": 3e-06, "loss": -30.1845, "step": 609 }, { "epoch": 0.05441812748115438, "grad_norm": 96.1513671875, "learning_rate": 3e-06, "loss": -22.8781, "step": 610 }, { "epoch": 0.05450733752620545, "grad_norm": 98.65193176269531, "learning_rate": 3e-06, "loss": -17.6772, "step": 611 }, { "epoch": 0.05459654757125652, "grad_norm": 130.4186248779297, "learning_rate": 3e-06, "loss": -25.9751, "step": 612 }, { "completion_length": 129.4375, "epoch": 0.0546857576163076, "grad_norm": 178.97259521484375, "learning_rate": 3e-06, "loss": -41.7741, "reward": 1.9304792881011963, "reward_std": 0.4192664921283722, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14922916889190674, "step": 613, "zero_std_ratio": 0.0 }, { "epoch": 0.05477496766135867, "grad_norm": 159.00680541992188, "learning_rate": 3e-06, "loss": -36.4024, "step": 614 }, { "epoch": 0.05486417770640974, "grad_norm": 154.65304565429688, "learning_rate": 3e-06, "loss": -35.2622, "step": 615 }, { "epoch": 0.05495338775146082, "grad_norm": 239.50408935546875, "learning_rate": 3e-06, "loss": -34.9883, "step": 616 }, { "epoch": 0.05504259779651189, "grad_norm": 191.45263671875, "learning_rate": 3e-06, "loss": -30.4894, "step": 617 }, { "epoch": 0.05513180784156296, "grad_norm": 172.67025756835938, "learning_rate": 3e-06, "loss": -40.9277, "step": 618 }, { "epoch": 0.05522101788661403, "grad_norm": 180.4842071533203, "learning_rate": 3e-06, "loss": -44.2844, "step": 619 }, { "epoch": 0.0553102279316651, "grad_norm": 175.10528564453125, "learning_rate": 3e-06, "loss": -39.7696, "step": 620 }, { "epoch": 0.055399437976716176, "grad_norm": 224.33847045898438, "learning_rate": 3e-06, "loss": -38.3443, "step": 621 }, { "epoch": 0.05548864802176725, "grad_norm": 196.98231506347656, "learning_rate": 3e-06, "loss": -36.4789, "step": 622 }, { "epoch": 0.05557785806681832, "grad_norm": 205.22146606445312, "learning_rate": 3e-06, "loss": -32.5847, "step": 623 }, { "epoch": 0.055667068111869396, "grad_norm": 189.9784393310547, "learning_rate": 3e-06, "loss": -43.7861, "step": 624 }, { "completion_length": 135.1666717529297, "epoch": 0.05575627815692047, "grad_norm": 190.61593627929688, "learning_rate": 3e-06, "loss": -23.515, "reward": 1.7549793124198914, "reward_std": 0.5575149804353714, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14039582759141922, "step": 625, "zero_std_ratio": 0.0 }, { "epoch": 0.05584548820197154, "grad_norm": 263.34173583984375, "learning_rate": 3e-06, "loss": -26.4535, "step": 626 }, { "epoch": 0.055934698247022616, "grad_norm": 203.12969970703125, "learning_rate": 3e-06, "loss": -20.2943, "step": 627 }, { "epoch": 0.05602390829207369, "grad_norm": 186.3466033935547, "learning_rate": 3e-06, "loss": -22.5875, "step": 628 }, { "epoch": 0.05611311833712476, "grad_norm": 206.43478393554688, "learning_rate": 3e-06, "loss": -14.8606, "step": 629 }, { "epoch": 0.056202328382175835, "grad_norm": 230.95394897460938, "learning_rate": 3e-06, "loss": -11.2027, "step": 630 }, { "epoch": 0.05629153842722691, "grad_norm": 208.40184020996094, "learning_rate": 3e-06, "loss": -24.9903, "step": 631 }, { "epoch": 0.056380748472277975, "grad_norm": 283.0361328125, "learning_rate": 3e-06, "loss": -28.8647, "step": 632 }, { "epoch": 0.05646995851732905, "grad_norm": 243.43634033203125, "learning_rate": 3e-06, "loss": -20.9854, "step": 633 }, { "epoch": 0.05655916856238012, "grad_norm": 196.8306121826172, "learning_rate": 3e-06, "loss": -25.1397, "step": 634 }, { "epoch": 0.056648378607431195, "grad_norm": 204.37130737304688, "learning_rate": 3e-06, "loss": -17.3989, "step": 635 }, { "epoch": 0.05673758865248227, "grad_norm": 222.9701385498047, "learning_rate": 3e-06, "loss": -14.9143, "step": 636 }, { "completion_length": 117.18750381469727, "epoch": 0.05682679869753334, "grad_norm": 152.56382751464844, "learning_rate": 3e-06, "loss": -53.4148, "reward": 2.4275625944137573, "reward_std": 0.3748088702559471, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18797916173934937, "step": 637, "zero_std_ratio": 0.0 }, { "epoch": 0.056916008742584415, "grad_norm": 187.37356567382812, "learning_rate": 3e-06, "loss": -49.109, "step": 638 }, { "epoch": 0.05700521878763549, "grad_norm": 188.10548400878906, "learning_rate": 3e-06, "loss": -49.7143, "step": 639 }, { "epoch": 0.05709442883268656, "grad_norm": 152.7540283203125, "learning_rate": 3e-06, "loss": -46.4195, "step": 640 }, { "epoch": 0.057183638877737634, "grad_norm": 191.8466796875, "learning_rate": 3e-06, "loss": -54.4961, "step": 641 }, { "epoch": 0.05727284892278871, "grad_norm": 160.72772216796875, "learning_rate": 3e-06, "loss": -46.0513, "step": 642 }, { "epoch": 0.05736205896783978, "grad_norm": 163.2805938720703, "learning_rate": 3e-06, "loss": -55.7387, "step": 643 }, { "epoch": 0.057451269012890854, "grad_norm": 189.55470275878906, "learning_rate": 3e-06, "loss": -51.4447, "step": 644 }, { "epoch": 0.05754047905794193, "grad_norm": 182.82583618164062, "learning_rate": 3e-06, "loss": -51.1561, "step": 645 }, { "epoch": 0.057629689102992994, "grad_norm": 177.5033721923828, "learning_rate": 3e-06, "loss": -49.5066, "step": 646 }, { "epoch": 0.05771889914804407, "grad_norm": 208.1852569580078, "learning_rate": 3e-06, "loss": -59.1063, "step": 647 }, { "epoch": 0.05780810919309514, "grad_norm": 185.0785369873047, "learning_rate": 3e-06, "loss": -49.7641, "step": 648 }, { "completion_length": 127.75000762939453, "epoch": 0.057897319238146214, "grad_norm": 332.4126892089844, "learning_rate": 3e-06, "loss": 24.9465, "reward": 2.10916668176651, "reward_std": 0.7774414718151093, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15083333104848862, "step": 649, "zero_std_ratio": 0.0 }, { "epoch": 0.05798652928319729, "grad_norm": 274.673828125, "learning_rate": 3e-06, "loss": 19.6904, "step": 650 }, { "epoch": 0.05807573932824836, "grad_norm": 225.66183471679688, "learning_rate": 3e-06, "loss": 17.079, "step": 651 }, { "epoch": 0.058164949373299434, "grad_norm": 236.45230102539062, "learning_rate": 3e-06, "loss": 16.4363, "step": 652 }, { "epoch": 0.05825415941835051, "grad_norm": 304.5491943359375, "learning_rate": 3e-06, "loss": 33.7894, "step": 653 }, { "epoch": 0.05834336946340158, "grad_norm": 294.2102966308594, "learning_rate": 3e-06, "loss": 12.1637, "step": 654 }, { "epoch": 0.05843257950845265, "grad_norm": 307.0853271484375, "learning_rate": 3e-06, "loss": 23.9968, "step": 655 }, { "epoch": 0.05852178955350373, "grad_norm": 267.52960205078125, "learning_rate": 3e-06, "loss": 18.2069, "step": 656 }, { "epoch": 0.0586109995985548, "grad_norm": 219.6736602783203, "learning_rate": 3e-06, "loss": 16.1471, "step": 657 }, { "epoch": 0.05870020964360587, "grad_norm": 312.0810852050781, "learning_rate": 3e-06, "loss": 15.3629, "step": 658 }, { "epoch": 0.05878941968865694, "grad_norm": 355.7622985839844, "learning_rate": 3e-06, "loss": 32.1285, "step": 659 }, { "epoch": 0.05887862973370801, "grad_norm": 243.60047912597656, "learning_rate": 3e-06, "loss": 10.9719, "step": 660 }, { "completion_length": 128.37500381469727, "epoch": 0.058967839778759086, "grad_norm": 160.17491149902344, "learning_rate": 3e-06, "loss": -4.7968, "reward": 2.220729112625122, "reward_std": 0.37828393280506134, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13739583641290665, "step": 661, "zero_std_ratio": 0.0 }, { "epoch": 0.05905704982381016, "grad_norm": 213.04843139648438, "learning_rate": 3e-06, "loss": -17.5158, "step": 662 }, { "epoch": 0.05914625986886123, "grad_norm": 360.1976318359375, "learning_rate": 3e-06, "loss": -5.8696, "step": 663 }, { "epoch": 0.059235469913912306, "grad_norm": 165.9031524658203, "learning_rate": 3e-06, "loss": 9.1976, "step": 664 }, { "epoch": 0.05932467995896338, "grad_norm": 185.02491760253906, "learning_rate": 3e-06, "loss": 0.8362, "step": 665 }, { "epoch": 0.05941389000401445, "grad_norm": 186.4868927001953, "learning_rate": 3e-06, "loss": -4.8959, "step": 666 }, { "epoch": 0.059503100049065526, "grad_norm": 172.38906860351562, "learning_rate": 3e-06, "loss": -8.3078, "step": 667 }, { "epoch": 0.0595923100941166, "grad_norm": 205.17637634277344, "learning_rate": 3e-06, "loss": -19.4471, "step": 668 }, { "epoch": 0.05968152013916767, "grad_norm": 412.4108581542969, "learning_rate": 3e-06, "loss": -9.0223, "step": 669 }, { "epoch": 0.059770730184218746, "grad_norm": 165.48020935058594, "learning_rate": 3e-06, "loss": 5.0711, "step": 670 }, { "epoch": 0.05985994022926982, "grad_norm": 185.8058624267578, "learning_rate": 3e-06, "loss": -3.3781, "step": 671 }, { "epoch": 0.059949150274320885, "grad_norm": 214.62686157226562, "learning_rate": 3e-06, "loss": -9.8208, "step": 672 }, { "completion_length": 110.20833587646484, "epoch": 0.06003836031937196, "grad_norm": 223.97824096679688, "learning_rate": 3e-06, "loss": 17.6495, "reward": 1.9757083654403687, "reward_std": 0.7511122822761536, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16320832818746567, "step": 673, "zero_std_ratio": 0.0 }, { "epoch": 0.06012757036442303, "grad_norm": 180.975341796875, "learning_rate": 3e-06, "loss": -3.9399, "step": 674 }, { "epoch": 0.060216780409474105, "grad_norm": 222.6065216064453, "learning_rate": 3e-06, "loss": 10.3809, "step": 675 }, { "epoch": 0.06030599045452518, "grad_norm": 192.075439453125, "learning_rate": 3e-06, "loss": 14.9998, "step": 676 }, { "epoch": 0.06039520049957625, "grad_norm": 200.7101593017578, "learning_rate": 3e-06, "loss": 2.2906, "step": 677 }, { "epoch": 0.060484410544627325, "grad_norm": 200.1593017578125, "learning_rate": 3e-06, "loss": 13.137, "step": 678 }, { "epoch": 0.0605736205896784, "grad_norm": 185.83168029785156, "learning_rate": 3e-06, "loss": 16.2497, "step": 679 }, { "epoch": 0.06066283063472947, "grad_norm": 208.6911163330078, "learning_rate": 3e-06, "loss": -5.308, "step": 680 }, { "epoch": 0.060752040679780545, "grad_norm": 231.81312561035156, "learning_rate": 3e-06, "loss": 8.1785, "step": 681 }, { "epoch": 0.06084125072483162, "grad_norm": 187.45535278320312, "learning_rate": 3e-06, "loss": 12.5542, "step": 682 }, { "epoch": 0.06093046076988269, "grad_norm": 182.04257202148438, "learning_rate": 3e-06, "loss": -1.0645, "step": 683 }, { "epoch": 0.061019670814933764, "grad_norm": 186.71937561035156, "learning_rate": 3e-06, "loss": 10.3416, "step": 684 }, { "completion_length": 130.4791717529297, "epoch": 0.06110888085998484, "grad_norm": 169.67178344726562, "learning_rate": 3e-06, "loss": 32.9847, "reward": 2.2736042737960815, "reward_std": 0.5084125399589539, "rewards/correctness_reward_func": 1.6666666865348816, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14860416948795319, "step": 685, "zero_std_ratio": 0.0 }, { "epoch": 0.061198090905035904, "grad_norm": 231.2995147705078, "learning_rate": 3e-06, "loss": 18.3002, "step": 686 }, { "epoch": 0.06128730095008698, "grad_norm": 162.75083923339844, "learning_rate": 3e-06, "loss": 29.4583, "step": 687 }, { "epoch": 0.06137651099513805, "grad_norm": 209.54966735839844, "learning_rate": 3e-06, "loss": 42.4661, "step": 688 }, { "epoch": 0.061465721040189124, "grad_norm": 396.1800537109375, "learning_rate": 3e-06, "loss": 42.6963, "step": 689 }, { "epoch": 0.0615549310852402, "grad_norm": 144.02049255371094, "learning_rate": 3e-06, "loss": 23.2087, "step": 690 }, { "epoch": 0.06164414113029127, "grad_norm": 179.72213745117188, "learning_rate": 3e-06, "loss": 28.8186, "step": 691 }, { "epoch": 0.061733351175342344, "grad_norm": 143.9263153076172, "learning_rate": 3e-06, "loss": 16.2638, "step": 692 }, { "epoch": 0.06182256122039342, "grad_norm": 148.50750732421875, "learning_rate": 3e-06, "loss": 27.5098, "step": 693 }, { "epoch": 0.06191177126544449, "grad_norm": 178.09515380859375, "learning_rate": 3e-06, "loss": 38.1567, "step": 694 }, { "epoch": 0.06200098131049556, "grad_norm": 319.7694396972656, "learning_rate": 3e-06, "loss": 36.9073, "step": 695 }, { "epoch": 0.06209019135554664, "grad_norm": 137.5644989013672, "learning_rate": 3e-06, "loss": 19.9691, "step": 696 }, { "completion_length": 138.20833587646484, "epoch": 0.06217940140059771, "grad_norm": 243.18804931640625, "learning_rate": 3e-06, "loss": 63.9782, "reward": 1.6311666369438171, "reward_std": 0.6314830482006073, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13116667047142982, "step": 697, "zero_std_ratio": 0.0 }, { "epoch": 0.06226861144564878, "grad_norm": 248.19979858398438, "learning_rate": 3e-06, "loss": 62.0708, "step": 698 }, { "epoch": 0.06235782149069985, "grad_norm": 192.6903839111328, "learning_rate": 3e-06, "loss": 44.2788, "step": 699 }, { "epoch": 0.06244703153575092, "grad_norm": 187.61729431152344, "learning_rate": 3e-06, "loss": 45.635, "step": 700 }, { "epoch": 0.062536241580802, "grad_norm": 270.56439208984375, "learning_rate": 3e-06, "loss": 61.8371, "step": 701 }, { "epoch": 0.06262545162585308, "grad_norm": 186.34654235839844, "learning_rate": 3e-06, "loss": 35.0754, "step": 702 }, { "epoch": 0.06271466167090414, "grad_norm": 193.72666931152344, "learning_rate": 3e-06, "loss": 56.4102, "step": 703 }, { "epoch": 0.06280387171595522, "grad_norm": 213.4738006591797, "learning_rate": 3e-06, "loss": 55.6052, "step": 704 }, { "epoch": 0.06289308176100629, "grad_norm": 178.9663543701172, "learning_rate": 3e-06, "loss": 38.9399, "step": 705 }, { "epoch": 0.06298229180605736, "grad_norm": 155.2235870361328, "learning_rate": 3e-06, "loss": 40.2563, "step": 706 }, { "epoch": 0.06307150185110844, "grad_norm": 190.83424377441406, "learning_rate": 3e-06, "loss": 54.1227, "step": 707 }, { "epoch": 0.0631607118961595, "grad_norm": 151.27175903320312, "learning_rate": 3e-06, "loss": 29.2768, "step": 708 }, { "completion_length": 177.89584350585938, "epoch": 0.06324992194121058, "grad_norm": 383.90155029296875, "learning_rate": 3e-06, "loss": -57.4567, "reward": 1.6143542528152466, "reward_std": 1.1507561206817627, "rewards/correctness_reward_func": 1.125, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.041437502950429916, "step": 709, "zero_std_ratio": 0.0 }, { "epoch": 0.06333913198626165, "grad_norm": 228.4014892578125, "learning_rate": 3e-06, "loss": -22.536, "step": 710 }, { "epoch": 0.06342834203131273, "grad_norm": 247.7998809814453, "learning_rate": 3e-06, "loss": -34.7459, "step": 711 }, { "epoch": 0.0635175520763638, "grad_norm": 256.2286376953125, "learning_rate": 3e-06, "loss": -30.4245, "step": 712 }, { "epoch": 0.06360676212141488, "grad_norm": 254.9169158935547, "learning_rate": 3e-06, "loss": -38.2843, "step": 713 }, { "epoch": 0.06369597216646594, "grad_norm": 321.43609619140625, "learning_rate": 3e-06, "loss": -35.5213, "step": 714 }, { "epoch": 0.06378518221151702, "grad_norm": 349.0517272949219, "learning_rate": 3e-06, "loss": -55.9526, "step": 715 }, { "epoch": 0.06387439225656809, "grad_norm": 209.25282287597656, "learning_rate": 3e-06, "loss": -22.3323, "step": 716 }, { "epoch": 0.06396360230161917, "grad_norm": 247.7156219482422, "learning_rate": 3e-06, "loss": -35.9537, "step": 717 }, { "epoch": 0.06405281234667023, "grad_norm": 274.3576965332031, "learning_rate": 3e-06, "loss": -33.4144, "step": 718 }, { "epoch": 0.0641420223917213, "grad_norm": 287.4893798828125, "learning_rate": 3e-06, "loss": -40.4848, "step": 719 }, { "epoch": 0.06423123243677238, "grad_norm": 375.7614440917969, "learning_rate": 3e-06, "loss": -39.3635, "step": 720 }, { "completion_length": 109.22916793823242, "epoch": 0.06432044248182345, "grad_norm": 123.53705596923828, "learning_rate": 3e-06, "loss": -30.919, "reward": 2.3711042404174805, "reward_std": 0.4830681085586548, "rewards/correctness_reward_func": 1.7083333730697632, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1940208300948143, "step": 721, "zero_std_ratio": 0.0 }, { "epoch": 0.06440965252687453, "grad_norm": 114.69866943359375, "learning_rate": 3e-06, "loss": -29.2645, "step": 722 }, { "epoch": 0.0644988625719256, "grad_norm": 118.97370910644531, "learning_rate": 3e-06, "loss": -25.0985, "step": 723 }, { "epoch": 0.06458807261697667, "grad_norm": 121.6566162109375, "learning_rate": 3e-06, "loss": -29.0591, "step": 724 }, { "epoch": 0.06467728266202774, "grad_norm": 182.92691040039062, "learning_rate": 3e-06, "loss": -28.0901, "step": 725 }, { "epoch": 0.06476649270707882, "grad_norm": 156.50718688964844, "learning_rate": 3e-06, "loss": -30.7831, "step": 726 }, { "epoch": 0.06485570275212989, "grad_norm": 132.53089904785156, "learning_rate": 3e-06, "loss": -33.3058, "step": 727 }, { "epoch": 0.06494491279718097, "grad_norm": 112.21791076660156, "learning_rate": 3e-06, "loss": -31.8015, "step": 728 }, { "epoch": 0.06503412284223203, "grad_norm": 130.97052001953125, "learning_rate": 3e-06, "loss": -27.7953, "step": 729 }, { "epoch": 0.06512333288728311, "grad_norm": 128.9853515625, "learning_rate": 3e-06, "loss": -32.2601, "step": 730 }, { "epoch": 0.06521254293233418, "grad_norm": 146.0636444091797, "learning_rate": 3e-06, "loss": -31.9068, "step": 731 }, { "epoch": 0.06530175297738525, "grad_norm": 148.94358825683594, "learning_rate": 3e-06, "loss": -35.98, "step": 732 }, { "completion_length": 133.77083587646484, "epoch": 0.06539096302243633, "grad_norm": 200.87583923339844, "learning_rate": 3e-06, "loss": 5.8156, "reward": 1.7448542714118958, "reward_std": 0.7897588908672333, "rewards/correctness_reward_func": 1.1666666865348816, "rewards/int_reward_func": 0.4479166716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1302708424627781, "step": 733, "zero_std_ratio": 0.0 }, { "epoch": 0.0654801730674874, "grad_norm": 254.33050537109375, "learning_rate": 3e-06, "loss": 39.6914, "step": 734 }, { "epoch": 0.06556938311253847, "grad_norm": 207.40354919433594, "learning_rate": 3e-06, "loss": 18.8055, "step": 735 }, { "epoch": 0.06565859315758954, "grad_norm": 255.23114013671875, "learning_rate": 3e-06, "loss": 12.5947, "step": 736 }, { "epoch": 0.06574780320264062, "grad_norm": 183.82200622558594, "learning_rate": 3e-06, "loss": 11.0044, "step": 737 }, { "epoch": 0.06583701324769169, "grad_norm": 226.2420654296875, "learning_rate": 3e-06, "loss": 0.4847, "step": 738 }, { "epoch": 0.06592622329274277, "grad_norm": 233.6065673828125, "learning_rate": 3e-06, "loss": 4.5426, "step": 739 }, { "epoch": 0.06601543333779383, "grad_norm": 264.205078125, "learning_rate": 3e-06, "loss": 39.3686, "step": 740 }, { "epoch": 0.06610464338284491, "grad_norm": 241.85284423828125, "learning_rate": 3e-06, "loss": 17.7268, "step": 741 }, { "epoch": 0.06619385342789598, "grad_norm": 221.0516357421875, "learning_rate": 3e-06, "loss": 13.2982, "step": 742 }, { "epoch": 0.06628306347294706, "grad_norm": 212.37222290039062, "learning_rate": 3e-06, "loss": 7.7467, "step": 743 }, { "epoch": 0.06637227351799813, "grad_norm": 263.42919921875, "learning_rate": 3e-06, "loss": -2.1754, "step": 744 }, { "completion_length": 125.64583587646484, "epoch": 0.0664614835630492, "grad_norm": 193.34835815429688, "learning_rate": 3e-06, "loss": 40.2546, "reward": 1.7169584035873413, "reward_std": 0.4941745698451996, "rewards/correctness_reward_func": 1.0833333730697632, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15445833653211594, "step": 745, "zero_std_ratio": 0.0 }, { "epoch": 0.06655069360810027, "grad_norm": 212.19464111328125, "learning_rate": 3e-06, "loss": 41.4745, "step": 746 }, { "epoch": 0.06663990365315134, "grad_norm": 237.55845642089844, "learning_rate": 3e-06, "loss": 47.5379, "step": 747 }, { "epoch": 0.06672911369820242, "grad_norm": 185.4857635498047, "learning_rate": 3e-06, "loss": 43.0404, "step": 748 }, { "epoch": 0.06681832374325349, "grad_norm": 158.21678161621094, "learning_rate": 3e-06, "loss": 32.8558, "step": 749 }, { "epoch": 0.06690753378830457, "grad_norm": 212.29397583007812, "learning_rate": 3e-06, "loss": 41.9557, "step": 750 }, { "epoch": 0.06699674383335563, "grad_norm": 181.2362060546875, "learning_rate": 3e-06, "loss": 38.4185, "step": 751 }, { "epoch": 0.06708595387840671, "grad_norm": 186.73841857910156, "learning_rate": 3e-06, "loss": 37.1992, "step": 752 }, { "epoch": 0.06717516392345778, "grad_norm": 182.0499267578125, "learning_rate": 3e-06, "loss": 42.4944, "step": 753 }, { "epoch": 0.06726437396850886, "grad_norm": 161.4265899658203, "learning_rate": 3e-06, "loss": 40.1143, "step": 754 }, { "epoch": 0.06735358401355993, "grad_norm": 145.66175842285156, "learning_rate": 3e-06, "loss": 29.066, "step": 755 }, { "epoch": 0.067442794058611, "grad_norm": 188.43362426757812, "learning_rate": 3e-06, "loss": 34.8647, "step": 756 }, { "completion_length": 115.64583587646484, "epoch": 0.06753200410366207, "grad_norm": 176.6951446533203, "learning_rate": 3e-06, "loss": 23.7273, "reward": 1.9728541374206543, "reward_std": 0.22583025321364403, "rewards/correctness_reward_func": 1.375, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17077083885669708, "step": 757, "zero_std_ratio": 0.0 }, { "epoch": 0.06762121414871315, "grad_norm": 134.009521484375, "learning_rate": 3e-06, "loss": 29.2333, "step": 758 }, { "epoch": 0.06771042419376422, "grad_norm": 131.3091278076172, "learning_rate": 3e-06, "loss": 29.9313, "step": 759 }, { "epoch": 0.06779963423881528, "grad_norm": 128.05767822265625, "learning_rate": 3e-06, "loss": 22.7093, "step": 760 }, { "epoch": 0.06788884428386636, "grad_norm": 115.9305419921875, "learning_rate": 3e-06, "loss": 23.3578, "step": 761 }, { "epoch": 0.06797805432891743, "grad_norm": 127.96971893310547, "learning_rate": 3e-06, "loss": 26.8069, "step": 762 }, { "epoch": 0.06806726437396851, "grad_norm": 172.38279724121094, "learning_rate": 3e-06, "loss": 20.0156, "step": 763 }, { "epoch": 0.06815647441901958, "grad_norm": 122.19217681884766, "learning_rate": 3e-06, "loss": 23.256, "step": 764 }, { "epoch": 0.06824568446407066, "grad_norm": 98.1166763305664, "learning_rate": 3e-06, "loss": 25.2087, "step": 765 }, { "epoch": 0.06833489450912172, "grad_norm": 104.1299819946289, "learning_rate": 3e-06, "loss": 18.9453, "step": 766 }, { "epoch": 0.0684241045541728, "grad_norm": 108.13124084472656, "learning_rate": 3e-06, "loss": 18.3841, "step": 767 }, { "epoch": 0.06851331459922387, "grad_norm": 107.33203887939453, "learning_rate": 3e-06, "loss": 21.8827, "step": 768 }, { "completion_length": 140.7291717529297, "epoch": 0.06860252464427495, "grad_norm": 176.08241271972656, "learning_rate": 3e-06, "loss": 1.9837, "reward": 2.0843957662582397, "reward_std": 0.670438677072525, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.46875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11564583331346512, "step": 769, "zero_std_ratio": 0.0 }, { "epoch": 0.06869173468932602, "grad_norm": 184.2529754638672, "learning_rate": 3e-06, "loss": -9.1612, "step": 770 }, { "epoch": 0.0687809447343771, "grad_norm": 124.57987213134766, "learning_rate": 3e-06, "loss": -1.167, "step": 771 }, { "epoch": 0.06887015477942816, "grad_norm": 146.37869262695312, "learning_rate": 3e-06, "loss": 7.9908, "step": 772 }, { "epoch": 0.06895936482447923, "grad_norm": 159.84788513183594, "learning_rate": 3e-06, "loss": -7.8619, "step": 773 }, { "epoch": 0.06904857486953031, "grad_norm": 165.3255157470703, "learning_rate": 3e-06, "loss": -9.9255, "step": 774 }, { "epoch": 0.06913778491458138, "grad_norm": 147.72352600097656, "learning_rate": 3e-06, "loss": 0.4622, "step": 775 }, { "epoch": 0.06922699495963246, "grad_norm": 161.2108917236328, "learning_rate": 3e-06, "loss": -9.3763, "step": 776 }, { "epoch": 0.06931620500468352, "grad_norm": 117.9613265991211, "learning_rate": 3e-06, "loss": -2.9584, "step": 777 }, { "epoch": 0.0694054150497346, "grad_norm": 127.0103988647461, "learning_rate": 3e-06, "loss": 4.5212, "step": 778 }, { "epoch": 0.06949462509478567, "grad_norm": 161.12701416015625, "learning_rate": 3e-06, "loss": -7.3502, "step": 779 }, { "epoch": 0.06958383513983675, "grad_norm": 147.03277587890625, "learning_rate": 3e-06, "loss": -12.0696, "step": 780 }, { "completion_length": 146.75000762939453, "epoch": 0.06967304518488782, "grad_norm": 93.77994537353516, "learning_rate": 3e-06, "loss": -9.5819, "reward": 2.0504584312438965, "reward_std": 0.46765226125717163, "rewards/correctness_reward_func": 1.4166666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.010416666977107525, "rewards/xmlcount_reward_func": 0.12337498925626278, "step": 781, "zero_std_ratio": 0.0 }, { "epoch": 0.0697622552299389, "grad_norm": 109.34754180908203, "learning_rate": 3e-06, "loss": -14.1501, "step": 782 }, { "epoch": 0.06985146527498996, "grad_norm": 83.16600799560547, "learning_rate": 3e-06, "loss": -17.3924, "step": 783 }, { "epoch": 0.06994067532004104, "grad_norm": 83.6520767211914, "learning_rate": 3e-06, "loss": -3.3917, "step": 784 }, { "epoch": 0.07002988536509211, "grad_norm": 111.59048461914062, "learning_rate": 3e-06, "loss": -18.6001, "step": 785 }, { "epoch": 0.07011909541014318, "grad_norm": 81.26487731933594, "learning_rate": 3e-06, "loss": -4.8149, "step": 786 }, { "epoch": 0.07020830545519426, "grad_norm": 88.5013198852539, "learning_rate": 3e-06, "loss": -10.8751, "step": 787 }, { "epoch": 0.07029751550024532, "grad_norm": 106.22066497802734, "learning_rate": 3e-06, "loss": -14.8712, "step": 788 }, { "epoch": 0.0703867255452964, "grad_norm": 95.8133544921875, "learning_rate": 3e-06, "loss": -17.8064, "step": 789 }, { "epoch": 0.07047593559034747, "grad_norm": 105.98171997070312, "learning_rate": 3e-06, "loss": -4.9442, "step": 790 }, { "epoch": 0.07056514563539855, "grad_norm": 108.74724578857422, "learning_rate": 3e-06, "loss": -20.1345, "step": 791 }, { "epoch": 0.07065435568044962, "grad_norm": 217.0005340576172, "learning_rate": 3e-06, "loss": -5.632, "step": 792 }, { "completion_length": 137.1041717529297, "epoch": 0.0707435657255007, "grad_norm": 186.0244140625, "learning_rate": 3e-06, "loss": 20.4605, "reward": 1.8869168162345886, "reward_std": 0.7256337702274323, "rewards/correctness_reward_func": 1.2916666567325592, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13691667094826698, "step": 793, "zero_std_ratio": 0.0 }, { "epoch": 0.07083277577055176, "grad_norm": 139.76405334472656, "learning_rate": 3e-06, "loss": 14.9958, "step": 794 }, { "epoch": 0.07092198581560284, "grad_norm": 155.77352905273438, "learning_rate": 3e-06, "loss": 24.0522, "step": 795 }, { "epoch": 0.07101119586065391, "grad_norm": 147.90013122558594, "learning_rate": 3e-06, "loss": 30.8391, "step": 796 }, { "epoch": 0.07110040590570499, "grad_norm": 154.99143981933594, "learning_rate": 3e-06, "loss": 24.957, "step": 797 }, { "epoch": 0.07118961595075605, "grad_norm": 154.12411499023438, "learning_rate": 3e-06, "loss": 32.2635, "step": 798 }, { "epoch": 0.07127882599580712, "grad_norm": 142.52955627441406, "learning_rate": 3e-06, "loss": 19.5332, "step": 799 }, { "epoch": 0.0713680360408582, "grad_norm": 140.1791229248047, "learning_rate": 3e-06, "loss": 13.5796, "step": 800 }, { "epoch": 0.07145724608590927, "grad_norm": 144.1186981201172, "learning_rate": 3e-06, "loss": 21.398, "step": 801 }, { "epoch": 0.07154645613096035, "grad_norm": 139.25230407714844, "learning_rate": 3e-06, "loss": 28.6905, "step": 802 }, { "epoch": 0.07163566617601141, "grad_norm": 151.95538330078125, "learning_rate": 3e-06, "loss": 21.6279, "step": 803 }, { "epoch": 0.0717248762210625, "grad_norm": 143.84974670410156, "learning_rate": 3e-06, "loss": 29.0759, "step": 804 }, { "completion_length": 136.22916793823242, "epoch": 0.07181408626611356, "grad_norm": 92.62751770019531, "learning_rate": 3e-06, "loss": -21.4484, "reward": 2.414271116256714, "reward_std": 0.33920496702194214, "rewards/correctness_reward_func": 1.8333333134651184, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12260417267680168, "step": 805, "zero_std_ratio": 0.0 }, { "epoch": 0.07190329631116464, "grad_norm": 60.42007827758789, "learning_rate": 3e-06, "loss": -12.7491, "step": 806 }, { "epoch": 0.07199250635621571, "grad_norm": 94.1617660522461, "learning_rate": 3e-06, "loss": -14.5131, "step": 807 }, { "epoch": 0.07208171640126679, "grad_norm": 68.65794372558594, "learning_rate": 3e-06, "loss": -14.1192, "step": 808 }, { "epoch": 0.07217092644631785, "grad_norm": 79.4446792602539, "learning_rate": 3e-06, "loss": -12.4833, "step": 809 }, { "epoch": 0.07226013649136893, "grad_norm": 82.45279693603516, "learning_rate": 3e-06, "loss": -8.9581, "step": 810 }, { "epoch": 0.07234934653642, "grad_norm": 101.833984375, "learning_rate": 3e-06, "loss": -22.4901, "step": 811 }, { "epoch": 0.07243855658147107, "grad_norm": 78.60984802246094, "learning_rate": 3e-06, "loss": -12.9701, "step": 812 }, { "epoch": 0.07252776662652215, "grad_norm": 100.63545989990234, "learning_rate": 3e-06, "loss": -15.4353, "step": 813 }, { "epoch": 0.07261697667157321, "grad_norm": 66.36518096923828, "learning_rate": 3e-06, "loss": -14.7871, "step": 814 }, { "epoch": 0.0727061867166243, "grad_norm": 73.31499481201172, "learning_rate": 3e-06, "loss": -13.5328, "step": 815 }, { "epoch": 0.07279539676167536, "grad_norm": 81.8609848022461, "learning_rate": 3e-06, "loss": -10.202, "step": 816 }, { "completion_length": 176.0416717529297, "epoch": 0.07288460680672644, "grad_norm": 85.71199035644531, "learning_rate": 3e-06, "loss": -2.3775, "reward": 1.308291733264923, "reward_std": 0.4305167943239212, "rewards/correctness_reward_func": 0.8333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016624998301267624, "step": 817, "zero_std_ratio": 0.0 }, { "epoch": 0.0729738168517775, "grad_norm": 113.99066162109375, "learning_rate": 3e-06, "loss": 3.9682, "step": 818 }, { "epoch": 0.07306302689682859, "grad_norm": 90.42432403564453, "learning_rate": 3e-06, "loss": -8.968, "step": 819 }, { "epoch": 0.07315223694187965, "grad_norm": 86.66998291015625, "learning_rate": 3e-06, "loss": 5.9931, "step": 820 }, { "epoch": 0.07324144698693073, "grad_norm": 112.87352752685547, "learning_rate": 3e-06, "loss": 3.7934, "step": 821 }, { "epoch": 0.0733306570319818, "grad_norm": 116.59276580810547, "learning_rate": 3e-06, "loss": -12.3296, "step": 822 }, { "epoch": 0.07341986707703288, "grad_norm": 95.53129577636719, "learning_rate": 3e-06, "loss": -3.2342, "step": 823 }, { "epoch": 0.07350907712208395, "grad_norm": 134.6486053466797, "learning_rate": 3e-06, "loss": 2.4981, "step": 824 }, { "epoch": 0.07359828716713501, "grad_norm": 80.79833221435547, "learning_rate": 3e-06, "loss": -9.3798, "step": 825 }, { "epoch": 0.07368749721218609, "grad_norm": 107.38970184326172, "learning_rate": 3e-06, "loss": 4.8459, "step": 826 }, { "epoch": 0.07377670725723716, "grad_norm": 91.76937866210938, "learning_rate": 3e-06, "loss": 2.6928, "step": 827 }, { "epoch": 0.07386591730228824, "grad_norm": 112.15656280517578, "learning_rate": 3e-06, "loss": -13.4598, "step": 828 }, { "completion_length": 161.2916717529297, "epoch": 0.0739551273473393, "grad_norm": 225.7854461669922, "learning_rate": 3e-06, "loss": -7.2711, "reward": 1.8568333387374878, "reward_std": 0.38829553686082363, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0755833312869072, "step": 829, "zero_std_ratio": 0.0 }, { "epoch": 0.07404433739239039, "grad_norm": 203.64381408691406, "learning_rate": 3e-06, "loss": -24.0722, "step": 830 }, { "epoch": 0.07413354743744145, "grad_norm": 470.9283752441406, "learning_rate": 3e-06, "loss": 7.9581, "step": 831 }, { "epoch": 0.07422275748249253, "grad_norm": 261.8198547363281, "learning_rate": 3e-06, "loss": -13.8961, "step": 832 }, { "epoch": 0.0743119675275436, "grad_norm": 238.60263061523438, "learning_rate": 3e-06, "loss": -17.5778, "step": 833 }, { "epoch": 0.07440117757259468, "grad_norm": 251.20684814453125, "learning_rate": 3e-06, "loss": -13.8026, "step": 834 }, { "epoch": 0.07449038761764575, "grad_norm": 233.15805053710938, "learning_rate": 3e-06, "loss": -9.8419, "step": 835 }, { "epoch": 0.07457959766269683, "grad_norm": 188.42831420898438, "learning_rate": 3e-06, "loss": -27.1095, "step": 836 }, { "epoch": 0.07466880770774789, "grad_norm": 330.0888671875, "learning_rate": 3e-06, "loss": 2.663, "step": 837 }, { "epoch": 0.07475801775279897, "grad_norm": 187.619873046875, "learning_rate": 3e-06, "loss": -17.3052, "step": 838 }, { "epoch": 0.07484722779785004, "grad_norm": 273.087646484375, "learning_rate": 3e-06, "loss": -20.1137, "step": 839 }, { "epoch": 0.0749364378429011, "grad_norm": 231.94540405273438, "learning_rate": 3e-06, "loss": -17.6819, "step": 840 }, { "completion_length": 124.5625, "epoch": 0.07502564788795218, "grad_norm": 169.8292236328125, "learning_rate": 3e-06, "loss": -14.9169, "reward": 2.1858333349227905, "reward_std": 0.5357859879732132, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4270833432674408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17541665583848953, "step": 841, "zero_std_ratio": 0.0 }, { "epoch": 0.07511485793300325, "grad_norm": 151.44129943847656, "learning_rate": 3e-06, "loss": -38.7775, "step": 842 }, { "epoch": 0.07520406797805433, "grad_norm": 141.34671020507812, "learning_rate": 3e-06, "loss": -32.3555, "step": 843 }, { "epoch": 0.0752932780231054, "grad_norm": 117.83955383300781, "learning_rate": 3e-06, "loss": -35.7298, "step": 844 }, { "epoch": 0.07538248806815648, "grad_norm": 113.38582611083984, "learning_rate": 3e-06, "loss": -36.8355, "step": 845 }, { "epoch": 0.07547169811320754, "grad_norm": 147.53521728515625, "learning_rate": 3e-06, "loss": -34.7305, "step": 846 }, { "epoch": 0.07556090815825862, "grad_norm": 167.8444061279297, "learning_rate": 3e-06, "loss": -17.6609, "step": 847 }, { "epoch": 0.07565011820330969, "grad_norm": 177.19976806640625, "learning_rate": 3e-06, "loss": -42.6765, "step": 848 }, { "epoch": 0.07573932824836077, "grad_norm": 207.4672393798828, "learning_rate": 3e-06, "loss": -36.7629, "step": 849 }, { "epoch": 0.07582853829341184, "grad_norm": 124.84293365478516, "learning_rate": 3e-06, "loss": -39.1349, "step": 850 }, { "epoch": 0.07591774833846292, "grad_norm": 134.89764404296875, "learning_rate": 3e-06, "loss": -41.2224, "step": 851 }, { "epoch": 0.07600695838351398, "grad_norm": 161.6527862548828, "learning_rate": 3e-06, "loss": -39.203, "step": 852 }, { "completion_length": 147.7291717529297, "epoch": 0.07609616842856505, "grad_norm": 371.6798095703125, "learning_rate": 3e-06, "loss": 88.7083, "reward": 2.0986876487731934, "reward_std": 0.5909168422222137, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09868749976158142, "step": 853, "zero_std_ratio": 0.0 }, { "epoch": 0.07618537847361613, "grad_norm": 385.34136962890625, "learning_rate": 3e-06, "loss": 86.0723, "step": 854 }, { "epoch": 0.0762745885186672, "grad_norm": 360.78021240234375, "learning_rate": 3e-06, "loss": 71.1204, "step": 855 }, { "epoch": 0.07636379856371828, "grad_norm": 293.267333984375, "learning_rate": 3e-06, "loss": 66.9494, "step": 856 }, { "epoch": 0.07645300860876934, "grad_norm": 440.7154846191406, "learning_rate": 3e-06, "loss": 88.5771, "step": 857 }, { "epoch": 0.07654221865382042, "grad_norm": 327.457275390625, "learning_rate": 3e-06, "loss": 58.8516, "step": 858 }, { "epoch": 0.07663142869887149, "grad_norm": 371.9436340332031, "learning_rate": 3e-06, "loss": 85.2973, "step": 859 }, { "epoch": 0.07672063874392257, "grad_norm": 389.5568542480469, "learning_rate": 3e-06, "loss": 79.8231, "step": 860 }, { "epoch": 0.07680984878897364, "grad_norm": 321.9656066894531, "learning_rate": 3e-06, "loss": 63.4841, "step": 861 }, { "epoch": 0.07689905883402472, "grad_norm": 284.66876220703125, "learning_rate": 3e-06, "loss": 59.0441, "step": 862 }, { "epoch": 0.07698826887907578, "grad_norm": 410.6514587402344, "learning_rate": 3e-06, "loss": 76.3094, "step": 863 }, { "epoch": 0.07707747892412686, "grad_norm": 284.0197448730469, "learning_rate": 3e-06, "loss": 49.468, "step": 864 }, { "completion_length": 144.1041717529297, "epoch": 0.07716668896917793, "grad_norm": 226.17822265625, "learning_rate": 3e-06, "loss": 56.9406, "reward": 2.379916787147522, "reward_std": 0.430880606174469, "rewards/correctness_reward_func": 1.7916666865348816, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08824999909847975, "step": 865, "zero_std_ratio": 0.0 }, { "epoch": 0.077255899014229, "grad_norm": 186.55560302734375, "learning_rate": 3e-06, "loss": 52.0502, "step": 866 }, { "epoch": 0.07734510905928008, "grad_norm": 184.80516052246094, "learning_rate": 3e-06, "loss": 46.5618, "step": 867 }, { "epoch": 0.07743431910433114, "grad_norm": 178.5349884033203, "learning_rate": 3e-06, "loss": 46.2696, "step": 868 }, { "epoch": 0.07752352914938222, "grad_norm": 148.83154296875, "learning_rate": 3e-06, "loss": 29.464, "step": 869 }, { "epoch": 0.07761273919443329, "grad_norm": 161.14889526367188, "learning_rate": 3e-06, "loss": 51.0483, "step": 870 }, { "epoch": 0.07770194923948437, "grad_norm": 192.32308959960938, "learning_rate": 3e-06, "loss": 47.9675, "step": 871 }, { "epoch": 0.07779115928453544, "grad_norm": 152.79583740234375, "learning_rate": 3e-06, "loss": 43.9552, "step": 872 }, { "epoch": 0.07788036932958652, "grad_norm": 151.7612762451172, "learning_rate": 3e-06, "loss": 38.7329, "step": 873 }, { "epoch": 0.07796957937463758, "grad_norm": 133.1282196044922, "learning_rate": 3e-06, "loss": 38.5692, "step": 874 }, { "epoch": 0.07805878941968866, "grad_norm": 103.07962036132812, "learning_rate": 3e-06, "loss": 24.2915, "step": 875 }, { "epoch": 0.07814799946473973, "grad_norm": 129.3807373046875, "learning_rate": 3e-06, "loss": 43.1067, "step": 876 }, { "completion_length": 127.54166793823242, "epoch": 0.07823720950979081, "grad_norm": 125.33885192871094, "learning_rate": 3e-06, "loss": -23.6161, "reward": 1.667020857334137, "reward_std": 0.5597978234291077, "rewards/correctness_reward_func": 1.0833333432674408, "rewards/int_reward_func": 0.40625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17743750661611557, "step": 877, "zero_std_ratio": 0.0 }, { "epoch": 0.07832641955484188, "grad_norm": 138.50013732910156, "learning_rate": 3e-06, "loss": -16.1757, "step": 878 }, { "epoch": 0.07841562959989294, "grad_norm": 130.26280212402344, "learning_rate": 3e-06, "loss": -21.8816, "step": 879 }, { "epoch": 0.07850483964494402, "grad_norm": 141.026123046875, "learning_rate": 3e-06, "loss": -22.0761, "step": 880 }, { "epoch": 0.07859404968999509, "grad_norm": 126.53893280029297, "learning_rate": 3e-06, "loss": -23.4112, "step": 881 }, { "epoch": 0.07868325973504617, "grad_norm": 153.45120239257812, "learning_rate": 3e-06, "loss": -17.7169, "step": 882 }, { "epoch": 0.07877246978009723, "grad_norm": 122.84283447265625, "learning_rate": 3e-06, "loss": -24.8704, "step": 883 }, { "epoch": 0.07886167982514831, "grad_norm": 157.95201110839844, "learning_rate": 3e-06, "loss": -16.2334, "step": 884 }, { "epoch": 0.07895088987019938, "grad_norm": 136.01124572753906, "learning_rate": 3e-06, "loss": -23.3998, "step": 885 }, { "epoch": 0.07904009991525046, "grad_norm": 135.98423767089844, "learning_rate": 3e-06, "loss": -23.6646, "step": 886 }, { "epoch": 0.07912930996030153, "grad_norm": 131.24002075195312, "learning_rate": 3e-06, "loss": -25.7454, "step": 887 }, { "epoch": 0.07921852000535261, "grad_norm": 124.26398468017578, "learning_rate": 3e-06, "loss": -20.446, "step": 888 }, { "completion_length": 153.58333587646484, "epoch": 0.07930773005040367, "grad_norm": 240.47964477539062, "learning_rate": 3e-06, "loss": -52.9796, "reward": 1.8959583044052124, "reward_std": 0.6871494352817535, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10429166257381439, "step": 889, "zero_std_ratio": 0.0 }, { "epoch": 0.07939694009545475, "grad_norm": 205.4910430908203, "learning_rate": 3e-06, "loss": -52.7249, "step": 890 }, { "epoch": 0.07948615014050582, "grad_norm": 242.2780303955078, "learning_rate": 3e-06, "loss": -67.3242, "step": 891 }, { "epoch": 0.07957536018555689, "grad_norm": 262.0589599609375, "learning_rate": 3e-06, "loss": -66.9661, "step": 892 }, { "epoch": 0.07966457023060797, "grad_norm": 186.11415100097656, "learning_rate": 3e-06, "loss": -68.4566, "step": 893 }, { "epoch": 0.07975378027565903, "grad_norm": 254.95228576660156, "learning_rate": 3e-06, "loss": -66.0288, "step": 894 }, { "epoch": 0.07984299032071011, "grad_norm": 270.2388000488281, "learning_rate": 3e-06, "loss": -59.1402, "step": 895 }, { "epoch": 0.07993220036576118, "grad_norm": 232.1254119873047, "learning_rate": 3e-06, "loss": -58.3748, "step": 896 }, { "epoch": 0.08002141041081226, "grad_norm": 423.2415466308594, "learning_rate": 3e-06, "loss": -74.5377, "step": 897 }, { "epoch": 0.08011062045586333, "grad_norm": 289.6065673828125, "learning_rate": 3e-06, "loss": -76.1074, "step": 898 }, { "epoch": 0.0801998305009144, "grad_norm": 212.4766845703125, "learning_rate": 3e-06, "loss": -74.2601, "step": 899 }, { "epoch": 0.08028904054596547, "grad_norm": 286.6225891113281, "learning_rate": 3e-06, "loss": -74.3892, "step": 900 }, { "completion_length": 119.62500381469727, "epoch": 0.08037825059101655, "grad_norm": 123.23504638671875, "learning_rate": 3e-06, "loss": -22.0192, "reward": 2.5415626764297485, "reward_std": 0.190566536039114, "rewards/correctness_reward_func": 1.9166666865348816, "rewards/int_reward_func": 0.4791666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1457291655242443, "step": 901, "zero_std_ratio": 0.0 }, { "epoch": 0.08046746063606762, "grad_norm": 134.62196350097656, "learning_rate": 3e-06, "loss": -26.971, "step": 902 }, { "epoch": 0.0805566706811187, "grad_norm": 211.32479858398438, "learning_rate": 3e-06, "loss": -31.3724, "step": 903 }, { "epoch": 0.08064588072616977, "grad_norm": 160.55055236816406, "learning_rate": 3e-06, "loss": -22.228, "step": 904 }, { "epoch": 0.08073509077122083, "grad_norm": 125.40478515625, "learning_rate": 3e-06, "loss": -21.2257, "step": 905 }, { "epoch": 0.08082430081627191, "grad_norm": 111.1106948852539, "learning_rate": 3e-06, "loss": -22.3095, "step": 906 }, { "epoch": 0.08091351086132298, "grad_norm": 122.1114501953125, "learning_rate": 3e-06, "loss": -24.7909, "step": 907 }, { "epoch": 0.08100272090637406, "grad_norm": 156.01158142089844, "learning_rate": 3e-06, "loss": -31.0448, "step": 908 }, { "epoch": 0.08109193095142513, "grad_norm": 158.0888214111328, "learning_rate": 3e-06, "loss": -35.1506, "step": 909 }, { "epoch": 0.0811811409964762, "grad_norm": 156.11680603027344, "learning_rate": 3e-06, "loss": -26.2504, "step": 910 }, { "epoch": 0.08127035104152727, "grad_norm": 136.36370849609375, "learning_rate": 3e-06, "loss": -24.5191, "step": 911 }, { "epoch": 0.08135956108657835, "grad_norm": 138.4123077392578, "learning_rate": 3e-06, "loss": -25.2287, "step": 912 }, { "completion_length": 123.97917175292969, "epoch": 0.08144877113162942, "grad_norm": 69.13970184326172, "learning_rate": 3e-06, "loss": -4.2059, "reward": 2.349874973297119, "reward_std": 0.39924251288175583, "rewards/correctness_reward_func": 1.75, "rewards/int_reward_func": 0.4583333283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14154166728258133, "step": 913, "zero_std_ratio": 0.0 }, { "epoch": 0.0815379811766805, "grad_norm": 109.77488708496094, "learning_rate": 3e-06, "loss": -6.6804, "step": 914 }, { "epoch": 0.08162719122173157, "grad_norm": 108.82147216796875, "learning_rate": 3e-06, "loss": 1.8191, "step": 915 }, { "epoch": 0.08171640126678265, "grad_norm": 88.40335083007812, "learning_rate": 3e-06, "loss": -5.8692, "step": 916 }, { "epoch": 0.08180561131183371, "grad_norm": 76.1854019165039, "learning_rate": 3e-06, "loss": -1.3803, "step": 917 }, { "epoch": 0.08189482135688479, "grad_norm": 94.09133911132812, "learning_rate": 3e-06, "loss": -2.3375, "step": 918 }, { "epoch": 0.08198403140193586, "grad_norm": 84.88536071777344, "learning_rate": 3e-06, "loss": -5.6229, "step": 919 }, { "epoch": 0.08207324144698692, "grad_norm": 92.1208267211914, "learning_rate": 3e-06, "loss": -7.5509, "step": 920 }, { "epoch": 0.082162451492038, "grad_norm": 89.02661895751953, "learning_rate": 3e-06, "loss": 0.5948, "step": 921 }, { "epoch": 0.08225166153708907, "grad_norm": 95.09249114990234, "learning_rate": 3e-06, "loss": -6.4904, "step": 922 }, { "epoch": 0.08234087158214015, "grad_norm": 83.8741683959961, "learning_rate": 3e-06, "loss": -2.875, "step": 923 }, { "epoch": 0.08243008162719122, "grad_norm": 129.45420837402344, "learning_rate": 3e-06, "loss": -3.085, "step": 924 }, { "completion_length": 115.27083587646484, "epoch": 0.0825192916722423, "grad_norm": 250.4253387451172, "learning_rate": 3e-06, "loss": 9.4605, "reward": 2.1223334074020386, "reward_std": 0.7158277630805969, "rewards/correctness_reward_func": 1.5, "rewards/int_reward_func": 0.4166666716337204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.205666683614254, "step": 925, "zero_std_ratio": 0.0 }, { "epoch": 0.08260850171729336, "grad_norm": 318.8369140625, "learning_rate": 3e-06, "loss": 16.4358, "step": 926 }, { "epoch": 0.08269771176234444, "grad_norm": 314.372314453125, "learning_rate": 3e-06, "loss": 5.4103, "step": 927 }, { "epoch": 0.08278692180739551, "grad_norm": 255.00933837890625, "learning_rate": 3e-06, "loss": -3.1256, "step": 928 }, { "epoch": 0.08287613185244659, "grad_norm": 357.3619384765625, "learning_rate": 3e-06, "loss": 3.4132, "step": 929 }, { "epoch": 0.08296534189749766, "grad_norm": 409.3254089355469, "learning_rate": 3e-06, "loss": 23.0602, "step": 930 }, { "epoch": 0.08305455194254874, "grad_norm": 270.6861877441406, "learning_rate": 3e-06, "loss": 8.4527, "step": 931 }, { "epoch": 0.0831437619875998, "grad_norm": 507.520263671875, "learning_rate": 3e-06, "loss": 14.712, "step": 932 }, { "epoch": 0.08323297203265087, "grad_norm": 281.0194091796875, "learning_rate": 3e-06, "loss": 4.5989, "step": 933 }, { "epoch": 0.08332218207770195, "grad_norm": 275.3479309082031, "learning_rate": 3e-06, "loss": -5.1609, "step": 934 }, { "epoch": 0.08341139212275302, "grad_norm": 358.3206481933594, "learning_rate": 3e-06, "loss": 3.3467, "step": 935 }, { "epoch": 0.0835006021678041, "grad_norm": 403.45440673828125, "learning_rate": 3e-06, "loss": 20.7584, "step": 936 }, { "completion_length": 131.7291717529297, "epoch": 0.08358981221285516, "grad_norm": 112.89104461669922, "learning_rate": 3e-06, "loss": -12.4613, "reward": 1.9657083749771118, "reward_std": 0.33454202115535736, "rewards/correctness_reward_func": 1.3333333432674408, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13237499818205833, "step": 937, "zero_std_ratio": 0.0 }, { "epoch": 0.08367902225790624, "grad_norm": 158.0906524658203, "learning_rate": 3e-06, "loss": -6.7935, "step": 938 }, { "epoch": 0.08376823230295731, "grad_norm": 127.69352722167969, "learning_rate": 3e-06, "loss": -15.3734, "step": 939 }, { "epoch": 0.08385744234800839, "grad_norm": 207.05262756347656, "learning_rate": 3e-06, "loss": -25.0891, "step": 940 }, { "epoch": 0.08394665239305946, "grad_norm": 546.4678344726562, "learning_rate": 3e-06, "loss": -29.0194, "step": 941 }, { "epoch": 0.08403586243811054, "grad_norm": 141.02198791503906, "learning_rate": 3e-06, "loss": -10.7265, "step": 942 }, { "epoch": 0.0841250724831616, "grad_norm": 137.6843719482422, "learning_rate": 3e-06, "loss": -13.3029, "step": 943 }, { "epoch": 0.08421428252821268, "grad_norm": 211.74227905273438, "learning_rate": 3e-06, "loss": -8.8958, "step": 944 }, { "epoch": 0.08430349257326375, "grad_norm": 123.87110900878906, "learning_rate": 3e-06, "loss": -16.9913, "step": 945 }, { "epoch": 0.08439270261831482, "grad_norm": 206.8551025390625, "learning_rate": 3e-06, "loss": -26.9321, "step": 946 }, { "epoch": 0.0844819126633659, "grad_norm": 193.33346557617188, "learning_rate": 3e-06, "loss": -30.975, "step": 947 }, { "epoch": 0.08457112270841696, "grad_norm": 147.73297119140625, "learning_rate": 3e-06, "loss": -13.515, "step": 948 }, { "completion_length": 159.95833587646484, "epoch": 0.08466033275346804, "grad_norm": 186.3380889892578, "learning_rate": 3e-06, "loss": 9.8972, "reward": 1.8234166502952576, "reward_std": 0.42612800002098083, "rewards/correctness_reward_func": 1.2916666865348816, "rewards/int_reward_func": 0.4375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09425000101327896, "step": 949, "zero_std_ratio": 0.0 }, { "epoch": 0.08474954279851911, "grad_norm": 178.0104522705078, "learning_rate": 3e-06, "loss": 10.3947, "step": 950 }, { "epoch": 0.08483875284357019, "grad_norm": 160.924560546875, "learning_rate": 3e-06, "loss": 8.1323, "step": 951 }, { "epoch": 0.08492796288862126, "grad_norm": 144.38978576660156, "learning_rate": 3e-06, "loss": 5.1531, "step": 952 }, { "epoch": 0.08501717293367234, "grad_norm": 174.2298126220703, "learning_rate": 3e-06, "loss": 5.4548, "step": 953 }, { "epoch": 0.0851063829787234, "grad_norm": 164.93479919433594, "learning_rate": 3e-06, "loss": 10.7603, "step": 954 }, { "epoch": 0.08519559302377448, "grad_norm": 198.3860626220703, "learning_rate": 3e-06, "loss": 9.1507, "step": 955 }, { "epoch": 0.08528480306882555, "grad_norm": 160.76519775390625, "learning_rate": 3e-06, "loss": 9.4591, "step": 956 }, { "epoch": 0.08537401311387663, "grad_norm": 170.39776611328125, "learning_rate": 3e-06, "loss": 7.0709, "step": 957 }, { "epoch": 0.0854632231589277, "grad_norm": 145.32798767089844, "learning_rate": 3e-06, "loss": 4.1507, "step": 958 }, { "epoch": 0.08555243320397876, "grad_norm": 170.50514221191406, "learning_rate": 3e-06, "loss": 4.4292, "step": 959 }, { "epoch": 0.08564164324902984, "grad_norm": 197.32290649414062, "learning_rate": 3e-06, "loss": 10.1245, "step": 960 }, { "completion_length": 149.68750762939453, "epoch": 0.08573085329408091, "grad_norm": 530.301025390625, "learning_rate": 3e-06, "loss": 6.2918, "reward": 2.106416702270508, "reward_std": 0.5645134299993515, "rewards/correctness_reward_func": 1.5416666865348816, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08558332687243819, "step": 961, "zero_std_ratio": 0.0 }, { "epoch": 0.08582006333913199, "grad_norm": 593.3743286132812, "learning_rate": 3e-06, "loss": -27.7628, "step": 962 }, { "epoch": 0.08590927338418305, "grad_norm": 378.6949157714844, "learning_rate": 3e-06, "loss": -25.6701, "step": 963 }, { "epoch": 0.08599848342923413, "grad_norm": 402.416748046875, "learning_rate": 3e-06, "loss": 10.4982, "step": 964 }, { "epoch": 0.0860876934742852, "grad_norm": 443.3346862792969, "learning_rate": 3e-06, "loss": -39.2448, "step": 965 }, { "epoch": 0.08617690351933628, "grad_norm": 401.20574951171875, "learning_rate": 3e-06, "loss": -68.5925, "step": 966 }, { "epoch": 0.08626611356438735, "grad_norm": 553.38720703125, "learning_rate": 3e-06, "loss": 1.8602, "step": 967 }, { "epoch": 0.08635532360943843, "grad_norm": 628.0134887695312, "learning_rate": 3e-06, "loss": -36.1923, "step": 968 }, { "epoch": 0.0864445336544895, "grad_norm": 380.9430847167969, "learning_rate": 3e-06, "loss": -32.3079, "step": 969 }, { "epoch": 0.08653374369954057, "grad_norm": 385.8163146972656, "learning_rate": 3e-06, "loss": 3.3819, "step": 970 }, { "epoch": 0.08662295374459164, "grad_norm": 432.78118896484375, "learning_rate": 3e-06, "loss": -46.8594, "step": 971 }, { "epoch": 0.08671216378964271, "grad_norm": 439.5821533203125, "learning_rate": 3e-06, "loss": -77.4002, "step": 972 }, { "completion_length": 121.68750381469727, "epoch": 0.08680137383469379, "grad_norm": 35.27005386352539, "learning_rate": 3e-06, "loss": 3.8068, "reward": 2.284437596797943, "reward_std": 0.15820645913481712, "rewards/correctness_reward_func": 1.625, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15943749248981476, "step": 973, "zero_std_ratio": 0.0 }, { "epoch": 0.08689058387974485, "grad_norm": 68.60123443603516, "learning_rate": 3e-06, "loss": 5.8697, "step": 974 }, { "epoch": 0.08697979392479593, "grad_norm": 69.07678985595703, "learning_rate": 3e-06, "loss": 2.0005, "step": 975 }, { "epoch": 0.087069003969847, "grad_norm": 39.30900955200195, "learning_rate": 3e-06, "loss": 6.0659, "step": 976 }, { "epoch": 0.08715821401489808, "grad_norm": 77.51853942871094, "learning_rate": 3e-06, "loss": 5.9335, "step": 977 }, { "epoch": 0.08724742405994915, "grad_norm": 60.07703399658203, "learning_rate": 3e-06, "loss": 3.043, "step": 978 }, { "epoch": 0.08733663410500023, "grad_norm": 39.25843811035156, "learning_rate": 3e-06, "loss": 3.6489, "step": 979 }, { "epoch": 0.0874258441500513, "grad_norm": 46.68893051147461, "learning_rate": 3e-06, "loss": 5.0957, "step": 980 }, { "epoch": 0.08751505419510237, "grad_norm": 55.4852180480957, "learning_rate": 3e-06, "loss": 0.9956, "step": 981 }, { "epoch": 0.08760426424015344, "grad_norm": 51.21168518066406, "learning_rate": 3e-06, "loss": 5.5624, "step": 982 }, { "epoch": 0.08769347428520452, "grad_norm": 64.15937805175781, "learning_rate": 3e-06, "loss": 4.0323, "step": 983 }, { "epoch": 0.08778268433025559, "grad_norm": 65.25579833984375, "learning_rate": 3e-06, "loss": 1.7638, "step": 984 }, { "completion_length": 138.68750762939453, "epoch": 0.08787189437530665, "grad_norm": 277.1241760253906, "learning_rate": 3e-06, "loss": 10.9365, "reward": 2.176750063896179, "reward_std": 0.40325865149497986, "rewards/correctness_reward_func": 1.5833333134651184, "rewards/int_reward_func": 0.4791666567325592, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11425000056624413, "step": 985, "zero_std_ratio": 0.0 }, { "epoch": 0.08796110442035773, "grad_norm": 363.5508117675781, "learning_rate": 3e-06, "loss": -2.6393, "step": 986 }, { "epoch": 0.0880503144654088, "grad_norm": 337.4767150878906, "learning_rate": 3e-06, "loss": 9.4007, "step": 987 }, { "epoch": 0.08813952451045988, "grad_norm": 292.9395751953125, "learning_rate": 3e-06, "loss": -2.1309, "step": 988 }, { "epoch": 0.08822873455551095, "grad_norm": 246.76112365722656, "learning_rate": 3e-06, "loss": -9.132, "step": 989 }, { "epoch": 0.08831794460056203, "grad_norm": 267.3565368652344, "learning_rate": 3e-06, "loss": -11.7626, "step": 990 }, { "epoch": 0.08840715464561309, "grad_norm": 257.4312438964844, "learning_rate": 3e-06, "loss": 5.7132, "step": 991 }, { "epoch": 0.08849636469066417, "grad_norm": 317.8547058105469, "learning_rate": 3e-06, "loss": -9.7682, "step": 992 }, { "epoch": 0.08858557473571524, "grad_norm": 260.3039855957031, "learning_rate": 3e-06, "loss": 2.4633, "step": 993 }, { "epoch": 0.08867478478076632, "grad_norm": 261.14697265625, "learning_rate": 3e-06, "loss": -9.4669, "step": 994 }, { "epoch": 0.08876399482581739, "grad_norm": 181.8609161376953, "learning_rate": 3e-06, "loss": -13.1009, "step": 995 }, { "epoch": 0.08885320487086847, "grad_norm": 236.33563232421875, "learning_rate": 3e-06, "loss": -15.7579, "step": 996 }, { "completion_length": 168.9791717529297, "epoch": 0.08894241491591953, "grad_norm": 502.4021911621094, "learning_rate": 3e-06, "loss": -65.14, "reward": 1.9697707891464233, "reward_std": 0.505499929189682, "rewards/correctness_reward_func": 1.4166666269302368, "rewards/int_reward_func": 0.4895833283662796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06352083757519722, "step": 997, "zero_std_ratio": 0.0 }, { "epoch": 0.0890316249609706, "grad_norm": 564.5300903320312, "learning_rate": 3e-06, "loss": -45.4159, "step": 998 }, { "epoch": 0.08912083500602168, "grad_norm": 453.83160400390625, "learning_rate": 3e-06, "loss": -31.2339, "step": 999 }, { "epoch": 0.08921004505107274, "grad_norm": 449.2571716308594, "learning_rate": 3e-06, "loss": -71.5707, "step": 1000 } ], "logging_steps": 1, "max_steps": 112090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }