diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18787 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2222222222222222, + "eval_steps": 1000, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 246.3125, + "epoch": 8.888888888888889e-05, + "grad_norm": 60.320960998535156, + "learning_rate": 2.5e-07, + "loss": -10.229, + "reward": 1.7395833730697632, + "reward_std": 0.6432403922080994, + "rewards/boxed_and_answer_tags_format_reward": 0.65625, + "rewards/correctness_reward_func_math": 1.0833333432674408, + "step": 1, + "zero_std_ratio": 0.0 + }, + { + "epoch": 0.00017777777777777779, + "grad_norm": 71.0031509399414, + "learning_rate": 5e-07, + "loss": -9.5625, + "step": 2 + }, + { + "epoch": 0.0002666666666666667, + "grad_norm": 61.95022964477539, + "learning_rate": 7.5e-07, + "loss": -16.2291, + "step": 3 + }, + { + "epoch": 0.00035555555555555557, + "grad_norm": 67.81867980957031, + "learning_rate": 1e-06, + "loss": -11.0016, + "step": 4 + }, + { + "epoch": 0.00044444444444444447, + "grad_norm": 57.108917236328125, + "learning_rate": 1.25e-06, + "loss": -6.1658, + "step": 5 + }, + { + "epoch": 0.0005333333333333334, + "grad_norm": 72.85011291503906, + "learning_rate": 1.5e-06, + "loss": -4.0145, + "step": 6 + }, + { + "epoch": 0.0006222222222222223, + "grad_norm": 59.103431701660156, + "learning_rate": 1.7500000000000002e-06, + "loss": -9.9488, + "step": 7 + }, + { + "epoch": 0.0007111111111111111, + "grad_norm": 73.94007873535156, + "learning_rate": 2e-06, + "loss": -9.1809, + "step": 8 + }, + { + "epoch": 0.0008, + "grad_norm": 62.28184509277344, + "learning_rate": 2.25e-06, + "loss": -16.4311, + "step": 9 + }, + { + "epoch": 0.0008888888888888889, + "grad_norm": 68.46251678466797, + "learning_rate": 2.5e-06, + "loss": -11.402, + "step": 10 + }, + { + "epoch": 0.0009777777777777777, + "grad_norm": 59.184749603271484, + "learning_rate": 2.75e-06, + "loss": -6.1384, + "step": 11 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 71.60365295410156, + "learning_rate": 3e-06, + "loss": -3.932, + "step": 12 + }, + { + "completion_length": 249.62500762939453, + "epoch": 0.0011555555555555555, + "grad_norm": 61.79197311401367, + "learning_rate": 3e-06, + "loss": -2.1748, + "reward": 0.9791666865348816, + "reward_std": 0.4510806053876877, + "rewards/boxed_and_answer_tags_format_reward": 0.6458333134651184, + "rewards/correctness_reward_func_math": 0.3333333283662796, + "step": 13, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.0012444444444444445, + "grad_norm": 55.82655334472656, + "learning_rate": 3e-06, + "loss": -3.8107, + "step": 14 + }, + { + "epoch": 0.0013333333333333333, + "grad_norm": 58.3712158203125, + "learning_rate": 3e-06, + "loss": 4.085, + "step": 15 + }, + { + "epoch": 0.0014222222222222223, + "grad_norm": 73.97306823730469, + "learning_rate": 3e-06, + "loss": -4.8915, + "step": 16 + }, + { + "epoch": 0.001511111111111111, + "grad_norm": 51.51576232910156, + "learning_rate": 3e-06, + "loss": -0.161, + "step": 17 + }, + { + "epoch": 0.0016, + "grad_norm": 72.59639739990234, + "learning_rate": 3e-06, + "loss": 3.0369, + "step": 18 + }, + { + "epoch": 0.0016888888888888889, + "grad_norm": 48.86510467529297, + "learning_rate": 3e-06, + "loss": -2.3437, + "step": 19 + }, + { + "epoch": 0.0017777777777777779, + "grad_norm": 55.3180046081543, + "learning_rate": 3e-06, + "loss": -4.3585, + "step": 20 + }, + { + "epoch": 0.0018666666666666666, + "grad_norm": 54.75101089477539, + "learning_rate": 3e-06, + "loss": 3.9965, + "step": 21 + }, + { + "epoch": 0.0019555555555555554, + "grad_norm": 75.27330017089844, + "learning_rate": 3e-06, + "loss": -4.9733, + "step": 22 + }, + { + "epoch": 0.0020444444444444447, + "grad_norm": 51.991214752197266, + "learning_rate": 3e-06, + "loss": -0.3033, + "step": 23 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 54.72827911376953, + "learning_rate": 3e-06, + "loss": 2.5278, + "step": 24 + }, + { + "completion_length": 226.8125, + "epoch": 0.0022222222222222222, + "grad_norm": 58.25025939941406, + "learning_rate": 3e-06, + "loss": -6.7768, + "reward": 1.6041666865348816, + "reward_std": 0.6311438381671906, + "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 25, + "zero_std_ratio": 0.0 + }, + { + "epoch": 0.002311111111111111, + "grad_norm": 68.62551879882812, + "learning_rate": 3e-06, + "loss": -4.0906, + "step": 26 + }, + { + "epoch": 0.0024, + "grad_norm": 65.11053466796875, + "learning_rate": 3e-06, + "loss": -4.6172, + "step": 27 + }, + { + "epoch": 0.002488888888888889, + "grad_norm": 76.28429412841797, + "learning_rate": 3e-06, + "loss": -7.9209, + "step": 28 + }, + { + "epoch": 0.002577777777777778, + "grad_norm": 62.037696838378906, + "learning_rate": 3e-06, + "loss": -3.4414, + "step": 29 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 58.92220687866211, + "learning_rate": 3e-06, + "loss": -3.2836, + "step": 30 + }, + { + "epoch": 0.0027555555555555554, + "grad_norm": 57.03800582885742, + "learning_rate": 3e-06, + "loss": -7.1747, + "step": 31 + }, + { + "epoch": 0.0028444444444444446, + "grad_norm": 71.39422607421875, + "learning_rate": 3e-06, + "loss": -4.5251, + "step": 32 + }, + { + "epoch": 0.0029333333333333334, + "grad_norm": 130.19813537597656, + "learning_rate": 3e-06, + "loss": -4.5744, + "step": 33 + }, + { + "epoch": 0.003022222222222222, + "grad_norm": 76.09828186035156, + "learning_rate": 3e-06, + "loss": -7.9552, + "step": 34 + }, + { + "epoch": 0.003111111111111111, + "grad_norm": 63.77288055419922, + "learning_rate": 3e-06, + "loss": -3.6391, + "step": 35 + }, + { + "epoch": 0.0032, + "grad_norm": 58.53509521484375, + "learning_rate": 3e-06, + "loss": -3.9415, + "step": 36 + }, + { + "completion_length": 245.14583587646484, + "epoch": 0.003288888888888889, + "grad_norm": 53.06296920776367, + "learning_rate": 3e-06, + "loss": 7.1798, + "reward": 0.9375000298023224, + "reward_std": 0.3340114951133728, + "rewards/boxed_and_answer_tags_format_reward": 0.6041666567325592, + "rewards/correctness_reward_func_math": 0.3333333246409893, + "step": 37, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.0033777777777777777, + "grad_norm": 78.04679870605469, + "learning_rate": 3e-06, + "loss": 13.2393, + "step": 38 + }, + { + "epoch": 0.0034666666666666665, + "grad_norm": 64.38521575927734, + "learning_rate": 3e-06, + "loss": 11.5406, + "step": 39 + }, + { + "epoch": 0.0035555555555555557, + "grad_norm": 56.69493865966797, + "learning_rate": 3e-06, + "loss": 11.0537, + "step": 40 + }, + { + "epoch": 0.0036444444444444445, + "grad_norm": 59.67893600463867, + "learning_rate": 3e-06, + "loss": 12.2084, + "step": 41 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 44.71684646606445, + "learning_rate": 3e-06, + "loss": 14.915, + "step": 42 + }, + { + "epoch": 0.003822222222222222, + "grad_norm": 53.003570556640625, + "learning_rate": 3e-06, + "loss": 7.1581, + "step": 43 + }, + { + "epoch": 0.003911111111111111, + "grad_norm": 86.505615234375, + "learning_rate": 3e-06, + "loss": 13.0403, + "step": 44 + }, + { + "epoch": 0.004, + "grad_norm": 73.7258529663086, + "learning_rate": 3e-06, + "loss": 11.1962, + "step": 45 + }, + { + "epoch": 0.004088888888888889, + "grad_norm": 92.57136535644531, + "learning_rate": 3e-06, + "loss": 10.6526, + "step": 46 + }, + { + "epoch": 0.004177777777777778, + "grad_norm": 63.43205642700195, + "learning_rate": 3e-06, + "loss": 11.8212, + "step": 47 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 44.73876953125, + "learning_rate": 3e-06, + "loss": 14.3455, + "step": 48 + }, + { + "completion_length": 236.37500762939453, + "epoch": 0.004355555555555555, + "grad_norm": 42.604164123535156, + "learning_rate": 3e-06, + "loss": -28.8562, + "reward": 1.6145833730697632, + "reward_std": 0.3440491110086441, + "rewards/boxed_and_answer_tags_format_reward": 0.65625, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 49, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.0044444444444444444, + "grad_norm": 55.280696868896484, + "learning_rate": 3e-06, + "loss": -29.8671, + "step": 50 + }, + { + "epoch": 0.004533333333333334, + "grad_norm": 53.84416198730469, + "learning_rate": 3e-06, + "loss": -28.6148, + "step": 51 + }, + { + "epoch": 0.004622222222222222, + "grad_norm": 48.8647575378418, + "learning_rate": 3e-06, + "loss": -28.0853, + "step": 52 + }, + { + "epoch": 0.004711111111111111, + "grad_norm": 65.01343536376953, + "learning_rate": 3e-06, + "loss": -26.2356, + "step": 53 + }, + { + "epoch": 0.0048, + "grad_norm": 64.81402587890625, + "learning_rate": 3e-06, + "loss": -30.8205, + "step": 54 + }, + { + "epoch": 0.004888888888888889, + "grad_norm": 44.85778045654297, + "learning_rate": 3e-06, + "loss": -28.6691, + "step": 55 + }, + { + "epoch": 0.004977777777777778, + "grad_norm": 45.61606216430664, + "learning_rate": 3e-06, + "loss": -30.0595, + "step": 56 + }, + { + "epoch": 0.005066666666666666, + "grad_norm": 49.3116455078125, + "learning_rate": 3e-06, + "loss": -28.8315, + "step": 57 + }, + { + "epoch": 0.005155555555555556, + "grad_norm": 45.42935562133789, + "learning_rate": 3e-06, + "loss": -28.1493, + "step": 58 + }, + { + "epoch": 0.005244444444444445, + "grad_norm": 52.282257080078125, + "learning_rate": 3e-06, + "loss": -26.8023, + "step": 59 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 61.042945861816406, + "learning_rate": 3e-06, + "loss": -30.9091, + "step": 60 + }, + { + "completion_length": 250.8541717529297, + "epoch": 0.005422222222222222, + "grad_norm": 56.66669464111328, + "learning_rate": 3e-06, + "loss": -4.2644, + "reward": 1.0104166865348816, + "reward_std": 0.3859569579362869, + "rewards/boxed_and_answer_tags_format_reward": 0.5104166716337204, + "rewards/correctness_reward_func_math": 0.4999999850988388, + "step": 61, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.005511111111111111, + "grad_norm": 46.742279052734375, + "learning_rate": 3e-06, + "loss": 1.921, + "step": 62 + }, + { + "epoch": 0.0056, + "grad_norm": 64.74068450927734, + "learning_rate": 3e-06, + "loss": -1.0677, + "step": 63 + }, + { + "epoch": 0.005688888888888889, + "grad_norm": 53.72319412231445, + "learning_rate": 3e-06, + "loss": 0.7498, + "step": 64 + }, + { + "epoch": 0.0057777777777777775, + "grad_norm": 51.9224739074707, + "learning_rate": 3e-06, + "loss": -1.7073, + "step": 65 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 49.95579528808594, + "learning_rate": 3e-06, + "loss": -4.7011, + "step": 66 + }, + { + "epoch": 0.005955555555555556, + "grad_norm": 54.09262466430664, + "learning_rate": 3e-06, + "loss": -4.882, + "step": 67 + }, + { + "epoch": 0.006044444444444444, + "grad_norm": 51.433746337890625, + "learning_rate": 3e-06, + "loss": 1.6496, + "step": 68 + }, + { + "epoch": 0.0061333333333333335, + "grad_norm": 48.16537094116211, + "learning_rate": 3e-06, + "loss": -1.5035, + "step": 69 + }, + { + "epoch": 0.006222222222222222, + "grad_norm": 55.34268569946289, + "learning_rate": 3e-06, + "loss": 0.0384, + "step": 70 + }, + { + "epoch": 0.006311111111111111, + "grad_norm": 45.631813049316406, + "learning_rate": 3e-06, + "loss": -1.8713, + "step": 71 + }, + { + "epoch": 0.0064, + "grad_norm": 48.471473693847656, + "learning_rate": 3e-06, + "loss": -4.9618, + "step": 72 + }, + { + "completion_length": 231.14583587646484, + "epoch": 0.006488888888888889, + "grad_norm": 91.5987777709961, + "learning_rate": 3e-06, + "loss": 59.5993, + "reward": 1.1145833730697632, + "reward_std": 0.4806128740310669, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.5, + "step": 73, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.006577777777777778, + "grad_norm": 83.08058166503906, + "learning_rate": 3e-06, + "loss": 67.2965, + "step": 74 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 65.38250732421875, + "learning_rate": 3e-06, + "loss": 60.7344, + "step": 75 + }, + { + "epoch": 0.0067555555555555554, + "grad_norm": 66.78120422363281, + "learning_rate": 3e-06, + "loss": 63.4533, + "step": 76 + }, + { + "epoch": 0.006844444444444445, + "grad_norm": 62.675838470458984, + "learning_rate": 3e-06, + "loss": 54.1729, + "step": 77 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 63.28793716430664, + "learning_rate": 3e-06, + "loss": 61.2604, + "step": 78 + }, + { + "epoch": 0.007022222222222222, + "grad_norm": 75.33735656738281, + "learning_rate": 3e-06, + "loss": 59.0054, + "step": 79 + }, + { + "epoch": 0.0071111111111111115, + "grad_norm": 86.537109375, + "learning_rate": 3e-06, + "loss": 66.586, + "step": 80 + }, + { + "epoch": 0.0072, + "grad_norm": 66.0783462524414, + "learning_rate": 3e-06, + "loss": 59.9151, + "step": 81 + }, + { + "epoch": 0.007288888888888889, + "grad_norm": 66.10869598388672, + "learning_rate": 3e-06, + "loss": 62.0304, + "step": 82 + }, + { + "epoch": 0.007377777777777777, + "grad_norm": 58.026912689208984, + "learning_rate": 3e-06, + "loss": 53.3198, + "step": 83 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 59.65370559692383, + "learning_rate": 3e-06, + "loss": 60.237, + "step": 84 + }, + { + "completion_length": 246.70833587646484, + "epoch": 0.007555555555555556, + "grad_norm": 38.2843017578125, + "learning_rate": 3e-06, + "loss": -3.4492, + "reward": 0.9270833730697632, + "reward_std": 0.2587623968720436, + "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, + "rewards/correctness_reward_func_math": 0.2916666679084301, + "step": 85, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.007644444444444444, + "grad_norm": 42.626834869384766, + "learning_rate": 3e-06, + "loss": -3.0457, + "step": 86 + }, + { + "epoch": 0.007733333333333333, + "grad_norm": 31.817684173583984, + "learning_rate": 3e-06, + "loss": 0.2054, + "step": 87 + }, + { + "epoch": 0.007822222222222222, + "grad_norm": 41.712833404541016, + "learning_rate": 3e-06, + "loss": 0.6522, + "step": 88 + }, + { + "epoch": 0.007911111111111112, + "grad_norm": 33.385929107666016, + "learning_rate": 3e-06, + "loss": -2.3715, + "step": 89 + }, + { + "epoch": 0.008, + "grad_norm": 43.1032829284668, + "learning_rate": 3e-06, + "loss": 1.5502, + "step": 90 + }, + { + "epoch": 0.008088888888888889, + "grad_norm": 36.241458892822266, + "learning_rate": 3e-06, + "loss": -3.5684, + "step": 91 + }, + { + "epoch": 0.008177777777777779, + "grad_norm": 41.06986618041992, + "learning_rate": 3e-06, + "loss": -3.2263, + "step": 92 + }, + { + "epoch": 0.008266666666666667, + "grad_norm": 31.25284767150879, + "learning_rate": 3e-06, + "loss": -0.3487, + "step": 93 + }, + { + "epoch": 0.008355555555555555, + "grad_norm": 36.958518981933594, + "learning_rate": 3e-06, + "loss": 0.1995, + "step": 94 + }, + { + "epoch": 0.008444444444444444, + "grad_norm": 34.949676513671875, + "learning_rate": 3e-06, + "loss": -2.9378, + "step": 95 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 36.523372650146484, + "learning_rate": 3e-06, + "loss": 0.9469, + "step": 96 + }, + { + "completion_length": 246.08333587646484, + "epoch": 0.008622222222222222, + "grad_norm": 59.17626953125, + "learning_rate": 3e-06, + "loss": 2.6896, + "reward": 1.3333333730697632, + "reward_std": 0.5695068836212158, + "rewards/boxed_and_answer_tags_format_reward": 0.5833333432674408, + "rewards/correctness_reward_func_math": 0.7500000149011612, + "step": 97, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.00871111111111111, + "grad_norm": 71.58135223388672, + "learning_rate": 3e-06, + "loss": 1.0704, + "step": 98 + }, + { + "epoch": 0.0088, + "grad_norm": 65.36974334716797, + "learning_rate": 3e-06, + "loss": -2.7445, + "step": 99 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 60.50218200683594, + "learning_rate": 3e-06, + "loss": 3.606, + "step": 100 + }, + { + "epoch": 0.008977777777777777, + "grad_norm": 61.99585723876953, + "learning_rate": 3e-06, + "loss": -1.4435, + "step": 101 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 106.92288970947266, + "learning_rate": 3e-06, + "loss": -3.972, + "step": 102 + }, + { + "epoch": 0.009155555555555556, + "grad_norm": 58.85340118408203, + "learning_rate": 3e-06, + "loss": 1.5567, + "step": 103 + }, + { + "epoch": 0.009244444444444444, + "grad_norm": 70.97467041015625, + "learning_rate": 3e-06, + "loss": -0.3592, + "step": 104 + }, + { + "epoch": 0.009333333333333334, + "grad_norm": 62.310516357421875, + "learning_rate": 3e-06, + "loss": -4.0897, + "step": 105 + }, + { + "epoch": 0.009422222222222222, + "grad_norm": 60.98678207397461, + "learning_rate": 3e-06, + "loss": 2.9803, + "step": 106 + }, + { + "epoch": 0.00951111111111111, + "grad_norm": 60.23484420776367, + "learning_rate": 3e-06, + "loss": -2.2749, + "step": 107 + }, + { + "epoch": 0.0096, + "grad_norm": 58.0914192199707, + "learning_rate": 3e-06, + "loss": -4.9462, + "step": 108 + }, + { + "completion_length": 249.06250762939453, + "epoch": 0.00968888888888889, + "grad_norm": 65.09230041503906, + "learning_rate": 3e-06, + "loss": -11.6324, + "reward": 1.125, + "reward_std": 0.5275504291057587, + "rewards/boxed_and_answer_tags_format_reward": 0.5833333432674408, + "rewards/correctness_reward_func_math": 0.5416666716337204, + "step": 109, + "zero_std_ratio": 0.0 + }, + { + "epoch": 0.009777777777777778, + "grad_norm": 58.50445556640625, + "learning_rate": 3e-06, + "loss": -12.2664, + "step": 110 + }, + { + "epoch": 0.009866666666666666, + "grad_norm": 53.459251403808594, + "learning_rate": 3e-06, + "loss": -7.2192, + "step": 111 + }, + { + "epoch": 0.009955555555555556, + "grad_norm": 60.34041213989258, + "learning_rate": 3e-06, + "loss": -6.9971, + "step": 112 + }, + { + "epoch": 0.010044444444444444, + "grad_norm": 61.72711944580078, + "learning_rate": 3e-06, + "loss": -0.4686, + "step": 113 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 96.1756591796875, + "learning_rate": 3e-06, + "loss": -7.4161, + "step": 114 + }, + { + "epoch": 0.010222222222222223, + "grad_norm": 61.3508415222168, + "learning_rate": 3e-06, + "loss": -12.3763, + "step": 115 + }, + { + "epoch": 0.010311111111111111, + "grad_norm": 55.424896240234375, + "learning_rate": 3e-06, + "loss": -12.8949, + "step": 116 + }, + { + "epoch": 0.0104, + "grad_norm": 56.08291244506836, + "learning_rate": 3e-06, + "loss": -7.8472, + "step": 117 + }, + { + "epoch": 0.01048888888888889, + "grad_norm": 73.18891906738281, + "learning_rate": 3e-06, + "loss": -8.0281, + "step": 118 + }, + { + "epoch": 0.010577777777777778, + "grad_norm": 64.47604370117188, + "learning_rate": 3e-06, + "loss": -1.3444, + "step": 119 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 96.0721664428711, + "learning_rate": 3e-06, + "loss": -8.5737, + "step": 120 + }, + { + "completion_length": 253.0, + "epoch": 0.010755555555555556, + "grad_norm": 60.78779983520508, + "learning_rate": 3e-06, + "loss": -1.605, + "reward": 1.1145833730697632, + "reward_std": 0.4272044152021408, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.4999999850988388, + "step": 121, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.010844444444444445, + "grad_norm": 49.34260177612305, + "learning_rate": 3e-06, + "loss": -0.2147, + "step": 122 + }, + { + "epoch": 0.010933333333333333, + "grad_norm": 53.38318634033203, + "learning_rate": 3e-06, + "loss": -7.1697, + "step": 123 + }, + { + "epoch": 0.011022222222222221, + "grad_norm": 84.88465881347656, + "learning_rate": 3e-06, + "loss": 4.7029, + "step": 124 + }, + { + "epoch": 0.011111111111111112, + "grad_norm": 50.966583251953125, + "learning_rate": 3e-06, + "loss": -5.2481, + "step": 125 + }, + { + "epoch": 0.0112, + "grad_norm": 64.3619155883789, + "learning_rate": 3e-06, + "loss": -5.9545, + "step": 126 + }, + { + "epoch": 0.011288888888888888, + "grad_norm": 60.359500885009766, + "learning_rate": 3e-06, + "loss": -1.8244, + "step": 127 + }, + { + "epoch": 0.011377777777777778, + "grad_norm": 51.08177947998047, + "learning_rate": 3e-06, + "loss": -0.8292, + "step": 128 + }, + { + "epoch": 0.011466666666666667, + "grad_norm": 53.191165924072266, + "learning_rate": 3e-06, + "loss": -7.7867, + "step": 129 + }, + { + "epoch": 0.011555555555555555, + "grad_norm": 87.42491912841797, + "learning_rate": 3e-06, + "loss": 3.92, + "step": 130 + }, + { + "epoch": 0.011644444444444445, + "grad_norm": 49.99729537963867, + "learning_rate": 3e-06, + "loss": -5.9859, + "step": 131 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 45.14487075805664, + "learning_rate": 3e-06, + "loss": -6.6928, + "step": 132 + }, + { + "completion_length": 239.1666717529297, + "epoch": 0.011822222222222222, + "grad_norm": 68.43509674072266, + "learning_rate": 3e-06, + "loss": 45.5812, + "reward": 1.1666666865348816, + "reward_std": 0.47104020416736603, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.5416666679084301, + "step": 133, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.011911111111111112, + "grad_norm": 62.809059143066406, + "learning_rate": 3e-06, + "loss": 47.1017, + "step": 134 + }, + { + "epoch": 0.012, + "grad_norm": 61.8614387512207, + "learning_rate": 3e-06, + "loss": 41.2505, + "step": 135 + }, + { + "epoch": 0.012088888888888889, + "grad_norm": 65.46350860595703, + "learning_rate": 3e-06, + "loss": 44.191, + "step": 136 + }, + { + "epoch": 0.012177777777777777, + "grad_norm": 59.1669807434082, + "learning_rate": 3e-06, + "loss": 36.6508, + "step": 137 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 55.44610595703125, + "learning_rate": 3e-06, + "loss": 41.1041, + "step": 138 + }, + { + "epoch": 0.012355555555555555, + "grad_norm": 61.210411071777344, + "learning_rate": 3e-06, + "loss": 44.6212, + "step": 139 + }, + { + "epoch": 0.012444444444444444, + "grad_norm": 64.934326171875, + "learning_rate": 3e-06, + "loss": 46.5024, + "step": 140 + }, + { + "epoch": 0.012533333333333334, + "grad_norm": 67.7354507446289, + "learning_rate": 3e-06, + "loss": 40.309, + "step": 141 + }, + { + "epoch": 0.012622222222222222, + "grad_norm": 69.55413055419922, + "learning_rate": 3e-06, + "loss": 42.8161, + "step": 142 + }, + { + "epoch": 0.01271111111111111, + "grad_norm": 59.040592193603516, + "learning_rate": 3e-06, + "loss": 35.3869, + "step": 143 + }, + { + "epoch": 0.0128, + "grad_norm": 56.21048355102539, + "learning_rate": 3e-06, + "loss": 39.9197, + "step": 144 + }, + { + "completion_length": 237.37500762939453, + "epoch": 0.012888888888888889, + "grad_norm": 67.88895416259766, + "learning_rate": 3e-06, + "loss": -21.4223, + "reward": 0.8958333730697632, + "reward_std": 0.44294705986976624, + "rewards/boxed_and_answer_tags_format_reward": 0.5625000149011612, + "rewards/correctness_reward_func_math": 0.3333333358168602, + "step": 145, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.012977777777777777, + "grad_norm": 55.02178955078125, + "learning_rate": 3e-06, + "loss": -26.3881, + "step": 146 + }, + { + "epoch": 0.013066666666666667, + "grad_norm": 103.78085327148438, + "learning_rate": 3e-06, + "loss": -21.7028, + "step": 147 + }, + { + "epoch": 0.013155555555555556, + "grad_norm": 62.1268196105957, + "learning_rate": 3e-06, + "loss": -19.048, + "step": 148 + }, + { + "epoch": 0.013244444444444444, + "grad_norm": 57.99726486206055, + "learning_rate": 3e-06, + "loss": -19.346, + "step": 149 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 58.639549255371094, + "learning_rate": 3e-06, + "loss": -25.0216, + "step": 150 + }, + { + "epoch": 0.013422222222222223, + "grad_norm": 75.58393859863281, + "learning_rate": 3e-06, + "loss": -21.7941, + "step": 151 + }, + { + "epoch": 0.013511111111111111, + "grad_norm": 54.83882522583008, + "learning_rate": 3e-06, + "loss": -27.6056, + "step": 152 + }, + { + "epoch": 0.0136, + "grad_norm": 70.61170196533203, + "learning_rate": 3e-06, + "loss": -21.99, + "step": 153 + }, + { + "epoch": 0.01368888888888889, + "grad_norm": 68.1909408569336, + "learning_rate": 3e-06, + "loss": -20.2119, + "step": 154 + }, + { + "epoch": 0.013777777777777778, + "grad_norm": 68.70491027832031, + "learning_rate": 3e-06, + "loss": -20.2249, + "step": 155 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 55.29183578491211, + "learning_rate": 3e-06, + "loss": -25.9634, + "step": 156 + }, + { + "completion_length": 250.6666717529297, + "epoch": 0.013955555555555556, + "grad_norm": 70.28712463378906, + "learning_rate": 3e-06, + "loss": -13.869, + "reward": 1.4583333730697632, + "reward_std": 0.6823203265666962, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.8333333283662796, + "step": 157, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.014044444444444444, + "grad_norm": 60.210201263427734, + "learning_rate": 3e-06, + "loss": -24.2593, + "step": 158 + }, + { + "epoch": 0.014133333333333333, + "grad_norm": 77.62222290039062, + "learning_rate": 3e-06, + "loss": -11.4696, + "step": 159 + }, + { + "epoch": 0.014222222222222223, + "grad_norm": 70.80023193359375, + "learning_rate": 3e-06, + "loss": -25.8617, + "step": 160 + }, + { + "epoch": 0.014311111111111111, + "grad_norm": 64.7750244140625, + "learning_rate": 3e-06, + "loss": -14.8635, + "step": 161 + }, + { + "epoch": 0.0144, + "grad_norm": 77.83097076416016, + "learning_rate": 3e-06, + "loss": -7.727, + "step": 162 + }, + { + "epoch": 0.01448888888888889, + "grad_norm": 79.27497100830078, + "learning_rate": 3e-06, + "loss": -14.3689, + "step": 163 + }, + { + "epoch": 0.014577777777777778, + "grad_norm": 78.7293472290039, + "learning_rate": 3e-06, + "loss": -24.9423, + "step": 164 + }, + { + "epoch": 0.014666666666666666, + "grad_norm": 70.24745178222656, + "learning_rate": 3e-06, + "loss": -12.4543, + "step": 165 + }, + { + "epoch": 0.014755555555555555, + "grad_norm": 75.36212158203125, + "learning_rate": 3e-06, + "loss": -26.7206, + "step": 166 + }, + { + "epoch": 0.014844444444444445, + "grad_norm": 65.05477142333984, + "learning_rate": 3e-06, + "loss": -16.0018, + "step": 167 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 78.83174133300781, + "learning_rate": 3e-06, + "loss": -8.9618, + "step": 168 + }, + { + "completion_length": 236.2916717529297, + "epoch": 0.015022222222222222, + "grad_norm": 82.83967590332031, + "learning_rate": 3e-06, + "loss": -0.5024, + "reward": 1.1145833730697632, + "reward_std": 0.3740755543112755, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833134651184, + "rewards/correctness_reward_func_math": 0.5, + "step": 169, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.015111111111111112, + "grad_norm": 51.84051513671875, + "learning_rate": 3e-06, + "loss": 6.7446, + "step": 170 + }, + { + "epoch": 0.0152, + "grad_norm": 63.607723236083984, + "learning_rate": 3e-06, + "loss": 2.7771, + "step": 171 + }, + { + "epoch": 0.015288888888888888, + "grad_norm": 52.88029479980469, + "learning_rate": 3e-06, + "loss": 8.1945, + "step": 172 + }, + { + "epoch": 0.015377777777777778, + "grad_norm": 68.90487670898438, + "learning_rate": 3e-06, + "loss": 0.8609, + "step": 173 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 57.66716766357422, + "learning_rate": 3e-06, + "loss": -0.5103, + "step": 174 + }, + { + "epoch": 0.015555555555555555, + "grad_norm": 69.48858642578125, + "learning_rate": 3e-06, + "loss": -0.89, + "step": 175 + }, + { + "epoch": 0.015644444444444443, + "grad_norm": 51.13008117675781, + "learning_rate": 3e-06, + "loss": 5.8779, + "step": 176 + }, + { + "epoch": 0.015733333333333332, + "grad_norm": 61.48530578613281, + "learning_rate": 3e-06, + "loss": 2.0727, + "step": 177 + }, + { + "epoch": 0.015822222222222224, + "grad_norm": 55.415924072265625, + "learning_rate": 3e-06, + "loss": 7.6559, + "step": 178 + }, + { + "epoch": 0.015911111111111112, + "grad_norm": 65.15290069580078, + "learning_rate": 3e-06, + "loss": -0.6101, + "step": 179 + }, + { + "epoch": 0.016, + "grad_norm": 52.03913879394531, + "learning_rate": 3e-06, + "loss": -1.3899, + "step": 180 + }, + { + "completion_length": 250.14583587646484, + "epoch": 0.01608888888888889, + "grad_norm": 63.963829040527344, + "learning_rate": 3e-06, + "loss": -8.994, + "reward": 0.885416716337204, + "reward_std": 0.3302172925323248, + "rewards/boxed_and_answer_tags_format_reward": 0.59375, + "rewards/correctness_reward_func_math": 0.2916666567325592, + "step": 181, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.016177777777777777, + "grad_norm": 48.50006866455078, + "learning_rate": 3e-06, + "loss": -0.9428, + "step": 182 + }, + { + "epoch": 0.016266666666666665, + "grad_norm": 58.21607971191406, + "learning_rate": 3e-06, + "loss": -8.2051, + "step": 183 + }, + { + "epoch": 0.016355555555555557, + "grad_norm": 76.80998992919922, + "learning_rate": 3e-06, + "loss": 1.6849, + "step": 184 + }, + { + "epoch": 0.016444444444444446, + "grad_norm": 48.460941314697266, + "learning_rate": 3e-06, + "loss": -3.4021, + "step": 185 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 55.28091049194336, + "learning_rate": 3e-06, + "loss": 0.4156, + "step": 186 + }, + { + "epoch": 0.016622222222222222, + "grad_norm": 65.21077728271484, + "learning_rate": 3e-06, + "loss": -9.4278, + "step": 187 + }, + { + "epoch": 0.01671111111111111, + "grad_norm": 50.71424865722656, + "learning_rate": 3e-06, + "loss": -2.0682, + "step": 188 + }, + { + "epoch": 0.0168, + "grad_norm": 58.02372360229492, + "learning_rate": 3e-06, + "loss": -9.1068, + "step": 189 + }, + { + "epoch": 0.016888888888888887, + "grad_norm": 61.12031555175781, + "learning_rate": 3e-06, + "loss": 0.679, + "step": 190 + }, + { + "epoch": 0.01697777777777778, + "grad_norm": 51.7930908203125, + "learning_rate": 3e-06, + "loss": -4.3214, + "step": 191 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 48.15507507324219, + "learning_rate": 3e-06, + "loss": -0.8545, + "step": 192 + }, + { + "completion_length": 248.1666717529297, + "epoch": 0.017155555555555556, + "grad_norm": 62.317527770996094, + "learning_rate": 3e-06, + "loss": 13.8287, + "reward": 1.0729167461395264, + "reward_std": 0.348264142870903, + "rewards/boxed_and_answer_tags_format_reward": 0.5729166865348816, + "rewards/correctness_reward_func_math": 0.5, + "step": 193, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.017244444444444444, + "grad_norm": 43.339691162109375, + "learning_rate": 3e-06, + "loss": 17.523, + "step": 194 + }, + { + "epoch": 0.017333333333333333, + "grad_norm": 48.14270782470703, + "learning_rate": 3e-06, + "loss": 15.9481, + "step": 195 + }, + { + "epoch": 0.01742222222222222, + "grad_norm": 43.32905960083008, + "learning_rate": 3e-06, + "loss": 14.7259, + "step": 196 + }, + { + "epoch": 0.017511111111111113, + "grad_norm": 45.01740264892578, + "learning_rate": 3e-06, + "loss": 13.8658, + "step": 197 + }, + { + "epoch": 0.0176, + "grad_norm": 43.2428092956543, + "learning_rate": 3e-06, + "loss": 20.4664, + "step": 198 + }, + { + "epoch": 0.01768888888888889, + "grad_norm": 56.058616638183594, + "learning_rate": 3e-06, + "loss": 13.1828, + "step": 199 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 46.968666076660156, + "learning_rate": 3e-06, + "loss": 16.8591, + "step": 200 + }, + { + "epoch": 0.017866666666666666, + "grad_norm": 45.98298263549805, + "learning_rate": 3e-06, + "loss": 15.5154, + "step": 201 + }, + { + "epoch": 0.017955555555555554, + "grad_norm": 43.91643142700195, + "learning_rate": 3e-06, + "loss": 14.2328, + "step": 202 + }, + { + "epoch": 0.018044444444444443, + "grad_norm": 44.83538055419922, + "learning_rate": 3e-06, + "loss": 13.1929, + "step": 203 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 42.83240509033203, + "learning_rate": 3e-06, + "loss": 19.8936, + "step": 204 + }, + { + "completion_length": 249.02083587646484, + "epoch": 0.018222222222222223, + "grad_norm": 119.68338775634766, + "learning_rate": 3e-06, + "loss": -4.5616, + "reward": 0.8229166865348816, + "reward_std": 0.28067073225975037, + "rewards/boxed_and_answer_tags_format_reward": 0.65625, + "rewards/correctness_reward_func_math": 0.1666666716337204, + "step": 205, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.01831111111111111, + "grad_norm": 40.65678405761719, + "learning_rate": 3e-06, + "loss": 3.7784, + "step": 206 + }, + { + "epoch": 0.0184, + "grad_norm": 51.84949493408203, + "learning_rate": 3e-06, + "loss": 2.132, + "step": 207 + }, + { + "epoch": 0.018488888888888888, + "grad_norm": 40.80442428588867, + "learning_rate": 3e-06, + "loss": -1.7568, + "step": 208 + }, + { + "epoch": 0.018577777777777776, + "grad_norm": 51.88225555419922, + "learning_rate": 3e-06, + "loss": -2.808, + "step": 209 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 57.230106353759766, + "learning_rate": 3e-06, + "loss": -1.6958, + "step": 210 + }, + { + "epoch": 0.018755555555555557, + "grad_norm": 65.36343383789062, + "learning_rate": 3e-06, + "loss": -5.016, + "step": 211 + }, + { + "epoch": 0.018844444444444445, + "grad_norm": 42.36751937866211, + "learning_rate": 3e-06, + "loss": 3.2604, + "step": 212 + }, + { + "epoch": 0.018933333333333333, + "grad_norm": 54.347625732421875, + "learning_rate": 3e-06, + "loss": 1.5256, + "step": 213 + }, + { + "epoch": 0.01902222222222222, + "grad_norm": 40.971683502197266, + "learning_rate": 3e-06, + "loss": -2.3704, + "step": 214 + }, + { + "epoch": 0.01911111111111111, + "grad_norm": 51.366546630859375, + "learning_rate": 3e-06, + "loss": -3.7841, + "step": 215 + }, + { + "epoch": 0.0192, + "grad_norm": 64.25231170654297, + "learning_rate": 3e-06, + "loss": -2.6423, + "step": 216 + }, + { + "completion_length": 250.4791717529297, + "epoch": 0.01928888888888889, + "grad_norm": 60.69169235229492, + "learning_rate": 3e-06, + "loss": 3.7848, + "reward": 1.0833333730697632, + "reward_std": 0.5039487332105637, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.4583333283662796, + "step": 217, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.01937777777777778, + "grad_norm": 65.37804412841797, + "learning_rate": 3e-06, + "loss": -5.4996, + "step": 218 + }, + { + "epoch": 0.019466666666666667, + "grad_norm": 58.69138717651367, + "learning_rate": 3e-06, + "loss": 0.3025, + "step": 219 + }, + { + "epoch": 0.019555555555555555, + "grad_norm": 72.17839813232422, + "learning_rate": 3e-06, + "loss": -0.8041, + "step": 220 + }, + { + "epoch": 0.019644444444444444, + "grad_norm": 69.56704711914062, + "learning_rate": 3e-06, + "loss": 3.1412, + "step": 221 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 64.57500457763672, + "learning_rate": 3e-06, + "loss": 6.7244, + "step": 222 + }, + { + "epoch": 0.019822222222222224, + "grad_norm": 65.06715393066406, + "learning_rate": 3e-06, + "loss": 3.0694, + "step": 223 + }, + { + "epoch": 0.019911111111111112, + "grad_norm": 72.74304962158203, + "learning_rate": 3e-06, + "loss": -5.928, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 62.06201934814453, + "learning_rate": 3e-06, + "loss": -0.2234, + "step": 225 + }, + { + "epoch": 0.02008888888888889, + "grad_norm": 74.25010681152344, + "learning_rate": 3e-06, + "loss": -1.5055, + "step": 226 + }, + { + "epoch": 0.020177777777777777, + "grad_norm": 64.32748413085938, + "learning_rate": 3e-06, + "loss": 2.4819, + "step": 227 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 64.75834655761719, + "learning_rate": 3e-06, + "loss": 5.504, + "step": 228 + }, + { + "completion_length": 252.9375, + "epoch": 0.020355555555555557, + "grad_norm": 86.62522888183594, + "learning_rate": 3e-06, + "loss": 14.2874, + "reward": 1.1562500596046448, + "reward_std": 0.5227071046829224, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.4166666567325592, + "step": 229, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.020444444444444446, + "grad_norm": 61.35566329956055, + "learning_rate": 3e-06, + "loss": 17.7162, + "step": 230 + }, + { + "epoch": 0.020533333333333334, + "grad_norm": 61.87510681152344, + "learning_rate": 3e-06, + "loss": 8.9931, + "step": 231 + }, + { + "epoch": 0.020622222222222222, + "grad_norm": 57.673770904541016, + "learning_rate": 3e-06, + "loss": 14.8206, + "step": 232 + }, + { + "epoch": 0.02071111111111111, + "grad_norm": 64.32942199707031, + "learning_rate": 3e-06, + "loss": 10.9396, + "step": 233 + }, + { + "epoch": 0.0208, + "grad_norm": 66.23136138916016, + "learning_rate": 3e-06, + "loss": 15.6923, + "step": 234 + }, + { + "epoch": 0.020888888888888887, + "grad_norm": 74.55809783935547, + "learning_rate": 3e-06, + "loss": 13.7889, + "step": 235 + }, + { + "epoch": 0.02097777777777778, + "grad_norm": 60.680240631103516, + "learning_rate": 3e-06, + "loss": 17.2098, + "step": 236 + }, + { + "epoch": 0.021066666666666668, + "grad_norm": 63.526371002197266, + "learning_rate": 3e-06, + "loss": 8.3714, + "step": 237 + }, + { + "epoch": 0.021155555555555556, + "grad_norm": 60.387813568115234, + "learning_rate": 3e-06, + "loss": 13.9642, + "step": 238 + }, + { + "epoch": 0.021244444444444444, + "grad_norm": 65.6108169555664, + "learning_rate": 3e-06, + "loss": 10.0042, + "step": 239 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 65.62525177001953, + "learning_rate": 3e-06, + "loss": 14.6078, + "step": 240 + }, + { + "completion_length": 245.89584350585938, + "epoch": 0.02142222222222222, + "grad_norm": 99.76964569091797, + "learning_rate": 3e-06, + "loss": -5.9045, + "reward": 1.2916666865348816, + "reward_std": 0.23116151243448257, + "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, + "rewards/correctness_reward_func_math": 0.5833333358168602, + "step": 241, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.021511111111111113, + "grad_norm": 34.929359436035156, + "learning_rate": 3e-06, + "loss": -5.1064, + "step": 242 + }, + { + "epoch": 0.0216, + "grad_norm": 41.406982421875, + "learning_rate": 3e-06, + "loss": -7.9375, + "step": 243 + }, + { + "epoch": 0.02168888888888889, + "grad_norm": 40.73991775512695, + "learning_rate": 3e-06, + "loss": -4.6122, + "step": 244 + }, + { + "epoch": 0.021777777777777778, + "grad_norm": 32.28548812866211, + "learning_rate": 3e-06, + "loss": -3.2865, + "step": 245 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 37.392860412597656, + "learning_rate": 3e-06, + "loss": -0.2002, + "step": 246 + }, + { + "epoch": 0.021955555555555555, + "grad_norm": 105.95482635498047, + "learning_rate": 3e-06, + "loss": -5.7761, + "step": 247 + }, + { + "epoch": 0.022044444444444443, + "grad_norm": 35.37491226196289, + "learning_rate": 3e-06, + "loss": -5.3203, + "step": 248 + }, + { + "epoch": 0.022133333333333335, + "grad_norm": 37.672000885009766, + "learning_rate": 3e-06, + "loss": -8.3175, + "step": 249 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 34.36002731323242, + "learning_rate": 3e-06, + "loss": -4.8629, + "step": 250 + }, + { + "epoch": 0.02231111111111111, + "grad_norm": 35.60414123535156, + "learning_rate": 3e-06, + "loss": -3.8192, + "step": 251 + }, + { + "epoch": 0.0224, + "grad_norm": 38.58955764770508, + "learning_rate": 3e-06, + "loss": -0.897, + "step": 252 + }, + { + "completion_length": 250.1875, + "epoch": 0.022488888888888888, + "grad_norm": 71.29794311523438, + "learning_rate": 3e-06, + "loss": 2.5796, + "reward": 0.8645833432674408, + "reward_std": 0.3201860636472702, + "rewards/boxed_and_answer_tags_format_reward": 0.5729166716337204, + "rewards/correctness_reward_func_math": 0.2916666641831398, + "step": 253, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.022577777777777776, + "grad_norm": 62.11003112792969, + "learning_rate": 3e-06, + "loss": 2.4232, + "step": 254 + }, + { + "epoch": 0.02266666666666667, + "grad_norm": 57.18949508666992, + "learning_rate": 3e-06, + "loss": 5.9388, + "step": 255 + }, + { + "epoch": 0.022755555555555557, + "grad_norm": 60.49555206298828, + "learning_rate": 3e-06, + "loss": 5.5698, + "step": 256 + }, + { + "epoch": 0.022844444444444445, + "grad_norm": 134.1082305908203, + "learning_rate": 3e-06, + "loss": -5.3771, + "step": 257 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 170.15768432617188, + "learning_rate": 3e-06, + "loss": 6.3811, + "step": 258 + }, + { + "epoch": 0.02302222222222222, + "grad_norm": 70.64490509033203, + "learning_rate": 3e-06, + "loss": 1.7661, + "step": 259 + }, + { + "epoch": 0.02311111111111111, + "grad_norm": 71.96417999267578, + "learning_rate": 3e-06, + "loss": 0.8909, + "step": 260 + }, + { + "epoch": 0.0232, + "grad_norm": 58.19865417480469, + "learning_rate": 3e-06, + "loss": 5.1442, + "step": 261 + }, + { + "epoch": 0.02328888888888889, + "grad_norm": 61.813690185546875, + "learning_rate": 3e-06, + "loss": 4.1458, + "step": 262 + }, + { + "epoch": 0.02337777777777778, + "grad_norm": 63.21968460083008, + "learning_rate": 3e-06, + "loss": -6.5992, + "step": 263 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 130.61351013183594, + "learning_rate": 3e-06, + "loss": 4.4745, + "step": 264 + }, + { + "completion_length": 254.58333587646484, + "epoch": 0.023555555555555555, + "grad_norm": 37.4125862121582, + "learning_rate": 3e-06, + "loss": 6.5013, + "reward": 1.0000000298023224, + "reward_std": 0.32049281150102615, + "rewards/boxed_and_answer_tags_format_reward": 0.7083333134651184, + "rewards/correctness_reward_func_math": 0.2916666679084301, + "step": 265, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.023644444444444444, + "grad_norm": 47.14967346191406, + "learning_rate": 3e-06, + "loss": 0.9939, + "step": 266 + }, + { + "epoch": 0.023733333333333332, + "grad_norm": 52.939048767089844, + "learning_rate": 3e-06, + "loss": -0.9139, + "step": 267 + }, + { + "epoch": 0.023822222222222224, + "grad_norm": 86.99070739746094, + "learning_rate": 3e-06, + "loss": 8.1755, + "step": 268 + }, + { + "epoch": 0.023911111111111112, + "grad_norm": 39.69975280761719, + "learning_rate": 3e-06, + "loss": 4.7483, + "step": 269 + }, + { + "epoch": 0.024, + "grad_norm": 59.52255630493164, + "learning_rate": 3e-06, + "loss": 1.1483, + "step": 270 + }, + { + "epoch": 0.02408888888888889, + "grad_norm": 38.91862106323242, + "learning_rate": 3e-06, + "loss": 6.09, + "step": 271 + }, + { + "epoch": 0.024177777777777777, + "grad_norm": 43.66323471069336, + "learning_rate": 3e-06, + "loss": 0.4765, + "step": 272 + }, + { + "epoch": 0.024266666666666666, + "grad_norm": 44.54389572143555, + "learning_rate": 3e-06, + "loss": -1.5437, + "step": 273 + }, + { + "epoch": 0.024355555555555554, + "grad_norm": 84.41556549072266, + "learning_rate": 3e-06, + "loss": 7.1028, + "step": 274 + }, + { + "epoch": 0.024444444444444446, + "grad_norm": 38.220367431640625, + "learning_rate": 3e-06, + "loss": 4.0407, + "step": 275 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 45.620452880859375, + "learning_rate": 3e-06, + "loss": 0.5047, + "step": 276 + }, + { + "completion_length": 243.18750762939453, + "epoch": 0.024622222222222222, + "grad_norm": 84.81961059570312, + "learning_rate": 3e-06, + "loss": 5.442, + "reward": 1.3333333730697632, + "reward_std": 0.4887756109237671, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.7083333432674408, + "step": 277, + "zero_std_ratio": 0.0 + }, + { + "epoch": 0.02471111111111111, + "grad_norm": 112.86151885986328, + "learning_rate": 3e-06, + "loss": -1.2528, + "step": 278 + }, + { + "epoch": 0.0248, + "grad_norm": 76.52424621582031, + "learning_rate": 3e-06, + "loss": -4.5325, + "step": 279 + }, + { + "epoch": 0.024888888888888887, + "grad_norm": 94.0294189453125, + "learning_rate": 3e-06, + "loss": -6.6167, + "step": 280 + }, + { + "epoch": 0.02497777777777778, + "grad_norm": 78.60155487060547, + "learning_rate": 3e-06, + "loss": 0.2653, + "step": 281 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 58.42827224731445, + "learning_rate": 3e-06, + "loss": -5.6591, + "step": 282 + }, + { + "epoch": 0.025155555555555556, + "grad_norm": 81.04212188720703, + "learning_rate": 3e-06, + "loss": 4.2739, + "step": 283 + }, + { + "epoch": 0.025244444444444444, + "grad_norm": 67.27478790283203, + "learning_rate": 3e-06, + "loss": -1.5776, + "step": 284 + }, + { + "epoch": 0.025333333333333333, + "grad_norm": 114.41588592529297, + "learning_rate": 3e-06, + "loss": -5.4532, + "step": 285 + }, + { + "epoch": 0.02542222222222222, + "grad_norm": 75.61115264892578, + "learning_rate": 3e-06, + "loss": -7.4401, + "step": 286 + }, + { + "epoch": 0.02551111111111111, + "grad_norm": 236.67214965820312, + "learning_rate": 3e-06, + "loss": -0.6833, + "step": 287 + }, + { + "epoch": 0.0256, + "grad_norm": 59.2407341003418, + "learning_rate": 3e-06, + "loss": -6.8172, + "step": 288 + }, + { + "completion_length": 250.20834350585938, + "epoch": 0.02568888888888889, + "grad_norm": 93.99871063232422, + "learning_rate": 3e-06, + "loss": -12.9867, + "reward": 1.4687500596046448, + "reward_std": 0.6822589337825775, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.7916666865348816, + "step": 289, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.025777777777777778, + "grad_norm": 79.78562927246094, + "learning_rate": 3e-06, + "loss": -10.387, + "step": 290 + }, + { + "epoch": 0.025866666666666666, + "grad_norm": 150.55654907226562, + "learning_rate": 3e-06, + "loss": -11.9684, + "step": 291 + }, + { + "epoch": 0.025955555555555555, + "grad_norm": 86.15855407714844, + "learning_rate": 3e-06, + "loss": -13.3488, + "step": 292 + }, + { + "epoch": 0.026044444444444443, + "grad_norm": 82.68080139160156, + "learning_rate": 3e-06, + "loss": -9.7978, + "step": 293 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 73.47705841064453, + "learning_rate": 3e-06, + "loss": -15.2661, + "step": 294 + }, + { + "epoch": 0.026222222222222223, + "grad_norm": 88.39766693115234, + "learning_rate": 3e-06, + "loss": -14.0408, + "step": 295 + }, + { + "epoch": 0.02631111111111111, + "grad_norm": 81.03710174560547, + "learning_rate": 3e-06, + "loss": -11.6435, + "step": 296 + }, + { + "epoch": 0.0264, + "grad_norm": 97.82394409179688, + "learning_rate": 3e-06, + "loss": -12.9819, + "step": 297 + }, + { + "epoch": 0.026488888888888888, + "grad_norm": 91.29530334472656, + "learning_rate": 3e-06, + "loss": -14.6218, + "step": 298 + }, + { + "epoch": 0.026577777777777777, + "grad_norm": 76.14654541015625, + "learning_rate": 3e-06, + "loss": -10.3319, + "step": 299 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 76.4620590209961, + "learning_rate": 3e-06, + "loss": -16.649, + "step": 300 + }, + { + "completion_length": 245.87500762939453, + "epoch": 0.026755555555555557, + "grad_norm": 66.47940063476562, + "learning_rate": 3e-06, + "loss": -2.4273, + "reward": 1.2916666865348816, + "reward_std": 0.4701542556285858, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5416666716337204, + "step": 301, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.026844444444444445, + "grad_norm": 57.04201126098633, + "learning_rate": 3e-06, + "loss": -2.7348, + "step": 302 + }, + { + "epoch": 0.026933333333333333, + "grad_norm": 67.42317962646484, + "learning_rate": 3e-06, + "loss": -0.5419, + "step": 303 + }, + { + "epoch": 0.027022222222222222, + "grad_norm": 68.1643295288086, + "learning_rate": 3e-06, + "loss": -4.7537, + "step": 304 + }, + { + "epoch": 0.02711111111111111, + "grad_norm": 66.15480041503906, + "learning_rate": 3e-06, + "loss": -3.0219, + "step": 305 + }, + { + "epoch": 0.0272, + "grad_norm": 72.49027252197266, + "learning_rate": 3e-06, + "loss": -0.992, + "step": 306 + }, + { + "epoch": 0.02728888888888889, + "grad_norm": 63.84511947631836, + "learning_rate": 3e-06, + "loss": -2.9532, + "step": 307 + }, + { + "epoch": 0.02737777777777778, + "grad_norm": 60.41191864013672, + "learning_rate": 3e-06, + "loss": -3.5425, + "step": 308 + }, + { + "epoch": 0.027466666666666667, + "grad_norm": 75.88224029541016, + "learning_rate": 3e-06, + "loss": -0.626, + "step": 309 + }, + { + "epoch": 0.027555555555555555, + "grad_norm": 60.12965774536133, + "learning_rate": 3e-06, + "loss": -4.971, + "step": 310 + }, + { + "epoch": 0.027644444444444444, + "grad_norm": 67.24330139160156, + "learning_rate": 3e-06, + "loss": -3.5011, + "step": 311 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 66.98039245605469, + "learning_rate": 3e-06, + "loss": -1.487, + "step": 312 + }, + { + "completion_length": 255.27083587646484, + "epoch": 0.027822222222222224, + "grad_norm": 36.04975891113281, + "learning_rate": 3e-06, + "loss": -0.5188, + "reward": 1.5312500596046448, + "reward_std": 0.1546149756759405, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.9166666567325592, + "step": 313, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.027911111111111112, + "grad_norm": 35.71009063720703, + "learning_rate": 3e-06, + "loss": 2.2571, + "step": 314 + }, + { + "epoch": 0.028, + "grad_norm": 43.758975982666016, + "learning_rate": 3e-06, + "loss": 1.5113, + "step": 315 + }, + { + "epoch": 0.02808888888888889, + "grad_norm": 42.099124908447266, + "learning_rate": 3e-06, + "loss": -1.203, + "step": 316 + }, + { + "epoch": 0.028177777777777777, + "grad_norm": 46.69057846069336, + "learning_rate": 3e-06, + "loss": 2.1907, + "step": 317 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 43.00071334838867, + "learning_rate": 3e-06, + "loss": -0.0413, + "step": 318 + }, + { + "epoch": 0.028355555555555554, + "grad_norm": 36.017799377441406, + "learning_rate": 3e-06, + "loss": -0.9995, + "step": 319 + }, + { + "epoch": 0.028444444444444446, + "grad_norm": 35.86075973510742, + "learning_rate": 3e-06, + "loss": 1.8159, + "step": 320 + }, + { + "epoch": 0.028533333333333334, + "grad_norm": 46.41409683227539, + "learning_rate": 3e-06, + "loss": 0.8693, + "step": 321 + }, + { + "epoch": 0.028622222222222223, + "grad_norm": 42.182472229003906, + "learning_rate": 3e-06, + "loss": -1.9042, + "step": 322 + }, + { + "epoch": 0.02871111111111111, + "grad_norm": 47.805999755859375, + "learning_rate": 3e-06, + "loss": 1.6417, + "step": 323 + }, + { + "epoch": 0.0288, + "grad_norm": 45.03670883178711, + "learning_rate": 3e-06, + "loss": -1.09, + "step": 324 + }, + { + "completion_length": 252.64583587646484, + "epoch": 0.028888888888888888, + "grad_norm": 58.358917236328125, + "learning_rate": 3e-06, + "loss": -12.4368, + "reward": 1.1666666865348816, + "reward_std": 0.37967559695243835, + "rewards/boxed_and_answer_tags_format_reward": 0.6666666567325592, + "rewards/correctness_reward_func_math": 0.5, + "step": 325, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.02897777777777778, + "grad_norm": 70.42740631103516, + "learning_rate": 3e-06, + "loss": -6.5213, + "step": 326 + }, + { + "epoch": 0.029066666666666668, + "grad_norm": 71.7884750366211, + "learning_rate": 3e-06, + "loss": -14.7372, + "step": 327 + }, + { + "epoch": 0.029155555555555556, + "grad_norm": 64.89356231689453, + "learning_rate": 3e-06, + "loss": -2.411, + "step": 328 + }, + { + "epoch": 0.029244444444444444, + "grad_norm": 63.557125091552734, + "learning_rate": 3e-06, + "loss": -6.2777, + "step": 329 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 55.46377182006836, + "learning_rate": 3e-06, + "loss": -6.9502, + "step": 330 + }, + { + "epoch": 0.02942222222222222, + "grad_norm": 67.63842010498047, + "learning_rate": 3e-06, + "loss": -13.2148, + "step": 331 + }, + { + "epoch": 0.02951111111111111, + "grad_norm": 69.31304931640625, + "learning_rate": 3e-06, + "loss": -7.451, + "step": 332 + }, + { + "epoch": 0.0296, + "grad_norm": 72.68626403808594, + "learning_rate": 3e-06, + "loss": -15.5911, + "step": 333 + }, + { + "epoch": 0.02968888888888889, + "grad_norm": 67.20828247070312, + "learning_rate": 3e-06, + "loss": -3.4952, + "step": 334 + }, + { + "epoch": 0.029777777777777778, + "grad_norm": 71.56851959228516, + "learning_rate": 3e-06, + "loss": -6.8577, + "step": 335 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 55.80412292480469, + "learning_rate": 3e-06, + "loss": -7.9959, + "step": 336 + }, + { + "completion_length": 254.33333587646484, + "epoch": 0.029955555555555555, + "grad_norm": 83.18997955322266, + "learning_rate": 3e-06, + "loss": -4.4206, + "reward": 1.3854166865348816, + "reward_std": 0.6009446382522583, + "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, + "rewards/correctness_reward_func_math": 0.75, + "step": 337, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.030044444444444443, + "grad_norm": 76.95658111572266, + "learning_rate": 3e-06, + "loss": 0.0507, + "step": 338 + }, + { + "epoch": 0.030133333333333335, + "grad_norm": 99.8234634399414, + "learning_rate": 3e-06, + "loss": -4.7763, + "step": 339 + }, + { + "epoch": 0.030222222222222223, + "grad_norm": 89.73624420166016, + "learning_rate": 3e-06, + "loss": -12.8145, + "step": 340 + }, + { + "epoch": 0.03031111111111111, + "grad_norm": 105.25814819335938, + "learning_rate": 3e-06, + "loss": 1.3688, + "step": 341 + }, + { + "epoch": 0.0304, + "grad_norm": 91.62116241455078, + "learning_rate": 3e-06, + "loss": -7.2119, + "step": 342 + }, + { + "epoch": 0.03048888888888889, + "grad_norm": 89.00618743896484, + "learning_rate": 3e-06, + "loss": -5.8364, + "step": 343 + }, + { + "epoch": 0.030577777777777777, + "grad_norm": 75.79231262207031, + "learning_rate": 3e-06, + "loss": -1.9053, + "step": 344 + }, + { + "epoch": 0.030666666666666665, + "grad_norm": 109.15798950195312, + "learning_rate": 3e-06, + "loss": -7.4347, + "step": 345 + }, + { + "epoch": 0.030755555555555557, + "grad_norm": 91.9997787475586, + "learning_rate": 3e-06, + "loss": -15.0307, + "step": 346 + }, + { + "epoch": 0.030844444444444445, + "grad_norm": 113.604248046875, + "learning_rate": 3e-06, + "loss": -0.8063, + "step": 347 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 90.35537719726562, + "learning_rate": 3e-06, + "loss": -9.993, + "step": 348 + }, + { + "completion_length": 241.06250762939453, + "epoch": 0.031022222222222222, + "grad_norm": 79.29890441894531, + "learning_rate": 3e-06, + "loss": -11.3458, + "reward": 1.6979167461395264, + "reward_std": 0.5608386099338531, + "rewards/boxed_and_answer_tags_format_reward": 0.65625, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 349, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 85.71048736572266, + "learning_rate": 3e-06, + "loss": -0.4666, + "step": 350 + }, + { + "epoch": 0.0312, + "grad_norm": 74.05301666259766, + "learning_rate": 3e-06, + "loss": -17.7307, + "step": 351 + }, + { + "epoch": 0.03128888888888889, + "grad_norm": 77.7562026977539, + "learning_rate": 3e-06, + "loss": -10.9744, + "step": 352 + }, + { + "epoch": 0.031377777777777775, + "grad_norm": 81.5293197631836, + "learning_rate": 3e-06, + "loss": -7.753, + "step": 353 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 74.96295166015625, + "learning_rate": 3e-06, + "loss": -4.9168, + "step": 354 + }, + { + "epoch": 0.03155555555555556, + "grad_norm": 77.19646453857422, + "learning_rate": 3e-06, + "loss": -12.4852, + "step": 355 + }, + { + "epoch": 0.03164444444444445, + "grad_norm": 88.24592590332031, + "learning_rate": 3e-06, + "loss": -2.2546, + "step": 356 + }, + { + "epoch": 0.031733333333333336, + "grad_norm": 69.32099151611328, + "learning_rate": 3e-06, + "loss": -18.7936, + "step": 357 + }, + { + "epoch": 0.031822222222222224, + "grad_norm": 74.59849548339844, + "learning_rate": 3e-06, + "loss": -12.5008, + "step": 358 + }, + { + "epoch": 0.03191111111111111, + "grad_norm": 89.21590423583984, + "learning_rate": 3e-06, + "loss": -9.1562, + "step": 359 + }, + { + "epoch": 0.032, + "grad_norm": 70.9638671875, + "learning_rate": 3e-06, + "loss": -7.0573, + "step": 360 + }, + { + "completion_length": 250.83334350585938, + "epoch": 0.03208888888888889, + "grad_norm": 72.94464111328125, + "learning_rate": 3e-06, + "loss": -14.781, + "reward": 0.947916716337204, + "reward_std": 0.45044803619384766, + "rewards/boxed_and_answer_tags_format_reward": 0.6979166567325592, + "rewards/correctness_reward_func_math": 0.2499999962747097, + "step": 361, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.03217777777777778, + "grad_norm": 91.14832305908203, + "learning_rate": 3e-06, + "loss": -15.6654, + "step": 362 + }, + { + "epoch": 0.032266666666666666, + "grad_norm": 74.7421875, + "learning_rate": 3e-06, + "loss": -18.7276, + "step": 363 + }, + { + "epoch": 0.032355555555555554, + "grad_norm": 76.62783813476562, + "learning_rate": 3e-06, + "loss": -20.5042, + "step": 364 + }, + { + "epoch": 0.03244444444444444, + "grad_norm": 77.55496978759766, + "learning_rate": 3e-06, + "loss": -23.5561, + "step": 365 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 87.73894500732422, + "learning_rate": 3e-06, + "loss": -13.7666, + "step": 366 + }, + { + "epoch": 0.03262222222222222, + "grad_norm": 85.25169372558594, + "learning_rate": 3e-06, + "loss": -15.9124, + "step": 367 + }, + { + "epoch": 0.032711111111111114, + "grad_norm": 82.08868408203125, + "learning_rate": 3e-06, + "loss": -17.5144, + "step": 368 + }, + { + "epoch": 0.0328, + "grad_norm": 88.5888900756836, + "learning_rate": 3e-06, + "loss": -20.7047, + "step": 369 + }, + { + "epoch": 0.03288888888888889, + "grad_norm": 87.28410339355469, + "learning_rate": 3e-06, + "loss": -22.9003, + "step": 370 + }, + { + "epoch": 0.03297777777777778, + "grad_norm": 75.89826965332031, + "learning_rate": 3e-06, + "loss": -25.7767, + "step": 371 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 79.7787094116211, + "learning_rate": 3e-06, + "loss": -16.1167, + "step": 372 + }, + { + "completion_length": 251.9791717529297, + "epoch": 0.033155555555555556, + "grad_norm": 122.57412719726562, + "learning_rate": 3e-06, + "loss": -7.3842, + "reward": 1.1666666865348816, + "reward_std": 0.7582502365112305, + "rewards/boxed_and_answer_tags_format_reward": 0.5833333134651184, + "rewards/correctness_reward_func_math": 0.5833333283662796, + "step": 373, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.033244444444444445, + "grad_norm": 103.27584838867188, + "learning_rate": 3e-06, + "loss": -2.3769, + "step": 374 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 111.76622772216797, + "learning_rate": 3e-06, + "loss": 5.4646, + "step": 375 + }, + { + "epoch": 0.03342222222222222, + "grad_norm": 110.36207580566406, + "learning_rate": 3e-06, + "loss": -9.271, + "step": 376 + }, + { + "epoch": 0.03351111111111111, + "grad_norm": 102.65152740478516, + "learning_rate": 3e-06, + "loss": -0.8725, + "step": 377 + }, + { + "epoch": 0.0336, + "grad_norm": 107.27348327636719, + "learning_rate": 3e-06, + "loss": -4.3279, + "step": 378 + }, + { + "epoch": 0.033688888888888886, + "grad_norm": 118.1567153930664, + "learning_rate": 3e-06, + "loss": -8.1813, + "step": 379 + }, + { + "epoch": 0.033777777777777775, + "grad_norm": 98.9560317993164, + "learning_rate": 3e-06, + "loss": -3.3073, + "step": 380 + }, + { + "epoch": 0.03386666666666667, + "grad_norm": 119.64665222167969, + "learning_rate": 3e-06, + "loss": 4.1117, + "step": 381 + }, + { + "epoch": 0.03395555555555556, + "grad_norm": 118.64970397949219, + "learning_rate": 3e-06, + "loss": -10.3727, + "step": 382 + }, + { + "epoch": 0.03404444444444445, + "grad_norm": 117.53937530517578, + "learning_rate": 3e-06, + "loss": -3.6092, + "step": 383 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 108.86544799804688, + "learning_rate": 3e-06, + "loss": -6.4896, + "step": 384 + }, + { + "completion_length": 242.27084350585938, + "epoch": 0.03422222222222222, + "grad_norm": 37.28844451904297, + "learning_rate": 3e-06, + "loss": 35.3752, + "reward": 1.0208333432674408, + "reward_std": 0.12909945845603943, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.3333333358168602, + "step": 385, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.03431111111111111, + "grad_norm": 34.82659149169922, + "learning_rate": 3e-06, + "loss": 32.1003, + "step": 386 + }, + { + "epoch": 0.0344, + "grad_norm": 34.34743881225586, + "learning_rate": 3e-06, + "loss": 31.1165, + "step": 387 + }, + { + "epoch": 0.03448888888888889, + "grad_norm": 44.72328186035156, + "learning_rate": 3e-06, + "loss": 32.7756, + "step": 388 + }, + { + "epoch": 0.03457777777777778, + "grad_norm": 42.72700119018555, + "learning_rate": 3e-06, + "loss": 32.2398, + "step": 389 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 47.69383239746094, + "learning_rate": 3e-06, + "loss": 33.0683, + "step": 390 + }, + { + "epoch": 0.03475555555555555, + "grad_norm": 39.66519546508789, + "learning_rate": 3e-06, + "loss": 34.5053, + "step": 391 + }, + { + "epoch": 0.03484444444444444, + "grad_norm": 39.71942138671875, + "learning_rate": 3e-06, + "loss": 31.0092, + "step": 392 + }, + { + "epoch": 0.03493333333333333, + "grad_norm": 36.60993576049805, + "learning_rate": 3e-06, + "loss": 30.3034, + "step": 393 + }, + { + "epoch": 0.035022222222222225, + "grad_norm": 47.912837982177734, + "learning_rate": 3e-06, + "loss": 31.4023, + "step": 394 + }, + { + "epoch": 0.035111111111111114, + "grad_norm": 42.3475341796875, + "learning_rate": 3e-06, + "loss": 31.2326, + "step": 395 + }, + { + "epoch": 0.0352, + "grad_norm": 40.417381286621094, + "learning_rate": 3e-06, + "loss": 31.5571, + "step": 396 + }, + { + "completion_length": 251.56250762939453, + "epoch": 0.03528888888888889, + "grad_norm": 41.37530517578125, + "learning_rate": 3e-06, + "loss": 9.2621, + "reward": 1.6458333730697632, + "reward_std": 0.1489431317895651, + "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, + "rewards/correctness_reward_func_math": 0.9166666865348816, + "step": 397, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.03537777777777778, + "grad_norm": 42.853084564208984, + "learning_rate": 3e-06, + "loss": 8.8572, + "step": 398 + }, + { + "epoch": 0.03546666666666667, + "grad_norm": 41.449344635009766, + "learning_rate": 3e-06, + "loss": 5.6158, + "step": 399 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 58.31279754638672, + "learning_rate": 3e-06, + "loss": 7.968, + "step": 400 + }, + { + "epoch": 0.035644444444444444, + "grad_norm": 48.664459228515625, + "learning_rate": 3e-06, + "loss": 11.1793, + "step": 401 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 45.02378845214844, + "learning_rate": 3e-06, + "loss": 7.6242, + "step": 402 + }, + { + "epoch": 0.03582222222222222, + "grad_norm": 43.53935241699219, + "learning_rate": 3e-06, + "loss": 8.0172, + "step": 403 + }, + { + "epoch": 0.03591111111111111, + "grad_norm": 42.496604919433594, + "learning_rate": 3e-06, + "loss": 7.5088, + "step": 404 + }, + { + "epoch": 0.036, + "grad_norm": 44.294986724853516, + "learning_rate": 3e-06, + "loss": 3.9932, + "step": 405 + }, + { + "epoch": 0.036088888888888886, + "grad_norm": 73.07268524169922, + "learning_rate": 3e-06, + "loss": 6.5222, + "step": 406 + }, + { + "epoch": 0.03617777777777778, + "grad_norm": 44.31553649902344, + "learning_rate": 3e-06, + "loss": 9.2936, + "step": 407 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 48.42079162597656, + "learning_rate": 3e-06, + "loss": 5.8115, + "step": 408 + }, + { + "completion_length": 253.7291717529297, + "epoch": 0.03635555555555556, + "grad_norm": 55.15653991699219, + "learning_rate": 3e-06, + "loss": -4.6673, + "reward": 0.90625, + "reward_std": 0.15461495518684387, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.1666666716337204, + "step": 409, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.036444444444444446, + "grad_norm": 54.93301773071289, + "learning_rate": 3e-06, + "loss": 1.7365, + "step": 410 + }, + { + "epoch": 0.036533333333333334, + "grad_norm": 50.56829071044922, + "learning_rate": 3e-06, + "loss": 2.3972, + "step": 411 + }, + { + "epoch": 0.03662222222222222, + "grad_norm": 50.894187927246094, + "learning_rate": 3e-06, + "loss": 0.7298, + "step": 412 + }, + { + "epoch": 0.03671111111111111, + "grad_norm": 64.99378204345703, + "learning_rate": 3e-06, + "loss": 1.6822, + "step": 413 + }, + { + "epoch": 0.0368, + "grad_norm": 53.45103454589844, + "learning_rate": 3e-06, + "loss": 2.1252, + "step": 414 + }, + { + "epoch": 0.03688888888888889, + "grad_norm": 58.880393981933594, + "learning_rate": 3e-06, + "loss": -4.8912, + "step": 415 + }, + { + "epoch": 0.036977777777777776, + "grad_norm": 52.00230407714844, + "learning_rate": 3e-06, + "loss": 0.9087, + "step": 416 + }, + { + "epoch": 0.037066666666666664, + "grad_norm": 54.192508697509766, + "learning_rate": 3e-06, + "loss": 1.5202, + "step": 417 + }, + { + "epoch": 0.03715555555555555, + "grad_norm": 52.10379409790039, + "learning_rate": 3e-06, + "loss": -0.0026, + "step": 418 + }, + { + "epoch": 0.03724444444444444, + "grad_norm": 56.19913864135742, + "learning_rate": 3e-06, + "loss": 0.9843, + "step": 419 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 51.573699951171875, + "learning_rate": 3e-06, + "loss": 0.9967, + "step": 420 + }, + { + "completion_length": 241.95833587646484, + "epoch": 0.037422222222222225, + "grad_norm": 65.98304748535156, + "learning_rate": 3e-06, + "loss": 1.8524, + "reward": 1.0937500596046448, + "reward_std": 0.3994170129299164, + "rewards/boxed_and_answer_tags_format_reward": 0.59375, + "rewards/correctness_reward_func_math": 0.5, + "step": 421, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.03751111111111111, + "grad_norm": 68.6063232421875, + "learning_rate": 3e-06, + "loss": -2.8346, + "step": 422 + }, + { + "epoch": 0.0376, + "grad_norm": 61.650146484375, + "learning_rate": 3e-06, + "loss": -2.9892, + "step": 423 + }, + { + "epoch": 0.03768888888888889, + "grad_norm": 66.45751953125, + "learning_rate": 3e-06, + "loss": -3.349, + "step": 424 + }, + { + "epoch": 0.03777777777777778, + "grad_norm": 66.84425354003906, + "learning_rate": 3e-06, + "loss": 2.716, + "step": 425 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 61.292354583740234, + "learning_rate": 3e-06, + "loss": 1.6447, + "step": 426 + }, + { + "epoch": 0.037955555555555555, + "grad_norm": 66.02394104003906, + "learning_rate": 3e-06, + "loss": 1.1078, + "step": 427 + }, + { + "epoch": 0.03804444444444444, + "grad_norm": 62.10107421875, + "learning_rate": 3e-06, + "loss": -3.9971, + "step": 428 + }, + { + "epoch": 0.03813333333333333, + "grad_norm": 85.67212677001953, + "learning_rate": 3e-06, + "loss": -3.8248, + "step": 429 + }, + { + "epoch": 0.03822222222222222, + "grad_norm": 60.61140060424805, + "learning_rate": 3e-06, + "loss": -4.433, + "step": 430 + }, + { + "epoch": 0.03831111111111111, + "grad_norm": 62.81836700439453, + "learning_rate": 3e-06, + "loss": 1.751, + "step": 431 + }, + { + "epoch": 0.0384, + "grad_norm": 60.05256652832031, + "learning_rate": 3e-06, + "loss": 0.8432, + "step": 432 + }, + { + "completion_length": 247.27083587646484, + "epoch": 0.03848888888888889, + "grad_norm": 83.67351531982422, + "learning_rate": 3e-06, + "loss": 6.9951, + "reward": 1.3229166865348816, + "reward_std": 0.3936077058315277, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.5833333544433117, + "step": 433, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.03857777777777778, + "grad_norm": 71.3005599975586, + "learning_rate": 3e-06, + "loss": 4.71, + "step": 434 + }, + { + "epoch": 0.03866666666666667, + "grad_norm": 81.9188003540039, + "learning_rate": 3e-06, + "loss": 8.7788, + "step": 435 + }, + { + "epoch": 0.03875555555555556, + "grad_norm": 79.42195129394531, + "learning_rate": 3e-06, + "loss": 11.129, + "step": 436 + }, + { + "epoch": 0.038844444444444445, + "grad_norm": 84.69261169433594, + "learning_rate": 3e-06, + "loss": 10.7206, + "step": 437 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 74.52008819580078, + "learning_rate": 3e-06, + "loss": 12.7332, + "step": 438 + }, + { + "epoch": 0.03902222222222222, + "grad_norm": 93.8567123413086, + "learning_rate": 3e-06, + "loss": 6.2331, + "step": 439 + }, + { + "epoch": 0.03911111111111111, + "grad_norm": 79.21229553222656, + "learning_rate": 3e-06, + "loss": 3.7102, + "step": 440 + }, + { + "epoch": 0.0392, + "grad_norm": 74.53849792480469, + "learning_rate": 3e-06, + "loss": 8.0091, + "step": 441 + }, + { + "epoch": 0.03928888888888889, + "grad_norm": 79.04943084716797, + "learning_rate": 3e-06, + "loss": 9.4256, + "step": 442 + }, + { + "epoch": 0.039377777777777775, + "grad_norm": 81.54142761230469, + "learning_rate": 3e-06, + "loss": 9.0032, + "step": 443 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 83.44096374511719, + "learning_rate": 3e-06, + "loss": 11.701, + "step": 444 + }, + { + "completion_length": 249.2291717529297, + "epoch": 0.03955555555555555, + "grad_norm": 78.02682495117188, + "learning_rate": 3e-06, + "loss": 1.3882, + "reward": 1.1458333730697632, + "reward_std": 0.33968228101730347, + "rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816, + "rewards/correctness_reward_func_math": 0.5416666679084301, + "step": 445, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.03964444444444445, + "grad_norm": 77.9029312133789, + "learning_rate": 3e-06, + "loss": -10.5754, + "step": 446 + }, + { + "epoch": 0.039733333333333336, + "grad_norm": 78.25541687011719, + "learning_rate": 3e-06, + "loss": -4.1056, + "step": 447 + }, + { + "epoch": 0.039822222222222224, + "grad_norm": 100.47134399414062, + "learning_rate": 3e-06, + "loss": -10.7897, + "step": 448 + }, + { + "epoch": 0.03991111111111111, + "grad_norm": 74.78359985351562, + "learning_rate": 3e-06, + "loss": -3.3144, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 96.55880737304688, + "learning_rate": 3e-06, + "loss": 2.8763, + "step": 450 + }, + { + "epoch": 0.04008888888888889, + "grad_norm": 87.14449310302734, + "learning_rate": 3e-06, + "loss": 0.2308, + "step": 451 + }, + { + "epoch": 0.04017777777777778, + "grad_norm": 97.77748107910156, + "learning_rate": 3e-06, + "loss": -11.2666, + "step": 452 + }, + { + "epoch": 0.040266666666666666, + "grad_norm": 73.44164276123047, + "learning_rate": 3e-06, + "loss": -5.678, + "step": 453 + }, + { + "epoch": 0.040355555555555554, + "grad_norm": 100.09737396240234, + "learning_rate": 3e-06, + "loss": -11.4908, + "step": 454 + }, + { + "epoch": 0.04044444444444444, + "grad_norm": 74.52122497558594, + "learning_rate": 3e-06, + "loss": -4.6635, + "step": 455 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 71.74639892578125, + "learning_rate": 3e-06, + "loss": 1.7931, + "step": 456 + }, + { + "completion_length": 252.52084350585938, + "epoch": 0.04062222222222222, + "grad_norm": 102.89392852783203, + "learning_rate": 3e-06, + "loss": 12.935, + "reward": 1.0625000596046448, + "reward_std": 0.43528568744659424, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.3750000111758709, + "step": 457, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.040711111111111115, + "grad_norm": 105.67884063720703, + "learning_rate": 3e-06, + "loss": 2.3682, + "step": 458 + }, + { + "epoch": 0.0408, + "grad_norm": 90.33670806884766, + "learning_rate": 3e-06, + "loss": 14.3722, + "step": 459 + }, + { + "epoch": 0.04088888888888889, + "grad_norm": 109.0367431640625, + "learning_rate": 3e-06, + "loss": 11.6758, + "step": 460 + }, + { + "epoch": 0.04097777777777778, + "grad_norm": 108.55673217773438, + "learning_rate": 3e-06, + "loss": 13.7948, + "step": 461 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 91.73406982421875, + "learning_rate": 3e-06, + "loss": 7.7516, + "step": 462 + }, + { + "epoch": 0.041155555555555556, + "grad_norm": 94.59785461425781, + "learning_rate": 3e-06, + "loss": 11.5977, + "step": 463 + }, + { + "epoch": 0.041244444444444445, + "grad_norm": 78.94522857666016, + "learning_rate": 3e-06, + "loss": 1.9425, + "step": 464 + }, + { + "epoch": 0.04133333333333333, + "grad_norm": 93.91060638427734, + "learning_rate": 3e-06, + "loss": 12.3454, + "step": 465 + }, + { + "epoch": 0.04142222222222222, + "grad_norm": 98.78986358642578, + "learning_rate": 3e-06, + "loss": 10.3247, + "step": 466 + }, + { + "epoch": 0.04151111111111111, + "grad_norm": 111.63731384277344, + "learning_rate": 3e-06, + "loss": 12.2581, + "step": 467 + }, + { + "epoch": 0.0416, + "grad_norm": 88.63348388671875, + "learning_rate": 3e-06, + "loss": 5.7115, + "step": 468 + }, + { + "completion_length": 235.58333587646484, + "epoch": 0.041688888888888886, + "grad_norm": 70.78716278076172, + "learning_rate": 3e-06, + "loss": -40.1334, + "reward": 1.4687500596046448, + "reward_std": 0.38577648997306824, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.7916666865348816, + "step": 469, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.041777777777777775, + "grad_norm": 93.38511657714844, + "learning_rate": 3e-06, + "loss": -36.9438, + "step": 470 + }, + { + "epoch": 0.04186666666666667, + "grad_norm": 84.67546081542969, + "learning_rate": 3e-06, + "loss": -43.221, + "step": 471 + }, + { + "epoch": 0.04195555555555556, + "grad_norm": 99.22493743896484, + "learning_rate": 3e-06, + "loss": -43.1284, + "step": 472 + }, + { + "epoch": 0.04204444444444445, + "grad_norm": 77.45098876953125, + "learning_rate": 3e-06, + "loss": -40.9152, + "step": 473 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 70.95470428466797, + "learning_rate": 3e-06, + "loss": -32.8293, + "step": 474 + }, + { + "epoch": 0.042222222222222223, + "grad_norm": 77.2640609741211, + "learning_rate": 3e-06, + "loss": -41.1009, + "step": 475 + }, + { + "epoch": 0.04231111111111111, + "grad_norm": 84.97554779052734, + "learning_rate": 3e-06, + "loss": -37.7295, + "step": 476 + }, + { + "epoch": 0.0424, + "grad_norm": 67.63621520996094, + "learning_rate": 3e-06, + "loss": -44.2423, + "step": 477 + }, + { + "epoch": 0.04248888888888889, + "grad_norm": 99.13645935058594, + "learning_rate": 3e-06, + "loss": -45.4078, + "step": 478 + }, + { + "epoch": 0.04257777777777778, + "grad_norm": 71.1223373413086, + "learning_rate": 3e-06, + "loss": -43.0538, + "step": 479 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 81.17517852783203, + "learning_rate": 3e-06, + "loss": -34.5254, + "step": 480 + }, + { + "completion_length": 250.5625, + "epoch": 0.042755555555555554, + "grad_norm": 111.48869323730469, + "learning_rate": 3e-06, + "loss": 20.9161, + "reward": 1.2291666865348816, + "reward_std": 0.26686520874500275, + "rewards/boxed_and_answer_tags_format_reward": 0.6041666865348816, + "rewards/correctness_reward_func_math": 0.6250000223517418, + "step": 481, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.04284444444444444, + "grad_norm": 110.74134063720703, + "learning_rate": 3e-06, + "loss": 21.5354, + "step": 482 + }, + { + "epoch": 0.04293333333333333, + "grad_norm": 121.27849578857422, + "learning_rate": 3e-06, + "loss": 22.1143, + "step": 483 + }, + { + "epoch": 0.043022222222222226, + "grad_norm": 111.5245590209961, + "learning_rate": 3e-06, + "loss": 27.6408, + "step": 484 + }, + { + "epoch": 0.043111111111111114, + "grad_norm": 100.0729751586914, + "learning_rate": 3e-06, + "loss": 24.0997, + "step": 485 + }, + { + "epoch": 0.0432, + "grad_norm": 123.4867935180664, + "learning_rate": 3e-06, + "loss": 19.3324, + "step": 486 + }, + { + "epoch": 0.04328888888888889, + "grad_norm": 107.24234008789062, + "learning_rate": 3e-06, + "loss": 18.5364, + "step": 487 + }, + { + "epoch": 0.04337777777777778, + "grad_norm": 113.5108413696289, + "learning_rate": 3e-06, + "loss": 18.1962, + "step": 488 + }, + { + "epoch": 0.04346666666666667, + "grad_norm": 136.6710968017578, + "learning_rate": 3e-06, + "loss": 17.9099, + "step": 489 + }, + { + "epoch": 0.043555555555555556, + "grad_norm": 110.78118896484375, + "learning_rate": 3e-06, + "loss": 24.1538, + "step": 490 + }, + { + "epoch": 0.043644444444444444, + "grad_norm": 87.0732192993164, + "learning_rate": 3e-06, + "loss": 20.6286, + "step": 491 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 111.51911163330078, + "learning_rate": 3e-06, + "loss": 15.8689, + "step": 492 + }, + { + "completion_length": 232.1666717529297, + "epoch": 0.04382222222222222, + "grad_norm": 58.495811462402344, + "learning_rate": 3e-06, + "loss": 1.5791, + "reward": 1.1145833730697632, + "reward_std": 0.20219221711158752, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.5, + "step": 493, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.04391111111111111, + "grad_norm": 66.24677276611328, + "learning_rate": 3e-06, + "loss": 8.869, + "step": 494 + }, + { + "epoch": 0.044, + "grad_norm": 75.63920593261719, + "learning_rate": 3e-06, + "loss": 6.6083, + "step": 495 + }, + { + "epoch": 0.044088888888888886, + "grad_norm": 41.957889556884766, + "learning_rate": 3e-06, + "loss": 6.373, + "step": 496 + }, + { + "epoch": 0.04417777777777778, + "grad_norm": 56.327693939208984, + "learning_rate": 3e-06, + "loss": 8.5285, + "step": 497 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 56.58005905151367, + "learning_rate": 3e-06, + "loss": 8.228, + "step": 498 + }, + { + "epoch": 0.04435555555555556, + "grad_norm": 56.65522766113281, + "learning_rate": 3e-06, + "loss": 1.1127, + "step": 499 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 52.785221099853516, + "learning_rate": 3e-06, + "loss": 8.324, + "step": 500 + }, + { + "epoch": 0.044533333333333334, + "grad_norm": 52.37721252441406, + "learning_rate": 3e-06, + "loss": 5.6566, + "step": 501 + }, + { + "epoch": 0.04462222222222222, + "grad_norm": 50.5732307434082, + "learning_rate": 3e-06, + "loss": 5.266, + "step": 502 + }, + { + "epoch": 0.04471111111111111, + "grad_norm": 59.614261627197266, + "learning_rate": 3e-06, + "loss": 7.5254, + "step": 503 + }, + { + "epoch": 0.0448, + "grad_norm": 57.78561782836914, + "learning_rate": 3e-06, + "loss": 6.7155, + "step": 504 + }, + { + "completion_length": 249.5416717529297, + "epoch": 0.04488888888888889, + "grad_norm": 75.29253387451172, + "learning_rate": 3e-06, + "loss": 1.5945, + "reward": 1.1979166865348816, + "reward_std": 0.3113893121480942, + "rewards/boxed_and_answer_tags_format_reward": 0.65625, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 505, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.044977777777777776, + "grad_norm": 71.35801696777344, + "learning_rate": 3e-06, + "loss": 6.5836, + "step": 506 + }, + { + "epoch": 0.045066666666666665, + "grad_norm": 68.37297058105469, + "learning_rate": 3e-06, + "loss": 2.1054, + "step": 507 + }, + { + "epoch": 0.04515555555555555, + "grad_norm": 67.4723892211914, + "learning_rate": 3e-06, + "loss": 2.3974, + "step": 508 + }, + { + "epoch": 0.04524444444444444, + "grad_norm": 84.1152114868164, + "learning_rate": 3e-06, + "loss": 0.1293, + "step": 509 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 95.73898315429688, + "learning_rate": 3e-06, + "loss": -3.5888, + "step": 510 + }, + { + "epoch": 0.045422222222222225, + "grad_norm": 73.4489974975586, + "learning_rate": 3e-06, + "loss": 0.6248, + "step": 511 + }, + { + "epoch": 0.04551111111111111, + "grad_norm": 67.3970947265625, + "learning_rate": 3e-06, + "loss": 5.0044, + "step": 512 + }, + { + "epoch": 0.0456, + "grad_norm": 68.55184936523438, + "learning_rate": 3e-06, + "loss": 1.048, + "step": 513 + }, + { + "epoch": 0.04568888888888889, + "grad_norm": 72.8236312866211, + "learning_rate": 3e-06, + "loss": 0.9001, + "step": 514 + }, + { + "epoch": 0.04577777777777778, + "grad_norm": 72.06463623046875, + "learning_rate": 3e-06, + "loss": -1.2144, + "step": 515 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 87.04244995117188, + "learning_rate": 3e-06, + "loss": -5.3325, + "step": 516 + }, + { + "completion_length": 235.27083587646484, + "epoch": 0.045955555555555555, + "grad_norm": 76.81670379638672, + "learning_rate": 3e-06, + "loss": -18.2768, + "reward": 1.2395833730697632, + "reward_std": 0.4973409175872803, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.625, + "step": 517, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.04604444444444444, + "grad_norm": 129.93211364746094, + "learning_rate": 3e-06, + "loss": -20.3578, + "step": 518 + }, + { + "epoch": 0.04613333333333333, + "grad_norm": 87.40878295898438, + "learning_rate": 3e-06, + "loss": -19.4647, + "step": 519 + }, + { + "epoch": 0.04622222222222222, + "grad_norm": 88.27845764160156, + "learning_rate": 3e-06, + "loss": -23.9652, + "step": 520 + }, + { + "epoch": 0.04631111111111111, + "grad_norm": 74.01776123046875, + "learning_rate": 3e-06, + "loss": -23.0054, + "step": 521 + }, + { + "epoch": 0.0464, + "grad_norm": 86.09662628173828, + "learning_rate": 3e-06, + "loss": -16.2859, + "step": 522 + }, + { + "epoch": 0.04648888888888889, + "grad_norm": 86.07221221923828, + "learning_rate": 3e-06, + "loss": -19.6548, + "step": 523 + }, + { + "epoch": 0.04657777777777778, + "grad_norm": 81.51930236816406, + "learning_rate": 3e-06, + "loss": -22.0114, + "step": 524 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 85.83316802978516, + "learning_rate": 3e-06, + "loss": -21.3756, + "step": 525 + }, + { + "epoch": 0.04675555555555556, + "grad_norm": 80.6872787475586, + "learning_rate": 3e-06, + "loss": -25.6275, + "step": 526 + }, + { + "epoch": 0.046844444444444445, + "grad_norm": 82.79136657714844, + "learning_rate": 3e-06, + "loss": -24.846, + "step": 527 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 91.39631652832031, + "learning_rate": 3e-06, + "loss": -17.8527, + "step": 528 + }, + { + "completion_length": 251.77084350585938, + "epoch": 0.04702222222222222, + "grad_norm": 58.933895111083984, + "learning_rate": 3e-06, + "loss": 1.5821, + "reward": 1.7083333730697632, + "reward_std": 0.29204893112182617, + "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, + "rewards/correctness_reward_func_math": 1.0, + "step": 529, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.04711111111111111, + "grad_norm": 70.82673645019531, + "learning_rate": 3e-06, + "loss": 4.666, + "step": 530 + }, + { + "epoch": 0.0472, + "grad_norm": 62.45388412475586, + "learning_rate": 3e-06, + "loss": -0.8319, + "step": 531 + }, + { + "epoch": 0.04728888888888889, + "grad_norm": 68.45257568359375, + "learning_rate": 3e-06, + "loss": -4.2921, + "step": 532 + }, + { + "epoch": 0.047377777777777776, + "grad_norm": 63.96629333496094, + "learning_rate": 3e-06, + "loss": 7.1791, + "step": 533 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 71.47022247314453, + "learning_rate": 3e-06, + "loss": -0.0559, + "step": 534 + }, + { + "epoch": 0.04755555555555555, + "grad_norm": 69.21123504638672, + "learning_rate": 3e-06, + "loss": 1.1193, + "step": 535 + }, + { + "epoch": 0.04764444444444445, + "grad_norm": 54.05726623535156, + "learning_rate": 3e-06, + "loss": 3.7663, + "step": 536 + }, + { + "epoch": 0.047733333333333336, + "grad_norm": 57.079166412353516, + "learning_rate": 3e-06, + "loss": -1.8682, + "step": 537 + }, + { + "epoch": 0.047822222222222224, + "grad_norm": 89.69593811035156, + "learning_rate": 3e-06, + "loss": -5.0232, + "step": 538 + }, + { + "epoch": 0.04791111111111111, + "grad_norm": 68.15176391601562, + "learning_rate": 3e-06, + "loss": 6.4281, + "step": 539 + }, + { + "epoch": 0.048, + "grad_norm": 71.53436279296875, + "learning_rate": 3e-06, + "loss": -0.8678, + "step": 540 + }, + { + "completion_length": 248.4791717529297, + "epoch": 0.04808888888888889, + "grad_norm": 111.93199157714844, + "learning_rate": 3e-06, + "loss": -3.1821, + "reward": 1.3125, + "reward_std": 0.7091469466686249, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.625, + "step": 541, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.04817777777777778, + "grad_norm": 104.72472381591797, + "learning_rate": 3e-06, + "loss": -9.114, + "step": 542 + }, + { + "epoch": 0.048266666666666666, + "grad_norm": 137.42185974121094, + "learning_rate": 3e-06, + "loss": -12.9595, + "step": 543 + }, + { + "epoch": 0.048355555555555554, + "grad_norm": 115.6964340209961, + "learning_rate": 3e-06, + "loss": -16.5078, + "step": 544 + }, + { + "epoch": 0.04844444444444444, + "grad_norm": 107.43921661376953, + "learning_rate": 3e-06, + "loss": -12.0856, + "step": 545 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 139.91366577148438, + "learning_rate": 3e-06, + "loss": -8.5451, + "step": 546 + }, + { + "epoch": 0.04862222222222222, + "grad_norm": 107.25052642822266, + "learning_rate": 3e-06, + "loss": -4.7342, + "step": 547 + }, + { + "epoch": 0.04871111111111111, + "grad_norm": 104.6925048828125, + "learning_rate": 3e-06, + "loss": -10.6582, + "step": 548 + }, + { + "epoch": 0.0488, + "grad_norm": 108.13795471191406, + "learning_rate": 3e-06, + "loss": -14.883, + "step": 549 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 108.62395477294922, + "learning_rate": 3e-06, + "loss": -19.0199, + "step": 550 + }, + { + "epoch": 0.04897777777777778, + "grad_norm": 103.06570434570312, + "learning_rate": 3e-06, + "loss": -14.306, + "step": 551 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 125.94219970703125, + "learning_rate": 3e-06, + "loss": -10.2489, + "step": 552 + }, + { + "completion_length": 221.43750762939453, + "epoch": 0.049155555555555557, + "grad_norm": 90.84999084472656, + "learning_rate": 3e-06, + "loss": 12.0477, + "reward": 2.0104166865348816, + "reward_std": 0.406316339969635, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 1.3333333730697632, + "step": 553, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.049244444444444445, + "grad_norm": 81.99716186523438, + "learning_rate": 3e-06, + "loss": 11.4046, + "step": 554 + }, + { + "epoch": 0.04933333333333333, + "grad_norm": 89.69168090820312, + "learning_rate": 3e-06, + "loss": 15.5262, + "step": 555 + }, + { + "epoch": 0.04942222222222222, + "grad_norm": 93.98981475830078, + "learning_rate": 3e-06, + "loss": 15.7367, + "step": 556 + }, + { + "epoch": 0.04951111111111111, + "grad_norm": 86.68983459472656, + "learning_rate": 3e-06, + "loss": 5.511, + "step": 557 + }, + { + "epoch": 0.0496, + "grad_norm": 78.73108673095703, + "learning_rate": 3e-06, + "loss": 13.5853, + "step": 558 + }, + { + "epoch": 0.04968888888888889, + "grad_norm": 88.60321044921875, + "learning_rate": 3e-06, + "loss": 10.8131, + "step": 559 + }, + { + "epoch": 0.049777777777777775, + "grad_norm": 78.82019805908203, + "learning_rate": 3e-06, + "loss": 10.9253, + "step": 560 + }, + { + "epoch": 0.04986666666666666, + "grad_norm": 91.8647232055664, + "learning_rate": 3e-06, + "loss": 14.6415, + "step": 561 + }, + { + "epoch": 0.04995555555555556, + "grad_norm": 106.29496765136719, + "learning_rate": 3e-06, + "loss": 14.4603, + "step": 562 + }, + { + "epoch": 0.05004444444444445, + "grad_norm": 72.984130859375, + "learning_rate": 3e-06, + "loss": 4.1028, + "step": 563 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 88.46710205078125, + "learning_rate": 3e-06, + "loss": 11.4185, + "step": 564 + }, + { + "completion_length": 238.89584350585938, + "epoch": 0.050222222222222224, + "grad_norm": 91.9970703125, + "learning_rate": 3e-06, + "loss": -6.0724, + "reward": 1.0729166865348816, + "reward_std": 0.5935818552970886, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.4583333283662796, + "step": 565, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.05031111111111111, + "grad_norm": 101.35919952392578, + "learning_rate": 3e-06, + "loss": -6.1114, + "step": 566 + }, + { + "epoch": 0.0504, + "grad_norm": 100.29364776611328, + "learning_rate": 3e-06, + "loss": 3.7467, + "step": 567 + }, + { + "epoch": 0.05048888888888889, + "grad_norm": 93.71849822998047, + "learning_rate": 3e-06, + "loss": 7.819, + "step": 568 + }, + { + "epoch": 0.05057777777777778, + "grad_norm": 84.14008331298828, + "learning_rate": 3e-06, + "loss": -3.9591, + "step": 569 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 91.14092254638672, + "learning_rate": 3e-06, + "loss": -7.111, + "step": 570 + }, + { + "epoch": 0.050755555555555554, + "grad_norm": 93.15682220458984, + "learning_rate": 3e-06, + "loss": -7.5642, + "step": 571 + }, + { + "epoch": 0.05084444444444444, + "grad_norm": 113.89299011230469, + "learning_rate": 3e-06, + "loss": -6.5357, + "step": 572 + }, + { + "epoch": 0.05093333333333333, + "grad_norm": 91.64227294921875, + "learning_rate": 3e-06, + "loss": 2.2068, + "step": 573 + }, + { + "epoch": 0.05102222222222222, + "grad_norm": 108.78038024902344, + "learning_rate": 3e-06, + "loss": 5.8992, + "step": 574 + }, + { + "epoch": 0.051111111111111114, + "grad_norm": 83.73683166503906, + "learning_rate": 3e-06, + "loss": -5.9239, + "step": 575 + }, + { + "epoch": 0.0512, + "grad_norm": 143.7006378173828, + "learning_rate": 3e-06, + "loss": -8.4902, + "step": 576 + }, + { + "completion_length": 243.7291717529297, + "epoch": 0.05128888888888889, + "grad_norm": 82.69302368164062, + "learning_rate": 3e-06, + "loss": 26.9064, + "reward": 1.4583333730697632, + "reward_std": 0.4701542556285858, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.708333358168602, + "step": 577, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.05137777777777778, + "grad_norm": 77.71623992919922, + "learning_rate": 3e-06, + "loss": 24.0495, + "step": 578 + }, + { + "epoch": 0.05146666666666667, + "grad_norm": 80.28130340576172, + "learning_rate": 3e-06, + "loss": 19.1044, + "step": 579 + }, + { + "epoch": 0.051555555555555556, + "grad_norm": 86.22237396240234, + "learning_rate": 3e-06, + "loss": 22.9667, + "step": 580 + }, + { + "epoch": 0.051644444444444444, + "grad_norm": 94.3071060180664, + "learning_rate": 3e-06, + "loss": 17.6854, + "step": 581 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 86.01050567626953, + "learning_rate": 3e-06, + "loss": 28.4794, + "step": 582 + }, + { + "epoch": 0.05182222222222222, + "grad_norm": 87.72801971435547, + "learning_rate": 3e-06, + "loss": 25.3194, + "step": 583 + }, + { + "epoch": 0.05191111111111111, + "grad_norm": 74.66322326660156, + "learning_rate": 3e-06, + "loss": 22.9833, + "step": 584 + }, + { + "epoch": 0.052, + "grad_norm": 90.72804260253906, + "learning_rate": 3e-06, + "loss": 17.4641, + "step": 585 + }, + { + "epoch": 0.052088888888888886, + "grad_norm": 87.39917755126953, + "learning_rate": 3e-06, + "loss": 21.6816, + "step": 586 + }, + { + "epoch": 0.052177777777777774, + "grad_norm": 85.64997100830078, + "learning_rate": 3e-06, + "loss": 15.9135, + "step": 587 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 92.35039520263672, + "learning_rate": 3e-06, + "loss": 26.4856, + "step": 588 + }, + { + "completion_length": 248.3541717529297, + "epoch": 0.05235555555555556, + "grad_norm": 93.8902359008789, + "learning_rate": 3e-06, + "loss": 14.9976, + "reward": 1.3541666865348816, + "reward_std": 0.5839263796806335, + "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, + "rewards/correctness_reward_func_math": 0.6250000149011612, + "step": 589, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.052444444444444446, + "grad_norm": 98.34622192382812, + "learning_rate": 3e-06, + "loss": 12.6228, + "step": 590 + }, + { + "epoch": 0.052533333333333335, + "grad_norm": 108.93590545654297, + "learning_rate": 3e-06, + "loss": 9.3556, + "step": 591 + }, + { + "epoch": 0.05262222222222222, + "grad_norm": 114.49545288085938, + "learning_rate": 3e-06, + "loss": 23.8665, + "step": 592 + }, + { + "epoch": 0.05271111111111111, + "grad_norm": 102.37223052978516, + "learning_rate": 3e-06, + "loss": 7.3813, + "step": 593 + }, + { + "epoch": 0.0528, + "grad_norm": 98.17306518554688, + "learning_rate": 3e-06, + "loss": 13.3691, + "step": 594 + }, + { + "epoch": 0.05288888888888889, + "grad_norm": 95.84387969970703, + "learning_rate": 3e-06, + "loss": 13.8857, + "step": 595 + }, + { + "epoch": 0.052977777777777776, + "grad_norm": 101.44741821289062, + "learning_rate": 3e-06, + "loss": 11.0637, + "step": 596 + }, + { + "epoch": 0.053066666666666665, + "grad_norm": 96.87228393554688, + "learning_rate": 3e-06, + "loss": 7.1168, + "step": 597 + }, + { + "epoch": 0.05315555555555555, + "grad_norm": 129.4097442626953, + "learning_rate": 3e-06, + "loss": 22.4722, + "step": 598 + }, + { + "epoch": 0.05324444444444444, + "grad_norm": 100.4094467163086, + "learning_rate": 3e-06, + "loss": 4.9249, + "step": 599 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 97.2879638671875, + "learning_rate": 3e-06, + "loss": 10.7864, + "step": 600 + }, + { + "completion_length": 249.31250762939453, + "epoch": 0.053422222222222225, + "grad_norm": 81.29678344726562, + "learning_rate": 3e-06, + "loss": -1.4361, + "reward": 1.6458333730697632, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 601, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.05351111111111111, + "grad_norm": 71.18397521972656, + "learning_rate": 3e-06, + "loss": -9.7138, + "step": 602 + }, + { + "epoch": 0.0536, + "grad_norm": 79.09825897216797, + "learning_rate": 3e-06, + "loss": -7.5626, + "step": 603 + }, + { + "epoch": 0.05368888888888889, + "grad_norm": 88.4588394165039, + "learning_rate": 3e-06, + "loss": -8.3433, + "step": 604 + }, + { + "epoch": 0.05377777777777778, + "grad_norm": 75.85662841796875, + "learning_rate": 3e-06, + "loss": -3.126, + "step": 605 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 76.75032806396484, + "learning_rate": 3e-06, + "loss": -4.1707, + "step": 606 + }, + { + "epoch": 0.053955555555555555, + "grad_norm": 96.06957244873047, + "learning_rate": 3e-06, + "loss": -2.3824, + "step": 607 + }, + { + "epoch": 0.054044444444444444, + "grad_norm": 108.4106674194336, + "learning_rate": 3e-06, + "loss": -10.8022, + "step": 608 + }, + { + "epoch": 0.05413333333333333, + "grad_norm": 82.68360900878906, + "learning_rate": 3e-06, + "loss": -9.1987, + "step": 609 + }, + { + "epoch": 0.05422222222222222, + "grad_norm": 92.35367584228516, + "learning_rate": 3e-06, + "loss": -9.379, + "step": 610 + }, + { + "epoch": 0.05431111111111111, + "grad_norm": 78.61454010009766, + "learning_rate": 3e-06, + "loss": -4.4421, + "step": 611 + }, + { + "epoch": 0.0544, + "grad_norm": 83.68685150146484, + "learning_rate": 3e-06, + "loss": -5.8651, + "step": 612 + }, + { + "completion_length": 252.8541717529297, + "epoch": 0.05448888888888889, + "grad_norm": 156.99725341796875, + "learning_rate": 3e-06, + "loss": -38.1088, + "reward": 1.2395833730697632, + "reward_std": 0.3936076909303665, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.5, + "step": 613, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.05457777777777778, + "grad_norm": 82.39892578125, + "learning_rate": 3e-06, + "loss": -24.9067, + "step": 614 + }, + { + "epoch": 0.05466666666666667, + "grad_norm": 76.3405532836914, + "learning_rate": 3e-06, + "loss": -21.2904, + "step": 615 + }, + { + "epoch": 0.05475555555555556, + "grad_norm": 114.32887268066406, + "learning_rate": 3e-06, + "loss": -41.3273, + "step": 616 + }, + { + "epoch": 0.054844444444444446, + "grad_norm": 90.10194396972656, + "learning_rate": 3e-06, + "loss": -19.8759, + "step": 617 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 101.73013305664062, + "learning_rate": 3e-06, + "loss": -31.9803, + "step": 618 + }, + { + "epoch": 0.05502222222222222, + "grad_norm": 107.18656921386719, + "learning_rate": 3e-06, + "loss": -38.8443, + "step": 619 + }, + { + "epoch": 0.05511111111111111, + "grad_norm": 85.78763580322266, + "learning_rate": 3e-06, + "loss": -26.4852, + "step": 620 + }, + { + "epoch": 0.0552, + "grad_norm": 78.22523498535156, + "learning_rate": 3e-06, + "loss": -22.9265, + "step": 621 + }, + { + "epoch": 0.05528888888888889, + "grad_norm": 120.62594604492188, + "learning_rate": 3e-06, + "loss": -44.6351, + "step": 622 + }, + { + "epoch": 0.055377777777777776, + "grad_norm": 93.32075500488281, + "learning_rate": 3e-06, + "loss": -21.4564, + "step": 623 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 97.2727279663086, + "learning_rate": 3e-06, + "loss": -34.0292, + "step": 624 + }, + { + "completion_length": 247.5, + "epoch": 0.05555555555555555, + "grad_norm": 98.01384735107422, + "learning_rate": 3e-06, + "loss": -25.8639, + "reward": 1.9895833730697632, + "reward_std": 0.6326004266738892, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 1.25, + "step": 625, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.05564444444444445, + "grad_norm": 110.7737045288086, + "learning_rate": 3e-06, + "loss": -26.4317, + "step": 626 + }, + { + "epoch": 0.055733333333333336, + "grad_norm": 100.14824676513672, + "learning_rate": 3e-06, + "loss": -17.6826, + "step": 627 + }, + { + "epoch": 0.055822222222222224, + "grad_norm": 95.32125854492188, + "learning_rate": 3e-06, + "loss": -13.7046, + "step": 628 + }, + { + "epoch": 0.05591111111111111, + "grad_norm": 153.66207885742188, + "learning_rate": 3e-06, + "loss": -33.0736, + "step": 629 + }, + { + "epoch": 0.056, + "grad_norm": 98.00647735595703, + "learning_rate": 3e-06, + "loss": -23.4488, + "step": 630 + }, + { + "epoch": 0.05608888888888889, + "grad_norm": 101.93690490722656, + "learning_rate": 3e-06, + "loss": -27.788, + "step": 631 + }, + { + "epoch": 0.05617777777777778, + "grad_norm": 109.6976089477539, + "learning_rate": 3e-06, + "loss": -27.8496, + "step": 632 + }, + { + "epoch": 0.056266666666666666, + "grad_norm": 94.93986511230469, + "learning_rate": 3e-06, + "loss": -18.741, + "step": 633 + }, + { + "epoch": 0.056355555555555555, + "grad_norm": 99.8827133178711, + "learning_rate": 3e-06, + "loss": -15.4788, + "step": 634 + }, + { + "epoch": 0.05644444444444444, + "grad_norm": 157.86849975585938, + "learning_rate": 3e-06, + "loss": -35.8463, + "step": 635 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 109.21646118164062, + "learning_rate": 3e-06, + "loss": -25.4238, + "step": 636 + }, + { + "completion_length": 247.5416717529297, + "epoch": 0.05662222222222222, + "grad_norm": 107.59577941894531, + "learning_rate": 3e-06, + "loss": 2.5406, + "reward": 0.71875, + "reward_std": 0.11004260182380676, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.0416666679084301, + "step": 637, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.05671111111111111, + "grad_norm": 92.30776977539062, + "learning_rate": 3e-06, + "loss": 3.1641, + "step": 638 + }, + { + "epoch": 0.0568, + "grad_norm": 41.486106872558594, + "learning_rate": 3e-06, + "loss": 4.5319, + "step": 639 + }, + { + "epoch": 0.05688888888888889, + "grad_norm": 50.142147064208984, + "learning_rate": 3e-06, + "loss": 0.4783, + "step": 640 + }, + { + "epoch": 0.05697777777777778, + "grad_norm": 41.67461395263672, + "learning_rate": 3e-06, + "loss": 0.8438, + "step": 641 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 51.555999755859375, + "learning_rate": 3e-06, + "loss": 0.3726, + "step": 642 + }, + { + "epoch": 0.05715555555555556, + "grad_norm": 56.56801223754883, + "learning_rate": 3e-06, + "loss": 2.3059, + "step": 643 + }, + { + "epoch": 0.057244444444444445, + "grad_norm": 57.52075958251953, + "learning_rate": 3e-06, + "loss": 2.0942, + "step": 644 + }, + { + "epoch": 0.05733333333333333, + "grad_norm": 51.82474136352539, + "learning_rate": 3e-06, + "loss": 3.4151, + "step": 645 + }, + { + "epoch": 0.05742222222222222, + "grad_norm": 45.45164108276367, + "learning_rate": 3e-06, + "loss": -0.6204, + "step": 646 + }, + { + "epoch": 0.05751111111111111, + "grad_norm": 43.34312057495117, + "learning_rate": 3e-06, + "loss": -0.6824, + "step": 647 + }, + { + "epoch": 0.0576, + "grad_norm": 88.9341812133789, + "learning_rate": 3e-06, + "loss": -0.345, + "step": 648 + }, + { + "completion_length": 246.81250762939453, + "epoch": 0.05768888888888889, + "grad_norm": 91.70360565185547, + "learning_rate": 3e-06, + "loss": -21.4258, + "reward": 1.8541666865348816, + "reward_std": 0.3332235887646675, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.1666666567325592, + "step": 649, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 117.84858703613281, + "learning_rate": 3e-06, + "loss": -24.4443, + "step": 650 + }, + { + "epoch": 0.057866666666666663, + "grad_norm": 94.53849792480469, + "learning_rate": 3e-06, + "loss": -21.691, + "step": 651 + }, + { + "epoch": 0.05795555555555556, + "grad_norm": 97.39710998535156, + "learning_rate": 3e-06, + "loss": -24.1024, + "step": 652 + }, + { + "epoch": 0.05804444444444445, + "grad_norm": 90.82528686523438, + "learning_rate": 3e-06, + "loss": -23.0164, + "step": 653 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 77.49068450927734, + "learning_rate": 3e-06, + "loss": -22.6643, + "step": 654 + }, + { + "epoch": 0.058222222222222224, + "grad_norm": 87.06861114501953, + "learning_rate": 3e-06, + "loss": -22.1806, + "step": 655 + }, + { + "epoch": 0.05831111111111111, + "grad_norm": 116.24286651611328, + "learning_rate": 3e-06, + "loss": -25.0616, + "step": 656 + }, + { + "epoch": 0.0584, + "grad_norm": 80.90653228759766, + "learning_rate": 3e-06, + "loss": -23.1949, + "step": 657 + }, + { + "epoch": 0.05848888888888889, + "grad_norm": 122.82756042480469, + "learning_rate": 3e-06, + "loss": -25.4261, + "step": 658 + }, + { + "epoch": 0.05857777777777778, + "grad_norm": 87.51302337646484, + "learning_rate": 3e-06, + "loss": -24.0548, + "step": 659 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 85.40345764160156, + "learning_rate": 3e-06, + "loss": -23.4326, + "step": 660 + }, + { + "completion_length": 250.9375, + "epoch": 0.058755555555555554, + "grad_norm": 40.29450607299805, + "learning_rate": 3e-06, + "loss": 9.1824, + "reward": 0.8541666865348816, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.2916666567325592, + "step": 661, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.05884444444444444, + "grad_norm": 53.10469436645508, + "learning_rate": 3e-06, + "loss": 5.2732, + "step": 662 + }, + { + "epoch": 0.05893333333333333, + "grad_norm": 53.52404022216797, + "learning_rate": 3e-06, + "loss": 9.7305, + "step": 663 + }, + { + "epoch": 0.05902222222222222, + "grad_norm": 43.00156021118164, + "learning_rate": 3e-06, + "loss": 10.5135, + "step": 664 + }, + { + "epoch": 0.059111111111111114, + "grad_norm": 51.94622802734375, + "learning_rate": 3e-06, + "loss": 8.7434, + "step": 665 + }, + { + "epoch": 0.0592, + "grad_norm": 45.08756637573242, + "learning_rate": 3e-06, + "loss": 10.9107, + "step": 666 + }, + { + "epoch": 0.05928888888888889, + "grad_norm": 38.80097961425781, + "learning_rate": 3e-06, + "loss": 9.0445, + "step": 667 + }, + { + "epoch": 0.05937777777777778, + "grad_norm": 50.45252227783203, + "learning_rate": 3e-06, + "loss": 4.2955, + "step": 668 + }, + { + "epoch": 0.05946666666666667, + "grad_norm": 42.282501220703125, + "learning_rate": 3e-06, + "loss": 8.7618, + "step": 669 + }, + { + "epoch": 0.059555555555555556, + "grad_norm": 40.22513961791992, + "learning_rate": 3e-06, + "loss": 9.5891, + "step": 670 + }, + { + "epoch": 0.059644444444444444, + "grad_norm": 55.79698181152344, + "learning_rate": 3e-06, + "loss": 7.7516, + "step": 671 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 41.462181091308594, + "learning_rate": 3e-06, + "loss": 10.17, + "step": 672 + }, + { + "completion_length": 238.4375, + "epoch": 0.05982222222222222, + "grad_norm": 95.73463439941406, + "learning_rate": 3e-06, + "loss": 14.012, + "reward": 1.4583333730697632, + "reward_std": 0.3410547822713852, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7083333134651184, + "step": 673, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.05991111111111111, + "grad_norm": 92.55806732177734, + "learning_rate": 3e-06, + "loss": 7.0593, + "step": 674 + }, + { + "epoch": 0.06, + "grad_norm": 90.97936248779297, + "learning_rate": 3e-06, + "loss": 6.1863, + "step": 675 + }, + { + "epoch": 0.060088888888888886, + "grad_norm": 82.84268951416016, + "learning_rate": 3e-06, + "loss": -3.7791, + "step": 676 + }, + { + "epoch": 0.060177777777777774, + "grad_norm": 93.55880737304688, + "learning_rate": 3e-06, + "loss": 1.1706, + "step": 677 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 143.54983520507812, + "learning_rate": 3e-06, + "loss": 5.2435, + "step": 678 + }, + { + "epoch": 0.06035555555555556, + "grad_norm": 103.67829132080078, + "learning_rate": 3e-06, + "loss": 12.2804, + "step": 679 + }, + { + "epoch": 0.060444444444444446, + "grad_norm": 94.47793579101562, + "learning_rate": 3e-06, + "loss": 5.8252, + "step": 680 + }, + { + "epoch": 0.060533333333333335, + "grad_norm": 87.38697814941406, + "learning_rate": 3e-06, + "loss": 4.5993, + "step": 681 + }, + { + "epoch": 0.06062222222222222, + "grad_norm": 79.12389373779297, + "learning_rate": 3e-06, + "loss": -4.8813, + "step": 682 + }, + { + "epoch": 0.06071111111111111, + "grad_norm": 97.0263900756836, + "learning_rate": 3e-06, + "loss": -0.474, + "step": 683 + }, + { + "epoch": 0.0608, + "grad_norm": 183.30641174316406, + "learning_rate": 3e-06, + "loss": 2.9729, + "step": 684 + }, + { + "completion_length": 255.4166717529297, + "epoch": 0.06088888888888889, + "grad_norm": 119.54466247558594, + "learning_rate": 3e-06, + "loss": -13.0228, + "reward": 1.4583333730697632, + "reward_std": 0.505022794008255, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.708333358168602, + "step": 685, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.06097777777777778, + "grad_norm": 121.260986328125, + "learning_rate": 3e-06, + "loss": -7.3608, + "step": 686 + }, + { + "epoch": 0.061066666666666665, + "grad_norm": 107.9063720703125, + "learning_rate": 3e-06, + "loss": -2.885, + "step": 687 + }, + { + "epoch": 0.06115555555555555, + "grad_norm": 122.71879577636719, + "learning_rate": 3e-06, + "loss": -9.4339, + "step": 688 + }, + { + "epoch": 0.06124444444444444, + "grad_norm": 98.11092376708984, + "learning_rate": 3e-06, + "loss": -7.9372, + "step": 689 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 106.66928100585938, + "learning_rate": 3e-06, + "loss": 2.2675, + "step": 690 + }, + { + "epoch": 0.061422222222222225, + "grad_norm": 117.10845947265625, + "learning_rate": 3e-06, + "loss": -13.9389, + "step": 691 + }, + { + "epoch": 0.061511111111111114, + "grad_norm": 123.24708557128906, + "learning_rate": 3e-06, + "loss": -9.064, + "step": 692 + }, + { + "epoch": 0.0616, + "grad_norm": 108.9686050415039, + "learning_rate": 3e-06, + "loss": -4.3708, + "step": 693 + }, + { + "epoch": 0.06168888888888889, + "grad_norm": 120.98512268066406, + "learning_rate": 3e-06, + "loss": -10.9314, + "step": 694 + }, + { + "epoch": 0.06177777777777778, + "grad_norm": 96.25732421875, + "learning_rate": 3e-06, + "loss": -9.4401, + "step": 695 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 107.97279357910156, + "learning_rate": 3e-06, + "loss": 0.1328, + "step": 696 + }, + { + "completion_length": 242.4166717529297, + "epoch": 0.061955555555555555, + "grad_norm": 77.24815368652344, + "learning_rate": 3e-06, + "loss": 21.7633, + "reward": 1.6875, + "reward_std": 0.3680921494960785, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.0, + "step": 697, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.062044444444444444, + "grad_norm": 90.61959075927734, + "learning_rate": 3e-06, + "loss": 21.3004, + "step": 698 + }, + { + "epoch": 0.06213333333333333, + "grad_norm": 79.43978881835938, + "learning_rate": 3e-06, + "loss": 22.885, + "step": 699 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 88.06346130371094, + "learning_rate": 3e-06, + "loss": 16.6794, + "step": 700 + }, + { + "epoch": 0.06231111111111111, + "grad_norm": 215.39535522460938, + "learning_rate": 3e-06, + "loss": 18.8777, + "step": 701 + }, + { + "epoch": 0.0624, + "grad_norm": 92.26004791259766, + "learning_rate": 3e-06, + "loss": 11.0055, + "step": 702 + }, + { + "epoch": 0.062488888888888885, + "grad_norm": 77.40907287597656, + "learning_rate": 3e-06, + "loss": 20.7747, + "step": 703 + }, + { + "epoch": 0.06257777777777777, + "grad_norm": 100.23749542236328, + "learning_rate": 3e-06, + "loss": 20.3858, + "step": 704 + }, + { + "epoch": 0.06266666666666666, + "grad_norm": 75.75386047363281, + "learning_rate": 3e-06, + "loss": 22.1331, + "step": 705 + }, + { + "epoch": 0.06275555555555555, + "grad_norm": 88.5040054321289, + "learning_rate": 3e-06, + "loss": 15.6211, + "step": 706 + }, + { + "epoch": 0.06284444444444444, + "grad_norm": 83.19754028320312, + "learning_rate": 3e-06, + "loss": 17.9444, + "step": 707 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 88.55027770996094, + "learning_rate": 3e-06, + "loss": 9.7909, + "step": 708 + }, + { + "completion_length": 253.8541717529297, + "epoch": 0.06302222222222222, + "grad_norm": 106.40866088867188, + "learning_rate": 3e-06, + "loss": -1.6061, + "reward": 1.2708333730697632, + "reward_std": 0.38547582924366, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5833333432674408, + "step": 709, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.06311111111111112, + "grad_norm": 123.38434600830078, + "learning_rate": 3e-06, + "loss": -9.3301, + "step": 710 + }, + { + "epoch": 0.0632, + "grad_norm": 101.67454528808594, + "learning_rate": 3e-06, + "loss": -7.1273, + "step": 711 + }, + { + "epoch": 0.0632888888888889, + "grad_norm": 112.82793426513672, + "learning_rate": 3e-06, + "loss": -9.8195, + "step": 712 + }, + { + "epoch": 0.06337777777777778, + "grad_norm": 108.99236297607422, + "learning_rate": 3e-06, + "loss": -2.3741, + "step": 713 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 106.9615478515625, + "learning_rate": 3e-06, + "loss": -7.792, + "step": 714 + }, + { + "epoch": 0.06355555555555556, + "grad_norm": 94.91867065429688, + "learning_rate": 3e-06, + "loss": -3.0819, + "step": 715 + }, + { + "epoch": 0.06364444444444445, + "grad_norm": 114.82243347167969, + "learning_rate": 3e-06, + "loss": -11.4017, + "step": 716 + }, + { + "epoch": 0.06373333333333334, + "grad_norm": 104.33937072753906, + "learning_rate": 3e-06, + "loss": -9.7339, + "step": 717 + }, + { + "epoch": 0.06382222222222222, + "grad_norm": 136.31576538085938, + "learning_rate": 3e-06, + "loss": -12.1719, + "step": 718 + }, + { + "epoch": 0.06391111111111111, + "grad_norm": 99.78816223144531, + "learning_rate": 3e-06, + "loss": -4.7219, + "step": 719 + }, + { + "epoch": 0.064, + "grad_norm": 120.37998962402344, + "learning_rate": 3e-06, + "loss": -10.5969, + "step": 720 + }, + { + "completion_length": 244.0416717529297, + "epoch": 0.06408888888888889, + "grad_norm": 101.32681274414062, + "learning_rate": 3e-06, + "loss": 2.1527, + "reward": 1.0, + "reward_std": 0.4417443424463272, + "rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816, + "rewards/correctness_reward_func_math": 0.3333333246409893, + "step": 721, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.06417777777777778, + "grad_norm": 113.38105773925781, + "learning_rate": 3e-06, + "loss": 11.181, + "step": 722 + }, + { + "epoch": 0.06426666666666667, + "grad_norm": 114.85003662109375, + "learning_rate": 3e-06, + "loss": -2.1413, + "step": 723 + }, + { + "epoch": 0.06435555555555555, + "grad_norm": 133.62515258789062, + "learning_rate": 3e-06, + "loss": 9.3709, + "step": 724 + }, + { + "epoch": 0.06444444444444444, + "grad_norm": 113.68856048583984, + "learning_rate": 3e-06, + "loss": -3.0918, + "step": 725 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 161.93836975097656, + "learning_rate": 3e-06, + "loss": -1.9418, + "step": 726 + }, + { + "epoch": 0.06462222222222222, + "grad_norm": 100.46546173095703, + "learning_rate": 3e-06, + "loss": 0.5341, + "step": 727 + }, + { + "epoch": 0.06471111111111111, + "grad_norm": 141.3654022216797, + "learning_rate": 3e-06, + "loss": 9.9107, + "step": 728 + }, + { + "epoch": 0.0648, + "grad_norm": 131.8665771484375, + "learning_rate": 3e-06, + "loss": -3.674, + "step": 729 + }, + { + "epoch": 0.06488888888888888, + "grad_norm": 128.03195190429688, + "learning_rate": 3e-06, + "loss": 6.8735, + "step": 730 + }, + { + "epoch": 0.06497777777777777, + "grad_norm": 117.97486114501953, + "learning_rate": 3e-06, + "loss": -5.1859, + "step": 731 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 158.5392303466797, + "learning_rate": 3e-06, + "loss": -4.222, + "step": 732 + }, + { + "completion_length": 249.08334350585938, + "epoch": 0.06515555555555555, + "grad_norm": 122.15642547607422, + "learning_rate": 3e-06, + "loss": 5.173, + "reward": 1.1770833730697632, + "reward_std": 0.2915456295013428, + "rewards/boxed_and_answer_tags_format_reward": 0.6354166567325592, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 733, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.06524444444444444, + "grad_norm": 122.5665054321289, + "learning_rate": 3e-06, + "loss": -10.2692, + "step": 734 + }, + { + "epoch": 0.06533333333333333, + "grad_norm": 95.55500030517578, + "learning_rate": 3e-06, + "loss": -4.3816, + "step": 735 + }, + { + "epoch": 0.06542222222222223, + "grad_norm": 111.13971710205078, + "learning_rate": 3e-06, + "loss": -4.5502, + "step": 736 + }, + { + "epoch": 0.06551111111111112, + "grad_norm": 98.50959014892578, + "learning_rate": 3e-06, + "loss": -5.0919, + "step": 737 + }, + { + "epoch": 0.0656, + "grad_norm": 82.98762512207031, + "learning_rate": 3e-06, + "loss": -7.9156, + "step": 738 + }, + { + "epoch": 0.0656888888888889, + "grad_norm": 114.31904602050781, + "learning_rate": 3e-06, + "loss": 4.2146, + "step": 739 + }, + { + "epoch": 0.06577777777777778, + "grad_norm": 109.786376953125, + "learning_rate": 3e-06, + "loss": -11.7776, + "step": 740 + }, + { + "epoch": 0.06586666666666667, + "grad_norm": 95.00721740722656, + "learning_rate": 3e-06, + "loss": -5.7331, + "step": 741 + }, + { + "epoch": 0.06595555555555556, + "grad_norm": 87.01516723632812, + "learning_rate": 3e-06, + "loss": -5.0034, + "step": 742 + }, + { + "epoch": 0.06604444444444445, + "grad_norm": 101.2181167602539, + "learning_rate": 3e-06, + "loss": -6.7069, + "step": 743 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 79.3046875, + "learning_rate": 3e-06, + "loss": -8.6759, + "step": 744 + }, + { + "completion_length": 252.93750762939453, + "epoch": 0.06622222222222222, + "grad_norm": 101.21440887451172, + "learning_rate": 3e-06, + "loss": 2.4358, + "reward": 1.354166716337204, + "reward_std": 0.4736091196537018, + "rewards/boxed_and_answer_tags_format_reward": 0.6458333432674408, + "rewards/correctness_reward_func_math": 0.7083333358168602, + "step": 745, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.06631111111111111, + "grad_norm": 133.58985900878906, + "learning_rate": 3e-06, + "loss": 0.2243, + "step": 746 + }, + { + "epoch": 0.0664, + "grad_norm": 137.97776794433594, + "learning_rate": 3e-06, + "loss": 3.7561, + "step": 747 + }, + { + "epoch": 0.06648888888888889, + "grad_norm": 97.12255859375, + "learning_rate": 3e-06, + "loss": 2.8029, + "step": 748 + }, + { + "epoch": 0.06657777777777778, + "grad_norm": 133.5359344482422, + "learning_rate": 3e-06, + "loss": 5.9185, + "step": 749 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 165.00294494628906, + "learning_rate": 3e-06, + "loss": 5.6118, + "step": 750 + }, + { + "epoch": 0.06675555555555555, + "grad_norm": 147.9979705810547, + "learning_rate": 3e-06, + "loss": 1.9697, + "step": 751 + }, + { + "epoch": 0.06684444444444444, + "grad_norm": 119.22462463378906, + "learning_rate": 3e-06, + "loss": -0.882, + "step": 752 + }, + { + "epoch": 0.06693333333333333, + "grad_norm": 144.49305725097656, + "learning_rate": 3e-06, + "loss": 2.487, + "step": 753 + }, + { + "epoch": 0.06702222222222222, + "grad_norm": 96.12986755371094, + "learning_rate": 3e-06, + "loss": 1.0482, + "step": 754 + }, + { + "epoch": 0.06711111111111111, + "grad_norm": 117.53173065185547, + "learning_rate": 3e-06, + "loss": 4.1922, + "step": 755 + }, + { + "epoch": 0.0672, + "grad_norm": 99.70003509521484, + "learning_rate": 3e-06, + "loss": 4.014, + "step": 756 + }, + { + "completion_length": 252.9166717529297, + "epoch": 0.06728888888888888, + "grad_norm": 107.6641616821289, + "learning_rate": 3e-06, + "loss": -6.2275, + "reward": 1.7812500596046448, + "reward_std": 0.38577648997306824, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 757, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.06737777777777777, + "grad_norm": 87.34847259521484, + "learning_rate": 3e-06, + "loss": -3.2976, + "step": 758 + }, + { + "epoch": 0.06746666666666666, + "grad_norm": 114.33875274658203, + "learning_rate": 3e-06, + "loss": -4.307, + "step": 759 + }, + { + "epoch": 0.06755555555555555, + "grad_norm": 106.53048706054688, + "learning_rate": 3e-06, + "loss": -1.13, + "step": 760 + }, + { + "epoch": 0.06764444444444444, + "grad_norm": 120.27633666992188, + "learning_rate": 3e-06, + "loss": -3.2187, + "step": 761 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 96.025390625, + "learning_rate": 3e-06, + "loss": 0.3329, + "step": 762 + }, + { + "epoch": 0.06782222222222223, + "grad_norm": 115.00994110107422, + "learning_rate": 3e-06, + "loss": -7.6432, + "step": 763 + }, + { + "epoch": 0.06791111111111112, + "grad_norm": 87.76792907714844, + "learning_rate": 3e-06, + "loss": -4.6143, + "step": 764 + }, + { + "epoch": 0.068, + "grad_norm": 113.94709777832031, + "learning_rate": 3e-06, + "loss": -5.7334, + "step": 765 + }, + { + "epoch": 0.0680888888888889, + "grad_norm": 111.59996032714844, + "learning_rate": 3e-06, + "loss": -2.659, + "step": 766 + }, + { + "epoch": 0.06817777777777778, + "grad_norm": 129.63861083984375, + "learning_rate": 3e-06, + "loss": -5.0582, + "step": 767 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 93.2347412109375, + "learning_rate": 3e-06, + "loss": -1.0268, + "step": 768 + }, + { + "completion_length": 251.52083587646484, + "epoch": 0.06835555555555556, + "grad_norm": 184.61167907714844, + "learning_rate": 3e-06, + "loss": 5.0221, + "reward": 1.4166666865348816, + "reward_std": 0.6262910515069962, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.6666666567325592, + "step": 769, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.06844444444444445, + "grad_norm": 137.45404052734375, + "learning_rate": 3e-06, + "loss": 7.8168, + "step": 770 + }, + { + "epoch": 0.06853333333333333, + "grad_norm": 202.94410705566406, + "learning_rate": 3e-06, + "loss": 9.1514, + "step": 771 + }, + { + "epoch": 0.06862222222222222, + "grad_norm": 161.6815948486328, + "learning_rate": 3e-06, + "loss": 8.282, + "step": 772 + }, + { + "epoch": 0.06871111111111111, + "grad_norm": 123.37694549560547, + "learning_rate": 3e-06, + "loss": 3.5652, + "step": 773 + }, + { + "epoch": 0.0688, + "grad_norm": 146.2295379638672, + "learning_rate": 3e-06, + "loss": 9.5204, + "step": 774 + }, + { + "epoch": 0.06888888888888889, + "grad_norm": 145.88613891601562, + "learning_rate": 3e-06, + "loss": 4.6675, + "step": 775 + }, + { + "epoch": 0.06897777777777778, + "grad_norm": 135.99313354492188, + "learning_rate": 3e-06, + "loss": 6.1798, + "step": 776 + }, + { + "epoch": 0.06906666666666667, + "grad_norm": 134.66729736328125, + "learning_rate": 3e-06, + "loss": 6.9994, + "step": 777 + }, + { + "epoch": 0.06915555555555555, + "grad_norm": 173.39735412597656, + "learning_rate": 3e-06, + "loss": 7.3314, + "step": 778 + }, + { + "epoch": 0.06924444444444444, + "grad_norm": 136.74331665039062, + "learning_rate": 3e-06, + "loss": 1.5412, + "step": 779 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 142.06529235839844, + "learning_rate": 3e-06, + "loss": 7.5, + "step": 780 + }, + { + "completion_length": 236.14583587646484, + "epoch": 0.06942222222222222, + "grad_norm": 92.94062805175781, + "learning_rate": 3e-06, + "loss": 9.4998, + "reward": 0.8854166865348816, + "reward_std": 0.3922351598739624, + "rewards/boxed_and_answer_tags_format_reward": 0.71875, + "rewards/correctness_reward_func_math": 0.1666666679084301, + "step": 781, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.0695111111111111, + "grad_norm": 84.2685546875, + "learning_rate": 3e-06, + "loss": 20.3539, + "step": 782 + }, + { + "epoch": 0.0696, + "grad_norm": 79.0595932006836, + "learning_rate": 3e-06, + "loss": 16.2649, + "step": 783 + }, + { + "epoch": 0.06968888888888888, + "grad_norm": 151.0120086669922, + "learning_rate": 3e-06, + "loss": 13.9466, + "step": 784 + }, + { + "epoch": 0.06977777777777777, + "grad_norm": 84.83601379394531, + "learning_rate": 3e-06, + "loss": 13.7201, + "step": 785 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 149.62045288085938, + "learning_rate": 3e-06, + "loss": 13.032, + "step": 786 + }, + { + "epoch": 0.06995555555555556, + "grad_norm": 95.56779479980469, + "learning_rate": 3e-06, + "loss": 8.1661, + "step": 787 + }, + { + "epoch": 0.07004444444444445, + "grad_norm": 89.89787292480469, + "learning_rate": 3e-06, + "loss": 19.6015, + "step": 788 + }, + { + "epoch": 0.07013333333333334, + "grad_norm": 78.87327575683594, + "learning_rate": 3e-06, + "loss": 15.0965, + "step": 789 + }, + { + "epoch": 0.07022222222222223, + "grad_norm": 88.94290161132812, + "learning_rate": 3e-06, + "loss": 12.3248, + "step": 790 + }, + { + "epoch": 0.07031111111111112, + "grad_norm": 84.62553405761719, + "learning_rate": 3e-06, + "loss": 12.571, + "step": 791 + }, + { + "epoch": 0.0704, + "grad_norm": 102.37629699707031, + "learning_rate": 3e-06, + "loss": 11.5955, + "step": 792 + }, + { + "completion_length": 245.95834350585938, + "epoch": 0.07048888888888889, + "grad_norm": 87.68032836914062, + "learning_rate": 3e-06, + "loss": -2.896, + "reward": 0.979166716337204, + "reward_std": 0.3332235962152481, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.416666679084301, + "step": 793, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.07057777777777778, + "grad_norm": 102.35977935791016, + "learning_rate": 3e-06, + "loss": -14.1397, + "step": 794 + }, + { + "epoch": 0.07066666666666667, + "grad_norm": 121.52274322509766, + "learning_rate": 3e-06, + "loss": -20.6379, + "step": 795 + }, + { + "epoch": 0.07075555555555556, + "grad_norm": 123.77271270751953, + "learning_rate": 3e-06, + "loss": -15.954, + "step": 796 + }, + { + "epoch": 0.07084444444444445, + "grad_norm": 115.74909210205078, + "learning_rate": 3e-06, + "loss": -19.257, + "step": 797 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 171.18011474609375, + "learning_rate": 3e-06, + "loss": -15.1334, + "step": 798 + }, + { + "epoch": 0.07102222222222222, + "grad_norm": 92.46512603759766, + "learning_rate": 3e-06, + "loss": -3.7242, + "step": 799 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 104.15264129638672, + "learning_rate": 3e-06, + "loss": -14.9942, + "step": 800 + }, + { + "epoch": 0.0712, + "grad_norm": 120.8930892944336, + "learning_rate": 3e-06, + "loss": -22.0188, + "step": 801 + }, + { + "epoch": 0.07128888888888889, + "grad_norm": 112.13275146484375, + "learning_rate": 3e-06, + "loss": -17.6709, + "step": 802 + }, + { + "epoch": 0.07137777777777778, + "grad_norm": 136.23388671875, + "learning_rate": 3e-06, + "loss": -20.6627, + "step": 803 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 124.1773681640625, + "learning_rate": 3e-06, + "loss": -17.0962, + "step": 804 + }, + { + "completion_length": 232.89583587646484, + "epoch": 0.07155555555555555, + "grad_norm": 91.88973236083984, + "learning_rate": 3e-06, + "loss": -7.8492, + "reward": 1.7291666865348816, + "reward_std": 0.47015421837568283, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 805, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.07164444444444444, + "grad_norm": 107.12287139892578, + "learning_rate": 3e-06, + "loss": -23.4332, + "step": 806 + }, + { + "epoch": 0.07173333333333333, + "grad_norm": 104.5981674194336, + "learning_rate": 3e-06, + "loss": -11.8278, + "step": 807 + }, + { + "epoch": 0.07182222222222222, + "grad_norm": 102.6692886352539, + "learning_rate": 3e-06, + "loss": -20.165, + "step": 808 + }, + { + "epoch": 0.0719111111111111, + "grad_norm": 88.46080017089844, + "learning_rate": 3e-06, + "loss": -10.8484, + "step": 809 + }, + { + "epoch": 0.072, + "grad_norm": 124.10685729980469, + "learning_rate": 3e-06, + "loss": -20.237, + "step": 810 + }, + { + "epoch": 0.07208888888888888, + "grad_norm": 95.68196868896484, + "learning_rate": 3e-06, + "loss": -9.2521, + "step": 811 + }, + { + "epoch": 0.07217777777777777, + "grad_norm": 95.62832641601562, + "learning_rate": 3e-06, + "loss": -25.6333, + "step": 812 + }, + { + "epoch": 0.07226666666666667, + "grad_norm": 104.2110366821289, + "learning_rate": 3e-06, + "loss": -14.2533, + "step": 813 + }, + { + "epoch": 0.07235555555555556, + "grad_norm": 114.39372253417969, + "learning_rate": 3e-06, + "loss": -22.2888, + "step": 814 + }, + { + "epoch": 0.07244444444444445, + "grad_norm": 112.07288360595703, + "learning_rate": 3e-06, + "loss": -12.7028, + "step": 815 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 138.86337280273438, + "learning_rate": 3e-06, + "loss": -23.0688, + "step": 816 + }, + { + "completion_length": 249.3541717529297, + "epoch": 0.07262222222222223, + "grad_norm": 106.32572937011719, + "learning_rate": 3e-06, + "loss": 1.5611, + "reward": 1.291666716337204, + "reward_std": 0.37455084919929504, + "rewards/boxed_and_answer_tags_format_reward": 0.6666666865348816, + "rewards/correctness_reward_func_math": 0.625, + "step": 817, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.07271111111111112, + "grad_norm": 119.1928482055664, + "learning_rate": 3e-06, + "loss": 6.5715, + "step": 818 + }, + { + "epoch": 0.0728, + "grad_norm": 89.53767395019531, + "learning_rate": 3e-06, + "loss": 9.9694, + "step": 819 + }, + { + "epoch": 0.07288888888888889, + "grad_norm": 97.2778091430664, + "learning_rate": 3e-06, + "loss": 8.1343, + "step": 820 + }, + { + "epoch": 0.07297777777777778, + "grad_norm": 99.85116577148438, + "learning_rate": 3e-06, + "loss": 3.7739, + "step": 821 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 107.6137924194336, + "learning_rate": 3e-06, + "loss": -0.6852, + "step": 822 + }, + { + "epoch": 0.07315555555555556, + "grad_norm": 113.21214294433594, + "learning_rate": 3e-06, + "loss": 1.024, + "step": 823 + }, + { + "epoch": 0.07324444444444445, + "grad_norm": 130.27040100097656, + "learning_rate": 3e-06, + "loss": 4.4904, + "step": 824 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 87.94723510742188, + "learning_rate": 3e-06, + "loss": 7.7117, + "step": 825 + }, + { + "epoch": 0.07342222222222222, + "grad_norm": 102.87310791015625, + "learning_rate": 3e-06, + "loss": 5.3932, + "step": 826 + }, + { + "epoch": 0.07351111111111111, + "grad_norm": 97.53043365478516, + "learning_rate": 3e-06, + "loss": 1.1637, + "step": 827 + }, + { + "epoch": 0.0736, + "grad_norm": 126.4931640625, + "learning_rate": 3e-06, + "loss": -3.7025, + "step": 828 + }, + { + "completion_length": 240.75, + "epoch": 0.07368888888888889, + "grad_norm": 88.22557067871094, + "learning_rate": 3e-06, + "loss": 22.5183, + "reward": 1.1354166865348816, + "reward_std": 0.35377833247184753, + "rewards/boxed_and_answer_tags_format_reward": 0.59375, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 829, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.07377777777777778, + "grad_norm": 86.47975158691406, + "learning_rate": 3e-06, + "loss": 24.5943, + "step": 830 + }, + { + "epoch": 0.07386666666666666, + "grad_norm": 93.10693359375, + "learning_rate": 3e-06, + "loss": 27.2039, + "step": 831 + }, + { + "epoch": 0.07395555555555555, + "grad_norm": 96.74742889404297, + "learning_rate": 3e-06, + "loss": 32.9558, + "step": 832 + }, + { + "epoch": 0.07404444444444444, + "grad_norm": 97.9085693359375, + "learning_rate": 3e-06, + "loss": 25.7304, + "step": 833 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 95.80497741699219, + "learning_rate": 3e-06, + "loss": 28.8338, + "step": 834 + }, + { + "epoch": 0.07422222222222222, + "grad_norm": 82.9104995727539, + "learning_rate": 3e-06, + "loss": 20.9569, + "step": 835 + }, + { + "epoch": 0.0743111111111111, + "grad_norm": 332.6025695800781, + "learning_rate": 3e-06, + "loss": 22.7693, + "step": 836 + }, + { + "epoch": 0.0744, + "grad_norm": 97.00851440429688, + "learning_rate": 3e-06, + "loss": 24.6048, + "step": 837 + }, + { + "epoch": 0.07448888888888888, + "grad_norm": 90.94817352294922, + "learning_rate": 3e-06, + "loss": 30.2657, + "step": 838 + }, + { + "epoch": 0.07457777777777778, + "grad_norm": 91.87737274169922, + "learning_rate": 3e-06, + "loss": 23.8124, + "step": 839 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 100.71826934814453, + "learning_rate": 3e-06, + "loss": 26.5346, + "step": 840 + }, + { + "completion_length": 244.3125, + "epoch": 0.07475555555555556, + "grad_norm": 454.38482666015625, + "learning_rate": 3e-06, + "loss": -16.9865, + "reward": 1.2812500596046448, + "reward_std": 0.3782803416252136, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.6666666492819786, + "step": 841, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.07484444444444445, + "grad_norm": 102.08020782470703, + "learning_rate": 3e-06, + "loss": -18.5601, + "step": 842 + }, + { + "epoch": 0.07493333333333334, + "grad_norm": 101.7997055053711, + "learning_rate": 3e-06, + "loss": -23.9473, + "step": 843 + }, + { + "epoch": 0.07502222222222223, + "grad_norm": 100.2668685913086, + "learning_rate": 3e-06, + "loss": -26.2402, + "step": 844 + }, + { + "epoch": 0.07511111111111111, + "grad_norm": 119.95198059082031, + "learning_rate": 3e-06, + "loss": -22.5011, + "step": 845 + }, + { + "epoch": 0.0752, + "grad_norm": 101.83861541748047, + "learning_rate": 3e-06, + "loss": -14.7265, + "step": 846 + }, + { + "epoch": 0.07528888888888889, + "grad_norm": 106.50312042236328, + "learning_rate": 3e-06, + "loss": -18.8046, + "step": 847 + }, + { + "epoch": 0.07537777777777778, + "grad_norm": 114.58135986328125, + "learning_rate": 3e-06, + "loss": -20.037, + "step": 848 + }, + { + "epoch": 0.07546666666666667, + "grad_norm": 121.03673553466797, + "learning_rate": 3e-06, + "loss": -25.817, + "step": 849 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 97.15817260742188, + "learning_rate": 3e-06, + "loss": -28.7689, + "step": 850 + }, + { + "epoch": 0.07564444444444444, + "grad_norm": 101.58541107177734, + "learning_rate": 3e-06, + "loss": -24.6109, + "step": 851 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 100.84017181396484, + "learning_rate": 3e-06, + "loss": -16.8195, + "step": 852 + }, + { + "completion_length": 247.4166717529297, + "epoch": 0.07582222222222222, + "grad_norm": 60.974788665771484, + "learning_rate": 3e-06, + "loss": 1.297, + "reward": 1.2708333432674408, + "reward_std": 0.12909945845603943, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5833333358168602, + "step": 853, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.07591111111111111, + "grad_norm": 58.498374938964844, + "learning_rate": 3e-06, + "loss": -4.5823, + "step": 854 + }, + { + "epoch": 0.076, + "grad_norm": 51.417320251464844, + "learning_rate": 3e-06, + "loss": -2.8582, + "step": 855 + }, + { + "epoch": 0.07608888888888889, + "grad_norm": 52.74491882324219, + "learning_rate": 3e-06, + "loss": -3.5933, + "step": 856 + }, + { + "epoch": 0.07617777777777777, + "grad_norm": 54.02571487426758, + "learning_rate": 3e-06, + "loss": -0.1777, + "step": 857 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 44.29707717895508, + "learning_rate": 3e-06, + "loss": -0.0581, + "step": 858 + }, + { + "epoch": 0.07635555555555555, + "grad_norm": 63.4464111328125, + "learning_rate": 3e-06, + "loss": 0.981, + "step": 859 + }, + { + "epoch": 0.07644444444444444, + "grad_norm": 58.98569869995117, + "learning_rate": 3e-06, + "loss": -5.3316, + "step": 860 + }, + { + "epoch": 0.07653333333333333, + "grad_norm": 54.73743438720703, + "learning_rate": 3e-06, + "loss": -3.799, + "step": 861 + }, + { + "epoch": 0.07662222222222222, + "grad_norm": 55.15388107299805, + "learning_rate": 3e-06, + "loss": -4.4757, + "step": 862 + }, + { + "epoch": 0.0767111111111111, + "grad_norm": 61.510887145996094, + "learning_rate": 3e-06, + "loss": -0.6125, + "step": 863 + }, + { + "epoch": 0.0768, + "grad_norm": 46.19833755493164, + "learning_rate": 3e-06, + "loss": -0.7855, + "step": 864 + }, + { + "completion_length": 251.3125, + "epoch": 0.0768888888888889, + "grad_norm": 111.38273620605469, + "learning_rate": 3e-06, + "loss": 14.4732, + "reward": 1.5208333730697632, + "reward_std": 0.4937378317117691, + "rewards/boxed_and_answer_tags_format_reward": 0.7291666567325592, + "rewards/correctness_reward_func_math": 0.7916666567325592, + "step": 865, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.07697777777777778, + "grad_norm": 112.39920043945312, + "learning_rate": 3e-06, + "loss": 3.7967, + "step": 866 + }, + { + "epoch": 0.07706666666666667, + "grad_norm": 106.71125793457031, + "learning_rate": 3e-06, + "loss": 1.1063, + "step": 867 + }, + { + "epoch": 0.07715555555555556, + "grad_norm": 129.4515838623047, + "learning_rate": 3e-06, + "loss": -2.7262, + "step": 868 + }, + { + "epoch": 0.07724444444444445, + "grad_norm": 109.67815399169922, + "learning_rate": 3e-06, + "loss": 0.1256, + "step": 869 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 116.57035827636719, + "learning_rate": 3e-06, + "loss": -2.946, + "step": 870 + }, + { + "epoch": 0.07742222222222223, + "grad_norm": 122.09374237060547, + "learning_rate": 3e-06, + "loss": 13.0142, + "step": 871 + }, + { + "epoch": 0.07751111111111111, + "grad_norm": 108.0525894165039, + "learning_rate": 3e-06, + "loss": 2.4968, + "step": 872 + }, + { + "epoch": 0.0776, + "grad_norm": 108.2818374633789, + "learning_rate": 3e-06, + "loss": -0.8286, + "step": 873 + }, + { + "epoch": 0.07768888888888889, + "grad_norm": 139.2396697998047, + "learning_rate": 3e-06, + "loss": -5.0471, + "step": 874 + }, + { + "epoch": 0.07777777777777778, + "grad_norm": 114.9443588256836, + "learning_rate": 3e-06, + "loss": -1.3382, + "step": 875 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 128.51272583007812, + "learning_rate": 3e-06, + "loss": -4.8307, + "step": 876 + }, + { + "completion_length": 252.50000762939453, + "epoch": 0.07795555555555556, + "grad_norm": 126.4562759399414, + "learning_rate": 3e-06, + "loss": 0.751, + "reward": 1.0729166865348816, + "reward_std": 0.22548970580101013, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.4583333283662796, + "step": 877, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.07804444444444444, + "grad_norm": 82.09607696533203, + "learning_rate": 3e-06, + "loss": 0.195, + "step": 878 + }, + { + "epoch": 0.07813333333333333, + "grad_norm": 74.75113677978516, + "learning_rate": 3e-06, + "loss": -1.5817, + "step": 879 + }, + { + "epoch": 0.07822222222222222, + "grad_norm": 115.73063659667969, + "learning_rate": 3e-06, + "loss": -8.3706, + "step": 880 + }, + { + "epoch": 0.07831111111111111, + "grad_norm": 70.39916229248047, + "learning_rate": 3e-06, + "loss": -3.2191, + "step": 881 + }, + { + "epoch": 0.0784, + "grad_norm": 103.28494262695312, + "learning_rate": 3e-06, + "loss": 1.0954, + "step": 882 + }, + { + "epoch": 0.07848888888888889, + "grad_norm": 100.23104858398438, + "learning_rate": 3e-06, + "loss": 0.0428, + "step": 883 + }, + { + "epoch": 0.07857777777777777, + "grad_norm": 90.33434295654297, + "learning_rate": 3e-06, + "loss": -0.8422, + "step": 884 + }, + { + "epoch": 0.07866666666666666, + "grad_norm": 99.41636657714844, + "learning_rate": 3e-06, + "loss": -2.558, + "step": 885 + }, + { + "epoch": 0.07875555555555555, + "grad_norm": 99.72211456298828, + "learning_rate": 3e-06, + "loss": -8.9057, + "step": 886 + }, + { + "epoch": 0.07884444444444444, + "grad_norm": 73.4344711303711, + "learning_rate": 3e-06, + "loss": -4.09, + "step": 887 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 98.25971221923828, + "learning_rate": 3e-06, + "loss": -0.4152, + "step": 888 + }, + { + "completion_length": 249.2916717529297, + "epoch": 0.07902222222222222, + "grad_norm": 165.06918334960938, + "learning_rate": 3e-06, + "loss": 25.8684, + "reward": 1.4270833730697632, + "reward_std": 0.5354157984256744, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.75, + "step": 889, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.0791111111111111, + "grad_norm": 132.98858642578125, + "learning_rate": 3e-06, + "loss": 19.4902, + "step": 890 + }, + { + "epoch": 0.0792, + "grad_norm": 122.3958969116211, + "learning_rate": 3e-06, + "loss": 32.6103, + "step": 891 + }, + { + "epoch": 0.0792888888888889, + "grad_norm": 118.38285064697266, + "learning_rate": 3e-06, + "loss": 32.6835, + "step": 892 + }, + { + "epoch": 0.07937777777777778, + "grad_norm": 126.4738540649414, + "learning_rate": 3e-06, + "loss": 30.5181, + "step": 893 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 128.20831298828125, + "learning_rate": 3e-06, + "loss": 33.4332, + "step": 894 + }, + { + "epoch": 0.07955555555555556, + "grad_norm": 152.9354248046875, + "learning_rate": 3e-06, + "loss": 23.7035, + "step": 895 + }, + { + "epoch": 0.07964444444444445, + "grad_norm": 130.27053833007812, + "learning_rate": 3e-06, + "loss": 16.7548, + "step": 896 + }, + { + "epoch": 0.07973333333333334, + "grad_norm": 127.17219543457031, + "learning_rate": 3e-06, + "loss": 30.7848, + "step": 897 + }, + { + "epoch": 0.07982222222222222, + "grad_norm": 118.670654296875, + "learning_rate": 3e-06, + "loss": 30.7772, + "step": 898 + }, + { + "epoch": 0.07991111111111111, + "grad_norm": 120.19160461425781, + "learning_rate": 3e-06, + "loss": 27.401, + "step": 899 + }, + { + "epoch": 0.08, + "grad_norm": 137.2371063232422, + "learning_rate": 3e-06, + "loss": 30.9334, + "step": 900 + }, + { + "completion_length": 255.89583587646484, + "epoch": 0.08008888888888889, + "grad_norm": 88.54483032226562, + "learning_rate": 3e-06, + "loss": 3.6467, + "reward": 1.3437500596046448, + "reward_std": 0.30770808458328247, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.6666666567325592, + "step": 901, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.08017777777777778, + "grad_norm": 87.48735046386719, + "learning_rate": 3e-06, + "loss": 1.8615, + "step": 902 + }, + { + "epoch": 0.08026666666666667, + "grad_norm": 86.97764587402344, + "learning_rate": 3e-06, + "loss": 2.8386, + "step": 903 + }, + { + "epoch": 0.08035555555555556, + "grad_norm": 105.64205932617188, + "learning_rate": 3e-06, + "loss": -3.9129, + "step": 904 + }, + { + "epoch": 0.08044444444444444, + "grad_norm": 80.34683227539062, + "learning_rate": 3e-06, + "loss": 1.4293, + "step": 905 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 91.16341400146484, + "learning_rate": 3e-06, + "loss": 2.803, + "step": 906 + }, + { + "epoch": 0.08062222222222222, + "grad_norm": 96.49407196044922, + "learning_rate": 3e-06, + "loss": 2.4431, + "step": 907 + }, + { + "epoch": 0.08071111111111111, + "grad_norm": 84.40055084228516, + "learning_rate": 3e-06, + "loss": 0.5478, + "step": 908 + }, + { + "epoch": 0.0808, + "grad_norm": 78.79622650146484, + "learning_rate": 3e-06, + "loss": 1.4422, + "step": 909 + }, + { + "epoch": 0.08088888888888889, + "grad_norm": 128.47531127929688, + "learning_rate": 3e-06, + "loss": -5.1302, + "step": 910 + }, + { + "epoch": 0.08097777777777777, + "grad_norm": 79.19956970214844, + "learning_rate": 3e-06, + "loss": -0.2354, + "step": 911 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 107.92975616455078, + "learning_rate": 3e-06, + "loss": 1.3494, + "step": 912 + }, + { + "completion_length": 236.7916717529297, + "epoch": 0.08115555555555555, + "grad_norm": 72.56483459472656, + "learning_rate": 3e-06, + "loss": -4.7678, + "reward": 1.0, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.375, + "step": 913, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.08124444444444444, + "grad_norm": 84.60346984863281, + "learning_rate": 3e-06, + "loss": -1.6215, + "step": 914 + }, + { + "epoch": 0.08133333333333333, + "grad_norm": 80.78106689453125, + "learning_rate": 3e-06, + "loss": 3.2587, + "step": 915 + }, + { + "epoch": 0.08142222222222223, + "grad_norm": 71.9332275390625, + "learning_rate": 3e-06, + "loss": -4.7685, + "step": 916 + }, + { + "epoch": 0.08151111111111112, + "grad_norm": 98.66748046875, + "learning_rate": 3e-06, + "loss": -6.589, + "step": 917 + }, + { + "epoch": 0.0816, + "grad_norm": 135.69175720214844, + "learning_rate": 3e-06, + "loss": -7.5017, + "step": 918 + }, + { + "epoch": 0.0816888888888889, + "grad_norm": 73.60437774658203, + "learning_rate": 3e-06, + "loss": -5.1495, + "step": 919 + }, + { + "epoch": 0.08177777777777778, + "grad_norm": 90.75928497314453, + "learning_rate": 3e-06, + "loss": -2.384, + "step": 920 + }, + { + "epoch": 0.08186666666666667, + "grad_norm": 82.05548095703125, + "learning_rate": 3e-06, + "loss": 2.8112, + "step": 921 + }, + { + "epoch": 0.08195555555555556, + "grad_norm": 77.72819519042969, + "learning_rate": 3e-06, + "loss": -5.4085, + "step": 922 + }, + { + "epoch": 0.08204444444444445, + "grad_norm": 100.81270599365234, + "learning_rate": 3e-06, + "loss": -7.311, + "step": 923 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 86.93998718261719, + "learning_rate": 3e-06, + "loss": -7.627, + "step": 924 + }, + { + "completion_length": 244.2916717529297, + "epoch": 0.08222222222222222, + "grad_norm": 124.40040588378906, + "learning_rate": 3e-06, + "loss": 2.5607, + "reward": 1.7708333730697632, + "reward_std": 0.5440726578235626, + "rewards/boxed_and_answer_tags_format_reward": 0.7291666567325592, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 925, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.08231111111111111, + "grad_norm": 100.88472747802734, + "learning_rate": 3e-06, + "loss": -11.1251, + "step": 926 + }, + { + "epoch": 0.0824, + "grad_norm": 139.0868377685547, + "learning_rate": 3e-06, + "loss": -4.6789, + "step": 927 + }, + { + "epoch": 0.08248888888888889, + "grad_norm": 105.37358093261719, + "learning_rate": 3e-06, + "loss": -1.7812, + "step": 928 + }, + { + "epoch": 0.08257777777777778, + "grad_norm": 122.72453308105469, + "learning_rate": 3e-06, + "loss": 5.9917, + "step": 929 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 125.92015075683594, + "learning_rate": 3e-06, + "loss": -2.2776, + "step": 930 + }, + { + "epoch": 0.08275555555555555, + "grad_norm": 122.41661834716797, + "learning_rate": 3e-06, + "loss": 0.8417, + "step": 931 + }, + { + "epoch": 0.08284444444444444, + "grad_norm": 117.53387451171875, + "learning_rate": 3e-06, + "loss": -13.1484, + "step": 932 + }, + { + "epoch": 0.08293333333333333, + "grad_norm": 126.8305892944336, + "learning_rate": 3e-06, + "loss": -7.6824, + "step": 933 + }, + { + "epoch": 0.08302222222222222, + "grad_norm": 111.0191421508789, + "learning_rate": 3e-06, + "loss": -3.9524, + "step": 934 + }, + { + "epoch": 0.08311111111111111, + "grad_norm": 131.84397888183594, + "learning_rate": 3e-06, + "loss": 2.9848, + "step": 935 + }, + { + "epoch": 0.0832, + "grad_norm": 124.30160522460938, + "learning_rate": 3e-06, + "loss": -5.5558, + "step": 936 + }, + { + "completion_length": 240.9791717529297, + "epoch": 0.08328888888888888, + "grad_norm": 137.87579345703125, + "learning_rate": 3e-06, + "loss": -18.5164, + "reward": 1.1354166865348816, + "reward_std": 0.4608011841773987, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.4583333432674408, + "step": 937, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.08337777777777777, + "grad_norm": 132.598388671875, + "learning_rate": 3e-06, + "loss": -26.5032, + "step": 938 + }, + { + "epoch": 0.08346666666666666, + "grad_norm": 147.24671936035156, + "learning_rate": 3e-06, + "loss": -20.2196, + "step": 939 + }, + { + "epoch": 0.08355555555555555, + "grad_norm": 140.57591247558594, + "learning_rate": 3e-06, + "loss": -19.0462, + "step": 940 + }, + { + "epoch": 0.08364444444444444, + "grad_norm": 124.26339721679688, + "learning_rate": 3e-06, + "loss": -22.6895, + "step": 941 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 126.45221710205078, + "learning_rate": 3e-06, + "loss": -24.6262, + "step": 942 + }, + { + "epoch": 0.08382222222222223, + "grad_norm": 135.6764373779297, + "learning_rate": 3e-06, + "loss": -20.8866, + "step": 943 + }, + { + "epoch": 0.08391111111111112, + "grad_norm": 139.4601287841797, + "learning_rate": 3e-06, + "loss": -28.928, + "step": 944 + }, + { + "epoch": 0.084, + "grad_norm": 173.5882568359375, + "learning_rate": 3e-06, + "loss": -23.599, + "step": 945 + }, + { + "epoch": 0.0840888888888889, + "grad_norm": 131.79933166503906, + "learning_rate": 3e-06, + "loss": -22.4616, + "step": 946 + }, + { + "epoch": 0.08417777777777778, + "grad_norm": 128.2574920654297, + "learning_rate": 3e-06, + "loss": -26.6084, + "step": 947 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 133.56024169921875, + "learning_rate": 3e-06, + "loss": -29.0544, + "step": 948 + }, + { + "completion_length": 254.3541717529297, + "epoch": 0.08435555555555556, + "grad_norm": 113.98381805419922, + "learning_rate": 3e-06, + "loss": 2.0708, + "reward": 1.291666716337204, + "reward_std": 0.4701542258262634, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.541666679084301, + "step": 949, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.08444444444444445, + "grad_norm": 158.377197265625, + "learning_rate": 3e-06, + "loss": 0.0861, + "step": 950 + }, + { + "epoch": 0.08453333333333334, + "grad_norm": 152.6723175048828, + "learning_rate": 3e-06, + "loss": 8.1982, + "step": 951 + }, + { + "epoch": 0.08462222222222222, + "grad_norm": 122.41393280029297, + "learning_rate": 3e-06, + "loss": -2.2863, + "step": 952 + }, + { + "epoch": 0.08471111111111111, + "grad_norm": 137.0810089111328, + "learning_rate": 3e-06, + "loss": -1.879, + "step": 953 + }, + { + "epoch": 0.0848, + "grad_norm": 122.0219497680664, + "learning_rate": 3e-06, + "loss": -4.6058, + "step": 954 + }, + { + "epoch": 0.08488888888888889, + "grad_norm": 122.55841064453125, + "learning_rate": 3e-06, + "loss": 0.6988, + "step": 955 + }, + { + "epoch": 0.08497777777777778, + "grad_norm": 144.296875, + "learning_rate": 3e-06, + "loss": -2.2706, + "step": 956 + }, + { + "epoch": 0.08506666666666667, + "grad_norm": 130.82684326171875, + "learning_rate": 3e-06, + "loss": 6.122, + "step": 957 + }, + { + "epoch": 0.08515555555555555, + "grad_norm": 121.61994934082031, + "learning_rate": 3e-06, + "loss": -4.2647, + "step": 958 + }, + { + "epoch": 0.08524444444444444, + "grad_norm": 124.73589324951172, + "learning_rate": 3e-06, + "loss": -4.8135, + "step": 959 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 109.87874603271484, + "learning_rate": 3e-06, + "loss": -7.1554, + "step": 960 + }, + { + "completion_length": 244.50000762939453, + "epoch": 0.08542222222222222, + "grad_norm": 74.7403793334961, + "learning_rate": 3e-06, + "loss": 9.6961, + "reward": 2.1666667461395264, + "reward_std": 0.20412414520978928, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.4166666269302368, + "step": 961, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.08551111111111111, + "grad_norm": 68.04969024658203, + "learning_rate": 3e-06, + "loss": 12.6362, + "step": 962 + }, + { + "epoch": 0.0856, + "grad_norm": 69.76187133789062, + "learning_rate": 3e-06, + "loss": 13.3129, + "step": 963 + }, + { + "epoch": 0.08568888888888888, + "grad_norm": 81.5944595336914, + "learning_rate": 3e-06, + "loss": 9.7628, + "step": 964 + }, + { + "epoch": 0.08577777777777777, + "grad_norm": 83.20171356201172, + "learning_rate": 3e-06, + "loss": 8.2983, + "step": 965 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 78.74623107910156, + "learning_rate": 3e-06, + "loss": 3.5912, + "step": 966 + }, + { + "epoch": 0.08595555555555555, + "grad_norm": 65.22360229492188, + "learning_rate": 3e-06, + "loss": 8.6179, + "step": 967 + }, + { + "epoch": 0.08604444444444445, + "grad_norm": 67.84490966796875, + "learning_rate": 3e-06, + "loss": 10.9563, + "step": 968 + }, + { + "epoch": 0.08613333333333334, + "grad_norm": 66.93883514404297, + "learning_rate": 3e-06, + "loss": 11.5826, + "step": 969 + }, + { + "epoch": 0.08622222222222223, + "grad_norm": 75.27574157714844, + "learning_rate": 3e-06, + "loss": 8.5011, + "step": 970 + }, + { + "epoch": 0.08631111111111112, + "grad_norm": 68.24022674560547, + "learning_rate": 3e-06, + "loss": 6.8511, + "step": 971 + }, + { + "epoch": 0.0864, + "grad_norm": 70.8918685913086, + "learning_rate": 3e-06, + "loss": 2.4563, + "step": 972 + }, + { + "completion_length": 251.0416717529297, + "epoch": 0.08648888888888889, + "grad_norm": 130.6646270751953, + "learning_rate": 3e-06, + "loss": -5.8085, + "reward": 1.4479166865348816, + "reward_std": 0.5305383503437042, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.708333358168602, + "step": 973, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.08657777777777778, + "grad_norm": 134.2749786376953, + "learning_rate": 3e-06, + "loss": 1.4252, + "step": 974 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 137.93409729003906, + "learning_rate": 3e-06, + "loss": -1.3707, + "step": 975 + }, + { + "epoch": 0.08675555555555556, + "grad_norm": 121.03262329101562, + "learning_rate": 3e-06, + "loss": 1.4106, + "step": 976 + }, + { + "epoch": 0.08684444444444445, + "grad_norm": 132.82774353027344, + "learning_rate": 3e-06, + "loss": -4.6112, + "step": 977 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 163.79421997070312, + "learning_rate": 3e-06, + "loss": -9.213, + "step": 978 + }, + { + "epoch": 0.08702222222222222, + "grad_norm": 135.431396484375, + "learning_rate": 3e-06, + "loss": -9.618, + "step": 979 + }, + { + "epoch": 0.08711111111111111, + "grad_norm": 130.899658203125, + "learning_rate": 3e-06, + "loss": -2.0931, + "step": 980 + }, + { + "epoch": 0.0872, + "grad_norm": 133.3518524169922, + "learning_rate": 3e-06, + "loss": -5.8637, + "step": 981 + }, + { + "epoch": 0.08728888888888889, + "grad_norm": 144.94068908691406, + "learning_rate": 3e-06, + "loss": -2.6211, + "step": 982 + }, + { + "epoch": 0.08737777777777778, + "grad_norm": 141.72738647460938, + "learning_rate": 3e-06, + "loss": -9.573, + "step": 983 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 148.67123413085938, + "learning_rate": 3e-06, + "loss": -12.8677, + "step": 984 + }, + { + "completion_length": 255.89583587646484, + "epoch": 0.08755555555555555, + "grad_norm": 148.83518981933594, + "learning_rate": 3e-06, + "loss": -2.2086, + "reward": 1.3958333432674408, + "reward_std": 0.48216672986745834, + "rewards/boxed_and_answer_tags_format_reward": 0.7291666865348816, + "rewards/correctness_reward_func_math": 0.6666666865348816, + "step": 985, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.08764444444444444, + "grad_norm": 141.38470458984375, + "learning_rate": 3e-06, + "loss": -5.2298, + "step": 986 + }, + { + "epoch": 0.08773333333333333, + "grad_norm": 114.79426574707031, + "learning_rate": 3e-06, + "loss": 0.6894, + "step": 987 + }, + { + "epoch": 0.08782222222222222, + "grad_norm": 137.67416381835938, + "learning_rate": 3e-06, + "loss": 0.8801, + "step": 988 + }, + { + "epoch": 0.0879111111111111, + "grad_norm": 138.06517028808594, + "learning_rate": 3e-06, + "loss": -1.6148, + "step": 989 + }, + { + "epoch": 0.088, + "grad_norm": 153.39608764648438, + "learning_rate": 3e-06, + "loss": 0.5819, + "step": 990 + }, + { + "epoch": 0.08808888888888888, + "grad_norm": 159.18431091308594, + "learning_rate": 3e-06, + "loss": -3.8668, + "step": 991 + }, + { + "epoch": 0.08817777777777777, + "grad_norm": 141.29696655273438, + "learning_rate": 3e-06, + "loss": -7.4773, + "step": 992 + }, + { + "epoch": 0.08826666666666666, + "grad_norm": 129.83058166503906, + "learning_rate": 3e-06, + "loss": -1.6063, + "step": 993 + }, + { + "epoch": 0.08835555555555556, + "grad_norm": 133.49261474609375, + "learning_rate": 3e-06, + "loss": -0.5547, + "step": 994 + }, + { + "epoch": 0.08844444444444445, + "grad_norm": 125.11674499511719, + "learning_rate": 3e-06, + "loss": -3.5547, + "step": 995 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 113.9607925415039, + "learning_rate": 3e-06, + "loss": -1.1078, + "step": 996 + }, + { + "completion_length": 254.1041717529297, + "epoch": 0.08862222222222223, + "grad_norm": 112.0015869140625, + "learning_rate": 3e-06, + "loss": 0.6874, + "reward": 1.6666666865348816, + "reward_std": 0.3557328134775162, + "rewards/boxed_and_answer_tags_format_reward": 0.7083333432674408, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 997, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.08871111111111112, + "grad_norm": 117.1604232788086, + "learning_rate": 3e-06, + "loss": 10.6509, + "step": 998 + }, + { + "epoch": 0.0888, + "grad_norm": 97.7497787475586, + "learning_rate": 3e-06, + "loss": 1.431, + "step": 999 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 99.68553161621094, + "learning_rate": 3e-06, + "loss": 9.1808, + "step": 1000 + }, + { + "epoch": 0.08897777777777778, + "grad_norm": 127.76787567138672, + "learning_rate": 3e-06, + "loss": 7.5762, + "step": 1001 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 93.00830841064453, + "learning_rate": 3e-06, + "loss": 5.1825, + "step": 1002 + }, + { + "epoch": 0.08915555555555556, + "grad_norm": 100.87223815917969, + "learning_rate": 3e-06, + "loss": -0.2862, + "step": 1003 + }, + { + "epoch": 0.08924444444444445, + "grad_norm": 125.37133026123047, + "learning_rate": 3e-06, + "loss": 9.2038, + "step": 1004 + }, + { + "epoch": 0.08933333333333333, + "grad_norm": 107.84559631347656, + "learning_rate": 3e-06, + "loss": -0.0948, + "step": 1005 + }, + { + "epoch": 0.08942222222222222, + "grad_norm": 126.59029388427734, + "learning_rate": 3e-06, + "loss": 7.2332, + "step": 1006 + }, + { + "epoch": 0.08951111111111111, + "grad_norm": 120.74652099609375, + "learning_rate": 3e-06, + "loss": 6.3094, + "step": 1007 + }, + { + "epoch": 0.0896, + "grad_norm": 94.37996673583984, + "learning_rate": 3e-06, + "loss": 3.469, + "step": 1008 + }, + { + "completion_length": 242.25000762939453, + "epoch": 0.08968888888888889, + "grad_norm": 61.877044677734375, + "learning_rate": 3e-06, + "loss": 0.3003, + "reward": 1.0104166865348816, + "reward_std": 0.11467799544334412, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.3333333358168602, + "step": 1009, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.08977777777777778, + "grad_norm": 48.69044494628906, + "learning_rate": 3e-06, + "loss": -1.93, + "step": 1010 + }, + { + "epoch": 0.08986666666666666, + "grad_norm": 62.56666946411133, + "learning_rate": 3e-06, + "loss": 0.7216, + "step": 1011 + }, + { + "epoch": 0.08995555555555555, + "grad_norm": 54.1674690246582, + "learning_rate": 3e-06, + "loss": -1.7886, + "step": 1012 + }, + { + "epoch": 0.09004444444444444, + "grad_norm": 52.60224533081055, + "learning_rate": 3e-06, + "loss": 0.5879, + "step": 1013 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 45.58321762084961, + "learning_rate": 3e-06, + "loss": 0.5204, + "step": 1014 + }, + { + "epoch": 0.09022222222222222, + "grad_norm": 57.8793830871582, + "learning_rate": 3e-06, + "loss": -0.4941, + "step": 1015 + }, + { + "epoch": 0.0903111111111111, + "grad_norm": 51.80791091918945, + "learning_rate": 3e-06, + "loss": -2.7426, + "step": 1016 + }, + { + "epoch": 0.0904, + "grad_norm": 56.86159896850586, + "learning_rate": 3e-06, + "loss": -0.3923, + "step": 1017 + }, + { + "epoch": 0.09048888888888888, + "grad_norm": 48.4435920715332, + "learning_rate": 3e-06, + "loss": -3.0717, + "step": 1018 + }, + { + "epoch": 0.09057777777777777, + "grad_norm": 52.369598388671875, + "learning_rate": 3e-06, + "loss": -0.6683, + "step": 1019 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 45.13036346435547, + "learning_rate": 3e-06, + "loss": -0.8826, + "step": 1020 + }, + { + "completion_length": 251.9166717529297, + "epoch": 0.09075555555555556, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.75, + "step": 1021, + "zero_std_ratio": 1.0 + }, + { + "epoch": 0.09084444444444445, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1022 + }, + { + "epoch": 0.09093333333333334, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1023 + }, + { + "epoch": 0.09102222222222223, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1024 + }, + { + "epoch": 0.09111111111111111, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1025 + }, + { + "epoch": 0.0912, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1026 + }, + { + "epoch": 0.09128888888888889, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1027 + }, + { + "epoch": 0.09137777777777778, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1028 + }, + { + "epoch": 0.09146666666666667, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1029 + }, + { + "epoch": 0.09155555555555556, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1030 + }, + { + "epoch": 0.09164444444444445, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1031 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.0, + "learning_rate": 3e-06, + "loss": 0.0, + "step": 1032 + }, + { + "completion_length": 252.3541717529297, + "epoch": 0.09182222222222222, + "grad_norm": 126.83654022216797, + "learning_rate": 3e-06, + "loss": -8.3335, + "reward": 1.625, + "reward_std": 0.3410547971725464, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.875, + "step": 1033, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.09191111111111111, + "grad_norm": 102.25955963134766, + "learning_rate": 3e-06, + "loss": -6.4906, + "step": 1034 + }, + { + "epoch": 0.092, + "grad_norm": 128.81582641601562, + "learning_rate": 3e-06, + "loss": -6.558, + "step": 1035 + }, + { + "epoch": 0.09208888888888889, + "grad_norm": 112.67058563232422, + "learning_rate": 3e-06, + "loss": -9.3061, + "step": 1036 + }, + { + "epoch": 0.09217777777777778, + "grad_norm": 108.50650024414062, + "learning_rate": 3e-06, + "loss": -6.0381, + "step": 1037 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 108.7009506225586, + "learning_rate": 3e-06, + "loss": 0.0853, + "step": 1038 + }, + { + "epoch": 0.09235555555555555, + "grad_norm": 114.24646759033203, + "learning_rate": 3e-06, + "loss": -9.0476, + "step": 1039 + }, + { + "epoch": 0.09244444444444444, + "grad_norm": 99.69547271728516, + "learning_rate": 3e-06, + "loss": -8.1766, + "step": 1040 + }, + { + "epoch": 0.09253333333333333, + "grad_norm": 142.9137725830078, + "learning_rate": 3e-06, + "loss": -7.6658, + "step": 1041 + }, + { + "epoch": 0.09262222222222222, + "grad_norm": 113.05297088623047, + "learning_rate": 3e-06, + "loss": -10.693, + "step": 1042 + }, + { + "epoch": 0.0927111111111111, + "grad_norm": 114.48544311523438, + "learning_rate": 3e-06, + "loss": -8.4665, + "step": 1043 + }, + { + "epoch": 0.0928, + "grad_norm": 131.56544494628906, + "learning_rate": 3e-06, + "loss": -1.0749, + "step": 1044 + }, + { + "completion_length": 251.3125, + "epoch": 0.09288888888888888, + "grad_norm": 162.5680389404297, + "learning_rate": 3e-06, + "loss": 2.2852, + "reward": 1.4479166865348816, + "reward_std": 0.5932036638259888, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.708333358168602, + "step": 1045, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.09297777777777778, + "grad_norm": 183.73635864257812, + "learning_rate": 3e-06, + "loss": 4.3862, + "step": 1046 + }, + { + "epoch": 0.09306666666666667, + "grad_norm": 149.2581329345703, + "learning_rate": 3e-06, + "loss": 3.2506, + "step": 1047 + }, + { + "epoch": 0.09315555555555556, + "grad_norm": 146.53892517089844, + "learning_rate": 3e-06, + "loss": 6.7977, + "step": 1048 + }, + { + "epoch": 0.09324444444444445, + "grad_norm": 187.605224609375, + "learning_rate": 3e-06, + "loss": -4.9329, + "step": 1049 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 135.69638061523438, + "learning_rate": 3e-06, + "loss": 1.364, + "step": 1050 + }, + { + "epoch": 0.09342222222222223, + "grad_norm": 155.4630126953125, + "learning_rate": 3e-06, + "loss": -0.1606, + "step": 1051 + }, + { + "epoch": 0.09351111111111111, + "grad_norm": 137.24606323242188, + "learning_rate": 3e-06, + "loss": 2.4667, + "step": 1052 + }, + { + "epoch": 0.0936, + "grad_norm": 145.3031768798828, + "learning_rate": 3e-06, + "loss": 0.3328, + "step": 1053 + }, + { + "epoch": 0.09368888888888889, + "grad_norm": 138.85008239746094, + "learning_rate": 3e-06, + "loss": 3.6961, + "step": 1054 + }, + { + "epoch": 0.09377777777777778, + "grad_norm": 151.01002502441406, + "learning_rate": 3e-06, + "loss": -8.223, + "step": 1055 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 130.21697998046875, + "learning_rate": 3e-06, + "loss": -1.2589, + "step": 1056 + }, + { + "completion_length": 255.4791717529297, + "epoch": 0.09395555555555556, + "grad_norm": 129.92930603027344, + "learning_rate": 3e-06, + "loss": -4.7404, + "reward": 1.0, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.375, + "step": 1057, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.09404444444444444, + "grad_norm": 134.7594451904297, + "learning_rate": 3e-06, + "loss": -7.4851, + "step": 1058 + }, + { + "epoch": 0.09413333333333333, + "grad_norm": 145.62059020996094, + "learning_rate": 3e-06, + "loss": -11.0884, + "step": 1059 + }, + { + "epoch": 0.09422222222222222, + "grad_norm": 116.93612670898438, + "learning_rate": 3e-06, + "loss": 6.683, + "step": 1060 + }, + { + "epoch": 0.09431111111111111, + "grad_norm": 126.82006072998047, + "learning_rate": 3e-06, + "loss": -7.7905, + "step": 1061 + }, + { + "epoch": 0.0944, + "grad_norm": 252.49966430664062, + "learning_rate": 3e-06, + "loss": 4.6391, + "step": 1062 + }, + { + "epoch": 0.09448888888888889, + "grad_norm": 129.0404052734375, + "learning_rate": 3e-06, + "loss": -6.2658, + "step": 1063 + }, + { + "epoch": 0.09457777777777777, + "grad_norm": 140.08370971679688, + "learning_rate": 3e-06, + "loss": -8.4286, + "step": 1064 + }, + { + "epoch": 0.09466666666666666, + "grad_norm": 114.8161392211914, + "learning_rate": 3e-06, + "loss": -12.9169, + "step": 1065 + }, + { + "epoch": 0.09475555555555555, + "grad_norm": 112.29281616210938, + "learning_rate": 3e-06, + "loss": 4.7452, + "step": 1066 + }, + { + "epoch": 0.09484444444444444, + "grad_norm": 140.55029296875, + "learning_rate": 3e-06, + "loss": -9.5303, + "step": 1067 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 147.0571746826172, + "learning_rate": 3e-06, + "loss": 3.0677, + "step": 1068 + }, + { + "completion_length": 245.0416717529297, + "epoch": 0.09502222222222222, + "grad_norm": 101.13983154296875, + "learning_rate": 3e-06, + "loss": 7.3268, + "reward": 0.7916666865348816, + "reward_std": 0.27350127696990967, + "rewards/boxed_and_answer_tags_format_reward": 0.4999999850988388, + "rewards/correctness_reward_func_math": 0.2916666679084301, + "step": 1069, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.0951111111111111, + "grad_norm": 121.06708526611328, + "learning_rate": 3e-06, + "loss": 4.489, + "step": 1070 + }, + { + "epoch": 0.0952, + "grad_norm": 127.9291763305664, + "learning_rate": 3e-06, + "loss": 1.7732, + "step": 1071 + }, + { + "epoch": 0.0952888888888889, + "grad_norm": 120.67790222167969, + "learning_rate": 3e-06, + "loss": 7.2385, + "step": 1072 + }, + { + "epoch": 0.09537777777777778, + "grad_norm": 98.45962524414062, + "learning_rate": 3e-06, + "loss": -9.8857, + "step": 1073 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 110.4314193725586, + "learning_rate": 3e-06, + "loss": -1.6899, + "step": 1074 + }, + { + "epoch": 0.09555555555555556, + "grad_norm": 101.835693359375, + "learning_rate": 3e-06, + "loss": 5.3228, + "step": 1075 + }, + { + "epoch": 0.09564444444444445, + "grad_norm": 119.22904205322266, + "learning_rate": 3e-06, + "loss": 3.0508, + "step": 1076 + }, + { + "epoch": 0.09573333333333334, + "grad_norm": 126.35284423828125, + "learning_rate": 3e-06, + "loss": -1.1565, + "step": 1077 + }, + { + "epoch": 0.09582222222222223, + "grad_norm": 129.94705200195312, + "learning_rate": 3e-06, + "loss": 4.3463, + "step": 1078 + }, + { + "epoch": 0.09591111111111111, + "grad_norm": 95.31863403320312, + "learning_rate": 3e-06, + "loss": -12.0709, + "step": 1079 + }, + { + "epoch": 0.096, + "grad_norm": 111.38770294189453, + "learning_rate": 3e-06, + "loss": -4.804, + "step": 1080 + }, + { + "completion_length": 227.33333587646484, + "epoch": 0.09608888888888889, + "grad_norm": 93.04568481445312, + "learning_rate": 3e-06, + "loss": -1.8293, + "reward": 1.2604166865348816, + "reward_std": 0.4915197938680649, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.5833333134651184, + "step": 1081, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.09617777777777778, + "grad_norm": 186.89431762695312, + "learning_rate": 3e-06, + "loss": -1.6852, + "step": 1082 + }, + { + "epoch": 0.09626666666666667, + "grad_norm": 132.6072998046875, + "learning_rate": 3e-06, + "loss": -19.4138, + "step": 1083 + }, + { + "epoch": 0.09635555555555556, + "grad_norm": 128.269287109375, + "learning_rate": 3e-06, + "loss": -7.5699, + "step": 1084 + }, + { + "epoch": 0.09644444444444444, + "grad_norm": 104.99845123291016, + "learning_rate": 3e-06, + "loss": -14.2839, + "step": 1085 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 127.01644897460938, + "learning_rate": 3e-06, + "loss": -6.2089, + "step": 1086 + }, + { + "epoch": 0.09662222222222222, + "grad_norm": 99.6088638305664, + "learning_rate": 3e-06, + "loss": -3.0049, + "step": 1087 + }, + { + "epoch": 0.09671111111111111, + "grad_norm": 120.30072784423828, + "learning_rate": 3e-06, + "loss": -3.4027, + "step": 1088 + }, + { + "epoch": 0.0968, + "grad_norm": 121.11226654052734, + "learning_rate": 3e-06, + "loss": -20.6903, + "step": 1089 + }, + { + "epoch": 0.09688888888888889, + "grad_norm": 101.49308013916016, + "learning_rate": 3e-06, + "loss": -8.6379, + "step": 1090 + }, + { + "epoch": 0.09697777777777777, + "grad_norm": 117.81327819824219, + "learning_rate": 3e-06, + "loss": -15.1799, + "step": 1091 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 114.92808532714844, + "learning_rate": 3e-06, + "loss": -8.7164, + "step": 1092 + }, + { + "completion_length": 245.68750762939453, + "epoch": 0.09715555555555555, + "grad_norm": 137.37049865722656, + "learning_rate": 3e-06, + "loss": -17.5632, + "reward": 1.2604166865348816, + "reward_std": 0.3936076760292053, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.5833333134651184, + "step": 1093, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.09724444444444444, + "grad_norm": 104.14005279541016, + "learning_rate": 3e-06, + "loss": -13.8548, + "step": 1094 + }, + { + "epoch": 0.09733333333333333, + "grad_norm": 111.48066711425781, + "learning_rate": 3e-06, + "loss": -14.4865, + "step": 1095 + }, + { + "epoch": 0.09742222222222222, + "grad_norm": 115.86460876464844, + "learning_rate": 3e-06, + "loss": -12.7074, + "step": 1096 + }, + { + "epoch": 0.09751111111111112, + "grad_norm": 108.50313568115234, + "learning_rate": 3e-06, + "loss": -17.9489, + "step": 1097 + }, + { + "epoch": 0.0976, + "grad_norm": 113.4880599975586, + "learning_rate": 3e-06, + "loss": -16.1087, + "step": 1098 + }, + { + "epoch": 0.0976888888888889, + "grad_norm": 109.29180145263672, + "learning_rate": 3e-06, + "loss": -19.2428, + "step": 1099 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 107.1020278930664, + "learning_rate": 3e-06, + "loss": -15.2564, + "step": 1100 + }, + { + "epoch": 0.09786666666666667, + "grad_norm": 131.5577392578125, + "learning_rate": 3e-06, + "loss": -15.6807, + "step": 1101 + }, + { + "epoch": 0.09795555555555556, + "grad_norm": 121.74998474121094, + "learning_rate": 3e-06, + "loss": -15.0192, + "step": 1102 + }, + { + "epoch": 0.09804444444444445, + "grad_norm": 107.87700653076172, + "learning_rate": 3e-06, + "loss": -19.6246, + "step": 1103 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 132.6728515625, + "learning_rate": 3e-06, + "loss": -17.4119, + "step": 1104 + }, + { + "completion_length": 247.375, + "epoch": 0.09822222222222222, + "grad_norm": 194.66749572753906, + "learning_rate": 3e-06, + "loss": -59.2499, + "reward": 1.7083333730697632, + "reward_std": 0.6184598803520203, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 1105, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.09831111111111111, + "grad_norm": 179.941162109375, + "learning_rate": 3e-06, + "loss": -46.7423, + "step": 1106 + }, + { + "epoch": 0.0984, + "grad_norm": 232.29762268066406, + "learning_rate": 3e-06, + "loss": -54.0588, + "step": 1107 + }, + { + "epoch": 0.09848888888888889, + "grad_norm": 208.61793518066406, + "learning_rate": 3e-06, + "loss": -72.7937, + "step": 1108 + }, + { + "epoch": 0.09857777777777778, + "grad_norm": 180.18431091308594, + "learning_rate": 3e-06, + "loss": -37.6563, + "step": 1109 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 191.83653259277344, + "learning_rate": 3e-06, + "loss": -56.5117, + "step": 1110 + }, + { + "epoch": 0.09875555555555555, + "grad_norm": 192.1591796875, + "learning_rate": 3e-06, + "loss": -63.0639, + "step": 1111 + }, + { + "epoch": 0.09884444444444444, + "grad_norm": 183.53610229492188, + "learning_rate": 3e-06, + "loss": -50.2258, + "step": 1112 + }, + { + "epoch": 0.09893333333333333, + "grad_norm": 233.22872924804688, + "learning_rate": 3e-06, + "loss": -58.1688, + "step": 1113 + }, + { + "epoch": 0.09902222222222222, + "grad_norm": 219.78233337402344, + "learning_rate": 3e-06, + "loss": -78.4412, + "step": 1114 + }, + { + "epoch": 0.09911111111111111, + "grad_norm": 189.9258270263672, + "learning_rate": 3e-06, + "loss": -42.5197, + "step": 1115 + }, + { + "epoch": 0.0992, + "grad_norm": 204.02183532714844, + "learning_rate": 3e-06, + "loss": -61.9828, + "step": 1116 + }, + { + "completion_length": 248.9166717529297, + "epoch": 0.09928888888888888, + "grad_norm": 114.85681915283203, + "learning_rate": 3e-06, + "loss": 1.5984, + "reward": 1.2395833730697632, + "reward_std": 0.2296396717429161, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.4999999850988388, + "step": 1117, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.09937777777777777, + "grad_norm": 113.66478729248047, + "learning_rate": 3e-06, + "loss": -2.2296, + "step": 1118 + }, + { + "epoch": 0.09946666666666666, + "grad_norm": 124.4161148071289, + "learning_rate": 3e-06, + "loss": 4.382, + "step": 1119 + }, + { + "epoch": 0.09955555555555555, + "grad_norm": 131.39085388183594, + "learning_rate": 3e-06, + "loss": 0.5734, + "step": 1120 + }, + { + "epoch": 0.09964444444444444, + "grad_norm": 119.46894073486328, + "learning_rate": 3e-06, + "loss": 5.0378, + "step": 1121 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 113.26507568359375, + "learning_rate": 3e-06, + "loss": 1.4215, + "step": 1122 + }, + { + "epoch": 0.09982222222222223, + "grad_norm": 120.69562530517578, + "learning_rate": 3e-06, + "loss": 0.0203, + "step": 1123 + }, + { + "epoch": 0.09991111111111112, + "grad_norm": 128.0107421875, + "learning_rate": 3e-06, + "loss": -3.7284, + "step": 1124 + }, + { + "epoch": 0.1, + "grad_norm": 139.31997680664062, + "learning_rate": 3e-06, + "loss": 2.9527, + "step": 1125 + }, + { + "epoch": 0.1000888888888889, + "grad_norm": 111.49156188964844, + "learning_rate": 3e-06, + "loss": -1.217, + "step": 1126 + }, + { + "epoch": 0.10017777777777778, + "grad_norm": 107.92985534667969, + "learning_rate": 3e-06, + "loss": 3.7084, + "step": 1127 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 112.88748168945312, + "learning_rate": 3e-06, + "loss": -0.5577, + "step": 1128 + }, + { + "completion_length": 245.18750762939453, + "epoch": 0.10035555555555556, + "grad_norm": 124.6043930053711, + "learning_rate": 3e-06, + "loss": -20.6745, + "reward": 1.4583333432674408, + "reward_std": 0.5722163170576096, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.8333333283662796, + "step": 1129, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.10044444444444445, + "grad_norm": 118.25672149658203, + "learning_rate": 3e-06, + "loss": -32.0798, + "step": 1130 + }, + { + "epoch": 0.10053333333333334, + "grad_norm": 189.2974853515625, + "learning_rate": 3e-06, + "loss": -23.7609, + "step": 1131 + }, + { + "epoch": 0.10062222222222222, + "grad_norm": 139.94485473632812, + "learning_rate": 3e-06, + "loss": -30.0811, + "step": 1132 + }, + { + "epoch": 0.10071111111111111, + "grad_norm": 132.92324829101562, + "learning_rate": 3e-06, + "loss": -29.1304, + "step": 1133 + }, + { + "epoch": 0.1008, + "grad_norm": 129.6322021484375, + "learning_rate": 3e-06, + "loss": -25.8999, + "step": 1134 + }, + { + "epoch": 0.10088888888888889, + "grad_norm": 122.53899383544922, + "learning_rate": 3e-06, + "loss": -22.266, + "step": 1135 + }, + { + "epoch": 0.10097777777777778, + "grad_norm": 111.6375961303711, + "learning_rate": 3e-06, + "loss": -32.9394, + "step": 1136 + }, + { + "epoch": 0.10106666666666667, + "grad_norm": 162.63771057128906, + "learning_rate": 3e-06, + "loss": -25.9897, + "step": 1137 + }, + { + "epoch": 0.10115555555555555, + "grad_norm": 154.28424072265625, + "learning_rate": 3e-06, + "loss": -32.7633, + "step": 1138 + }, + { + "epoch": 0.10124444444444444, + "grad_norm": 132.94351196289062, + "learning_rate": 3e-06, + "loss": -31.2033, + "step": 1139 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 131.9947052001953, + "learning_rate": 3e-06, + "loss": -27.4069, + "step": 1140 + }, + { + "completion_length": 248.625, + "epoch": 0.10142222222222222, + "grad_norm": 113.37858581542969, + "learning_rate": 3e-06, + "loss": -14.7283, + "reward": 0.9479166865348816, + "reward_std": 0.2296396642923355, + "rewards/boxed_and_answer_tags_format_reward": 0.6145833432674408, + "rewards/correctness_reward_func_math": 0.3333333246409893, + "step": 1141, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.10151111111111111, + "grad_norm": 112.18434143066406, + "learning_rate": 3e-06, + "loss": -11.645, + "step": 1142 + }, + { + "epoch": 0.1016, + "grad_norm": 105.41796112060547, + "learning_rate": 3e-06, + "loss": -15.9935, + "step": 1143 + }, + { + "epoch": 0.10168888888888888, + "grad_norm": 91.3786849975586, + "learning_rate": 3e-06, + "loss": -16.6043, + "step": 1144 + }, + { + "epoch": 0.10177777777777777, + "grad_norm": 104.3455810546875, + "learning_rate": 3e-06, + "loss": -3.4885, + "step": 1145 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 87.41930389404297, + "learning_rate": 3e-06, + "loss": -20.634, + "step": 1146 + }, + { + "epoch": 0.10195555555555555, + "grad_norm": 90.50940704345703, + "learning_rate": 3e-06, + "loss": -16.7866, + "step": 1147 + }, + { + "epoch": 0.10204444444444444, + "grad_norm": 117.73979949951172, + "learning_rate": 3e-06, + "loss": -13.9565, + "step": 1148 + }, + { + "epoch": 0.10213333333333334, + "grad_norm": 117.4783706665039, + "learning_rate": 3e-06, + "loss": -18.3371, + "step": 1149 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 90.1675033569336, + "learning_rate": 3e-06, + "loss": -19.5285, + "step": 1150 + }, + { + "epoch": 0.10231111111111112, + "grad_norm": 114.65315246582031, + "learning_rate": 3e-06, + "loss": -6.9923, + "step": 1151 + }, + { + "epoch": 0.1024, + "grad_norm": 236.66262817382812, + "learning_rate": 3e-06, + "loss": -23.6847, + "step": 1152 + }, + { + "completion_length": 254.0625, + "epoch": 0.1024888888888889, + "grad_norm": 112.95475769042969, + "learning_rate": 3e-06, + "loss": -27.7434, + "reward": 0.96875, + "reward_std": 0.3734789788722992, + "rewards/boxed_and_answer_tags_format_reward": 0.71875, + "rewards/correctness_reward_func_math": 0.25, + "step": 1153, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.10257777777777778, + "grad_norm": 144.55364990234375, + "learning_rate": 3e-06, + "loss": -22.4297, + "step": 1154 + }, + { + "epoch": 0.10266666666666667, + "grad_norm": 128.1786346435547, + "learning_rate": 3e-06, + "loss": -34.1587, + "step": 1155 + }, + { + "epoch": 0.10275555555555556, + "grad_norm": 125.85458374023438, + "learning_rate": 3e-06, + "loss": -30.6015, + "step": 1156 + }, + { + "epoch": 0.10284444444444445, + "grad_norm": 142.524658203125, + "learning_rate": 3e-06, + "loss": -31.4122, + "step": 1157 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 111.42877197265625, + "learning_rate": 3e-06, + "loss": -26.1967, + "step": 1158 + }, + { + "epoch": 0.10302222222222222, + "grad_norm": 123.39324951171875, + "learning_rate": 3e-06, + "loss": -29.3019, + "step": 1159 + }, + { + "epoch": 0.10311111111111111, + "grad_norm": 151.65802001953125, + "learning_rate": 3e-06, + "loss": -24.3001, + "step": 1160 + }, + { + "epoch": 0.1032, + "grad_norm": 127.43438720703125, + "learning_rate": 3e-06, + "loss": -36.1734, + "step": 1161 + }, + { + "epoch": 0.10328888888888889, + "grad_norm": 123.67347717285156, + "learning_rate": 3e-06, + "loss": -33.6541, + "step": 1162 + }, + { + "epoch": 0.10337777777777778, + "grad_norm": 147.1012420654297, + "learning_rate": 3e-06, + "loss": -33.954, + "step": 1163 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 125.47201538085938, + "learning_rate": 3e-06, + "loss": -28.7879, + "step": 1164 + }, + { + "completion_length": 236.64583587646484, + "epoch": 0.10355555555555555, + "grad_norm": 348.24346923828125, + "learning_rate": 3e-06, + "loss": -9.8131, + "reward": 1.4375000596046448, + "reward_std": 0.4971916079521179, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.75, + "step": 1165, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.10364444444444444, + "grad_norm": 189.84584045410156, + "learning_rate": 3e-06, + "loss": -5.3377, + "step": 1166 + }, + { + "epoch": 0.10373333333333333, + "grad_norm": 192.2544403076172, + "learning_rate": 3e-06, + "loss": -1.7722, + "step": 1167 + }, + { + "epoch": 0.10382222222222222, + "grad_norm": 180.6472625732422, + "learning_rate": 3e-06, + "loss": -6.7487, + "step": 1168 + }, + { + "epoch": 0.1039111111111111, + "grad_norm": 197.0133819580078, + "learning_rate": 3e-06, + "loss": -2.4648, + "step": 1169 + }, + { + "epoch": 0.104, + "grad_norm": 256.5722351074219, + "learning_rate": 3e-06, + "loss": -17.2997, + "step": 1170 + }, + { + "epoch": 0.10408888888888888, + "grad_norm": 179.86228942871094, + "learning_rate": 3e-06, + "loss": -12.5209, + "step": 1171 + }, + { + "epoch": 0.10417777777777777, + "grad_norm": 195.3128204345703, + "learning_rate": 3e-06, + "loss": -9.6999, + "step": 1172 + }, + { + "epoch": 0.10426666666666666, + "grad_norm": 204.9373321533203, + "learning_rate": 3e-06, + "loss": -6.9052, + "step": 1173 + }, + { + "epoch": 0.10435555555555555, + "grad_norm": 192.8905792236328, + "learning_rate": 3e-06, + "loss": -10.4987, + "step": 1174 + }, + { + "epoch": 0.10444444444444445, + "grad_norm": 181.99449157714844, + "learning_rate": 3e-06, + "loss": -7.0688, + "step": 1175 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 189.1867218017578, + "learning_rate": 3e-06, + "loss": -22.4777, + "step": 1176 + }, + { + "completion_length": 250.33333587646484, + "epoch": 0.10462222222222223, + "grad_norm": 158.67459106445312, + "learning_rate": 3e-06, + "loss": -8.4502, + "reward": 0.9583333432674408, + "reward_std": 0.5451789498329163, + "rewards/boxed_and_answer_tags_format_reward": 0.5, + "rewards/correctness_reward_func_math": 0.4583333358168602, + "step": 1177, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.10471111111111112, + "grad_norm": 158.85533142089844, + "learning_rate": 3e-06, + "loss": -13.1446, + "step": 1178 + }, + { + "epoch": 0.1048, + "grad_norm": 155.9822235107422, + "learning_rate": 3e-06, + "loss": -6.7899, + "step": 1179 + }, + { + "epoch": 0.10488888888888889, + "grad_norm": 150.00985717773438, + "learning_rate": 3e-06, + "loss": 5.268, + "step": 1180 + }, + { + "epoch": 0.10497777777777778, + "grad_norm": 140.22618103027344, + "learning_rate": 3e-06, + "loss": -3.0269, + "step": 1181 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 130.9547119140625, + "learning_rate": 3e-06, + "loss": -6.5307, + "step": 1182 + }, + { + "epoch": 0.10515555555555556, + "grad_norm": 157.96466064453125, + "learning_rate": 3e-06, + "loss": -10.1506, + "step": 1183 + }, + { + "epoch": 0.10524444444444445, + "grad_norm": 162.57582092285156, + "learning_rate": 3e-06, + "loss": -14.3085, + "step": 1184 + }, + { + "epoch": 0.10533333333333333, + "grad_norm": 163.5466766357422, + "learning_rate": 3e-06, + "loss": -9.5339, + "step": 1185 + }, + { + "epoch": 0.10542222222222222, + "grad_norm": 126.0348129272461, + "learning_rate": 3e-06, + "loss": 2.7285, + "step": 1186 + }, + { + "epoch": 0.10551111111111111, + "grad_norm": 152.0486297607422, + "learning_rate": 3e-06, + "loss": -5.0055, + "step": 1187 + }, + { + "epoch": 0.1056, + "grad_norm": 135.672607421875, + "learning_rate": 3e-06, + "loss": -7.9305, + "step": 1188 + }, + { + "completion_length": 242.43750762939453, + "epoch": 0.10568888888888889, + "grad_norm": 102.83517456054688, + "learning_rate": 3e-06, + "loss": -9.6079, + "reward": 1.5833333730697632, + "reward_std": 0.3332235962152481, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.8333333134651184, + "step": 1189, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.10577777777777778, + "grad_norm": 104.57038116455078, + "learning_rate": 3e-06, + "loss": 3.4859, + "step": 1190 + }, + { + "epoch": 0.10586666666666666, + "grad_norm": 130.5141143798828, + "learning_rate": 3e-06, + "loss": 9.6096, + "step": 1191 + }, + { + "epoch": 0.10595555555555555, + "grad_norm": 121.0637435913086, + "learning_rate": 3e-06, + "loss": -7.0112, + "step": 1192 + }, + { + "epoch": 0.10604444444444444, + "grad_norm": 116.66060638427734, + "learning_rate": 3e-06, + "loss": -0.6318, + "step": 1193 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 93.47602844238281, + "learning_rate": 3e-06, + "loss": -7.121, + "step": 1194 + }, + { + "epoch": 0.10622222222222222, + "grad_norm": 97.91071319580078, + "learning_rate": 3e-06, + "loss": -11.2048, + "step": 1195 + }, + { + "epoch": 0.1063111111111111, + "grad_norm": 99.79684448242188, + "learning_rate": 3e-06, + "loss": 2.4369, + "step": 1196 + }, + { + "epoch": 0.1064, + "grad_norm": 103.4743423461914, + "learning_rate": 3e-06, + "loss": 7.3113, + "step": 1197 + }, + { + "epoch": 0.10648888888888888, + "grad_norm": 115.71762084960938, + "learning_rate": 3e-06, + "loss": -8.8378, + "step": 1198 + }, + { + "epoch": 0.10657777777777777, + "grad_norm": 116.43769073486328, + "learning_rate": 3e-06, + "loss": -2.6688, + "step": 1199 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 93.20524597167969, + "learning_rate": 3e-06, + "loss": -9.0718, + "step": 1200 + }, + { + "completion_length": 242.77083587646484, + "epoch": 0.10675555555555556, + "grad_norm": 156.0432586669922, + "learning_rate": 3e-06, + "loss": 6.0861, + "reward": 1.6770833730697632, + "reward_std": 0.5419133305549622, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 1.0, + "step": 1201, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.10684444444444445, + "grad_norm": 173.78958129882812, + "learning_rate": 3e-06, + "loss": 9.0247, + "step": 1202 + }, + { + "epoch": 0.10693333333333334, + "grad_norm": 152.82534790039062, + "learning_rate": 3e-06, + "loss": 5.6834, + "step": 1203 + }, + { + "epoch": 0.10702222222222223, + "grad_norm": 151.876953125, + "learning_rate": 3e-06, + "loss": 6.9018, + "step": 1204 + }, + { + "epoch": 0.10711111111111112, + "grad_norm": 165.88400268554688, + "learning_rate": 3e-06, + "loss": 9.1234, + "step": 1205 + }, + { + "epoch": 0.1072, + "grad_norm": 167.48348999023438, + "learning_rate": 3e-06, + "loss": 9.4483, + "step": 1206 + }, + { + "epoch": 0.10728888888888889, + "grad_norm": 149.90536499023438, + "learning_rate": 3e-06, + "loss": 2.7725, + "step": 1207 + }, + { + "epoch": 0.10737777777777778, + "grad_norm": 169.23411560058594, + "learning_rate": 3e-06, + "loss": 6.9296, + "step": 1208 + }, + { + "epoch": 0.10746666666666667, + "grad_norm": 233.46914672851562, + "learning_rate": 3e-06, + "loss": 3.5024, + "step": 1209 + }, + { + "epoch": 0.10755555555555556, + "grad_norm": 147.3250732421875, + "learning_rate": 3e-06, + "loss": 3.7723, + "step": 1210 + }, + { + "epoch": 0.10764444444444445, + "grad_norm": 155.88914489746094, + "learning_rate": 3e-06, + "loss": 7.0329, + "step": 1211 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 153.82785034179688, + "learning_rate": 3e-06, + "loss": 7.0486, + "step": 1212 + }, + { + "completion_length": 247.6666717529297, + "epoch": 0.10782222222222222, + "grad_norm": 158.41856384277344, + "learning_rate": 3e-06, + "loss": 18.2887, + "reward": 1.614583432674408, + "reward_std": 0.3665703386068344, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.8750000149011612, + "step": 1213, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.10791111111111111, + "grad_norm": 163.51866149902344, + "learning_rate": 3e-06, + "loss": 5.6461, + "step": 1214 + }, + { + "epoch": 0.108, + "grad_norm": 131.4403839111328, + "learning_rate": 3e-06, + "loss": 3.3667, + "step": 1215 + }, + { + "epoch": 0.10808888888888889, + "grad_norm": 136.77757263183594, + "learning_rate": 3e-06, + "loss": 4.406, + "step": 1216 + }, + { + "epoch": 0.10817777777777778, + "grad_norm": 113.4407958984375, + "learning_rate": 3e-06, + "loss": 12.3995, + "step": 1217 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 136.1168212890625, + "learning_rate": 3e-06, + "loss": 4.6776, + "step": 1218 + }, + { + "epoch": 0.10835555555555555, + "grad_norm": 144.9372100830078, + "learning_rate": 3e-06, + "loss": 17.5724, + "step": 1219 + }, + { + "epoch": 0.10844444444444444, + "grad_norm": 125.29820251464844, + "learning_rate": 3e-06, + "loss": 4.1424, + "step": 1220 + }, + { + "epoch": 0.10853333333333333, + "grad_norm": 138.5077667236328, + "learning_rate": 3e-06, + "loss": 1.3947, + "step": 1221 + }, + { + "epoch": 0.10862222222222222, + "grad_norm": 128.62693786621094, + "learning_rate": 3e-06, + "loss": 2.4864, + "step": 1222 + }, + { + "epoch": 0.1087111111111111, + "grad_norm": 107.87577056884766, + "learning_rate": 3e-06, + "loss": 10.4239, + "step": 1223 + }, + { + "epoch": 0.1088, + "grad_norm": 127.07205963134766, + "learning_rate": 3e-06, + "loss": 1.3109, + "step": 1224 + }, + { + "completion_length": 238.89584350585938, + "epoch": 0.10888888888888888, + "grad_norm": 154.35325622558594, + "learning_rate": 3e-06, + "loss": 8.6596, + "reward": 1.5208333730697632, + "reward_std": 0.3680921420454979, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8333333432674408, + "step": 1225, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.10897777777777778, + "grad_norm": 165.58445739746094, + "learning_rate": 3e-06, + "loss": 1.8278, + "step": 1226 + }, + { + "epoch": 0.10906666666666667, + "grad_norm": 360.30029296875, + "learning_rate": 3e-06, + "loss": -9.1779, + "step": 1227 + }, + { + "epoch": 0.10915555555555556, + "grad_norm": 152.518310546875, + "learning_rate": 3e-06, + "loss": -9.97, + "step": 1228 + }, + { + "epoch": 0.10924444444444445, + "grad_norm": 146.67100524902344, + "learning_rate": 3e-06, + "loss": -6.3997, + "step": 1229 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 138.9771728515625, + "learning_rate": 3e-06, + "loss": -6.0985, + "step": 1230 + }, + { + "epoch": 0.10942222222222223, + "grad_norm": 159.66302490234375, + "learning_rate": 3e-06, + "loss": 5.5207, + "step": 1231 + }, + { + "epoch": 0.10951111111111111, + "grad_norm": 138.22695922851562, + "learning_rate": 3e-06, + "loss": -0.4886, + "step": 1232 + }, + { + "epoch": 0.1096, + "grad_norm": 160.3970489501953, + "learning_rate": 3e-06, + "loss": -12.1071, + "step": 1233 + }, + { + "epoch": 0.10968888888888889, + "grad_norm": 141.63226318359375, + "learning_rate": 3e-06, + "loss": -13.6489, + "step": 1234 + }, + { + "epoch": 0.10977777777777778, + "grad_norm": 146.45748901367188, + "learning_rate": 3e-06, + "loss": -10.0014, + "step": 1235 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 140.94992065429688, + "learning_rate": 3e-06, + "loss": -9.6508, + "step": 1236 + }, + { + "completion_length": 255.4166717529297, + "epoch": 0.10995555555555556, + "grad_norm": 160.60601806640625, + "learning_rate": 3e-06, + "loss": 1.8826, + "reward": 0.8020833432674408, + "reward_std": 0.3381742835044861, + "rewards/boxed_and_answer_tags_format_reward": 0.6354166865348816, + "rewards/correctness_reward_func_math": 0.1666666679084301, + "step": 1237, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.11004444444444444, + "grad_norm": 186.42630004882812, + "learning_rate": 3e-06, + "loss": -4.6822, + "step": 1238 + }, + { + "epoch": 0.11013333333333333, + "grad_norm": 141.29820251464844, + "learning_rate": 3e-06, + "loss": 11.1707, + "step": 1239 + }, + { + "epoch": 0.11022222222222222, + "grad_norm": 144.1931610107422, + "learning_rate": 3e-06, + "loss": 3.3338, + "step": 1240 + }, + { + "epoch": 0.11031111111111111, + "grad_norm": 135.27630615234375, + "learning_rate": 3e-06, + "loss": 4.9367, + "step": 1241 + }, + { + "epoch": 0.1104, + "grad_norm": 175.80433654785156, + "learning_rate": 3e-06, + "loss": -13.1628, + "step": 1242 + }, + { + "epoch": 0.11048888888888889, + "grad_norm": 165.0531463623047, + "learning_rate": 3e-06, + "loss": -0.3721, + "step": 1243 + }, + { + "epoch": 0.11057777777777777, + "grad_norm": 264.06695556640625, + "learning_rate": 3e-06, + "loss": -7.2717, + "step": 1244 + }, + { + "epoch": 0.11066666666666666, + "grad_norm": 145.43423461914062, + "learning_rate": 3e-06, + "loss": 8.5606, + "step": 1245 + }, + { + "epoch": 0.11075555555555555, + "grad_norm": 145.75946044921875, + "learning_rate": 3e-06, + "loss": 0.9458, + "step": 1246 + }, + { + "epoch": 0.11084444444444444, + "grad_norm": 125.21862030029297, + "learning_rate": 3e-06, + "loss": 3.1516, + "step": 1247 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 177.2153778076172, + "learning_rate": 3e-06, + "loss": -17.2454, + "step": 1248 + }, + { + "completion_length": 251.6041717529297, + "epoch": 0.11102222222222222, + "grad_norm": 371.7901306152344, + "learning_rate": 3e-06, + "loss": -30.6046, + "reward": 1.9895833730697632, + "reward_std": 0.8478911817073822, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 1.25, + "step": 1249, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 241.84666442871094, + "learning_rate": 3e-06, + "loss": -47.9038, + "step": 1250 + }, + { + "epoch": 0.1112, + "grad_norm": 187.25973510742188, + "learning_rate": 3e-06, + "loss": -31.5847, + "step": 1251 + }, + { + "epoch": 0.1112888888888889, + "grad_norm": 220.72537231445312, + "learning_rate": 3e-06, + "loss": -37.9464, + "step": 1252 + }, + { + "epoch": 0.11137777777777778, + "grad_norm": 215.938720703125, + "learning_rate": 3e-06, + "loss": -36.917, + "step": 1253 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 255.27755737304688, + "learning_rate": 3e-06, + "loss": -36.4102, + "step": 1254 + }, + { + "epoch": 0.11155555555555556, + "grad_norm": 252.8043212890625, + "learning_rate": 3e-06, + "loss": -32.6218, + "step": 1255 + }, + { + "epoch": 0.11164444444444445, + "grad_norm": 225.40321350097656, + "learning_rate": 3e-06, + "loss": -51.6723, + "step": 1256 + }, + { + "epoch": 0.11173333333333334, + "grad_norm": 208.3738250732422, + "learning_rate": 3e-06, + "loss": -35.6313, + "step": 1257 + }, + { + "epoch": 0.11182222222222223, + "grad_norm": 215.90704345703125, + "learning_rate": 3e-06, + "loss": -41.5126, + "step": 1258 + }, + { + "epoch": 0.11191111111111111, + "grad_norm": 242.8232879638672, + "learning_rate": 3e-06, + "loss": -42.0952, + "step": 1259 + }, + { + "epoch": 0.112, + "grad_norm": 220.46678161621094, + "learning_rate": 3e-06, + "loss": -40.7929, + "step": 1260 + }, + { + "completion_length": 252.2291717529297, + "epoch": 0.11208888888888889, + "grad_norm": 112.288330078125, + "learning_rate": 3e-06, + "loss": -28.9692, + "reward": 0.8333333730697632, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.2083333283662796, + "step": 1261, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.11217777777777778, + "grad_norm": 111.31488800048828, + "learning_rate": 3e-06, + "loss": -22.9427, + "step": 1262 + }, + { + "epoch": 0.11226666666666667, + "grad_norm": 130.2353057861328, + "learning_rate": 3e-06, + "loss": -30.2762, + "step": 1263 + }, + { + "epoch": 0.11235555555555556, + "grad_norm": 154.38973999023438, + "learning_rate": 3e-06, + "loss": -39.3815, + "step": 1264 + }, + { + "epoch": 0.11244444444444444, + "grad_norm": 109.89620971679688, + "learning_rate": 3e-06, + "loss": -28.5723, + "step": 1265 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 104.00005340576172, + "learning_rate": 3e-06, + "loss": -25.6052, + "step": 1266 + }, + { + "epoch": 0.11262222222222222, + "grad_norm": 121.78623962402344, + "learning_rate": 3e-06, + "loss": -30.0983, + "step": 1267 + }, + { + "epoch": 0.11271111111111111, + "grad_norm": 119.55603790283203, + "learning_rate": 3e-06, + "loss": -23.8716, + "step": 1268 + }, + { + "epoch": 0.1128, + "grad_norm": 124.7007827758789, + "learning_rate": 3e-06, + "loss": -31.7822, + "step": 1269 + }, + { + "epoch": 0.11288888888888889, + "grad_norm": 133.42088317871094, + "learning_rate": 3e-06, + "loss": -42.2768, + "step": 1270 + }, + { + "epoch": 0.11297777777777777, + "grad_norm": 128.7488555908203, + "learning_rate": 3e-06, + "loss": -30.9053, + "step": 1271 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 128.32147216796875, + "learning_rate": 3e-06, + "loss": -27.7986, + "step": 1272 + }, + { + "completion_length": 251.02083587646484, + "epoch": 0.11315555555555555, + "grad_norm": 276.54681396484375, + "learning_rate": 3e-06, + "loss": -8.3431, + "reward": 1.7916666865348816, + "reward_std": 0.4701542258262634, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 1273, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.11324444444444444, + "grad_norm": 234.2393341064453, + "learning_rate": 3e-06, + "loss": -13.7623, + "step": 1274 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 236.026611328125, + "learning_rate": 3e-06, + "loss": -4.7751, + "step": 1275 + }, + { + "epoch": 0.11342222222222222, + "grad_norm": 245.18170166015625, + "learning_rate": 3e-06, + "loss": -16.0847, + "step": 1276 + }, + { + "epoch": 0.1135111111111111, + "grad_norm": 314.53057861328125, + "learning_rate": 3e-06, + "loss": -8.6135, + "step": 1277 + }, + { + "epoch": 0.1136, + "grad_norm": 207.09188842773438, + "learning_rate": 3e-06, + "loss": -13.115, + "step": 1278 + }, + { + "epoch": 0.1136888888888889, + "grad_norm": 252.8507080078125, + "learning_rate": 3e-06, + "loss": -12.3574, + "step": 1279 + }, + { + "epoch": 0.11377777777777778, + "grad_norm": 241.2433319091797, + "learning_rate": 3e-06, + "loss": -18.4114, + "step": 1280 + }, + { + "epoch": 0.11386666666666667, + "grad_norm": 217.98683166503906, + "learning_rate": 3e-06, + "loss": -7.0416, + "step": 1281 + }, + { + "epoch": 0.11395555555555556, + "grad_norm": 252.15773010253906, + "learning_rate": 3e-06, + "loss": -19.5222, + "step": 1282 + }, + { + "epoch": 0.11404444444444445, + "grad_norm": 227.17205810546875, + "learning_rate": 3e-06, + "loss": -13.2521, + "step": 1283 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 235.24327087402344, + "learning_rate": 3e-06, + "loss": -17.9427, + "step": 1284 + }, + { + "completion_length": 253.25, + "epoch": 0.11422222222222222, + "grad_norm": 206.74746704101562, + "learning_rate": 3e-06, + "loss": 0.218, + "reward": 1.7187500596046448, + "reward_std": 0.5723656415939331, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 1285, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.11431111111111111, + "grad_norm": 207.45079040527344, + "learning_rate": 3e-06, + "loss": 23.7089, + "step": 1286 + }, + { + "epoch": 0.1144, + "grad_norm": 201.48416137695312, + "learning_rate": 3e-06, + "loss": 16.6222, + "step": 1287 + }, + { + "epoch": 0.11448888888888889, + "grad_norm": 240.6068115234375, + "learning_rate": 3e-06, + "loss": 2.8229, + "step": 1288 + }, + { + "epoch": 0.11457777777777778, + "grad_norm": 267.2186279296875, + "learning_rate": 3e-06, + "loss": -6.3361, + "step": 1289 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 254.57681274414062, + "learning_rate": 3e-06, + "loss": 2.6077, + "step": 1290 + }, + { + "epoch": 0.11475555555555556, + "grad_norm": 215.68649291992188, + "learning_rate": 3e-06, + "loss": -2.7734, + "step": 1291 + }, + { + "epoch": 0.11484444444444444, + "grad_norm": 227.70590209960938, + "learning_rate": 3e-06, + "loss": 18.5952, + "step": 1292 + }, + { + "epoch": 0.11493333333333333, + "grad_norm": 215.9313201904297, + "learning_rate": 3e-06, + "loss": 13.7655, + "step": 1293 + }, + { + "epoch": 0.11502222222222222, + "grad_norm": 251.1554718017578, + "learning_rate": 3e-06, + "loss": -1.3305, + "step": 1294 + }, + { + "epoch": 0.11511111111111111, + "grad_norm": 274.58538818359375, + "learning_rate": 3e-06, + "loss": -8.8694, + "step": 1295 + }, + { + "epoch": 0.1152, + "grad_norm": 238.6010284423828, + "learning_rate": 3e-06, + "loss": -1.8695, + "step": 1296 + }, + { + "completion_length": 252.1666717529297, + "epoch": 0.11528888888888889, + "grad_norm": 220.25823974609375, + "learning_rate": 3e-06, + "loss": -7.0592, + "reward": 1.0625, + "reward_std": 0.39512956142425537, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.375, + "step": 1297, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.11537777777777777, + "grad_norm": 190.90309143066406, + "learning_rate": 3e-06, + "loss": -21.8086, + "step": 1298 + }, + { + "epoch": 0.11546666666666666, + "grad_norm": 167.63180541992188, + "learning_rate": 3e-06, + "loss": -10.4386, + "step": 1299 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 196.72048950195312, + "learning_rate": 3e-06, + "loss": -10.7261, + "step": 1300 + }, + { + "epoch": 0.11564444444444444, + "grad_norm": 243.5116424560547, + "learning_rate": 3e-06, + "loss": -21.7371, + "step": 1301 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 163.6327667236328, + "learning_rate": 3e-06, + "loss": -21.7888, + "step": 1302 + }, + { + "epoch": 0.11582222222222222, + "grad_norm": 217.04978942871094, + "learning_rate": 3e-06, + "loss": -7.9678, + "step": 1303 + }, + { + "epoch": 0.11591111111111112, + "grad_norm": 182.5911865234375, + "learning_rate": 3e-06, + "loss": -22.9758, + "step": 1304 + }, + { + "epoch": 0.116, + "grad_norm": 167.71888732910156, + "learning_rate": 3e-06, + "loss": -12.1331, + "step": 1305 + }, + { + "epoch": 0.1160888888888889, + "grad_norm": 168.7008819580078, + "learning_rate": 3e-06, + "loss": -13.2942, + "step": 1306 + }, + { + "epoch": 0.11617777777777778, + "grad_norm": 210.5468292236328, + "learning_rate": 3e-06, + "loss": -22.5768, + "step": 1307 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 179.2998046875, + "learning_rate": 3e-06, + "loss": -23.8159, + "step": 1308 + }, + { + "completion_length": 254.5416717529297, + "epoch": 0.11635555555555556, + "grad_norm": 219.9465789794922, + "learning_rate": 3e-06, + "loss": 7.1814, + "reward": 1.2916667461395264, + "reward_std": 0.48936043679714203, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5416666716337204, + "step": 1309, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.11644444444444445, + "grad_norm": 224.42604064941406, + "learning_rate": 3e-06, + "loss": 8.354, + "step": 1310 + }, + { + "epoch": 0.11653333333333334, + "grad_norm": 262.8431091308594, + "learning_rate": 3e-06, + "loss": 5.5188, + "step": 1311 + }, + { + "epoch": 0.11662222222222222, + "grad_norm": 446.6980285644531, + "learning_rate": 3e-06, + "loss": 7.2822, + "step": 1312 + }, + { + "epoch": 0.11671111111111111, + "grad_norm": 200.77650451660156, + "learning_rate": 3e-06, + "loss": -0.5339, + "step": 1313 + }, + { + "epoch": 0.1168, + "grad_norm": 230.9583740234375, + "learning_rate": 3e-06, + "loss": -3.6372, + "step": 1314 + }, + { + "epoch": 0.11688888888888889, + "grad_norm": 257.7409362792969, + "learning_rate": 3e-06, + "loss": 5.1944, + "step": 1315 + }, + { + "epoch": 0.11697777777777778, + "grad_norm": 215.5637664794922, + "learning_rate": 3e-06, + "loss": 6.6842, + "step": 1316 + }, + { + "epoch": 0.11706666666666667, + "grad_norm": 228.1314697265625, + "learning_rate": 3e-06, + "loss": 3.1597, + "step": 1317 + }, + { + "epoch": 0.11715555555555555, + "grad_norm": 227.79237365722656, + "learning_rate": 3e-06, + "loss": 6.0615, + "step": 1318 + }, + { + "epoch": 0.11724444444444444, + "grad_norm": 364.67889404296875, + "learning_rate": 3e-06, + "loss": -4.7295, + "step": 1319 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 435.36578369140625, + "learning_rate": 3e-06, + "loss": -7.3331, + "step": 1320 + }, + { + "completion_length": 254.12500762939453, + "epoch": 0.11742222222222222, + "grad_norm": 181.09559631347656, + "learning_rate": 3e-06, + "loss": 9.7639, + "reward": 1.3437500596046448, + "reward_std": 0.3936076909303665, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.6666666865348816, + "step": 1321, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.11751111111111111, + "grad_norm": 184.9253387451172, + "learning_rate": 3e-06, + "loss": -7.8109, + "step": 1322 + }, + { + "epoch": 0.1176, + "grad_norm": 178.65553283691406, + "learning_rate": 3e-06, + "loss": -0.5036, + "step": 1323 + }, + { + "epoch": 0.11768888888888888, + "grad_norm": 172.2812042236328, + "learning_rate": 3e-06, + "loss": -5.0317, + "step": 1324 + }, + { + "epoch": 0.11777777777777777, + "grad_norm": 189.1818084716797, + "learning_rate": 3e-06, + "loss": -5.0896, + "step": 1325 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 195.15562438964844, + "learning_rate": 3e-06, + "loss": -6.7484, + "step": 1326 + }, + { + "epoch": 0.11795555555555555, + "grad_norm": 180.81617736816406, + "learning_rate": 3e-06, + "loss": 6.0087, + "step": 1327 + }, + { + "epoch": 0.11804444444444444, + "grad_norm": 181.2151336669922, + "learning_rate": 3e-06, + "loss": -12.1462, + "step": 1328 + }, + { + "epoch": 0.11813333333333334, + "grad_norm": 232.097900390625, + "learning_rate": 3e-06, + "loss": -6.1373, + "step": 1329 + }, + { + "epoch": 0.11822222222222223, + "grad_norm": 180.55462646484375, + "learning_rate": 3e-06, + "loss": -10.2139, + "step": 1330 + }, + { + "epoch": 0.11831111111111112, + "grad_norm": 192.82818603515625, + "learning_rate": 3e-06, + "loss": -11.1656, + "step": 1331 + }, + { + "epoch": 0.1184, + "grad_norm": 190.95399475097656, + "learning_rate": 3e-06, + "loss": -13.1171, + "step": 1332 + }, + { + "completion_length": 254.81250762939453, + "epoch": 0.1184888888888889, + "grad_norm": 65.9253921508789, + "learning_rate": 3e-06, + "loss": 2.6383, + "reward": 0.9791666865348816, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.2916666679084301, + "step": 1333, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.11857777777777778, + "grad_norm": 66.89152526855469, + "learning_rate": 3e-06, + "loss": 0.0904, + "step": 1334 + }, + { + "epoch": 0.11866666666666667, + "grad_norm": 52.20970916748047, + "learning_rate": 3e-06, + "loss": -2.1923, + "step": 1335 + }, + { + "epoch": 0.11875555555555556, + "grad_norm": 62.94733810424805, + "learning_rate": 3e-06, + "loss": -0.1938, + "step": 1336 + }, + { + "epoch": 0.11884444444444445, + "grad_norm": 74.16914367675781, + "learning_rate": 3e-06, + "loss": -0.0211, + "step": 1337 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 60.86524963378906, + "learning_rate": 3e-06, + "loss": 1.9824, + "step": 1338 + }, + { + "epoch": 0.11902222222222222, + "grad_norm": 68.02703857421875, + "learning_rate": 3e-06, + "loss": 1.8138, + "step": 1339 + }, + { + "epoch": 0.11911111111111111, + "grad_norm": 65.81590270996094, + "learning_rate": 3e-06, + "loss": -0.8801, + "step": 1340 + }, + { + "epoch": 0.1192, + "grad_norm": 49.24789810180664, + "learning_rate": 3e-06, + "loss": -2.9454, + "step": 1341 + }, + { + "epoch": 0.11928888888888889, + "grad_norm": 66.03591918945312, + "learning_rate": 3e-06, + "loss": -2.1792, + "step": 1342 + }, + { + "epoch": 0.11937777777777778, + "grad_norm": 68.27337646484375, + "learning_rate": 3e-06, + "loss": -1.5692, + "step": 1343 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 76.02365112304688, + "learning_rate": 3e-06, + "loss": 1.3993, + "step": 1344 + }, + { + "completion_length": 242.02084350585938, + "epoch": 0.11955555555555555, + "grad_norm": 159.37770080566406, + "learning_rate": 3e-06, + "loss": -16.5166, + "reward": 1.3125, + "reward_std": 0.43528567254543304, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.625, + "step": 1345, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.11964444444444444, + "grad_norm": 156.10609436035156, + "learning_rate": 3e-06, + "loss": -19.2254, + "step": 1346 + }, + { + "epoch": 0.11973333333333333, + "grad_norm": 159.9058074951172, + "learning_rate": 3e-06, + "loss": -21.4448, + "step": 1347 + }, + { + "epoch": 0.11982222222222222, + "grad_norm": 217.29722595214844, + "learning_rate": 3e-06, + "loss": -22.2701, + "step": 1348 + }, + { + "epoch": 0.11991111111111111, + "grad_norm": 220.8108673095703, + "learning_rate": 3e-06, + "loss": -9.9609, + "step": 1349 + }, + { + "epoch": 0.12, + "grad_norm": 255.4961395263672, + "learning_rate": 3e-06, + "loss": -10.7417, + "step": 1350 + }, + { + "epoch": 0.12008888888888888, + "grad_norm": 159.0548858642578, + "learning_rate": 3e-06, + "loss": -18.3206, + "step": 1351 + }, + { + "epoch": 0.12017777777777777, + "grad_norm": 151.7304229736328, + "learning_rate": 3e-06, + "loss": -21.7915, + "step": 1352 + }, + { + "epoch": 0.12026666666666666, + "grad_norm": 162.3264923095703, + "learning_rate": 3e-06, + "loss": -24.2052, + "step": 1353 + }, + { + "epoch": 0.12035555555555555, + "grad_norm": 188.41310119628906, + "learning_rate": 3e-06, + "loss": -26.0542, + "step": 1354 + }, + { + "epoch": 0.12044444444444445, + "grad_norm": 182.1976776123047, + "learning_rate": 3e-06, + "loss": -14.6291, + "step": 1355 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 227.4834747314453, + "learning_rate": 3e-06, + "loss": -16.78, + "step": 1356 + }, + { + "completion_length": 236.93750762939453, + "epoch": 0.12062222222222223, + "grad_norm": 244.82566833496094, + "learning_rate": 3e-06, + "loss": -29.5738, + "reward": 2.125, + "reward_std": 0.39512956142425537, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.375, + "step": 1357, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.12071111111111112, + "grad_norm": 560.8680419921875, + "learning_rate": 3e-06, + "loss": -35.2668, + "step": 1358 + }, + { + "epoch": 0.1208, + "grad_norm": 303.6029968261719, + "learning_rate": 3e-06, + "loss": -21.3551, + "step": 1359 + }, + { + "epoch": 0.12088888888888889, + "grad_norm": 247.47055053710938, + "learning_rate": 3e-06, + "loss": -30.3557, + "step": 1360 + }, + { + "epoch": 0.12097777777777778, + "grad_norm": 311.3307189941406, + "learning_rate": 3e-06, + "loss": -31.387, + "step": 1361 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 296.8590087890625, + "learning_rate": 3e-06, + "loss": -33.6326, + "step": 1362 + }, + { + "epoch": 0.12115555555555556, + "grad_norm": 253.0756378173828, + "learning_rate": 3e-06, + "loss": -31.4888, + "step": 1363 + }, + { + "epoch": 0.12124444444444445, + "grad_norm": 293.0926513671875, + "learning_rate": 3e-06, + "loss": -38.9989, + "step": 1364 + }, + { + "epoch": 0.12133333333333333, + "grad_norm": 318.564208984375, + "learning_rate": 3e-06, + "loss": -25.8663, + "step": 1365 + }, + { + "epoch": 0.12142222222222222, + "grad_norm": 262.5808410644531, + "learning_rate": 3e-06, + "loss": -33.3059, + "step": 1366 + }, + { + "epoch": 0.12151111111111111, + "grad_norm": 297.9688720703125, + "learning_rate": 3e-06, + "loss": -36.1247, + "step": 1367 + }, + { + "epoch": 0.1216, + "grad_norm": 319.11968994140625, + "learning_rate": 3e-06, + "loss": -37.8042, + "step": 1368 + }, + { + "completion_length": 241.52083587646484, + "epoch": 0.12168888888888889, + "grad_norm": 149.28683471679688, + "learning_rate": 3e-06, + "loss": -5.2185, + "reward": 0.979166716337204, + "reward_std": 0.306186206638813, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.2916666641831398, + "step": 1369, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.12177777777777778, + "grad_norm": 234.47120666503906, + "learning_rate": 3e-06, + "loss": -0.5601, + "step": 1370 + }, + { + "epoch": 0.12186666666666666, + "grad_norm": 168.30697631835938, + "learning_rate": 3e-06, + "loss": -6.9666, + "step": 1371 + }, + { + "epoch": 0.12195555555555555, + "grad_norm": 304.0892333984375, + "learning_rate": 3e-06, + "loss": -11.5806, + "step": 1372 + }, + { + "epoch": 0.12204444444444444, + "grad_norm": 210.6204071044922, + "learning_rate": 3e-06, + "loss": 3.7489, + "step": 1373 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 227.05795288085938, + "learning_rate": 3e-06, + "loss": -19.6276, + "step": 1374 + }, + { + "epoch": 0.12222222222222222, + "grad_norm": 148.8232879638672, + "learning_rate": 3e-06, + "loss": -8.3126, + "step": 1375 + }, + { + "epoch": 0.1223111111111111, + "grad_norm": 216.28646850585938, + "learning_rate": 3e-06, + "loss": -4.662, + "step": 1376 + }, + { + "epoch": 0.1224, + "grad_norm": 196.22518920898438, + "learning_rate": 3e-06, + "loss": -10.8248, + "step": 1377 + }, + { + "epoch": 0.12248888888888888, + "grad_norm": 238.0521697998047, + "learning_rate": 3e-06, + "loss": -16.9448, + "step": 1378 + }, + { + "epoch": 0.12257777777777777, + "grad_norm": 195.62860107421875, + "learning_rate": 3e-06, + "loss": -0.2301, + "step": 1379 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 201.47740173339844, + "learning_rate": 3e-06, + "loss": -26.1931, + "step": 1380 + }, + { + "completion_length": 223.375, + "epoch": 0.12275555555555556, + "grad_norm": 340.7679443359375, + "learning_rate": 3e-06, + "loss": -35.7739, + "reward": 1.5000000596046448, + "reward_std": 0.4779854714870453, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.75, + "step": 1381, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.12284444444444445, + "grad_norm": 357.4112548828125, + "learning_rate": 3e-06, + "loss": -25.5447, + "step": 1382 + }, + { + "epoch": 0.12293333333333334, + "grad_norm": 355.23419189453125, + "learning_rate": 3e-06, + "loss": -29.5073, + "step": 1383 + }, + { + "epoch": 0.12302222222222223, + "grad_norm": 362.6241455078125, + "learning_rate": 3e-06, + "loss": -15.9052, + "step": 1384 + }, + { + "epoch": 0.12311111111111112, + "grad_norm": 388.32904052734375, + "learning_rate": 3e-06, + "loss": -16.7547, + "step": 1385 + }, + { + "epoch": 0.1232, + "grad_norm": 398.5157775878906, + "learning_rate": 3e-06, + "loss": -32.0836, + "step": 1386 + }, + { + "epoch": 0.12328888888888889, + "grad_norm": 256.07763671875, + "learning_rate": 3e-06, + "loss": -36.567, + "step": 1387 + }, + { + "epoch": 0.12337777777777778, + "grad_norm": 273.55108642578125, + "learning_rate": 3e-06, + "loss": -32.0325, + "step": 1388 + }, + { + "epoch": 0.12346666666666667, + "grad_norm": 298.35675048828125, + "learning_rate": 3e-06, + "loss": -33.9138, + "step": 1389 + }, + { + "epoch": 0.12355555555555556, + "grad_norm": 319.2604064941406, + "learning_rate": 3e-06, + "loss": -22.5196, + "step": 1390 + }, + { + "epoch": 0.12364444444444445, + "grad_norm": 321.7362976074219, + "learning_rate": 3e-06, + "loss": -22.5907, + "step": 1391 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 289.5376281738281, + "learning_rate": 3e-06, + "loss": -39.4107, + "step": 1392 + }, + { + "completion_length": 239.6041717529297, + "epoch": 0.12382222222222222, + "grad_norm": 189.32923889160156, + "learning_rate": 3e-06, + "loss": 33.0773, + "reward": 1.3541666865348816, + "reward_std": 0.23899273574352264, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.6666666567325592, + "step": 1393, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.12391111111111111, + "grad_norm": 280.063720703125, + "learning_rate": 3e-06, + "loss": 28.1537, + "step": 1394 + }, + { + "epoch": 0.124, + "grad_norm": 239.7598114013672, + "learning_rate": 3e-06, + "loss": 30.4262, + "step": 1395 + }, + { + "epoch": 0.12408888888888889, + "grad_norm": 230.4679718017578, + "learning_rate": 3e-06, + "loss": 26.9258, + "step": 1396 + }, + { + "epoch": 0.12417777777777778, + "grad_norm": 241.6261444091797, + "learning_rate": 3e-06, + "loss": 39.8305, + "step": 1397 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 305.56097412109375, + "learning_rate": 3e-06, + "loss": 29.9909, + "step": 1398 + }, + { + "epoch": 0.12435555555555555, + "grad_norm": 190.80581665039062, + "learning_rate": 3e-06, + "loss": 28.9829, + "step": 1399 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 272.18365478515625, + "learning_rate": 3e-06, + "loss": 21.7391, + "step": 1400 + }, + { + "epoch": 0.12453333333333333, + "grad_norm": 236.96322631835938, + "learning_rate": 3e-06, + "loss": 24.1388, + "step": 1401 + }, + { + "epoch": 0.12462222222222222, + "grad_norm": 257.829833984375, + "learning_rate": 3e-06, + "loss": 19.7315, + "step": 1402 + }, + { + "epoch": 0.1247111111111111, + "grad_norm": 237.62989807128906, + "learning_rate": 3e-06, + "loss": 32.9331, + "step": 1403 + }, + { + "epoch": 0.1248, + "grad_norm": 299.9111633300781, + "learning_rate": 3e-06, + "loss": 20.7263, + "step": 1404 + }, + { + "completion_length": 253.62500762939453, + "epoch": 0.12488888888888888, + "grad_norm": 351.87091064453125, + "learning_rate": 3e-06, + "loss": 1.3314, + "reward": 1.5416666865348816, + "reward_std": 0.48936042189598083, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7916666716337204, + "step": 1405, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.12497777777777777, + "grad_norm": 318.410888671875, + "learning_rate": 3e-06, + "loss": -6.872, + "step": 1406 + }, + { + "epoch": 0.12506666666666666, + "grad_norm": 296.7760925292969, + "learning_rate": 3e-06, + "loss": 7.1205, + "step": 1407 + }, + { + "epoch": 0.12515555555555555, + "grad_norm": 264.91400146484375, + "learning_rate": 3e-06, + "loss": 18.3195, + "step": 1408 + }, + { + "epoch": 0.12524444444444444, + "grad_norm": 309.8560485839844, + "learning_rate": 3e-06, + "loss": -9.7835, + "step": 1409 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 343.082763671875, + "learning_rate": 3e-06, + "loss": 8.5194, + "step": 1410 + }, + { + "epoch": 0.1254222222222222, + "grad_norm": 349.1229248046875, + "learning_rate": 3e-06, + "loss": -2.322, + "step": 1411 + }, + { + "epoch": 0.1255111111111111, + "grad_norm": 343.0054626464844, + "learning_rate": 3e-06, + "loss": -10.4852, + "step": 1412 + }, + { + "epoch": 0.1256, + "grad_norm": 293.6500549316406, + "learning_rate": 3e-06, + "loss": 3.7367, + "step": 1413 + }, + { + "epoch": 0.12568888888888888, + "grad_norm": 244.24459838867188, + "learning_rate": 3e-06, + "loss": 12.8548, + "step": 1414 + }, + { + "epoch": 0.12577777777777777, + "grad_norm": 308.9840393066406, + "learning_rate": 3e-06, + "loss": -14.9646, + "step": 1415 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 302.1766052246094, + "learning_rate": 3e-06, + "loss": 4.8047, + "step": 1416 + }, + { + "completion_length": 249.6666717529297, + "epoch": 0.12595555555555554, + "grad_norm": 189.9821319580078, + "learning_rate": 3e-06, + "loss": -21.868, + "reward": 1.4583333730697632, + "reward_std": 0.4701542258262634, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7083333134651184, + "step": 1417, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.12604444444444443, + "grad_norm": 230.71632385253906, + "learning_rate": 3e-06, + "loss": -6.3819, + "step": 1418 + }, + { + "epoch": 0.12613333333333332, + "grad_norm": 245.25421142578125, + "learning_rate": 3e-06, + "loss": 10.5483, + "step": 1419 + }, + { + "epoch": 0.12622222222222224, + "grad_norm": 153.20816040039062, + "learning_rate": 3e-06, + "loss": -8.6118, + "step": 1420 + }, + { + "epoch": 0.12631111111111112, + "grad_norm": 167.58921813964844, + "learning_rate": 3e-06, + "loss": -8.8097, + "step": 1421 + }, + { + "epoch": 0.1264, + "grad_norm": 190.3168182373047, + "learning_rate": 3e-06, + "loss": -14.9707, + "step": 1422 + }, + { + "epoch": 0.1264888888888889, + "grad_norm": 191.2883758544922, + "learning_rate": 3e-06, + "loss": -23.6776, + "step": 1423 + }, + { + "epoch": 0.1265777777777778, + "grad_norm": 223.39498901367188, + "learning_rate": 3e-06, + "loss": -8.999, + "step": 1424 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 228.12818908691406, + "learning_rate": 3e-06, + "loss": 7.6836, + "step": 1425 + }, + { + "epoch": 0.12675555555555557, + "grad_norm": 165.7634735107422, + "learning_rate": 3e-06, + "loss": -12.3289, + "step": 1426 + }, + { + "epoch": 0.12684444444444445, + "grad_norm": 169.15396118164062, + "learning_rate": 3e-06, + "loss": -13.59, + "step": 1427 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 204.25228881835938, + "learning_rate": 3e-06, + "loss": -19.9151, + "step": 1428 + }, + { + "completion_length": 228.02083587646484, + "epoch": 0.12702222222222223, + "grad_norm": 167.8898468017578, + "learning_rate": 3e-06, + "loss": 7.6334, + "reward": 1.2083333432674408, + "reward_std": 0.3602609783411026, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.4583333358168602, + "step": 1429, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.12711111111111112, + "grad_norm": 224.77040100097656, + "learning_rate": 3e-06, + "loss": -1.7483, + "step": 1430 + }, + { + "epoch": 0.1272, + "grad_norm": 220.69076538085938, + "learning_rate": 3e-06, + "loss": 14.763, + "step": 1431 + }, + { + "epoch": 0.1272888888888889, + "grad_norm": 150.462646484375, + "learning_rate": 3e-06, + "loss": 17.6324, + "step": 1432 + }, + { + "epoch": 0.12737777777777778, + "grad_norm": 203.99217224121094, + "learning_rate": 3e-06, + "loss": 17.7821, + "step": 1433 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 188.85665893554688, + "learning_rate": 3e-06, + "loss": 13.1598, + "step": 1434 + }, + { + "epoch": 0.12755555555555556, + "grad_norm": 155.11329650878906, + "learning_rate": 3e-06, + "loss": 6.2584, + "step": 1435 + }, + { + "epoch": 0.12764444444444445, + "grad_norm": 205.92800903320312, + "learning_rate": 3e-06, + "loss": -3.0899, + "step": 1436 + }, + { + "epoch": 0.12773333333333334, + "grad_norm": 201.03298950195312, + "learning_rate": 3e-06, + "loss": 12.9818, + "step": 1437 + }, + { + "epoch": 0.12782222222222223, + "grad_norm": 167.153076171875, + "learning_rate": 3e-06, + "loss": 15.1051, + "step": 1438 + }, + { + "epoch": 0.12791111111111111, + "grad_norm": 223.22909545898438, + "learning_rate": 3e-06, + "loss": 14.2815, + "step": 1439 + }, + { + "epoch": 0.128, + "grad_norm": 173.97694396972656, + "learning_rate": 3e-06, + "loss": 9.4034, + "step": 1440 + }, + { + "completion_length": 239.5416717529297, + "epoch": 0.1280888888888889, + "grad_norm": 299.9499816894531, + "learning_rate": 3e-06, + "loss": -27.7606, + "reward": 1.5208333730697632, + "reward_std": 0.3680921643972397, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8333333432674408, + "step": 1441, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.12817777777777778, + "grad_norm": 236.49612426757812, + "learning_rate": 3e-06, + "loss": -27.7132, + "step": 1442 + }, + { + "epoch": 0.12826666666666667, + "grad_norm": 186.70510864257812, + "learning_rate": 3e-06, + "loss": -28.2186, + "step": 1443 + }, + { + "epoch": 0.12835555555555556, + "grad_norm": 294.79656982421875, + "learning_rate": 3e-06, + "loss": -18.5453, + "step": 1444 + }, + { + "epoch": 0.12844444444444444, + "grad_norm": 257.8788146972656, + "learning_rate": 3e-06, + "loss": -26.3342, + "step": 1445 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 280.1625061035156, + "learning_rate": 3e-06, + "loss": -37.5225, + "step": 1446 + }, + { + "epoch": 0.12862222222222222, + "grad_norm": 257.7731018066406, + "learning_rate": 3e-06, + "loss": -31.1507, + "step": 1447 + }, + { + "epoch": 0.1287111111111111, + "grad_norm": 221.82879638671875, + "learning_rate": 3e-06, + "loss": -30.7869, + "step": 1448 + }, + { + "epoch": 0.1288, + "grad_norm": 227.20188903808594, + "learning_rate": 3e-06, + "loss": -31.8637, + "step": 1449 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 319.30633544921875, + "learning_rate": 3e-06, + "loss": -24.0296, + "step": 1450 + }, + { + "epoch": 0.12897777777777777, + "grad_norm": 206.82269287109375, + "learning_rate": 3e-06, + "loss": -28.7864, + "step": 1451 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 196.76171875, + "learning_rate": 3e-06, + "loss": -42.0569, + "step": 1452 + }, + { + "completion_length": 248.1666717529297, + "epoch": 0.12915555555555555, + "grad_norm": 210.2032928466797, + "learning_rate": 3e-06, + "loss": -15.8435, + "reward": 1.5000000596046448, + "reward_std": 0.3680921941995621, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.75, + "step": 1453, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.12924444444444444, + "grad_norm": 209.77688598632812, + "learning_rate": 3e-06, + "loss": -14.2805, + "step": 1454 + }, + { + "epoch": 0.12933333333333333, + "grad_norm": 662.65966796875, + "learning_rate": 3e-06, + "loss": -15.5908, + "step": 1455 + }, + { + "epoch": 0.12942222222222222, + "grad_norm": 176.72958374023438, + "learning_rate": 3e-06, + "loss": -15.1466, + "step": 1456 + }, + { + "epoch": 0.1295111111111111, + "grad_norm": 224.30841064453125, + "learning_rate": 3e-06, + "loss": -23.5471, + "step": 1457 + }, + { + "epoch": 0.1296, + "grad_norm": 187.04263305664062, + "learning_rate": 3e-06, + "loss": -15.5373, + "step": 1458 + }, + { + "epoch": 0.12968888888888888, + "grad_norm": 216.18629455566406, + "learning_rate": 3e-06, + "loss": -19.5843, + "step": 1459 + }, + { + "epoch": 0.12977777777777777, + "grad_norm": 204.7811279296875, + "learning_rate": 3e-06, + "loss": -17.6737, + "step": 1460 + }, + { + "epoch": 0.12986666666666666, + "grad_norm": 357.2877197265625, + "learning_rate": 3e-06, + "loss": -19.0236, + "step": 1461 + }, + { + "epoch": 0.12995555555555555, + "grad_norm": 173.1217803955078, + "learning_rate": 3e-06, + "loss": -18.2209, + "step": 1462 + }, + { + "epoch": 0.13004444444444443, + "grad_norm": 202.11126708984375, + "learning_rate": 3e-06, + "loss": -26.1672, + "step": 1463 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 215.072265625, + "learning_rate": 3e-06, + "loss": -18.211, + "step": 1464 + }, + { + "completion_length": 248.45833587646484, + "epoch": 0.1302222222222222, + "grad_norm": 189.24551391601562, + "learning_rate": 3e-06, + "loss": -26.8632, + "reward": 1.1041666865348816, + "reward_std": 0.23899272084236145, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.4166666716337204, + "step": 1465, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.1303111111111111, + "grad_norm": 150.01397705078125, + "learning_rate": 3e-06, + "loss": -36.2487, + "step": 1466 + }, + { + "epoch": 0.1304, + "grad_norm": 239.17184448242188, + "learning_rate": 3e-06, + "loss": -32.3168, + "step": 1467 + }, + { + "epoch": 0.13048888888888888, + "grad_norm": 214.16525268554688, + "learning_rate": 3e-06, + "loss": -36.6605, + "step": 1468 + }, + { + "epoch": 0.13057777777777776, + "grad_norm": 200.94650268554688, + "learning_rate": 3e-06, + "loss": -23.324, + "step": 1469 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 202.95838928222656, + "learning_rate": 3e-06, + "loss": -34.8067, + "step": 1470 + }, + { + "epoch": 0.13075555555555557, + "grad_norm": 147.8043670654297, + "learning_rate": 3e-06, + "loss": -29.223, + "step": 1471 + }, + { + "epoch": 0.13084444444444446, + "grad_norm": 137.1348114013672, + "learning_rate": 3e-06, + "loss": -38.384, + "step": 1472 + }, + { + "epoch": 0.13093333333333335, + "grad_norm": 198.6035614013672, + "learning_rate": 3e-06, + "loss": -35.8388, + "step": 1473 + }, + { + "epoch": 0.13102222222222223, + "grad_norm": 225.8579864501953, + "learning_rate": 3e-06, + "loss": -39.2579, + "step": 1474 + }, + { + "epoch": 0.13111111111111112, + "grad_norm": 193.4779052734375, + "learning_rate": 3e-06, + "loss": -25.8384, + "step": 1475 + }, + { + "epoch": 0.1312, + "grad_norm": 188.15464782714844, + "learning_rate": 3e-06, + "loss": -37.6788, + "step": 1476 + }, + { + "completion_length": 236.9375, + "epoch": 0.1312888888888889, + "grad_norm": 433.47833251953125, + "learning_rate": 3e-06, + "loss": -44.9082, + "reward": 1.9166667461395264, + "reward_std": 0.6611596345901489, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.1666666865348816, + "step": 1477, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.1313777777777778, + "grad_norm": 327.1938171386719, + "learning_rate": 3e-06, + "loss": -49.2004, + "step": 1478 + }, + { + "epoch": 0.13146666666666668, + "grad_norm": 295.2423095703125, + "learning_rate": 3e-06, + "loss": -47.3065, + "step": 1479 + }, + { + "epoch": 0.13155555555555556, + "grad_norm": 288.9835510253906, + "learning_rate": 3e-06, + "loss": -35.3959, + "step": 1480 + }, + { + "epoch": 0.13164444444444445, + "grad_norm": 310.02056884765625, + "learning_rate": 3e-06, + "loss": -36.4699, + "step": 1481 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 288.0977478027344, + "learning_rate": 3e-06, + "loss": -36.6703, + "step": 1482 + }, + { + "epoch": 0.13182222222222223, + "grad_norm": 266.6463317871094, + "learning_rate": 3e-06, + "loss": -47.7283, + "step": 1483 + }, + { + "epoch": 0.13191111111111112, + "grad_norm": 405.67071533203125, + "learning_rate": 3e-06, + "loss": -51.2972, + "step": 1484 + }, + { + "epoch": 0.132, + "grad_norm": 401.3482360839844, + "learning_rate": 3e-06, + "loss": -52.0379, + "step": 1485 + }, + { + "epoch": 0.1320888888888889, + "grad_norm": 280.8495788574219, + "learning_rate": 3e-06, + "loss": -39.6706, + "step": 1486 + }, + { + "epoch": 0.13217777777777778, + "grad_norm": 329.5269775390625, + "learning_rate": 3e-06, + "loss": -41.6526, + "step": 1487 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 335.015380859375, + "learning_rate": 3e-06, + "loss": -42.414, + "step": 1488 + }, + { + "completion_length": 244.70833587646484, + "epoch": 0.13235555555555556, + "grad_norm": 365.8779602050781, + "learning_rate": 3e-06, + "loss": 0.0487, + "reward": 1.1875, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.625, + "step": 1489, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.13244444444444445, + "grad_norm": 270.31494140625, + "learning_rate": 3e-06, + "loss": -3.1951, + "step": 1490 + }, + { + "epoch": 0.13253333333333334, + "grad_norm": 350.0372619628906, + "learning_rate": 3e-06, + "loss": 2.4081, + "step": 1491 + }, + { + "epoch": 0.13262222222222222, + "grad_norm": 297.7118835449219, + "learning_rate": 3e-06, + "loss": 1.5965, + "step": 1492 + }, + { + "epoch": 0.1327111111111111, + "grad_norm": 251.00436401367188, + "learning_rate": 3e-06, + "loss": -8.6016, + "step": 1493 + }, + { + "epoch": 0.1328, + "grad_norm": 364.3514709472656, + "learning_rate": 3e-06, + "loss": 5.7272, + "step": 1494 + }, + { + "epoch": 0.1328888888888889, + "grad_norm": 327.8075256347656, + "learning_rate": 3e-06, + "loss": -6.1829, + "step": 1495 + }, + { + "epoch": 0.13297777777777778, + "grad_norm": 816.3570556640625, + "learning_rate": 3e-06, + "loss": -10.451, + "step": 1496 + }, + { + "epoch": 0.13306666666666667, + "grad_norm": 319.318115234375, + "learning_rate": 3e-06, + "loss": -7.2984, + "step": 1497 + }, + { + "epoch": 0.13315555555555555, + "grad_norm": 306.3028259277344, + "learning_rate": 3e-06, + "loss": -7.2124, + "step": 1498 + }, + { + "epoch": 0.13324444444444444, + "grad_norm": 194.4175567626953, + "learning_rate": 3e-06, + "loss": -14.7213, + "step": 1499 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 289.50408935546875, + "learning_rate": 3e-06, + "loss": -8.9605, + "step": 1500 + }, + { + "completion_length": 245.8541717529297, + "epoch": 0.13342222222222222, + "grad_norm": 177.177734375, + "learning_rate": 3e-06, + "loss": -19.9228, + "reward": 1.0000000596046448, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.375, + "step": 1501, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.1335111111111111, + "grad_norm": 256.49334716796875, + "learning_rate": 3e-06, + "loss": -12.3773, + "step": 1502 + }, + { + "epoch": 0.1336, + "grad_norm": 200.69879150390625, + "learning_rate": 3e-06, + "loss": -12.0433, + "step": 1503 + }, + { + "epoch": 0.13368888888888888, + "grad_norm": 195.25538635253906, + "learning_rate": 3e-06, + "loss": -5.3936, + "step": 1504 + }, + { + "epoch": 0.13377777777777777, + "grad_norm": 179.9781036376953, + "learning_rate": 3e-06, + "loss": -11.4454, + "step": 1505 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 255.14865112304688, + "learning_rate": 3e-06, + "loss": -14.0628, + "step": 1506 + }, + { + "epoch": 0.13395555555555555, + "grad_norm": 185.94732666015625, + "learning_rate": 3e-06, + "loss": -21.484, + "step": 1507 + }, + { + "epoch": 0.13404444444444444, + "grad_norm": 233.72573852539062, + "learning_rate": 3e-06, + "loss": -15.3775, + "step": 1508 + }, + { + "epoch": 0.13413333333333333, + "grad_norm": 329.163818359375, + "learning_rate": 3e-06, + "loss": -14.9938, + "step": 1509 + }, + { + "epoch": 0.13422222222222221, + "grad_norm": 199.61465454101562, + "learning_rate": 3e-06, + "loss": -7.8012, + "step": 1510 + }, + { + "epoch": 0.1343111111111111, + "grad_norm": 168.71255493164062, + "learning_rate": 3e-06, + "loss": -14.2907, + "step": 1511 + }, + { + "epoch": 0.1344, + "grad_norm": 246.87896728515625, + "learning_rate": 3e-06, + "loss": -15.4667, + "step": 1512 + }, + { + "completion_length": 240.18750762939453, + "epoch": 0.13448888888888888, + "grad_norm": 273.19427490234375, + "learning_rate": 3e-06, + "loss": -30.8428, + "reward": 1.5625, + "reward_std": 0.3410547822713852, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.875, + "step": 1513, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.13457777777777777, + "grad_norm": 198.8998565673828, + "learning_rate": 3e-06, + "loss": -36.7035, + "step": 1514 + }, + { + "epoch": 0.13466666666666666, + "grad_norm": 266.1399230957031, + "learning_rate": 3e-06, + "loss": -29.4233, + "step": 1515 + }, + { + "epoch": 0.13475555555555555, + "grad_norm": 270.2858581542969, + "learning_rate": 3e-06, + "loss": -36.6868, + "step": 1516 + }, + { + "epoch": 0.13484444444444443, + "grad_norm": 190.39768981933594, + "learning_rate": 3e-06, + "loss": -35.0192, + "step": 1517 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 319.56494140625, + "learning_rate": 3e-06, + "loss": -35.0648, + "step": 1518 + }, + { + "epoch": 0.1350222222222222, + "grad_norm": 224.65713500976562, + "learning_rate": 3e-06, + "loss": -35.3229, + "step": 1519 + }, + { + "epoch": 0.1351111111111111, + "grad_norm": 188.5618438720703, + "learning_rate": 3e-06, + "loss": -41.7633, + "step": 1520 + }, + { + "epoch": 0.1352, + "grad_norm": 334.20281982421875, + "learning_rate": 3e-06, + "loss": -34.6749, + "step": 1521 + }, + { + "epoch": 0.13528888888888888, + "grad_norm": 232.7653350830078, + "learning_rate": 3e-06, + "loss": -42.387, + "step": 1522 + }, + { + "epoch": 0.1353777777777778, + "grad_norm": 179.99102783203125, + "learning_rate": 3e-06, + "loss": -40.5993, + "step": 1523 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 266.2838134765625, + "learning_rate": 3e-06, + "loss": -43.1418, + "step": 1524 + }, + { + "completion_length": 251.06250762939453, + "epoch": 0.13555555555555557, + "grad_norm": 379.1654052734375, + "learning_rate": 3e-06, + "loss": 5.5695, + "reward": 1.0520833730697632, + "reward_std": 0.5148759335279465, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.375, + "step": 1525, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.13564444444444446, + "grad_norm": 382.55987548828125, + "learning_rate": 3e-06, + "loss": -2.5248, + "step": 1526 + }, + { + "epoch": 0.13573333333333334, + "grad_norm": 535.1154174804688, + "learning_rate": 3e-06, + "loss": 2.5235, + "step": 1527 + }, + { + "epoch": 0.13582222222222223, + "grad_norm": 382.97515869140625, + "learning_rate": 3e-06, + "loss": 25.0744, + "step": 1528 + }, + { + "epoch": 0.13591111111111112, + "grad_norm": 417.135009765625, + "learning_rate": 3e-06, + "loss": -7.6026, + "step": 1529 + }, + { + "epoch": 0.136, + "grad_norm": 452.19580078125, + "learning_rate": 3e-06, + "loss": -7.3517, + "step": 1530 + }, + { + "epoch": 0.1360888888888889, + "grad_norm": 404.5818176269531, + "learning_rate": 3e-06, + "loss": 4.4363, + "step": 1531 + }, + { + "epoch": 0.1361777777777778, + "grad_norm": 378.475341796875, + "learning_rate": 3e-06, + "loss": -7.4401, + "step": 1532 + }, + { + "epoch": 0.13626666666666667, + "grad_norm": 446.7852478027344, + "learning_rate": 3e-06, + "loss": -2.1065, + "step": 1533 + }, + { + "epoch": 0.13635555555555556, + "grad_norm": 313.30340576171875, + "learning_rate": 3e-06, + "loss": 20.5916, + "step": 1534 + }, + { + "epoch": 0.13644444444444445, + "grad_norm": 385.8117370605469, + "learning_rate": 3e-06, + "loss": -12.1391, + "step": 1535 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 381.8157653808594, + "learning_rate": 3e-06, + "loss": -10.1047, + "step": 1536 + }, + { + "completion_length": 251.9166717529297, + "epoch": 0.13662222222222223, + "grad_norm": 151.30177307128906, + "learning_rate": 3e-06, + "loss": 7.8283, + "reward": 1.5, + "reward_std": 0.1369306445121765, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.875, + "step": 1537, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.13671111111111112, + "grad_norm": 152.4805908203125, + "learning_rate": 3e-06, + "loss": 2.9257, + "step": 1538 + }, + { + "epoch": 0.1368, + "grad_norm": 143.77023315429688, + "learning_rate": 3e-06, + "loss": 1.5204, + "step": 1539 + }, + { + "epoch": 0.1368888888888889, + "grad_norm": 134.1708526611328, + "learning_rate": 3e-06, + "loss": 2.3719, + "step": 1540 + }, + { + "epoch": 0.13697777777777778, + "grad_norm": 152.87826538085938, + "learning_rate": 3e-06, + "loss": 4.1701, + "step": 1541 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 140.14395141601562, + "learning_rate": 3e-06, + "loss": 5.3651, + "step": 1542 + }, + { + "epoch": 0.13715555555555556, + "grad_norm": 126.84275817871094, + "learning_rate": 3e-06, + "loss": 7.1642, + "step": 1543 + }, + { + "epoch": 0.13724444444444445, + "grad_norm": 168.67564392089844, + "learning_rate": 3e-06, + "loss": -0.2961, + "step": 1544 + }, + { + "epoch": 0.13733333333333334, + "grad_norm": 163.78794860839844, + "learning_rate": 3e-06, + "loss": -0.8864, + "step": 1545 + }, + { + "epoch": 0.13742222222222222, + "grad_norm": 125.58057403564453, + "learning_rate": 3e-06, + "loss": -0.0556, + "step": 1546 + }, + { + "epoch": 0.1375111111111111, + "grad_norm": 173.51966857910156, + "learning_rate": 3e-06, + "loss": 3.8643, + "step": 1547 + }, + { + "epoch": 0.1376, + "grad_norm": 120.23555755615234, + "learning_rate": 3e-06, + "loss": 1.9312, + "step": 1548 + }, + { + "completion_length": 254.1041717529297, + "epoch": 0.1376888888888889, + "grad_norm": 355.8426208496094, + "learning_rate": 3e-06, + "loss": -6.5178, + "reward": 1.1250000596046448, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.375, + "step": 1549, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 321.2608947753906, + "learning_rate": 3e-06, + "loss": -6.46, + "step": 1550 + }, + { + "epoch": 0.13786666666666667, + "grad_norm": 220.72544860839844, + "learning_rate": 3e-06, + "loss": -3.6261, + "step": 1551 + }, + { + "epoch": 0.13795555555555555, + "grad_norm": 475.98944091796875, + "learning_rate": 3e-06, + "loss": -20.6756, + "step": 1552 + }, + { + "epoch": 0.13804444444444444, + "grad_norm": 345.5748291015625, + "learning_rate": 3e-06, + "loss": -3.3905, + "step": 1553 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 285.5007629394531, + "learning_rate": 3e-06, + "loss": -2.0237, + "step": 1554 + }, + { + "epoch": 0.13822222222222222, + "grad_norm": 395.1439514160156, + "learning_rate": 3e-06, + "loss": -8.0344, + "step": 1555 + }, + { + "epoch": 0.1383111111111111, + "grad_norm": 300.19091796875, + "learning_rate": 3e-06, + "loss": -9.601, + "step": 1556 + }, + { + "epoch": 0.1384, + "grad_norm": 285.38763427734375, + "learning_rate": 3e-06, + "loss": -7.3983, + "step": 1557 + }, + { + "epoch": 0.13848888888888888, + "grad_norm": 463.34136962890625, + "learning_rate": 3e-06, + "loss": -25.5177, + "step": 1558 + }, + { + "epoch": 0.13857777777777777, + "grad_norm": 298.12005615234375, + "learning_rate": 3e-06, + "loss": -4.867, + "step": 1559 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 274.00836181640625, + "learning_rate": 3e-06, + "loss": -3.8962, + "step": 1560 + }, + { + "completion_length": 242.33334350585938, + "epoch": 0.13875555555555555, + "grad_norm": 310.84564208984375, + "learning_rate": 3e-06, + "loss": -16.4582, + "reward": 1.541666716337204, + "reward_std": 0.3061862289905548, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7916666492819786, + "step": 1561, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.13884444444444444, + "grad_norm": 398.2083435058594, + "learning_rate": 3e-06, + "loss": -20.4413, + "step": 1562 + }, + { + "epoch": 0.13893333333333333, + "grad_norm": 378.9441833496094, + "learning_rate": 3e-06, + "loss": -16.1798, + "step": 1563 + }, + { + "epoch": 0.1390222222222222, + "grad_norm": 350.91192626953125, + "learning_rate": 3e-06, + "loss": -20.2111, + "step": 1564 + }, + { + "epoch": 0.1391111111111111, + "grad_norm": 365.54754638671875, + "learning_rate": 3e-06, + "loss": -17.1017, + "step": 1565 + }, + { + "epoch": 0.1392, + "grad_norm": 551.0444946289062, + "learning_rate": 3e-06, + "loss": -29.4716, + "step": 1566 + }, + { + "epoch": 0.13928888888888888, + "grad_norm": 320.9019775390625, + "learning_rate": 3e-06, + "loss": -20.3973, + "step": 1567 + }, + { + "epoch": 0.13937777777777777, + "grad_norm": 456.51190185546875, + "learning_rate": 3e-06, + "loss": -24.394, + "step": 1568 + }, + { + "epoch": 0.13946666666666666, + "grad_norm": 406.370361328125, + "learning_rate": 3e-06, + "loss": -20.4216, + "step": 1569 + }, + { + "epoch": 0.13955555555555554, + "grad_norm": 356.31982421875, + "learning_rate": 3e-06, + "loss": -24.4687, + "step": 1570 + }, + { + "epoch": 0.13964444444444443, + "grad_norm": 342.86468505859375, + "learning_rate": 3e-06, + "loss": -23.7611, + "step": 1571 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 536.0755615234375, + "learning_rate": 3e-06, + "loss": -33.8363, + "step": 1572 + }, + { + "completion_length": 254.18750762939453, + "epoch": 0.1398222222222222, + "grad_norm": 186.11480712890625, + "learning_rate": 3e-06, + "loss": 4.595, + "reward": 0.9791666865348816, + "reward_std": 0.23899271339178085, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.4166666567325592, + "step": 1573, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.13991111111111112, + "grad_norm": 166.36386108398438, + "learning_rate": 3e-06, + "loss": 6.7414, + "step": 1574 + }, + { + "epoch": 0.14, + "grad_norm": 213.84388732910156, + "learning_rate": 3e-06, + "loss": 4.949, + "step": 1575 + }, + { + "epoch": 0.1400888888888889, + "grad_norm": 196.5020294189453, + "learning_rate": 3e-06, + "loss": 8.9449, + "step": 1576 + }, + { + "epoch": 0.1401777777777778, + "grad_norm": 216.48086547851562, + "learning_rate": 3e-06, + "loss": 7.0276, + "step": 1577 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 188.523681640625, + "learning_rate": 3e-06, + "loss": 5.1123, + "step": 1578 + }, + { + "epoch": 0.14035555555555557, + "grad_norm": 223.4918975830078, + "learning_rate": 3e-06, + "loss": 2.6615, + "step": 1579 + }, + { + "epoch": 0.14044444444444446, + "grad_norm": 218.71112060546875, + "learning_rate": 3e-06, + "loss": 3.1185, + "step": 1580 + }, + { + "epoch": 0.14053333333333334, + "grad_norm": 223.2500762939453, + "learning_rate": 3e-06, + "loss": 1.1539, + "step": 1581 + }, + { + "epoch": 0.14062222222222223, + "grad_norm": 176.69094848632812, + "learning_rate": 3e-06, + "loss": 5.1857, + "step": 1582 + }, + { + "epoch": 0.14071111111111112, + "grad_norm": 243.36929321289062, + "learning_rate": 3e-06, + "loss": 4.2964, + "step": 1583 + }, + { + "epoch": 0.1408, + "grad_norm": 269.2211608886719, + "learning_rate": 3e-06, + "loss": 1.8612, + "step": 1584 + }, + { + "completion_length": 245.9166717529297, + "epoch": 0.1408888888888889, + "grad_norm": 542.5291137695312, + "learning_rate": 3e-06, + "loss": 14.2794, + "reward": 0.9791666865348816, + "reward_std": 0.5643851011991501, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.2916666716337204, + "step": 1585, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.14097777777777779, + "grad_norm": 575.147705078125, + "learning_rate": 3e-06, + "loss": 17.5223, + "step": 1586 + }, + { + "epoch": 0.14106666666666667, + "grad_norm": 405.6719970703125, + "learning_rate": 3e-06, + "loss": 9.7247, + "step": 1587 + }, + { + "epoch": 0.14115555555555556, + "grad_norm": 528.659912109375, + "learning_rate": 3e-06, + "loss": 35.2638, + "step": 1588 + }, + { + "epoch": 0.14124444444444445, + "grad_norm": 514.4658203125, + "learning_rate": 3e-06, + "loss": 14.2266, + "step": 1589 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 535.8659057617188, + "learning_rate": 3e-06, + "loss": 13.3064, + "step": 1590 + }, + { + "epoch": 0.14142222222222223, + "grad_norm": 538.135009765625, + "learning_rate": 3e-06, + "loss": 7.7669, + "step": 1591 + }, + { + "epoch": 0.14151111111111112, + "grad_norm": 547.5822143554688, + "learning_rate": 3e-06, + "loss": 13.0588, + "step": 1592 + }, + { + "epoch": 0.1416, + "grad_norm": 415.9985046386719, + "learning_rate": 3e-06, + "loss": 4.3967, + "step": 1593 + }, + { + "epoch": 0.1416888888888889, + "grad_norm": 553.2412719726562, + "learning_rate": 3e-06, + "loss": 26.3526, + "step": 1594 + }, + { + "epoch": 0.14177777777777778, + "grad_norm": 512.5444946289062, + "learning_rate": 3e-06, + "loss": 5.7301, + "step": 1595 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 580.6036987304688, + "learning_rate": 3e-06, + "loss": 4.7651, + "step": 1596 + }, + { + "completion_length": 235.7916717529297, + "epoch": 0.14195555555555556, + "grad_norm": 347.6015319824219, + "learning_rate": 3e-06, + "loss": -18.3809, + "reward": 1.5416666865348816, + "reward_std": 0.3332235887646675, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.9166666567325592, + "step": 1597, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.14204444444444445, + "grad_norm": 317.97137451171875, + "learning_rate": 3e-06, + "loss": 2.7823, + "step": 1598 + }, + { + "epoch": 0.14213333333333333, + "grad_norm": 202.9899444580078, + "learning_rate": 3e-06, + "loss": -5.0647, + "step": 1599 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 786.5781860351562, + "learning_rate": 3e-06, + "loss": -4.8509, + "step": 1600 + }, + { + "epoch": 0.1423111111111111, + "grad_norm": 290.1077575683594, + "learning_rate": 3e-06, + "loss": -8.7908, + "step": 1601 + }, + { + "epoch": 0.1424, + "grad_norm": 198.83493041992188, + "learning_rate": 3e-06, + "loss": 8.0596, + "step": 1602 + }, + { + "epoch": 0.1424888888888889, + "grad_norm": 270.3849792480469, + "learning_rate": 3e-06, + "loss": -21.899, + "step": 1603 + }, + { + "epoch": 0.14257777777777778, + "grad_norm": 1309.112060546875, + "learning_rate": 3e-06, + "loss": -1.111, + "step": 1604 + }, + { + "epoch": 0.14266666666666666, + "grad_norm": 212.12266540527344, + "learning_rate": 3e-06, + "loss": -7.2685, + "step": 1605 + }, + { + "epoch": 0.14275555555555555, + "grad_norm": 242.00680541992188, + "learning_rate": 3e-06, + "loss": -7.1693, + "step": 1606 + }, + { + "epoch": 0.14284444444444444, + "grad_norm": 271.86090087890625, + "learning_rate": 3e-06, + "loss": -10.3562, + "step": 1607 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 233.82144165039062, + "learning_rate": 3e-06, + "loss": 4.7331, + "step": 1608 + }, + { + "completion_length": 254.1666717529297, + "epoch": 0.14302222222222222, + "grad_norm": 183.18118286132812, + "learning_rate": 3e-06, + "loss": -27.8021, + "reward": 1.8958333730697632, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.2083333134651184, + "step": 1609, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.1431111111111111, + "grad_norm": 393.4090270996094, + "learning_rate": 3e-06, + "loss": -29.9682, + "step": 1610 + }, + { + "epoch": 0.1432, + "grad_norm": 232.8114776611328, + "learning_rate": 3e-06, + "loss": -21.6474, + "step": 1611 + }, + { + "epoch": 0.14328888888888888, + "grad_norm": 304.4367370605469, + "learning_rate": 3e-06, + "loss": -21.4802, + "step": 1612 + }, + { + "epoch": 0.14337777777777777, + "grad_norm": 162.92181396484375, + "learning_rate": 3e-06, + "loss": -28.1092, + "step": 1613 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 209.14356994628906, + "learning_rate": 3e-06, + "loss": -19.8933, + "step": 1614 + }, + { + "epoch": 0.14355555555555555, + "grad_norm": 697.1129760742188, + "learning_rate": 3e-06, + "loss": -27.563, + "step": 1615 + }, + { + "epoch": 0.14364444444444444, + "grad_norm": 311.41851806640625, + "learning_rate": 3e-06, + "loss": -33.0986, + "step": 1616 + }, + { + "epoch": 0.14373333333333332, + "grad_norm": 257.729248046875, + "learning_rate": 3e-06, + "loss": -26.3909, + "step": 1617 + }, + { + "epoch": 0.1438222222222222, + "grad_norm": 258.4046936035156, + "learning_rate": 3e-06, + "loss": -26.2488, + "step": 1618 + }, + { + "epoch": 0.1439111111111111, + "grad_norm": 130.1263885498047, + "learning_rate": 3e-06, + "loss": -30.1948, + "step": 1619 + }, + { + "epoch": 0.144, + "grad_norm": 182.67807006835938, + "learning_rate": 3e-06, + "loss": -23.4364, + "step": 1620 + }, + { + "completion_length": 249.4375, + "epoch": 0.14408888888888888, + "grad_norm": 354.3665771484375, + "learning_rate": 3e-06, + "loss": 8.1684, + "reward": 1.3229166865348816, + "reward_std": 0.15461497008800507, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.5833333432674408, + "step": 1621, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.14417777777777777, + "grad_norm": 424.4937438964844, + "learning_rate": 3e-06, + "loss": -0.1374, + "step": 1622 + }, + { + "epoch": 0.14426666666666665, + "grad_norm": 441.7712097167969, + "learning_rate": 3e-06, + "loss": -5.8721, + "step": 1623 + }, + { + "epoch": 0.14435555555555554, + "grad_norm": 474.1778259277344, + "learning_rate": 3e-06, + "loss": -4.5006, + "step": 1624 + }, + { + "epoch": 0.14444444444444443, + "grad_norm": 464.2291564941406, + "learning_rate": 3e-06, + "loss": -9.1126, + "step": 1625 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 375.41595458984375, + "learning_rate": 3e-06, + "loss": -1.7981, + "step": 1626 + }, + { + "epoch": 0.14462222222222224, + "grad_norm": 591.2286376953125, + "learning_rate": 3e-06, + "loss": -2.7549, + "step": 1627 + }, + { + "epoch": 0.14471111111111112, + "grad_norm": 269.577880859375, + "learning_rate": 3e-06, + "loss": -15.3618, + "step": 1628 + }, + { + "epoch": 0.1448, + "grad_norm": 366.1959533691406, + "learning_rate": 3e-06, + "loss": -17.4499, + "step": 1629 + }, + { + "epoch": 0.1448888888888889, + "grad_norm": 248.89236450195312, + "learning_rate": 3e-06, + "loss": -18.4987, + "step": 1630 + }, + { + "epoch": 0.1449777777777778, + "grad_norm": 173.4352569580078, + "learning_rate": 3e-06, + "loss": -16.0213, + "step": 1631 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 558.5076904296875, + "learning_rate": 3e-06, + "loss": -18.2749, + "step": 1632 + }, + { + "completion_length": 251.20833587646484, + "epoch": 0.14515555555555557, + "grad_norm": 273.303955078125, + "learning_rate": 3e-06, + "loss": -13.4676, + "reward": 1.6145833730697632, + "reward_std": 0.33129163831472397, + "rewards/boxed_and_answer_tags_format_reward": 0.6979166567325592, + "rewards/correctness_reward_func_math": 0.9166666865348816, + "step": 1633, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.14524444444444445, + "grad_norm": 275.0503845214844, + "learning_rate": 3e-06, + "loss": -3.2551, + "step": 1634 + }, + { + "epoch": 0.14533333333333334, + "grad_norm": 399.2369384765625, + "learning_rate": 3e-06, + "loss": -5.5698, + "step": 1635 + }, + { + "epoch": 0.14542222222222223, + "grad_norm": 328.1588439941406, + "learning_rate": 3e-06, + "loss": -18.8269, + "step": 1636 + }, + { + "epoch": 0.14551111111111112, + "grad_norm": 239.95028686523438, + "learning_rate": 3e-06, + "loss": -6.0394, + "step": 1637 + }, + { + "epoch": 0.1456, + "grad_norm": 290.9996643066406, + "learning_rate": 3e-06, + "loss": -3.1009, + "step": 1638 + }, + { + "epoch": 0.1456888888888889, + "grad_norm": 334.3653869628906, + "learning_rate": 3e-06, + "loss": -17.4325, + "step": 1639 + }, + { + "epoch": 0.14577777777777778, + "grad_norm": 301.37139892578125, + "learning_rate": 3e-06, + "loss": -7.3761, + "step": 1640 + }, + { + "epoch": 0.14586666666666667, + "grad_norm": 593.1119995117188, + "learning_rate": 3e-06, + "loss": -12.5855, + "step": 1641 + }, + { + "epoch": 0.14595555555555556, + "grad_norm": 394.4963073730469, + "learning_rate": 3e-06, + "loss": -21.8609, + "step": 1642 + }, + { + "epoch": 0.14604444444444445, + "grad_norm": 239.55348205566406, + "learning_rate": 3e-06, + "loss": -9.8962, + "step": 1643 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 279.3672180175781, + "learning_rate": 3e-06, + "loss": -8.5061, + "step": 1644 + }, + { + "completion_length": 241.9166717529297, + "epoch": 0.14622222222222223, + "grad_norm": 477.223388671875, + "learning_rate": 3e-06, + "loss": -12.8053, + "reward": 1.1875000596046448, + "reward_std": 0.3332235887646675, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.4999999850988388, + "step": 1645, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.14631111111111111, + "grad_norm": 255.73138427734375, + "learning_rate": 3e-06, + "loss": 6.7089, + "step": 1646 + }, + { + "epoch": 0.1464, + "grad_norm": 414.9920959472656, + "learning_rate": 3e-06, + "loss": -17.6565, + "step": 1647 + }, + { + "epoch": 0.1464888888888889, + "grad_norm": 362.6864318847656, + "learning_rate": 3e-06, + "loss": -5.9471, + "step": 1648 + }, + { + "epoch": 0.14657777777777778, + "grad_norm": 257.5548400878906, + "learning_rate": 3e-06, + "loss": 7.0843, + "step": 1649 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 407.9867248535156, + "learning_rate": 3e-06, + "loss": -8.5254, + "step": 1650 + }, + { + "epoch": 0.14675555555555556, + "grad_norm": 326.8368225097656, + "learning_rate": 3e-06, + "loss": -14.7068, + "step": 1651 + }, + { + "epoch": 0.14684444444444444, + "grad_norm": 222.26805114746094, + "learning_rate": 3e-06, + "loss": 4.1678, + "step": 1652 + }, + { + "epoch": 0.14693333333333333, + "grad_norm": 361.6373596191406, + "learning_rate": 3e-06, + "loss": -22.3731, + "step": 1653 + }, + { + "epoch": 0.14702222222222222, + "grad_norm": 331.17803955078125, + "learning_rate": 3e-06, + "loss": -9.9171, + "step": 1654 + }, + { + "epoch": 0.1471111111111111, + "grad_norm": 365.830078125, + "learning_rate": 3e-06, + "loss": 1.295, + "step": 1655 + }, + { + "epoch": 0.1472, + "grad_norm": 400.97723388671875, + "learning_rate": 3e-06, + "loss": -15.896, + "step": 1656 + }, + { + "completion_length": 246.3541717529297, + "epoch": 0.14728888888888889, + "grad_norm": 407.52398681640625, + "learning_rate": 3e-06, + "loss": -95.2234, + "reward": 1.4583333730697632, + "reward_std": 0.6184598803520203, + "rewards/boxed_and_answer_tags_format_reward": 0.5, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 1657, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.14737777777777777, + "grad_norm": 386.23016357421875, + "learning_rate": 3e-06, + "loss": -94.5723, + "step": 1658 + }, + { + "epoch": 0.14746666666666666, + "grad_norm": 337.7243347167969, + "learning_rate": 3e-06, + "loss": -78.7432, + "step": 1659 + }, + { + "epoch": 0.14755555555555555, + "grad_norm": 391.17547607421875, + "learning_rate": 3e-06, + "loss": -95.9404, + "step": 1660 + }, + { + "epoch": 0.14764444444444444, + "grad_norm": 462.5135498046875, + "learning_rate": 3e-06, + "loss": -69.7866, + "step": 1661 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 326.1936340332031, + "learning_rate": 3e-06, + "loss": -70.0077, + "step": 1662 + }, + { + "epoch": 0.14782222222222222, + "grad_norm": 417.0813903808594, + "learning_rate": 3e-06, + "loss": -104.9765, + "step": 1663 + }, + { + "epoch": 0.1479111111111111, + "grad_norm": 407.79150390625, + "learning_rate": 3e-06, + "loss": -99.052, + "step": 1664 + }, + { + "epoch": 0.148, + "grad_norm": 459.3847961425781, + "learning_rate": 3e-06, + "loss": -89.9644, + "step": 1665 + }, + { + "epoch": 0.14808888888888888, + "grad_norm": 449.5730285644531, + "learning_rate": 3e-06, + "loss": -106.9796, + "step": 1666 + }, + { + "epoch": 0.14817777777777777, + "grad_norm": 431.5627746582031, + "learning_rate": 3e-06, + "loss": -78.6433, + "step": 1667 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 356.53759765625, + "learning_rate": 3e-06, + "loss": -78.1133, + "step": 1668 + }, + { + "completion_length": 250.12500762939453, + "epoch": 0.14835555555555555, + "grad_norm": 798.4279174804688, + "learning_rate": 3e-06, + "loss": 13.9371, + "reward": 1.2500000596046448, + "reward_std": 0.778884083032608, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.625, + "step": 1669, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.14844444444444443, + "grad_norm": 674.0550537109375, + "learning_rate": 3e-06, + "loss": -39.024, + "step": 1670 + }, + { + "epoch": 0.14853333333333332, + "grad_norm": 686.0006103515625, + "learning_rate": 3e-06, + "loss": -26.081, + "step": 1671 + }, + { + "epoch": 0.1486222222222222, + "grad_norm": 854.5956420898438, + "learning_rate": 3e-06, + "loss": -6.9918, + "step": 1672 + }, + { + "epoch": 0.1487111111111111, + "grad_norm": 588.1673583984375, + "learning_rate": 3e-06, + "loss": -41.4908, + "step": 1673 + }, + { + "epoch": 0.1488, + "grad_norm": 802.58544921875, + "learning_rate": 3e-06, + "loss": -9.7048, + "step": 1674 + }, + { + "epoch": 0.14888888888888888, + "grad_norm": 842.8211669921875, + "learning_rate": 3e-06, + "loss": 3.388, + "step": 1675 + }, + { + "epoch": 0.14897777777777776, + "grad_norm": 603.185791015625, + "learning_rate": 3e-06, + "loss": -51.6478, + "step": 1676 + }, + { + "epoch": 0.14906666666666665, + "grad_norm": 667.0884399414062, + "learning_rate": 3e-06, + "loss": -34.1086, + "step": 1677 + }, + { + "epoch": 0.14915555555555557, + "grad_norm": 702.9710693359375, + "learning_rate": 3e-06, + "loss": -14.8077, + "step": 1678 + }, + { + "epoch": 0.14924444444444446, + "grad_norm": 887.6446533203125, + "learning_rate": 3e-06, + "loss": -53.0034, + "step": 1679 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 738.0032348632812, + "learning_rate": 3e-06, + "loss": -19.4303, + "step": 1680 + }, + { + "completion_length": 252.8541717529297, + "epoch": 0.14942222222222223, + "grad_norm": 563.926513671875, + "learning_rate": 3e-06, + "loss": 4.8877, + "reward": 1.0625, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.375, + "step": 1681, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.14951111111111112, + "grad_norm": 571.0492553710938, + "learning_rate": 3e-06, + "loss": -12.3935, + "step": 1682 + }, + { + "epoch": 0.1496, + "grad_norm": 365.8373107910156, + "learning_rate": 3e-06, + "loss": -5.9182, + "step": 1683 + }, + { + "epoch": 0.1496888888888889, + "grad_norm": 700.759033203125, + "learning_rate": 3e-06, + "loss": 14.5414, + "step": 1684 + }, + { + "epoch": 0.1497777777777778, + "grad_norm": 600.5524291992188, + "learning_rate": 3e-06, + "loss": 4.631, + "step": 1685 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 505.4430847167969, + "learning_rate": 3e-06, + "loss": -27.7184, + "step": 1686 + }, + { + "epoch": 0.14995555555555556, + "grad_norm": 464.21759033203125, + "learning_rate": 3e-06, + "loss": -0.9671, + "step": 1687 + }, + { + "epoch": 0.15004444444444445, + "grad_norm": 500.7622985839844, + "learning_rate": 3e-06, + "loss": -18.8001, + "step": 1688 + }, + { + "epoch": 0.15013333333333334, + "grad_norm": 369.3395080566406, + "learning_rate": 3e-06, + "loss": -9.8684, + "step": 1689 + }, + { + "epoch": 0.15022222222222223, + "grad_norm": 881.677001953125, + "learning_rate": 3e-06, + "loss": 6.5321, + "step": 1690 + }, + { + "epoch": 0.15031111111111112, + "grad_norm": 586.9358520507812, + "learning_rate": 3e-06, + "loss": -3.3023, + "step": 1691 + }, + { + "epoch": 0.1504, + "grad_norm": 406.6219787597656, + "learning_rate": 3e-06, + "loss": -33.3837, + "step": 1692 + }, + { + "completion_length": 250.125, + "epoch": 0.1504888888888889, + "grad_norm": 382.4415283203125, + "learning_rate": 3e-06, + "loss": 19.5614, + "reward": 1.3750000596046448, + "reward_std": 0.3602609932422638, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.625, + "step": 1693, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.15057777777777778, + "grad_norm": 349.4650573730469, + "learning_rate": 3e-06, + "loss": 22.1681, + "step": 1694 + }, + { + "epoch": 0.15066666666666667, + "grad_norm": 306.9416809082031, + "learning_rate": 3e-06, + "loss": 34.961, + "step": 1695 + }, + { + "epoch": 0.15075555555555556, + "grad_norm": 353.93121337890625, + "learning_rate": 3e-06, + "loss": 41.0782, + "step": 1696 + }, + { + "epoch": 0.15084444444444445, + "grad_norm": 213.22996520996094, + "learning_rate": 3e-06, + "loss": 26.4404, + "step": 1697 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 356.7481994628906, + "learning_rate": 3e-06, + "loss": 16.8735, + "step": 1698 + }, + { + "epoch": 0.15102222222222222, + "grad_norm": 394.1246643066406, + "learning_rate": 3e-06, + "loss": 15.6199, + "step": 1699 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 358.08831787109375, + "learning_rate": 3e-06, + "loss": 19.0218, + "step": 1700 + }, + { + "epoch": 0.1512, + "grad_norm": 290.66485595703125, + "learning_rate": 3e-06, + "loss": 30.4685, + "step": 1701 + }, + { + "epoch": 0.1512888888888889, + "grad_norm": 427.0489501953125, + "learning_rate": 3e-06, + "loss": 35.7429, + "step": 1702 + }, + { + "epoch": 0.15137777777777778, + "grad_norm": 218.4366912841797, + "learning_rate": 3e-06, + "loss": 22.0305, + "step": 1703 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 375.6819152832031, + "learning_rate": 3e-06, + "loss": 14.9698, + "step": 1704 + }, + { + "completion_length": 232.77083587646484, + "epoch": 0.15155555555555555, + "grad_norm": 213.29344177246094, + "learning_rate": 3e-06, + "loss": 5.0413, + "reward": 1.4791666865348816, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.7916666567325592, + "step": 1705, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.15164444444444444, + "grad_norm": 168.3350067138672, + "learning_rate": 3e-06, + "loss": -3.6939, + "step": 1706 + }, + { + "epoch": 0.15173333333333333, + "grad_norm": 159.5379638671875, + "learning_rate": 3e-06, + "loss": 1.1446, + "step": 1707 + }, + { + "epoch": 0.15182222222222222, + "grad_norm": 144.53854370117188, + "learning_rate": 3e-06, + "loss": 3.3238, + "step": 1708 + }, + { + "epoch": 0.1519111111111111, + "grad_norm": 174.06390380859375, + "learning_rate": 3e-06, + "loss": -0.1332, + "step": 1709 + }, + { + "epoch": 0.152, + "grad_norm": 124.43144989013672, + "learning_rate": 3e-06, + "loss": -4.2987, + "step": 1710 + }, + { + "epoch": 0.15208888888888888, + "grad_norm": 242.69232177734375, + "learning_rate": 3e-06, + "loss": 1.5975, + "step": 1711 + }, + { + "epoch": 0.15217777777777777, + "grad_norm": 172.77381896972656, + "learning_rate": 3e-06, + "loss": -5.9669, + "step": 1712 + }, + { + "epoch": 0.15226666666666666, + "grad_norm": 139.244873046875, + "learning_rate": 3e-06, + "loss": -1.5727, + "step": 1713 + }, + { + "epoch": 0.15235555555555555, + "grad_norm": 133.6866455078125, + "learning_rate": 3e-06, + "loss": -0.2409, + "step": 1714 + }, + { + "epoch": 0.15244444444444444, + "grad_norm": 171.1123809814453, + "learning_rate": 3e-06, + "loss": -3.8625, + "step": 1715 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 110.9591064453125, + "learning_rate": 3e-06, + "loss": -6.3714, + "step": 1716 + }, + { + "completion_length": 254.12500762939453, + "epoch": 0.15262222222222221, + "grad_norm": 464.7311096191406, + "learning_rate": 3e-06, + "loss": -16.6423, + "reward": 1.2916666865348816, + "reward_std": 0.48936043679714203, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 1717, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.1527111111111111, + "grad_norm": 429.9415588378906, + "learning_rate": 3e-06, + "loss": -21.4782, + "step": 1718 + }, + { + "epoch": 0.1528, + "grad_norm": 415.4131774902344, + "learning_rate": 3e-06, + "loss": -38.5279, + "step": 1719 + }, + { + "epoch": 0.15288888888888888, + "grad_norm": 589.5310668945312, + "learning_rate": 3e-06, + "loss": -19.3757, + "step": 1720 + }, + { + "epoch": 0.15297777777777777, + "grad_norm": 492.2614440917969, + "learning_rate": 3e-06, + "loss": -28.3813, + "step": 1721 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 381.6370849609375, + "learning_rate": 3e-06, + "loss": -23.7667, + "step": 1722 + }, + { + "epoch": 0.15315555555555554, + "grad_norm": 448.4298095703125, + "learning_rate": 3e-06, + "loss": -20.063, + "step": 1723 + }, + { + "epoch": 0.15324444444444443, + "grad_norm": 351.5326232910156, + "learning_rate": 3e-06, + "loss": -25.6929, + "step": 1724 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 951.7434692382812, + "learning_rate": 3e-06, + "loss": -43.7247, + "step": 1725 + }, + { + "epoch": 0.1534222222222222, + "grad_norm": 457.6904602050781, + "learning_rate": 3e-06, + "loss": -27.8251, + "step": 1726 + }, + { + "epoch": 0.1535111111111111, + "grad_norm": 433.86907958984375, + "learning_rate": 3e-06, + "loss": -37.9928, + "step": 1727 + }, + { + "epoch": 0.1536, + "grad_norm": 506.2419128417969, + "learning_rate": 3e-06, + "loss": -31.6119, + "step": 1728 + }, + { + "completion_length": 250.25000762939453, + "epoch": 0.1536888888888889, + "grad_norm": 638.6153564453125, + "learning_rate": 3e-06, + "loss": -60.7029, + "reward": 1.4895833730697632, + "reward_std": 0.7616997957229614, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.75, + "step": 1729, + "zero_std_ratio": 0.125 + }, + { + "epoch": 0.1537777777777778, + "grad_norm": 645.9075927734375, + "learning_rate": 3e-06, + "loss": -100.7913, + "step": 1730 + }, + { + "epoch": 0.15386666666666668, + "grad_norm": 769.6129760742188, + "learning_rate": 3e-06, + "loss": -41.8559, + "step": 1731 + }, + { + "epoch": 0.15395555555555557, + "grad_norm": 594.3479614257812, + "learning_rate": 3e-06, + "loss": -85.2107, + "step": 1732 + }, + { + "epoch": 0.15404444444444446, + "grad_norm": 513.4801635742188, + "learning_rate": 3e-06, + "loss": -78.7403, + "step": 1733 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 596.7926635742188, + "learning_rate": 3e-06, + "loss": -38.3655, + "step": 1734 + }, + { + "epoch": 0.15422222222222223, + "grad_norm": 762.2822875976562, + "learning_rate": 3e-06, + "loss": -68.4507, + "step": 1735 + }, + { + "epoch": 0.15431111111111112, + "grad_norm": 507.63958740234375, + "learning_rate": 3e-06, + "loss": -108.803, + "step": 1736 + }, + { + "epoch": 0.1544, + "grad_norm": 657.9226684570312, + "learning_rate": 3e-06, + "loss": -56.9832, + "step": 1737 + }, + { + "epoch": 0.1544888888888889, + "grad_norm": 604.657958984375, + "learning_rate": 3e-06, + "loss": -96.6592, + "step": 1738 + }, + { + "epoch": 0.1545777777777778, + "grad_norm": 491.42047119140625, + "learning_rate": 3e-06, + "loss": -87.8488, + "step": 1739 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 542.5538940429688, + "learning_rate": 3e-06, + "loss": -52.5061, + "step": 1740 + }, + { + "completion_length": 241.3125, + "epoch": 0.15475555555555556, + "grad_norm": 257.529541015625, + "learning_rate": 3e-06, + "loss": -24.5069, + "reward": 1.1250000596046448, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.375, + "step": 1741, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.15484444444444445, + "grad_norm": 272.9305725097656, + "learning_rate": 3e-06, + "loss": -31.0682, + "step": 1742 + }, + { + "epoch": 0.15493333333333334, + "grad_norm": 241.25071716308594, + "learning_rate": 3e-06, + "loss": -32.0895, + "step": 1743 + }, + { + "epoch": 0.15502222222222223, + "grad_norm": 208.89321899414062, + "learning_rate": 3e-06, + "loss": -28.8566, + "step": 1744 + }, + { + "epoch": 0.15511111111111112, + "grad_norm": 186.21788024902344, + "learning_rate": 3e-06, + "loss": -25.5258, + "step": 1745 + }, + { + "epoch": 0.1552, + "grad_norm": 208.84288024902344, + "learning_rate": 3e-06, + "loss": -36.6331, + "step": 1746 + }, + { + "epoch": 0.1552888888888889, + "grad_norm": 273.28900146484375, + "learning_rate": 3e-06, + "loss": -26.8255, + "step": 1747 + }, + { + "epoch": 0.15537777777777778, + "grad_norm": 255.3370361328125, + "learning_rate": 3e-06, + "loss": -35.3528, + "step": 1748 + }, + { + "epoch": 0.15546666666666667, + "grad_norm": 265.7087097167969, + "learning_rate": 3e-06, + "loss": -37.0957, + "step": 1749 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 174.1486358642578, + "learning_rate": 3e-06, + "loss": -31.8935, + "step": 1750 + }, + { + "epoch": 0.15564444444444445, + "grad_norm": 208.65518188476562, + "learning_rate": 3e-06, + "loss": -29.0371, + "step": 1751 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 172.7078857421875, + "learning_rate": 3e-06, + "loss": -39.3675, + "step": 1752 + }, + { + "completion_length": 252.39583587646484, + "epoch": 0.15582222222222222, + "grad_norm": 451.1358947753906, + "learning_rate": 3e-06, + "loss": -66.2963, + "reward": 1.3541667461395264, + "reward_std": 0.6070848852396011, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.6666666716337204, + "step": 1753, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.1559111111111111, + "grad_norm": 447.5511169433594, + "learning_rate": 3e-06, + "loss": -65.7659, + "step": 1754 + }, + { + "epoch": 0.156, + "grad_norm": 481.50335693359375, + "learning_rate": 3e-06, + "loss": -67.9006, + "step": 1755 + }, + { + "epoch": 0.1560888888888889, + "grad_norm": 418.3782043457031, + "learning_rate": 3e-06, + "loss": -59.6285, + "step": 1756 + }, + { + "epoch": 0.15617777777777778, + "grad_norm": 487.20574951171875, + "learning_rate": 3e-06, + "loss": -82.0834, + "step": 1757 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 454.06463623046875, + "learning_rate": 3e-06, + "loss": -57.3851, + "step": 1758 + }, + { + "epoch": 0.15635555555555555, + "grad_norm": 408.6988830566406, + "learning_rate": 3e-06, + "loss": -69.432, + "step": 1759 + }, + { + "epoch": 0.15644444444444444, + "grad_norm": 399.9183349609375, + "learning_rate": 3e-06, + "loss": -71.3301, + "step": 1760 + }, + { + "epoch": 0.15653333333333333, + "grad_norm": 577.1817626953125, + "learning_rate": 3e-06, + "loss": -74.2775, + "step": 1761 + }, + { + "epoch": 0.15662222222222222, + "grad_norm": 413.5326843261719, + "learning_rate": 3e-06, + "loss": -63.2282, + "step": 1762 + }, + { + "epoch": 0.1567111111111111, + "grad_norm": 498.8305358886719, + "learning_rate": 3e-06, + "loss": -88.1602, + "step": 1763 + }, + { + "epoch": 0.1568, + "grad_norm": 530.10595703125, + "learning_rate": 3e-06, + "loss": -64.1951, + "step": 1764 + }, + { + "completion_length": 247.00000762939453, + "epoch": 0.15688888888888888, + "grad_norm": 433.1192321777344, + "learning_rate": 3e-06, + "loss": -27.1999, + "reward": 1.6458333730697632, + "reward_std": 0.3602609783411026, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 1765, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.15697777777777777, + "grad_norm": 350.172119140625, + "learning_rate": 3e-06, + "loss": -32.7846, + "step": 1766 + }, + { + "epoch": 0.15706666666666666, + "grad_norm": 443.9689636230469, + "learning_rate": 3e-06, + "loss": -25.8718, + "step": 1767 + }, + { + "epoch": 0.15715555555555555, + "grad_norm": 491.6800537109375, + "learning_rate": 3e-06, + "loss": -21.288, + "step": 1768 + }, + { + "epoch": 0.15724444444444444, + "grad_norm": 490.1741943359375, + "learning_rate": 3e-06, + "loss": -30.8959, + "step": 1769 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 403.3340759277344, + "learning_rate": 3e-06, + "loss": -17.7178, + "step": 1770 + }, + { + "epoch": 0.1574222222222222, + "grad_norm": 434.2870178222656, + "learning_rate": 3e-06, + "loss": -28.6675, + "step": 1771 + }, + { + "epoch": 0.1575111111111111, + "grad_norm": 390.14208984375, + "learning_rate": 3e-06, + "loss": -35.2631, + "step": 1772 + }, + { + "epoch": 0.1576, + "grad_norm": 545.4449462890625, + "learning_rate": 3e-06, + "loss": -27.2021, + "step": 1773 + }, + { + "epoch": 0.15768888888888888, + "grad_norm": 409.76416015625, + "learning_rate": 3e-06, + "loss": -25.2074, + "step": 1774 + }, + { + "epoch": 0.15777777777777777, + "grad_norm": 1407.31787109375, + "learning_rate": 3e-06, + "loss": -32.1991, + "step": 1775 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 409.153076171875, + "learning_rate": 3e-06, + "loss": -21.3904, + "step": 1776 + }, + { + "completion_length": 255.6666717529297, + "epoch": 0.15795555555555554, + "grad_norm": 250.8477020263672, + "learning_rate": 3e-06, + "loss": -20.1606, + "reward": 1.0625000596046448, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.3750000111758709, + "step": 1777, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.15804444444444443, + "grad_norm": 295.9480895996094, + "learning_rate": 3e-06, + "loss": -4.3906, + "step": 1778 + }, + { + "epoch": 0.15813333333333332, + "grad_norm": 264.98590087890625, + "learning_rate": 3e-06, + "loss": 1.1149, + "step": 1779 + }, + { + "epoch": 0.1582222222222222, + "grad_norm": 282.2425537109375, + "learning_rate": 3e-06, + "loss": -13.9569, + "step": 1780 + }, + { + "epoch": 0.15831111111111112, + "grad_norm": 269.65191650390625, + "learning_rate": 3e-06, + "loss": -12.0395, + "step": 1781 + }, + { + "epoch": 0.1584, + "grad_norm": 262.14825439453125, + "learning_rate": 3e-06, + "loss": -17.0425, + "step": 1782 + }, + { + "epoch": 0.1584888888888889, + "grad_norm": 226.9910888671875, + "learning_rate": 3e-06, + "loss": -22.3367, + "step": 1783 + }, + { + "epoch": 0.1585777777777778, + "grad_norm": 268.9870300292969, + "learning_rate": 3e-06, + "loss": -7.3806, + "step": 1784 + }, + { + "epoch": 0.15866666666666668, + "grad_norm": 252.59866333007812, + "learning_rate": 3e-06, + "loss": -2.0587, + "step": 1785 + }, + { + "epoch": 0.15875555555555557, + "grad_norm": 285.8102111816406, + "learning_rate": 3e-06, + "loss": -17.0219, + "step": 1786 + }, + { + "epoch": 0.15884444444444445, + "grad_norm": 281.6475830078125, + "learning_rate": 3e-06, + "loss": -16.6806, + "step": 1787 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 218.4907989501953, + "learning_rate": 3e-06, + "loss": -22.657, + "step": 1788 + }, + { + "completion_length": 252.0625, + "epoch": 0.15902222222222223, + "grad_norm": 365.0342102050781, + "learning_rate": 3e-06, + "loss": 4.7637, + "reward": 1.0416666865348816, + "reward_std": 0.23899271339178085, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.4166666679084301, + "step": 1789, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.15911111111111112, + "grad_norm": 306.0011291503906, + "learning_rate": 3e-06, + "loss": -12.1321, + "step": 1790 + }, + { + "epoch": 0.1592, + "grad_norm": 385.5239562988281, + "learning_rate": 3e-06, + "loss": -11.9155, + "step": 1791 + }, + { + "epoch": 0.1592888888888889, + "grad_norm": 502.8174743652344, + "learning_rate": 3e-06, + "loss": -9.7161, + "step": 1792 + }, + { + "epoch": 0.15937777777777778, + "grad_norm": 286.40557861328125, + "learning_rate": 3e-06, + "loss": -0.1257, + "step": 1793 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 250.90745544433594, + "learning_rate": 3e-06, + "loss": -5.5968, + "step": 1794 + }, + { + "epoch": 0.15955555555555556, + "grad_norm": 337.0714416503906, + "learning_rate": 3e-06, + "loss": 1.3961, + "step": 1795 + }, + { + "epoch": 0.15964444444444445, + "grad_norm": 279.7541198730469, + "learning_rate": 3e-06, + "loss": -15.7637, + "step": 1796 + }, + { + "epoch": 0.15973333333333334, + "grad_norm": 293.27703857421875, + "learning_rate": 3e-06, + "loss": -16.9533, + "step": 1797 + }, + { + "epoch": 0.15982222222222223, + "grad_norm": 398.4286193847656, + "learning_rate": 3e-06, + "loss": -14.4282, + "step": 1798 + }, + { + "epoch": 0.15991111111111111, + "grad_norm": 314.15338134765625, + "learning_rate": 3e-06, + "loss": -2.8244, + "step": 1799 + }, + { + "epoch": 0.16, + "grad_norm": 302.6134338378906, + "learning_rate": 3e-06, + "loss": -11.5238, + "step": 1800 + }, + { + "completion_length": 251.93750762939453, + "epoch": 0.1600888888888889, + "grad_norm": 598.98095703125, + "learning_rate": 3e-06, + "loss": -25.8615, + "reward": 1.5, + "reward_std": 0.20412415266036987, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.75, + "step": 1801, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.16017777777777778, + "grad_norm": 618.0178833007812, + "learning_rate": 3e-06, + "loss": -21.1122, + "step": 1802 + }, + { + "epoch": 0.16026666666666667, + "grad_norm": 541.5601806640625, + "learning_rate": 3e-06, + "loss": -11.0087, + "step": 1803 + }, + { + "epoch": 0.16035555555555556, + "grad_norm": 470.309814453125, + "learning_rate": 3e-06, + "loss": -22.2437, + "step": 1804 + }, + { + "epoch": 0.16044444444444445, + "grad_norm": 413.4715270996094, + "learning_rate": 3e-06, + "loss": -33.0321, + "step": 1805 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 376.2085266113281, + "learning_rate": 3e-06, + "loss": -25.3322, + "step": 1806 + }, + { + "epoch": 0.16062222222222222, + "grad_norm": 322.1478576660156, + "learning_rate": 3e-06, + "loss": -37.8067, + "step": 1807 + }, + { + "epoch": 0.1607111111111111, + "grad_norm": 272.68951416015625, + "learning_rate": 3e-06, + "loss": -34.8264, + "step": 1808 + }, + { + "epoch": 0.1608, + "grad_norm": 365.9797058105469, + "learning_rate": 3e-06, + "loss": -24.8187, + "step": 1809 + }, + { + "epoch": 0.1608888888888889, + "grad_norm": 314.506591796875, + "learning_rate": 3e-06, + "loss": -32.5667, + "step": 1810 + }, + { + "epoch": 0.16097777777777778, + "grad_norm": 410.4781494140625, + "learning_rate": 3e-06, + "loss": -43.7721, + "step": 1811 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 323.0709228515625, + "learning_rate": 3e-06, + "loss": -32.8399, + "step": 1812 + }, + { + "completion_length": 253.14583587646484, + "epoch": 0.16115555555555555, + "grad_norm": 490.2536315917969, + "learning_rate": 3e-06, + "loss": -24.4492, + "reward": 1.4375000596046448, + "reward_std": 0.3680921420454979, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.7500000149011612, + "step": 1813, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.16124444444444444, + "grad_norm": 376.43231201171875, + "learning_rate": 3e-06, + "loss": -22.8531, + "step": 1814 + }, + { + "epoch": 0.16133333333333333, + "grad_norm": 370.8981018066406, + "learning_rate": 3e-06, + "loss": -6.1589, + "step": 1815 + }, + { + "epoch": 0.16142222222222222, + "grad_norm": 350.6385498046875, + "learning_rate": 3e-06, + "loss": -17.2146, + "step": 1816 + }, + { + "epoch": 0.1615111111111111, + "grad_norm": 413.9906311035156, + "learning_rate": 3e-06, + "loss": -13.1823, + "step": 1817 + }, + { + "epoch": 0.1616, + "grad_norm": 511.4176940917969, + "learning_rate": 3e-06, + "loss": -23.6257, + "step": 1818 + }, + { + "epoch": 0.16168888888888888, + "grad_norm": 425.5303039550781, + "learning_rate": 3e-06, + "loss": -29.0499, + "step": 1819 + }, + { + "epoch": 0.16177777777777777, + "grad_norm": 329.032958984375, + "learning_rate": 3e-06, + "loss": -28.0294, + "step": 1820 + }, + { + "epoch": 0.16186666666666666, + "grad_norm": 482.425537109375, + "learning_rate": 3e-06, + "loss": -8.7538, + "step": 1821 + }, + { + "epoch": 0.16195555555555555, + "grad_norm": 422.88494873046875, + "learning_rate": 3e-06, + "loss": -22.7747, + "step": 1822 + }, + { + "epoch": 0.16204444444444444, + "grad_norm": 394.9844055175781, + "learning_rate": 3e-06, + "loss": -18.7768, + "step": 1823 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 449.5504455566406, + "learning_rate": 3e-06, + "loss": -25.7761, + "step": 1824 + }, + { + "completion_length": 240.8541717529297, + "epoch": 0.1622222222222222, + "grad_norm": 474.4821472167969, + "learning_rate": 3e-06, + "loss": 5.9184, + "reward": 1.5416666865348816, + "reward_std": 0.3602609857916832, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7916666865348816, + "step": 1825, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.1623111111111111, + "grad_norm": 394.9033203125, + "learning_rate": 3e-06, + "loss": -8.6183, + "step": 1826 + }, + { + "epoch": 0.1624, + "grad_norm": 585.3305053710938, + "learning_rate": 3e-06, + "loss": 17.2736, + "step": 1827 + }, + { + "epoch": 0.16248888888888888, + "grad_norm": 411.5712585449219, + "learning_rate": 3e-06, + "loss": 2.0057, + "step": 1828 + }, + { + "epoch": 0.16257777777777777, + "grad_norm": 326.8497619628906, + "learning_rate": 3e-06, + "loss": 5.208, + "step": 1829 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 621.4788208007812, + "learning_rate": 3e-06, + "loss": -6.1221, + "step": 1830 + }, + { + "epoch": 0.16275555555555554, + "grad_norm": 628.4518432617188, + "learning_rate": 3e-06, + "loss": 3.4633, + "step": 1831 + }, + { + "epoch": 0.16284444444444446, + "grad_norm": 389.485595703125, + "learning_rate": 3e-06, + "loss": -14.53, + "step": 1832 + }, + { + "epoch": 0.16293333333333335, + "grad_norm": 760.7333374023438, + "learning_rate": 3e-06, + "loss": 11.3702, + "step": 1833 + }, + { + "epoch": 0.16302222222222224, + "grad_norm": 327.4570617675781, + "learning_rate": 3e-06, + "loss": -4.0738, + "step": 1834 + }, + { + "epoch": 0.16311111111111112, + "grad_norm": 325.6021728515625, + "learning_rate": 3e-06, + "loss": -0.9882, + "step": 1835 + }, + { + "epoch": 0.1632, + "grad_norm": 434.9088439941406, + "learning_rate": 3e-06, + "loss": -14.5048, + "step": 1836 + }, + { + "completion_length": 252.3541717529297, + "epoch": 0.1632888888888889, + "grad_norm": 364.32318115234375, + "learning_rate": 3e-06, + "loss": 5.0981, + "reward": 1.520833432674408, + "reward_std": 0.20412413775920868, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8333333544433117, + "step": 1837, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.1633777777777778, + "grad_norm": 327.6516418457031, + "learning_rate": 3e-06, + "loss": 13.1354, + "step": 1838 + }, + { + "epoch": 0.16346666666666668, + "grad_norm": 327.8387145996094, + "learning_rate": 3e-06, + "loss": 7.7064, + "step": 1839 + }, + { + "epoch": 0.16355555555555557, + "grad_norm": 483.0592346191406, + "learning_rate": 3e-06, + "loss": 9.8312, + "step": 1840 + }, + { + "epoch": 0.16364444444444445, + "grad_norm": 409.86724853515625, + "learning_rate": 3e-06, + "loss": 13.0568, + "step": 1841 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 282.73626708984375, + "learning_rate": 3e-06, + "loss": 3.5377, + "step": 1842 + }, + { + "epoch": 0.16382222222222223, + "grad_norm": 262.7396240234375, + "learning_rate": 3e-06, + "loss": -0.2709, + "step": 1843 + }, + { + "epoch": 0.16391111111111112, + "grad_norm": 375.62359619140625, + "learning_rate": 3e-06, + "loss": 8.4106, + "step": 1844 + }, + { + "epoch": 0.164, + "grad_norm": 295.78814697265625, + "learning_rate": 3e-06, + "loss": 2.2675, + "step": 1845 + }, + { + "epoch": 0.1640888888888889, + "grad_norm": 555.635009765625, + "learning_rate": 3e-06, + "loss": 1.7638, + "step": 1846 + }, + { + "epoch": 0.16417777777777778, + "grad_norm": 362.09722900390625, + "learning_rate": 3e-06, + "loss": 2.102, + "step": 1847 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 349.70440673828125, + "learning_rate": 3e-06, + "loss": -2.454, + "step": 1848 + }, + { + "completion_length": 249.08333587646484, + "epoch": 0.16435555555555556, + "grad_norm": 437.3484802246094, + "learning_rate": 3e-06, + "loss": -29.0477, + "reward": 1.4166666865348816, + "reward_std": 0.4701542556285858, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.7916666567325592, + "step": 1849, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 638.8463745117188, + "learning_rate": 3e-06, + "loss": -47.733, + "step": 1850 + }, + { + "epoch": 0.16453333333333334, + "grad_norm": 460.33624267578125, + "learning_rate": 3e-06, + "loss": -34.4307, + "step": 1851 + }, + { + "epoch": 0.16462222222222223, + "grad_norm": 491.926025390625, + "learning_rate": 3e-06, + "loss": -52.2731, + "step": 1852 + }, + { + "epoch": 0.1647111111111111, + "grad_norm": 607.3854370117188, + "learning_rate": 3e-06, + "loss": -47.2157, + "step": 1853 + }, + { + "epoch": 0.1648, + "grad_norm": 512.1332397460938, + "learning_rate": 3e-06, + "loss": -50.0155, + "step": 1854 + }, + { + "epoch": 0.1648888888888889, + "grad_norm": 403.7186279296875, + "learning_rate": 3e-06, + "loss": -33.8899, + "step": 1855 + }, + { + "epoch": 0.16497777777777778, + "grad_norm": 423.3454284667969, + "learning_rate": 3e-06, + "loss": -51.1046, + "step": 1856 + }, + { + "epoch": 0.16506666666666667, + "grad_norm": 615.0731811523438, + "learning_rate": 3e-06, + "loss": -36.3091, + "step": 1857 + }, + { + "epoch": 0.16515555555555556, + "grad_norm": 557.7341918945312, + "learning_rate": 3e-06, + "loss": -58.0938, + "step": 1858 + }, + { + "epoch": 0.16524444444444444, + "grad_norm": 575.3082885742188, + "learning_rate": 3e-06, + "loss": -56.4051, + "step": 1859 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 530.2061157226562, + "learning_rate": 3e-06, + "loss": -57.1385, + "step": 1860 + }, + { + "completion_length": 229.4791717529297, + "epoch": 0.16542222222222222, + "grad_norm": 290.0047912597656, + "learning_rate": 3e-06, + "loss": -26.4189, + "reward": 1.4375000596046448, + "reward_std": 0.3332236111164093, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.75, + "step": 1861, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.1655111111111111, + "grad_norm": 343.6731262207031, + "learning_rate": 3e-06, + "loss": -30.5266, + "step": 1862 + }, + { + "epoch": 0.1656, + "grad_norm": 321.04400634765625, + "learning_rate": 3e-06, + "loss": -26.9367, + "step": 1863 + }, + { + "epoch": 0.16568888888888889, + "grad_norm": 423.7695007324219, + "learning_rate": 3e-06, + "loss": -20.3331, + "step": 1864 + }, + { + "epoch": 0.16577777777777777, + "grad_norm": 301.9991760253906, + "learning_rate": 3e-06, + "loss": -22.0128, + "step": 1865 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 266.446533203125, + "learning_rate": 3e-06, + "loss": -13.6451, + "step": 1866 + }, + { + "epoch": 0.16595555555555555, + "grad_norm": 289.8598937988281, + "learning_rate": 3e-06, + "loss": -29.9844, + "step": 1867 + }, + { + "epoch": 0.16604444444444444, + "grad_norm": 474.86016845703125, + "learning_rate": 3e-06, + "loss": -32.2196, + "step": 1868 + }, + { + "epoch": 0.16613333333333333, + "grad_norm": 314.1653137207031, + "learning_rate": 3e-06, + "loss": -29.7642, + "step": 1869 + }, + { + "epoch": 0.16622222222222222, + "grad_norm": 362.4693603515625, + "learning_rate": 3e-06, + "loss": -24.4635, + "step": 1870 + }, + { + "epoch": 0.1663111111111111, + "grad_norm": 312.52569580078125, + "learning_rate": 3e-06, + "loss": -24.009, + "step": 1871 + }, + { + "epoch": 0.1664, + "grad_norm": 259.9459228515625, + "learning_rate": 3e-06, + "loss": -18.4052, + "step": 1872 + }, + { + "completion_length": 250.8541717529297, + "epoch": 0.16648888888888888, + "grad_norm": 374.1226501464844, + "learning_rate": 3e-06, + "loss": -4.5437, + "reward": 1.6666667461395264, + "reward_std": 0.20412414520978928, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.9166666865348816, + "step": 1873, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.16657777777777777, + "grad_norm": 501.4902038574219, + "learning_rate": 3e-06, + "loss": -16.285, + "step": 1874 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 511.9213562011719, + "learning_rate": 3e-06, + "loss": -5.965, + "step": 1875 + }, + { + "epoch": 0.16675555555555555, + "grad_norm": 377.8996276855469, + "learning_rate": 3e-06, + "loss": -11.742, + "step": 1876 + }, + { + "epoch": 0.16684444444444443, + "grad_norm": 275.3067321777344, + "learning_rate": 3e-06, + "loss": -2.8953, + "step": 1877 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 311.0989990234375, + "learning_rate": 3e-06, + "loss": 3.1096, + "step": 1878 + }, + { + "epoch": 0.1670222222222222, + "grad_norm": 408.4767761230469, + "learning_rate": 3e-06, + "loss": -5.1197, + "step": 1879 + }, + { + "epoch": 0.1671111111111111, + "grad_norm": 411.1174621582031, + "learning_rate": 3e-06, + "loss": -20.4177, + "step": 1880 + }, + { + "epoch": 0.1672, + "grad_norm": 432.32159423828125, + "learning_rate": 3e-06, + "loss": -14.0135, + "step": 1881 + }, + { + "epoch": 0.16728888888888888, + "grad_norm": 637.0897216796875, + "learning_rate": 3e-06, + "loss": -17.1271, + "step": 1882 + }, + { + "epoch": 0.16737777777777776, + "grad_norm": 277.6168212890625, + "learning_rate": 3e-06, + "loss": -7.087, + "step": 1883 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 364.540283203125, + "learning_rate": 3e-06, + "loss": 0.8844, + "step": 1884 + }, + { + "completion_length": 236.375, + "epoch": 0.16755555555555557, + "grad_norm": 518.0326538085938, + "learning_rate": 3e-06, + "loss": 7.3195, + "reward": 1.5208333730697632, + "reward_std": 0.3332235887646675, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8333333283662796, + "step": 1885, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.16764444444444446, + "grad_norm": 477.8092346191406, + "learning_rate": 3e-06, + "loss": 9.9487, + "step": 1886 + }, + { + "epoch": 0.16773333333333335, + "grad_norm": 573.36083984375, + "learning_rate": 3e-06, + "loss": -5.14, + "step": 1887 + }, + { + "epoch": 0.16782222222222223, + "grad_norm": 552.39599609375, + "learning_rate": 3e-06, + "loss": 8.7078, + "step": 1888 + }, + { + "epoch": 0.16791111111111112, + "grad_norm": 585.54296875, + "learning_rate": 3e-06, + "loss": 1.6238, + "step": 1889 + }, + { + "epoch": 0.168, + "grad_norm": 431.30364990234375, + "learning_rate": 3e-06, + "loss": 2.3952, + "step": 1890 + }, + { + "epoch": 0.1680888888888889, + "grad_norm": 878.414306640625, + "learning_rate": 3e-06, + "loss": 3.0619, + "step": 1891 + }, + { + "epoch": 0.1681777777777778, + "grad_norm": 494.9808044433594, + "learning_rate": 3e-06, + "loss": 5.923, + "step": 1892 + }, + { + "epoch": 0.16826666666666668, + "grad_norm": 434.52093505859375, + "learning_rate": 3e-06, + "loss": -13.3846, + "step": 1893 + }, + { + "epoch": 0.16835555555555556, + "grad_norm": 355.6122131347656, + "learning_rate": 3e-06, + "loss": 1.8179, + "step": 1894 + }, + { + "epoch": 0.16844444444444445, + "grad_norm": 613.9844970703125, + "learning_rate": 3e-06, + "loss": -2.4103, + "step": 1895 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 337.2001953125, + "learning_rate": 3e-06, + "loss": -8.9448, + "step": 1896 + }, + { + "completion_length": 253.52084350585938, + "epoch": 0.16862222222222223, + "grad_norm": 439.8132629394531, + "learning_rate": 3e-06, + "loss": -50.7881, + "reward": 1.291666716337204, + "reward_std": 0.5373477265238762, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.6666666716337204, + "step": 1897, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.16871111111111112, + "grad_norm": 414.8184509277344, + "learning_rate": 3e-06, + "loss": -54.9974, + "step": 1898 + }, + { + "epoch": 0.1688, + "grad_norm": 449.9412536621094, + "learning_rate": 3e-06, + "loss": -85.2961, + "step": 1899 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 588.9105224609375, + "learning_rate": 3e-06, + "loss": -72.8215, + "step": 1900 + }, + { + "epoch": 0.16897777777777778, + "grad_norm": 511.78790283203125, + "learning_rate": 3e-06, + "loss": -88.0678, + "step": 1901 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 514.567138671875, + "learning_rate": 3e-06, + "loss": -90.2521, + "step": 1902 + }, + { + "epoch": 0.16915555555555556, + "grad_norm": 394.65826416015625, + "learning_rate": 3e-06, + "loss": -56.8307, + "step": 1903 + }, + { + "epoch": 0.16924444444444445, + "grad_norm": 463.7818603515625, + "learning_rate": 3e-06, + "loss": -62.5249, + "step": 1904 + }, + { + "epoch": 0.16933333333333334, + "grad_norm": 579.9658813476562, + "learning_rate": 3e-06, + "loss": -90.807, + "step": 1905 + }, + { + "epoch": 0.16942222222222222, + "grad_norm": 578.1275024414062, + "learning_rate": 3e-06, + "loss": -82.3221, + "step": 1906 + }, + { + "epoch": 0.1695111111111111, + "grad_norm": 405.3597412109375, + "learning_rate": 3e-06, + "loss": -100.0349, + "step": 1907 + }, + { + "epoch": 0.1696, + "grad_norm": 545.5985107421875, + "learning_rate": 3e-06, + "loss": -101.4231, + "step": 1908 + }, + { + "completion_length": 233.64584350585938, + "epoch": 0.1696888888888889, + "grad_norm": 459.9978942871094, + "learning_rate": 3e-06, + "loss": -49.3001, + "reward": 1.7916667461395264, + "reward_std": 0.4701542258262634, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.0416666865348816, + "step": 1909, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.16977777777777778, + "grad_norm": 383.19635009765625, + "learning_rate": 3e-06, + "loss": -29.5879, + "step": 1910 + }, + { + "epoch": 0.16986666666666667, + "grad_norm": 451.4841003417969, + "learning_rate": 3e-06, + "loss": -53.6264, + "step": 1911 + }, + { + "epoch": 0.16995555555555555, + "grad_norm": 429.31640625, + "learning_rate": 3e-06, + "loss": -41.3605, + "step": 1912 + }, + { + "epoch": 0.17004444444444444, + "grad_norm": 476.54168701171875, + "learning_rate": 3e-06, + "loss": -33.034, + "step": 1913 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 411.1586608886719, + "learning_rate": 3e-06, + "loss": -33.1554, + "step": 1914 + }, + { + "epoch": 0.17022222222222222, + "grad_norm": 504.1165771484375, + "learning_rate": 3e-06, + "loss": -54.2275, + "step": 1915 + }, + { + "epoch": 0.1703111111111111, + "grad_norm": 430.4921569824219, + "learning_rate": 3e-06, + "loss": -36.929, + "step": 1916 + }, + { + "epoch": 0.1704, + "grad_norm": 488.67071533203125, + "learning_rate": 3e-06, + "loss": -59.2601, + "step": 1917 + }, + { + "epoch": 0.17048888888888888, + "grad_norm": 381.2705078125, + "learning_rate": 3e-06, + "loss": -46.7398, + "step": 1918 + }, + { + "epoch": 0.17057777777777777, + "grad_norm": 439.42071533203125, + "learning_rate": 3e-06, + "loss": -40.6781, + "step": 1919 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 468.06365966796875, + "learning_rate": 3e-06, + "loss": -39.7694, + "step": 1920 + }, + { + "completion_length": 248.89583587646484, + "epoch": 0.17075555555555555, + "grad_norm": 389.05523681640625, + "learning_rate": 3e-06, + "loss": -15.4944, + "reward": 1.2291667461395264, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5416666716337204, + "step": 1921, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.17084444444444444, + "grad_norm": 437.8026428222656, + "learning_rate": 3e-06, + "loss": -1.488, + "step": 1922 + }, + { + "epoch": 0.17093333333333333, + "grad_norm": 251.8334197998047, + "learning_rate": 3e-06, + "loss": -4.1204, + "step": 1923 + }, + { + "epoch": 0.17102222222222221, + "grad_norm": 547.3317260742188, + "learning_rate": 3e-06, + "loss": -28.7346, + "step": 1924 + }, + { + "epoch": 0.1711111111111111, + "grad_norm": 289.30096435546875, + "learning_rate": 3e-06, + "loss": -16.5796, + "step": 1925 + }, + { + "epoch": 0.1712, + "grad_norm": 335.68682861328125, + "learning_rate": 3e-06, + "loss": -26.3341, + "step": 1926 + }, + { + "epoch": 0.17128888888888888, + "grad_norm": 396.2536926269531, + "learning_rate": 3e-06, + "loss": -18.4726, + "step": 1927 + }, + { + "epoch": 0.17137777777777777, + "grad_norm": 490.247802734375, + "learning_rate": 3e-06, + "loss": -8.9601, + "step": 1928 + }, + { + "epoch": 0.17146666666666666, + "grad_norm": 386.6977844238281, + "learning_rate": 3e-06, + "loss": -11.4419, + "step": 1929 + }, + { + "epoch": 0.17155555555555554, + "grad_norm": 547.1292114257812, + "learning_rate": 3e-06, + "loss": -34.8083, + "step": 1930 + }, + { + "epoch": 0.17164444444444443, + "grad_norm": 253.44793701171875, + "learning_rate": 3e-06, + "loss": -20.5172, + "step": 1931 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 536.4385375976562, + "learning_rate": 3e-06, + "loss": -35.105, + "step": 1932 + }, + { + "completion_length": 254.0416717529297, + "epoch": 0.1718222222222222, + "grad_norm": 513.8276977539062, + "learning_rate": 3e-06, + "loss": 3.0418, + "reward": 1.625, + "reward_std": 0.39512956142425537, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.875, + "step": 1933, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.1719111111111111, + "grad_norm": 700.8008422851562, + "learning_rate": 3e-06, + "loss": -24.1725, + "step": 1934 + }, + { + "epoch": 0.172, + "grad_norm": 640.3589477539062, + "learning_rate": 3e-06, + "loss": -18.554, + "step": 1935 + }, + { + "epoch": 0.1720888888888889, + "grad_norm": 507.8769836425781, + "learning_rate": 3e-06, + "loss": -30.509, + "step": 1936 + }, + { + "epoch": 0.1721777777777778, + "grad_norm": 414.5351257324219, + "learning_rate": 3e-06, + "loss": -39.3267, + "step": 1937 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 445.7782897949219, + "learning_rate": 3e-06, + "loss": -25.2095, + "step": 1938 + }, + { + "epoch": 0.17235555555555557, + "grad_norm": 665.8450927734375, + "learning_rate": 3e-06, + "loss": -6.8353, + "step": 1939 + }, + { + "epoch": 0.17244444444444446, + "grad_norm": 558.6971435546875, + "learning_rate": 3e-06, + "loss": -33.1913, + "step": 1940 + }, + { + "epoch": 0.17253333333333334, + "grad_norm": 500.55841064453125, + "learning_rate": 3e-06, + "loss": -30.2144, + "step": 1941 + }, + { + "epoch": 0.17262222222222223, + "grad_norm": 530.8914184570312, + "learning_rate": 3e-06, + "loss": -38.4873, + "step": 1942 + }, + { + "epoch": 0.17271111111111112, + "grad_norm": 427.3404846191406, + "learning_rate": 3e-06, + "loss": -45.8584, + "step": 1943 + }, + { + "epoch": 0.1728, + "grad_norm": 391.1449279785156, + "learning_rate": 3e-06, + "loss": -29.2224, + "step": 1944 + }, + { + "completion_length": 255.1041717529297, + "epoch": 0.1728888888888889, + "grad_norm": 763.403076171875, + "learning_rate": 3e-06, + "loss": -9.7223, + "reward": 1.8333333730697632, + "reward_std": 0.4971916079521179, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.0833333134651184, + "step": 1945, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.17297777777777779, + "grad_norm": 562.3991088867188, + "learning_rate": 3e-06, + "loss": -7.2214, + "step": 1946 + }, + { + "epoch": 0.17306666666666667, + "grad_norm": 686.5684814453125, + "learning_rate": 3e-06, + "loss": 2.4798, + "step": 1947 + }, + { + "epoch": 0.17315555555555556, + "grad_norm": 897.48046875, + "learning_rate": 3e-06, + "loss": -0.4508, + "step": 1948 + }, + { + "epoch": 0.17324444444444445, + "grad_norm": 439.28924560546875, + "learning_rate": 3e-06, + "loss": -15.6095, + "step": 1949 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 501.46044921875, + "learning_rate": 3e-06, + "loss": -7.5293, + "step": 1950 + }, + { + "epoch": 0.17342222222222223, + "grad_norm": 659.4769287109375, + "learning_rate": 3e-06, + "loss": -14.7485, + "step": 1951 + }, + { + "epoch": 0.17351111111111112, + "grad_norm": 528.5435180664062, + "learning_rate": 3e-06, + "loss": -14.1578, + "step": 1952 + }, + { + "epoch": 0.1736, + "grad_norm": 827.1624145507812, + "learning_rate": 3e-06, + "loss": -3.8977, + "step": 1953 + }, + { + "epoch": 0.1736888888888889, + "grad_norm": 593.5472412109375, + "learning_rate": 3e-06, + "loss": -4.4025, + "step": 1954 + }, + { + "epoch": 0.17377777777777778, + "grad_norm": 444.0652160644531, + "learning_rate": 3e-06, + "loss": -18.2012, + "step": 1955 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 478.5816345214844, + "learning_rate": 3e-06, + "loss": -13.0243, + "step": 1956 + }, + { + "completion_length": 253.25000762939453, + "epoch": 0.17395555555555556, + "grad_norm": 375.89666748046875, + "learning_rate": 3e-06, + "loss": -12.1653, + "reward": 1.2291666865348816, + "reward_std": 0.10206207633018494, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 1957, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.17404444444444445, + "grad_norm": 902.545654296875, + "learning_rate": 3e-06, + "loss": -25.8535, + "step": 1958 + }, + { + "epoch": 0.17413333333333333, + "grad_norm": 629.4489135742188, + "learning_rate": 3e-06, + "loss": -22.1681, + "step": 1959 + }, + { + "epoch": 0.17422222222222222, + "grad_norm": 359.41644287109375, + "learning_rate": 3e-06, + "loss": -19.6096, + "step": 1960 + }, + { + "epoch": 0.1743111111111111, + "grad_norm": 356.7577819824219, + "learning_rate": 3e-06, + "loss": -12.0564, + "step": 1961 + }, + { + "epoch": 0.1744, + "grad_norm": 236.30433654785156, + "learning_rate": 3e-06, + "loss": -25.8836, + "step": 1962 + }, + { + "epoch": 0.1744888888888889, + "grad_norm": 340.1228332519531, + "learning_rate": 3e-06, + "loss": -14.5898, + "step": 1963 + }, + { + "epoch": 0.17457777777777778, + "grad_norm": 186.7640838623047, + "learning_rate": 3e-06, + "loss": -29.7451, + "step": 1964 + }, + { + "epoch": 0.17466666666666666, + "grad_norm": 411.6400451660156, + "learning_rate": 3e-06, + "loss": -20.199, + "step": 1965 + }, + { + "epoch": 0.17475555555555555, + "grad_norm": 319.7075500488281, + "learning_rate": 3e-06, + "loss": -22.8842, + "step": 1966 + }, + { + "epoch": 0.17484444444444444, + "grad_norm": 417.3407287597656, + "learning_rate": 3e-06, + "loss": -16.5058, + "step": 1967 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 243.57456970214844, + "learning_rate": 3e-06, + "loss": -31.0802, + "step": 1968 + }, + { + "completion_length": 249.93750762939453, + "epoch": 0.17502222222222222, + "grad_norm": 276.6730041503906, + "learning_rate": 3e-06, + "loss": -29.1006, + "reward": 1.3958333730697632, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.7083333432674408, + "step": 1969, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.1751111111111111, + "grad_norm": 699.74658203125, + "learning_rate": 3e-06, + "loss": -14.0976, + "step": 1970 + }, + { + "epoch": 0.1752, + "grad_norm": 471.5838623046875, + "learning_rate": 3e-06, + "loss": -20.9071, + "step": 1971 + }, + { + "epoch": 0.17528888888888888, + "grad_norm": 472.9868469238281, + "learning_rate": 3e-06, + "loss": -25.4522, + "step": 1972 + }, + { + "epoch": 0.17537777777777777, + "grad_norm": 254.69761657714844, + "learning_rate": 3e-06, + "loss": -29.7094, + "step": 1973 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 248.12869262695312, + "learning_rate": 3e-06, + "loss": -28.2889, + "step": 1974 + }, + { + "epoch": 0.17555555555555555, + "grad_norm": 236.1974639892578, + "learning_rate": 3e-06, + "loss": -32.1523, + "step": 1975 + }, + { + "epoch": 0.17564444444444444, + "grad_norm": 648.0961303710938, + "learning_rate": 3e-06, + "loss": -16.4427, + "step": 1976 + }, + { + "epoch": 0.17573333333333332, + "grad_norm": 378.50634765625, + "learning_rate": 3e-06, + "loss": -27.5418, + "step": 1977 + }, + { + "epoch": 0.1758222222222222, + "grad_norm": 439.670654296875, + "learning_rate": 3e-06, + "loss": -30.8163, + "step": 1978 + }, + { + "epoch": 0.1759111111111111, + "grad_norm": 216.43161010742188, + "learning_rate": 3e-06, + "loss": -34.2176, + "step": 1979 + }, + { + "epoch": 0.176, + "grad_norm": 242.0641632080078, + "learning_rate": 3e-06, + "loss": -31.2334, + "step": 1980 + }, + { + "completion_length": 248.2916717529297, + "epoch": 0.17608888888888888, + "grad_norm": 431.9725646972656, + "learning_rate": 3e-06, + "loss": -90.337, + "reward": 1.0208333730697632, + "reward_std": 0.37592336535453796, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.4583333432674408, + "step": 1981, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.17617777777777777, + "grad_norm": 421.96124267578125, + "learning_rate": 3e-06, + "loss": -74.4792, + "step": 1982 + }, + { + "epoch": 0.17626666666666665, + "grad_norm": 376.8736877441406, + "learning_rate": 3e-06, + "loss": -66.9567, + "step": 1983 + }, + { + "epoch": 0.17635555555555554, + "grad_norm": 489.1451110839844, + "learning_rate": 3e-06, + "loss": -61.4308, + "step": 1984 + }, + { + "epoch": 0.17644444444444443, + "grad_norm": 597.60498046875, + "learning_rate": 3e-06, + "loss": -58.1928, + "step": 1985 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 466.6503601074219, + "learning_rate": 3e-06, + "loss": -63.9999, + "step": 1986 + }, + { + "epoch": 0.17662222222222224, + "grad_norm": 420.9534606933594, + "learning_rate": 3e-06, + "loss": -98.6249, + "step": 1987 + }, + { + "epoch": 0.17671111111111112, + "grad_norm": 428.0660400390625, + "learning_rate": 3e-06, + "loss": -80.7811, + "step": 1988 + }, + { + "epoch": 0.1768, + "grad_norm": 381.4080505371094, + "learning_rate": 3e-06, + "loss": -75.1067, + "step": 1989 + }, + { + "epoch": 0.1768888888888889, + "grad_norm": 451.9751892089844, + "learning_rate": 3e-06, + "loss": -66.6592, + "step": 1990 + }, + { + "epoch": 0.1769777777777778, + "grad_norm": 526.015625, + "learning_rate": 3e-06, + "loss": -69.5291, + "step": 1991 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 532.5322265625, + "learning_rate": 3e-06, + "loss": -72.9259, + "step": 1992 + }, + { + "completion_length": 254.4166717529297, + "epoch": 0.17715555555555557, + "grad_norm": 836.5994873046875, + "learning_rate": 3e-06, + "loss": -56.0218, + "reward": 1.0625000298023224, + "reward_std": 0.6184598803520203, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.3749999925494194, + "step": 1993, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.17724444444444445, + "grad_norm": 828.1771240234375, + "learning_rate": 3e-06, + "loss": 7.2773, + "step": 1994 + }, + { + "epoch": 0.17733333333333334, + "grad_norm": 832.0421752929688, + "learning_rate": 3e-06, + "loss": -26.279, + "step": 1995 + }, + { + "epoch": 0.17742222222222223, + "grad_norm": 842.8486938476562, + "learning_rate": 3e-06, + "loss": -26.3621, + "step": 1996 + }, + { + "epoch": 0.17751111111111112, + "grad_norm": 641.175048828125, + "learning_rate": 3e-06, + "loss": -35.8716, + "step": 1997 + }, + { + "epoch": 0.1776, + "grad_norm": 621.6319580078125, + "learning_rate": 3e-06, + "loss": -46.1934, + "step": 1998 + }, + { + "epoch": 0.1776888888888889, + "grad_norm": 908.200927734375, + "learning_rate": 3e-06, + "loss": -65.1173, + "step": 1999 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 875.9900512695312, + "learning_rate": 3e-06, + "loss": 0.5691, + "step": 2000 + }, + { + "epoch": 0.17786666666666667, + "grad_norm": 801.6747436523438, + "learning_rate": 3e-06, + "loss": -35.2798, + "step": 2001 + }, + { + "epoch": 0.17795555555555556, + "grad_norm": 942.287353515625, + "learning_rate": 3e-06, + "loss": -34.8283, + "step": 2002 + }, + { + "epoch": 0.17804444444444445, + "grad_norm": 616.319091796875, + "learning_rate": 3e-06, + "loss": -39.5028, + "step": 2003 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 901.8914184570312, + "learning_rate": 3e-06, + "loss": -52.1208, + "step": 2004 + }, + { + "completion_length": 248.33333587646484, + "epoch": 0.17822222222222223, + "grad_norm": 706.6128540039062, + "learning_rate": 3e-06, + "loss": -29.2574, + "reward": 1.5416666865348816, + "reward_std": 0.6341222822666168, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7916666865348816, + "step": 2005, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.17831111111111111, + "grad_norm": 781.2979736328125, + "learning_rate": 3e-06, + "loss": -8.4374, + "step": 2006 + }, + { + "epoch": 0.1784, + "grad_norm": 799.8029174804688, + "learning_rate": 3e-06, + "loss": -62.7724, + "step": 2007 + }, + { + "epoch": 0.1784888888888889, + "grad_norm": 783.2274780273438, + "learning_rate": 3e-06, + "loss": -32.8635, + "step": 2008 + }, + { + "epoch": 0.17857777777777778, + "grad_norm": 657.0900268554688, + "learning_rate": 3e-06, + "loss": -6.8832, + "step": 2009 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 736.504638671875, + "learning_rate": 3e-06, + "loss": -34.957, + "step": 2010 + }, + { + "epoch": 0.17875555555555556, + "grad_norm": 723.53173828125, + "learning_rate": 3e-06, + "loss": -38.0641, + "step": 2011 + }, + { + "epoch": 0.17884444444444444, + "grad_norm": 789.1529541015625, + "learning_rate": 3e-06, + "loss": -14.8114, + "step": 2012 + }, + { + "epoch": 0.17893333333333333, + "grad_norm": 596.5152587890625, + "learning_rate": 3e-06, + "loss": -74.1531, + "step": 2013 + }, + { + "epoch": 0.17902222222222222, + "grad_norm": 761.294189453125, + "learning_rate": 3e-06, + "loss": -44.1236, + "step": 2014 + }, + { + "epoch": 0.1791111111111111, + "grad_norm": 669.9348754882812, + "learning_rate": 3e-06, + "loss": -13.3884, + "step": 2015 + }, + { + "epoch": 0.1792, + "grad_norm": 624.5435180664062, + "learning_rate": 3e-06, + "loss": -43.6932, + "step": 2016 + }, + { + "completion_length": 248.64584350585938, + "epoch": 0.1792888888888889, + "grad_norm": 277.96478271484375, + "learning_rate": 3e-06, + "loss": 27.6711, + "reward": 1.0625, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.375, + "step": 2017, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.17937777777777777, + "grad_norm": 346.2447814941406, + "learning_rate": 3e-06, + "loss": 23.3867, + "step": 2018 + }, + { + "epoch": 0.17946666666666666, + "grad_norm": 250.9003448486328, + "learning_rate": 3e-06, + "loss": 24.0388, + "step": 2019 + }, + { + "epoch": 0.17955555555555555, + "grad_norm": 308.0636291503906, + "learning_rate": 3e-06, + "loss": 22.8199, + "step": 2020 + }, + { + "epoch": 0.17964444444444444, + "grad_norm": 356.0393371582031, + "learning_rate": 3e-06, + "loss": 17.5979, + "step": 2021 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 350.8787841796875, + "learning_rate": 3e-06, + "loss": 21.2881, + "step": 2022 + }, + { + "epoch": 0.17982222222222222, + "grad_norm": 290.56292724609375, + "learning_rate": 3e-06, + "loss": 24.3076, + "step": 2023 + }, + { + "epoch": 0.1799111111111111, + "grad_norm": 291.3890075683594, + "learning_rate": 3e-06, + "loss": 19.0537, + "step": 2024 + }, + { + "epoch": 0.18, + "grad_norm": 249.847412109375, + "learning_rate": 3e-06, + "loss": 20.9012, + "step": 2025 + }, + { + "epoch": 0.18008888888888888, + "grad_norm": 343.4595031738281, + "learning_rate": 3e-06, + "loss": 17.5784, + "step": 2026 + }, + { + "epoch": 0.18017777777777777, + "grad_norm": 551.070068359375, + "learning_rate": 3e-06, + "loss": 11.5671, + "step": 2027 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 353.6826171875, + "learning_rate": 3e-06, + "loss": 13.8267, + "step": 2028 + }, + { + "completion_length": 252.8541717529297, + "epoch": 0.18035555555555555, + "grad_norm": 691.8173828125, + "learning_rate": 3e-06, + "loss": 19.1562, + "reward": 1.3958333730697632, + "reward_std": 0.505022794008255, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.7083333134651184, + "step": 2029, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.18044444444444444, + "grad_norm": 769.9490356445312, + "learning_rate": 3e-06, + "loss": -6.2182, + "step": 2030 + }, + { + "epoch": 0.18053333333333332, + "grad_norm": 746.6094360351562, + "learning_rate": 3e-06, + "loss": 26.2743, + "step": 2031 + }, + { + "epoch": 0.1806222222222222, + "grad_norm": 538.3868408203125, + "learning_rate": 3e-06, + "loss": 41.1944, + "step": 2032 + }, + { + "epoch": 0.1807111111111111, + "grad_norm": 1304.790771484375, + "learning_rate": 3e-06, + "loss": -12.3747, + "step": 2033 + }, + { + "epoch": 0.1808, + "grad_norm": 551.8775634765625, + "learning_rate": 3e-06, + "loss": 18.321, + "step": 2034 + }, + { + "epoch": 0.18088888888888888, + "grad_norm": 639.1527709960938, + "learning_rate": 3e-06, + "loss": 9.5878, + "step": 2035 + }, + { + "epoch": 0.18097777777777777, + "grad_norm": 659.2324829101562, + "learning_rate": 3e-06, + "loss": -12.4579, + "step": 2036 + }, + { + "epoch": 0.18106666666666665, + "grad_norm": 709.0194702148438, + "learning_rate": 3e-06, + "loss": 14.4976, + "step": 2037 + }, + { + "epoch": 0.18115555555555554, + "grad_norm": 522.2162475585938, + "learning_rate": 3e-06, + "loss": 31.9036, + "step": 2038 + }, + { + "epoch": 0.18124444444444446, + "grad_norm": 605.5569458007812, + "learning_rate": 3e-06, + "loss": -16.3849, + "step": 2039 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 586.4031982421875, + "learning_rate": 3e-06, + "loss": 7.3009, + "step": 2040 + }, + { + "completion_length": 255.25, + "epoch": 0.18142222222222223, + "grad_norm": 294.46368408203125, + "learning_rate": 3e-06, + "loss": -46.1645, + "reward": 1.1666666865348816, + "reward_std": 0.26603010296821594, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 2041, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.18151111111111112, + "grad_norm": 281.5460510253906, + "learning_rate": 3e-06, + "loss": -45.9692, + "step": 2042 + }, + { + "epoch": 0.1816, + "grad_norm": 426.70758056640625, + "learning_rate": 3e-06, + "loss": -59.0595, + "step": 2043 + }, + { + "epoch": 0.1816888888888889, + "grad_norm": 379.1589660644531, + "learning_rate": 3e-06, + "loss": -48.9399, + "step": 2044 + }, + { + "epoch": 0.1817777777777778, + "grad_norm": 372.3189697265625, + "learning_rate": 3e-06, + "loss": -59.5334, + "step": 2045 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 316.384765625, + "learning_rate": 3e-06, + "loss": -42.7701, + "step": 2046 + }, + { + "epoch": 0.18195555555555556, + "grad_norm": 333.133056640625, + "learning_rate": 3e-06, + "loss": -52.4876, + "step": 2047 + }, + { + "epoch": 0.18204444444444445, + "grad_norm": 302.69488525390625, + "learning_rate": 3e-06, + "loss": -50.558, + "step": 2048 + }, + { + "epoch": 0.18213333333333334, + "grad_norm": 297.52264404296875, + "learning_rate": 3e-06, + "loss": -65.7042, + "step": 2049 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 390.93719482421875, + "learning_rate": 3e-06, + "loss": -56.6311, + "step": 2050 + }, + { + "epoch": 0.18231111111111112, + "grad_norm": 510.16064453125, + "learning_rate": 3e-06, + "loss": -66.4786, + "step": 2051 + }, + { + "epoch": 0.1824, + "grad_norm": 386.4976501464844, + "learning_rate": 3e-06, + "loss": -55.7261, + "step": 2052 + }, + { + "completion_length": 231.77083587646484, + "epoch": 0.1824888888888889, + "grad_norm": 214.66683959960938, + "learning_rate": 3e-06, + "loss": -13.3666, + "reward": 1.0208333730697632, + "reward_std": 0.10206207633018494, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.4583333283662796, + "step": 2053, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.18257777777777778, + "grad_norm": 241.9335174560547, + "learning_rate": 3e-06, + "loss": -16.4601, + "step": 2054 + }, + { + "epoch": 0.18266666666666667, + "grad_norm": 227.05535888671875, + "learning_rate": 3e-06, + "loss": -15.4358, + "step": 2055 + }, + { + "epoch": 0.18275555555555556, + "grad_norm": 280.6861267089844, + "learning_rate": 3e-06, + "loss": -19.7707, + "step": 2056 + }, + { + "epoch": 0.18284444444444445, + "grad_norm": 211.3414306640625, + "learning_rate": 3e-06, + "loss": -11.4767, + "step": 2057 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 332.11248779296875, + "learning_rate": 3e-06, + "loss": -11.748, + "step": 2058 + }, + { + "epoch": 0.18302222222222223, + "grad_norm": 210.33470153808594, + "learning_rate": 3e-06, + "loss": -16.0377, + "step": 2059 + }, + { + "epoch": 0.1831111111111111, + "grad_norm": 230.14593505859375, + "learning_rate": 3e-06, + "loss": -19.6348, + "step": 2060 + }, + { + "epoch": 0.1832, + "grad_norm": 215.13331604003906, + "learning_rate": 3e-06, + "loss": -19.3665, + "step": 2061 + }, + { + "epoch": 0.1832888888888889, + "grad_norm": 401.6134338378906, + "learning_rate": 3e-06, + "loss": -22.9875, + "step": 2062 + }, + { + "epoch": 0.18337777777777778, + "grad_norm": 223.34193420410156, + "learning_rate": 3e-06, + "loss": -16.2301, + "step": 2063 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 241.48159790039062, + "learning_rate": 3e-06, + "loss": -17.0173, + "step": 2064 + }, + { + "completion_length": 250.12500762939453, + "epoch": 0.18355555555555556, + "grad_norm": 783.0624389648438, + "learning_rate": 3e-06, + "loss": -7.0804, + "reward": 1.2500000596046448, + "reward_std": 0.3332235962152481, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5000000111758709, + "step": 2065, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.18364444444444444, + "grad_norm": 456.5368957519531, + "learning_rate": 3e-06, + "loss": -5.7073, + "step": 2066 + }, + { + "epoch": 0.18373333333333333, + "grad_norm": 801.0426635742188, + "learning_rate": 3e-06, + "loss": 24.9016, + "step": 2067 + }, + { + "epoch": 0.18382222222222222, + "grad_norm": 452.8072204589844, + "learning_rate": 3e-06, + "loss": -16.3536, + "step": 2068 + }, + { + "epoch": 0.1839111111111111, + "grad_norm": 573.7277221679688, + "learning_rate": 3e-06, + "loss": -14.2969, + "step": 2069 + }, + { + "epoch": 0.184, + "grad_norm": 490.78375244140625, + "learning_rate": 3e-06, + "loss": -29.7209, + "step": 2070 + }, + { + "epoch": 0.18408888888888889, + "grad_norm": 967.42578125, + "learning_rate": 3e-06, + "loss": -11.7546, + "step": 2071 + }, + { + "epoch": 0.18417777777777777, + "grad_norm": 446.62945556640625, + "learning_rate": 3e-06, + "loss": -11.8012, + "step": 2072 + }, + { + "epoch": 0.18426666666666666, + "grad_norm": 586.0079345703125, + "learning_rate": 3e-06, + "loss": 16.1318, + "step": 2073 + }, + { + "epoch": 0.18435555555555555, + "grad_norm": 483.7483825683594, + "learning_rate": 3e-06, + "loss": -22.5055, + "step": 2074 + }, + { + "epoch": 0.18444444444444444, + "grad_norm": 552.7943725585938, + "learning_rate": 3e-06, + "loss": -23.8956, + "step": 2075 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 516.2462768554688, + "learning_rate": 3e-06, + "loss": -36.5617, + "step": 2076 + }, + { + "completion_length": 245.45833587646484, + "epoch": 0.18462222222222222, + "grad_norm": 526.9989013671875, + "learning_rate": 3e-06, + "loss": -1.3188, + "reward": 1.8541667461395264, + "reward_std": 0.3872983753681183, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.1666666269302368, + "step": 2077, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.1847111111111111, + "grad_norm": 695.4895629882812, + "learning_rate": 3e-06, + "loss": -7.7195, + "step": 2078 + }, + { + "epoch": 0.1848, + "grad_norm": 732.2682495117188, + "learning_rate": 3e-06, + "loss": -3.051, + "step": 2079 + }, + { + "epoch": 0.18488888888888888, + "grad_norm": 717.4251098632812, + "learning_rate": 3e-06, + "loss": 6.4156, + "step": 2080 + }, + { + "epoch": 0.18497777777777777, + "grad_norm": 588.271484375, + "learning_rate": 3e-06, + "loss": -1.8704, + "step": 2081 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 563.7857055664062, + "learning_rate": 3e-06, + "loss": 5.8266, + "step": 2082 + }, + { + "epoch": 0.18515555555555555, + "grad_norm": 523.8809814453125, + "learning_rate": 3e-06, + "loss": -6.2269, + "step": 2083 + }, + { + "epoch": 0.18524444444444443, + "grad_norm": 566.2587890625, + "learning_rate": 3e-06, + "loss": -11.1423, + "step": 2084 + }, + { + "epoch": 0.18533333333333332, + "grad_norm": 575.639892578125, + "learning_rate": 3e-06, + "loss": -8.9986, + "step": 2085 + }, + { + "epoch": 0.1854222222222222, + "grad_norm": 607.2640380859375, + "learning_rate": 3e-06, + "loss": 0.5698, + "step": 2086 + }, + { + "epoch": 0.1855111111111111, + "grad_norm": 685.4660034179688, + "learning_rate": 3e-06, + "loss": -13.8418, + "step": 2087 + }, + { + "epoch": 0.1856, + "grad_norm": 1059.2657470703125, + "learning_rate": 3e-06, + "loss": -3.093, + "step": 2088 + }, + { + "completion_length": 243.375, + "epoch": 0.18568888888888888, + "grad_norm": 533.3504638671875, + "learning_rate": 3e-06, + "loss": -2.4458, + "reward": 1.7916667461395264, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.0416666567325592, + "step": 2089, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.18577777777777776, + "grad_norm": 466.76678466796875, + "learning_rate": 3e-06, + "loss": -25.5447, + "step": 2090 + }, + { + "epoch": 0.18586666666666668, + "grad_norm": 482.4854736328125, + "learning_rate": 3e-06, + "loss": -11.9751, + "step": 2091 + }, + { + "epoch": 0.18595555555555557, + "grad_norm": 617.836669921875, + "learning_rate": 3e-06, + "loss": -16.149, + "step": 2092 + }, + { + "epoch": 0.18604444444444446, + "grad_norm": 806.7719116210938, + "learning_rate": 3e-06, + "loss": -12.1531, + "step": 2093 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 436.9642333984375, + "learning_rate": 3e-06, + "loss": -20.6401, + "step": 2094 + }, + { + "epoch": 0.18622222222222223, + "grad_norm": 533.0576171875, + "learning_rate": 3e-06, + "loss": -6.39, + "step": 2095 + }, + { + "epoch": 0.18631111111111112, + "grad_norm": 578.9844360351562, + "learning_rate": 3e-06, + "loss": -28.3945, + "step": 2096 + }, + { + "epoch": 0.1864, + "grad_norm": 518.856201171875, + "learning_rate": 3e-06, + "loss": -18.442, + "step": 2097 + }, + { + "epoch": 0.1864888888888889, + "grad_norm": 681.1263427734375, + "learning_rate": 3e-06, + "loss": -22.9392, + "step": 2098 + }, + { + "epoch": 0.1865777777777778, + "grad_norm": 759.8504028320312, + "learning_rate": 3e-06, + "loss": -19.4906, + "step": 2099 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 497.5691833496094, + "learning_rate": 3e-06, + "loss": -25.688, + "step": 2100 + }, + { + "completion_length": 247.7291717529297, + "epoch": 0.18675555555555556, + "grad_norm": 766.0548706054688, + "learning_rate": 3e-06, + "loss": 9.0542, + "reward": 1.145833358168602, + "reward_std": 0.20412415266036987, + "rewards/boxed_and_answer_tags_format_reward": 0.5625, + "rewards/correctness_reward_func_math": 0.5833333544433117, + "step": 2101, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.18684444444444445, + "grad_norm": 274.5552062988281, + "learning_rate": 3e-06, + "loss": -14.4338, + "step": 2102 + }, + { + "epoch": 0.18693333333333334, + "grad_norm": 357.243896484375, + "learning_rate": 3e-06, + "loss": -3.3275, + "step": 2103 + }, + { + "epoch": 0.18702222222222223, + "grad_norm": 349.6654968261719, + "learning_rate": 3e-06, + "loss": -3.714, + "step": 2104 + }, + { + "epoch": 0.18711111111111112, + "grad_norm": 252.651611328125, + "learning_rate": 3e-06, + "loss": 4.8234, + "step": 2105 + }, + { + "epoch": 0.1872, + "grad_norm": 422.1712951660156, + "learning_rate": 3e-06, + "loss": 2.0141, + "step": 2106 + }, + { + "epoch": 0.1872888888888889, + "grad_norm": 848.9307250976562, + "learning_rate": 3e-06, + "loss": 1.1462, + "step": 2107 + }, + { + "epoch": 0.18737777777777778, + "grad_norm": 255.5582275390625, + "learning_rate": 3e-06, + "loss": -17.6895, + "step": 2108 + }, + { + "epoch": 0.18746666666666667, + "grad_norm": 307.6992492675781, + "learning_rate": 3e-06, + "loss": -9.5895, + "step": 2109 + }, + { + "epoch": 0.18755555555555556, + "grad_norm": 277.8653259277344, + "learning_rate": 3e-06, + "loss": -9.602, + "step": 2110 + }, + { + "epoch": 0.18764444444444445, + "grad_norm": 234.34913635253906, + "learning_rate": 3e-06, + "loss": 0.8899, + "step": 2111 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 355.8470764160156, + "learning_rate": 3e-06, + "loss": -4.6677, + "step": 2112 + }, + { + "completion_length": 250.89583587646484, + "epoch": 0.18782222222222222, + "grad_norm": 810.5846557617188, + "learning_rate": 3e-06, + "loss": 15.7377, + "reward": 1.125, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.375, + "step": 2113, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.1879111111111111, + "grad_norm": 509.224853515625, + "learning_rate": 3e-06, + "loss": 18.5617, + "step": 2114 + }, + { + "epoch": 0.188, + "grad_norm": 530.5845336914062, + "learning_rate": 3e-06, + "loss": 7.082, + "step": 2115 + }, + { + "epoch": 0.1880888888888889, + "grad_norm": 672.1024169921875, + "learning_rate": 3e-06, + "loss": 1.3842, + "step": 2116 + }, + { + "epoch": 0.18817777777777778, + "grad_norm": 408.4438171386719, + "learning_rate": 3e-06, + "loss": 16.7967, + "step": 2117 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 569.5597534179688, + "learning_rate": 3e-06, + "loss": 15.6526, + "step": 2118 + }, + { + "epoch": 0.18835555555555555, + "grad_norm": 613.7771606445312, + "learning_rate": 3e-06, + "loss": 11.1284, + "step": 2119 + }, + { + "epoch": 0.18844444444444444, + "grad_norm": 443.5073547363281, + "learning_rate": 3e-06, + "loss": 13.1326, + "step": 2120 + }, + { + "epoch": 0.18853333333333333, + "grad_norm": 745.5543823242188, + "learning_rate": 3e-06, + "loss": 3.177, + "step": 2121 + }, + { + "epoch": 0.18862222222222222, + "grad_norm": 776.3263549804688, + "learning_rate": 3e-06, + "loss": -4.0693, + "step": 2122 + }, + { + "epoch": 0.1887111111111111, + "grad_norm": 439.9760437011719, + "learning_rate": 3e-06, + "loss": 14.0792, + "step": 2123 + }, + { + "epoch": 0.1888, + "grad_norm": 620.5515747070312, + "learning_rate": 3e-06, + "loss": 10.3075, + "step": 2124 + }, + { + "completion_length": 255.89583587646484, + "epoch": 0.18888888888888888, + "grad_norm": 1261.142578125, + "learning_rate": 3e-06, + "loss": -52.8242, + "reward": 1.5416666865348816, + "reward_std": 0.7283531129360199, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.7916666567325592, + "step": 2125, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.18897777777777777, + "grad_norm": 828.8499755859375, + "learning_rate": 3e-06, + "loss": -18.1417, + "step": 2126 + }, + { + "epoch": 0.18906666666666666, + "grad_norm": 921.6665649414062, + "learning_rate": 3e-06, + "loss": -28.653, + "step": 2127 + }, + { + "epoch": 0.18915555555555555, + "grad_norm": 1188.288330078125, + "learning_rate": 3e-06, + "loss": -14.6245, + "step": 2128 + }, + { + "epoch": 0.18924444444444444, + "grad_norm": 875.7568969726562, + "learning_rate": 3e-06, + "loss": -19.076, + "step": 2129 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 935.9678955078125, + "learning_rate": 3e-06, + "loss": -30.5958, + "step": 2130 + }, + { + "epoch": 0.18942222222222221, + "grad_norm": 933.6688842773438, + "learning_rate": 3e-06, + "loss": -56.0788, + "step": 2131 + }, + { + "epoch": 0.1895111111111111, + "grad_norm": 1004.11572265625, + "learning_rate": 3e-06, + "loss": -27.5338, + "step": 2132 + }, + { + "epoch": 0.1896, + "grad_norm": 805.9441528320312, + "learning_rate": 3e-06, + "loss": -38.4037, + "step": 2133 + }, + { + "epoch": 0.18968888888888888, + "grad_norm": 1125.8046875, + "learning_rate": 3e-06, + "loss": -21.7139, + "step": 2134 + }, + { + "epoch": 0.18977777777777777, + "grad_norm": 892.211181640625, + "learning_rate": 3e-06, + "loss": -29.3145, + "step": 2135 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 895.8474731445312, + "learning_rate": 3e-06, + "loss": -38.8515, + "step": 2136 + }, + { + "completion_length": 241.06250762939453, + "epoch": 0.18995555555555554, + "grad_norm": 587.2543334960938, + "learning_rate": 3e-06, + "loss": -30.24, + "reward": 1.7083333730697632, + "reward_std": 0.5128540322184563, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 1.0833333134651184, + "step": 2137, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.19004444444444443, + "grad_norm": 483.7323913574219, + "learning_rate": 3e-06, + "loss": -54.7314, + "step": 2138 + }, + { + "epoch": 0.19013333333333332, + "grad_norm": 695.5252075195312, + "learning_rate": 3e-06, + "loss": -37.2405, + "step": 2139 + }, + { + "epoch": 0.1902222222222222, + "grad_norm": 659.273681640625, + "learning_rate": 3e-06, + "loss": -54.6989, + "step": 2140 + }, + { + "epoch": 0.1903111111111111, + "grad_norm": 665.203857421875, + "learning_rate": 3e-06, + "loss": -49.4977, + "step": 2141 + }, + { + "epoch": 0.1904, + "grad_norm": 570.3987426757812, + "learning_rate": 3e-06, + "loss": -29.7209, + "step": 2142 + }, + { + "epoch": 0.1904888888888889, + "grad_norm": 543.2655029296875, + "learning_rate": 3e-06, + "loss": -35.8428, + "step": 2143 + }, + { + "epoch": 0.1905777777777778, + "grad_norm": 810.827880859375, + "learning_rate": 3e-06, + "loss": -59.7728, + "step": 2144 + }, + { + "epoch": 0.19066666666666668, + "grad_norm": 702.2298583984375, + "learning_rate": 3e-06, + "loss": -49.0496, + "step": 2145 + }, + { + "epoch": 0.19075555555555557, + "grad_norm": 575.1386108398438, + "learning_rate": 3e-06, + "loss": -64.258, + "step": 2146 + }, + { + "epoch": 0.19084444444444446, + "grad_norm": 646.1061401367188, + "learning_rate": 3e-06, + "loss": -60.6597, + "step": 2147 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 583.6048583984375, + "learning_rate": 3e-06, + "loss": -42.0407, + "step": 2148 + }, + { + "completion_length": 230.43750762939453, + "epoch": 0.19102222222222223, + "grad_norm": 553.2525024414062, + "learning_rate": 3e-06, + "loss": 31.5824, + "reward": 1.2083333730697632, + "reward_std": 0.46232303977012634, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.5833333283662796, + "step": 2149, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 672.8676147460938, + "learning_rate": 3e-06, + "loss": 26.2286, + "step": 2150 + }, + { + "epoch": 0.1912, + "grad_norm": 594.0144653320312, + "learning_rate": 3e-06, + "loss": -2.6153, + "step": 2151 + }, + { + "epoch": 0.1912888888888889, + "grad_norm": 828.8351440429688, + "learning_rate": 3e-06, + "loss": 2.5627, + "step": 2152 + }, + { + "epoch": 0.19137777777777779, + "grad_norm": 601.149658203125, + "learning_rate": 3e-06, + "loss": -1.2538, + "step": 2153 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 685.7401123046875, + "learning_rate": 3e-06, + "loss": 8.7495, + "step": 2154 + }, + { + "epoch": 0.19155555555555556, + "grad_norm": 561.976318359375, + "learning_rate": 3e-06, + "loss": 22.0952, + "step": 2155 + }, + { + "epoch": 0.19164444444444445, + "grad_norm": 583.328369140625, + "learning_rate": 3e-06, + "loss": 15.3746, + "step": 2156 + }, + { + "epoch": 0.19173333333333334, + "grad_norm": 495.55609130859375, + "learning_rate": 3e-06, + "loss": -9.5814, + "step": 2157 + }, + { + "epoch": 0.19182222222222223, + "grad_norm": 764.2197265625, + "learning_rate": 3e-06, + "loss": -2.7963, + "step": 2158 + }, + { + "epoch": 0.19191111111111112, + "grad_norm": 2350.23779296875, + "learning_rate": 3e-06, + "loss": -8.0366, + "step": 2159 + }, + { + "epoch": 0.192, + "grad_norm": 904.4937744140625, + "learning_rate": 3e-06, + "loss": -6.0055, + "step": 2160 + }, + { + "completion_length": 247.93750762939453, + "epoch": 0.1920888888888889, + "grad_norm": 899.4290771484375, + "learning_rate": 3e-06, + "loss": 51.9844, + "reward": 1.5625000596046448, + "reward_std": 0.5050228163599968, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8749999701976776, + "step": 2161, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.19217777777777778, + "grad_norm": 936.0744018554688, + "learning_rate": 3e-06, + "loss": 24.5592, + "step": 2162 + }, + { + "epoch": 0.19226666666666667, + "grad_norm": 865.0958251953125, + "learning_rate": 3e-06, + "loss": 4.2507, + "step": 2163 + }, + { + "epoch": 0.19235555555555556, + "grad_norm": 819.485107421875, + "learning_rate": 3e-06, + "loss": -17.4293, + "step": 2164 + }, + { + "epoch": 0.19244444444444445, + "grad_norm": 823.7230224609375, + "learning_rate": 3e-06, + "loss": -19.1261, + "step": 2165 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 1002.2861328125, + "learning_rate": 3e-06, + "loss": -19.0886, + "step": 2166 + }, + { + "epoch": 0.19262222222222222, + "grad_norm": 926.2667236328125, + "learning_rate": 3e-06, + "loss": 38.8715, + "step": 2167 + }, + { + "epoch": 0.1927111111111111, + "grad_norm": 890.3988037109375, + "learning_rate": 3e-06, + "loss": 16.1366, + "step": 2168 + }, + { + "epoch": 0.1928, + "grad_norm": 863.7974853515625, + "learning_rate": 3e-06, + "loss": -6.0243, + "step": 2169 + }, + { + "epoch": 0.1928888888888889, + "grad_norm": 675.4772338867188, + "learning_rate": 3e-06, + "loss": -29.941, + "step": 2170 + }, + { + "epoch": 0.19297777777777778, + "grad_norm": 795.9426879882812, + "learning_rate": 3e-06, + "loss": -37.6626, + "step": 2171 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 1173.4658203125, + "learning_rate": 3e-06, + "loss": -42.7799, + "step": 2172 + }, + { + "completion_length": 255.27083587646484, + "epoch": 0.19315555555555555, + "grad_norm": 1052.441650390625, + "learning_rate": 3e-06, + "loss": 17.4183, + "reward": 1.75, + "reward_std": 0.720521941781044, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.0, + "step": 2173, + "zero_std_ratio": 0.25 + }, + { + "epoch": 0.19324444444444444, + "grad_norm": 1221.40673828125, + "learning_rate": 3e-06, + "loss": 10.0883, + "step": 2174 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 733.4461059570312, + "learning_rate": 3e-06, + "loss": 7.5639, + "step": 2175 + }, + { + "epoch": 0.19342222222222222, + "grad_norm": 981.289794921875, + "learning_rate": 3e-06, + "loss": 32.1803, + "step": 2176 + }, + { + "epoch": 0.1935111111111111, + "grad_norm": 1169.4273681640625, + "learning_rate": 3e-06, + "loss": 38.9057, + "step": 2177 + }, + { + "epoch": 0.1936, + "grad_norm": 1102.001220703125, + "learning_rate": 3e-06, + "loss": 11.3484, + "step": 2178 + }, + { + "epoch": 0.19368888888888888, + "grad_norm": 992.0403442382812, + "learning_rate": 3e-06, + "loss": 17.4565, + "step": 2179 + }, + { + "epoch": 0.19377777777777777, + "grad_norm": 1574.7171630859375, + "learning_rate": 3e-06, + "loss": -4.7767, + "step": 2180 + }, + { + "epoch": 0.19386666666666666, + "grad_norm": 713.719482421875, + "learning_rate": 3e-06, + "loss": -2.3293, + "step": 2181 + }, + { + "epoch": 0.19395555555555555, + "grad_norm": 999.0922241210938, + "learning_rate": 3e-06, + "loss": 20.7797, + "step": 2182 + }, + { + "epoch": 0.19404444444444444, + "grad_norm": 1164.4508056640625, + "learning_rate": 3e-06, + "loss": 22.3378, + "step": 2183 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 1133.1566162109375, + "learning_rate": 3e-06, + "loss": -3.4018, + "step": 2184 + }, + { + "completion_length": 246.64584350585938, + "epoch": 0.1942222222222222, + "grad_norm": 668.209716796875, + "learning_rate": 3e-06, + "loss": -9.4869, + "reward": 1.7083333730697632, + "reward_std": 0.3061862215399742, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.9583333432674408, + "step": 2185, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.1943111111111111, + "grad_norm": 433.6723937988281, + "learning_rate": 3e-06, + "loss": -11.4995, + "step": 2186 + }, + { + "epoch": 0.1944, + "grad_norm": 682.7325439453125, + "learning_rate": 3e-06, + "loss": -28.6521, + "step": 2187 + }, + { + "epoch": 0.19448888888888888, + "grad_norm": 636.8472900390625, + "learning_rate": 3e-06, + "loss": -13.4065, + "step": 2188 + }, + { + "epoch": 0.19457777777777777, + "grad_norm": 447.03173828125, + "learning_rate": 3e-06, + "loss": -18.6822, + "step": 2189 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 855.8515014648438, + "learning_rate": 3e-06, + "loss": -18.9689, + "step": 2190 + }, + { + "epoch": 0.19475555555555554, + "grad_norm": 1044.5152587890625, + "learning_rate": 3e-06, + "loss": -20.745, + "step": 2191 + }, + { + "epoch": 0.19484444444444443, + "grad_norm": 422.6670837402344, + "learning_rate": 3e-06, + "loss": -16.4546, + "step": 2192 + }, + { + "epoch": 0.19493333333333332, + "grad_norm": 698.4715576171875, + "learning_rate": 3e-06, + "loss": -37.7735, + "step": 2193 + }, + { + "epoch": 0.19502222222222224, + "grad_norm": 689.1241455078125, + "learning_rate": 3e-06, + "loss": -24.1425, + "step": 2194 + }, + { + "epoch": 0.19511111111111112, + "grad_norm": 521.723876953125, + "learning_rate": 3e-06, + "loss": -25.6598, + "step": 2195 + }, + { + "epoch": 0.1952, + "grad_norm": 641.6820678710938, + "learning_rate": 3e-06, + "loss": -27.7095, + "step": 2196 + }, + { + "completion_length": 250.18750762939453, + "epoch": 0.1952888888888889, + "grad_norm": 695.4179077148438, + "learning_rate": 3e-06, + "loss": -23.0851, + "reward": 1.5625000596046448, + "reward_std": 0.23116153478622437, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8750000149011612, + "step": 2197, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.1953777777777778, + "grad_norm": 606.8045654296875, + "learning_rate": 3e-06, + "loss": 1.2567, + "step": 2198 + }, + { + "epoch": 0.19546666666666668, + "grad_norm": 742.4933471679688, + "learning_rate": 3e-06, + "loss": 1.28, + "step": 2199 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 555.1710205078125, + "learning_rate": 3e-06, + "loss": -2.6916, + "step": 2200 + }, + { + "epoch": 0.19564444444444445, + "grad_norm": 658.5838012695312, + "learning_rate": 3e-06, + "loss": -3.5826, + "step": 2201 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 481.04693603515625, + "learning_rate": 3e-06, + "loss": -9.7868, + "step": 2202 + }, + { + "epoch": 0.19582222222222223, + "grad_norm": 583.1636352539062, + "learning_rate": 3e-06, + "loss": -26.7921, + "step": 2203 + }, + { + "epoch": 0.19591111111111112, + "grad_norm": 582.9187622070312, + "learning_rate": 3e-06, + "loss": -1.0781, + "step": 2204 + }, + { + "epoch": 0.196, + "grad_norm": 660.7078247070312, + "learning_rate": 3e-06, + "loss": -2.465, + "step": 2205 + }, + { + "epoch": 0.1960888888888889, + "grad_norm": 522.7738647460938, + "learning_rate": 3e-06, + "loss": -3.8803, + "step": 2206 + }, + { + "epoch": 0.19617777777777778, + "grad_norm": 1109.6839599609375, + "learning_rate": 3e-06, + "loss": -13.524, + "step": 2207 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 588.8282470703125, + "learning_rate": 3e-06, + "loss": -12.7292, + "step": 2208 + }, + { + "completion_length": 244.9791717529297, + "epoch": 0.19635555555555556, + "grad_norm": 403.5253601074219, + "learning_rate": 3e-06, + "loss": -24.4641, + "reward": 1.4895833730697632, + "reward_std": 0.20219219475984573, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.75, + "step": 2209, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.19644444444444445, + "grad_norm": 545.7481079101562, + "learning_rate": 3e-06, + "loss": -28.5028, + "step": 2210 + }, + { + "epoch": 0.19653333333333334, + "grad_norm": 737.4202270507812, + "learning_rate": 3e-06, + "loss": -21.5566, + "step": 2211 + }, + { + "epoch": 0.19662222222222223, + "grad_norm": 415.2235412597656, + "learning_rate": 3e-06, + "loss": -19.7415, + "step": 2212 + }, + { + "epoch": 0.19671111111111111, + "grad_norm": 661.0859375, + "learning_rate": 3e-06, + "loss": -27.5604, + "step": 2213 + }, + { + "epoch": 0.1968, + "grad_norm": 688.0073852539062, + "learning_rate": 3e-06, + "loss": -25.3044, + "step": 2214 + }, + { + "epoch": 0.1968888888888889, + "grad_norm": 377.343505859375, + "learning_rate": 3e-06, + "loss": -30.7918, + "step": 2215 + }, + { + "epoch": 0.19697777777777778, + "grad_norm": 535.9216918945312, + "learning_rate": 3e-06, + "loss": -28.4559, + "step": 2216 + }, + { + "epoch": 0.19706666666666667, + "grad_norm": 631.3789672851562, + "learning_rate": 3e-06, + "loss": -28.4084, + "step": 2217 + }, + { + "epoch": 0.19715555555555556, + "grad_norm": 467.3281555175781, + "learning_rate": 3e-06, + "loss": -24.7054, + "step": 2218 + }, + { + "epoch": 0.19724444444444444, + "grad_norm": 673.6080932617188, + "learning_rate": 3e-06, + "loss": -30.1266, + "step": 2219 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 650.60009765625, + "learning_rate": 3e-06, + "loss": -30.6526, + "step": 2220 + }, + { + "completion_length": 251.27083587646484, + "epoch": 0.19742222222222222, + "grad_norm": 937.9761962890625, + "learning_rate": 3e-06, + "loss": -1.5455, + "reward": 1.9166667461395264, + "reward_std": 0.4779854342341423, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.1666666567325592, + "step": 2221, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.1975111111111111, + "grad_norm": 618.4127807617188, + "learning_rate": 3e-06, + "loss": 8.2676, + "step": 2222 + }, + { + "epoch": 0.1976, + "grad_norm": 681.9295654296875, + "learning_rate": 3e-06, + "loss": 5.7096, + "step": 2223 + }, + { + "epoch": 0.1976888888888889, + "grad_norm": 1006.9591064453125, + "learning_rate": 3e-06, + "loss": 52.6603, + "step": 2224 + }, + { + "epoch": 0.19777777777777777, + "grad_norm": 786.645263671875, + "learning_rate": 3e-06, + "loss": 7.371, + "step": 2225 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 779.1517333984375, + "learning_rate": 3e-06, + "loss": 15.4347, + "step": 2226 + }, + { + "epoch": 0.19795555555555555, + "grad_norm": 1029.3656005859375, + "learning_rate": 3e-06, + "loss": -12.7798, + "step": 2227 + }, + { + "epoch": 0.19804444444444444, + "grad_norm": 1035.653076171875, + "learning_rate": 3e-06, + "loss": -0.4735, + "step": 2228 + }, + { + "epoch": 0.19813333333333333, + "grad_norm": 829.162841796875, + "learning_rate": 3e-06, + "loss": -2.8614, + "step": 2229 + }, + { + "epoch": 0.19822222222222222, + "grad_norm": 873.4863891601562, + "learning_rate": 3e-06, + "loss": 30.9012, + "step": 2230 + }, + { + "epoch": 0.1983111111111111, + "grad_norm": 649.9591674804688, + "learning_rate": 3e-06, + "loss": -6.5108, + "step": 2231 + }, + { + "epoch": 0.1984, + "grad_norm": 805.36328125, + "learning_rate": 3e-06, + "loss": 2.5386, + "step": 2232 + }, + { + "completion_length": 246.1041717529297, + "epoch": 0.19848888888888888, + "grad_norm": 895.270263671875, + "learning_rate": 3e-06, + "loss": -57.1749, + "reward": 1.2500000596046448, + "reward_std": 0.4779854342341423, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5000000111758709, + "step": 2233, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.19857777777777777, + "grad_norm": 765.6522827148438, + "learning_rate": 3e-06, + "loss": -48.4423, + "step": 2234 + }, + { + "epoch": 0.19866666666666666, + "grad_norm": 755.2459716796875, + "learning_rate": 3e-06, + "loss": -29.8722, + "step": 2235 + }, + { + "epoch": 0.19875555555555555, + "grad_norm": 685.5021362304688, + "learning_rate": 3e-06, + "loss": -19.4638, + "step": 2236 + }, + { + "epoch": 0.19884444444444443, + "grad_norm": 832.52880859375, + "learning_rate": 3e-06, + "loss": -41.564, + "step": 2237 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 681.293701171875, + "learning_rate": 3e-06, + "loss": -56.8457, + "step": 2238 + }, + { + "epoch": 0.1990222222222222, + "grad_norm": 733.9737548828125, + "learning_rate": 3e-06, + "loss": -71.0826, + "step": 2239 + }, + { + "epoch": 0.1991111111111111, + "grad_norm": 759.0897216796875, + "learning_rate": 3e-06, + "loss": -59.8898, + "step": 2240 + }, + { + "epoch": 0.1992, + "grad_norm": 752.1569213867188, + "learning_rate": 3e-06, + "loss": -35.6279, + "step": 2241 + }, + { + "epoch": 0.19928888888888888, + "grad_norm": 793.9288330078125, + "learning_rate": 3e-06, + "loss": -31.1288, + "step": 2242 + }, + { + "epoch": 0.19937777777777776, + "grad_norm": 875.0328979492188, + "learning_rate": 3e-06, + "loss": -54.7059, + "step": 2243 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 735.8705444335938, + "learning_rate": 3e-06, + "loss": -64.119, + "step": 2244 + }, + { + "completion_length": 245.1875, + "epoch": 0.19955555555555557, + "grad_norm": 691.626953125, + "learning_rate": 3e-06, + "loss": 33.6045, + "reward": 1.0729166865348816, + "reward_std": 0.41281384229660034, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.3333333432674408, + "step": 2245, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.19964444444444446, + "grad_norm": 847.1795654296875, + "learning_rate": 3e-06, + "loss": 36.038, + "step": 2246 + }, + { + "epoch": 0.19973333333333335, + "grad_norm": 502.91473388671875, + "learning_rate": 3e-06, + "loss": 22.1079, + "step": 2247 + }, + { + "epoch": 0.19982222222222223, + "grad_norm": 577.439697265625, + "learning_rate": 3e-06, + "loss": 33.3352, + "step": 2248 + }, + { + "epoch": 0.19991111111111112, + "grad_norm": 830.4883422851562, + "learning_rate": 3e-06, + "loss": 41.7401, + "step": 2249 + }, + { + "epoch": 0.2, + "grad_norm": 769.1807861328125, + "learning_rate": 3e-06, + "loss": 37.7994, + "step": 2250 + }, + { + "epoch": 0.2000888888888889, + "grad_norm": 636.1270141601562, + "learning_rate": 3e-06, + "loss": 27.5784, + "step": 2251 + }, + { + "epoch": 0.2001777777777778, + "grad_norm": 761.4356079101562, + "learning_rate": 3e-06, + "loss": 27.7986, + "step": 2252 + }, + { + "epoch": 0.20026666666666668, + "grad_norm": 530.1929931640625, + "learning_rate": 3e-06, + "loss": 17.7146, + "step": 2253 + }, + { + "epoch": 0.20035555555555556, + "grad_norm": 564.9805908203125, + "learning_rate": 3e-06, + "loss": 27.1709, + "step": 2254 + }, + { + "epoch": 0.20044444444444445, + "grad_norm": 600.7700805664062, + "learning_rate": 3e-06, + "loss": 37.6539, + "step": 2255 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 710.3458862304688, + "learning_rate": 3e-06, + "loss": 30.2361, + "step": 2256 + }, + { + "completion_length": 249.9791717529297, + "epoch": 0.20062222222222223, + "grad_norm": 404.9451904296875, + "learning_rate": 3e-06, + "loss": -93.8459, + "reward": 1.1458333730697632, + "reward_std": 0.23116151988506317, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.4583333283662796, + "step": 2257, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.20071111111111112, + "grad_norm": 394.6553649902344, + "learning_rate": 3e-06, + "loss": -76.6418, + "step": 2258 + }, + { + "epoch": 0.2008, + "grad_norm": 366.0687255859375, + "learning_rate": 3e-06, + "loss": -73.8687, + "step": 2259 + }, + { + "epoch": 0.2008888888888889, + "grad_norm": 464.67388916015625, + "learning_rate": 3e-06, + "loss": -82.5149, + "step": 2260 + }, + { + "epoch": 0.20097777777777778, + "grad_norm": 437.32470703125, + "learning_rate": 3e-06, + "loss": -62.3202, + "step": 2261 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 884.9143676757812, + "learning_rate": 3e-06, + "loss": -76.1318, + "step": 2262 + }, + { + "epoch": 0.20115555555555556, + "grad_norm": 431.22882080078125, + "learning_rate": 3e-06, + "loss": -101.0349, + "step": 2263 + }, + { + "epoch": 0.20124444444444445, + "grad_norm": 407.47344970703125, + "learning_rate": 3e-06, + "loss": -83.3404, + "step": 2264 + }, + { + "epoch": 0.20133333333333334, + "grad_norm": 450.6275634765625, + "learning_rate": 3e-06, + "loss": -81.2565, + "step": 2265 + }, + { + "epoch": 0.20142222222222222, + "grad_norm": 446.26715087890625, + "learning_rate": 3e-06, + "loss": -92.9891, + "step": 2266 + }, + { + "epoch": 0.2015111111111111, + "grad_norm": 421.58514404296875, + "learning_rate": 3e-06, + "loss": -72.7071, + "step": 2267 + }, + { + "epoch": 0.2016, + "grad_norm": 499.9620361328125, + "learning_rate": 3e-06, + "loss": -86.6616, + "step": 2268 + }, + { + "completion_length": 251.43750762939453, + "epoch": 0.2016888888888889, + "grad_norm": 1807.4974365234375, + "learning_rate": 3e-06, + "loss": -2.4199, + "reward": 1.4375, + "reward_std": 0.46232306957244873, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.75, + "step": 2269, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.20177777777777778, + "grad_norm": 1014.031005859375, + "learning_rate": 3e-06, + "loss": -11.0913, + "step": 2270 + }, + { + "epoch": 0.20186666666666667, + "grad_norm": 969.154541015625, + "learning_rate": 3e-06, + "loss": -45.8343, + "step": 2271 + }, + { + "epoch": 0.20195555555555555, + "grad_norm": 1514.218994140625, + "learning_rate": 3e-06, + "loss": -6.767, + "step": 2272 + }, + { + "epoch": 0.20204444444444444, + "grad_norm": 1047.0552978515625, + "learning_rate": 3e-06, + "loss": -41.9022, + "step": 2273 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 1069.608642578125, + "learning_rate": 3e-06, + "loss": -32.74, + "step": 2274 + }, + { + "epoch": 0.20222222222222222, + "grad_norm": 1186.3797607421875, + "learning_rate": 3e-06, + "loss": -5.627, + "step": 2275 + }, + { + "epoch": 0.2023111111111111, + "grad_norm": 1353.1217041015625, + "learning_rate": 3e-06, + "loss": -20.7514, + "step": 2276 + }, + { + "epoch": 0.2024, + "grad_norm": 973.7822875976562, + "learning_rate": 3e-06, + "loss": -57.7389, + "step": 2277 + }, + { + "epoch": 0.20248888888888888, + "grad_norm": 1122.6533203125, + "learning_rate": 3e-06, + "loss": -19.221, + "step": 2278 + }, + { + "epoch": 0.20257777777777777, + "grad_norm": 1007.2978515625, + "learning_rate": 3e-06, + "loss": -49.3046, + "step": 2279 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 1087.270751953125, + "learning_rate": 3e-06, + "loss": -46.9498, + "step": 2280 + }, + { + "completion_length": 248.0, + "epoch": 0.20275555555555555, + "grad_norm": 684.866455078125, + "learning_rate": 3e-06, + "loss": -24.7332, + "reward": 1.0416666865348816, + "reward_std": 0.40296074748039246, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 0.4166666716337204, + "step": 2281, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.20284444444444444, + "grad_norm": 685.3329467773438, + "learning_rate": 3e-06, + "loss": 10.4014, + "step": 2282 + }, + { + "epoch": 0.20293333333333333, + "grad_norm": 828.5563354492188, + "learning_rate": 3e-06, + "loss": -17.5656, + "step": 2283 + }, + { + "epoch": 0.20302222222222222, + "grad_norm": 797.2943115234375, + "learning_rate": 3e-06, + "loss": -24.5962, + "step": 2284 + }, + { + "epoch": 0.2031111111111111, + "grad_norm": 778.4367065429688, + "learning_rate": 3e-06, + "loss": 13.921, + "step": 2285 + }, + { + "epoch": 0.2032, + "grad_norm": 694.8433837890625, + "learning_rate": 3e-06, + "loss": -1.7194, + "step": 2286 + }, + { + "epoch": 0.20328888888888888, + "grad_norm": 808.0003662109375, + "learning_rate": 3e-06, + "loss": -30.4908, + "step": 2287 + }, + { + "epoch": 0.20337777777777777, + "grad_norm": 724.6696166992188, + "learning_rate": 3e-06, + "loss": 2.6068, + "step": 2288 + }, + { + "epoch": 0.20346666666666666, + "grad_norm": 830.2708129882812, + "learning_rate": 3e-06, + "loss": -28.4917, + "step": 2289 + }, + { + "epoch": 0.20355555555555555, + "grad_norm": 785.9896850585938, + "learning_rate": 3e-06, + "loss": -31.1686, + "step": 2290 + }, + { + "epoch": 0.20364444444444443, + "grad_norm": 851.0347290039062, + "learning_rate": 3e-06, + "loss": 0.2481, + "step": 2291 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 774.1303100585938, + "learning_rate": 3e-06, + "loss": -6.2776, + "step": 2292 + }, + { + "completion_length": 246.0625, + "epoch": 0.2038222222222222, + "grad_norm": 635.4057006835938, + "learning_rate": 3e-06, + "loss": 4.3122, + "reward": 1.5104167461395264, + "reward_std": 0.34120412170886993, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 0.8333333283662796, + "step": 2293, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.2039111111111111, + "grad_norm": 653.86328125, + "learning_rate": 3e-06, + "loss": 12.7972, + "step": 2294 + }, + { + "epoch": 0.204, + "grad_norm": 610.3839721679688, + "learning_rate": 3e-06, + "loss": -2.9267, + "step": 2295 + }, + { + "epoch": 0.20408888888888888, + "grad_norm": 714.1361083984375, + "learning_rate": 3e-06, + "loss": -23.5832, + "step": 2296 + }, + { + "epoch": 0.2041777777777778, + "grad_norm": 718.49658203125, + "learning_rate": 3e-06, + "loss": -4.627, + "step": 2297 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 696.8006591796875, + "learning_rate": 3e-06, + "loss": -11.8357, + "step": 2298 + }, + { + "epoch": 0.20435555555555557, + "grad_norm": 814.3312377929688, + "learning_rate": 3e-06, + "loss": -1.5097, + "step": 2299 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 556.5509643554688, + "learning_rate": 3e-06, + "loss": 10.6618, + "step": 2300 + }, + { + "epoch": 0.20453333333333334, + "grad_norm": 667.0651245117188, + "learning_rate": 3e-06, + "loss": -6.0333, + "step": 2301 + }, + { + "epoch": 0.20462222222222223, + "grad_norm": 694.1640625, + "learning_rate": 3e-06, + "loss": -24.5284, + "step": 2302 + }, + { + "epoch": 0.20471111111111112, + "grad_norm": 923.972900390625, + "learning_rate": 3e-06, + "loss": -11.2173, + "step": 2303 + }, + { + "epoch": 0.2048, + "grad_norm": 659.1800537109375, + "learning_rate": 3e-06, + "loss": -16.7782, + "step": 2304 + }, + { + "completion_length": 254.4375, + "epoch": 0.2048888888888889, + "grad_norm": 971.201171875, + "learning_rate": 3e-06, + "loss": -6.4646, + "reward": 2.125, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.375, + "step": 2305, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.2049777777777778, + "grad_norm": 1799.6265869140625, + "learning_rate": 3e-06, + "loss": 13.6479, + "step": 2306 + }, + { + "epoch": 0.20506666666666667, + "grad_norm": 1432.3759765625, + "learning_rate": 3e-06, + "loss": -12.8131, + "step": 2307 + }, + { + "epoch": 0.20515555555555556, + "grad_norm": 1100.3665771484375, + "learning_rate": 3e-06, + "loss": 19.3096, + "step": 2308 + }, + { + "epoch": 0.20524444444444445, + "grad_norm": 954.0089111328125, + "learning_rate": 3e-06, + "loss": -5.624, + "step": 2309 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 1022.4109497070312, + "learning_rate": 3e-06, + "loss": -26.2045, + "step": 2310 + }, + { + "epoch": 0.20542222222222223, + "grad_norm": 1009.416015625, + "learning_rate": 3e-06, + "loss": -9.7483, + "step": 2311 + }, + { + "epoch": 0.20551111111111112, + "grad_norm": 1540.92333984375, + "learning_rate": 3e-06, + "loss": 0.92, + "step": 2312 + }, + { + "epoch": 0.2056, + "grad_norm": 1027.509765625, + "learning_rate": 3e-06, + "loss": -24.657, + "step": 2313 + }, + { + "epoch": 0.2056888888888889, + "grad_norm": 948.5579833984375, + "learning_rate": 3e-06, + "loss": 8.6437, + "step": 2314 + }, + { + "epoch": 0.20577777777777778, + "grad_norm": 919.4548950195312, + "learning_rate": 3e-06, + "loss": -21.9334, + "step": 2315 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 979.2217407226562, + "learning_rate": 3e-06, + "loss": -36.1305, + "step": 2316 + }, + { + "completion_length": 253.52083587646484, + "epoch": 0.20595555555555556, + "grad_norm": 382.2681579589844, + "learning_rate": 3e-06, + "loss": 13.32, + "reward": 0.9791666865348816, + "reward_std": 0.10206206887960434, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.2916666679084301, + "step": 2317, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.20604444444444445, + "grad_norm": 585.9896850585938, + "learning_rate": 3e-06, + "loss": 14.567, + "step": 2318 + }, + { + "epoch": 0.20613333333333334, + "grad_norm": 433.8167419433594, + "learning_rate": 3e-06, + "loss": 0.179, + "step": 2319 + }, + { + "epoch": 0.20622222222222222, + "grad_norm": 473.40289306640625, + "learning_rate": 3e-06, + "loss": -8.1342, + "step": 2320 + }, + { + "epoch": 0.2063111111111111, + "grad_norm": 454.9488220214844, + "learning_rate": 3e-06, + "loss": 6.4596, + "step": 2321 + }, + { + "epoch": 0.2064, + "grad_norm": 545.6451416015625, + "learning_rate": 3e-06, + "loss": -0.0272, + "step": 2322 + }, + { + "epoch": 0.2064888888888889, + "grad_norm": 393.2840576171875, + "learning_rate": 3e-06, + "loss": 10.1428, + "step": 2323 + }, + { + "epoch": 0.20657777777777778, + "grad_norm": 624.9196166992188, + "learning_rate": 3e-06, + "loss": 10.519, + "step": 2324 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 423.4202880859375, + "learning_rate": 3e-06, + "loss": -6.3045, + "step": 2325 + }, + { + "epoch": 0.20675555555555555, + "grad_norm": 396.0754699707031, + "learning_rate": 3e-06, + "loss": -11.096, + "step": 2326 + }, + { + "epoch": 0.20684444444444444, + "grad_norm": 583.927734375, + "learning_rate": 3e-06, + "loss": 2.5989, + "step": 2327 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 597.1937255859375, + "learning_rate": 3e-06, + "loss": -8.4196, + "step": 2328 + }, + { + "completion_length": 242.62500762939453, + "epoch": 0.20702222222222222, + "grad_norm": 800.9533081054688, + "learning_rate": 3e-06, + "loss": -33.827, + "reward": 1.4375000596046448, + "reward_std": 0.46232303977012634, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.75, + "step": 2329, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.2071111111111111, + "grad_norm": 696.8334350585938, + "learning_rate": 3e-06, + "loss": -20.2565, + "step": 2330 + }, + { + "epoch": 0.2072, + "grad_norm": 728.7371826171875, + "learning_rate": 3e-06, + "loss": -55.6491, + "step": 2331 + }, + { + "epoch": 0.20728888888888888, + "grad_norm": 845.1964111328125, + "learning_rate": 3e-06, + "loss": -34.6344, + "step": 2332 + }, + { + "epoch": 0.20737777777777777, + "grad_norm": 797.7058715820312, + "learning_rate": 3e-06, + "loss": -38.3381, + "step": 2333 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 815.6392211914062, + "learning_rate": 3e-06, + "loss": -39.5545, + "step": 2334 + }, + { + "epoch": 0.20755555555555555, + "grad_norm": 824.5341796875, + "learning_rate": 3e-06, + "loss": -42.9804, + "step": 2335 + }, + { + "epoch": 0.20764444444444444, + "grad_norm": 839.4075927734375, + "learning_rate": 3e-06, + "loss": -31.0013, + "step": 2336 + }, + { + "epoch": 0.20773333333333333, + "grad_norm": 808.272705078125, + "learning_rate": 3e-06, + "loss": -63.1393, + "step": 2337 + }, + { + "epoch": 0.2078222222222222, + "grad_norm": 937.5029296875, + "learning_rate": 3e-06, + "loss": -49.1489, + "step": 2338 + }, + { + "epoch": 0.2079111111111111, + "grad_norm": 852.005859375, + "learning_rate": 3e-06, + "loss": -50.4064, + "step": 2339 + }, + { + "epoch": 0.208, + "grad_norm": 897.2970581054688, + "learning_rate": 3e-06, + "loss": -52.8892, + "step": 2340 + }, + { + "completion_length": 222.70834350585938, + "epoch": 0.20808888888888888, + "grad_norm": 757.0648803710938, + "learning_rate": 3e-06, + "loss": 35.128, + "reward": 1.7083333730697632, + "reward_std": 0.3332235887646675, + "rewards/boxed_and_answer_tags_format_reward": 0.625, + "rewards/correctness_reward_func_math": 1.0833333134651184, + "step": 2341, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.20817777777777777, + "grad_norm": 974.6918334960938, + "learning_rate": 3e-06, + "loss": 26.4346, + "step": 2342 + }, + { + "epoch": 0.20826666666666666, + "grad_norm": 960.5851440429688, + "learning_rate": 3e-06, + "loss": 18.2117, + "step": 2343 + }, + { + "epoch": 0.20835555555555554, + "grad_norm": 748.3045654296875, + "learning_rate": 3e-06, + "loss": 19.8586, + "step": 2344 + }, + { + "epoch": 0.20844444444444443, + "grad_norm": 900.9320068359375, + "learning_rate": 3e-06, + "loss": 9.7996, + "step": 2345 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 769.8694458007812, + "learning_rate": 3e-06, + "loss": 25.5797, + "step": 2346 + }, + { + "epoch": 0.2086222222222222, + "grad_norm": 734.0509033203125, + "learning_rate": 3e-06, + "loss": 27.9682, + "step": 2347 + }, + { + "epoch": 0.2087111111111111, + "grad_norm": 869.8523559570312, + "learning_rate": 3e-06, + "loss": 21.5981, + "step": 2348 + }, + { + "epoch": 0.2088, + "grad_norm": 1000.9803466796875, + "learning_rate": 3e-06, + "loss": 4.3671, + "step": 2349 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 766.8132934570312, + "learning_rate": 3e-06, + "loss": 10.1062, + "step": 2350 + }, + { + "epoch": 0.2089777777777778, + "grad_norm": 1140.35986328125, + "learning_rate": 3e-06, + "loss": -0.0209, + "step": 2351 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 769.6608276367188, + "learning_rate": 3e-06, + "loss": 17.1774, + "step": 2352 + }, + { + "completion_length": 235.12500762939453, + "epoch": 0.20915555555555557, + "grad_norm": 1358.110595703125, + "learning_rate": 3e-06, + "loss": -29.3384, + "reward": 1.5625000596046448, + "reward_std": 0.599253699183464, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.8750000298023224, + "step": 2353, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.20924444444444446, + "grad_norm": 1078.0203857421875, + "learning_rate": 3e-06, + "loss": -4.6919, + "step": 2354 + }, + { + "epoch": 0.20933333333333334, + "grad_norm": 1239.9210205078125, + "learning_rate": 3e-06, + "loss": -58.9311, + "step": 2355 + }, + { + "epoch": 0.20942222222222223, + "grad_norm": 1190.312255859375, + "learning_rate": 3e-06, + "loss": -41.3338, + "step": 2356 + }, + { + "epoch": 0.20951111111111112, + "grad_norm": 1177.6978759765625, + "learning_rate": 3e-06, + "loss": -77.211, + "step": 2357 + }, + { + "epoch": 0.2096, + "grad_norm": 1262.98876953125, + "learning_rate": 3e-06, + "loss": -30.6502, + "step": 2358 + }, + { + "epoch": 0.2096888888888889, + "grad_norm": 1281.507568359375, + "learning_rate": 3e-06, + "loss": -46.3909, + "step": 2359 + }, + { + "epoch": 0.20977777777777779, + "grad_norm": 1242.6148681640625, + "learning_rate": 3e-06, + "loss": -20.0434, + "step": 2360 + }, + { + "epoch": 0.20986666666666667, + "grad_norm": 1216.9324951171875, + "learning_rate": 3e-06, + "loss": -75.5012, + "step": 2361 + }, + { + "epoch": 0.20995555555555556, + "grad_norm": 1148.396240234375, + "learning_rate": 3e-06, + "loss": -60.3074, + "step": 2362 + }, + { + "epoch": 0.21004444444444445, + "grad_norm": 1038.05224609375, + "learning_rate": 3e-06, + "loss": -90.1416, + "step": 2363 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 1481.82470703125, + "learning_rate": 3e-06, + "loss": -41.4112, + "step": 2364 + }, + { + "completion_length": 236.4375, + "epoch": 0.21022222222222223, + "grad_norm": 1174.8533935546875, + "learning_rate": 3e-06, + "loss": -15.5789, + "reward": 1.2500000298023224, + "reward_std": 0.4431168735027313, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.4999999925494194, + "step": 2365, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.21031111111111112, + "grad_norm": 3015.200927734375, + "learning_rate": 3e-06, + "loss": -27.8838, + "step": 2366 + }, + { + "epoch": 0.2104, + "grad_norm": 1439.3358154296875, + "learning_rate": 3e-06, + "loss": -38.5885, + "step": 2367 + }, + { + "epoch": 0.2104888888888889, + "grad_norm": 869.1331787109375, + "learning_rate": 3e-06, + "loss": -19.1413, + "step": 2368 + }, + { + "epoch": 0.21057777777777778, + "grad_norm": 954.2206420898438, + "learning_rate": 3e-06, + "loss": -22.8123, + "step": 2369 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 923.0101928710938, + "learning_rate": 3e-06, + "loss": -34.3305, + "step": 2370 + }, + { + "epoch": 0.21075555555555556, + "grad_norm": 1316.6534423828125, + "learning_rate": 3e-06, + "loss": -30.9986, + "step": 2371 + }, + { + "epoch": 0.21084444444444445, + "grad_norm": 1236.0667724609375, + "learning_rate": 3e-06, + "loss": -48.6076, + "step": 2372 + }, + { + "epoch": 0.21093333333333333, + "grad_norm": 1252.537109375, + "learning_rate": 3e-06, + "loss": -61.8291, + "step": 2373 + }, + { + "epoch": 0.21102222222222222, + "grad_norm": 1343.861328125, + "learning_rate": 3e-06, + "loss": -29.769, + "step": 2374 + }, + { + "epoch": 0.2111111111111111, + "grad_norm": 1069.728515625, + "learning_rate": 3e-06, + "loss": -31.3108, + "step": 2375 + }, + { + "epoch": 0.2112, + "grad_norm": 898.3480834960938, + "learning_rate": 3e-06, + "loss": -53.5648, + "step": 2376 + }, + { + "completion_length": 250.2916717529297, + "epoch": 0.2112888888888889, + "grad_norm": 1555.4439697265625, + "learning_rate": 3e-06, + "loss": 23.924, + "reward": 0.9583333432674408, + "reward_std": 0.43528565764427185, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.2083333358168602, + "step": 2377, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.21137777777777778, + "grad_norm": 1271.503662109375, + "learning_rate": 3e-06, + "loss": -28.6245, + "step": 2378 + }, + { + "epoch": 0.21146666666666666, + "grad_norm": 1083.2822265625, + "learning_rate": 3e-06, + "loss": -61.1598, + "step": 2379 + }, + { + "epoch": 0.21155555555555555, + "grad_norm": 950.6062622070312, + "learning_rate": 3e-06, + "loss": -19.0397, + "step": 2380 + }, + { + "epoch": 0.21164444444444444, + "grad_norm": 1111.03857421875, + "learning_rate": 3e-06, + "loss": 11.29, + "step": 2381 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 1028.947509765625, + "learning_rate": 3e-06, + "loss": -16.2803, + "step": 2382 + }, + { + "epoch": 0.21182222222222222, + "grad_norm": 1200.6107177734375, + "learning_rate": 3e-06, + "loss": 14.5257, + "step": 2383 + }, + { + "epoch": 0.2119111111111111, + "grad_norm": 1430.796630859375, + "learning_rate": 3e-06, + "loss": -34.2578, + "step": 2384 + }, + { + "epoch": 0.212, + "grad_norm": 1028.548583984375, + "learning_rate": 3e-06, + "loss": -68.5076, + "step": 2385 + }, + { + "epoch": 0.21208888888888888, + "grad_norm": 1001.2411499023438, + "learning_rate": 3e-06, + "loss": -25.6736, + "step": 2386 + }, + { + "epoch": 0.21217777777777777, + "grad_norm": 984.0308227539062, + "learning_rate": 3e-06, + "loss": -2.7987, + "step": 2387 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 965.3121337890625, + "learning_rate": 3e-06, + "loss": -28.9898, + "step": 2388 + }, + { + "completion_length": 229.2916717529297, + "epoch": 0.21235555555555555, + "grad_norm": 557.446533203125, + "learning_rate": 3e-06, + "loss": 21.0555, + "reward": 1.8437500596046448, + "reward_std": 0.1546149756759405, + "rewards/boxed_and_answer_tags_format_reward": 0.6770833432674408, + "rewards/correctness_reward_func_math": 1.1666666716337204, + "step": 2389, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.21244444444444444, + "grad_norm": 575.1875610351562, + "learning_rate": 3e-06, + "loss": 23.3964, + "step": 2390 + }, + { + "epoch": 0.21253333333333332, + "grad_norm": 360.0423278808594, + "learning_rate": 3e-06, + "loss": 33.3316, + "step": 2391 + }, + { + "epoch": 0.2126222222222222, + "grad_norm": 255.54953002929688, + "learning_rate": 3e-06, + "loss": 31.5129, + "step": 2392 + }, + { + "epoch": 0.2127111111111111, + "grad_norm": 265.9867248535156, + "learning_rate": 3e-06, + "loss": 16.7137, + "step": 2393 + }, + { + "epoch": 0.2128, + "grad_norm": 356.7539978027344, + "learning_rate": 3e-06, + "loss": 29.8486, + "step": 2394 + }, + { + "epoch": 0.21288888888888888, + "grad_norm": 380.1522521972656, + "learning_rate": 3e-06, + "loss": 18.9171, + "step": 2395 + }, + { + "epoch": 0.21297777777777777, + "grad_norm": 627.226806640625, + "learning_rate": 3e-06, + "loss": 14.277, + "step": 2396 + }, + { + "epoch": 0.21306666666666665, + "grad_norm": 473.0029296875, + "learning_rate": 3e-06, + "loss": 26.1599, + "step": 2397 + }, + { + "epoch": 0.21315555555555554, + "grad_norm": 256.2850646972656, + "learning_rate": 3e-06, + "loss": 27.8036, + "step": 2398 + }, + { + "epoch": 0.21324444444444443, + "grad_norm": 294.08056640625, + "learning_rate": 3e-06, + "loss": 11.133, + "step": 2399 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 292.5602111816406, + "learning_rate": 3e-06, + "loss": 21.8162, + "step": 2400 + }, + { + "completion_length": 248.7291717529297, + "epoch": 0.21342222222222224, + "grad_norm": 991.9778442382812, + "learning_rate": 3e-06, + "loss": 12.964, + "reward": 1.5000000596046448, + "reward_std": 0.20412413775920868, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.75, + "step": 2401, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.21351111111111112, + "grad_norm": 644.521484375, + "learning_rate": 3e-06, + "loss": -4.3122, + "step": 2402 + }, + { + "epoch": 0.2136, + "grad_norm": 680.9287719726562, + "learning_rate": 3e-06, + "loss": 3.5692, + "step": 2403 + }, + { + "epoch": 0.2136888888888889, + "grad_norm": 824.704345703125, + "learning_rate": 3e-06, + "loss": 20.7909, + "step": 2404 + }, + { + "epoch": 0.2137777777777778, + "grad_norm": 956.5078125, + "learning_rate": 3e-06, + "loss": 10.9202, + "step": 2405 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 1738.125244140625, + "learning_rate": 3e-06, + "loss": 5.5927, + "step": 2406 + }, + { + "epoch": 0.21395555555555557, + "grad_norm": 920.435546875, + "learning_rate": 3e-06, + "loss": 2.9809, + "step": 2407 + }, + { + "epoch": 0.21404444444444445, + "grad_norm": 677.7364501953125, + "learning_rate": 3e-06, + "loss": -7.6728, + "step": 2408 + }, + { + "epoch": 0.21413333333333334, + "grad_norm": 724.0176391601562, + "learning_rate": 3e-06, + "loss": -5.1334, + "step": 2409 + }, + { + "epoch": 0.21422222222222223, + "grad_norm": 885.7861938476562, + "learning_rate": 3e-06, + "loss": 17.665, + "step": 2410 + }, + { + "epoch": 0.21431111111111112, + "grad_norm": 884.716552734375, + "learning_rate": 3e-06, + "loss": 3.5877, + "step": 2411 + }, + { + "epoch": 0.2144, + "grad_norm": 1398.6461181640625, + "learning_rate": 3e-06, + "loss": 3.2704, + "step": 2412 + }, + { + "completion_length": 234.20834350585938, + "epoch": 0.2144888888888889, + "grad_norm": 1070.3880615234375, + "learning_rate": 3e-06, + "loss": -4.4756, + "reward": 1.3750000596046448, + "reward_std": 0.23116153478622437, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.6249999813735485, + "step": 2413, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.21457777777777778, + "grad_norm": 1026.6502685546875, + "learning_rate": 3e-06, + "loss": 44.4837, + "step": 2414 + }, + { + "epoch": 0.21466666666666667, + "grad_norm": 772.33837890625, + "learning_rate": 3e-06, + "loss": 37.0703, + "step": 2415 + }, + { + "epoch": 0.21475555555555556, + "grad_norm": 1085.96875, + "learning_rate": 3e-06, + "loss": 30.7537, + "step": 2416 + }, + { + "epoch": 0.21484444444444445, + "grad_norm": 827.8580932617188, + "learning_rate": 3e-06, + "loss": 48.8112, + "step": 2417 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 781.8536376953125, + "learning_rate": 3e-06, + "loss": 35.331, + "step": 2418 + }, + { + "epoch": 0.21502222222222223, + "grad_norm": 830.3689575195312, + "learning_rate": 3e-06, + "loss": -6.2381, + "step": 2419 + }, + { + "epoch": 0.21511111111111111, + "grad_norm": 1895.36572265625, + "learning_rate": 3e-06, + "loss": 38.7443, + "step": 2420 + }, + { + "epoch": 0.2152, + "grad_norm": 1093.376220703125, + "learning_rate": 3e-06, + "loss": 32.4039, + "step": 2421 + }, + { + "epoch": 0.2152888888888889, + "grad_norm": 1121.695556640625, + "learning_rate": 3e-06, + "loss": 21.949, + "step": 2422 + }, + { + "epoch": 0.21537777777777778, + "grad_norm": 908.1818237304688, + "learning_rate": 3e-06, + "loss": 39.3356, + "step": 2423 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 780.5909423828125, + "learning_rate": 3e-06, + "loss": 28.4789, + "step": 2424 + }, + { + "completion_length": 240.37500762939453, + "epoch": 0.21555555555555556, + "grad_norm": 1291.5950927734375, + "learning_rate": 3e-06, + "loss": -48.4281, + "reward": 1.625, + "reward_std": 0.564385175704956, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.875, + "step": 2425, + "zero_std_ratio": 0.375 + }, + { + "epoch": 0.21564444444444444, + "grad_norm": 1240.4404296875, + "learning_rate": 3e-06, + "loss": -54.5814, + "step": 2426 + }, + { + "epoch": 0.21573333333333333, + "grad_norm": 1328.792724609375, + "learning_rate": 3e-06, + "loss": -77.3574, + "step": 2427 + }, + { + "epoch": 0.21582222222222222, + "grad_norm": 1365.1212158203125, + "learning_rate": 3e-06, + "loss": -56.5684, + "step": 2428 + }, + { + "epoch": 0.2159111111111111, + "grad_norm": 1141.714111328125, + "learning_rate": 3e-06, + "loss": -52.8116, + "step": 2429 + }, + { + "epoch": 0.216, + "grad_norm": 1254.617919921875, + "learning_rate": 3e-06, + "loss": -45.3142, + "step": 2430 + }, + { + "epoch": 0.21608888888888889, + "grad_norm": 1215.14404296875, + "learning_rate": 3e-06, + "loss": -61.5184, + "step": 2431 + }, + { + "epoch": 0.21617777777777777, + "grad_norm": 2824.7529296875, + "learning_rate": 3e-06, + "loss": -67.1598, + "step": 2432 + }, + { + "epoch": 0.21626666666666666, + "grad_norm": 1158.3133544921875, + "learning_rate": 3e-06, + "loss": -84.968, + "step": 2433 + }, + { + "epoch": 0.21635555555555555, + "grad_norm": 948.725341796875, + "learning_rate": 3e-06, + "loss": -67.6487, + "step": 2434 + }, + { + "epoch": 0.21644444444444444, + "grad_norm": 1137.3834228515625, + "learning_rate": 3e-06, + "loss": -60.4848, + "step": 2435 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 1104.3291015625, + "learning_rate": 3e-06, + "loss": -61.2857, + "step": 2436 + }, + { + "completion_length": 240.14583587646484, + "epoch": 0.21662222222222222, + "grad_norm": 668.9032592773438, + "learning_rate": 3e-06, + "loss": 11.3774, + "reward": 1.2291667461395264, + "reward_std": 0.23116152733564377, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5416666716337204, + "step": 2437, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.2167111111111111, + "grad_norm": 636.1585693359375, + "learning_rate": 3e-06, + "loss": -7.5818, + "step": 2438 + }, + { + "epoch": 0.2168, + "grad_norm": 497.7554626464844, + "learning_rate": 3e-06, + "loss": 6.8708, + "step": 2439 + }, + { + "epoch": 0.21688888888888888, + "grad_norm": 553.154052734375, + "learning_rate": 3e-06, + "loss": 24.3757, + "step": 2440 + }, + { + "epoch": 0.21697777777777777, + "grad_norm": 1472.5419921875, + "learning_rate": 3e-06, + "loss": -3.4629, + "step": 2441 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 652.6502685546875, + "learning_rate": 3e-06, + "loss": 18.709, + "step": 2442 + }, + { + "epoch": 0.21715555555555555, + "grad_norm": 599.662353515625, + "learning_rate": 3e-06, + "loss": 6.6085, + "step": 2443 + }, + { + "epoch": 0.21724444444444443, + "grad_norm": 596.9681396484375, + "learning_rate": 3e-06, + "loss": -10.8602, + "step": 2444 + }, + { + "epoch": 0.21733333333333332, + "grad_norm": 487.63580322265625, + "learning_rate": 3e-06, + "loss": -1.5804, + "step": 2445 + }, + { + "epoch": 0.2174222222222222, + "grad_norm": 1273.145751953125, + "learning_rate": 3e-06, + "loss": 16.3577, + "step": 2446 + }, + { + "epoch": 0.2175111111111111, + "grad_norm": 771.951904296875, + "learning_rate": 3e-06, + "loss": -6.3023, + "step": 2447 + }, + { + "epoch": 0.2176, + "grad_norm": 712.5418701171875, + "learning_rate": 3e-06, + "loss": 16.2355, + "step": 2448 + }, + { + "completion_length": 248.37500762939453, + "epoch": 0.21768888888888888, + "grad_norm": 790.022216796875, + "learning_rate": 3e-06, + "loss": 7.116, + "reward": 2.2291667461395264, + "reward_std": 0.26603007316589355, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 1.5416666865348816, + "step": 2449, + "zero_std_ratio": 0.75 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 546.7139892578125, + "learning_rate": 3e-06, + "loss": -10.9262, + "step": 2450 + }, + { + "epoch": 0.21786666666666665, + "grad_norm": 517.8162231445312, + "learning_rate": 3e-06, + "loss": 0.3609, + "step": 2451 + }, + { + "epoch": 0.21795555555555557, + "grad_norm": 682.6878051757812, + "learning_rate": 3e-06, + "loss": 0.4365, + "step": 2452 + }, + { + "epoch": 0.21804444444444446, + "grad_norm": 607.7827758789062, + "learning_rate": 3e-06, + "loss": 10.3393, + "step": 2453 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 518.8916625976562, + "learning_rate": 3e-06, + "loss": 0.2425, + "step": 2454 + }, + { + "epoch": 0.21822222222222223, + "grad_norm": 629.7891235351562, + "learning_rate": 3e-06, + "loss": -3.1223, + "step": 2455 + }, + { + "epoch": 0.21831111111111112, + "grad_norm": 510.4330139160156, + "learning_rate": 3e-06, + "loss": -15.683, + "step": 2456 + }, + { + "epoch": 0.2184, + "grad_norm": 553.461669921875, + "learning_rate": 3e-06, + "loss": -4.2699, + "step": 2457 + }, + { + "epoch": 0.2184888888888889, + "grad_norm": 526.62109375, + "learning_rate": 3e-06, + "loss": -6.1042, + "step": 2458 + }, + { + "epoch": 0.2185777777777778, + "grad_norm": 562.5404052734375, + "learning_rate": 3e-06, + "loss": -0.4739, + "step": 2459 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 544.2666625976562, + "learning_rate": 3e-06, + "loss": -6.7032, + "step": 2460 + }, + { + "completion_length": 246.02083587646484, + "epoch": 0.21875555555555556, + "grad_norm": 1100.9122314453125, + "learning_rate": 3e-06, + "loss": 12.7639, + "reward": 1.8750001192092896, + "reward_std": 0.3061862289905548, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 1.1250000298023224, + "step": 2461, + "zero_std_ratio": 0.625 + }, + { + "epoch": 0.21884444444444445, + "grad_norm": 897.7620239257812, + "learning_rate": 3e-06, + "loss": 20.9614, + "step": 2462 + }, + { + "epoch": 0.21893333333333334, + "grad_norm": 986.5072021484375, + "learning_rate": 3e-06, + "loss": 42.7016, + "step": 2463 + }, + { + "epoch": 0.21902222222222223, + "grad_norm": 1005.2491455078125, + "learning_rate": 3e-06, + "loss": 59.914, + "step": 2464 + }, + { + "epoch": 0.21911111111111112, + "grad_norm": 723.766357421875, + "learning_rate": 3e-06, + "loss": -18.4072, + "step": 2465 + }, + { + "epoch": 0.2192, + "grad_norm": 814.1770629882812, + "learning_rate": 3e-06, + "loss": 56.8266, + "step": 2466 + }, + { + "epoch": 0.2192888888888889, + "grad_norm": 880.131591796875, + "learning_rate": 3e-06, + "loss": 0.9147, + "step": 2467 + }, + { + "epoch": 0.21937777777777778, + "grad_norm": 1002.6812744140625, + "learning_rate": 3e-06, + "loss": 10.1945, + "step": 2468 + }, + { + "epoch": 0.21946666666666667, + "grad_norm": 928.541748046875, + "learning_rate": 3e-06, + "loss": 28.8268, + "step": 2469 + }, + { + "epoch": 0.21955555555555556, + "grad_norm": 1081.3568115234375, + "learning_rate": 3e-06, + "loss": 42.3389, + "step": 2470 + }, + { + "epoch": 0.21964444444444445, + "grad_norm": 849.524658203125, + "learning_rate": 3e-06, + "loss": -28.4933, + "step": 2471 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 800.930419921875, + "learning_rate": 3e-06, + "loss": 48.021, + "step": 2472 + }, + { + "completion_length": 243.7291717529297, + "epoch": 0.21982222222222222, + "grad_norm": 840.5211181640625, + "learning_rate": 3e-06, + "loss": -29.1071, + "reward": 1.4062500596046448, + "reward_std": 0.47030356526374817, + "rewards/boxed_and_answer_tags_format_reward": 0.7395833432674408, + "rewards/correctness_reward_func_math": 0.6666666567325592, + "step": 2473, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.2199111111111111, + "grad_norm": 766.2994384765625, + "learning_rate": 3e-06, + "loss": -22.7254, + "step": 2474 + }, + { + "epoch": 0.22, + "grad_norm": 1315.990966796875, + "learning_rate": 3e-06, + "loss": -2.9843, + "step": 2475 + }, + { + "epoch": 0.2200888888888889, + "grad_norm": 823.3301391601562, + "learning_rate": 3e-06, + "loss": -45.7856, + "step": 2476 + }, + { + "epoch": 0.22017777777777778, + "grad_norm": 969.9705810546875, + "learning_rate": 3e-06, + "loss": -26.3312, + "step": 2477 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 1052.6732177734375, + "learning_rate": 3e-06, + "loss": -6.0265, + "step": 2478 + }, + { + "epoch": 0.22035555555555555, + "grad_norm": 810.92041015625, + "learning_rate": 3e-06, + "loss": -35.8816, + "step": 2479 + }, + { + "epoch": 0.22044444444444444, + "grad_norm": 716.6881713867188, + "learning_rate": 3e-06, + "loss": -34.8377, + "step": 2480 + }, + { + "epoch": 0.22053333333333333, + "grad_norm": 1079.2554931640625, + "learning_rate": 3e-06, + "loss": -13.4226, + "step": 2481 + }, + { + "epoch": 0.22062222222222222, + "grad_norm": 770.211669921875, + "learning_rate": 3e-06, + "loss": -50.3465, + "step": 2482 + }, + { + "epoch": 0.2207111111111111, + "grad_norm": 886.2747802734375, + "learning_rate": 3e-06, + "loss": -33.0956, + "step": 2483 + }, + { + "epoch": 0.2208, + "grad_norm": 890.3179321289062, + "learning_rate": 3e-06, + "loss": -15.2544, + "step": 2484 + }, + { + "completion_length": 253.58333587646484, + "epoch": 0.22088888888888888, + "grad_norm": 303.0287780761719, + "learning_rate": 3e-06, + "loss": -15.111, + "reward": 1.2916666865348816, + "reward_std": 0.10206207633018494, + "rewards/boxed_and_answer_tags_format_reward": 0.75, + "rewards/correctness_reward_func_math": 0.5416666567325592, + "step": 2485, + "zero_std_ratio": 0.875 + }, + { + "epoch": 0.22097777777777777, + "grad_norm": 377.69757080078125, + "learning_rate": 3e-06, + "loss": -16.8297, + "step": 2486 + }, + { + "epoch": 0.22106666666666666, + "grad_norm": 388.11773681640625, + "learning_rate": 3e-06, + "loss": -14.6188, + "step": 2487 + }, + { + "epoch": 0.22115555555555555, + "grad_norm": 291.3501892089844, + "learning_rate": 3e-06, + "loss": -13.4672, + "step": 2488 + }, + { + "epoch": 0.22124444444444444, + "grad_norm": 439.7605285644531, + "learning_rate": 3e-06, + "loss": -23.8357, + "step": 2489 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 379.401611328125, + "learning_rate": 3e-06, + "loss": -9.9997, + "step": 2490 + }, + { + "epoch": 0.22142222222222221, + "grad_norm": 297.3055725097656, + "learning_rate": 3e-06, + "loss": -18.6912, + "step": 2491 + }, + { + "epoch": 0.2215111111111111, + "grad_norm": 412.62890625, + "learning_rate": 3e-06, + "loss": -20.8417, + "step": 2492 + }, + { + "epoch": 0.2216, + "grad_norm": 300.6817321777344, + "learning_rate": 3e-06, + "loss": -19.9718, + "step": 2493 + }, + { + "epoch": 0.22168888888888888, + "grad_norm": 268.96551513671875, + "learning_rate": 3e-06, + "loss": -18.9416, + "step": 2494 + }, + { + "epoch": 0.22177777777777777, + "grad_norm": 426.9893798828125, + "learning_rate": 3e-06, + "loss": -27.8923, + "step": 2495 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 381.15704345703125, + "learning_rate": 3e-06, + "loss": -19.2658, + "step": 2496 + }, + { + "completion_length": 249.95833587646484, + "epoch": 0.22195555555555554, + "grad_norm": 1263.931640625, + "learning_rate": 3e-06, + "loss": -12.4574, + "reward": 1.1875, + "reward_std": 0.46232303977012634, + "rewards/boxed_and_answer_tags_format_reward": 0.6875, + "rewards/correctness_reward_func_math": 0.5, + "step": 2497, + "zero_std_ratio": 0.5 + }, + { + "epoch": 0.22204444444444443, + "grad_norm": 963.8367919921875, + "learning_rate": 3e-06, + "loss": 15.1327, + "step": 2498 + }, + { + "epoch": 0.22213333333333332, + "grad_norm": 1354.34619140625, + "learning_rate": 3e-06, + "loss": -37.5826, + "step": 2499 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1030.41650390625, + "learning_rate": 3e-06, + "loss": 3.9406, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 112500, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}