{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.12744835019111633, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0681, "reward": 0.1723687592893839, "reward_std": 0.7976016625761986, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.061661649495363235, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0245, "reward": -0.018269629566930234, "reward_std": 0.44402940198779106, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "completion_length": 3337.375030517578, "epoch": 0.0034285714285714284, "grad_norm": 0.10997606813907623, "kl": 4.9442052841186523e-05, "learning_rate": 6e-08, "loss": -0.0096, "reward": -0.3936590664088726, "reward_std": 0.5575782060623169, "rewards/cosine_scaled_reward": -0.19862568378448486, "rewards/format_reward": 0.1458333395421505, "step": 3 }, { "completion_length": 2260.895881652832, "epoch": 0.004571428571428572, "grad_norm": 0.15913932025432587, "kl": 2.993643283843994e-05, "learning_rate": 8e-08, "loss": 0.0445, "reward": 0.13440400827676058, "reward_std": 0.8942861538380384, "rewards/cosine_scaled_reward": -0.10464579728432, "rewards/format_reward": 0.6250000018626451, "step": 4 }, { "completion_length": 3328.2916870117188, "epoch": 0.005714285714285714, "grad_norm": 0.12190493941307068, "kl": 4.523247480392456e-05, "learning_rate": 1e-07, "loss": 0.0609, "reward": -0.27870709635317326, "reward_std": 0.7129274010658264, "rewards/cosine_scaled_reward": -0.2122665431816131, "rewards/format_reward": 0.31250000931322575, "step": 5 }, { "completion_length": 3129.8333740234375, "epoch": 0.006857142857142857, "grad_norm": 0.14417913556098938, "kl": 4.3526291847229004e-05, "learning_rate": 1.2e-07, "loss": -0.0215, "reward": -0.07921706140041351, "reward_std": 0.6614448297768831, "rewards/cosine_scaled_reward": -0.06220451416447759, "rewards/format_reward": 0.29166666977107525, "step": 6 }, { "completion_length": 3113.729217529297, "epoch": 0.008, "grad_norm": 0.1703435778617859, "kl": 1.8851831555366516e-05, "learning_rate": 1.4e-07, "loss": 0.0934, "reward": -0.0697940494865179, "reward_std": 0.8283112272620201, "rewards/cosine_scaled_reward": -0.14727119728922844, "rewards/format_reward": 0.4583333395421505, "step": 7 }, { "completion_length": 2710.6041870117188, "epoch": 0.009142857142857144, "grad_norm": 0.1097821518778801, "kl": 2.4922192096710205e-05, "learning_rate": 1.6e-07, "loss": 0.033, "reward": 0.20415285229682922, "reward_std": 0.7091612815856934, "rewards/cosine_scaled_reward": 0.06203982699662447, "rewards/format_reward": 0.41666667349636555, "step": 8 }, { "completion_length": 3129.687530517578, "epoch": 0.010285714285714285, "grad_norm": 0.1483897864818573, "kl": 4.6446919441223145e-05, "learning_rate": 1.8e-07, "loss": 0.0837, "reward": -0.2891757604666054, "reward_std": 0.7400874830782413, "rewards/cosine_scaled_reward": -0.21521726623177528, "rewards/format_reward": 0.2916666753590107, "step": 9 }, { "completion_length": 2703.3750076293945, "epoch": 0.011428571428571429, "grad_norm": 0.11335786432027817, "kl": 3.4183263778686523e-05, "learning_rate": 2e-07, "loss": 0.0303, "reward": -0.11180419684387743, "reward_std": 0.7680593952536583, "rewards/cosine_scaled_reward": -0.1549013671465218, "rewards/format_reward": 0.4166666679084301, "step": 10 }, { "completion_length": 3326.854217529297, "epoch": 0.012571428571428572, "grad_norm": 0.13309723138809204, "kl": 3.510713577270508e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.0705, "reward": -0.4452022071927786, "reward_std": 0.6313146576285362, "rewards/cosine_scaled_reward": -0.23017787747085094, "rewards/format_reward": 0.12500000186264515, "step": 11 }, { "completion_length": 2544.666717529297, "epoch": 0.013714285714285714, "grad_norm": 0.14041368663311005, "kl": 3.600120544433594e-05, "learning_rate": 2.4e-07, "loss": 0.0277, "reward": 0.006800652190577239, "reward_std": 0.7336893752217293, "rewards/cosine_scaled_reward": -0.17795369494706392, "rewards/format_reward": 0.6250000204890966, "step": 12 }, { "completion_length": 3018.8333587646484, "epoch": 0.014857142857142857, "grad_norm": 0.13021793961524963, "kl": 3.663450479507446e-05, "learning_rate": 2.6e-07, "loss": 0.0269, "reward": -0.03413328202441335, "reward_std": 0.6794986762106419, "rewards/cosine_scaled_reward": -0.06286561000160873, "rewards/format_reward": 0.3541666828095913, "step": 13 }, { "completion_length": 2947.5208740234375, "epoch": 0.016, "grad_norm": 0.12336233258247375, "kl": 2.9928982257843018e-05, "learning_rate": 2.8e-07, "loss": 0.0508, "reward": 0.07709243893623352, "reward_std": 0.8241597190499306, "rewards/cosine_scaled_reward": -0.015953163150697947, "rewards/format_reward": 0.39583334140479565, "step": 14 }, { "completion_length": 2676.5625228881836, "epoch": 0.017142857142857144, "grad_norm": 0.047696515917778015, "kl": 3.113597631454468e-05, "learning_rate": 3e-07, "loss": 0.0069, "reward": 0.036701809614896774, "reward_std": 0.43677932769060135, "rewards/cosine_scaled_reward": -0.021881014108657837, "rewards/format_reward": 0.39583333395421505, "step": 15 }, { "completion_length": 3442.0000610351562, "epoch": 0.018285714285714287, "grad_norm": 0.12319929152727127, "kl": 3.999471664428711e-05, "learning_rate": 3.2e-07, "loss": 0.0369, "reward": -0.43698735162615776, "reward_std": 0.556250561028719, "rewards/cosine_scaled_reward": -0.20507843233644962, "rewards/format_reward": 0.1041666679084301, "step": 16 }, { "completion_length": 2437.750011444092, "epoch": 0.019428571428571427, "grad_norm": 0.14334183931350708, "kl": 3.676116466522217e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0128, "reward": 0.32777530141174793, "reward_std": 0.8251273017376661, "rewards/cosine_scaled_reward": 0.026506672613322735, "rewards/format_reward": 0.6458333432674408, "step": 17 }, { "completion_length": 2874.2292098999023, "epoch": 0.02057142857142857, "grad_norm": 0.1185201108455658, "kl": 1.5683472156524658e-05, "learning_rate": 3.6e-07, "loss": 0.0031, "reward": -0.024667851626873016, "reward_std": 0.8119602091610432, "rewards/cosine_scaled_reward": -0.12354222661815584, "rewards/format_reward": 0.4583333395421505, "step": 18 }, { "completion_length": 3046.6458587646484, "epoch": 0.021714285714285714, "grad_norm": 0.19736716151237488, "kl": 2.751254942268133e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0779, "reward": 0.09869576059281826, "reward_std": 0.8057069275528193, "rewards/cosine_scaled_reward": 0.01759765949100256, "rewards/format_reward": 0.3541666753590107, "step": 19 }, { "completion_length": 2335.291732788086, "epoch": 0.022857142857142857, "grad_norm": 0.21245542168617249, "kl": 1.4175660908222198e-05, "learning_rate": 4e-07, "loss": 0.1185, "reward": 0.38306641951203346, "reward_std": 0.8724737018346786, "rewards/cosine_scaled_reward": 0.04800319205969572, "rewards/format_reward": 0.666666679084301, "step": 20 }, { "completion_length": 2711.3958435058594, "epoch": 0.024, "grad_norm": 0.15821056067943573, "kl": 2.4184584617614746e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0511, "reward": 0.1797822918742895, "reward_std": 0.9325026646256447, "rewards/cosine_scaled_reward": -0.0036781951785087585, "rewards/format_reward": 0.47916666977107525, "step": 21 }, { "completion_length": 1910.2083892822266, "epoch": 0.025142857142857144, "grad_norm": 0.14541339874267578, "kl": 2.391217276453972e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0295, "reward": 0.347878472879529, "reward_std": 0.6697478331625462, "rewards/cosine_scaled_reward": -0.004899490624666214, "rewards/format_reward": 0.7500000074505806, "step": 22 }, { "completion_length": 2642.2083740234375, "epoch": 0.026285714285714287, "grad_norm": 0.13504153490066528, "kl": 2.520345151424408e-05, "learning_rate": 4.6e-07, "loss": 0.0768, "reward": 0.019413750036619604, "reward_std": 0.9948069639503956, "rewards/cosine_scaled_reward": -0.10891764007828897, "rewards/format_reward": 0.45833334513008595, "step": 23 }, { "completion_length": 2697.000015258789, "epoch": 0.027428571428571427, "grad_norm": 0.13816049695014954, "kl": 2.109631896018982e-05, "learning_rate": 4.8e-07, "loss": 0.0697, "reward": 0.3483977415598929, "reward_std": 0.8868874367326498, "rewards/cosine_scaled_reward": 0.06968503817915916, "rewards/format_reward": 0.5833333432674408, "step": 24 }, { "completion_length": 2679.3958587646484, "epoch": 0.02857142857142857, "grad_norm": 0.10494664311408997, "kl": 3.129430115222931e-05, "learning_rate": 5e-07, "loss": 0.001, "reward": 0.1587733030319214, "reward_std": 0.7055602557957172, "rewards/cosine_scaled_reward": 0.02119587583001703, "rewards/format_reward": 0.4375000074505806, "step": 25 }, { "completion_length": 3099.979217529297, "epoch": 0.029714285714285714, "grad_norm": 0.075159452855587, "kl": 2.6673078536987305e-05, "learning_rate": 5.2e-07, "loss": 0.003, "reward": -0.006685070693492889, "reward_std": 0.5016829147934914, "rewards/cosine_scaled_reward": -0.08887793682515621, "rewards/format_reward": 0.45833333395421505, "step": 26 }, { "completion_length": 2958.8958740234375, "epoch": 0.030857142857142857, "grad_norm": 0.13963520526885986, "kl": 1.3343989849090576e-05, "learning_rate": 5.4e-07, "loss": 0.0628, "reward": 0.024143089074641466, "reward_std": 0.7820315174758434, "rewards/cosine_scaled_reward": -0.0528924111276865, "rewards/format_reward": 0.3958333358168602, "step": 27 }, { "completion_length": 2831.958366394043, "epoch": 0.032, "grad_norm": 0.0931120440363884, "kl": 2.434663474559784e-05, "learning_rate": 5.6e-07, "loss": 0.045, "reward": -0.018430547177558765, "reward_std": 0.626394847407937, "rewards/cosine_scaled_reward": -0.051781938411295414, "rewards/format_reward": 0.3541666753590107, "step": 28 }, { "completion_length": 3351.750030517578, "epoch": 0.03314285714285714, "grad_norm": 0.10650024563074112, "kl": 2.886354923248291e-05, "learning_rate": 5.8e-07, "loss": 0.0438, "reward": -0.466870941221714, "reward_std": 0.4314545188099146, "rewards/cosine_scaled_reward": -0.2414477914571762, "rewards/format_reward": 0.1458333358168602, "step": 29 }, { "completion_length": 2859.812545776367, "epoch": 0.03428571428571429, "grad_norm": 0.18510645627975464, "kl": 1.806020736694336e-05, "learning_rate": 6e-07, "loss": 0.1004, "reward": 0.23486553132534027, "reward_std": 1.0620297119021416, "rewards/cosine_scaled_reward": 0.04186862939968705, "rewards/format_reward": 0.4583333469927311, "step": 30 }, { "completion_length": 3150.250030517578, "epoch": 0.03542857142857143, "grad_norm": 0.11023243516683578, "kl": 1.753866672515869e-05, "learning_rate": 6.2e-07, "loss": 0.0007, "reward": -0.2325914899702184, "reward_std": 0.6160467248409986, "rewards/cosine_scaled_reward": -0.16382855689153075, "rewards/format_reward": 0.29166667722165585, "step": 31 }, { "completion_length": 3241.166717529297, "epoch": 0.036571428571428574, "grad_norm": 0.16251012682914734, "kl": 3.0582770705223083e-05, "learning_rate": 6.4e-07, "loss": 0.078, "reward": -0.17190912459045649, "reward_std": 0.7624128982424736, "rewards/cosine_scaled_reward": -0.11033781431615353, "rewards/format_reward": 0.25000000931322575, "step": 32 }, { "completion_length": 3253.625030517578, "epoch": 0.037714285714285714, "grad_norm": 0.1084270030260086, "kl": 4.7340989112854004e-05, "learning_rate": 6.6e-07, "loss": 0.0213, "reward": -0.2144899107515812, "reward_std": 0.7134921476244926, "rewards/cosine_scaled_reward": -0.15857082698494196, "rewards/format_reward": 0.29166667349636555, "step": 33 }, { "completion_length": 2398.7708587646484, "epoch": 0.038857142857142854, "grad_norm": 0.14240942895412445, "kl": 6.242096424102783e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0497, "reward": 0.31558607518672943, "reward_std": 0.8723539188504219, "rewards/cosine_scaled_reward": 0.06751577369868755, "rewards/format_reward": 0.541666679084301, "step": 34 }, { "completion_length": 3059.1667098999023, "epoch": 0.04, "grad_norm": 0.13201534748077393, "kl": 4.059448838233948e-05, "learning_rate": 7e-07, "loss": 0.0401, "reward": -0.0117220189422369, "reward_std": 0.9031526632606983, "rewards/cosine_scaled_reward": -0.04519825894385576, "rewards/format_reward": 0.3125000037252903, "step": 35 }, { "completion_length": 3364.4791870117188, "epoch": 0.04114285714285714, "grad_norm": 0.09137182682752609, "kl": 6.282329559326172e-05, "learning_rate": 7.2e-07, "loss": 0.0275, "reward": -0.37299076607450843, "reward_std": 0.5668806284666061, "rewards/cosine_scaled_reward": -0.19870344595983624, "rewards/format_reward": 0.16666667349636555, "step": 36 }, { "completion_length": 3180.062530517578, "epoch": 0.04228571428571429, "grad_norm": 0.06048983708024025, "kl": 6.483495235443115e-05, "learning_rate": 7.4e-07, "loss": 0.0165, "reward": -0.4461427731439471, "reward_std": 0.4245890639722347, "rewards/cosine_scaled_reward": -0.26771107502281666, "rewards/format_reward": 0.22916666977107525, "step": 37 }, { "completion_length": 3264.937530517578, "epoch": 0.04342857142857143, "grad_norm": 0.09965316206216812, "kl": 6.621703505516052e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0253, "reward": -0.2411605268716812, "reward_std": 0.5170722529292107, "rewards/cosine_scaled_reward": -0.10409475707274396, "rewards/format_reward": 0.1666666679084301, "step": 38 }, { "completion_length": 2869.1875534057617, "epoch": 0.044571428571428574, "grad_norm": 0.05097671225667, "kl": 7.027853280305862e-05, "learning_rate": 7.799999999999999e-07, "loss": 0.0051, "reward": 0.1772767100483179, "reward_std": 0.425695575773716, "rewards/cosine_scaled_reward": 0.008680417202413082, "rewards/format_reward": 0.5208333358168602, "step": 39 }, { "completion_length": 2741.87508392334, "epoch": 0.045714285714285714, "grad_norm": 0.1285083144903183, "kl": 0.00026188790798187256, "learning_rate": 8e-07, "loss": 0.0503, "reward": -0.07532945368438959, "reward_std": 0.5845174305140972, "rewards/cosine_scaled_reward": -0.10031583718955517, "rewards/format_reward": 0.3750000037252903, "step": 40 }, { "completion_length": 3102.6041870117188, "epoch": 0.046857142857142854, "grad_norm": 0.11375687271356583, "kl": 9.709596633911133e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0088, "reward": -0.23022597841918468, "reward_std": 0.6951021775603294, "rewards/cosine_scaled_reward": -0.16865359526127577, "rewards/format_reward": 0.2916666716337204, "step": 41 }, { "completion_length": 2861.2708435058594, "epoch": 0.048, "grad_norm": 0.04051314666867256, "kl": 8.571147918701172e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0008, "reward": -0.4261672543361783, "reward_std": 0.2807927541434765, "rewards/cosine_scaled_reward": -0.277394006960094, "rewards/format_reward": 0.2916666679084301, "step": 42 }, { "completion_length": 3074.937530517578, "epoch": 0.04914285714285714, "grad_norm": 0.099971242249012, "kl": 7.382780313491821e-05, "learning_rate": 8.599999999999999e-07, "loss": 0.0538, "reward": -0.23065751791000366, "reward_std": 0.6040490940213203, "rewards/cosine_scaled_reward": -0.13301679654978216, "rewards/format_reward": 0.2291666679084301, "step": 43 }, { "completion_length": 2720.145881652832, "epoch": 0.05028571428571429, "grad_norm": 0.10246309638023376, "kl": 0.0003544166684150696, "learning_rate": 8.799999999999999e-07, "loss": -0.011, "reward": 0.011595025658607483, "reward_std": 0.5640863105654716, "rewards/cosine_scaled_reward": -0.05820102244615555, "rewards/format_reward": 0.4166666679084301, "step": 44 }, { "completion_length": 3374.291717529297, "epoch": 0.05142857142857143, "grad_norm": 0.11695674061775208, "kl": 0.00015322864055633545, "learning_rate": 9e-07, "loss": 0.0382, "reward": -0.16595029830932617, "reward_std": 0.7633817754685879, "rewards/cosine_scaled_reward": -0.08942665439099073, "rewards/format_reward": 0.20833333767950535, "step": 45 }, { "completion_length": 3254.958335876465, "epoch": 0.052571428571428575, "grad_norm": 0.06262751668691635, "kl": 0.0002702465280890465, "learning_rate": 9.2e-07, "loss": 0.0021, "reward": -0.45498750917613506, "reward_std": 0.3716874625533819, "rewards/cosine_scaled_reward": -0.24009231384843588, "rewards/format_reward": 0.1666666679084301, "step": 46 }, { "completion_length": 2913.1250610351562, "epoch": 0.053714285714285714, "grad_norm": 0.13851620256900787, "kl": 0.0001294887624680996, "learning_rate": 9.399999999999999e-07, "loss": 0.1169, "reward": 0.13789150305092335, "reward_std": 0.9271005466580391, "rewards/cosine_scaled_reward": 0.006347283720970154, "rewards/format_reward": 0.416666679084301, "step": 47 }, { "completion_length": 2858.250045776367, "epoch": 0.054857142857142854, "grad_norm": 0.12427303940057755, "kl": 0.0005601570010185242, "learning_rate": 9.6e-07, "loss": 0.0265, "reward": -0.051424789475277066, "reward_std": 0.813489394262433, "rewards/cosine_scaled_reward": -0.07831720494141337, "rewards/format_reward": 0.3333333358168602, "step": 48 }, { "completion_length": 2364.562530517578, "epoch": 0.056, "grad_norm": 0.09100354462862015, "kl": 0.0003460892476141453, "learning_rate": 9.8e-07, "loss": 0.0101, "reward": 0.31277667777612805, "reward_std": 0.7601135969161987, "rewards/cosine_scaled_reward": 0.04609484411776066, "rewards/format_reward": 0.583333333954215, "step": 49 }, { "completion_length": 2957.937530517578, "epoch": 0.05714285714285714, "grad_norm": 0.11310350149869919, "kl": 0.0003506038337945938, "learning_rate": 1e-06, "loss": 0.0231, "reward": 0.07819816470146179, "reward_std": 0.5129979718476534, "rewards/cosine_scaled_reward": 0.04247652553021908, "rewards/format_reward": 0.3125, "step": 50 }, { "completion_length": 2296.833335876465, "epoch": 0.05828571428571429, "grad_norm": 0.09167315810918808, "kl": 0.0011107921600341797, "learning_rate": 9.999890338174275e-07, "loss": 0.0018, "reward": 0.04252960532903671, "reward_std": 0.607517946511507, "rewards/cosine_scaled_reward": -0.12472434528172016, "rewards/format_reward": 0.5833333358168602, "step": 51 }, { "completion_length": 2910.0417289733887, "epoch": 0.05942857142857143, "grad_norm": 0.1661226898431778, "kl": 0.0011932700872421265, "learning_rate": 9.999561358041868e-07, "loss": 0.0441, "reward": 0.25355312041938305, "reward_std": 1.1632588431239128, "rewards/cosine_scaled_reward": 0.05952752288430929, "rewards/format_reward": 0.4375000037252903, "step": 52 }, { "completion_length": 2785.2500610351562, "epoch": 0.060571428571428575, "grad_norm": 0.1787281036376953, "kl": 0.0006063804030418396, "learning_rate": 9.999013075636804e-07, "loss": 0.0319, "reward": 0.1302148699760437, "reward_std": 0.8732462916523218, "rewards/cosine_scaled_reward": -0.028057849034667015, "rewards/format_reward": 0.4791666716337204, "step": 53 }, { "completion_length": 2941.104248046875, "epoch": 0.061714285714285715, "grad_norm": 0.21909195184707642, "kl": 0.00023331446573138237, "learning_rate": 9.998245517681593e-07, "loss": 0.1682, "reward": 0.3165966849774122, "reward_std": 1.0916050113737583, "rewards/cosine_scaled_reward": 0.09867796488106251, "rewards/format_reward": 0.45833334885537624, "step": 54 }, { "completion_length": 3090.6458587646484, "epoch": 0.06285714285714286, "grad_norm": 0.1303940713405609, "kl": 0.0007647275924682617, "learning_rate": 9.997258721585931e-07, "loss": 0.0367, "reward": 0.04699781024828553, "reward_std": 0.8400920890271664, "rewards/cosine_scaled_reward": -0.015484219416975975, "rewards/format_reward": 0.33333333767950535, "step": 55 }, { "completion_length": 2963.4584045410156, "epoch": 0.064, "grad_norm": 0.09380260854959488, "kl": 0.0018551349639892578, "learning_rate": 9.996052735444862e-07, "loss": 0.0177, "reward": -0.06758141331374645, "reward_std": 0.6088844947516918, "rewards/cosine_scaled_reward": -0.0913094412535429, "rewards/format_reward": 0.3750000074505806, "step": 56 }, { "completion_length": 3346.7708435058594, "epoch": 0.06514285714285714, "grad_norm": 0.13119955360889435, "kl": 0.00034799426794052124, "learning_rate": 9.994627618036452e-07, "loss": 0.0109, "reward": -0.14502036944031715, "reward_std": 0.866327153518796, "rewards/cosine_scaled_reward": -0.12297584302723408, "rewards/format_reward": 0.29166666977107525, "step": 57 }, { "completion_length": 2402.6458740234375, "epoch": 0.06628571428571428, "grad_norm": 0.11010728776454926, "kl": 0.00420612096786499, "learning_rate": 9.992983438818915e-07, "loss": 0.0515, "reward": 0.10594136267900467, "reward_std": 0.6864449828863144, "rewards/cosine_scaled_reward": -0.09731632098555565, "rewards/format_reward": 0.604166679084301, "step": 58 }, { "completion_length": 3156.6458740234375, "epoch": 0.06742857142857143, "grad_norm": 0.14981059730052948, "kl": 0.0015277080237865448, "learning_rate": 9.991120277927223e-07, "loss": 0.0349, "reward": -0.15045391581952572, "reward_std": 0.6176320239901543, "rewards/cosine_scaled_reward": -0.09630896709859371, "rewards/format_reward": 0.2708333432674408, "step": 59 }, { "completion_length": 3069.6041870117188, "epoch": 0.06857142857142857, "grad_norm": 0.08052746206521988, "kl": 0.0005385726690292358, "learning_rate": 9.989038226169207e-07, "loss": 0.0124, "reward": -0.26248200982809067, "reward_std": 0.5521546974778175, "rewards/cosine_scaled_reward": -0.19996057264506817, "rewards/format_reward": 0.3333333395421505, "step": 60 }, { "completion_length": 3061.250030517578, "epoch": 0.06971428571428571, "grad_norm": 0.1755966693162918, "kl": 0.0010443329811096191, "learning_rate": 9.98673738502114e-07, "loss": 0.106, "reward": 0.027474643662571907, "reward_std": 0.7631269134581089, "rewards/cosine_scaled_reward": -0.0676457080990076, "rewards/format_reward": 0.43750000931322575, "step": 61 }, { "completion_length": 2801.791778564453, "epoch": 0.07085714285714285, "grad_norm": 0.5502915382385254, "kl": 0.06185805797576904, "learning_rate": 9.98421786662277e-07, "loss": 0.0777, "reward": 0.16424999572336674, "reward_std": 1.018867939710617, "rewards/cosine_scaled_reward": -0.02840923797339201, "rewards/format_reward": 0.5000000186264515, "step": 62 }, { "completion_length": 2495.2708740234375, "epoch": 0.072, "grad_norm": 0.19492964446544647, "kl": 0.0023099184036254883, "learning_rate": 9.981479793771866e-07, "loss": 0.1135, "reward": 0.40039824694395065, "reward_std": 1.032276712357998, "rewards/cosine_scaled_reward": 0.06732268328778446, "rewards/format_reward": 0.6458333507180214, "step": 63 }, { "completion_length": 3155.354217529297, "epoch": 0.07314285714285715, "grad_norm": 0.13117599487304688, "kl": 0.0020775794982910156, "learning_rate": 9.97852329991824e-07, "loss": 0.0771, "reward": -0.09214674681425095, "reward_std": 0.7898851484060287, "rewards/cosine_scaled_reward": -0.06703834654763341, "rewards/format_reward": 0.27083333767950535, "step": 64 }, { "completion_length": 2716.3542251586914, "epoch": 0.07428571428571429, "grad_norm": 0.11173497885465622, "kl": 0.00382232666015625, "learning_rate": 9.975348529157229e-07, "loss": 0.0204, "reward": -0.11022741347551346, "reward_std": 0.672077115625143, "rewards/cosine_scaled_reward": -0.14180615730583668, "rewards/format_reward": 0.3958333358168602, "step": 65 }, { "completion_length": 2310.1041831970215, "epoch": 0.07542857142857143, "grad_norm": 0.06708691269159317, "kl": 0.0025424957275390625, "learning_rate": 9.971955636222684e-07, "loss": 0.0339, "reward": 0.20656662248075008, "reward_std": 0.589762119576335, "rewards/cosine_scaled_reward": 0.03981146775186062, "rewards/format_reward": 0.4791666716337204, "step": 66 }, { "completion_length": 3528.8541870117188, "epoch": 0.07657142857142857, "grad_norm": 0.07865083962678909, "kl": 0.0022742748260498047, "learning_rate": 9.968344786479415e-07, "loss": 0.0142, "reward": -0.46846646815538406, "reward_std": 0.5137450993061066, "rewards/cosine_scaled_reward": -0.22528725676238537, "rewards/format_reward": 0.1041666679084301, "step": 67 }, { "completion_length": 2388.875030517578, "epoch": 0.07771428571428571, "grad_norm": 0.1428786665201187, "kl": 0.009924888610839844, "learning_rate": 9.964516155915151e-07, "loss": 0.0722, "reward": 0.0902576670050621, "reward_std": 0.7360437363386154, "rewards/cosine_scaled_reward": -0.06687035039067268, "rewards/format_reward": 0.5208333414047956, "step": 68 }, { "completion_length": 2723.7708740234375, "epoch": 0.07885714285714286, "grad_norm": 0.08696511387825012, "kl": 0.004919290542602539, "learning_rate": 9.960469931131936e-07, "loss": 0.0544, "reward": -0.24375398270785809, "reward_std": 0.5107810720801353, "rewards/cosine_scaled_reward": -0.21045764535665512, "rewards/format_reward": 0.37500000186264515, "step": 69 }, { "completion_length": 3078.8125762939453, "epoch": 0.08, "grad_norm": 0.09773669391870499, "kl": 0.0020369887351989746, "learning_rate": 9.956206309337066e-07, "loss": 0.0139, "reward": -0.0818065321072936, "reward_std": 0.6509491205215454, "rewards/cosine_scaled_reward": -0.11709374003112316, "rewards/format_reward": 0.39583334140479565, "step": 70 }, { "completion_length": 2727.979202270508, "epoch": 0.08114285714285714, "grad_norm": 0.07420142740011215, "kl": 0.004831179976463318, "learning_rate": 9.951725498333448e-07, "loss": 0.002, "reward": -0.03378719836473465, "reward_std": 0.6056636273860931, "rewards/cosine_scaled_reward": -0.07006076944526285, "rewards/format_reward": 0.37500000558793545, "step": 71 }, { "completion_length": 3157.750030517578, "epoch": 0.08228571428571428, "grad_norm": 0.12406568974256516, "kl": 0.006371498107910156, "learning_rate": 9.947027716509488e-07, "loss": 0.0799, "reward": -0.2358899898827076, "reward_std": 0.626086350530386, "rewards/cosine_scaled_reward": -0.14525874331593513, "rewards/format_reward": 0.25000000186264515, "step": 72 }, { "completion_length": 3562.5625, "epoch": 0.08342857142857144, "grad_norm": 0.07233923673629761, "kl": 0.0012898445129394531, "learning_rate": 9.942113192828444e-07, "loss": 0.0053, "reward": -0.47873567789793015, "reward_std": 0.4625825770199299, "rewards/cosine_scaled_reward": -0.23070307821035385, "rewards/format_reward": 0.1041666679084301, "step": 73 }, { "completion_length": 3244.3334045410156, "epoch": 0.08457142857142858, "grad_norm": 0.1488964557647705, "kl": 0.003963470458984375, "learning_rate": 9.93698216681727e-07, "loss": 0.0581, "reward": -0.0020135529339313507, "reward_std": 0.7993322685360909, "rewards/cosine_scaled_reward": -0.022909073159098625, "rewards/format_reward": 0.29166666977107525, "step": 74 }, { "completion_length": 3028.687545776367, "epoch": 0.08571428571428572, "grad_norm": 0.08981792628765106, "kl": 0.005274057388305664, "learning_rate": 9.931634888554935e-07, "loss": 0.0046, "reward": 0.03718305751681328, "reward_std": 0.6846093349158764, "rewards/cosine_scaled_reward": -0.010177075862884521, "rewards/format_reward": 0.3333333395421505, "step": 75 }, { "completion_length": 2771.2083740234375, "epoch": 0.08685714285714285, "grad_norm": 0.10527168214321136, "kl": 0.0019083023071289062, "learning_rate": 9.926071618660237e-07, "loss": 0.0698, "reward": -0.13475606916472316, "reward_std": 0.6331770308315754, "rewards/cosine_scaled_reward": -0.19242265075445175, "rewards/format_reward": 0.4791666753590107, "step": 76 }, { "completion_length": 3160.062545776367, "epoch": 0.088, "grad_norm": 0.053626783192157745, "kl": 0.0024300217628479004, "learning_rate": 9.9202926282791e-07, "loss": 0.0089, "reward": -0.07912399154156446, "reward_std": 0.4063580594956875, "rewards/cosine_scaled_reward": -0.07036877004429698, "rewards/format_reward": 0.3333333358168602, "step": 77 }, { "completion_length": 3157.354217529297, "epoch": 0.08914285714285715, "grad_norm": 0.12693333625793457, "kl": 0.0019735097885131836, "learning_rate": 9.91429819907136e-07, "loss": 0.043, "reward": 0.10285145416855812, "reward_std": 0.8885178752243519, "rewards/cosine_scaled_reward": 0.01143704541027546, "rewards/format_reward": 0.35416666977107525, "step": 78 }, { "completion_length": 2627.145866394043, "epoch": 0.09028571428571429, "grad_norm": 0.0993918776512146, "kl": 0.005728721618652344, "learning_rate": 9.908088623197048e-07, "loss": 0.0208, "reward": 0.005513674899702892, "reward_std": 0.696781549602747, "rewards/cosine_scaled_reward": -0.11277296394109726, "rewards/format_reward": 0.5000000111758709, "step": 79 }, { "completion_length": 3378.6666870117188, "epoch": 0.09142857142857143, "grad_norm": 0.11993207037448883, "kl": 0.0034465789794921875, "learning_rate": 9.901664203302124e-07, "loss": 0.0148, "reward": -0.25228459760546684, "reward_std": 0.6576736122369766, "rewards/cosine_scaled_reward": -0.15029108710587025, "rewards/format_reward": 0.2291666753590107, "step": 80 }, { "completion_length": 3124.9792098999023, "epoch": 0.09257142857142857, "grad_norm": 0.07844506949186325, "kl": 0.0075626373291015625, "learning_rate": 9.895025252503755e-07, "loss": 0.013, "reward": -0.3477923655882478, "reward_std": 0.4749002642929554, "rewards/cosine_scaled_reward": -0.2027296293526888, "rewards/format_reward": 0.22916666977107525, "step": 81 }, { "completion_length": 2722.375045776367, "epoch": 0.09371428571428571, "grad_norm": 0.13039354979991913, "kl": 0.006313920021057129, "learning_rate": 9.888172094375033e-07, "loss": -0.0104, "reward": 0.08498383313417435, "reward_std": 0.7605466395616531, "rewards/cosine_scaled_reward": -0.03437880612909794, "rewards/format_reward": 0.4375000074505806, "step": 82 }, { "completion_length": 2761.1250228881836, "epoch": 0.09485714285714286, "grad_norm": 0.11148078739643097, "kl": 0.006354331970214844, "learning_rate": 9.881105062929221e-07, "loss": 0.0171, "reward": -0.1586984060704708, "reward_std": 0.6908692009747028, "rewards/cosine_scaled_reward": -0.14450077898800373, "rewards/format_reward": 0.3333333358168602, "step": 83 }, { "completion_length": 3147.3333587646484, "epoch": 0.096, "grad_norm": 0.15051236748695374, "kl": 0.0017566680908203125, "learning_rate": 9.873824502603459e-07, "loss": 0.0437, "reward": 0.20425852667540312, "reward_std": 0.8553840257227421, "rewards/cosine_scaled_reward": 0.06748985398371588, "rewards/format_reward": 0.3958333395421505, "step": 84 }, { "completion_length": 3178.7500762939453, "epoch": 0.09714285714285714, "grad_norm": 0.12316413968801498, "kl": 0.002705097198486328, "learning_rate": 9.866330768241983e-07, "loss": 0.0373, "reward": -0.051826220005750656, "reward_std": 0.8340367153286934, "rewards/cosine_scaled_reward": -0.0864723757840693, "rewards/format_reward": 0.35416666977107525, "step": 85 }, { "completion_length": 3106.2083740234375, "epoch": 0.09828571428571428, "grad_norm": 0.16358214616775513, "kl": 0.005809783935546875, "learning_rate": 9.85862422507884e-07, "loss": 0.0502, "reward": -0.21086052944883704, "reward_std": 0.6448673270642757, "rewards/cosine_scaled_reward": -0.1603346224874258, "rewards/format_reward": 0.3125000074505806, "step": 86 }, { "completion_length": 2988.3334045410156, "epoch": 0.09942857142857142, "grad_norm": 0.18081220984458923, "kl": 0.006814241409301758, "learning_rate": 9.850705248720068e-07, "loss": 0.0524, "reward": 0.13324597105383873, "reward_std": 0.8470934070646763, "rewards/cosine_scaled_reward": -0.020617252215743065, "rewards/format_reward": 0.47916668094694614, "step": 87 }, { "completion_length": 2812.625030517578, "epoch": 0.10057142857142858, "grad_norm": 0.1538206785917282, "kl": 0.012554168701171875, "learning_rate": 9.8425742251254e-07, "loss": 0.0806, "reward": 0.1440363209694624, "reward_std": 0.7827430870383978, "rewards/cosine_scaled_reward": -0.009355325251817703, "rewards/format_reward": 0.4791666753590107, "step": 88 }, { "completion_length": 3243.562515258789, "epoch": 0.10171428571428572, "grad_norm": 0.10299122333526611, "kl": 0.004688262939453125, "learning_rate": 9.83423155058946e-07, "loss": 0.0151, "reward": -0.2134521808475256, "reward_std": 0.6248220186680555, "rewards/cosine_scaled_reward": -0.12200713902711868, "rewards/format_reward": 0.22916667349636555, "step": 89 }, { "completion_length": 2706.8958587646484, "epoch": 0.10285714285714286, "grad_norm": 0.08340943604707718, "kl": 0.009927749633789062, "learning_rate": 9.825677631722435e-07, "loss": 0.0087, "reward": -0.25001570768654346, "reward_std": 0.5442444123327732, "rewards/cosine_scaled_reward": -0.24867095332592726, "rewards/format_reward": 0.43750000558793545, "step": 90 }, { "completion_length": 2932.2500762939453, "epoch": 0.104, "grad_norm": 0.19951650500297546, "kl": 0.004284858703613281, "learning_rate": 9.816912885430258e-07, "loss": 0.0723, "reward": 0.07665663212537766, "reward_std": 0.8412054367363453, "rewards/cosine_scaled_reward": -0.04515018220990896, "rewards/format_reward": 0.4375000037252903, "step": 91 }, { "completion_length": 2869.812545776367, "epoch": 0.10514285714285715, "grad_norm": 0.12959204614162445, "kl": 0.010074138641357422, "learning_rate": 9.807937738894303e-07, "loss": 0.0231, "reward": 0.038971804082393646, "reward_std": 0.8026900477707386, "rewards/cosine_scaled_reward": -0.07580470014363527, "rewards/format_reward": 0.45833334140479565, "step": 92 }, { "completion_length": 3530.5625, "epoch": 0.10628571428571429, "grad_norm": 0.09351453185081482, "kl": 0.005786895751953125, "learning_rate": 9.798752629550546e-07, "loss": 0.0137, "reward": -0.5460330247879028, "reward_std": 0.4978806171566248, "rewards/cosine_scaled_reward": -0.25744735077023506, "rewards/format_reward": 0.06250000186264515, "step": 93 }, { "completion_length": 3159.645835876465, "epoch": 0.10742857142857143, "grad_norm": 0.09856709837913513, "kl": 0.009381294250488281, "learning_rate": 9.78935800506826e-07, "loss": 0.0331, "reward": -0.2638606168329716, "reward_std": 0.6384533829987049, "rewards/cosine_scaled_reward": -0.1464436650276184, "rewards/format_reward": 0.2083333358168602, "step": 94 }, { "completion_length": 3420.812530517578, "epoch": 0.10857142857142857, "grad_norm": 0.0972990095615387, "kl": 0.0027017593383789062, "learning_rate": 9.779754323328192e-07, "loss": 0.0175, "reward": -0.29529019072651863, "reward_std": 0.6119959745556116, "rewards/cosine_scaled_reward": -0.18471521139144897, "rewards/format_reward": 0.2500000074505806, "step": 95 }, { "completion_length": 3003.437515258789, "epoch": 0.10971428571428571, "grad_norm": 0.10754227638244629, "kl": 0.008491039276123047, "learning_rate": 9.769942052400235e-07, "loss": 0.0294, "reward": -0.0011827312409877777, "reward_std": 0.7199659757316113, "rewards/cosine_scaled_reward": -0.056293437257409096, "rewards/format_reward": 0.3750000037252903, "step": 96 }, { "completion_length": 3227.7708740234375, "epoch": 0.11085714285714286, "grad_norm": 0.055975738912820816, "kl": 0.005445957183837891, "learning_rate": 9.759921670520634e-07, "loss": 0.0114, "reward": -0.04315417259931564, "reward_std": 0.39706853218376637, "rewards/cosine_scaled_reward": -0.03440806642174721, "rewards/format_reward": 0.31250000186264515, "step": 97 }, { "completion_length": 3059.354217529297, "epoch": 0.112, "grad_norm": 0.10543321073055267, "kl": 0.005114555358886719, "learning_rate": 9.749693666068663e-07, "loss": 0.0567, "reward": -0.14790542237460613, "reward_std": 0.6289958246052265, "rewards/cosine_scaled_reward": -0.12040873523801565, "rewards/format_reward": 0.3125000111758709, "step": 98 }, { "completion_length": 2944.104217529297, "epoch": 0.11314285714285714, "grad_norm": 0.14774614572525024, "kl": 0.007845878601074219, "learning_rate": 9.739258537542835e-07, "loss": 0.0208, "reward": 0.024691712111234665, "reward_std": 0.8161002658307552, "rewards/cosine_scaled_reward": -0.026264889165759087, "rewards/format_reward": 0.33333334140479565, "step": 99 }, { "completion_length": 3003.562515258789, "epoch": 0.11428571428571428, "grad_norm": 0.14759190380573273, "kl": 0.00844573974609375, "learning_rate": 9.728616793536587e-07, "loss": 0.0498, "reward": -0.03464020788669586, "reward_std": 0.7968008350580931, "rewards/cosine_scaled_reward": -0.07347729802131653, "rewards/format_reward": 0.35416667349636555, "step": 100 }, { "completion_length": 2660.7500228881836, "epoch": 0.11542857142857142, "grad_norm": 0.05885668471455574, "kl": 0.004193544387817383, "learning_rate": 9.717768952713511e-07, "loss": 0.0099, "reward": -0.024280589073896408, "reward_std": 0.478687334805727, "rewards/cosine_scaled_reward": -0.06905538472346961, "rewards/format_reward": 0.39583333395421505, "step": 101 }, { "completion_length": 3017.6459045410156, "epoch": 0.11657142857142858, "grad_norm": 0.19236327707767487, "kl": 0.01230621337890625, "learning_rate": 9.706715543782064e-07, "loss": 0.1031, "reward": 0.0067907206248492, "reward_std": 0.7886966746300459, "rewards/cosine_scaled_reward": -0.07641204819083214, "rewards/format_reward": 0.41666667722165585, "step": 102 }, { "completion_length": 3141.437545776367, "epoch": 0.11771428571428572, "grad_norm": 0.12005823105573654, "kl": 0.010993003845214844, "learning_rate": 9.695457105469804e-07, "loss": 0.0418, "reward": 0.11525936797261238, "reward_std": 0.7872378453612328, "rewards/cosine_scaled_reward": -0.020046040415763855, "rewards/format_reward": 0.4583333507180214, "step": 103 }, { "completion_length": 2757.520851135254, "epoch": 0.11885714285714286, "grad_norm": 0.16546285152435303, "kl": 0.007067680358886719, "learning_rate": 9.683994186497132e-07, "loss": 0.029, "reward": -0.012922056019306183, "reward_std": 0.7183502614498138, "rewards/cosine_scaled_reward": -0.08642422500997782, "rewards/format_reward": 0.41666667349636555, "step": 104 }, { "completion_length": 2874.68758392334, "epoch": 0.12, "grad_norm": 0.15343305468559265, "kl": 0.0062236785888671875, "learning_rate": 9.672327345550543e-07, "loss": -0.0011, "reward": 0.3746343031525612, "reward_std": 0.9715399444103241, "rewards/cosine_scaled_reward": 0.13150912104174495, "rewards/format_reward": 0.47916668094694614, "step": 105 }, { "completion_length": 2406.2292404174805, "epoch": 0.12114285714285715, "grad_norm": 0.11538145691156387, "kl": 0.014267921447753906, "learning_rate": 9.66045715125541e-07, "loss": 0.0512, "reward": 0.7112023187801242, "reward_std": 0.7698421813547611, "rewards/cosine_scaled_reward": 0.2695285137742758, "rewards/format_reward": 0.6875000111758709, "step": 106 }, { "completion_length": 2906.083366394043, "epoch": 0.12228571428571429, "grad_norm": 0.1356947422027588, "kl": 0.0071544647216796875, "learning_rate": 9.648384182148252e-07, "loss": 0.0559, "reward": 0.04975247010588646, "reward_std": 0.6387073248624802, "rewards/cosine_scaled_reward": -0.04459162801504135, "rewards/format_reward": 0.43750001303851604, "step": 107 }, { "completion_length": 2924.479202270508, "epoch": 0.12342857142857143, "grad_norm": 1.227287769317627, "kl": 0.09261322021484375, "learning_rate": 9.636109026648554e-07, "loss": 0.0411, "reward": -0.06936776265501976, "reward_std": 0.6040569245815277, "rewards/cosine_scaled_reward": -0.09396206960082054, "rewards/format_reward": 0.3750000037252903, "step": 108 }, { "completion_length": 3159.4166870117188, "epoch": 0.12457142857142857, "grad_norm": 0.06464666873216629, "kl": 0.0052165985107421875, "learning_rate": 9.623632283030077e-07, "loss": -0.0001, "reward": -0.19677976984530687, "reward_std": 0.3907594494521618, "rewards/cosine_scaled_reward": -0.13640070147812366, "rewards/format_reward": 0.3125, "step": 109 }, { "completion_length": 2941.979217529297, "epoch": 0.12571428571428572, "grad_norm": 0.11352120339870453, "kl": 0.0075016021728515625, "learning_rate": 9.610954559391704e-07, "loss": 0.0559, "reward": -0.02889834251254797, "reward_std": 0.6912853047251701, "rewards/cosine_scaled_reward": -0.09470331901684403, "rewards/format_reward": 0.41666667349636555, "step": 110 }, { "completion_length": 3445.0625610351562, "epoch": 0.12685714285714286, "grad_norm": 0.1718732863664627, "kl": 0.0095977783203125, "learning_rate": 9.598076473627796e-07, "loss": 0.0525, "reward": -0.1273586554452777, "reward_std": 0.7887241318821907, "rewards/cosine_scaled_reward": -0.042100945487618446, "rewards/format_reward": 0.1666666716337204, "step": 111 }, { "completion_length": 3372.2083740234375, "epoch": 0.128, "grad_norm": 0.13645856082439423, "kl": 0.00563812255859375, "learning_rate": 9.58499865339809e-07, "loss": 0.0184, "reward": -0.056365881115198135, "reward_std": 0.7170692086219788, "rewards/cosine_scaled_reward": -0.048214955255389214, "rewards/format_reward": 0.2916666753590107, "step": 112 }, { "completion_length": 2884.791717529297, "epoch": 0.12914285714285714, "grad_norm": 0.20523111522197723, "kl": 0.010105133056640625, "learning_rate": 9.571721736097088e-07, "loss": -0.0033, "reward": 0.13258023280650377, "reward_std": 0.8612403385341167, "rewards/cosine_scaled_reward": -0.058843023143708706, "rewards/format_reward": 0.5416666809469461, "step": 113 }, { "completion_length": 2625.458396911621, "epoch": 0.13028571428571428, "grad_norm": 0.08083964139223099, "kl": 0.0072994232177734375, "learning_rate": 9.55824636882301e-07, "loss": 0.0276, "reward": -0.11776229925453663, "reward_std": 0.5373759977519512, "rewards/cosine_scaled_reward": -0.2312552430666983, "rewards/format_reward": 0.5833333414047956, "step": 114 }, { "completion_length": 3025.1041717529297, "epoch": 0.13142857142857142, "grad_norm": 0.06019025295972824, "kl": 0.0069713592529296875, "learning_rate": 9.54457320834625e-07, "loss": 0.0315, "reward": -0.2688702419400215, "reward_std": 0.44118795450776815, "rewards/cosine_scaled_reward": -0.14728060085326433, "rewards/format_reward": 0.2291666716337204, "step": 115 }, { "completion_length": 3401.4166870117188, "epoch": 0.13257142857142856, "grad_norm": 0.1261359304189682, "kl": 0.0067615509033203125, "learning_rate": 9.530702921077358e-07, "loss": 0.0071, "reward": -0.318508867174387, "reward_std": 0.602899955585599, "rewards/cosine_scaled_reward": -0.16228950582444668, "rewards/format_reward": 0.1666666716337204, "step": 116 }, { "completion_length": 3060.7916717529297, "epoch": 0.1337142857142857, "grad_norm": 0.0866546705365181, "kl": 0.008612632751464844, "learning_rate": 9.516636183034564e-07, "loss": 0.0005, "reward": -0.3146707344567403, "reward_std": 0.5091048590838909, "rewards/cosine_scaled_reward": -0.2149065202102065, "rewards/format_reward": 0.29166666977107525, "step": 117 }, { "completion_length": 3120.0833435058594, "epoch": 0.13485714285714287, "grad_norm": 0.18219080567359924, "kl": 0.00536346435546875, "learning_rate": 9.502373679810839e-07, "loss": 0.0522, "reward": 0.4105420224368572, "reward_std": 1.0126653835177422, "rewards/cosine_scaled_reward": 0.17556388210505247, "rewards/format_reward": 0.43750000931322575, "step": 118 }, { "completion_length": 2533.9167251586914, "epoch": 0.136, "grad_norm": 0.1974111646413803, "kl": 0.054787635803222656, "learning_rate": 9.487916106540465e-07, "loss": 0.0693, "reward": 0.0521804504096508, "reward_std": 0.7259144820272923, "rewards/cosine_scaled_reward": -0.07517153583467007, "rewards/format_reward": 0.4791666679084301, "step": 119 }, { "completion_length": 2534.4167137145996, "epoch": 0.13714285714285715, "grad_norm": 0.11178319901227951, "kl": 0.009916305541992188, "learning_rate": 9.473264167865171e-07, "loss": 0.0475, "reward": 0.0007900348864495754, "reward_std": 0.6627710647881031, "rewards/cosine_scaled_reward": -0.09777611820027232, "rewards/format_reward": 0.45833333767950535, "step": 120 }, { "completion_length": 2313.229248046875, "epoch": 0.1382857142857143, "grad_norm": 0.16405229270458221, "kl": 0.013139724731445312, "learning_rate": 9.458418577899774e-07, "loss": 0.0402, "reward": 0.3239856115542352, "reward_std": 0.7424066159874201, "rewards/cosine_scaled_reward": -0.005178395658731461, "rewards/format_reward": 0.7083333469927311, "step": 121 }, { "completion_length": 2761.8958587646484, "epoch": 0.13942857142857143, "grad_norm": 0.12051185965538025, "kl": 0.0068817138671875, "learning_rate": 9.443380060197385e-07, "loss": 0.026, "reward": 0.3165215402841568, "reward_std": 0.7121441401541233, "rewards/cosine_scaled_reward": 0.07533489167690277, "rewards/format_reward": 0.5416666697710752, "step": 122 }, { "completion_length": 3158.104217529297, "epoch": 0.14057142857142857, "grad_norm": 0.1131911501288414, "kl": 0.007569313049316406, "learning_rate": 9.428149347714143e-07, "loss": 0.0551, "reward": -0.16298508271574974, "reward_std": 0.7209546975791454, "rewards/cosine_scaled_reward": -0.13522059097886086, "rewards/format_reward": 0.31250000558793545, "step": 123 }, { "completion_length": 2517.312530517578, "epoch": 0.1417142857142857, "grad_norm": 0.12443465739488602, "kl": 0.009019851684570312, "learning_rate": 9.412727182773486e-07, "loss": -0.0164, "reward": 0.19143125228583813, "reward_std": 0.8426807429641485, "rewards/cosine_scaled_reward": -0.0331303218845278, "rewards/format_reward": 0.5625000055879354, "step": 124 }, { "completion_length": 2815.5833740234375, "epoch": 0.14285714285714285, "grad_norm": 0.06811921298503876, "kl": 0.0053272247314453125, "learning_rate": 9.397114317029974e-07, "loss": 0.0266, "reward": -0.04552587552461773, "reward_std": 0.594413885846734, "rewards/cosine_scaled_reward": -0.050011674873530865, "rewards/format_reward": 0.31250000186264515, "step": 125 }, { "completion_length": 2926.750045776367, "epoch": 0.144, "grad_norm": 0.11453459411859512, "kl": 0.005611419677734375, "learning_rate": 9.381311511432658e-07, "loss": 0.0231, "reward": -0.027173910290002823, "reward_std": 0.6515960693359375, "rewards/cosine_scaled_reward": -0.11056716740131378, "rewards/format_reward": 0.45833333395421505, "step": 126 }, { "completion_length": 3180.0416870117188, "epoch": 0.14514285714285713, "grad_norm": 0.13321727514266968, "kl": 0.008800506591796875, "learning_rate": 9.36531953618799e-07, "loss": 0.0247, "reward": -0.2067241296172142, "reward_std": 0.7376469634473324, "rewards/cosine_scaled_reward": -0.1875669350847602, "rewards/format_reward": 0.35416667722165585, "step": 127 }, { "completion_length": 2854.145866394043, "epoch": 0.1462857142857143, "grad_norm": 0.13030071556568146, "kl": 0.008055686950683594, "learning_rate": 9.34913917072228e-07, "loss": 0.0297, "reward": 0.28113045543432236, "reward_std": 0.7527024820446968, "rewards/cosine_scaled_reward": 0.08081426518037915, "rewards/format_reward": 0.47916666977107525, "step": 128 }, { "completion_length": 3468.4583435058594, "epoch": 0.14742857142857144, "grad_norm": 0.09160865843296051, "kl": 0.009267807006835938, "learning_rate": 9.332771203643714e-07, "loss": 0.0182, "reward": -0.3858581744134426, "reward_std": 0.4741293340921402, "rewards/cosine_scaled_reward": -0.17545855604112148, "rewards/format_reward": 0.12500000186264515, "step": 129 }, { "completion_length": 3145.2083435058594, "epoch": 0.14857142857142858, "grad_norm": 0.11552103608846664, "kl": 0.008371353149414062, "learning_rate": 9.316216432703916e-07, "loss": 0.0555, "reward": -0.1405576877295971, "reward_std": 0.6180392988026142, "rewards/cosine_scaled_reward": -0.08423710052738898, "rewards/format_reward": 0.2500000074505806, "step": 130 }, { "completion_length": 2990.750011444092, "epoch": 0.14971428571428572, "grad_norm": 0.1343425065279007, "kl": 0.010997772216796875, "learning_rate": 9.299475664759068e-07, "loss": 0.0573, "reward": -0.0012444127351045609, "reward_std": 0.7613321915268898, "rewards/cosine_scaled_reward": -0.0183742493391037, "rewards/format_reward": 0.29166666977107525, "step": 131 }, { "completion_length": 2849.229217529297, "epoch": 0.15085714285714286, "grad_norm": 0.1754404902458191, "kl": 0.007671356201171875, "learning_rate": 9.282549715730579e-07, "loss": 0.0039, "reward": 0.15461664367467165, "reward_std": 1.0978601425886154, "rewards/cosine_scaled_reward": -0.016693929443135858, "rewards/format_reward": 0.4583333395421505, "step": 132 }, { "completion_length": 3204.5208740234375, "epoch": 0.152, "grad_norm": 0.09437424689531326, "kl": 0.010120391845703125, "learning_rate": 9.265439410565328e-07, "loss": 0.0371, "reward": -0.3896838743239641, "reward_std": 0.46856095641851425, "rewards/cosine_scaled_reward": -0.23207763396203518, "rewards/format_reward": 0.22916666977107525, "step": 133 }, { "completion_length": 2440.8125534057617, "epoch": 0.15314285714285714, "grad_norm": 0.11227884888648987, "kl": 0.010713577270507812, "learning_rate": 9.248145583195447e-07, "loss": 0.0059, "reward": 0.03150389529764652, "reward_std": 0.6699380800127983, "rewards/cosine_scaled_reward": -0.10194769129157066, "rewards/format_reward": 0.5208333358168602, "step": 134 }, { "completion_length": 2048.7083740234375, "epoch": 0.15428571428571428, "grad_norm": 0.11956455558538437, "kl": 0.009142875671386719, "learning_rate": 9.230669076497687e-07, "loss": 0.0417, "reward": 0.611090637743473, "reward_std": 0.8544140718877316, "rewards/cosine_scaled_reward": 0.19521427806466818, "rewards/format_reward": 0.6875, "step": 135 }, { "completion_length": 3031.7708587646484, "epoch": 0.15542857142857142, "grad_norm": 0.16711600124835968, "kl": 0.012142181396484375, "learning_rate": 9.213010742252327e-07, "loss": 0.0697, "reward": 0.3044010065495968, "reward_std": 0.8896326720714569, "rewards/cosine_scaled_reward": 0.08019791916012764, "rewards/format_reward": 0.5000000093132257, "step": 136 }, { "completion_length": 3144.291717529297, "epoch": 0.15657142857142858, "grad_norm": 0.1869216412305832, "kl": 0.01092529296875, "learning_rate": 9.195171441101668e-07, "loss": 0.0486, "reward": -0.18480181868653744, "reward_std": 0.7662193942815065, "rewards/cosine_scaled_reward": -0.14503308382700197, "rewards/format_reward": 0.2916666753590107, "step": 137 }, { "completion_length": 2823.6667251586914, "epoch": 0.15771428571428572, "grad_norm": 0.1232113316655159, "kl": 0.010227203369140625, "learning_rate": 9.177152042508077e-07, "loss": 0.0399, "reward": -0.14219614176545292, "reward_std": 0.6205537635833025, "rewards/cosine_scaled_reward": -0.1589539386332035, "rewards/format_reward": 0.3958333432674408, "step": 138 }, { "completion_length": 3272.187530517578, "epoch": 0.15885714285714286, "grad_norm": 0.15304112434387207, "kl": 0.0121917724609375, "learning_rate": 9.158953424711624e-07, "loss": 0.0143, "reward": -0.028373660519719124, "reward_std": 0.7605781219899654, "rewards/cosine_scaled_reward": -0.07536444254219532, "rewards/format_reward": 0.37500000558793545, "step": 139 }, { "completion_length": 3336.541717529297, "epoch": 0.16, "grad_norm": 0.1725020408630371, "kl": 0.018383026123046875, "learning_rate": 9.140576474687263e-07, "loss": 0.0185, "reward": -0.09492362570017576, "reward_std": 0.739283999428153, "rewards/cosine_scaled_reward": -0.052022709511220455, "rewards/format_reward": 0.2291666753590107, "step": 140 }, { "completion_length": 2818.7500915527344, "epoch": 0.16114285714285714, "grad_norm": 0.1428345888853073, "kl": 0.014141082763671875, "learning_rate": 9.122022088101613e-07, "loss": 0.0428, "reward": 0.17010034061968327, "reward_std": 0.8265232257544994, "rewards/cosine_scaled_reward": -0.03258664230816066, "rewards/format_reward": 0.5416666734963655, "step": 141 }, { "completion_length": 2865.041717529297, "epoch": 0.16228571428571428, "grad_norm": 0.132287859916687, "kl": 0.012800216674804688, "learning_rate": 9.103291169269299e-07, "loss": -0.0266, "reward": 0.14131132513284683, "reward_std": 0.6835053861141205, "rewards/cosine_scaled_reward": -0.061369335278868675, "rewards/format_reward": 0.5833333414047956, "step": 142 }, { "completion_length": 2789.979217529297, "epoch": 0.16342857142857142, "grad_norm": 0.2371223270893097, "kl": 0.01451873779296875, "learning_rate": 9.084384631108882e-07, "loss": 0.0726, "reward": -0.21053062099963427, "reward_std": 0.6688540205359459, "rewards/cosine_scaled_reward": -0.22842608066275716, "rewards/format_reward": 0.4375000111758709, "step": 143 }, { "completion_length": 3133.833366394043, "epoch": 0.16457142857142856, "grad_norm": 0.14418281614780426, "kl": 0.0147552490234375, "learning_rate": 9.065303395098358e-07, "loss": 0.0211, "reward": -0.19076000433415174, "reward_std": 0.8512778505682945, "rewards/cosine_scaled_reward": -0.1536552112083882, "rewards/format_reward": 0.29166666977107525, "step": 144 }, { "completion_length": 2111.166679382324, "epoch": 0.1657142857142857, "grad_norm": 0.08913204073905945, "kl": 0.011333465576171875, "learning_rate": 9.046048391230247e-07, "loss": 0.0067, "reward": 0.37262871488928795, "reward_std": 0.6370398961007595, "rewards/cosine_scaled_reward": 0.05427891947329044, "rewards/format_reward": 0.6666666679084301, "step": 145 }, { "completion_length": 2267.708351135254, "epoch": 0.16685714285714287, "grad_norm": 0.09961648285388947, "kl": 0.008184432983398438, "learning_rate": 9.026620557966279e-07, "loss": 0.0193, "reward": 0.021780904848128557, "reward_std": 0.629846852272749, "rewards/cosine_scaled_reward": -0.18135410267859697, "rewards/format_reward": 0.666666679084301, "step": 146 }, { "completion_length": 2814.354202270508, "epoch": 0.168, "grad_norm": 0.1152271032333374, "kl": 0.015193939208984375, "learning_rate": 9.007020842191634e-07, "loss": -0.0113, "reward": 0.11335810273885727, "reward_std": 0.75190694257617, "rewards/cosine_scaled_reward": -0.01303301053121686, "rewards/format_reward": 0.4375000149011612, "step": 147 }, { "completion_length": 2681.6458892822266, "epoch": 0.16914285714285715, "grad_norm": 0.10002895444631577, "kl": 0.0156402587890625, "learning_rate": 8.987250199168808e-07, "loss": 0.0397, "reward": 0.015451362356543541, "reward_std": 0.6257824674248695, "rewards/cosine_scaled_reward": -0.08100172178819776, "rewards/format_reward": 0.4583333395421505, "step": 148 }, { "completion_length": 2944.9375610351562, "epoch": 0.1702857142857143, "grad_norm": 0.11802194267511368, "kl": 0.012241363525390625, "learning_rate": 8.967309592491052e-07, "loss": 0.0548, "reward": -0.029212953057140112, "reward_std": 0.5978215262293816, "rewards/cosine_scaled_reward": -0.11697033792734146, "rewards/format_reward": 0.47916668094694614, "step": 149 }, { "completion_length": 3025.500045776367, "epoch": 0.17142857142857143, "grad_norm": 0.158779576420784, "kl": 0.016357421875, "learning_rate": 8.9471999940354e-07, "loss": 0.075, "reward": 0.17186034470796585, "reward_std": 0.9030736871063709, "rewards/cosine_scaled_reward": 0.03854179289191961, "rewards/format_reward": 0.39583333395421505, "step": 150 }, { "completion_length": 2807.2709045410156, "epoch": 0.17257142857142857, "grad_norm": 0.2797929644584656, "kl": 0.016078948974609375, "learning_rate": 8.926922383915315e-07, "loss": 0.1148, "reward": 0.35514865489676595, "reward_std": 1.013380728662014, "rewards/cosine_scaled_reward": 0.09170010522939265, "rewards/format_reward": 0.520833345130086, "step": 151 }, { "completion_length": 2813.312511444092, "epoch": 0.1737142857142857, "grad_norm": 0.15241533517837524, "kl": 0.016437530517578125, "learning_rate": 8.906477750432903e-07, "loss": 0.0504, "reward": -0.21058875182643533, "reward_std": 0.6285759322345257, "rewards/cosine_scaled_reward": -0.17488694563508034, "rewards/format_reward": 0.33333333395421505, "step": 152 }, { "completion_length": 2995.937515258789, "epoch": 0.17485714285714285, "grad_norm": 0.09293405711650848, "kl": 0.022808074951171875, "learning_rate": 8.88586709003076e-07, "loss": 0.0405, "reward": -0.2887796126306057, "reward_std": 0.4931412860751152, "rewards/cosine_scaled_reward": -0.21798867918550968, "rewards/format_reward": 0.3333333395421505, "step": 153 }, { "completion_length": 3390.1458435058594, "epoch": 0.176, "grad_norm": 0.17047946155071259, "kl": 0.01389312744140625, "learning_rate": 8.865091407243394e-07, "loss": 0.0042, "reward": 0.22278353199362755, "reward_std": 0.9290903359651566, "rewards/cosine_scaled_reward": 0.07489011948928237, "rewards/format_reward": 0.3958333432674408, "step": 154 }, { "completion_length": 2628.625030517578, "epoch": 0.17714285714285713, "grad_norm": 0.16334204375743866, "kl": 0.01788330078125, "learning_rate": 8.844151714648274e-07, "loss": 0.0719, "reward": 0.07151136547327042, "reward_std": 0.850743044167757, "rewards/cosine_scaled_reward": -0.05793104809708893, "rewards/format_reward": 0.4583333395421505, "step": 155 }, { "completion_length": 3115.0208740234375, "epoch": 0.1782857142857143, "grad_norm": 0.1153322234749794, "kl": 0.015087127685546875, "learning_rate": 8.823049032816478e-07, "loss": 0.0422, "reward": -0.11803282611072063, "reward_std": 0.6224103905260563, "rewards/cosine_scaled_reward": -0.07339339889585972, "rewards/format_reward": 0.2708333432674408, "step": 156 }, { "completion_length": 3112.3333740234375, "epoch": 0.17942857142857144, "grad_norm": 0.11238392442464828, "kl": 0.018207550048828125, "learning_rate": 8.801784390262943e-07, "loss": 0.0267, "reward": -0.12678863108158112, "reward_std": 0.6108014769852161, "rewards/cosine_scaled_reward": -0.13557185977697372, "rewards/format_reward": 0.3750000149011612, "step": 157 }, { "completion_length": 3134.9375610351562, "epoch": 0.18057142857142858, "grad_norm": 0.14315029978752136, "kl": 0.016357421875, "learning_rate": 8.780358823396352e-07, "loss": 0.0291, "reward": 0.31565938144922256, "reward_std": 0.7341890074312687, "rewards/cosine_scaled_reward": 0.12421234138309956, "rewards/format_reward": 0.43750000931322575, "step": 158 }, { "completion_length": 3197.2916870117188, "epoch": 0.18171428571428572, "grad_norm": 0.08521895110607147, "kl": 0.02240753173828125, "learning_rate": 8.758773376468604e-07, "loss": -0.0065, "reward": -0.2289612852036953, "reward_std": 0.45951012521982193, "rewards/cosine_scaled_reward": -0.15268473327159882, "rewards/format_reward": 0.29166666977107525, "step": 159 }, { "completion_length": 2835.1250076293945, "epoch": 0.18285714285714286, "grad_norm": 0.1391443908214569, "kl": 0.0212249755859375, "learning_rate": 8.737029101523929e-07, "loss": 0.0389, "reward": -0.09338995814323425, "reward_std": 0.6676197461783886, "rewards/cosine_scaled_reward": -0.08398129511624575, "rewards/format_reward": 0.3125000074505806, "step": 160 }, { "completion_length": 2894.2500762939453, "epoch": 0.184, "grad_norm": 0.15109732747077942, "kl": 0.022735595703125, "learning_rate": 8.715127058347614e-07, "loss": 0.0618, "reward": 0.18475250899791718, "reward_std": 0.7132500857114792, "rewards/cosine_scaled_reward": 0.038985077291727066, "rewards/format_reward": 0.4375000074505806, "step": 161 }, { "completion_length": 3240.104217529297, "epoch": 0.18514285714285714, "grad_norm": 0.25355350971221924, "kl": 0.02893829345703125, "learning_rate": 8.693068314414344e-07, "loss": 0.03, "reward": -0.08004653453826904, "reward_std": 0.7777004204690456, "rewards/cosine_scaled_reward": -0.050195490941405296, "rewards/format_reward": 0.2500000074505806, "step": 162 }, { "completion_length": 2491.4166870117188, "epoch": 0.18628571428571428, "grad_norm": 0.10477182269096375, "kl": 0.019779205322265625, "learning_rate": 8.670853944836176e-07, "loss": 0.0184, "reward": 0.30021179956384003, "reward_std": 0.5858964845538139, "rewards/cosine_scaled_reward": 0.05349706672132015, "rewards/format_reward": 0.5833333432674408, "step": 163 }, { "completion_length": 2634.3334045410156, "epoch": 0.18742857142857142, "grad_norm": 0.10012540221214294, "kl": 0.02051544189453125, "learning_rate": 8.648485032310144e-07, "loss": 0.0372, "reward": 0.18599897995591164, "reward_std": 0.5746248178184032, "rewards/cosine_scaled_reward": 0.004582956433296204, "rewards/format_reward": 0.520833333954215, "step": 164 }, { "completion_length": 3389.854217529297, "epoch": 0.18857142857142858, "grad_norm": 0.17076270282268524, "kl": 0.0308380126953125, "learning_rate": 8.625962667065487e-07, "loss": 0.0336, "reward": -0.19457256980240345, "reward_std": 0.8124973215162754, "rewards/cosine_scaled_reward": -0.14226967841386795, "rewards/format_reward": 0.2708333395421505, "step": 165 }, { "completion_length": 3060.5416870117188, "epoch": 0.18971428571428572, "grad_norm": 0.1075100377202034, "kl": 0.01813507080078125, "learning_rate": 8.603287946810513e-07, "loss": 0.0452, "reward": -0.002260749228298664, "reward_std": 0.5852988548576832, "rewards/cosine_scaled_reward": -0.04963597096502781, "rewards/format_reward": 0.37500000931322575, "step": 166 }, { "completion_length": 2668.5625610351562, "epoch": 0.19085714285714286, "grad_norm": 0.17578865587711334, "kl": 0.0214080810546875, "learning_rate": 8.580461976679099e-07, "loss": 0.0434, "reward": -0.03433691617101431, "reward_std": 0.7491964735090733, "rewards/cosine_scaled_reward": -0.18043148797005415, "rewards/format_reward": 0.5625000055879354, "step": 167 }, { "completion_length": 3086.2083892822266, "epoch": 0.192, "grad_norm": 0.1612425446510315, "kl": 0.02228546142578125, "learning_rate": 8.557485869176825e-07, "loss": 0.0382, "reward": 0.3381266240030527, "reward_std": 0.7707905732095242, "rewards/cosine_scaled_reward": 0.10119387321174145, "rewards/format_reward": 0.5208333469927311, "step": 168 }, { "completion_length": 2517.1041946411133, "epoch": 0.19314285714285714, "grad_norm": 0.16143831610679626, "kl": 0.025909423828125, "learning_rate": 8.534360744126753e-07, "loss": 0.0434, "reward": 0.7395636513829231, "reward_std": 0.8513101674616337, "rewards/cosine_scaled_reward": 0.3049415610730648, "rewards/format_reward": 0.6458333358168602, "step": 169 }, { "completion_length": 2514.916702270508, "epoch": 0.19428571428571428, "grad_norm": 0.10102265328168869, "kl": 0.0198974609375, "learning_rate": 8.511087728614862e-07, "loss": 0.0574, "reward": 0.13654466345906258, "reward_std": 0.6086900364607573, "rewards/cosine_scaled_reward": 0.003594242036342621, "rewards/format_reward": 0.45833334140479565, "step": 170 }, { "completion_length": 2985.0208740234375, "epoch": 0.19542857142857142, "grad_norm": 0.13056401908397675, "kl": 0.0213165283203125, "learning_rate": 8.487667956935087e-07, "loss": 0.0198, "reward": 0.062067726626992226, "reward_std": 0.7003943808376789, "rewards/cosine_scaled_reward": -0.022239719983190298, "rewards/format_reward": 0.3958333432674408, "step": 171 }, { "completion_length": 3023.25, "epoch": 0.19657142857142856, "grad_norm": 0.13006411492824554, "kl": 0.027801513671875, "learning_rate": 8.464102570534061e-07, "loss": 0.0166, "reward": -0.10689089074730873, "reward_std": 0.7168557345867157, "rewards/cosine_scaled_reward": -0.0742192417383194, "rewards/format_reward": 0.2708333395421505, "step": 172 }, { "completion_length": 2487.312572479248, "epoch": 0.1977142857142857, "grad_norm": 0.15723471343517303, "kl": 0.029430389404296875, "learning_rate": 8.440392717955475e-07, "loss": 0.0449, "reward": 0.07556262612342834, "reward_std": 0.7289031557738781, "rewards/cosine_scaled_reward": -0.11148698627948761, "rewards/format_reward": 0.5833333395421505, "step": 173 }, { "completion_length": 2791.6459045410156, "epoch": 0.19885714285714284, "grad_norm": 0.12100586295127869, "kl": 0.03168487548828125, "learning_rate": 8.416539554784089e-07, "loss": 0.0243, "reward": 0.15438630525022745, "reward_std": 0.6935141123831272, "rewards/cosine_scaled_reward": -0.043977076187729836, "rewards/format_reward": 0.5625000111758709, "step": 174 }, { "completion_length": 2961.2083892822266, "epoch": 0.2, "grad_norm": 0.10910208523273468, "kl": 0.027740478515625, "learning_rate": 8.392544243589427e-07, "loss": 0.0207, "reward": 0.0747014251537621, "reward_std": 0.6700296718627214, "rewards/cosine_scaled_reward": -0.031223440542817116, "rewards/format_reward": 0.43750001303851604, "step": 175 }, { "completion_length": 2736.104217529297, "epoch": 0.20114285714285715, "grad_norm": 0.22644685208797455, "kl": 0.02967071533203125, "learning_rate": 8.368407953869103e-07, "loss": 0.0671, "reward": 0.0621003326959908, "reward_std": 0.8631531074643135, "rewards/cosine_scaled_reward": -0.06792039083666168, "rewards/format_reward": 0.45833333767950535, "step": 176 }, { "completion_length": 3160.0000610351562, "epoch": 0.2022857142857143, "grad_norm": 0.23566588759422302, "kl": 0.03285980224609375, "learning_rate": 8.344131861991828e-07, "loss": 0.0637, "reward": 0.023158524534665048, "reward_std": 0.7583435364067554, "rewards/cosine_scaled_reward": -0.10049578547477722, "rewards/format_reward": 0.5000000186264515, "step": 177 }, { "completion_length": 2873.854232788086, "epoch": 0.20342857142857143, "grad_norm": 0.19544903934001923, "kl": 0.0418548583984375, "learning_rate": 8.319717151140072e-07, "loss": 0.0085, "reward": 0.14865556359291077, "reward_std": 0.8756407424807549, "rewards/cosine_scaled_reward": 0.016698965802788734, "rewards/format_reward": 0.416666679084301, "step": 178 }, { "completion_length": 3137.1666870117188, "epoch": 0.20457142857142857, "grad_norm": 0.07909320294857025, "kl": 0.03249359130859375, "learning_rate": 8.295165011252396e-07, "loss": 0.0049, "reward": -0.36861060559749603, "reward_std": 0.3870035018771887, "rewards/cosine_scaled_reward": -0.24457938224077225, "rewards/format_reward": 0.2916666679084301, "step": 179 }, { "completion_length": 2372.270866394043, "epoch": 0.2057142857142857, "grad_norm": 0.14237825572490692, "kl": 0.03276824951171875, "learning_rate": 8.270476638965461e-07, "loss": 0.0399, "reward": 0.31979808397591114, "reward_std": 0.7863202355802059, "rewards/cosine_scaled_reward": 0.06971612432971597, "rewards/format_reward": 0.5416666679084301, "step": 180 }, { "completion_length": 3232.3958892822266, "epoch": 0.20685714285714285, "grad_norm": 0.11890272796154022, "kl": 0.0372467041015625, "learning_rate": 8.245653237555705e-07, "loss": 0.0285, "reward": -0.08456094935536385, "reward_std": 0.6224741712212563, "rewards/cosine_scaled_reward": -0.07428957521915436, "rewards/format_reward": 0.3125000074505806, "step": 181 }, { "completion_length": 2675.750045776367, "epoch": 0.208, "grad_norm": 0.1544029265642166, "kl": 0.026336669921875, "learning_rate": 8.220696016880687e-07, "loss": 0.0012, "reward": 0.28332219598814845, "reward_std": 0.8586058430373669, "rewards/cosine_scaled_reward": 0.04892569035291672, "rewards/format_reward": 0.541666679084301, "step": 182 }, { "completion_length": 2722.3334045410156, "epoch": 0.20914285714285713, "grad_norm": 0.2383163869380951, "kl": 0.0434417724609375, "learning_rate": 8.195606193320136e-07, "loss": 0.0422, "reward": 0.37364979088306427, "reward_std": 0.7791223339736462, "rewards/cosine_scaled_reward": 0.08836854621767998, "rewards/format_reward": 0.583333345130086, "step": 183 }, { "completion_length": 2838.3125534057617, "epoch": 0.2102857142857143, "grad_norm": 0.08796233683824539, "kl": 0.037139892578125, "learning_rate": 8.170384989716657e-07, "loss": 0.0153, "reward": -0.2598969964310527, "reward_std": 0.48821557871997356, "rewards/cosine_scaled_reward": -0.2099758218973875, "rewards/format_reward": 0.3541666679084301, "step": 184 }, { "completion_length": 2718.395851135254, "epoch": 0.21142857142857144, "grad_norm": 0.14351139962673187, "kl": 0.0391998291015625, "learning_rate": 8.145033635316128e-07, "loss": 0.0168, "reward": -0.022745168476831168, "reward_std": 0.6752897650003433, "rewards/cosine_scaled_reward": -0.14890163764357567, "rewards/format_reward": 0.5416666828095913, "step": 185 }, { "completion_length": 3067.541717529297, "epoch": 0.21257142857142858, "grad_norm": 0.10349977016448975, "kl": 0.038116455078125, "learning_rate": 8.119553365707802e-07, "loss": 0.0083, "reward": 0.06632867828011513, "reward_std": 0.6240403186529875, "rewards/cosine_scaled_reward": 0.0068851374089717865, "rewards/format_reward": 0.35416666977107525, "step": 186 }, { "completion_length": 2420.291702270508, "epoch": 0.21371428571428572, "grad_norm": 0.15342004597187042, "kl": 0.046356201171875, "learning_rate": 8.093945422764069e-07, "loss": 0.0298, "reward": 0.08685800805687904, "reward_std": 0.6128952912986279, "rewards/cosine_scaled_reward": -0.11734730005264282, "rewards/format_reward": 0.6250000111758709, "step": 187 }, { "completion_length": 3567.3958435058594, "epoch": 0.21485714285714286, "grad_norm": 0.15315963327884674, "kl": 0.05072021484375, "learning_rate": 8.068211054579943e-07, "loss": 0.0074, "reward": -0.398057883605361, "reward_std": 0.6201205141842365, "rewards/cosine_scaled_reward": -0.18498908844776452, "rewards/format_reward": 0.10416666977107525, "step": 188 }, { "completion_length": 3156.937545776367, "epoch": 0.216, "grad_norm": 0.22657519578933716, "kl": 0.05267333984375, "learning_rate": 8.04235151541222e-07, "loss": 0.0596, "reward": 0.08714086917461827, "reward_std": 0.7904996033757925, "rewards/cosine_scaled_reward": -0.008101928047835827, "rewards/format_reward": 0.3958333469927311, "step": 189 }, { "completion_length": 2789.041702270508, "epoch": 0.21714285714285714, "grad_norm": 0.13739848136901855, "kl": 0.051605224609375, "learning_rate": 8.01636806561836e-07, "loss": 0.0128, "reward": 0.15377038344740868, "reward_std": 0.7078918963670731, "rewards/cosine_scaled_reward": -0.0018926504999399185, "rewards/format_reward": 0.47916668094694614, "step": 190 }, { "completion_length": 2624.083366394043, "epoch": 0.21828571428571428, "grad_norm": 0.17836546897888184, "kl": 0.05645751953125, "learning_rate": 7.990261971595048e-07, "loss": 0.0403, "reward": 0.17610792024061084, "reward_std": 0.7685041464865208, "rewards/cosine_scaled_reward": 0.013859146274626255, "rewards/format_reward": 0.4583333395421505, "step": 191 }, { "completion_length": 3288.916717529297, "epoch": 0.21942857142857142, "grad_norm": 0.20157144963741302, "kl": 0.053802490234375, "learning_rate": 7.964034505716476e-07, "loss": 0.0343, "reward": -0.18958128988742828, "reward_std": 0.6780943684279919, "rewards/cosine_scaled_reward": -0.14980729576200247, "rewards/format_reward": 0.31250000931322575, "step": 192 }, { "completion_length": 3114.1458740234375, "epoch": 0.22057142857142858, "grad_norm": 0.18799538910388947, "kl": 0.0482025146484375, "learning_rate": 7.93768694627233e-07, "loss": 0.0323, "reward": -0.24548013135790825, "reward_std": 0.6712213661521673, "rewards/cosine_scaled_reward": -0.1577296955510974, "rewards/format_reward": 0.25000000186264515, "step": 193 }, { "completion_length": 3140.9375610351562, "epoch": 0.22171428571428572, "grad_norm": 0.27719372510910034, "kl": 0.0517578125, "learning_rate": 7.911220577405484e-07, "loss": 0.0627, "reward": 0.3136282116174698, "reward_std": 1.0058045387268066, "rewards/cosine_scaled_reward": 0.1217548530548811, "rewards/format_reward": 0.4166666716337204, "step": 194 }, { "completion_length": 3052.187545776367, "epoch": 0.22285714285714286, "grad_norm": 0.2891777753829956, "kl": 0.058441162109375, "learning_rate": 7.884636689049422e-07, "loss": 0.0488, "reward": -0.09272150322794914, "reward_std": 0.6414213795214891, "rewards/cosine_scaled_reward": -0.07150299660861492, "rewards/format_reward": 0.2916666716337204, "step": 195 }, { "completion_length": 3178.437530517578, "epoch": 0.224, "grad_norm": 0.1800169199705124, "kl": 0.06561279296875, "learning_rate": 7.857936576865356e-07, "loss": 0.0051, "reward": -0.12664931640028954, "reward_std": 0.584855318069458, "rewards/cosine_scaled_reward": -0.08012674562633038, "rewards/format_reward": 0.27083333395421505, "step": 196 }, { "completion_length": 2523.6667404174805, "epoch": 0.22514285714285714, "grad_norm": 0.29900649189949036, "kl": 0.08282470703125, "learning_rate": 7.831121542179086e-07, "loss": 0.0404, "reward": 0.32191772386431694, "reward_std": 1.1363152042031288, "rewards/cosine_scaled_reward": 0.052287210419308394, "rewards/format_reward": 0.5416666809469461, "step": 197 }, { "completion_length": 2786.625045776367, "epoch": 0.22628571428571428, "grad_norm": 0.34645184874534607, "kl": 0.10394287109375, "learning_rate": 7.804192891917571e-07, "loss": 0.0284, "reward": -0.09496624395251274, "reward_std": 0.6138490363955498, "rewards/cosine_scaled_reward": -0.11393817700445652, "rewards/format_reward": 0.37500000558793545, "step": 198 }, { "completion_length": 2917.729217529297, "epoch": 0.22742857142857142, "grad_norm": 0.27866116166114807, "kl": 0.08343505859375, "learning_rate": 7.777151938545235e-07, "loss": 0.0601, "reward": -0.2887334353290498, "reward_std": 0.6878443285822868, "rewards/cosine_scaled_reward": -0.18037102394737303, "rewards/format_reward": 0.22916667349636555, "step": 199 }, { "completion_length": 2416.0833587646484, "epoch": 0.22857142857142856, "grad_norm": 0.29420676827430725, "kl": 0.07171630859375, "learning_rate": 7.75e-07, "loss": 0.0608, "reward": 0.38031474966555834, "reward_std": 0.8896522857248783, "rewards/cosine_scaled_reward": 0.08112852554768324, "rewards/format_reward": 0.604166679084301, "step": 200 }, { "completion_length": 2426.1667098999023, "epoch": 0.2297142857142857, "grad_norm": 0.3698631525039673, "kl": 0.069580078125, "learning_rate": 7.72273839962904e-07, "loss": 0.0396, "reward": 0.5210681445896626, "reward_std": 0.8631033673882484, "rewards/cosine_scaled_reward": 0.18454162776470184, "rewards/format_reward": 0.5833333395421505, "step": 201 }, { "completion_length": 2384.5416946411133, "epoch": 0.23085714285714284, "grad_norm": 0.21861740946769714, "kl": 0.09124755859375, "learning_rate": 7.695368466124296e-07, "loss": 0.0316, "reward": 0.4668930694460869, "reward_std": 0.7846425659954548, "rewards/cosine_scaled_reward": 0.17546686669811606, "rewards/format_reward": 0.541666679084301, "step": 202 }, { "completion_length": 3118.041717529297, "epoch": 0.232, "grad_norm": 0.32266831398010254, "kl": 0.115875244140625, "learning_rate": 7.667891533457718e-07, "loss": 0.058, "reward": 0.050868467427790165, "reward_std": 0.8579942099750042, "rewards/cosine_scaled_reward": 0.037848809035494924, "rewards/format_reward": 0.22916666977107525, "step": 203 }, { "completion_length": 2304.9583587646484, "epoch": 0.23314285714285715, "grad_norm": 0.18206676840782166, "kl": 0.109619140625, "learning_rate": 7.640308940816239e-07, "loss": 0.0048, "reward": 0.30574227310717106, "reward_std": 0.6170836389064789, "rewards/cosine_scaled_reward": -0.008607452735304832, "rewards/format_reward": 0.708333358168602, "step": 204 }, { "completion_length": 2729.062545776367, "epoch": 0.2342857142857143, "grad_norm": 0.3828980624675751, "kl": 0.11627197265625, "learning_rate": 7.612622032536507e-07, "loss": 0.0304, "reward": 0.5650580562651157, "reward_std": 1.018170591443777, "rewards/cosine_scaled_reward": 0.22851963574066758, "rewards/format_reward": 0.541666679084301, "step": 205 }, { "completion_length": 3118.2500534057617, "epoch": 0.23542857142857143, "grad_norm": 0.38659653067588806, "kl": 0.12823486328125, "learning_rate": 7.584832158039378e-07, "loss": 0.0409, "reward": -0.19641774892807007, "reward_std": 0.7758117392659187, "rewards/cosine_scaled_reward": -0.16738836327567697, "rewards/format_reward": 0.3125000037252903, "step": 206 }, { "completion_length": 3061.0000610351562, "epoch": 0.23657142857142857, "grad_norm": 0.5246978998184204, "kl": 0.1700439453125, "learning_rate": 7.556940671764124e-07, "loss": 0.0753, "reward": -0.06954369135200977, "reward_std": 0.7209791392087936, "rewards/cosine_scaled_reward": -0.15650581319641788, "rewards/format_reward": 0.4791666753590107, "step": 207 }, { "completion_length": 2615.5833587646484, "epoch": 0.2377142857142857, "grad_norm": 0.590552568435669, "kl": 0.137298583984375, "learning_rate": 7.528948933102438e-07, "loss": 0.0517, "reward": 0.10035054851323366, "reward_std": 0.6904583033174276, "rewards/cosine_scaled_reward": -0.06608795002102852, "rewards/format_reward": 0.541666679084301, "step": 208 }, { "completion_length": 2907.0625610351562, "epoch": 0.23885714285714285, "grad_norm": 0.3516550660133362, "kl": 0.1842041015625, "learning_rate": 7.500858306332172e-07, "loss": 0.0368, "reward": 0.19722715765237808, "reward_std": 0.7542031668126583, "rewards/cosine_scaled_reward": 0.045978354290127754, "rewards/format_reward": 0.43750000558793545, "step": 209 }, { "completion_length": 2872.8333740234375, "epoch": 0.24, "grad_norm": 0.31704995036125183, "kl": 0.17333984375, "learning_rate": 7.472670160550848e-07, "loss": 0.0472, "reward": 0.005523813422769308, "reward_std": 0.6337867602705956, "rewards/cosine_scaled_reward": -0.05793424462899566, "rewards/format_reward": 0.3958333432674408, "step": 210 }, { "completion_length": 2413.104202270508, "epoch": 0.24114285714285713, "grad_norm": 0.30833762884140015, "kl": 0.18377685546875, "learning_rate": 7.444385869608921e-07, "loss": 0.0023, "reward": 0.17130140773952007, "reward_std": 0.6732404362410307, "rewards/cosine_scaled_reward": -0.004216981120407581, "rewards/format_reward": 0.5000000074505806, "step": 211 }, { "completion_length": 2611.0000762939453, "epoch": 0.2422857142857143, "grad_norm": 0.36040380597114563, "kl": 0.195404052734375, "learning_rate": 7.416006812042827e-07, "loss": 0.0443, "reward": 0.207231349311769, "reward_std": 0.8025440499186516, "rewards/cosine_scaled_reward": 0.004049690440297127, "rewards/format_reward": 0.520833345130086, "step": 212 }, { "completion_length": 2825.479217529297, "epoch": 0.24342857142857144, "grad_norm": 0.4717042148113251, "kl": 0.2646484375, "learning_rate": 7.387534371007797e-07, "loss": 0.0311, "reward": 0.24717780202627182, "reward_std": 0.9448489658534527, "rewards/cosine_scaled_reward": 0.05484255403280258, "rewards/format_reward": 0.45833334140479565, "step": 213 }, { "completion_length": 2770.4375762939453, "epoch": 0.24457142857142858, "grad_norm": 0.5870217084884644, "kl": 0.2110595703125, "learning_rate": 7.358969934210438e-07, "loss": 0.0885, "reward": 0.2060818038880825, "reward_std": 0.8542167469859123, "rewards/cosine_scaled_reward": 0.04777835123240948, "rewards/format_reward": 0.4375000074505806, "step": 214 }, { "completion_length": 2372.354217529297, "epoch": 0.24571428571428572, "grad_norm": 0.2540161609649658, "kl": 0.204833984375, "learning_rate": 7.330314893841101e-07, "loss": 0.0192, "reward": -0.08897280413657427, "reward_std": 0.5330733954906464, "rewards/cosine_scaled_reward": -0.18874458596110344, "rewards/format_reward": 0.541666679084301, "step": 215 }, { "completion_length": 2626.2083892822266, "epoch": 0.24685714285714286, "grad_norm": 0.38485583662986755, "kl": 0.2821044921875, "learning_rate": 7.301570646506027e-07, "loss": 0.0632, "reward": 0.3148540537804365, "reward_std": 0.8362242169678211, "rewards/cosine_scaled_reward": 0.04583857045508921, "rewards/format_reward": 0.5833333469927311, "step": 216 }, { "completion_length": 3083.104217529297, "epoch": 0.248, "grad_norm": 0.35301655530929565, "kl": 0.3206787109375, "learning_rate": 7.27273859315928e-07, "loss": 0.0247, "reward": 0.1059049442410469, "reward_std": 0.8536710906773806, "rewards/cosine_scaled_reward": -0.003996940446086228, "rewards/format_reward": 0.3958333358168602, "step": 217 }, { "completion_length": 2729.8125610351562, "epoch": 0.24914285714285714, "grad_norm": 0.3946671783924103, "kl": 0.2547607421875, "learning_rate": 7.243820139034464e-07, "loss": 0.0153, "reward": -0.2474758685566485, "reward_std": 0.6462593208998442, "rewards/cosine_scaled_reward": -0.22353118157479912, "rewards/format_reward": 0.3750000149011612, "step": 218 }, { "completion_length": 2773.6250610351562, "epoch": 0.2502857142857143, "grad_norm": 0.5205709338188171, "kl": 0.28680419921875, "learning_rate": 7.214816693576234e-07, "loss": 0.0678, "reward": 0.17035892861895263, "reward_std": 0.8409441113471985, "rewards/cosine_scaled_reward": -0.03153566690161824, "rewards/format_reward": 0.5416666772216558, "step": 219 }, { "completion_length": 3064.9792404174805, "epoch": 0.25142857142857145, "grad_norm": 0.3793766498565674, "kl": 0.3310546875, "learning_rate": 7.185729670371604e-07, "loss": 0.0536, "reward": -0.40579412039369345, "reward_std": 0.4997274577617645, "rewards/cosine_scaled_reward": -0.2578846197575331, "rewards/format_reward": 0.25000000186264515, "step": 220 }, { "completion_length": 2607.375045776367, "epoch": 0.25257142857142856, "grad_norm": 0.4927677810192108, "kl": 0.2911376953125, "learning_rate": 7.156560487081051e-07, "loss": 0.0538, "reward": 0.29882943257689476, "reward_std": 0.8470067903399467, "rewards/cosine_scaled_reward": 0.015899650752544403, "rewards/format_reward": 0.625000013038516, "step": 221 }, { "completion_length": 2744.041748046875, "epoch": 0.2537142857142857, "grad_norm": 0.3646686375141144, "kl": 0.331298828125, "learning_rate": 7.127310565369415e-07, "loss": 0.0247, "reward": 0.22724982630461454, "reward_std": 0.6808402426540852, "rewards/cosine_scaled_reward": -0.022449948824942112, "rewards/format_reward": 0.6250000074505806, "step": 222 }, { "completion_length": 2798.2083740234375, "epoch": 0.25485714285714284, "grad_norm": 0.397216260433197, "kl": 0.3436279296875, "learning_rate": 7.097981330836616e-07, "loss": 0.0215, "reward": 0.16939541138708591, "reward_std": 0.5345852337777615, "rewards/cosine_scaled_reward": 0.007962611503899097, "rewards/format_reward": 0.5, "step": 223 }, { "completion_length": 3249.1666870117188, "epoch": 0.256, "grad_norm": 0.7151598930358887, "kl": 0.38037109375, "learning_rate": 7.068574212948169e-07, "loss": 0.0902, "reward": -0.05321236699819565, "reward_std": 0.8325384929776192, "rewards/cosine_scaled_reward": -0.046919144690036774, "rewards/format_reward": 0.2708333432674408, "step": 224 }, { "completion_length": 2881.2709350585938, "epoch": 0.2571428571428571, "grad_norm": 0.412383109331131, "kl": 0.35223388671875, "learning_rate": 7.039090644965509e-07, "loss": 0.0437, "reward": 0.012765285558998585, "reward_std": 0.6845123060047626, "rewards/cosine_scaled_reward": -0.08529938757419586, "rewards/format_reward": 0.45833333395421505, "step": 225 }, { "completion_length": 2802.041717529297, "epoch": 0.2582857142857143, "grad_norm": 0.4051804840564728, "kl": 0.37164306640625, "learning_rate": 7.009532063876148e-07, "loss": 0.0577, "reward": 0.17664484679698944, "reward_std": 0.7071616277098656, "rewards/cosine_scaled_reward": 0.03299903869628906, "rewards/format_reward": 0.43750000558793545, "step": 226 }, { "completion_length": 2718.291732788086, "epoch": 0.25942857142857145, "grad_norm": 0.4118206799030304, "kl": 0.400390625, "learning_rate": 6.979899910323624e-07, "loss": 0.0558, "reward": 0.059418462216854095, "reward_std": 0.6883127726614475, "rewards/cosine_scaled_reward": -0.16530904080718756, "rewards/format_reward": 0.6666666734963655, "step": 227 }, { "completion_length": 2551.270866394043, "epoch": 0.26057142857142856, "grad_norm": 0.5437552332878113, "kl": 0.3446044921875, "learning_rate": 6.950195628537299e-07, "loss": 0.0212, "reward": 0.16873644525185227, "reward_std": 0.6643664948642254, "rewards/cosine_scaled_reward": -0.00022228434681892395, "rewards/format_reward": 0.5000000037252903, "step": 228 }, { "completion_length": 3142.5208740234375, "epoch": 0.26171428571428573, "grad_norm": 0.824421226978302, "kl": 0.490966796875, "learning_rate": 6.920420666261961e-07, "loss": 0.022, "reward": -0.009648986160755157, "reward_std": 0.523324097506702, "rewards/cosine_scaled_reward": -0.06068984791636467, "rewards/format_reward": 0.3958333395421505, "step": 229 }, { "completion_length": 3071.4583740234375, "epoch": 0.26285714285714284, "grad_norm": 0.5922899842262268, "kl": 0.43035888671875, "learning_rate": 6.890576474687263e-07, "loss": 0.0518, "reward": -0.29550562985241413, "reward_std": 0.6693998202681541, "rewards/cosine_scaled_reward": -0.22350211441516876, "rewards/format_reward": 0.31250000931322575, "step": 230 }, { "completion_length": 2912.7083740234375, "epoch": 0.264, "grad_norm": 0.5167642831802368, "kl": 0.3560791015625, "learning_rate": 6.860664508377001e-07, "loss": 0.0283, "reward": 0.1596711277961731, "reward_std": 0.7965347096323967, "rewards/cosine_scaled_reward": -0.01580745540559292, "rewards/format_reward": 0.5000000074505806, "step": 231 }, { "completion_length": 3158.1459045410156, "epoch": 0.2651428571428571, "grad_norm": 0.6749157905578613, "kl": 0.4017333984375, "learning_rate": 6.83068622519821e-07, "loss": 0.0837, "reward": -0.297568422742188, "reward_std": 0.6748274452984333, "rewards/cosine_scaled_reward": -0.18697709869593382, "rewards/format_reward": 0.22916666977107525, "step": 232 }, { "completion_length": 2762.1666870117188, "epoch": 0.2662857142857143, "grad_norm": 0.3311655819416046, "kl": 0.317626953125, "learning_rate": 6.800643086250121e-07, "loss": 0.0416, "reward": -0.08082350715994835, "reward_std": 0.6287317145615816, "rewards/cosine_scaled_reward": -0.16008687764406204, "rewards/format_reward": 0.47916666977107525, "step": 233 }, { "completion_length": 2734.8750648498535, "epoch": 0.2674285714285714, "grad_norm": 0.2763339579105377, "kl": 0.291839599609375, "learning_rate": 6.770536555792944e-07, "loss": 0.0202, "reward": -0.14287907630205154, "reward_std": 0.5912873484194279, "rewards/cosine_scaled_reward": -0.13689745217561722, "rewards/format_reward": 0.35416666977107525, "step": 234 }, { "completion_length": 2442.187515258789, "epoch": 0.26857142857142857, "grad_norm": 0.324736088514328, "kl": 0.25311279296875, "learning_rate": 6.740368101176495e-07, "loss": 0.0321, "reward": -0.05069837532937527, "reward_std": 0.6721667312085629, "rewards/cosine_scaled_reward": -0.173017387278378, "rewards/format_reward": 0.5416666716337204, "step": 235 }, { "completion_length": 2971.2083740234375, "epoch": 0.26971428571428574, "grad_norm": 0.6796517968177795, "kl": 0.234130859375, "learning_rate": 6.710139192768694e-07, "loss": 0.0103, "reward": 0.2688827021047473, "reward_std": 0.9795111119747162, "rewards/cosine_scaled_reward": -0.004769146267790347, "rewards/format_reward": 0.604166679084301, "step": 236 }, { "completion_length": 2812.8750610351562, "epoch": 0.27085714285714285, "grad_norm": 0.9011460542678833, "kl": 0.2601318359375, "learning_rate": 6.679851303883891e-07, "loss": 0.0936, "reward": 0.06308133527636528, "reward_std": 0.7535826116800308, "rewards/cosine_scaled_reward": -0.068068141117692, "rewards/format_reward": 0.47916667349636555, "step": 237 }, { "completion_length": 2691.7084045410156, "epoch": 0.272, "grad_norm": 1.501541018486023, "kl": 0.273681640625, "learning_rate": 6.649505910711058e-07, "loss": 0.0986, "reward": 0.32719678059220314, "reward_std": 1.0173063725233078, "rewards/cosine_scaled_reward": 0.05737040005624294, "rewards/format_reward": 0.5625000074505806, "step": 238 }, { "completion_length": 2189.7708587646484, "epoch": 0.27314285714285713, "grad_norm": 0.20339642465114594, "kl": 0.24578857421875, "learning_rate": 6.619104492241847e-07, "loss": 0.0259, "reward": 0.3876400392036885, "reward_std": 0.6854967623949051, "rewards/cosine_scaled_reward": 0.07331139780580997, "rewards/format_reward": 0.6458333395421505, "step": 239 }, { "completion_length": 2928.354232788086, "epoch": 0.2742857142857143, "grad_norm": 0.46207401156425476, "kl": 0.510498046875, "learning_rate": 6.588648530198504e-07, "loss": 0.0666, "reward": -0.031538160517811775, "reward_std": 0.6143174581229687, "rewards/cosine_scaled_reward": -0.12371946685016155, "rewards/format_reward": 0.47916667349636555, "step": 240 }, { "completion_length": 3160.937545776367, "epoch": 0.2754285714285714, "grad_norm": 0.4524831175804138, "kl": 0.5111083984375, "learning_rate": 6.558139508961654e-07, "loss": 0.0504, "reward": -0.24879638850688934, "reward_std": 0.6495300643146038, "rewards/cosine_scaled_reward": -0.22295443713665009, "rewards/format_reward": 0.37500000558793545, "step": 241 }, { "completion_length": 2566.5000610351562, "epoch": 0.2765714285714286, "grad_norm": 0.578352153301239, "kl": 0.4998779296875, "learning_rate": 6.527578915497951e-07, "loss": 0.0285, "reward": 0.15099805174395442, "reward_std": 0.515793077647686, "rewards/cosine_scaled_reward": -0.11855055205523968, "rewards/format_reward": 0.7291666809469461, "step": 242 }, { "completion_length": 2909.3958587646484, "epoch": 0.2777142857142857, "grad_norm": 0.8137231469154358, "kl": 0.51025390625, "learning_rate": 6.496968239287603e-07, "loss": 0.0269, "reward": 0.334348788484931, "reward_std": 0.7821071408689022, "rewards/cosine_scaled_reward": 0.07382349669933319, "rewards/format_reward": 0.5625000037252903, "step": 243 }, { "completion_length": 2917.541748046875, "epoch": 0.27885714285714286, "grad_norm": 0.6295875906944275, "kl": 0.50042724609375, "learning_rate": 6.466308972251785e-07, "loss": 0.0786, "reward": 0.2729534022510052, "reward_std": 0.830750398337841, "rewards/cosine_scaled_reward": 0.08813249412924051, "rewards/format_reward": 0.43750000558793545, "step": 244 }, { "completion_length": 3121.8958740234375, "epoch": 0.28, "grad_norm": 1.1297352313995361, "kl": 0.5673828125, "learning_rate": 6.435602608679916e-07, "loss": 0.12, "reward": 0.08920413255691528, "reward_std": 1.0364465415477753, "rewards/cosine_scaled_reward": -0.05938760610297322, "rewards/format_reward": 0.4583333432674408, "step": 245 }, { "completion_length": 3070.0833740234375, "epoch": 0.28114285714285714, "grad_norm": 0.941576361656189, "kl": 0.61669921875, "learning_rate": 6.404850645156841e-07, "loss": 0.1055, "reward": 0.07732813712209463, "reward_std": 0.9496094807982445, "rewards/cosine_scaled_reward": -0.05677332216873765, "rewards/format_reward": 0.45833334513008595, "step": 246 }, { "completion_length": 3249.375045776367, "epoch": 0.2822857142857143, "grad_norm": 0.7210645079612732, "kl": 0.67041015625, "learning_rate": 6.374054580489873e-07, "loss": 0.0552, "reward": -0.3232395015656948, "reward_std": 0.6331962943077087, "rewards/cosine_scaled_reward": -0.21088325325399637, "rewards/format_reward": 0.2500000037252903, "step": 247 }, { "completion_length": 2833.7083892822266, "epoch": 0.2834285714285714, "grad_norm": 0.5416091680526733, "kl": 0.570068359375, "learning_rate": 6.343215915635761e-07, "loss": 0.0338, "reward": 0.2699696607887745, "reward_std": 0.769625548273325, "rewards/cosine_scaled_reward": 0.08322756737470627, "rewards/format_reward": 0.4583333395421505, "step": 248 }, { "completion_length": 2345.041702270508, "epoch": 0.2845714285714286, "grad_norm": 0.4412481486797333, "kl": 0.44451904296875, "learning_rate": 6.31233615362752e-07, "loss": 0.0447, "reward": 0.4360041692852974, "reward_std": 0.8868755213916302, "rewards/cosine_scaled_reward": 0.08689466118812561, "rewards/format_reward": 0.6666666772216558, "step": 249 }, { "completion_length": 2516.8333587646484, "epoch": 0.2857142857142857, "grad_norm": 0.5790634155273438, "kl": 0.5372314453125, "learning_rate": 6.281416799501187e-07, "loss": 0.0766, "reward": 0.1508978575002402, "reward_std": 0.7845667004585266, "rewards/cosine_scaled_reward": -0.10766792530193925, "rewards/format_reward": 0.666666679084301, "step": 250 }, { "completion_length": 2367.6875534057617, "epoch": 0.28685714285714287, "grad_norm": 0.465372771024704, "kl": 0.53857421875, "learning_rate": 6.25045936022246e-07, "loss": 0.045, "reward": 0.2600698294118047, "reward_std": 0.7807004749774933, "rewards/cosine_scaled_reward": -0.08423476293683052, "rewards/format_reward": 0.7708333395421505, "step": 251 }, { "completion_length": 2789.3750915527344, "epoch": 0.288, "grad_norm": 0.5998123288154602, "kl": 0.5885009765625, "learning_rate": 6.219465344613258e-07, "loss": 0.0437, "reward": 0.0935278192628175, "reward_std": 0.7258482351899147, "rewards/cosine_scaled_reward": -0.09951683203689754, "rewards/format_reward": 0.583333345130086, "step": 252 }, { "completion_length": 2500.1875915527344, "epoch": 0.28914285714285715, "grad_norm": 0.6858162879943848, "kl": 0.5001220703125, "learning_rate": 6.188436263278172e-07, "loss": 0.0555, "reward": 0.2763403048738837, "reward_std": 0.9541792422533035, "rewards/cosine_scaled_reward": -0.020189424976706505, "rewards/format_reward": 0.645833345130086, "step": 253 }, { "completion_length": 3007.7500610351562, "epoch": 0.29028571428571426, "grad_norm": 0.7101835012435913, "kl": 0.715576171875, "learning_rate": 6.157373628530852e-07, "loss": 0.0531, "reward": 0.011770043522119522, "reward_std": 0.6751254117116332, "rewards/cosine_scaled_reward": -0.06386499013751745, "rewards/format_reward": 0.4166666716337204, "step": 254 }, { "completion_length": 2951.0625534057617, "epoch": 0.2914285714285714, "grad_norm": 0.8391396403312683, "kl": 0.64337158203125, "learning_rate": 6.126278954320294e-07, "loss": 0.0387, "reward": -0.2175804078578949, "reward_std": 0.633871290832758, "rewards/cosine_scaled_reward": -0.2334041576832533, "rewards/format_reward": 0.4375000111758709, "step": 255 }, { "completion_length": 2761.041702270508, "epoch": 0.2925714285714286, "grad_norm": 1.0026954412460327, "kl": 0.5166015625, "learning_rate": 6.095153756157051e-07, "loss": 0.0119, "reward": 0.14736154675483704, "reward_std": 0.7076679766178131, "rewards/cosine_scaled_reward": -0.052284312434494495, "rewards/format_reward": 0.5625000093132257, "step": 256 }, { "completion_length": 3124.2709350585938, "epoch": 0.2937142857142857, "grad_norm": 0.9888486266136169, "kl": 0.50518798828125, "learning_rate": 6.06399955103937e-07, "loss": 0.0677, "reward": 0.3817547345533967, "reward_std": 0.9725684300065041, "rewards/cosine_scaled_reward": 0.10540459351614118, "rewards/format_reward": 0.541666679084301, "step": 257 }, { "completion_length": 3022.979217529297, "epoch": 0.2948571428571429, "grad_norm": 1.060575246810913, "kl": 0.49462890625, "learning_rate": 6.032817857379256e-07, "loss": 0.0775, "reward": 0.11859412118792534, "reward_std": 0.8907319009304047, "rewards/cosine_scaled_reward": -0.01629660092294216, "rewards/format_reward": 0.4375000074505806, "step": 258 }, { "completion_length": 2550.833366394043, "epoch": 0.296, "grad_norm": 0.4294055998325348, "kl": 0.43841552734375, "learning_rate": 6.001610194928464e-07, "loss": 0.052, "reward": 0.30166149651631713, "reward_std": 0.831500705331564, "rewards/cosine_scaled_reward": 0.02446294855326414, "rewards/format_reward": 0.6041666828095913, "step": 259 }, { "completion_length": 2399.1250610351562, "epoch": 0.29714285714285715, "grad_norm": 0.3650396168231964, "kl": 0.36968994140625, "learning_rate": 5.97037808470444e-07, "loss": 0.0443, "reward": 0.403296634554863, "reward_std": 0.926733735948801, "rewards/cosine_scaled_reward": 0.07911694049835205, "rewards/format_reward": 0.6250000149011612, "step": 260 }, { "completion_length": 2862.2708892822266, "epoch": 0.29828571428571427, "grad_norm": 1.088972568511963, "kl": 0.4329833984375, "learning_rate": 5.939123048916173e-07, "loss": 0.068, "reward": -0.11121963523328304, "reward_std": 0.6940420866012573, "rewards/cosine_scaled_reward": -0.19372293539345264, "rewards/format_reward": 0.5000000074505806, "step": 261 }, { "completion_length": 2654.479217529297, "epoch": 0.29942857142857143, "grad_norm": 0.9074969291687012, "kl": 0.4139404296875, "learning_rate": 5.907846610890011e-07, "loss": 0.0371, "reward": 0.052452532574534416, "reward_std": 0.6277891993522644, "rewards/cosine_scaled_reward": -0.18083440139889717, "rewards/format_reward": 0.7083333395421505, "step": 262 }, { "completion_length": 2793.729202270508, "epoch": 0.30057142857142854, "grad_norm": 2.0290629863739014, "kl": 0.50604248046875, "learning_rate": 5.87655029499542e-07, "loss": 0.0191, "reward": -0.09252774063497782, "reward_std": 0.5941142216324806, "rewards/cosine_scaled_reward": -0.2058687130920589, "rewards/format_reward": 0.5625000074505806, "step": 263 }, { "completion_length": 2861.9584350585938, "epoch": 0.3017142857142857, "grad_norm": 0.7169579863548279, "kl": 0.4697265625, "learning_rate": 5.845235626570683e-07, "loss": 0.0771, "reward": 0.00036056432873010635, "reward_std": 0.7790909744799137, "rewards/cosine_scaled_reward": -0.16522826021537185, "rewards/format_reward": 0.5833333469927311, "step": 264 }, { "completion_length": 2631.666732788086, "epoch": 0.3028571428571429, "grad_norm": 0.7453656196594238, "kl": 0.45709228515625, "learning_rate": 5.813904131848564e-07, "loss": 0.0216, "reward": 0.3167956112883985, "reward_std": 0.8776490315794945, "rewards/cosine_scaled_reward": -0.05162257980555296, "rewards/format_reward": 0.7708333414047956, "step": 265 }, { "completion_length": 2901.4583892822266, "epoch": 0.304, "grad_norm": 1.1252886056900024, "kl": 0.4161376953125, "learning_rate": 5.78255733788191e-07, "loss": -0.0038, "reward": 0.05045348312705755, "reward_std": 0.8374455273151398, "rewards/cosine_scaled_reward": -0.1557634025812149, "rewards/format_reward": 0.6250000055879354, "step": 266 }, { "completion_length": 3027.8750762939453, "epoch": 0.30514285714285716, "grad_norm": 0.44741290807724, "kl": 0.4798583984375, "learning_rate": 5.751196772469237e-07, "loss": 0.0489, "reward": 0.042676386423408985, "reward_std": 0.7426804676651955, "rewards/cosine_scaled_reward": -0.14299316331744194, "rewards/format_reward": 0.604166679084301, "step": 267 }, { "completion_length": 2573.0208740234375, "epoch": 0.3062857142857143, "grad_norm": 0.42620187997817993, "kl": 0.3807373046875, "learning_rate": 5.71982396408026e-07, "loss": 0.0182, "reward": 0.010210057254880667, "reward_std": 0.6326607689261436, "rewards/cosine_scaled_reward": -0.11539698392152786, "rewards/format_reward": 0.5208333395421505, "step": 268 }, { "completion_length": 2693.4584350585938, "epoch": 0.30742857142857144, "grad_norm": 0.8198751211166382, "kl": 0.38409423828125, "learning_rate": 5.688440441781398e-07, "loss": 0.0175, "reward": 0.04538612812757492, "reward_std": 0.8305856361985207, "rewards/cosine_scaled_reward": -0.16101938486099243, "rewards/format_reward": 0.6250000093132257, "step": 269 }, { "completion_length": 2817.5834197998047, "epoch": 0.30857142857142855, "grad_norm": 0.6884281039237976, "kl": 0.3497314453125, "learning_rate": 5.657047735161255e-07, "loss": 0.0278, "reward": 0.19405698496848345, "reward_std": 0.9544314742088318, "rewards/cosine_scaled_reward": -0.05997245345497504, "rewards/format_reward": 0.6041666828095913, "step": 270 }, { "completion_length": 2640.4584045410156, "epoch": 0.3097142857142857, "grad_norm": 0.48852798342704773, "kl": 0.32904052734375, "learning_rate": 5.625647374256061e-07, "loss": 0.0483, "reward": 0.3868153728544712, "reward_std": 0.8188424855470657, "rewards/cosine_scaled_reward": 0.05347882490605116, "rewards/format_reward": 0.6666666697710752, "step": 271 }, { "completion_length": 3057.0000915527344, "epoch": 0.31085714285714283, "grad_norm": 0.805952787399292, "kl": 0.4097900390625, "learning_rate": 5.594240889475106e-07, "loss": 0.0648, "reward": -0.030033869668841362, "reward_std": 0.788214236497879, "rewards/cosine_scaled_reward": -0.1403372660279274, "rewards/format_reward": 0.5000000111758709, "step": 272 }, { "completion_length": 2664.3125610351562, "epoch": 0.312, "grad_norm": 0.6616376638412476, "kl": 0.384765625, "learning_rate": 5.562829811526154e-07, "loss": 0.0021, "reward": 0.062399049289524555, "reward_std": 0.7391597256064415, "rewards/cosine_scaled_reward": -0.06876735016703606, "rewards/format_reward": 0.47916667349636555, "step": 273 }, { "completion_length": 2088.8541870117188, "epoch": 0.31314285714285717, "grad_norm": 1.1467257738113403, "kl": 0.261138916015625, "learning_rate": 5.531415671340826e-07, "loss": -0.0011, "reward": 0.5514285732060671, "reward_std": 0.6735243201255798, "rewards/cosine_scaled_reward": 0.1613447144627571, "rewards/format_reward": 0.6875, "step": 274 }, { "completion_length": 2230.0625381469727, "epoch": 0.3142857142857143, "grad_norm": 0.48795756697654724, "kl": 0.2476806640625, "learning_rate": 5.5e-07, "loss": 0.0153, "reward": 0.4183642081916332, "reward_std": 0.9099490307271481, "rewards/cosine_scaled_reward": 0.07764563540695235, "rewards/format_reward": 0.6458333432674408, "step": 275 }, { "completion_length": 2390.729248046875, "epoch": 0.31542857142857145, "grad_norm": 0.5403981804847717, "kl": 0.2706298828125, "learning_rate": 5.468584328659172e-07, "loss": 0.0278, "reward": 0.5140420459210873, "reward_std": 0.9649418145418167, "rewards/cosine_scaled_reward": 0.08755906065925956, "rewards/format_reward": 0.7500000204890966, "step": 276 }, { "completion_length": 2273.7917404174805, "epoch": 0.31657142857142856, "grad_norm": 0.681844174861908, "kl": 0.2330322265625, "learning_rate": 5.437170188473847e-07, "loss": 0.0456, "reward": 0.39711499866098166, "reward_std": 0.8123867064714432, "rewards/cosine_scaled_reward": 0.028049522661603987, "rewards/format_reward": 0.7291666772216558, "step": 277 }, { "completion_length": 2178.3333740234375, "epoch": 0.3177142857142857, "grad_norm": 0.31549885869026184, "kl": 0.264892578125, "learning_rate": 5.405759110524894e-07, "loss": 0.0333, "reward": 0.46570797031745315, "reward_std": 0.5852809324860573, "rewards/cosine_scaled_reward": 0.054337045177817345, "rewards/format_reward": 0.791666679084301, "step": 278 }, { "completion_length": 2957.604248046875, "epoch": 0.31885714285714284, "grad_norm": 0.6503915786743164, "kl": 0.3946533203125, "learning_rate": 5.37435262574394e-07, "loss": 0.0216, "reward": -0.10753144230693579, "reward_std": 0.821341261267662, "rewards/cosine_scaled_reward": -0.20013360609300435, "rewards/format_reward": 0.5000000149011612, "step": 279 }, { "completion_length": 2075.479217529297, "epoch": 0.32, "grad_norm": 1.738971471786499, "kl": 0.265228271484375, "learning_rate": 5.342952264838747e-07, "loss": 0.0882, "reward": 0.510447891894728, "reward_std": 0.914945088326931, "rewards/cosine_scaled_reward": 0.14110325649380684, "rewards/format_reward": 0.6458333488553762, "step": 280 }, { "completion_length": 3364.3333740234375, "epoch": 0.3211428571428571, "grad_norm": 0.8055136203765869, "kl": 0.486083984375, "learning_rate": 5.311559558218603e-07, "loss": 0.0224, "reward": -0.18871852289885283, "reward_std": 0.6349410191178322, "rewards/cosine_scaled_reward": -0.18654117919504642, "rewards/format_reward": 0.39583334140479565, "step": 281 }, { "completion_length": 2701.916748046875, "epoch": 0.3222857142857143, "grad_norm": 0.4943215548992157, "kl": 0.34820556640625, "learning_rate": 5.28017603591974e-07, "loss": 0.0322, "reward": 0.3567043347284198, "reward_std": 0.687431275844574, "rewards/cosine_scaled_reward": 0.010191468521952629, "rewards/format_reward": 0.7291666865348816, "step": 282 }, { "completion_length": 2955.5000610351562, "epoch": 0.32342857142857145, "grad_norm": 2.2376575469970703, "kl": 0.4698486328125, "learning_rate": 5.248803227530763e-07, "loss": 0.1237, "reward": 0.17788631992880255, "reward_std": 0.9274262189865112, "rewards/cosine_scaled_reward": -0.03141580242663622, "rewards/format_reward": 0.5416666828095913, "step": 283 }, { "completion_length": 2488.4584350585938, "epoch": 0.32457142857142857, "grad_norm": 0.9498338103294373, "kl": 0.3389892578125, "learning_rate": 5.21744266211809e-07, "loss": 0.0732, "reward": 0.36859262455254793, "reward_std": 0.7699229158461094, "rewards/cosine_scaled_reward": -0.00418412871658802, "rewards/format_reward": 0.7708333507180214, "step": 284 }, { "completion_length": 2094.3750762939453, "epoch": 0.32571428571428573, "grad_norm": 0.24566514790058136, "kl": 0.274169921875, "learning_rate": 5.186095868151436e-07, "loss": 0.0373, "reward": 0.21163646131753922, "reward_std": 0.6311895027756691, "rewards/cosine_scaled_reward": -0.09609892021398991, "rewards/format_reward": 0.7500000111758709, "step": 285 }, { "completion_length": 2660.3333740234375, "epoch": 0.32685714285714285, "grad_norm": 0.4989126920700073, "kl": 0.52783203125, "learning_rate": 5.154764373429315e-07, "loss": 0.0441, "reward": 0.1898297774605453, "reward_std": 0.8184352703392506, "rewards/cosine_scaled_reward": -0.08057594299316406, "rewards/format_reward": 0.6666666846722364, "step": 286 }, { "completion_length": 1924.020881652832, "epoch": 0.328, "grad_norm": 1.0765957832336426, "kl": 0.527130126953125, "learning_rate": 5.123449705004581e-07, "loss": -0.0004, "reward": 0.320914643118158, "reward_std": 0.6506007239222527, "rewards/cosine_scaled_reward": -0.0552152032032609, "rewards/format_reward": 0.8125000149011612, "step": 287 }, { "completion_length": 2751.916748046875, "epoch": 0.3291428571428571, "grad_norm": 0.525725781917572, "kl": 0.47174072265625, "learning_rate": 5.09215338910999e-07, "loss": 0.0345, "reward": 0.28691791370511055, "reward_std": 0.925068948417902, "rewards/cosine_scaled_reward": -0.05298358201980591, "rewards/format_reward": 0.7291666716337204, "step": 288 }, { "completion_length": 2081.791702270508, "epoch": 0.3302857142857143, "grad_norm": 0.4456733167171478, "kl": 0.426513671875, "learning_rate": 5.060876951083828e-07, "loss": 0.0375, "reward": 0.2169907259522006, "reward_std": 0.6403507255017757, "rewards/cosine_scaled_reward": -0.062007103115320206, "rewards/format_reward": 0.6875000055879354, "step": 289 }, { "completion_length": 2641.5625915527344, "epoch": 0.3314285714285714, "grad_norm": 0.49116048216819763, "kl": 0.56640625, "learning_rate": 5.02962191529556e-07, "loss": 0.0674, "reward": 0.32489653676748276, "reward_std": 0.7462290413677692, "rewards/cosine_scaled_reward": -0.07674999348819256, "rewards/format_reward": 0.854166679084301, "step": 290 }, { "completion_length": 2844.3959045410156, "epoch": 0.3325714285714286, "grad_norm": 0.9147380590438843, "kl": 0.5509033203125, "learning_rate": 4.998389805071536e-07, "loss": 0.0864, "reward": 0.2941320105455816, "reward_std": 0.8829877898097038, "rewards/cosine_scaled_reward": -0.09763280488550663, "rewards/format_reward": 0.8333333544433117, "step": 291 }, { "completion_length": 2980.3126220703125, "epoch": 0.33371428571428574, "grad_norm": 1.0147628784179688, "kl": 0.628662109375, "learning_rate": 4.967182142620745e-07, "loss": 0.0385, "reward": 0.16018317895941436, "reward_std": 0.6120474711060524, "rewards/cosine_scaled_reward": -0.09870209451764822, "rewards/format_reward": 0.6875000111758709, "step": 292 }, { "completion_length": 2222.3125610351562, "epoch": 0.33485714285714285, "grad_norm": 1.3469599485397339, "kl": 0.42333984375, "learning_rate": 4.93600044896063e-07, "loss": -0.0092, "reward": 0.35159002151340246, "reward_std": 0.6072813756763935, "rewards/cosine_scaled_reward": -0.03080323152244091, "rewards/format_reward": 0.8125000074505806, "step": 293 }, { "completion_length": 3145.1041870117188, "epoch": 0.336, "grad_norm": 1.195894718170166, "kl": 0.774658203125, "learning_rate": 4.904846243842949e-07, "loss": 0.0366, "reward": -0.011713245883584023, "reward_std": 0.686446376144886, "rewards/cosine_scaled_reward": -0.1353329624980688, "rewards/format_reward": 0.5208333414047956, "step": 294 }, { "completion_length": 2874.3126068115234, "epoch": 0.33714285714285713, "grad_norm": 0.6826883554458618, "kl": 0.62451171875, "learning_rate": 4.873721045679706e-07, "loss": 0.0648, "reward": 0.4660371467471123, "reward_std": 0.8896083161234856, "rewards/cosine_scaled_reward": 0.10532690212130547, "rewards/format_reward": 0.6666666865348816, "step": 295 }, { "completion_length": 3109.916748046875, "epoch": 0.3382857142857143, "grad_norm": 0.6580361723899841, "kl": 0.54638671875, "learning_rate": 4.842626371469149e-07, "loss": 0.0593, "reward": 0.1299741494731279, "reward_std": 0.7981628403067589, "rewards/cosine_scaled_reward": -0.1501350817270577, "rewards/format_reward": 0.7291666977107525, "step": 296 }, { "completion_length": 3412.6875915527344, "epoch": 0.3394285714285714, "grad_norm": 0.5886114835739136, "kl": 0.65771484375, "learning_rate": 4.811563736721829e-07, "loss": 0.0604, "reward": -0.13705510459840298, "reward_std": 0.7897306978702545, "rewards/cosine_scaled_reward": -0.1760294260457158, "rewards/format_reward": 0.41666666977107525, "step": 297 }, { "completion_length": 2516.0000610351562, "epoch": 0.3405714285714286, "grad_norm": 0.9390971660614014, "kl": 0.326019287109375, "learning_rate": 4.780534655386743e-07, "loss": -0.005, "reward": 0.2075387438526377, "reward_std": 0.678107738494873, "rewards/cosine_scaled_reward": -0.06827879883348942, "rewards/format_reward": 0.687500013038516, "step": 298 }, { "completion_length": 2798.750045776367, "epoch": 0.3417142857142857, "grad_norm": 0.32884588837623596, "kl": 0.3729248046875, "learning_rate": 4.749540639777539e-07, "loss": 0.029, "reward": 0.3677529713604599, "reward_std": 0.581198662519455, "rewards/cosine_scaled_reward": 0.07487833127379417, "rewards/format_reward": 0.6250000149011612, "step": 299 }, { "completion_length": 3013.9584350585938, "epoch": 0.34285714285714286, "grad_norm": 0.45819181203842163, "kl": 0.378662109375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0371, "reward": 0.11145258648321033, "reward_std": 0.8130291737616062, "rewards/cosine_scaled_reward": -0.12973832013085485, "rewards/format_reward": 0.6666666865348816, "step": 300 }, { "completion_length": 2654.791717529297, "epoch": 0.344, "grad_norm": 0.4394110143184662, "kl": 0.3597412109375, "learning_rate": 4.68766384637248e-07, "loss": 0.0512, "reward": 0.15728470450267196, "reward_std": 0.8084782175719738, "rewards/cosine_scaled_reward": -0.07348441705107689, "rewards/format_reward": 0.6041666753590107, "step": 301 }, { "completion_length": 2718.750030517578, "epoch": 0.34514285714285714, "grad_norm": 1.5091288089752197, "kl": 0.374420166015625, "learning_rate": 4.656784084364238e-07, "loss": -0.0067, "reward": -0.010899038054049015, "reward_std": 0.6495076231658459, "rewards/cosine_scaled_reward": -0.1349128014408052, "rewards/format_reward": 0.5208333358168602, "step": 302 }, { "completion_length": 2785.0833740234375, "epoch": 0.3462857142857143, "grad_norm": 0.6161960363388062, "kl": 0.3387451171875, "learning_rate": 4.6259454195101267e-07, "loss": 0.0533, "reward": 0.19499525055289268, "reward_std": 0.9087162502110004, "rewards/cosine_scaled_reward": -0.10367321688681841, "rewards/format_reward": 0.7083333469927311, "step": 303 }, { "completion_length": 2953.541778564453, "epoch": 0.3474285714285714, "grad_norm": 0.4118628203868866, "kl": 0.3302001953125, "learning_rate": 4.59514935484316e-07, "loss": 0.05, "reward": 0.17398711014539003, "reward_std": 0.7196006700396538, "rewards/cosine_scaled_reward": -0.13903226237744093, "rewards/format_reward": 0.7708333488553762, "step": 304 }, { "completion_length": 3122.791748046875, "epoch": 0.3485714285714286, "grad_norm": 0.9769808650016785, "kl": 0.315673828125, "learning_rate": 4.5643973913200837e-07, "loss": 0.0601, "reward": 0.1985210245475173, "reward_std": 0.7755458503961563, "rewards/cosine_scaled_reward": -0.06837073154747486, "rewards/format_reward": 0.6666666865348816, "step": 305 }, { "completion_length": 2781.9584350585938, "epoch": 0.3497142857142857, "grad_norm": 0.8336344957351685, "kl": 0.295135498046875, "learning_rate": 4.5336910277482155e-07, "loss": -0.0088, "reward": 0.34240800654515624, "reward_std": 0.7838171310722828, "rewards/cosine_scaled_reward": 0.005472499451570911, "rewards/format_reward": 0.7083333432674408, "step": 306 }, { "completion_length": 2925.625045776367, "epoch": 0.35085714285714287, "grad_norm": 1.742169737815857, "kl": 0.31103515625, "learning_rate": 4.503031760712397e-07, "loss": 0.0824, "reward": 0.24728509783744812, "reward_std": 1.0561788342893124, "rewards/cosine_scaled_reward": -0.03675899375230074, "rewards/format_reward": 0.625000013038516, "step": 307 }, { "completion_length": 3307.916748046875, "epoch": 0.352, "grad_norm": 0.5225728154182434, "kl": 0.390625, "learning_rate": 4.4724210845020494e-07, "loss": 0.0089, "reward": 0.06603662599809468, "reward_std": 0.6128693893551826, "rewards/cosine_scaled_reward": -0.17313291411846876, "rewards/format_reward": 0.7083333414047956, "step": 308 }, { "completion_length": 2979.479217529297, "epoch": 0.35314285714285715, "grad_norm": 0.4211791157722473, "kl": 0.259429931640625, "learning_rate": 4.441860491038345e-07, "loss": 0.0147, "reward": 0.2866139570251107, "reward_std": 0.7773850671947002, "rewards/cosine_scaled_reward": -0.009212411940097809, "rewards/format_reward": 0.6666666753590107, "step": 309 }, { "completion_length": 2679.1667404174805, "epoch": 0.35428571428571426, "grad_norm": 0.5345349907875061, "kl": 0.33343505859375, "learning_rate": 4.4113514698014953e-07, "loss": 0.0432, "reward": 0.24587753415107727, "reward_std": 0.8905804492533207, "rewards/cosine_scaled_reward": -0.016845128498971462, "rewards/format_reward": 0.6041666809469461, "step": 310 }, { "completion_length": 2646.2500610351562, "epoch": 0.3554285714285714, "grad_norm": 0.32929864525794983, "kl": 0.261383056640625, "learning_rate": 4.3808955077581546e-07, "loss": 0.0258, "reward": 0.1976035200059414, "reward_std": 0.8284550718963146, "rewards/cosine_scaled_reward": -0.04515882022678852, "rewards/format_reward": 0.6041666716337204, "step": 311 }, { "completion_length": 2335.729217529297, "epoch": 0.3565714285714286, "grad_norm": 0.296840637922287, "kl": 0.254730224609375, "learning_rate": 4.350494089288943e-07, "loss": 0.0257, "reward": 0.5823600944131613, "reward_std": 0.691787600517273, "rewards/cosine_scaled_reward": 0.1733840461820364, "rewards/format_reward": 0.708333333954215, "step": 312 }, { "completion_length": 2918.68758392334, "epoch": 0.3577142857142857, "grad_norm": 0.8261905312538147, "kl": 0.3216552734375, "learning_rate": 4.3201486961161093e-07, "loss": 0.0615, "reward": 0.32681620866060257, "reward_std": 0.8109251782298088, "rewards/cosine_scaled_reward": 0.06859785690903664, "rewards/format_reward": 0.5625000149011612, "step": 313 }, { "completion_length": 2584.812545776367, "epoch": 0.3588571428571429, "grad_norm": 0.28141534328460693, "kl": 0.24951171875, "learning_rate": 4.2898608072313045e-07, "loss": -0.0017, "reward": 0.5454013433773071, "reward_std": 0.7297746650874615, "rewards/cosine_scaled_reward": 0.13757295534014702, "rewards/format_reward": 0.729166679084301, "step": 314 }, { "completion_length": 3015.229232788086, "epoch": 0.36, "grad_norm": 0.40665552020072937, "kl": 0.357452392578125, "learning_rate": 4.2596318988235037e-07, "loss": 0.0237, "reward": 0.010891908779740334, "reward_std": 0.7054996266961098, "rewards/cosine_scaled_reward": -0.11506736988667399, "rewards/format_reward": 0.5000000074505806, "step": 315 }, { "completion_length": 3297.6875610351562, "epoch": 0.36114285714285715, "grad_norm": 0.38543111085891724, "kl": 0.36572265625, "learning_rate": 4.2294634442070553e-07, "loss": 0.0324, "reward": -0.13673021260183305, "reward_std": 0.5753876939415932, "rewards/cosine_scaled_reward": -0.2870428040623665, "rewards/format_reward": 0.6666666865348816, "step": 316 }, { "completion_length": 3066.8958892822266, "epoch": 0.36228571428571427, "grad_norm": 0.5159828066825867, "kl": 0.4085693359375, "learning_rate": 4.1993569137498776e-07, "loss": 0.0244, "reward": 0.13786310516297817, "reward_std": 0.6435285620391369, "rewards/cosine_scaled_reward": -0.11537853349000216, "rewards/format_reward": 0.6875000279396772, "step": 317 }, { "completion_length": 2283.9375610351562, "epoch": 0.36342857142857143, "grad_norm": 0.4553980827331543, "kl": 0.24249267578125, "learning_rate": 4.1693137748017915e-07, "loss": -0.0024, "reward": 0.41944058798253536, "reward_std": 0.9410093426704407, "rewards/cosine_scaled_reward": -0.0460501410998404, "rewards/format_reward": 0.8958333507180214, "step": 318 }, { "completion_length": 3003.750030517578, "epoch": 0.36457142857142855, "grad_norm": 0.3188877999782562, "kl": 0.35693359375, "learning_rate": 4.1393354916230005e-07, "loss": 0.0374, "reward": -0.02601405733730644, "reward_std": 0.7571947351098061, "rewards/cosine_scaled_reward": -0.16905247140675783, "rewards/format_reward": 0.5625000111758709, "step": 319 }, { "completion_length": 2351.791748046875, "epoch": 0.3657142857142857, "grad_norm": 0.28264015913009644, "kl": 0.226318359375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0313, "reward": 0.41428154334425926, "reward_std": 0.6797180473804474, "rewards/cosine_scaled_reward": -0.014672968536615372, "rewards/format_reward": 0.8541666753590107, "step": 320 }, { "completion_length": 2420.500099182129, "epoch": 0.3668571428571429, "grad_norm": 0.3555741012096405, "kl": 0.2991943359375, "learning_rate": 4.079579333738039e-07, "loss": 0.0064, "reward": 0.3246429590508342, "reward_std": 0.84641108289361, "rewards/cosine_scaled_reward": -0.021552213234826922, "rewards/format_reward": 0.7291666902601719, "step": 321 }, { "completion_length": 2859.395950317383, "epoch": 0.368, "grad_norm": 0.3224516808986664, "kl": 0.27166748046875, "learning_rate": 4.0498043714627006e-07, "loss": 0.013, "reward": 0.41165721049765125, "reward_std": 0.8220065757632256, "rewards/cosine_scaled_reward": 0.0005064834840595722, "rewards/format_reward": 0.8125000149011612, "step": 322 }, { "completion_length": 2920.3125610351562, "epoch": 0.36914285714285716, "grad_norm": 0.7611133456230164, "kl": 0.2857666015625, "learning_rate": 4.020100089676376e-07, "loss": 0.0486, "reward": 0.29905556747689843, "reward_std": 0.7838596850633621, "rewards/cosine_scaled_reward": -0.0440771235153079, "rewards/format_reward": 0.7500000223517418, "step": 323 }, { "completion_length": 3198.3334045410156, "epoch": 0.3702857142857143, "grad_norm": 0.5829113125801086, "kl": 0.34033203125, "learning_rate": 3.9904679361238526e-07, "loss": 0.035, "reward": 0.06515605933964252, "reward_std": 0.6232591606676579, "rewards/cosine_scaled_reward": -0.12263932824134827, "rewards/format_reward": 0.6041666734963655, "step": 324 }, { "completion_length": 2887.104202270508, "epoch": 0.37142857142857144, "grad_norm": 0.3115164339542389, "kl": 0.286163330078125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0308, "reward": 0.04865055438131094, "reward_std": 0.769499409943819, "rewards/cosine_scaled_reward": -0.14275423251092434, "rewards/format_reward": 0.6041666716337204, "step": 325 }, { "completion_length": 2667.5834045410156, "epoch": 0.37257142857142855, "grad_norm": 0.2706601023674011, "kl": 0.24713134765625, "learning_rate": 3.931425787051832e-07, "loss": 0.0352, "reward": 0.3725670697167516, "reward_std": 0.7936568856239319, "rewards/cosine_scaled_reward": -0.0039613001281395555, "rewards/format_reward": 0.7708333395421505, "step": 326 }, { "completion_length": 2546.291702270508, "epoch": 0.3737142857142857, "grad_norm": 0.2914826273918152, "kl": 0.26849365234375, "learning_rate": 3.902018669163384e-07, "loss": 0.0326, "reward": 0.36802724679000676, "reward_std": 0.7584006376564503, "rewards/cosine_scaled_reward": 0.046540172887034714, "rewards/format_reward": 0.6666666772216558, "step": 327 }, { "completion_length": 3366.104217529297, "epoch": 0.37485714285714283, "grad_norm": 0.42033851146698, "kl": 0.346923828125, "learning_rate": 3.872689434630585e-07, "loss": 0.04, "reward": -0.22989960946142673, "reward_std": 0.757425207644701, "rewards/cosine_scaled_reward": -0.22673101257532835, "rewards/format_reward": 0.3958333469927311, "step": 328 }, { "completion_length": 2347.7500534057617, "epoch": 0.376, "grad_norm": 0.8453741073608398, "kl": 0.227630615234375, "learning_rate": 3.843439512918949e-07, "loss": 0.0358, "reward": 0.4706056764116511, "reward_std": 0.8056343197822571, "rewards/cosine_scaled_reward": 0.021218265290372074, "rewards/format_reward": 0.8541666865348816, "step": 329 }, { "completion_length": 2407.020866394043, "epoch": 0.37714285714285717, "grad_norm": 0.7580931782722473, "kl": 0.237945556640625, "learning_rate": 3.8142703296283953e-07, "loss": -0.0101, "reward": 0.12094018794596195, "reward_std": 0.6862670034170151, "rewards/cosine_scaled_reward": -0.19688204117119312, "rewards/format_reward": 0.8125000018626451, "step": 330 }, { "completion_length": 2858.2083740234375, "epoch": 0.3782857142857143, "grad_norm": 0.528272271156311, "kl": 0.30035400390625, "learning_rate": 3.785183306423767e-07, "loss": 0.0324, "reward": -0.01362695125862956, "reward_std": 0.7233411334455013, "rewards/cosine_scaled_reward": -0.1397905834019184, "rewards/format_reward": 0.5208333414047956, "step": 331 }, { "completion_length": 2584.729217529297, "epoch": 0.37942857142857145, "grad_norm": 0.6263169050216675, "kl": 0.2864990234375, "learning_rate": 3.7561798609655373e-07, "loss": 0.0382, "reward": 0.33939856104552746, "reward_std": 0.8283688835799694, "rewards/cosine_scaled_reward": 0.012080416432581842, "rewards/format_reward": 0.6875000149011612, "step": 332 }, { "completion_length": 2398.562545776367, "epoch": 0.38057142857142856, "grad_norm": 0.2747509777545929, "kl": 0.20452880859375, "learning_rate": 3.72726140684072e-07, "loss": 0.0178, "reward": 0.3634027219377458, "reward_std": 0.7900962755084038, "rewards/cosine_scaled_reward": -0.07603580364957452, "rewards/format_reward": 0.8958333507180214, "step": 333 }, { "completion_length": 3390.7500915527344, "epoch": 0.38171428571428573, "grad_norm": 0.6542767882347107, "kl": 0.46240234375, "learning_rate": 3.6984293534939737e-07, "loss": 0.0728, "reward": -0.052195049822330475, "reward_std": 0.894780658185482, "rewards/cosine_scaled_reward": -0.17814880423247814, "rewards/format_reward": 0.5208333414047956, "step": 334 }, { "completion_length": 2516.604217529297, "epoch": 0.38285714285714284, "grad_norm": 0.2772899866104126, "kl": 0.257080078125, "learning_rate": 3.6696851061588994e-07, "loss": 0.0318, "reward": 0.1776880531979259, "reward_std": 0.7250074371695518, "rewards/cosine_scaled_reward": -0.06235098314937204, "rewards/format_reward": 0.6250000093132257, "step": 335 }, { "completion_length": 3047.0000915527344, "epoch": 0.384, "grad_norm": 1.0677460432052612, "kl": 0.34222412109375, "learning_rate": 3.641030065789562e-07, "loss": 0.0384, "reward": 0.3056840244680643, "reward_std": 0.8023335263133049, "rewards/cosine_scaled_reward": -0.030411685816943645, "rewards/format_reward": 0.729166692122817, "step": 336 }, { "completion_length": 2612.916748046875, "epoch": 0.3851428571428571, "grad_norm": 1.6512010097503662, "kl": 0.316650390625, "learning_rate": 3.612465628992203e-07, "loss": 0.0747, "reward": 0.456050219014287, "reward_std": 1.0556022450327873, "rewards/cosine_scaled_reward": 0.01445878017693758, "rewards/format_reward": 0.8125000223517418, "step": 337 }, { "completion_length": 2305.4791870117188, "epoch": 0.3862857142857143, "grad_norm": 0.9437749981880188, "kl": 0.30230712890625, "learning_rate": 3.5839931879571725e-07, "loss": -0.0135, "reward": 0.5471778312930837, "reward_std": 0.794790405780077, "rewards/cosine_scaled_reward": 0.12062014266848564, "rewards/format_reward": 0.75, "step": 338 }, { "completion_length": 3010.1250534057617, "epoch": 0.38742857142857146, "grad_norm": 0.5747052431106567, "kl": 0.360382080078125, "learning_rate": 3.555614130391079e-07, "loss": 0.0191, "reward": -0.03356679296121001, "reward_std": 0.7060465253889561, "rewards/cosine_scaled_reward": -0.17198694869875908, "rewards/format_reward": 0.5625000111758709, "step": 339 }, { "completion_length": 2871.2709045410156, "epoch": 0.38857142857142857, "grad_norm": 0.5294919610023499, "kl": 0.3682861328125, "learning_rate": 3.5273298394491515e-07, "loss": 0.0492, "reward": 0.29094321094453335, "reward_std": 0.7570115067064762, "rewards/cosine_scaled_reward": -0.11235274095088243, "rewards/format_reward": 0.8750000074505806, "step": 340 }, { "completion_length": 2639.062545776367, "epoch": 0.38971428571428574, "grad_norm": 0.45286688208580017, "kl": 0.29937744140625, "learning_rate": 3.4991416936678276e-07, "loss": 0.0171, "reward": 0.6249844692647457, "reward_std": 0.815760787576437, "rewards/cosine_scaled_reward": 0.17363815288990736, "rewards/format_reward": 0.7500000093132257, "step": 341 }, { "completion_length": 2894.0208587646484, "epoch": 0.39085714285714285, "grad_norm": 0.3460341691970825, "kl": 0.4681396484375, "learning_rate": 3.471051066897562e-07, "loss": 0.0579, "reward": 0.08103763521648943, "reward_std": 0.7632105089724064, "rewards/cosine_scaled_reward": -0.1950855739414692, "rewards/format_reward": 0.7500000074505806, "step": 342 }, { "completion_length": 2808.6250610351562, "epoch": 0.392, "grad_norm": 1.3520926237106323, "kl": 0.3250732421875, "learning_rate": 3.4430593282358777e-07, "loss": 0.0791, "reward": 0.48628374096006155, "reward_std": 0.8851627111434937, "rewards/cosine_scaled_reward": 0.07822982332436368, "rewards/format_reward": 0.7500000167638063, "step": 343 }, { "completion_length": 2434.958366394043, "epoch": 0.3931428571428571, "grad_norm": 0.38433942198753357, "kl": 0.2788543701171875, "learning_rate": 3.4151678419606233e-07, "loss": 0.0233, "reward": 0.6341738551855087, "reward_std": 0.7020405307412148, "rewards/cosine_scaled_reward": 0.1561431773006916, "rewards/format_reward": 0.8125000037252903, "step": 344 }, { "completion_length": 2946.3958892822266, "epoch": 0.3942857142857143, "grad_norm": 0.9588437080383301, "kl": 0.4776611328125, "learning_rate": 3.387377967463493e-07, "loss": 0.0089, "reward": 0.4796811621636152, "reward_std": 0.8463685475289822, "rewards/cosine_scaled_reward": 0.07671219296753407, "rewards/format_reward": 0.7500000037252903, "step": 345 }, { "completion_length": 2810.916717529297, "epoch": 0.3954285714285714, "grad_norm": 0.7113232612609863, "kl": 0.45458984375, "learning_rate": 3.359691059183761e-07, "loss": 0.0365, "reward": 0.29816207382827997, "reward_std": 0.8527140244841576, "rewards/cosine_scaled_reward": -0.028450995916500688, "rewards/format_reward": 0.7083333507180214, "step": 346 }, { "completion_length": 3077.9584045410156, "epoch": 0.3965714285714286, "grad_norm": 1.0087529420852661, "kl": 0.5037841796875, "learning_rate": 3.3321084665422803e-07, "loss": 0.027, "reward": -0.07710191514343023, "reward_std": 0.6414642743766308, "rewards/cosine_scaled_reward": -0.2412154171615839, "rewards/format_reward": 0.6458333469927311, "step": 347 }, { "completion_length": 2701.1042709350586, "epoch": 0.3977142857142857, "grad_norm": 0.7987737059593201, "kl": 0.3995208740234375, "learning_rate": 3.3046315338757026e-07, "loss": 0.0226, "reward": 0.299939407967031, "reward_std": 0.602581400424242, "rewards/cosine_scaled_reward": -0.08414606470614672, "rewards/format_reward": 0.8541666772216558, "step": 348 }, { "completion_length": 2987.229278564453, "epoch": 0.39885714285714285, "grad_norm": 1.26498544216156, "kl": 0.451416015625, "learning_rate": 3.2772616003709616e-07, "loss": 0.0201, "reward": 0.11506683845072985, "reward_std": 0.7807445377111435, "rewards/cosine_scaled_reward": -0.13993432931602, "rewards/format_reward": 0.687500013038516, "step": 349 }, { "completion_length": 2590.395950317383, "epoch": 0.4, "grad_norm": 0.5970591902732849, "kl": 0.4644775390625, "learning_rate": 3.250000000000001e-07, "loss": 0.0338, "reward": 0.18042142933700234, "reward_std": 0.8871416859328747, "rewards/cosine_scaled_reward": -0.14772336441092193, "rewards/format_reward": 0.7708333544433117, "step": 350 }, { "completion_length": 2721.6250762939453, "epoch": 0.40114285714285713, "grad_norm": 0.43472933769226074, "kl": 0.25958251953125, "learning_rate": 3.222848061454764e-07, "loss": 0.0395, "reward": 0.42602336849085987, "reward_std": 0.8064659312367439, "rewards/cosine_scaled_reward": 0.042032238095998764, "rewards/format_reward": 0.7500000149011612, "step": 351 }, { "completion_length": 2447.8334045410156, "epoch": 0.4022857142857143, "grad_norm": 1.039471983909607, "kl": 0.2555999755859375, "learning_rate": 3.195807108082429e-07, "loss": 0.0413, "reward": 0.24063719739206135, "reward_std": 0.8650816380977631, "rewards/cosine_scaled_reward": -0.0003559635952115059, "rewards/format_reward": 0.5625000093132257, "step": 352 }, { "completion_length": 2217.14591217041, "epoch": 0.4034285714285714, "grad_norm": 0.41167160868644714, "kl": 0.203216552734375, "learning_rate": 3.168878457820915e-07, "loss": 0.0232, "reward": 0.5322136869654059, "reward_std": 0.666268203407526, "rewards/cosine_scaled_reward": 0.04740488715469837, "rewards/format_reward": 0.8958333507180214, "step": 353 }, { "completion_length": 2258.625045776367, "epoch": 0.4045714285714286, "grad_norm": 0.22829324007034302, "kl": 0.180816650390625, "learning_rate": 3.142063423134644e-07, "loss": 0.0048, "reward": 0.5726254731416702, "reward_std": 0.7559525668621063, "rewards/cosine_scaled_reward": 0.12823878531344235, "rewards/format_reward": 0.7708333414047956, "step": 354 }, { "completion_length": 2250.791778564453, "epoch": 0.4057142857142857, "grad_norm": 0.602950394153595, "kl": 0.2418212890625, "learning_rate": 3.115363310950578e-07, "loss": 0.0347, "reward": 0.6632253341376781, "reward_std": 0.8946371823549271, "rewards/cosine_scaled_reward": 0.15655125584453344, "rewards/format_reward": 0.8333333544433117, "step": 355 }, { "completion_length": 2813.9583892822266, "epoch": 0.40685714285714286, "grad_norm": 0.49410441517829895, "kl": 0.27777099609375, "learning_rate": 3.0887794225945143e-07, "loss": 0.0194, "reward": 0.21525360643863678, "reward_std": 0.9108888022601604, "rewards/cosine_scaled_reward": -0.058567093685269356, "rewards/format_reward": 0.6458333469927311, "step": 356 }, { "completion_length": 3031.3751220703125, "epoch": 0.408, "grad_norm": 0.46776533126831055, "kl": 0.290283203125, "learning_rate": 3.062313053727671e-07, "loss": 0.0666, "reward": 0.11198169272392988, "reward_std": 0.7659357041120529, "rewards/cosine_scaled_reward": -0.17274215212091804, "rewards/format_reward": 0.7500000149011612, "step": 357 }, { "completion_length": 2567.7709197998047, "epoch": 0.40914285714285714, "grad_norm": 0.9789031744003296, "kl": 0.2848052978515625, "learning_rate": 3.0359654942835247e-07, "loss": 0.082, "reward": 0.4794031195342541, "reward_std": 0.9315466657280922, "rewards/cosine_scaled_reward": 0.028890431160107255, "rewards/format_reward": 0.8333333507180214, "step": 358 }, { "completion_length": 2519.666702270508, "epoch": 0.4102857142857143, "grad_norm": 0.3785220980644226, "kl": 0.329376220703125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0228, "reward": 0.32482871878892183, "reward_std": 0.804768543690443, "rewards/cosine_scaled_reward": -0.028755411505699158, "rewards/format_reward": 0.7500000186264515, "step": 359 }, { "completion_length": 2652.604263305664, "epoch": 0.4114285714285714, "grad_norm": 0.5460115075111389, "kl": 0.22186279296875, "learning_rate": 2.9836319343816397e-07, "loss": 0.0358, "reward": 0.5476951543241739, "reward_std": 0.8413685485720634, "rewards/cosine_scaled_reward": 0.047556648729369044, "rewards/format_reward": 0.8958333432674408, "step": 360 }, { "completion_length": 2775.6251220703125, "epoch": 0.4125714285714286, "grad_norm": 0.6770034432411194, "kl": 0.27569580078125, "learning_rate": 2.9576484845877793e-07, "loss": 0.0547, "reward": 0.21503734902944416, "reward_std": 0.7295170053839684, "rewards/cosine_scaled_reward": -0.1001668400131166, "rewards/format_reward": 0.7500000186264515, "step": 361 }, { "completion_length": 1685.1875305175781, "epoch": 0.4137142857142857, "grad_norm": 0.2777215838432312, "kl": 0.158172607421875, "learning_rate": 2.931788945420058e-07, "loss": 0.0005, "reward": 0.4802695903927088, "reward_std": 0.6514372602105141, "rewards/cosine_scaled_reward": 0.03301592729985714, "rewards/format_reward": 0.8541666679084301, "step": 362 }, { "completion_length": 1911.145896911621, "epoch": 0.41485714285714287, "grad_norm": 0.5700949430465698, "kl": 0.2712554931640625, "learning_rate": 2.9060545772359305e-07, "loss": 0.0486, "reward": 0.3370086522772908, "reward_std": 0.7742916233837605, "rewards/cosine_scaled_reward": 0.0005665780045092106, "rewards/format_reward": 0.7083333395421505, "step": 363 }, { "completion_length": 2883.229248046875, "epoch": 0.416, "grad_norm": 0.5325201749801636, "kl": 0.36102294921875, "learning_rate": 2.8804466342921987e-07, "loss": 0.0226, "reward": -0.025897093466483057, "reward_std": 0.6142625138163567, "rewards/cosine_scaled_reward": -0.23417398636229336, "rewards/format_reward": 0.7083333469927311, "step": 364 }, { "completion_length": 2721.312545776367, "epoch": 0.41714285714285715, "grad_norm": 0.7317801713943481, "kl": 0.36126708984375, "learning_rate": 2.854966364683872e-07, "loss": 0.0568, "reward": 0.3746628388762474, "reward_std": 0.8317583128809929, "rewards/cosine_scaled_reward": 0.022838744800537825, "rewards/format_reward": 0.7083333395421505, "step": 365 }, { "completion_length": 2259.645854949951, "epoch": 0.41828571428571426, "grad_norm": 0.5810126662254333, "kl": 0.244354248046875, "learning_rate": 2.829615010283344e-07, "loss": 0.0253, "reward": 0.4160766340792179, "reward_std": 0.7030897587537766, "rewards/cosine_scaled_reward": 0.050911818630993366, "rewards/format_reward": 0.7291666828095913, "step": 366 }, { "completion_length": 3181.5209350585938, "epoch": 0.41942857142857143, "grad_norm": 0.4794647991657257, "kl": 0.4725341796875, "learning_rate": 2.8043938066798645e-07, "loss": 0.0514, "reward": 0.09267363836988807, "reward_std": 0.7625463083386421, "rewards/cosine_scaled_reward": -0.12190989265218377, "rewards/format_reward": 0.6250000204890966, "step": 367 }, { "completion_length": 2849.666732788086, "epoch": 0.4205714285714286, "grad_norm": 1.0089714527130127, "kl": 0.32720947265625, "learning_rate": 2.7793039831193133e-07, "loss": -0.0027, "reward": 0.1751155611127615, "reward_std": 0.549890723079443, "rewards/cosine_scaled_reward": -0.07263889070600271, "rewards/format_reward": 0.666666679084301, "step": 368 }, { "completion_length": 2855.750045776367, "epoch": 0.4217142857142857, "grad_norm": 0.9463785290718079, "kl": 0.40081787109375, "learning_rate": 2.7543467624442956e-07, "loss": 0.0535, "reward": 0.1500562410801649, "reward_std": 0.7940640598535538, "rewards/cosine_scaled_reward": -0.10133596323430538, "rewards/format_reward": 0.666666679084301, "step": 369 }, { "completion_length": 2896.3959197998047, "epoch": 0.4228571428571429, "grad_norm": 0.8330612778663635, "kl": 0.416717529296875, "learning_rate": 2.729523361034538e-07, "loss": 0.0286, "reward": 0.183703294955194, "reward_std": 0.45754858292639256, "rewards/cosine_scaled_reward": -0.09295342303812504, "rewards/format_reward": 0.7291666772216558, "step": 370 }, { "completion_length": 1955.479263305664, "epoch": 0.424, "grad_norm": 0.3391231894493103, "kl": 0.262359619140625, "learning_rate": 2.7048349887476037e-07, "loss": 0.0184, "reward": 0.6992178884102032, "reward_std": 0.6993260197341442, "rewards/cosine_scaled_reward": 0.16803688369691372, "rewards/format_reward": 0.8750000149011612, "step": 371 }, { "completion_length": 3005.6458740234375, "epoch": 0.42514285714285716, "grad_norm": 0.5167466402053833, "kl": 0.4355926513671875, "learning_rate": 2.6802828488599294e-07, "loss": 0.0432, "reward": 0.19448892027139664, "reward_std": 0.7890233434736729, "rewards/cosine_scaled_reward": -0.021921467036008835, "rewards/format_reward": 0.5625000093132257, "step": 372 }, { "completion_length": 1942.3333892822266, "epoch": 0.42628571428571427, "grad_norm": 0.3325159549713135, "kl": 0.2429656982421875, "learning_rate": 2.655868138008171e-07, "loss": 0.0212, "reward": 0.3014055141247809, "reward_std": 0.8629297837615013, "rewards/cosine_scaled_reward": -0.07299621542915702, "rewards/format_reward": 0.791666679084301, "step": 373 }, { "completion_length": 2335.020866394043, "epoch": 0.42742857142857144, "grad_norm": 0.8563280701637268, "kl": 0.3255615234375, "learning_rate": 2.631592046130896e-07, "loss": 0.0616, "reward": 0.2297945898026228, "reward_std": 0.9343635328114033, "rewards/cosine_scaled_reward": -0.047990256920456886, "rewards/format_reward": 0.6458333469927311, "step": 374 }, { "completion_length": 2628.291748046875, "epoch": 0.42857142857142855, "grad_norm": 0.9893895387649536, "kl": 0.34588623046875, "learning_rate": 2.6074557564105724e-07, "loss": 0.0665, "reward": 0.35648069988383213, "reward_std": 0.7849867083132267, "rewards/cosine_scaled_reward": 0.0042181313037872314, "rewards/format_reward": 0.729166679084301, "step": 375 }, { "completion_length": 2365.6459045410156, "epoch": 0.4297142857142857, "grad_norm": 0.37569403648376465, "kl": 0.351654052734375, "learning_rate": 2.583460445215911e-07, "loss": 0.0279, "reward": 0.13466822169721127, "reward_std": 0.7210676558315754, "rewards/cosine_scaled_reward": -0.1441901307553053, "rewards/format_reward": 0.7291666753590107, "step": 376 }, { "completion_length": 3061.979278564453, "epoch": 0.4308571428571429, "grad_norm": 0.7083445191383362, "kl": 0.4854736328125, "learning_rate": 2.5596072820445254e-07, "loss": 0.0347, "reward": 0.34864536579698324, "reward_std": 1.0110743790864944, "rewards/cosine_scaled_reward": 0.04069389193318784, "rewards/format_reward": 0.6250000186264515, "step": 377 }, { "completion_length": 2377.3542709350586, "epoch": 0.432, "grad_norm": 0.49458223581314087, "kl": 0.311187744140625, "learning_rate": 2.5358974294659373e-07, "loss": 0.0331, "reward": 0.5616534340661019, "reward_std": 0.7573869004845619, "rewards/cosine_scaled_reward": 0.08385843969881535, "rewards/format_reward": 0.8541666939854622, "step": 378 }, { "completion_length": 2865.729248046875, "epoch": 0.43314285714285716, "grad_norm": 1.2933725118637085, "kl": 0.4803466796875, "learning_rate": 2.512332043064913e-07, "loss": 0.0196, "reward": 0.3152011390775442, "reward_std": 0.7045796066522598, "rewards/cosine_scaled_reward": -0.010164887178689241, "rewards/format_reward": 0.7083333507180214, "step": 379 }, { "completion_length": 2530.7708892822266, "epoch": 0.4342857142857143, "grad_norm": 0.6860670447349548, "kl": 0.32769775390625, "learning_rate": 2.488912271385139e-07, "loss": 0.0408, "reward": 0.2659956905990839, "reward_std": 0.727360412478447, "rewards/cosine_scaled_reward": -0.08539670892059803, "rewards/format_reward": 0.7916666753590107, "step": 380 }, { "completion_length": 2727.3333587646484, "epoch": 0.43542857142857144, "grad_norm": 0.8145273923873901, "kl": 0.439697265625, "learning_rate": 2.465639255873246e-07, "loss": 0.0334, "reward": 0.08977367201441666, "reward_std": 0.808791808784008, "rewards/cosine_scaled_reward": -0.1487837778404355, "rewards/format_reward": 0.6666666753590107, "step": 381 }, { "completion_length": 2468.104248046875, "epoch": 0.43657142857142855, "grad_norm": 0.42541611194610596, "kl": 0.37420654296875, "learning_rate": 2.4425141308231765e-07, "loss": 0.0623, "reward": 0.18184729665517807, "reward_std": 0.8058411814272404, "rewards/cosine_scaled_reward": -0.1598250768147409, "rewards/format_reward": 0.8125000111758709, "step": 382 }, { "completion_length": 2856.6458892822266, "epoch": 0.4377142857142857, "grad_norm": 0.6056004762649536, "kl": 0.4031982421875, "learning_rate": 2.4195380233209006e-07, "loss": 0.044, "reward": 0.46744489343836904, "reward_std": 0.9438849911093712, "rewards/cosine_scaled_reward": 0.08236491866409779, "rewards/format_reward": 0.7083333488553762, "step": 383 }, { "completion_length": 2167.7500762939453, "epoch": 0.43885714285714283, "grad_norm": 0.6828803420066833, "kl": 0.2847900390625, "learning_rate": 2.3967120531894857e-07, "loss": 0.0052, "reward": 0.8705862760543823, "reward_std": 0.8881346955895424, "rewards/cosine_scaled_reward": 0.32432539528235793, "rewards/format_reward": 0.7708333432674408, "step": 384 }, { "completion_length": 2416.9167098999023, "epoch": 0.44, "grad_norm": 0.6516284942626953, "kl": 0.360748291015625, "learning_rate": 2.374037332934512e-07, "loss": 0.0588, "reward": 0.24877717718482018, "reward_std": 0.735356081277132, "rewards/cosine_scaled_reward": -0.08876009099185467, "rewards/format_reward": 0.7708333432674408, "step": 385 }, { "completion_length": 2890.8959350585938, "epoch": 0.44114285714285717, "grad_norm": 0.6051266193389893, "kl": 0.4339599609375, "learning_rate": 2.3515149676898552e-07, "loss": 0.0433, "reward": 0.37795007787644863, "reward_std": 0.8708282820880413, "rewards/cosine_scaled_reward": 0.02761521004140377, "rewards/format_reward": 0.708333358168602, "step": 386 }, { "completion_length": 2727.416778564453, "epoch": 0.4422857142857143, "grad_norm": 0.4502924680709839, "kl": 0.392852783203125, "learning_rate": 2.3291460551638237e-07, "loss": 0.0409, "reward": 0.40206454193685204, "reward_std": 0.8078175410628319, "rewards/cosine_scaled_reward": 0.03367285244166851, "rewards/format_reward": 0.7291666716337204, "step": 387 }, { "completion_length": 2498.291732788086, "epoch": 0.44342857142857145, "grad_norm": 0.7455759048461914, "kl": 0.421722412109375, "learning_rate": 2.306931685585657e-07, "loss": 0.0377, "reward": 0.4461521580815315, "reward_std": 0.6339214891195297, "rewards/cosine_scaled_reward": 0.06488732434809208, "rewards/format_reward": 0.7500000074505806, "step": 388 }, { "completion_length": 2519.2500610351562, "epoch": 0.44457142857142856, "grad_norm": 0.3315029740333557, "kl": 0.347686767578125, "learning_rate": 2.2848729416523859e-07, "loss": 0.0373, "reward": 0.41832172160502523, "reward_std": 0.8497539162635803, "rewards/cosine_scaled_reward": 0.04175483621656895, "rewards/format_reward": 0.7291666846722364, "step": 389 }, { "completion_length": 2520.270881652832, "epoch": 0.44571428571428573, "grad_norm": 0.6555607914924622, "kl": 0.3785400390625, "learning_rate": 2.2629708984760706e-07, "loss": 0.0494, "reward": 0.36745146568864584, "reward_std": 0.7271608784794807, "rewards/cosine_scaled_reward": 0.005871989764273167, "rewards/format_reward": 0.7500000223517418, "step": 390 }, { "completion_length": 2606.7916870117188, "epoch": 0.44685714285714284, "grad_norm": 0.5517706871032715, "kl": 0.4510498046875, "learning_rate": 2.2412266235313973e-07, "loss": 0.0531, "reward": 0.30924548767507076, "reward_std": 0.7186228446662426, "rewards/cosine_scaled_reward": 0.02813367173075676, "rewards/format_reward": 0.6250000037252903, "step": 391 }, { "completion_length": 2368.2500762939453, "epoch": 0.448, "grad_norm": 1.1368578672409058, "kl": 0.367889404296875, "learning_rate": 2.2196411766036487e-07, "loss": 0.0712, "reward": 0.43708939105272293, "reward_std": 0.9549101106822491, "rewards/cosine_scaled_reward": -0.014195648953318596, "rewards/format_reward": 0.8541666865348816, "step": 392 }, { "completion_length": 2708.354217529297, "epoch": 0.4491428571428571, "grad_norm": 1.338770866394043, "kl": 0.411865234375, "learning_rate": 2.1982156097370557e-07, "loss": 0.0669, "reward": 0.5190371284261346, "reward_std": 1.0145683512091637, "rewards/cosine_scaled_reward": 0.11480109271360561, "rewards/format_reward": 0.708333358168602, "step": 393 }, { "completion_length": 3059.1250610351562, "epoch": 0.4502857142857143, "grad_norm": 0.698340117931366, "kl": 0.5703125, "learning_rate": 2.1769509671835223e-07, "loss": 0.0558, "reward": 0.08016422716900706, "reward_std": 0.9071974717080593, "rewards/cosine_scaled_reward": -0.10081264981999993, "rewards/format_reward": 0.541666679084301, "step": 394 }, { "completion_length": 2238.604232788086, "epoch": 0.4514285714285714, "grad_norm": 0.527318000793457, "kl": 0.3275146484375, "learning_rate": 2.1558482853517253e-07, "loss": 0.0224, "reward": 0.518818385200575, "reward_std": 0.812805999070406, "rewards/cosine_scaled_reward": 0.07729056139942259, "rewards/format_reward": 0.7916666902601719, "step": 395 }, { "completion_length": 2808.7084045410156, "epoch": 0.45257142857142857, "grad_norm": 1.1238527297973633, "kl": 0.48876953125, "learning_rate": 2.134908592756607e-07, "loss": 0.1152, "reward": 0.2978093853453174, "reward_std": 0.9074007868766785, "rewards/cosine_scaled_reward": -0.03470429126173258, "rewards/format_reward": 0.7083333507180214, "step": 396 }, { "completion_length": 2351.8126068115234, "epoch": 0.45371428571428574, "grad_norm": 0.3706618845462799, "kl": 0.246002197265625, "learning_rate": 2.1141329099692406e-07, "loss": 0.0129, "reward": 0.37504480965435505, "reward_std": 0.5897877439856529, "rewards/cosine_scaled_reward": -0.05014793388545513, "rewards/format_reward": 0.8750000149011612, "step": 397 }, { "completion_length": 2328.37508392334, "epoch": 0.45485714285714285, "grad_norm": 0.7165157198905945, "kl": 0.3564453125, "learning_rate": 2.0935222495670968e-07, "loss": 0.0288, "reward": 0.3573523070663214, "reward_std": 0.7765154354274273, "rewards/cosine_scaled_reward": 0.0030241278000175953, "rewards/format_reward": 0.729166679084301, "step": 398 }, { "completion_length": 2353.854202270508, "epoch": 0.456, "grad_norm": 1.2241944074630737, "kl": 0.3509521484375, "learning_rate": 2.0730776160846853e-07, "loss": 0.0858, "reward": 0.7341470178216696, "reward_std": 1.0539017170667648, "rewards/cosine_scaled_reward": 0.18186382483690977, "rewards/format_reward": 0.8541666939854622, "step": 399 }, { "completion_length": 1577.7917098999023, "epoch": 0.45714285714285713, "grad_norm": 1.081485390663147, "kl": 0.142333984375, "learning_rate": 2.0528000059645995e-07, "loss": -0.0309, "reward": 0.879038143903017, "reward_std": 0.9596639424562454, "rewards/cosine_scaled_reward": 0.2600486520677805, "rewards/format_reward": 0.8958333395421505, "step": 400 }, { "completion_length": 2888.479217529297, "epoch": 0.4582857142857143, "grad_norm": 0.948920726776123, "kl": 0.559173583984375, "learning_rate": 2.032690407508949e-07, "loss": 0.0508, "reward": 0.2996121197938919, "reward_std": 0.6879791766405106, "rewards/cosine_scaled_reward": -0.01863069087266922, "rewards/format_reward": 0.7083333488553762, "step": 401 }, { "completion_length": 2353.104217529297, "epoch": 0.4594285714285714, "grad_norm": 0.6571682095527649, "kl": 0.3966827392578125, "learning_rate": 2.0127498008311922e-07, "loss": 0.0259, "reward": 0.2608861066401005, "reward_std": 0.6378191784024239, "rewards/cosine_scaled_reward": -0.031162479892373085, "rewards/format_reward": 0.6875000167638063, "step": 402 }, { "completion_length": 2268.104232788086, "epoch": 0.4605714285714286, "grad_norm": 0.6540146470069885, "kl": 0.3348388671875, "learning_rate": 1.9929791578083655e-07, "loss": 0.0243, "reward": 0.3493347900584922, "reward_std": 0.7477662637829781, "rewards/cosine_scaled_reward": -0.0498207900673151, "rewards/format_reward": 0.8333333432674408, "step": 403 }, { "completion_length": 2252.0625610351562, "epoch": 0.4617142857142857, "grad_norm": 0.5845898985862732, "kl": 0.31449127197265625, "learning_rate": 1.9733794420337213e-07, "loss": 0.0223, "reward": 0.3203935632482171, "reward_std": 0.5589652694761753, "rewards/cosine_scaled_reward": -0.0724391471594572, "rewards/format_reward": 0.8541666716337204, "step": 404 }, { "completion_length": 2176.437557220459, "epoch": 0.46285714285714286, "grad_norm": 0.5805149078369141, "kl": 0.4361114501953125, "learning_rate": 1.9539516087697517e-07, "loss": 0.0716, "reward": 0.5914140390232205, "reward_std": 0.9176547713577747, "rewards/cosine_scaled_reward": 0.1673335493542254, "rewards/format_reward": 0.7083333469927311, "step": 405 }, { "completion_length": 2222.3125762939453, "epoch": 0.464, "grad_norm": 0.714565098285675, "kl": 0.348388671875, "learning_rate": 1.934696604901642e-07, "loss": 0.053, "reward": 0.5401954464614391, "reward_std": 0.9346184208989143, "rewards/cosine_scaled_reward": 0.03536188416182995, "rewards/format_reward": 0.8958333432674408, "step": 406 }, { "completion_length": 2257.3542404174805, "epoch": 0.46514285714285714, "grad_norm": 0.7260815501213074, "kl": 0.316375732421875, "learning_rate": 1.915615368891117e-07, "loss": 0.0606, "reward": 0.6055942573584616, "reward_std": 0.6878091357648373, "rewards/cosine_scaled_reward": 0.12408588983817026, "rewards/format_reward": 0.8333333507180214, "step": 407 }, { "completion_length": 2682.041717529297, "epoch": 0.4662857142857143, "grad_norm": 0.7148492336273193, "kl": 0.502105712890625, "learning_rate": 1.8967088307307e-07, "loss": 0.0441, "reward": 0.5293791117146611, "reward_std": 0.863475501537323, "rewards/cosine_scaled_reward": 0.14691544696688652, "rewards/format_reward": 0.6666666716337204, "step": 408 }, { "completion_length": 3076.0834045410156, "epoch": 0.4674285714285714, "grad_norm": 0.8249404430389404, "kl": 0.6727294921875, "learning_rate": 1.8779779118983867e-07, "loss": 0.0616, "reward": 0.002814117819070816, "reward_std": 0.8114083893597126, "rewards/cosine_scaled_reward": -0.10041821748018265, "rewards/format_reward": 0.4583333395421505, "step": 409 }, { "completion_length": 2465.000087738037, "epoch": 0.4685714285714286, "grad_norm": 0.6067022681236267, "kl": 0.5772857666015625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0528, "reward": 0.12211680319160223, "reward_std": 0.7772498056292534, "rewards/cosine_scaled_reward": -0.13637491036206484, "rewards/format_reward": 0.6875000093132257, "step": 410 }, { "completion_length": 2848.7708740234375, "epoch": 0.4697142857142857, "grad_norm": 0.5674929022789001, "kl": 0.5439453125, "learning_rate": 1.8410465752883758e-07, "loss": 0.0797, "reward": 0.23797017033211887, "reward_std": 0.8856572322547436, "rewards/cosine_scaled_reward": -0.07174723839852959, "rewards/format_reward": 0.7083333507180214, "step": 411 }, { "completion_length": 2702.9375610351562, "epoch": 0.47085714285714286, "grad_norm": 0.9189234375953674, "kl": 0.4500732421875, "learning_rate": 1.822847957491922e-07, "loss": 0.025, "reward": 0.47001307643949986, "reward_std": 0.9889923110604286, "rewards/cosine_scaled_reward": 0.04870818182826042, "rewards/format_reward": 0.770833358168602, "step": 412 }, { "completion_length": 2750.1250762939453, "epoch": 0.472, "grad_norm": 0.7404162287712097, "kl": 0.49786376953125, "learning_rate": 1.804828558898332e-07, "loss": 0.0398, "reward": 0.37627727701328695, "reward_std": 0.7015467956662178, "rewards/cosine_scaled_reward": -0.021446891129016876, "rewards/format_reward": 0.8125000298023224, "step": 413 }, { "completion_length": 3040.6459350585938, "epoch": 0.47314285714285714, "grad_norm": 1.531003475189209, "kl": 0.66845703125, "learning_rate": 1.7869892577476722e-07, "loss": 0.0372, "reward": -0.023312292993068695, "reward_std": 0.6657886579632759, "rewards/cosine_scaled_reward": -0.20822268491610885, "rewards/format_reward": 0.6458333507180214, "step": 414 }, { "completion_length": 3178.354248046875, "epoch": 0.4742857142857143, "grad_norm": 1.0874366760253906, "kl": 0.808349609375, "learning_rate": 1.7693309235023127e-07, "loss": 0.0745, "reward": -0.028731117257848382, "reward_std": 0.8625498786568642, "rewards/cosine_scaled_reward": -0.1594004575163126, "rewards/format_reward": 0.5208333395421505, "step": 415 }, { "completion_length": 2240.5625610351562, "epoch": 0.4754285714285714, "grad_norm": 1.3929944038391113, "kl": 0.28546142578125, "learning_rate": 1.7518544168045524e-07, "loss": 0.0775, "reward": 0.5215831075329334, "reward_std": 0.8785289078950882, "rewards/cosine_scaled_reward": 0.049687013030052185, "rewards/format_reward": 0.8541666939854622, "step": 416 }, { "completion_length": 3151.9375915527344, "epoch": 0.4765714285714286, "grad_norm": 0.6912859678268433, "kl": 0.676025390625, "learning_rate": 1.7345605894346726e-07, "loss": 0.0716, "reward": -0.13146568089723587, "reward_std": 0.7075865548104048, "rewards/cosine_scaled_reward": -0.1975945346057415, "rewards/format_reward": 0.47916668467223644, "step": 417 }, { "completion_length": 2078.3542289733887, "epoch": 0.4777142857142857, "grad_norm": 1.4748533964157104, "kl": 0.241912841796875, "learning_rate": 1.7174502842694212e-07, "loss": -0.0234, "reward": 0.6464557002764195, "reward_std": 0.8331911526620388, "rewards/cosine_scaled_reward": 0.1137741282582283, "rewards/format_reward": 0.8958333432674408, "step": 418 }, { "completion_length": 2601.4584197998047, "epoch": 0.47885714285714287, "grad_norm": 0.4618014395236969, "kl": 0.39984130859375, "learning_rate": 1.7005243352409333e-07, "loss": 0.0497, "reward": 0.6187925288686529, "reward_std": 0.9556396976113319, "rewards/cosine_scaled_reward": 0.1003096466884017, "rewards/format_reward": 0.8750000223517418, "step": 419 }, { "completion_length": 2392.33341217041, "epoch": 0.48, "grad_norm": 0.7824205160140991, "kl": 0.47357177734375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0699, "reward": 0.2953487314807717, "reward_std": 0.6610889099538326, "rewards/cosine_scaled_reward": -0.03334581479430199, "rewards/format_reward": 0.729166679084301, "step": 420 }, { "completion_length": 3098.0834045410156, "epoch": 0.48114285714285715, "grad_norm": 1.4201315641403198, "kl": 0.530029296875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0068, "reward": -0.07355257519520819, "reward_std": 0.6699985489249229, "rewards/cosine_scaled_reward": -0.20630090683698654, "rewards/format_reward": 0.583333345130086, "step": 421 }, { "completion_length": 2704.5833892822266, "epoch": 0.48228571428571426, "grad_norm": 0.31716275215148926, "kl": 0.3623046875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0337, "reward": 0.30903656780719757, "reward_std": 0.791595920920372, "rewards/cosine_scaled_reward": -0.028064551996067166, "rewards/format_reward": 0.7291666865348816, "step": 422 }, { "completion_length": 2378.3334197998047, "epoch": 0.48342857142857143, "grad_norm": 0.561937153339386, "kl": 0.29791259765625, "learning_rate": 1.6346804638120098e-07, "loss": -0.0015, "reward": 0.24677492817863822, "reward_std": 0.749111071228981, "rewards/cosine_scaled_reward": -0.10133864358067513, "rewards/format_reward": 0.7916666753590107, "step": 423 }, { "completion_length": 2921.854202270508, "epoch": 0.4845714285714286, "grad_norm": 0.8485268950462341, "kl": 0.461669921875, "learning_rate": 1.6186884885673413e-07, "loss": 0.0307, "reward": 0.025166813982650638, "reward_std": 0.8601982519030571, "rewards/cosine_scaled_reward": -0.17313844989985228, "rewards/format_reward": 0.6250000204890966, "step": 424 }, { "completion_length": 2087.3125762939453, "epoch": 0.4857142857142857, "grad_norm": 1.4564896821975708, "kl": 0.282867431640625, "learning_rate": 1.6028856829700258e-07, "loss": 0.0946, "reward": 0.7901451410725713, "reward_std": 1.1320747658610344, "rewards/cosine_scaled_reward": 0.2687780649284832, "rewards/format_reward": 0.7500000111758709, "step": 425 }, { "completion_length": 2330.562599182129, "epoch": 0.4868571428571429, "grad_norm": 0.6082496047019958, "kl": 0.316375732421875, "learning_rate": 1.5872728172265146e-07, "loss": 0.0174, "reward": 0.3098383769392967, "reward_std": 0.7229567170143127, "rewards/cosine_scaled_reward": -0.0652941414155066, "rewards/format_reward": 0.8125000186264515, "step": 426 }, { "completion_length": 2724.479217529297, "epoch": 0.488, "grad_norm": 0.6990248560905457, "kl": 0.45989990234375, "learning_rate": 1.5718506522858572e-07, "loss": 0.0519, "reward": 0.12407399946823716, "reward_std": 0.8184213675558567, "rewards/cosine_scaled_reward": -0.0571793733688537, "rewards/format_reward": 0.541666679084301, "step": 427 }, { "completion_length": 2934.5834045410156, "epoch": 0.48914285714285716, "grad_norm": 0.8835780620574951, "kl": 0.461181640625, "learning_rate": 1.5566199398026147e-07, "loss": 0.0673, "reward": 0.10110859386622906, "reward_std": 0.9170898050069809, "rewards/cosine_scaled_reward": -0.1494620693847537, "rewards/format_reward": 0.6666666902601719, "step": 428 }, { "completion_length": 2098.0208892822266, "epoch": 0.49028571428571427, "grad_norm": 0.4786301255226135, "kl": 0.250732421875, "learning_rate": 1.5415814221002265e-07, "loss": 0.0115, "reward": 0.21683326549828053, "reward_std": 0.8974504359066486, "rewards/cosine_scaled_reward": -0.14522514073178172, "rewards/format_reward": 0.8125000149011612, "step": 429 }, { "completion_length": 2277.187530517578, "epoch": 0.49142857142857144, "grad_norm": 0.34657105803489685, "kl": 0.2308349609375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0271, "reward": 0.20404995302669704, "reward_std": 0.6838134452700615, "rewards/cosine_scaled_reward": -0.08210572600364685, "rewards/format_reward": 0.7083333469927311, "step": 430 }, { "completion_length": 2592.3125915527344, "epoch": 0.49257142857142855, "grad_norm": 0.40773648023605347, "kl": 0.3669281005859375, "learning_rate": 1.5120838934595337e-07, "loss": 0.0348, "reward": -0.10748124029487371, "reward_std": 0.5141724869608879, "rewards/cosine_scaled_reward": -0.23057417757809162, "rewards/format_reward": 0.6041666697710752, "step": 431 }, { "completion_length": 2727.416748046875, "epoch": 0.4937142857142857, "grad_norm": 0.3802461326122284, "kl": 0.365966796875, "learning_rate": 1.4976263201891613e-07, "loss": 0.0608, "reward": 0.1675979122519493, "reward_std": 0.601953960955143, "rewards/cosine_scaled_reward": -0.07371543161571026, "rewards/format_reward": 0.6458333376795053, "step": 432 }, { "completion_length": 2820.9791870117188, "epoch": 0.4948571428571429, "grad_norm": 0.9824896454811096, "kl": 0.38507080078125, "learning_rate": 1.483363816965435e-07, "loss": 0.0532, "reward": 0.24698512954637408, "reward_std": 0.9584920853376389, "rewards/cosine_scaled_reward": -0.006244649179279804, "rewards/format_reward": 0.5833333507180214, "step": 433 }, { "completion_length": 2987.625045776367, "epoch": 0.496, "grad_norm": 1.5248559713363647, "kl": 0.41156005859375, "learning_rate": 1.469297078922642e-07, "loss": 0.0052, "reward": -0.08143354021012783, "reward_std": 0.48334776237607, "rewards/cosine_scaled_reward": -0.25419083051383495, "rewards/format_reward": 0.6875000037252903, "step": 434 }, { "completion_length": 2147.1250228881836, "epoch": 0.49714285714285716, "grad_norm": 0.3896443545818329, "kl": 0.2529144287109375, "learning_rate": 1.4554267916537495e-07, "loss": 0.0041, "reward": 0.21179522573947906, "reward_std": 0.6268462352454662, "rewards/cosine_scaled_reward": -0.12806864827871323, "rewards/format_reward": 0.8125000111758709, "step": 435 }, { "completion_length": 2244.6042251586914, "epoch": 0.4982857142857143, "grad_norm": 1.1815028190612793, "kl": 0.34857177734375, "learning_rate": 1.4417536311769885e-07, "loss": -0.0171, "reward": 0.31779580656439066, "reward_std": 0.8181377947330475, "rewards/cosine_scaled_reward": -0.02664483431726694, "rewards/format_reward": 0.7291666772216558, "step": 436 }, { "completion_length": 2895.5834045410156, "epoch": 0.49942857142857144, "grad_norm": 0.6824524402618408, "kl": 0.3638916015625, "learning_rate": 1.4282782639029128e-07, "loss": 0.053, "reward": 0.1753413761034608, "reward_std": 0.7372228689491749, "rewards/cosine_scaled_reward": -0.11190539970993996, "rewards/format_reward": 0.7291666828095913, "step": 437 }, { "completion_length": 2927.729232788086, "epoch": 0.5005714285714286, "grad_norm": 0.5597608685493469, "kl": 0.362335205078125, "learning_rate": 1.4150013466019114e-07, "loss": 0.0319, "reward": 0.1679457863792777, "reward_std": 0.7294919807463884, "rewards/cosine_scaled_reward": -0.09090281999669969, "rewards/format_reward": 0.666666679084301, "step": 438 }, { "completion_length": 2691.5833740234375, "epoch": 0.5017142857142857, "grad_norm": 0.8128771781921387, "kl": 0.296539306640625, "learning_rate": 1.4019235263722034e-07, "loss": 0.0088, "reward": -0.08755548892077059, "reward_std": 0.49066366255283356, "rewards/cosine_scaled_reward": -0.1950508914887905, "rewards/format_reward": 0.5625000074505806, "step": 439 }, { "completion_length": 3066.7292404174805, "epoch": 0.5028571428571429, "grad_norm": 1.3051230907440186, "kl": 0.438079833984375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0168, "reward": -0.09066830575466156, "reward_std": 0.5652249306440353, "rewards/cosine_scaled_reward": -0.1592588908970356, "rewards/format_reward": 0.4791666716337204, "step": 440 }, { "completion_length": 2851.2500610351562, "epoch": 0.504, "grad_norm": 0.5024367570877075, "kl": 0.3612060546875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0408, "reward": 0.41364969685673714, "reward_std": 0.8325144313275814, "rewards/cosine_scaled_reward": 0.01983257569372654, "rewards/format_reward": 0.7708333469927311, "step": 441 }, { "completion_length": 2756.0833892822266, "epoch": 0.5051428571428571, "grad_norm": 0.634526789188385, "kl": 0.311614990234375, "learning_rate": 1.3638909733514452e-07, "loss": 0.0151, "reward": 0.31202031567227095, "reward_std": 0.8749045357108116, "rewards/cosine_scaled_reward": -0.0003691236488521099, "rewards/format_reward": 0.666666679084301, "step": 442 }, { "completion_length": 3059.8125534057617, "epoch": 0.5062857142857143, "grad_norm": 0.5289911031723022, "kl": 0.453643798828125, "learning_rate": 1.351615817851748e-07, "loss": 0.0414, "reward": 0.029406734742224216, "reward_std": 0.8291610702872276, "rewards/cosine_scaled_reward": -0.1101669161580503, "rewards/format_reward": 0.500000013038516, "step": 443 }, { "completion_length": 2669.6876068115234, "epoch": 0.5074285714285715, "grad_norm": 1.0344839096069336, "kl": 0.39227294921875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0173, "reward": 0.18073289468884468, "reward_std": 0.8002647534012794, "rewards/cosine_scaled_reward": -0.05391998961567879, "rewards/format_reward": 0.6041666809469461, "step": 444 }, { "completion_length": 2744.3959045410156, "epoch": 0.5085714285714286, "grad_norm": 1.1817784309387207, "kl": 0.28948974609375, "learning_rate": 1.3276726544494571e-07, "loss": 0.0885, "reward": 0.130654014647007, "reward_std": 0.9245680868625641, "rewards/cosine_scaled_reward": -0.07616318017244339, "rewards/format_reward": 0.562500013038516, "step": 445 }, { "completion_length": 2821.354217529297, "epoch": 0.5097142857142857, "grad_norm": 0.5576800107955933, "kl": 0.2890625, "learning_rate": 1.316005813502869e-07, "loss": 0.0212, "reward": 0.15429426170885563, "reward_std": 0.7137273997068405, "rewards/cosine_scaled_reward": -0.09584356285631657, "rewards/format_reward": 0.6666666679084301, "step": 446 }, { "completion_length": 2405.583396911621, "epoch": 0.5108571428571429, "grad_norm": 0.211807519197464, "kl": 0.246002197265625, "learning_rate": 1.3045428945301953e-07, "loss": 0.0362, "reward": -0.02742259856313467, "reward_std": 0.662248931825161, "rewards/cosine_scaled_reward": -0.2382231242954731, "rewards/format_reward": 0.708333345130086, "step": 447 }, { "completion_length": 2420.5834045410156, "epoch": 0.512, "grad_norm": 0.43571776151657104, "kl": 0.217529296875, "learning_rate": 1.2932844562179352e-07, "loss": 0.0407, "reward": 0.2491408372297883, "reward_std": 0.7298646531999111, "rewards/cosine_scaled_reward": -0.05479799164459109, "rewards/format_reward": 0.7083333563059568, "step": 448 }, { "completion_length": 2219.5000610351562, "epoch": 0.5131428571428571, "grad_norm": 0.36097243428230286, "kl": 0.170135498046875, "learning_rate": 1.2822310472864885e-07, "loss": 0.0368, "reward": 0.2152203669102164, "reward_std": 0.7272625118494034, "rewards/cosine_scaled_reward": -0.09725791588425636, "rewards/format_reward": 0.750000013038516, "step": 449 }, { "completion_length": 2734.604217529297, "epoch": 0.5142857142857142, "grad_norm": 0.21459351480007172, "kl": 0.228668212890625, "learning_rate": 1.2713832064634125e-07, "loss": 0.0388, "reward": 0.31540712295100093, "reward_std": 0.6263695955276489, "rewards/cosine_scaled_reward": -0.024456078186631203, "rewards/format_reward": 0.7500000074505806, "step": 450 }, { "completion_length": 2182.0000610351562, "epoch": 0.5154285714285715, "grad_norm": 1.156898021697998, "kl": 0.1932373046875, "learning_rate": 1.260741462457165e-07, "loss": 0.0423, "reward": 0.4187639909796417, "reward_std": 0.828635673969984, "rewards/cosine_scaled_reward": -0.02044131373986602, "rewards/format_reward": 0.854166679084301, "step": 451 }, { "completion_length": 2950.0833892822266, "epoch": 0.5165714285714286, "grad_norm": 1.0129191875457764, "kl": 0.2197265625, "learning_rate": 1.2503063339313356e-07, "loss": 0.0467, "reward": 0.22497872763779014, "reward_std": 0.7840702906250954, "rewards/cosine_scaled_reward": -9.69860702753067e-05, "rewards/format_reward": 0.5625000111758709, "step": 452 }, { "completion_length": 2666.812545776367, "epoch": 0.5177142857142857, "grad_norm": 0.9255622029304504, "kl": 0.2947998046875, "learning_rate": 1.2400783294793668e-07, "loss": 0.0568, "reward": 0.23635170864872634, "reward_std": 0.9474616125226021, "rewards/cosine_scaled_reward": 0.014322125818580389, "rewards/format_reward": 0.5208333432674408, "step": 453 }, { "completion_length": 2574.166732788086, "epoch": 0.5188571428571429, "grad_norm": 0.32647454738616943, "kl": 0.2278289794921875, "learning_rate": 1.2300579475997657e-07, "loss": 0.0413, "reward": -0.001333402469754219, "reward_std": 0.6261133253574371, "rewards/cosine_scaled_reward": -0.1682842280715704, "rewards/format_reward": 0.6041666753590107, "step": 454 }, { "completion_length": 3117.4375915527344, "epoch": 0.52, "grad_norm": 0.5304046869277954, "kl": 0.443359375, "learning_rate": 1.220245676671809e-07, "loss": 0.0768, "reward": -0.15152850991580635, "reward_std": 0.6764501072466373, "rewards/cosine_scaled_reward": -0.2339008767157793, "rewards/format_reward": 0.5208333469927311, "step": 455 }, { "completion_length": 2992.0000610351562, "epoch": 0.5211428571428571, "grad_norm": 0.8478217720985413, "kl": 0.326171875, "learning_rate": 1.2106419949317388e-07, "loss": 0.0617, "reward": 0.046928441151976585, "reward_std": 0.8120874315500259, "rewards/cosine_scaled_reward": -0.0800070259720087, "rewards/format_reward": 0.479166679084301, "step": 456 }, { "completion_length": 2832.1875610351562, "epoch": 0.5222857142857142, "grad_norm": 0.8208078145980835, "kl": 0.300048828125, "learning_rate": 1.2012473704494537e-07, "loss": 0.0135, "reward": 0.3024125434458256, "reward_std": 0.7796786315739155, "rewards/cosine_scaled_reward": 0.018373452126979828, "rewards/format_reward": 0.6250000149011612, "step": 457 }, { "completion_length": 2559.0834045410156, "epoch": 0.5234285714285715, "grad_norm": 1.155639886856079, "kl": 0.3125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0601, "reward": -0.02342590008629486, "reward_std": 0.6464426964521408, "rewards/cosine_scaled_reward": -0.1930369706824422, "rewards/format_reward": 0.6250000167638063, "step": 458 }, { "completion_length": 2455.3125228881836, "epoch": 0.5245714285714286, "grad_norm": 0.5656007528305054, "kl": 0.265380859375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0203, "reward": 0.17724867910146713, "reward_std": 0.7673604302108288, "rewards/cosine_scaled_reward": -0.0773997250944376, "rewards/format_reward": 0.6458333358168602, "step": 459 }, { "completion_length": 2945.979248046875, "epoch": 0.5257142857142857, "grad_norm": 0.9485638737678528, "kl": 0.40087890625, "learning_rate": 1.1743223682775649e-07, "loss": 0.0091, "reward": -0.21174129098653793, "reward_std": 0.6470157653093338, "rewards/cosine_scaled_reward": -0.2377923596650362, "rewards/format_reward": 0.45833334140479565, "step": 460 }, { "completion_length": 2731.500045776367, "epoch": 0.5268571428571428, "grad_norm": 1.1951149702072144, "kl": 0.3209228515625, "learning_rate": 1.1657684494105386e-07, "loss": 0.0507, "reward": 0.395471319090575, "reward_std": 1.0323380753397942, "rewards/cosine_scaled_reward": 0.06153442489448935, "rewards/format_reward": 0.6250000204890966, "step": 461 }, { "completion_length": 2310.6875610351562, "epoch": 0.528, "grad_norm": 0.764068603515625, "kl": 0.26934814453125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0606, "reward": 0.05179802142083645, "reward_std": 0.5869512222707272, "rewards/cosine_scaled_reward": -0.16689125541597605, "rewards/format_reward": 0.6875000149011612, "step": 462 }, { "completion_length": 2988.8959197998047, "epoch": 0.5291428571428571, "grad_norm": 0.6601057052612305, "kl": 0.36944580078125, "learning_rate": 1.1492947512799328e-07, "loss": 0.0391, "reward": 0.3766809217631817, "reward_std": 1.0442971400916576, "rewards/cosine_scaled_reward": 0.0020326152443885803, "rewards/format_reward": 0.7291666902601719, "step": 463 }, { "completion_length": 2086.9584045410156, "epoch": 0.5302857142857142, "grad_norm": 0.5062448382377625, "kl": 0.2507781982421875, "learning_rate": 1.1413757749211602e-07, "loss": 0.0197, "reward": 0.4759128368459642, "reward_std": 0.610221728682518, "rewards/cosine_scaled_reward": 0.0023420676589012146, "rewards/format_reward": 0.916666679084301, "step": 464 }, { "completion_length": 2885.041732788086, "epoch": 0.5314285714285715, "grad_norm": 0.7739923596382141, "kl": 0.41680908203125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0037, "reward": 0.2254231304395944, "reward_std": 0.9453909136354923, "rewards/cosine_scaled_reward": -0.08590294234454632, "rewards/format_reward": 0.7083333563059568, "step": 465 }, { "completion_length": 2756.520896911621, "epoch": 0.5325714285714286, "grad_norm": 1.5458998680114746, "kl": 0.32745361328125, "learning_rate": 1.1261754973965422e-07, "loss": 0.1051, "reward": 0.3231991082429886, "reward_std": 0.8955418467521667, "rewards/cosine_scaled_reward": 0.024495400488376617, "rewards/format_reward": 0.625000013038516, "step": 466 }, { "completion_length": 2878.729248046875, "epoch": 0.5337142857142857, "grad_norm": 0.46346038579940796, "kl": 0.31219482421875, "learning_rate": 1.1188949370707787e-07, "loss": 0.0296, "reward": 0.08808497712016106, "reward_std": 0.6584981977939606, "rewards/cosine_scaled_reward": -0.19319903245195746, "rewards/format_reward": 0.7708333432674408, "step": 467 }, { "completion_length": 2945.479248046875, "epoch": 0.5348571428571428, "grad_norm": 0.5377303957939148, "kl": 0.395263671875, "learning_rate": 1.1118279056249653e-07, "loss": 0.0565, "reward": 0.3272525854408741, "reward_std": 0.7806177064776421, "rewards/cosine_scaled_reward": 0.046021029353141785, "rewards/format_reward": 0.6041666772216558, "step": 468 }, { "completion_length": 2482.9167251586914, "epoch": 0.536, "grad_norm": 0.6872317790985107, "kl": 0.414398193359375, "learning_rate": 1.1049747474962444e-07, "loss": 0.05, "reward": 0.3257849495857954, "reward_std": 0.8201456405222416, "rewards/cosine_scaled_reward": -0.05236888420768082, "rewards/format_reward": 0.791666679084301, "step": 469 }, { "completion_length": 3126.6458892822266, "epoch": 0.5371428571428571, "grad_norm": 1.1381498575210571, "kl": 0.4842529296875, "learning_rate": 1.0983357966978745e-07, "loss": 0.0271, "reward": 0.055752304033376276, "reward_std": 0.8491252809762955, "rewards/cosine_scaled_reward": -0.12250774865970016, "rewards/format_reward": 0.5625000111758709, "step": 470 }, { "completion_length": 2754.291778564453, "epoch": 0.5382857142857143, "grad_norm": 0.48720309138298035, "kl": 0.3826904296875, "learning_rate": 1.0919113768029517e-07, "loss": 0.0524, "reward": 0.5262258416041732, "reward_std": 0.9357166737318039, "rewards/cosine_scaled_reward": 0.09029024560004473, "rewards/format_reward": 0.7708333544433117, "step": 471 }, { "completion_length": 2644.812545776367, "epoch": 0.5394285714285715, "grad_norm": 0.3475642800331116, "kl": 0.34210205078125, "learning_rate": 1.0857018009286381e-07, "loss": 0.0365, "reward": 0.23837368440581486, "reward_std": 0.8128412887454033, "rewards/cosine_scaled_reward": -0.07736600749194622, "rewards/format_reward": 0.729166679084301, "step": 472 }, { "completion_length": 2967.229232788086, "epoch": 0.5405714285714286, "grad_norm": 0.6972779631614685, "kl": 0.3824462890625, "learning_rate": 1.0797073717209013e-07, "loss": 0.0307, "reward": 0.10205891542136669, "reward_std": 0.7758991979062557, "rewards/cosine_scaled_reward": -0.09507068432867527, "rewards/format_reward": 0.5833333432674408, "step": 473 }, { "completion_length": 2673.6875610351562, "epoch": 0.5417142857142857, "grad_norm": 0.45455074310302734, "kl": 0.395111083984375, "learning_rate": 1.0739283813397639e-07, "loss": 0.039, "reward": 0.46767894667573273, "reward_std": 0.8645204231142998, "rewards/cosine_scaled_reward": 0.09637415563338436, "rewards/format_reward": 0.687500013038516, "step": 474 }, { "completion_length": 2484.291717529297, "epoch": 0.5428571428571428, "grad_norm": 0.712739109992981, "kl": 0.348419189453125, "learning_rate": 1.068365111445064e-07, "loss": 0.0527, "reward": 0.3686076030135155, "reward_std": 0.916143324226141, "rewards/cosine_scaled_reward": 0.012207330204546452, "rewards/format_reward": 0.7083333469927311, "step": 475 }, { "completion_length": 3011.8959045410156, "epoch": 0.544, "grad_norm": 1.0881688594818115, "kl": 0.4395751953125, "learning_rate": 1.063017833182728e-07, "loss": 0.0524, "reward": 0.3218140173703432, "reward_std": 1.0662804022431374, "rewards/cosine_scaled_reward": -0.025444235419854522, "rewards/format_reward": 0.7083333507180214, "step": 476 }, { "completion_length": 2656.3751068115234, "epoch": 0.5451428571428572, "grad_norm": 2.0791175365448, "kl": 0.4376220703125, "learning_rate": 1.0578868071715544e-07, "loss": 0.0965, "reward": 0.2924130540341139, "reward_std": 0.976725772023201, "rewards/cosine_scaled_reward": 0.013521750457584858, "rewards/format_reward": 0.6041666865348816, "step": 477 }, { "completion_length": 2819.812545776367, "epoch": 0.5462857142857143, "grad_norm": 0.5398616194725037, "kl": 0.363677978515625, "learning_rate": 1.0529722834905125e-07, "loss": 0.0443, "reward": -0.004572154954075813, "reward_std": 0.6425527259707451, "rewards/cosine_scaled_reward": -0.13902169838547707, "rewards/format_reward": 0.5416666753590107, "step": 478 }, { "completion_length": 2858.562545776367, "epoch": 0.5474285714285714, "grad_norm": 0.6435410976409912, "kl": 0.378143310546875, "learning_rate": 1.0482745016665526e-07, "loss": 0.0261, "reward": 0.18186998274177313, "reward_std": 0.8897733464837074, "rewards/cosine_scaled_reward": -0.12430705223232508, "rewards/format_reward": 0.7291666846722364, "step": 479 }, { "completion_length": 2607.62508392334, "epoch": 0.5485714285714286, "grad_norm": 0.8115129470825195, "kl": 0.40704345703125, "learning_rate": 1.0437936906629334e-07, "loss": 0.043, "reward": 0.002752909902483225, "reward_std": 0.6639882102608681, "rewards/cosine_scaled_reward": -0.1971071765292436, "rewards/format_reward": 0.6666666716337204, "step": 480 }, { "completion_length": 3112.166748046875, "epoch": 0.5497142857142857, "grad_norm": 0.5421264171600342, "kl": 0.4429931640625, "learning_rate": 1.0395300688680625e-07, "loss": 0.0528, "reward": 0.04194536246359348, "reward_std": 0.6818186715245247, "rewards/cosine_scaled_reward": -0.13939355686306953, "rewards/format_reward": 0.6041666902601719, "step": 481 }, { "completion_length": 2706.7084045410156, "epoch": 0.5508571428571428, "grad_norm": 0.4543374478816986, "kl": 0.368194580078125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0506, "reward": 0.4122487809509039, "reward_std": 0.7574451714754105, "rewards/cosine_scaled_reward": 0.056680090725421906, "rewards/format_reward": 0.7083333395421505, "step": 482 }, { "completion_length": 2878.229217529297, "epoch": 0.552, "grad_norm": 0.6022698879241943, "kl": 0.40606689453125, "learning_rate": 1.0316552135205837e-07, "loss": 0.0356, "reward": -0.09259207546710968, "reward_std": 0.6557772308588028, "rewards/cosine_scaled_reward": -0.19897521962411702, "rewards/format_reward": 0.5416666772216558, "step": 483 }, { "completion_length": 2701.9375610351562, "epoch": 0.5531428571428572, "grad_norm": 1.3102695941925049, "kl": 0.356781005859375, "learning_rate": 1.0280443637773163e-07, "loss": 0.0726, "reward": 0.3319150470197201, "reward_std": 1.0350141674280167, "rewards/cosine_scaled_reward": -0.02834635879844427, "rewards/format_reward": 0.729166679084301, "step": 484 }, { "completion_length": 2762.916717529297, "epoch": 0.5542857142857143, "grad_norm": 0.8539823293685913, "kl": 0.47900390625, "learning_rate": 1.0246514708427701e-07, "loss": 0.08, "reward": 0.18864674912765622, "reward_std": 0.7356266789138317, "rewards/cosine_scaled_reward": -0.11939917271956801, "rewards/format_reward": 0.7500000204890966, "step": 485 }, { "completion_length": 2239.6667404174805, "epoch": 0.5554285714285714, "grad_norm": 0.3320426940917969, "kl": 0.263824462890625, "learning_rate": 1.0214767000817596e-07, "loss": 0.0167, "reward": 0.2528488418611232, "reward_std": 0.7695866264402866, "rewards/cosine_scaled_reward": -0.10691238380968571, "rewards/format_reward": 0.812500013038516, "step": 486 }, { "completion_length": 2169.6666984558105, "epoch": 0.5565714285714286, "grad_norm": 0.562497615814209, "kl": 0.274017333984375, "learning_rate": 1.0185202062281336e-07, "loss": 0.0039, "reward": 0.44721357547678053, "reward_std": 0.8235045485198498, "rewards/cosine_scaled_reward": 0.010554181411862373, "rewards/format_reward": 0.8333333395421505, "step": 487 }, { "completion_length": 2426.8125534057617, "epoch": 0.5577142857142857, "grad_norm": 0.2765026092529297, "kl": 0.3238372802734375, "learning_rate": 1.0157821333772304e-07, "loss": 0.0469, "reward": 0.08554835570976138, "reward_std": 0.5648987516760826, "rewards/cosine_scaled_reward": -0.19786089658737183, "rewards/format_reward": 0.7916666753590107, "step": 488 }, { "completion_length": 3204.875030517578, "epoch": 0.5588571428571428, "grad_norm": 1.2936737537384033, "kl": 0.4969482421875, "learning_rate": 1.013262614978859e-07, "loss": 0.0295, "reward": -0.17606779746711254, "reward_std": 0.6055882386863232, "rewards/cosine_scaled_reward": -0.21996455593034625, "rewards/format_reward": 0.47916668094694614, "step": 489 }, { "completion_length": 2257.1458892822266, "epoch": 0.56, "grad_norm": 0.4323062598705292, "kl": 0.3099365234375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0338, "reward": 0.3065086267888546, "reward_std": 0.7493411414325237, "rewards/cosine_scaled_reward": -0.061892845667898655, "rewards/format_reward": 0.7916666734963655, "step": 490 }, { "completion_length": 2736.312545776367, "epoch": 0.5611428571428572, "grad_norm": 0.5150272846221924, "kl": 0.3674163818359375, "learning_rate": 1.0088797220727779e-07, "loss": 0.0312, "reward": 0.5249021016061306, "reward_std": 1.010607898235321, "rewards/cosine_scaled_reward": 0.1508561042137444, "rewards/format_reward": 0.6250000204890966, "step": 491 }, { "completion_length": 2490.8958740234375, "epoch": 0.5622857142857143, "grad_norm": 1.4419853687286377, "kl": 0.42706298828125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0032, "reward": 0.1767475767992437, "reward_std": 0.777750164270401, "rewards/cosine_scaled_reward": -0.07436079788021743, "rewards/format_reward": 0.6458333469927311, "step": 492 }, { "completion_length": 2550.9375762939453, "epoch": 0.5634285714285714, "grad_norm": 0.4972594380378723, "kl": 0.450469970703125, "learning_rate": 1.005372381963547e-07, "loss": 0.0431, "reward": 0.6753305066376925, "reward_std": 0.8794010616838932, "rewards/cosine_scaled_reward": 0.15572454407811165, "rewards/format_reward": 0.8541666716337204, "step": 493 }, { "completion_length": 2323.479232788086, "epoch": 0.5645714285714286, "grad_norm": 0.5891183614730835, "kl": 0.3460693359375, "learning_rate": 1.0039472645551372e-07, "loss": 0.0271, "reward": 0.6445839628577232, "reward_std": 1.0191630199551582, "rewards/cosine_scaled_reward": 0.12392374624687363, "rewards/format_reward": 0.8541666753590107, "step": 494 }, { "completion_length": 2892.916717529297, "epoch": 0.5657142857142857, "grad_norm": 0.4543853998184204, "kl": 0.45623779296875, "learning_rate": 1.002741278414069e-07, "loss": 0.0451, "reward": 0.18997809663414955, "reward_std": 0.8969040662050247, "rewards/cosine_scaled_reward": -0.05571722239255905, "rewards/format_reward": 0.6041666828095913, "step": 495 }, { "completion_length": 2114.500045776367, "epoch": 0.5668571428571428, "grad_norm": 1.2619980573654175, "kl": 0.32281494140625, "learning_rate": 1.0017544823184055e-07, "loss": 0.0048, "reward": 0.488508015871048, "reward_std": 0.7361417338252068, "rewards/cosine_scaled_reward": 0.10424756724387407, "rewards/format_reward": 0.7083333358168602, "step": 496 }, { "completion_length": 2322.916679382324, "epoch": 0.568, "grad_norm": 0.9984971284866333, "kl": 0.2513885498046875, "learning_rate": 1.0009869243631952e-07, "loss": -0.0203, "reward": 0.6254424216458574, "reward_std": 0.7260546460747719, "rewards/cosine_scaled_reward": 0.1703429389744997, "rewards/format_reward": 0.7708333488553762, "step": 497 }, { "completion_length": 2773.166748046875, "epoch": 0.5691428571428572, "grad_norm": 0.5907127857208252, "kl": 0.4683837890625, "learning_rate": 1.000438641958131e-07, "loss": 0.0642, "reward": 0.36813890002667904, "reward_std": 0.9811634942889214, "rewards/cosine_scaled_reward": -0.0031640082597732544, "rewards/format_reward": 0.7291666828095913, "step": 498 }, { "completion_length": 2583.5834350585938, "epoch": 0.5702857142857143, "grad_norm": 0.35024699568748474, "kl": 0.358978271484375, "learning_rate": 1.0001096618257236e-07, "loss": 0.0401, "reward": 0.21864662691950798, "reward_std": 0.8943019956350327, "rewards/cosine_scaled_reward": -0.07930864673107862, "rewards/format_reward": 0.6875000037252903, "step": 499 }, { "completion_length": 3055.6666870117188, "epoch": 0.5714285714285714, "grad_norm": 1.0981477499008179, "kl": 0.4345703125, "learning_rate": 1e-07, "loss": 0.023, "reward": 0.14122199080884457, "reward_std": 0.8392615914344788, "rewards/cosine_scaled_reward": -0.031122979940846562, "rewards/format_reward": 0.5000000055879354, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.038267850877775345, "train_runtime": 72125.5591, "train_samples_per_second": 0.333, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }