| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5714285714285714, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 2571.2083587646484, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.19510559737682343, |
| "kl": 0.0, |
| "learning_rate": 2e-08, |
| "loss": 0.0, |
| "reward": 0.4897647276520729, |
| "reward_std": 0.8290339335799217, |
| "rewards/cosine_scaled_reward": -0.015534311532974243, |
| "rewards/format_reward": 0.5208333488553762, |
| "step": 1 |
| }, |
| { |
| "completion_length": 2804.395881652832, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.18415163457393646, |
| "kl": 0.0, |
| "learning_rate": 4e-08, |
| "loss": 0.0, |
| "reward": 0.27539755403995514, |
| "reward_std": 0.42092563211917877, |
| "rewards/cosine_scaled_reward": -0.04980122856795788, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 2 |
| }, |
| { |
| "completion_length": 3361.1458435058594, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.16567473113536835, |
| "kl": 4.006922245025635e-05, |
| "learning_rate": 6e-08, |
| "loss": 0.0, |
| "reward": -0.23245980869978666, |
| "reward_std": 0.5928730629384518, |
| "rewards/cosine_scaled_reward": -0.17872990405885503, |
| "rewards/format_reward": 0.1250000037252903, |
| "step": 3 |
| }, |
| { |
| "completion_length": 2153.729202270508, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.25097447633743286, |
| "kl": 4.071742296218872e-05, |
| "learning_rate": 8e-08, |
| "loss": 0.0, |
| "reward": 0.3372869056183845, |
| "reward_std": 0.6931154392659664, |
| "rewards/cosine_scaled_reward": -0.14385656313970685, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 4 |
| }, |
| { |
| "completion_length": 3400.375030517578, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.224583700299263, |
| "kl": 4.339590668678284e-05, |
| "learning_rate": 1e-07, |
| "loss": 0.0, |
| "reward": -0.30452448688447475, |
| "reward_std": 0.5627153031527996, |
| "rewards/cosine_scaled_reward": -0.24601224437355995, |
| "rewards/format_reward": 0.18750000558793545, |
| "step": 5 |
| }, |
| { |
| "completion_length": 3246.2291717529297, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.17905186116695404, |
| "kl": 4.048645496368408e-05, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "reward": -0.04279324598610401, |
| "reward_std": 0.5993511825799942, |
| "rewards/cosine_scaled_reward": -0.18806329306971747, |
| "rewards/format_reward": 0.3333333469927311, |
| "step": 6 |
| }, |
| { |
| "completion_length": 2938.0209197998047, |
| "epoch": 0.008, |
| "grad_norm": 0.19757460057735443, |
| "kl": 2.4452805519104004e-05, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0, |
| "reward": 0.2690338185057044, |
| "reward_std": 0.7015659939497709, |
| "rewards/cosine_scaled_reward": -0.11548309866338968, |
| "rewards/format_reward": 0.5000000111758709, |
| "step": 7 |
| }, |
| { |
| "completion_length": 2751.770866394043, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.1658889204263687, |
| "kl": 1.728162169456482e-05, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "reward": 0.6224091164767742, |
| "reward_std": 0.7972168251872063, |
| "rewards/cosine_scaled_reward": 0.09245455078780651, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 8 |
| }, |
| { |
| "completion_length": 3031.7084045410156, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.2120562642812729, |
| "kl": 3.610551357269287e-05, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0, |
| "reward": 0.18008227972313762, |
| "reward_std": 0.8001800198107958, |
| "rewards/cosine_scaled_reward": -0.10787552827969193, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 9 |
| }, |
| { |
| "completion_length": 2753.208366394043, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.21144917607307434, |
| "kl": 3.1989067792892456e-05, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "reward": 0.08000842481851578, |
| "reward_std": 0.7203316055238247, |
| "rewards/cosine_scaled_reward": -0.14749579317867756, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 10 |
| }, |
| { |
| "completion_length": 3293.2708435058594, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.1650972217321396, |
| "kl": 3.8489699363708496e-05, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0, |
| "reward": -0.41089826077222824, |
| "reward_std": 0.4038824327290058, |
| "rewards/cosine_scaled_reward": -0.2783657982945442, |
| "rewards/format_reward": 0.1458333395421505, |
| "step": 11 |
| }, |
| { |
| "completion_length": 2683.5000915527344, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.19341549277305603, |
| "kl": 4.114210605621338e-05, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0, |
| "reward": 0.43104756623506546, |
| "reward_std": 0.5792003609240055, |
| "rewards/cosine_scaled_reward": -0.11780955828726292, |
| "rewards/format_reward": 0.6666666828095913, |
| "step": 12 |
| }, |
| { |
| "completion_length": 2886.541702270508, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.22984430193901062, |
| "kl": 3.719329833984375e-05, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "reward": 0.23748547211289406, |
| "reward_std": 0.7873078212141991, |
| "rewards/cosine_scaled_reward": -0.06875726429279894, |
| "rewards/format_reward": 0.3750000149011612, |
| "step": 13 |
| }, |
| { |
| "completion_length": 2877.312530517578, |
| "epoch": 0.016, |
| "grad_norm": 0.2078430950641632, |
| "kl": 2.997368574142456e-05, |
| "learning_rate": 2.8e-07, |
| "loss": 0.0, |
| "reward": 0.14176954282447696, |
| "reward_std": 0.7100466191768646, |
| "rewards/cosine_scaled_reward": -0.11661523208022118, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 14 |
| }, |
| { |
| "completion_length": 2681.437511444092, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.20677024126052856, |
| "kl": 2.692500129342079e-05, |
| "learning_rate": 3e-07, |
| "loss": 0.0, |
| "reward": 0.5315095772966743, |
| "reward_std": 0.6707141287624836, |
| "rewards/cosine_scaled_reward": 0.04700478911399841, |
| "rewards/format_reward": 0.43750000558793545, |
| "step": 15 |
| }, |
| { |
| "completion_length": 3505.2708435058594, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.19309639930725098, |
| "kl": 3.673136234283447e-05, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0, |
| "reward": -0.2943936800584197, |
| "reward_std": 0.6762014180421829, |
| "rewards/cosine_scaled_reward": -0.19928016886115074, |
| "rewards/format_reward": 0.10416666977107525, |
| "step": 16 |
| }, |
| { |
| "completion_length": 2520.7916984558105, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.2713626027107239, |
| "kl": 3.7260353565216064e-05, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "reward": 0.29364213347435, |
| "reward_std": 0.7413498945534229, |
| "rewards/cosine_scaled_reward": -0.09276227621012367, |
| "rewards/format_reward": 0.4791666828095913, |
| "step": 17 |
| }, |
| { |
| "completion_length": 2989.500030517578, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.14791515469551086, |
| "kl": 2.292729914188385e-05, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "reward": 0.22529255971312523, |
| "reward_std": 0.5643694922327995, |
| "rewards/cosine_scaled_reward": -0.08527039736509323, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 18 |
| }, |
| { |
| "completion_length": 2874.250030517578, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.18149082362651825, |
| "kl": 2.5779008865356445e-05, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0, |
| "reward": 0.6877359300851822, |
| "reward_std": 0.7566238529980183, |
| "rewards/cosine_scaled_reward": 0.1355345994234085, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 19 |
| }, |
| { |
| "completion_length": 2473.520854949951, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.20354878902435303, |
| "kl": 1.685088500380516e-05, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "reward": 0.5272735962644219, |
| "reward_std": 0.7334154956042767, |
| "rewards/cosine_scaled_reward": -0.059279868844896555, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 20 |
| }, |
| { |
| "completion_length": 2523.520835876465, |
| "epoch": 0.024, |
| "grad_norm": 0.26596662402153015, |
| "kl": 4.191696643829346e-05, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "reward": 0.5646272003650665, |
| "reward_std": 0.5900244954973459, |
| "rewards/cosine_scaled_reward": 0.04273026343435049, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 21 |
| }, |
| { |
| "completion_length": 2035.3542251586914, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.2960587441921234, |
| "kl": 3.464333713054657e-05, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "reward": 0.637257520109415, |
| "reward_std": 0.8020393662154675, |
| "rewards/cosine_scaled_reward": -0.03553791396552697, |
| "rewards/format_reward": 0.7083333488553762, |
| "step": 22 |
| }, |
| { |
| "completion_length": 2664.3125610351562, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.21004951000213623, |
| "kl": 3.3404678106307983e-05, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0, |
| "reward": 0.22290698066353798, |
| "reward_std": 0.7860734751448035, |
| "rewards/cosine_scaled_reward": -0.0968798566609621, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 23 |
| }, |
| { |
| "completion_length": 2712.6875610351562, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.2664627432823181, |
| "kl": 2.100318670272827e-05, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0, |
| "reward": 0.7516965055838227, |
| "reward_std": 1.1073086112737656, |
| "rewards/cosine_scaled_reward": 0.07376489660236984, |
| "rewards/format_reward": 0.6041666865348816, |
| "step": 24 |
| }, |
| { |
| "completion_length": 2708.1458740234375, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.22358901798725128, |
| "kl": 3.844499588012695e-05, |
| "learning_rate": 5e-07, |
| "loss": 0.0, |
| "reward": 0.1855073875049129, |
| "reward_std": 0.6800910420715809, |
| "rewards/cosine_scaled_reward": -0.08432964608073235, |
| "rewards/format_reward": 0.3541666753590107, |
| "step": 25 |
| }, |
| { |
| "completion_length": 3107.541717529297, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.1617579460144043, |
| "kl": 2.820044755935669e-05, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0, |
| "reward": 0.587481252849102, |
| "reward_std": 0.693414282053709, |
| "rewards/cosine_scaled_reward": 0.07499059912515804, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 26 |
| }, |
| { |
| "completion_length": 2998.666702270508, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.18699905276298523, |
| "kl": 2.479786053299904e-05, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0, |
| "reward": 0.12037789449095726, |
| "reward_std": 0.8231884613633156, |
| "rewards/cosine_scaled_reward": -0.12731105368584394, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 27 |
| }, |
| { |
| "completion_length": 2880.4375534057617, |
| "epoch": 0.032, |
| "grad_norm": 0.1849849373102188, |
| "kl": 2.5503337383270264e-05, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0, |
| "reward": 0.38755445554852486, |
| "reward_std": 0.8283951133489609, |
| "rewards/cosine_scaled_reward": -0.004139451950322837, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 28 |
| }, |
| { |
| "completion_length": 3422.604217529297, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.17641501128673553, |
| "kl": 2.0990148186683655e-05, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0, |
| "reward": -0.2620885446667671, |
| "reward_std": 0.563602440059185, |
| "rewards/cosine_scaled_reward": -0.20396093931049109, |
| "rewards/format_reward": 0.14583333767950535, |
| "step": 29 |
| }, |
| { |
| "completion_length": 2927.229217529297, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.20999222993850708, |
| "kl": 3.0543655157089233e-05, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "reward": 0.3440352368634194, |
| "reward_std": 0.9646327681839466, |
| "rewards/cosine_scaled_reward": -0.0467323949560523, |
| "rewards/format_reward": 0.43750001676380634, |
| "step": 30 |
| }, |
| { |
| "completion_length": 2953.437530517578, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.18049225211143494, |
| "kl": 4.493445158004761e-05, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0, |
| "reward": 0.20754884742200375, |
| "reward_std": 0.8056708425283432, |
| "rewards/cosine_scaled_reward": -0.08372558187693357, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 31 |
| }, |
| { |
| "completion_length": 3194.937545776367, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.15885497629642487, |
| "kl": 2.6959925889968872e-05, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0, |
| "reward": 0.04864098597317934, |
| "reward_std": 0.6213513053953648, |
| "rewards/cosine_scaled_reward": -0.12151284422725439, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 32 |
| }, |
| { |
| "completion_length": 3312.2291870117188, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.15803882479667664, |
| "kl": 4.449114203453064e-05, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0, |
| "reward": 0.20429637841880322, |
| "reward_std": 0.770494450815022, |
| "rewards/cosine_scaled_reward": -0.03326847730204463, |
| "rewards/format_reward": 0.27083334140479565, |
| "step": 33 |
| }, |
| { |
| "completion_length": 2483.666702270508, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.22648921608924866, |
| "kl": 7.438473403453827e-05, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0, |
| "reward": 0.7040222510695457, |
| "reward_std": 0.8860063031315804, |
| "rewards/cosine_scaled_reward": 0.07076112227514386, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 34 |
| }, |
| { |
| "completion_length": 2937.750015258789, |
| "epoch": 0.04, |
| "grad_norm": 0.20356310904026031, |
| "kl": 4.690885543823242e-05, |
| "learning_rate": 7e-07, |
| "loss": 0.0, |
| "reward": 0.31086206063628197, |
| "reward_std": 0.8583472333848476, |
| "rewards/cosine_scaled_reward": -0.032068971544504166, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 35 |
| }, |
| { |
| "completion_length": 3399.6458435058594, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.17643427848815918, |
| "kl": 4.9777328968048096e-05, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0, |
| "reward": -0.3302987515926361, |
| "reward_std": 0.4607627494260669, |
| "rewards/cosine_scaled_reward": -0.22764937952160835, |
| "rewards/format_reward": 0.1250000037252903, |
| "step": 36 |
| }, |
| { |
| "completion_length": 3218.916702270508, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.16882504522800446, |
| "kl": 3.8273632526397705e-05, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0, |
| "reward": -0.16163601353764534, |
| "reward_std": 0.4668765738606453, |
| "rewards/cosine_scaled_reward": -0.21623468212783337, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 37 |
| }, |
| { |
| "completion_length": 3279.5416870117188, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.15908610820770264, |
| "kl": 4.918128252029419e-05, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0, |
| "reward": -0.20695911906659603, |
| "reward_std": 0.5721107684075832, |
| "rewards/cosine_scaled_reward": -0.176396232098341, |
| "rewards/format_reward": 0.1458333395421505, |
| "step": 38 |
| }, |
| { |
| "completion_length": 2879.375045776367, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.2147342413663864, |
| "kl": 7.034093141555786e-05, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0, |
| "reward": 0.29712690226733685, |
| "reward_std": 0.6135409390553832, |
| "rewards/cosine_scaled_reward": -0.04935322143137455, |
| "rewards/format_reward": 0.3958333432674408, |
| "step": 39 |
| }, |
| { |
| "completion_length": 2469.687530517578, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.19995440542697906, |
| "kl": 0.00012910738587379456, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "reward": 0.2901697149500251, |
| "reward_std": 0.5493389200419188, |
| "rewards/cosine_scaled_reward": -0.13616514671593904, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 40 |
| }, |
| { |
| "completion_length": 3059.3333740234375, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.1593063771724701, |
| "kl": 5.719810724258423e-05, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0, |
| "reward": -0.08104978175833821, |
| "reward_std": 0.537400247529149, |
| "rewards/cosine_scaled_reward": -0.21760822273790836, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 41 |
| }, |
| { |
| "completion_length": 2785.979190826416, |
| "epoch": 0.048, |
| "grad_norm": 0.24719306826591492, |
| "kl": 5.231797695159912e-05, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0, |
| "reward": -0.2765159234404564, |
| "reward_std": 0.3159199729561806, |
| "rewards/cosine_scaled_reward": -0.29450796730816364, |
| "rewards/format_reward": 0.3125, |
| "step": 42 |
| }, |
| { |
| "completion_length": 3143.4583435058594, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.17697609961032867, |
| "kl": 5.420297384262085e-05, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "reward": -0.027741093188524246, |
| "reward_std": 0.5976252444088459, |
| "rewards/cosine_scaled_reward": -0.14928722102195024, |
| "rewards/format_reward": 0.2708333358168602, |
| "step": 43 |
| }, |
| { |
| "completion_length": 2617.750045776367, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.30272603034973145, |
| "kl": 0.00017141178250312805, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "reward": 0.54763038828969, |
| "reward_std": 0.6350081358104944, |
| "rewards/cosine_scaled_reward": -0.007434792350977659, |
| "rewards/format_reward": 0.5625000055879354, |
| "step": 44 |
| }, |
| { |
| "completion_length": 3381.1458740234375, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.14729151129722595, |
| "kl": 5.987286567687988e-05, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "reward": 0.1284105316735804, |
| "reward_std": 0.6408776678144932, |
| "rewards/cosine_scaled_reward": -0.06079472857527435, |
| "rewards/format_reward": 0.25, |
| "step": 45 |
| }, |
| { |
| "completion_length": 3190.1458740234375, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.18984851241111755, |
| "kl": 0.0001230388879776001, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0, |
| "reward": -0.2710073352791369, |
| "reward_std": 0.37906504422426224, |
| "rewards/cosine_scaled_reward": -0.22925367206335068, |
| "rewards/format_reward": 0.18750000186264515, |
| "step": 46 |
| }, |
| { |
| "completion_length": 2773.666748046875, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.22614669799804688, |
| "kl": 7.295981049537659e-05, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0, |
| "reward": 0.6267627328634262, |
| "reward_std": 1.0455659702420235, |
| "rewards/cosine_scaled_reward": 0.0633813701570034, |
| "rewards/format_reward": 0.5000000111758709, |
| "step": 47 |
| }, |
| { |
| "completion_length": 2853.6250534057617, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.2049134522676468, |
| "kl": 0.00029557570815086365, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0, |
| "reward": 0.1033776430413127, |
| "reward_std": 0.891391895711422, |
| "rewards/cosine_scaled_reward": -0.11497785244137049, |
| "rewards/format_reward": 0.33333334140479565, |
| "step": 48 |
| }, |
| { |
| "completion_length": 2345.6250610351562, |
| "epoch": 0.056, |
| "grad_norm": 0.21897369623184204, |
| "kl": 0.00014978647232055664, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0, |
| "reward": 0.6087969962973148, |
| "reward_std": 0.7800232023000717, |
| "rewards/cosine_scaled_reward": -0.008101530373096466, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 49 |
| }, |
| { |
| "completion_length": 3047.5000228881836, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.17249740660190582, |
| "kl": 0.0002017766237258911, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 0.43377939984202385, |
| "reward_std": 0.7017503455281258, |
| "rewards/cosine_scaled_reward": 0.029389701783657074, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 50 |
| }, |
| { |
| "completion_length": 2343.145881652832, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.2304115891456604, |
| "kl": 0.0005070865154266357, |
| "learning_rate": 9.999890338174275e-07, |
| "loss": 0.0, |
| "reward": 0.30895326659083366, |
| "reward_std": 0.47219153563492, |
| "rewards/cosine_scaled_reward": -0.10594002925790846, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 51 |
| }, |
| { |
| "completion_length": 3017.1042098999023, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.22551533579826355, |
| "kl": 0.0004254281520843506, |
| "learning_rate": 9.999561358041868e-07, |
| "loss": 0.0, |
| "reward": 0.3134525269269943, |
| "reward_std": 0.7696055620908737, |
| "rewards/cosine_scaled_reward": -0.020357078406959772, |
| "rewards/format_reward": 0.35416667349636555, |
| "step": 52 |
| }, |
| { |
| "completion_length": 2933.2084197998047, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.1710565686225891, |
| "kl": 0.00019855797290802002, |
| "learning_rate": 9.999013075636804e-07, |
| "loss": 0.0, |
| "reward": 0.4190669923555106, |
| "reward_std": 0.9724426595494151, |
| "rewards/cosine_scaled_reward": -0.030049838591367006, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 53 |
| }, |
| { |
| "completion_length": 2728.2083740234375, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.1685902327299118, |
| "kl": 8.179806172847748e-05, |
| "learning_rate": 9.998245517681593e-07, |
| "loss": 0.0, |
| "reward": 0.9252486824989319, |
| "reward_std": 1.0930374152958393, |
| "rewards/cosine_scaled_reward": 0.16054099425673485, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 54 |
| }, |
| { |
| "completion_length": 2987.250030517578, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.1715623438358307, |
| "kl": 0.0003556758165359497, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0, |
| "reward": 0.34154442069120705, |
| "reward_std": 0.7232516668736935, |
| "rewards/cosine_scaled_reward": -0.03756110556423664, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 55 |
| }, |
| { |
| "completion_length": 3029.250045776367, |
| "epoch": 0.064, |
| "grad_norm": 0.15501734614372253, |
| "kl": 0.00012418627738952637, |
| "learning_rate": 9.996052735444862e-07, |
| "loss": 0.0, |
| "reward": 0.25998372526373714, |
| "reward_std": 0.6867683958262205, |
| "rewards/cosine_scaled_reward": -0.047091471031308174, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 56 |
| }, |
| { |
| "completion_length": 3392.9791870117188, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.1358395218849182, |
| "kl": 5.584210157394409e-05, |
| "learning_rate": 9.994627618036452e-07, |
| "loss": 0.0, |
| "reward": -0.13618062436580658, |
| "reward_std": 0.6030133794993162, |
| "rewards/cosine_scaled_reward": -0.18267363915219903, |
| "rewards/format_reward": 0.2291666679084301, |
| "step": 57 |
| }, |
| { |
| "completion_length": 2349.08341217041, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.22305431962013245, |
| "kl": 0.0013600699603557587, |
| "learning_rate": 9.992983438818915e-07, |
| "loss": 0.0001, |
| "reward": 0.5223081162257586, |
| "reward_std": 0.9040666744112968, |
| "rewards/cosine_scaled_reward": -0.030512610450387, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 58 |
| }, |
| { |
| "completion_length": 2854.125030517578, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.1728586107492447, |
| "kl": 7.890164852142334e-05, |
| "learning_rate": 9.991120277927223e-07, |
| "loss": 0.0, |
| "reward": 0.1912415586411953, |
| "reward_std": 0.6860835738480091, |
| "rewards/cosine_scaled_reward": -0.06062923185527325, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 59 |
| }, |
| { |
| "completion_length": 3058.0208587646484, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.15029877424240112, |
| "kl": 0.0001646280288696289, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0, |
| "reward": -0.1796177178621292, |
| "reward_std": 0.5032580140978098, |
| "rewards/cosine_scaled_reward": -0.2356421989388764, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 60 |
| }, |
| { |
| "completion_length": 3021.9375915527344, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.18782874941825867, |
| "kl": 0.00031859055161476135, |
| "learning_rate": 9.98673738502114e-07, |
| "loss": 0.0, |
| "reward": 0.2405167557299137, |
| "reward_std": 0.8873464465141296, |
| "rewards/cosine_scaled_reward": -0.11932496633380651, |
| "rewards/format_reward": 0.4791666679084301, |
| "step": 61 |
| }, |
| { |
| "completion_length": 2797.437515258789, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.19510804116725922, |
| "kl": 0.0028433650732040405, |
| "learning_rate": 9.98421786662277e-07, |
| "loss": 0.0001, |
| "reward": 0.6466732956469059, |
| "reward_std": 1.1665390692651272, |
| "rewards/cosine_scaled_reward": 0.06291996827349067, |
| "rewards/format_reward": 0.520833345130086, |
| "step": 62 |
| }, |
| { |
| "completion_length": 2213.6458740234375, |
| "epoch": 0.072, |
| "grad_norm": 0.21749359369277954, |
| "kl": 0.0012581497430801392, |
| "learning_rate": 9.981479793771866e-07, |
| "loss": 0.0001, |
| "reward": 0.9055937305092812, |
| "reward_std": 0.8001900352537632, |
| "rewards/cosine_scaled_reward": 0.08821351453661919, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 63 |
| }, |
| { |
| "completion_length": 2778.3333435058594, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.18261142075061798, |
| "kl": 0.0003489851951599121, |
| "learning_rate": 9.97852329991824e-07, |
| "loss": 0.0, |
| "reward": 0.38868435472249985, |
| "reward_std": 0.6305454671382904, |
| "rewards/cosine_scaled_reward": -0.02440785290673375, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 64 |
| }, |
| { |
| "completion_length": 2673.687530517578, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.1992972046136856, |
| "kl": 0.00033433735370635986, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0, |
| "reward": 0.3928167396225035, |
| "reward_std": 0.7381406147032976, |
| "rewards/cosine_scaled_reward": -0.07442497462034225, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 65 |
| }, |
| { |
| "completion_length": 2147.3750038146973, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.22494584321975708, |
| "kl": 0.0011640042066574097, |
| "learning_rate": 9.971955636222684e-07, |
| "loss": 0.0, |
| "reward": 0.42546659521758556, |
| "reward_std": 0.5198768675327301, |
| "rewards/cosine_scaled_reward": -0.03726671263575554, |
| "rewards/format_reward": 0.5, |
| "step": 66 |
| }, |
| { |
| "completion_length": 3342.7708740234375, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.14088153839111328, |
| "kl": 0.00046347081661224365, |
| "learning_rate": 9.968344786479415e-07, |
| "loss": 0.0, |
| "reward": -0.42950472608208656, |
| "reward_std": 0.378866882994771, |
| "rewards/cosine_scaled_reward": -0.31891903653740883, |
| "rewards/format_reward": 0.2083333395421505, |
| "step": 67 |
| }, |
| { |
| "completion_length": 2170.7083854675293, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.29666775465011597, |
| "kl": 0.0014674067497253418, |
| "learning_rate": 9.964516155915151e-07, |
| "loss": 0.0001, |
| "reward": 0.4676838363520801, |
| "reward_std": 0.7767233625054359, |
| "rewards/cosine_scaled_reward": -0.03699141927063465, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 68 |
| }, |
| { |
| "completion_length": 2446.1458740234375, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.22960010170936584, |
| "kl": 0.0015052556991577148, |
| "learning_rate": 9.960469931131936e-07, |
| "loss": 0.0001, |
| "reward": 0.21741511672735214, |
| "reward_std": 0.8107622135430574, |
| "rewards/cosine_scaled_reward": -0.1412924542091787, |
| "rewards/format_reward": 0.500000013038516, |
| "step": 69 |
| }, |
| { |
| "completion_length": 3091.6041717529297, |
| "epoch": 0.08, |
| "grad_norm": 0.19971789419651031, |
| "kl": 0.0011597275733947754, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0, |
| "reward": 0.12162317335605621, |
| "reward_std": 0.5483939237892628, |
| "rewards/cosine_scaled_reward": -0.11627176497131586, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 70 |
| }, |
| { |
| "completion_length": 2609.0208587646484, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.23139537870883942, |
| "kl": 0.0016418173909187317, |
| "learning_rate": 9.951725498333448e-07, |
| "loss": 0.0001, |
| "reward": 0.2501606810837984, |
| "reward_std": 0.5798324253410101, |
| "rewards/cosine_scaled_reward": -0.0832529878243804, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 71 |
| }, |
| { |
| "completion_length": 3072.9583740234375, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.22307687997817993, |
| "kl": 0.0010285675525665283, |
| "learning_rate": 9.947027716509488e-07, |
| "loss": 0.0, |
| "reward": -0.06439175084233284, |
| "reward_std": 0.7740810364484787, |
| "rewards/cosine_scaled_reward": -0.16761254286393523, |
| "rewards/format_reward": 0.2708333395421505, |
| "step": 72 |
| }, |
| { |
| "completion_length": 3455.875030517578, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.150093212723732, |
| "kl": 0.00021230429410934448, |
| "learning_rate": 9.942113192828444e-07, |
| "loss": 0.0, |
| "reward": -0.021114151924848557, |
| "reward_std": 0.7219380252063274, |
| "rewards/cosine_scaled_reward": -0.10430707037448883, |
| "rewards/format_reward": 0.18750000558793545, |
| "step": 73 |
| }, |
| { |
| "completion_length": 3296.250045776367, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.17310041189193726, |
| "kl": 0.0007840991020202637, |
| "learning_rate": 9.93698216681727e-07, |
| "loss": 0.0, |
| "reward": -0.0612453930079937, |
| "reward_std": 0.5310056991875172, |
| "rewards/cosine_scaled_reward": -0.14520604407880455, |
| "rewards/format_reward": 0.22916666977107525, |
| "step": 74 |
| }, |
| { |
| "completion_length": 3076.0833740234375, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.14123524725437164, |
| "kl": 0.0008448450826108456, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0, |
| "reward": 0.21019641309976578, |
| "reward_std": 0.4503815546631813, |
| "rewards/cosine_scaled_reward": -0.05115179833956063, |
| "rewards/format_reward": 0.31250000186264515, |
| "step": 75 |
| }, |
| { |
| "completion_length": 2814.2500228881836, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.20728114247322083, |
| "kl": 0.0001634061336517334, |
| "learning_rate": 9.926071618660237e-07, |
| "loss": 0.0, |
| "reward": 0.06560860201716423, |
| "reward_std": 0.5740169659256935, |
| "rewards/cosine_scaled_reward": -0.17552903201431036, |
| "rewards/format_reward": 0.41666666977107525, |
| "step": 76 |
| }, |
| { |
| "completion_length": 3305.1041870117188, |
| "epoch": 0.088, |
| "grad_norm": 0.14447131752967834, |
| "kl": 0.0003515356220304966, |
| "learning_rate": 9.9202926282791e-07, |
| "loss": 0.0, |
| "reward": -0.06417501904070377, |
| "reward_std": 0.5662869866937399, |
| "rewards/cosine_scaled_reward": -0.15708751417696476, |
| "rewards/format_reward": 0.25000000186264515, |
| "step": 77 |
| }, |
| { |
| "completion_length": 3341.875, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.14708709716796875, |
| "kl": 0.00022222846746444702, |
| "learning_rate": 9.91429819907136e-07, |
| "loss": 0.0, |
| "reward": 0.07691416330635548, |
| "reward_std": 0.7588405385613441, |
| "rewards/cosine_scaled_reward": -0.09695957787334919, |
| "rewards/format_reward": 0.27083334140479565, |
| "step": 78 |
| }, |
| { |
| "completion_length": 2390.979179382324, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.2777462303638458, |
| "kl": 0.0015588998794555664, |
| "learning_rate": 9.908088623197048e-07, |
| "loss": 0.0001, |
| "reward": 0.19228350277990103, |
| "reward_std": 0.7328854473307729, |
| "rewards/cosine_scaled_reward": -0.1746915839612484, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 79 |
| }, |
| { |
| "completion_length": 3182.437530517578, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.18599167466163635, |
| "kl": 0.0007250010967254639, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0, |
| "reward": 0.44475090876221657, |
| "reward_std": 0.6929970532655716, |
| "rewards/cosine_scaled_reward": 0.024458803236484528, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 80 |
| }, |
| { |
| "completion_length": 2998.7500534057617, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.23842793703079224, |
| "kl": 0.0027131736278533936, |
| "learning_rate": 9.895025252503755e-07, |
| "loss": 0.0001, |
| "reward": -0.0324825718998909, |
| "reward_std": 0.6861623711884022, |
| "rewards/cosine_scaled_reward": -0.16207461850717664, |
| "rewards/format_reward": 0.29166667349636555, |
| "step": 81 |
| }, |
| { |
| "completion_length": 2781.791717529297, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.19417299330234528, |
| "kl": 0.0011616647243499756, |
| "learning_rate": 9.888172094375033e-07, |
| "loss": 0.0, |
| "reward": 0.3426995016634464, |
| "reward_std": 0.756939671933651, |
| "rewards/cosine_scaled_reward": -0.03698359243571758, |
| "rewards/format_reward": 0.41666666977107525, |
| "step": 82 |
| }, |
| { |
| "completion_length": 2735.4791946411133, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.23736146092414856, |
| "kl": 0.001489013433456421, |
| "learning_rate": 9.881105062929221e-07, |
| "loss": 0.0001, |
| "reward": 0.022416603518649936, |
| "reward_std": 0.33409483451396227, |
| "rewards/cosine_scaled_reward": -0.14504170510917902, |
| "rewards/format_reward": 0.3125, |
| "step": 83 |
| }, |
| { |
| "completion_length": 3006.7708587646484, |
| "epoch": 0.096, |
| "grad_norm": 0.16109317541122437, |
| "kl": 0.0031517744064331055, |
| "learning_rate": 9.873824502603459e-07, |
| "loss": 0.0001, |
| "reward": 0.2841636799275875, |
| "reward_std": 0.9676609244197607, |
| "rewards/cosine_scaled_reward": -0.055834827944636345, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 84 |
| }, |
| { |
| "completion_length": 2945.0416870117188, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.14022095501422882, |
| "kl": 0.00030300021171569824, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0, |
| "reward": 0.3169367350637913, |
| "reward_std": 0.8734622374176979, |
| "rewards/cosine_scaled_reward": -0.08111499063670635, |
| "rewards/format_reward": 0.47916667722165585, |
| "step": 85 |
| }, |
| { |
| "completion_length": 2773.3333740234375, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.20840014517307281, |
| "kl": 0.0014702677726745605, |
| "learning_rate": 9.85862422507884e-07, |
| "loss": 0.0001, |
| "reward": 0.3505563363432884, |
| "reward_std": 0.50963762588799, |
| "rewards/cosine_scaled_reward": -0.06430518021807075, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 86 |
| }, |
| { |
| "completion_length": 2771.8333740234375, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.2091607004404068, |
| "kl": 0.0014937222003936768, |
| "learning_rate": 9.850705248720068e-07, |
| "loss": 0.0001, |
| "reward": 0.21895797760225832, |
| "reward_std": 0.8198871687054634, |
| "rewards/cosine_scaled_reward": -0.11968768760561943, |
| "rewards/format_reward": 0.45833334513008595, |
| "step": 87 |
| }, |
| { |
| "completion_length": 2588.979248046875, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.20578493177890778, |
| "kl": 0.003347158432006836, |
| "learning_rate": 9.8425742251254e-07, |
| "loss": 0.0001, |
| "reward": 0.8585769310593605, |
| "reward_std": 1.0248939394950867, |
| "rewards/cosine_scaled_reward": 0.11678845560527407, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 88 |
| }, |
| { |
| "completion_length": 3318.6459045410156, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.198954775929451, |
| "kl": 0.0015476346015930176, |
| "learning_rate": 9.83423155058946e-07, |
| "loss": 0.0001, |
| "reward": 0.03354086447507143, |
| "reward_std": 0.9196067973971367, |
| "rewards/cosine_scaled_reward": -0.10822956170886755, |
| "rewards/format_reward": 0.2500000074505806, |
| "step": 89 |
| }, |
| { |
| "completion_length": 2533.125045776367, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.30491241812705994, |
| "kl": 0.0027846097946166992, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0001, |
| "reward": 0.04410627228207886, |
| "reward_std": 0.5515282340347767, |
| "rewards/cosine_scaled_reward": -0.21753019373863935, |
| "rewards/format_reward": 0.47916667722165585, |
| "step": 90 |
| }, |
| { |
| "completion_length": 3007.5000610351562, |
| "epoch": 0.104, |
| "grad_norm": 0.1628594547510147, |
| "kl": 0.0011910051107406616, |
| "learning_rate": 9.816912885430258e-07, |
| "loss": 0.0, |
| "reward": 0.4382549002766609, |
| "reward_std": 0.8723863288760185, |
| "rewards/cosine_scaled_reward": 0.010794118046760559, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 91 |
| }, |
| { |
| "completion_length": 2512.5833587646484, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.19360074400901794, |
| "kl": 0.0010070949792861938, |
| "learning_rate": 9.807937738894303e-07, |
| "loss": 0.0, |
| "reward": 0.32041475689038634, |
| "reward_std": 0.6855251621454954, |
| "rewards/cosine_scaled_reward": -0.11062596645206213, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 92 |
| }, |
| { |
| "completion_length": 3344.4166870117188, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.1997271329164505, |
| "kl": 0.0018374919891357422, |
| "learning_rate": 9.798752629550546e-07, |
| "loss": 0.0001, |
| "reward": -0.34033770859241486, |
| "reward_std": 0.5492488071322441, |
| "rewards/cosine_scaled_reward": -0.2535021901130676, |
| "rewards/format_reward": 0.1666666716337204, |
| "step": 93 |
| }, |
| { |
| "completion_length": 3007.375015258789, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.18923112750053406, |
| "kl": 0.0023823007941246033, |
| "learning_rate": 9.78935800506826e-07, |
| "loss": 0.0001, |
| "reward": 0.13416577805764973, |
| "reward_std": 0.4046405693516135, |
| "rewards/cosine_scaled_reward": -0.06833377946168184, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 94 |
| }, |
| { |
| "completion_length": 3444.0416870117188, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.14268328249454498, |
| "kl": 0.0009976625442504883, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0, |
| "reward": -0.09058984462171793, |
| "reward_std": 0.7453176453709602, |
| "rewards/cosine_scaled_reward": -0.15987825905904174, |
| "rewards/format_reward": 0.2291666716337204, |
| "step": 95 |
| }, |
| { |
| "completion_length": 2678.1875228881836, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.19071771204471588, |
| "kl": 0.0018395986407995224, |
| "learning_rate": 9.769942052400235e-07, |
| "loss": 0.0001, |
| "reward": 0.348697304725647, |
| "reward_std": 0.6102097555994987, |
| "rewards/cosine_scaled_reward": -0.04440134949982166, |
| "rewards/format_reward": 0.43750000186264515, |
| "step": 96 |
| }, |
| { |
| "completion_length": 2990.000030517578, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.20323379337787628, |
| "kl": 0.001025363802909851, |
| "learning_rate": 9.759921670520634e-07, |
| "loss": 0.0, |
| "reward": 0.41594838351011276, |
| "reward_std": 0.7984455898404121, |
| "rewards/cosine_scaled_reward": 0.02047417126595974, |
| "rewards/format_reward": 0.3750000111758709, |
| "step": 97 |
| }, |
| { |
| "completion_length": 2679.2083587646484, |
| "epoch": 0.112, |
| "grad_norm": 0.19446168839931488, |
| "kl": 0.0004923343658447266, |
| "learning_rate": 9.749693666068663e-07, |
| "loss": 0.0, |
| "reward": 0.38869317155331373, |
| "reward_std": 0.564236119389534, |
| "rewards/cosine_scaled_reward": -0.07648676075041294, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 98 |
| }, |
| { |
| "completion_length": 2801.7708435058594, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.20274780690670013, |
| "kl": 0.0009057521820068359, |
| "learning_rate": 9.739258537542835e-07, |
| "loss": 0.0, |
| "reward": 0.04034796729683876, |
| "reward_std": 0.43001002445816994, |
| "rewards/cosine_scaled_reward": -0.1152426817570813, |
| "rewards/format_reward": 0.27083333395421505, |
| "step": 99 |
| }, |
| { |
| "completion_length": 2591.125030517578, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.19084717333316803, |
| "kl": 0.001154184341430664, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0, |
| "reward": 0.6981350090354681, |
| "reward_std": 0.9793934002518654, |
| "rewards/cosine_scaled_reward": 0.06781747564673424, |
| "rewards/format_reward": 0.5625000111758709, |
| "step": 100 |
| }, |
| { |
| "completion_length": 2723.312530517578, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.22346965968608856, |
| "kl": 0.001171112060546875, |
| "learning_rate": 9.717768952713511e-07, |
| "loss": 0.0, |
| "reward": 0.2876786937122233, |
| "reward_std": 0.4630406089127064, |
| "rewards/cosine_scaled_reward": -0.05407731421291828, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 101 |
| }, |
| { |
| "completion_length": 2054.187545776367, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.23816390335559845, |
| "kl": 0.002776503562927246, |
| "learning_rate": 9.706715543782064e-07, |
| "loss": 0.0001, |
| "reward": 0.6102155670523643, |
| "reward_std": 0.7314882390201092, |
| "rewards/cosine_scaled_reward": -0.059475560672581196, |
| "rewards/format_reward": 0.7291666865348816, |
| "step": 102 |
| }, |
| { |
| "completion_length": 2666.833396911621, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.2530282139778137, |
| "kl": 0.0017848014831542969, |
| "learning_rate": 9.695457105469804e-07, |
| "loss": 0.0001, |
| "reward": 0.2429911457002163, |
| "reward_std": 0.6969506703317165, |
| "rewards/cosine_scaled_reward": -0.11808777041733265, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 103 |
| }, |
| { |
| "completion_length": 2740.9791870117188, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.20160914957523346, |
| "kl": 0.0023827552795410156, |
| "learning_rate": 9.683994186497132e-07, |
| "loss": 0.0001, |
| "reward": 0.20196924358606339, |
| "reward_std": 0.5999510791152716, |
| "rewards/cosine_scaled_reward": -0.0865153931081295, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 104 |
| }, |
| { |
| "completion_length": 2462.1041870117188, |
| "epoch": 0.12, |
| "grad_norm": 0.24578723311424255, |
| "kl": 0.0011974573135375977, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0, |
| "reward": 0.29713789327070117, |
| "reward_std": 0.8025108277797699, |
| "rewards/cosine_scaled_reward": -0.080597716383636, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 105 |
| }, |
| { |
| "completion_length": 2199.854179382324, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.19964368641376495, |
| "kl": 0.002254962921142578, |
| "learning_rate": 9.66045715125541e-07, |
| "loss": 0.0001, |
| "reward": 0.8605472417548299, |
| "reward_std": 0.7241987711749971, |
| "rewards/cosine_scaled_reward": 0.11777362413704395, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 106 |
| }, |
| { |
| "completion_length": 2900.125030517578, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.1984090954065323, |
| "kl": 0.0013356208801269531, |
| "learning_rate": 9.648384182148252e-07, |
| "loss": 0.0001, |
| "reward": 0.3114801459014416, |
| "reward_std": 0.5947921872138977, |
| "rewards/cosine_scaled_reward": -0.08384328025022114, |
| "rewards/format_reward": 0.47916666977107525, |
| "step": 107 |
| }, |
| { |
| "completion_length": 2570.6041870117188, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.2010282427072525, |
| "kl": 0.0011126399040222168, |
| "learning_rate": 9.636109026648554e-07, |
| "loss": 0.0, |
| "reward": 0.6098845191299915, |
| "reward_std": 0.8683119043707848, |
| "rewards/cosine_scaled_reward": 0.02369226049631834, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 108 |
| }, |
| { |
| "completion_length": 2960.645896911621, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.19048207998275757, |
| "kl": 0.0006784200668334961, |
| "learning_rate": 9.623632283030077e-07, |
| "loss": 0.0, |
| "reward": 0.38045269437134266, |
| "reward_std": 0.6336373277008533, |
| "rewards/cosine_scaled_reward": 0.002726326696574688, |
| "rewards/format_reward": 0.3750000037252903, |
| "step": 109 |
| }, |
| { |
| "completion_length": 2726.1042098999023, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.20842185616493225, |
| "kl": 0.0014406442642211914, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0001, |
| "reward": 0.2915392220020294, |
| "reward_std": 0.891454428434372, |
| "rewards/cosine_scaled_reward": -0.10423038713634014, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 110 |
| }, |
| { |
| "completion_length": 2772.9375534057617, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.20696447789669037, |
| "kl": 0.0015645027160644531, |
| "learning_rate": 9.598076473627796e-07, |
| "loss": 0.0001, |
| "reward": 0.20833251508884132, |
| "reward_std": 0.7267865724861622, |
| "rewards/cosine_scaled_reward": -0.12500040791928768, |
| "rewards/format_reward": 0.4583333395421505, |
| "step": 111 |
| }, |
| { |
| "completion_length": 2978.0208892822266, |
| "epoch": 0.128, |
| "grad_norm": 0.16811113059520721, |
| "kl": 0.000897526741027832, |
| "learning_rate": 9.58499865339809e-07, |
| "loss": 0.0, |
| "reward": 0.3418322764337063, |
| "reward_std": 0.65217125415802, |
| "rewards/cosine_scaled_reward": -0.05825053807348013, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 112 |
| }, |
| { |
| "completion_length": 2559.7083740234375, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.20631085336208344, |
| "kl": 0.0014824867248535156, |
| "learning_rate": 9.571721736097088e-07, |
| "loss": 0.0001, |
| "reward": 0.43655770644545555, |
| "reward_std": 0.7786016501486301, |
| "rewards/cosine_scaled_reward": -0.03172116680070758, |
| "rewards/format_reward": 0.5000000093132257, |
| "step": 113 |
| }, |
| { |
| "completion_length": 2495.7917137145996, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.22188131511211395, |
| "kl": 0.0031752586364746094, |
| "learning_rate": 9.55824636882301e-07, |
| "loss": 0.0001, |
| "reward": 0.20582257118076086, |
| "reward_std": 0.7198268510401249, |
| "rewards/cosine_scaled_reward": -0.18875539442524314, |
| "rewards/format_reward": 0.5833333469927311, |
| "step": 114 |
| }, |
| { |
| "completion_length": 2754.6667098999023, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.19496048986911774, |
| "kl": 0.0033437013626098633, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0001, |
| "reward": 0.02730092965066433, |
| "reward_std": 0.5944252610206604, |
| "rewards/cosine_scaled_reward": -0.2050995440222323, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 115 |
| }, |
| { |
| "completion_length": 3369.375030517578, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.1768704503774643, |
| "kl": 0.001748800277709961, |
| "learning_rate": 9.530702921077358e-07, |
| "loss": 0.0001, |
| "reward": -0.17860433738678694, |
| "reward_std": 0.7088249325752258, |
| "rewards/cosine_scaled_reward": -0.15180217241868377, |
| "rewards/format_reward": 0.1250000037252903, |
| "step": 116 |
| }, |
| { |
| "completion_length": 3029.687530517578, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.22546739876270294, |
| "kl": 0.0027513504028320312, |
| "learning_rate": 9.516636183034564e-07, |
| "loss": 0.0001, |
| "reward": -0.09248590935021639, |
| "reward_std": 0.6045599468052387, |
| "rewards/cosine_scaled_reward": -0.19207628909498453, |
| "rewards/format_reward": 0.2916666753590107, |
| "step": 117 |
| }, |
| { |
| "completion_length": 2870.8958740234375, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.16772252321243286, |
| "kl": 0.0015900135040283203, |
| "learning_rate": 9.502373679810839e-07, |
| "loss": 0.0001, |
| "reward": 0.46789007633924484, |
| "reward_std": 0.8386504454538226, |
| "rewards/cosine_scaled_reward": 0.015195020474493504, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 118 |
| }, |
| { |
| "completion_length": 2562.3125610351562, |
| "epoch": 0.136, |
| "grad_norm": 0.21363668143749237, |
| "kl": 0.004290223121643066, |
| "learning_rate": 9.487916106540465e-07, |
| "loss": 0.0002, |
| "reward": 0.7369925267994404, |
| "reward_std": 0.7727241404354572, |
| "rewards/cosine_scaled_reward": 0.0768295917659998, |
| "rewards/format_reward": 0.5833333395421505, |
| "step": 119 |
| }, |
| { |
| "completion_length": 2289.1667289733887, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.2960655391216278, |
| "kl": 0.0022792816162109375, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0001, |
| "reward": 0.513806963339448, |
| "reward_std": 0.7356006652116776, |
| "rewards/cosine_scaled_reward": -0.045179841690696776, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 120 |
| }, |
| { |
| "completion_length": 1657.1041870117188, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.27528315782546997, |
| "kl": 0.00405430793762207, |
| "learning_rate": 9.458418577899774e-07, |
| "loss": 0.0002, |
| "reward": 0.7713621072471142, |
| "reward_std": 0.6179003790020943, |
| "rewards/cosine_scaled_reward": -0.010152293369174004, |
| "rewards/format_reward": 0.7916666716337204, |
| "step": 121 |
| }, |
| { |
| "completion_length": 2898.375030517578, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.20366472005844116, |
| "kl": 0.0014312267303466797, |
| "learning_rate": 9.443380060197385e-07, |
| "loss": 0.0001, |
| "reward": 0.47324386797845364, |
| "reward_std": 0.8266710359603167, |
| "rewards/cosine_scaled_reward": 0.02828860469162464, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 122 |
| }, |
| { |
| "completion_length": 2754.791717529297, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.17461104691028595, |
| "kl": 0.0015136003494262695, |
| "learning_rate": 9.428149347714143e-07, |
| "loss": 0.0001, |
| "reward": 0.28382661822251976, |
| "reward_std": 0.5956688185688108, |
| "rewards/cosine_scaled_reward": -0.12892001681029797, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 123 |
| }, |
| { |
| "completion_length": 2220.0833587646484, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.2452717274427414, |
| "kl": 0.0047130584716796875, |
| "learning_rate": 9.412727182773486e-07, |
| "loss": 0.0002, |
| "reward": 0.5766889279475436, |
| "reward_std": 0.7685924731194973, |
| "rewards/cosine_scaled_reward": -0.03457220923155546, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 124 |
| }, |
| { |
| "completion_length": 2875.895866394043, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.17229042947292328, |
| "kl": 0.0020313262939453125, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0001, |
| "reward": 0.23444998264312744, |
| "reward_std": 0.5397426411509514, |
| "rewards/cosine_scaled_reward": -0.028608346357941628, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 125 |
| }, |
| { |
| "completion_length": 2846.5833587646484, |
| "epoch": 0.144, |
| "grad_norm": 0.1771528422832489, |
| "kl": 0.0011348724365234375, |
| "learning_rate": 9.381311511432658e-07, |
| "loss": 0.0, |
| "reward": 0.2827608957886696, |
| "reward_std": 0.6769377700984478, |
| "rewards/cosine_scaled_reward": -0.11903620883822441, |
| "rewards/format_reward": 0.5208333414047956, |
| "step": 126 |
| }, |
| { |
| "completion_length": 3026.437545776367, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.23673483729362488, |
| "kl": 0.0022172927856445312, |
| "learning_rate": 9.36531953618799e-07, |
| "loss": 0.0001, |
| "reward": -0.08590636402368546, |
| "reward_std": 0.6378746330738068, |
| "rewards/cosine_scaled_reward": -0.18878651410341263, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 127 |
| }, |
| { |
| "completion_length": 2956.1458892822266, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.1844361275434494, |
| "kl": 0.002949953079223633, |
| "learning_rate": 9.34913917072228e-07, |
| "loss": 0.0001, |
| "reward": 0.39643347449600697, |
| "reward_std": 0.7595263011753559, |
| "rewards/cosine_scaled_reward": 0.010716758668422699, |
| "rewards/format_reward": 0.37500000186264515, |
| "step": 128 |
| }, |
| { |
| "completion_length": 3379.1666870117188, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.40960681438446045, |
| "kl": 0.02291560173034668, |
| "learning_rate": 9.332771203643714e-07, |
| "loss": 0.0009, |
| "reward": -0.011747716460376978, |
| "reward_std": 0.9172870293259621, |
| "rewards/cosine_scaled_reward": -0.09962387196719646, |
| "rewards/format_reward": 0.1875000037252903, |
| "step": 129 |
| }, |
| { |
| "completion_length": 2683.5416870117188, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.18997545540332794, |
| "kl": 0.0017414093017578125, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0001, |
| "reward": 0.2956298217177391, |
| "reward_std": 0.7938947193324566, |
| "rewards/cosine_scaled_reward": -0.05010176869109273, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 130 |
| }, |
| { |
| "completion_length": 2938.958396911621, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.1829785704612732, |
| "kl": 0.004005908966064453, |
| "learning_rate": 9.299475664759068e-07, |
| "loss": 0.0002, |
| "reward": 0.6047782748937607, |
| "reward_std": 0.8474699303042144, |
| "rewards/cosine_scaled_reward": 0.10447247140109539, |
| "rewards/format_reward": 0.39583334140479565, |
| "step": 131 |
| }, |
| { |
| "completion_length": 2620.854179382324, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.17881910502910614, |
| "kl": 0.0018241405487060547, |
| "learning_rate": 9.282549715730579e-07, |
| "loss": 0.0001, |
| "reward": 0.3893936946988106, |
| "reward_std": 0.687925798818469, |
| "rewards/cosine_scaled_reward": -0.013636493356898427, |
| "rewards/format_reward": 0.41666667349636555, |
| "step": 132 |
| }, |
| { |
| "completion_length": 3095.3958740234375, |
| "epoch": 0.152, |
| "grad_norm": 0.20941434800624847, |
| "kl": 0.0031735897064208984, |
| "learning_rate": 9.265439410565328e-07, |
| "loss": 0.0001, |
| "reward": 0.05856896564364433, |
| "reward_std": 0.7605378944426775, |
| "rewards/cosine_scaled_reward": -0.12696552043780684, |
| "rewards/format_reward": 0.31250000558793545, |
| "step": 133 |
| }, |
| { |
| "completion_length": 2441.6458740234375, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.18976929783821106, |
| "kl": 0.0040435791015625, |
| "learning_rate": 9.248145583195447e-07, |
| "loss": 0.0002, |
| "reward": 0.5501680783927441, |
| "reward_std": 0.618685107678175, |
| "rewards/cosine_scaled_reward": -0.026999298483133316, |
| "rewards/format_reward": 0.6041666697710752, |
| "step": 134 |
| }, |
| { |
| "completion_length": 2050.6041870117188, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.2555489242076874, |
| "kl": 0.004626750946044922, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0002, |
| "reward": 1.1215206906199455, |
| "reward_std": 0.8544092262163758, |
| "rewards/cosine_scaled_reward": 0.22742698714137077, |
| "rewards/format_reward": 0.6666666772216558, |
| "step": 135 |
| }, |
| { |
| "completion_length": 2620.791717529297, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.18313182890415192, |
| "kl": 0.003920078277587891, |
| "learning_rate": 9.213010742252327e-07, |
| "loss": 0.0002, |
| "reward": 0.5208159852772951, |
| "reward_std": 0.9554536268115044, |
| "rewards/cosine_scaled_reward": 0.010407987982034683, |
| "rewards/format_reward": 0.5000000111758709, |
| "step": 136 |
| }, |
| { |
| "completion_length": 2811.666702270508, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.17304006218910217, |
| "kl": 0.0026001930236816406, |
| "learning_rate": 9.195171441101668e-07, |
| "loss": 0.0001, |
| "reward": 0.03694445453584194, |
| "reward_std": 0.6093163676559925, |
| "rewards/cosine_scaled_reward": -0.15861110761761665, |
| "rewards/format_reward": 0.35416666977107525, |
| "step": 137 |
| }, |
| { |
| "completion_length": 2228.2500762939453, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.19729673862457275, |
| "kl": 0.0018553733825683594, |
| "learning_rate": 9.177152042508077e-07, |
| "loss": 0.0001, |
| "reward": 0.4917922643944621, |
| "reward_std": 0.5503780655562878, |
| "rewards/cosine_scaled_reward": -0.08743719174526632, |
| "rewards/format_reward": 0.6666666734963655, |
| "step": 138 |
| }, |
| { |
| "completion_length": 3121.8750915527344, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.17099516093730927, |
| "kl": 0.0035991668701171875, |
| "learning_rate": 9.158953424711624e-07, |
| "loss": 0.0001, |
| "reward": 0.20771604776382446, |
| "reward_std": 0.8068908657878637, |
| "rewards/cosine_scaled_reward": -0.08364198234630749, |
| "rewards/format_reward": 0.37500000931322575, |
| "step": 139 |
| }, |
| { |
| "completion_length": 2868.979217529297, |
| "epoch": 0.16, |
| "grad_norm": 0.22809311747550964, |
| "kl": 0.0057888031005859375, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0002, |
| "reward": 0.10171525273472071, |
| "reward_std": 0.42831680551171303, |
| "rewards/cosine_scaled_reward": -0.11580904200673103, |
| "rewards/format_reward": 0.33333333767950535, |
| "step": 140 |
| }, |
| { |
| "completion_length": 2534.354263305664, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.1993127316236496, |
| "kl": 0.0041599273681640625, |
| "learning_rate": 9.122022088101613e-07, |
| "loss": 0.0002, |
| "reward": 0.5355201922357082, |
| "reward_std": 0.7652903571724892, |
| "rewards/cosine_scaled_reward": -0.04473992623388767, |
| "rewards/format_reward": 0.6250000149011612, |
| "step": 141 |
| }, |
| { |
| "completion_length": 2745.250015258789, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.20162560045719147, |
| "kl": 0.0030961036682128906, |
| "learning_rate": 9.103291169269299e-07, |
| "loss": 0.0001, |
| "reward": 0.3946248684078455, |
| "reward_std": 0.7829636707901955, |
| "rewards/cosine_scaled_reward": -0.09435425186529756, |
| "rewards/format_reward": 0.583333345130086, |
| "step": 142 |
| }, |
| { |
| "completion_length": 2417.6250610351562, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.4018979072570801, |
| "kl": 0.004683494567871094, |
| "learning_rate": 9.084384631108882e-07, |
| "loss": 0.0002, |
| "reward": 0.19592272117733955, |
| "reward_std": 0.5982137080281973, |
| "rewards/cosine_scaled_reward": -0.1832886370830238, |
| "rewards/format_reward": 0.5625000111758709, |
| "step": 143 |
| }, |
| { |
| "completion_length": 2747.145881652832, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.2238093614578247, |
| "kl": 0.0034952163696289062, |
| "learning_rate": 9.065303395098358e-07, |
| "loss": 0.0001, |
| "reward": 0.3479926884174347, |
| "reward_std": 0.8791730478405952, |
| "rewards/cosine_scaled_reward": -0.04475365893449634, |
| "rewards/format_reward": 0.43750001303851604, |
| "step": 144 |
| }, |
| { |
| "completion_length": 2055.895866394043, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.2830770015716553, |
| "kl": 0.0055332183837890625, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0002, |
| "reward": 0.5167977176606655, |
| "reward_std": 0.5320884976536036, |
| "rewards/cosine_scaled_reward": -0.043684473261237144, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 145 |
| }, |
| { |
| "completion_length": 1917.2917251586914, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.18457695841789246, |
| "kl": 0.0024001598358154297, |
| "learning_rate": 9.026620557966279e-07, |
| "loss": 0.0001, |
| "reward": 0.5474055055528879, |
| "reward_std": 0.7067083790898323, |
| "rewards/cosine_scaled_reward": -0.11171392910182476, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 146 |
| }, |
| { |
| "completion_length": 2394.2500534057617, |
| "epoch": 0.168, |
| "grad_norm": 0.26179954409599304, |
| "kl": 0.004405975341796875, |
| "learning_rate": 9.007020842191634e-07, |
| "loss": 0.0002, |
| "reward": 0.39791956916451454, |
| "reward_std": 1.0572543032467365, |
| "rewards/cosine_scaled_reward": -0.07187354937195778, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 147 |
| }, |
| { |
| "completion_length": 1997.3125534057617, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.22162283957004547, |
| "kl": 0.0032930374145507812, |
| "learning_rate": 8.987250199168808e-07, |
| "loss": 0.0001, |
| "reward": 0.6760393833974376, |
| "reward_std": 0.7666322588920593, |
| "rewards/cosine_scaled_reward": -0.01614698488265276, |
| "rewards/format_reward": 0.7083333358168602, |
| "step": 148 |
| }, |
| { |
| "completion_length": 2594.416717529297, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.17498500645160675, |
| "kl": 0.0034232139587402344, |
| "learning_rate": 8.967309592491052e-07, |
| "loss": 0.0001, |
| "reward": 0.5365147553384304, |
| "reward_std": 0.6859856657683849, |
| "rewards/cosine_scaled_reward": -0.012992626056075096, |
| "rewards/format_reward": 0.5625000186264515, |
| "step": 149 |
| }, |
| { |
| "completion_length": 2656.104202270508, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.22585879266262054, |
| "kl": 0.0047016143798828125, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0002, |
| "reward": 0.2979818880558014, |
| "reward_std": 0.7468250878155231, |
| "rewards/cosine_scaled_reward": -0.08017573575489223, |
| "rewards/format_reward": 0.45833333767950535, |
| "step": 150 |
| }, |
| { |
| "completion_length": 2433.208396911621, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.2490771859884262, |
| "kl": 0.0044155120849609375, |
| "learning_rate": 8.926922383915315e-07, |
| "loss": 0.0002, |
| "reward": 0.5584549466148019, |
| "reward_std": 0.8048359379172325, |
| "rewards/cosine_scaled_reward": -0.022855868563055992, |
| "rewards/format_reward": 0.6041666809469461, |
| "step": 151 |
| }, |
| { |
| "completion_length": 2724.979217529297, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.2223489135503769, |
| "kl": 0.003587007522583008, |
| "learning_rate": 8.906477750432903e-07, |
| "loss": 0.0001, |
| "reward": 0.020049065351486206, |
| "reward_std": 0.6065222397446632, |
| "rewards/cosine_scaled_reward": -0.1774754635989666, |
| "rewards/format_reward": 0.37500000558793545, |
| "step": 152 |
| }, |
| { |
| "completion_length": 2770.2708740234375, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.19104580581188202, |
| "kl": 0.006191253662109375, |
| "learning_rate": 8.88586709003076e-07, |
| "loss": 0.0002, |
| "reward": 0.004709841683506966, |
| "reward_std": 0.47944022715091705, |
| "rewards/cosine_scaled_reward": -0.19556174334138632, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 153 |
| }, |
| { |
| "completion_length": 3214.062530517578, |
| "epoch": 0.176, |
| "grad_norm": 0.186995267868042, |
| "kl": 0.0031766891479492188, |
| "learning_rate": 8.865091407243394e-07, |
| "loss": 0.0001, |
| "reward": 0.3578900098800659, |
| "reward_std": 1.006507821381092, |
| "rewards/cosine_scaled_reward": 0.03311167098581791, |
| "rewards/format_reward": 0.29166666977107525, |
| "step": 154 |
| }, |
| { |
| "completion_length": 2554.666732788086, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.20163290202617645, |
| "kl": 0.004076957702636719, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0002, |
| "reward": 0.5557569041848183, |
| "reward_std": 0.6795310117304325, |
| "rewards/cosine_scaled_reward": 0.02787843905389309, |
| "rewards/format_reward": 0.5000000037252903, |
| "step": 155 |
| }, |
| { |
| "completion_length": 2689.2292098999023, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.18030290305614471, |
| "kl": 0.003159046173095703, |
| "learning_rate": 8.823049032816478e-07, |
| "loss": 0.0001, |
| "reward": 0.4783413279801607, |
| "reward_std": 0.8905918002128601, |
| "rewards/cosine_scaled_reward": 0.02042064256966114, |
| "rewards/format_reward": 0.4375, |
| "step": 156 |
| }, |
| { |
| "completion_length": 2461.875015258789, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.22876004874706268, |
| "kl": 0.003749370574951172, |
| "learning_rate": 8.801784390262943e-07, |
| "loss": 0.0001, |
| "reward": 0.15547512285411358, |
| "reward_std": 0.5757812671363354, |
| "rewards/cosine_scaled_reward": -0.19309578835964203, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 157 |
| }, |
| { |
| "completion_length": 2797.8333740234375, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.2126486450433731, |
| "kl": 0.0034036636352539062, |
| "learning_rate": 8.780358823396352e-07, |
| "loss": 0.0001, |
| "reward": 0.5735968193039298, |
| "reward_std": 0.8958301469683647, |
| "rewards/cosine_scaled_reward": 0.02638173568993807, |
| "rewards/format_reward": 0.5208333469927311, |
| "step": 158 |
| }, |
| { |
| "completion_length": 2499.937515258789, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.17875580489635468, |
| "kl": 0.003962516784667969, |
| "learning_rate": 8.758773376468604e-07, |
| "loss": 0.0002, |
| "reward": 0.11497697234153748, |
| "reward_std": 0.49865414947271347, |
| "rewards/cosine_scaled_reward": -0.1820948626846075, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 159 |
| }, |
| { |
| "completion_length": 2463.708381652832, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.23183724284172058, |
| "kl": 0.006978034973144531, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0003, |
| "reward": 0.7383107729256153, |
| "reward_std": 0.9244415387511253, |
| "rewards/cosine_scaled_reward": 0.05665536457672715, |
| "rewards/format_reward": 0.6250000055879354, |
| "step": 160 |
| }, |
| { |
| "completion_length": 2109.3333740234375, |
| "epoch": 0.184, |
| "grad_norm": 0.2291431725025177, |
| "kl": 0.0051670074462890625, |
| "learning_rate": 8.715127058347614e-07, |
| "loss": 0.0002, |
| "reward": 0.43655015155673027, |
| "reward_std": 0.5879729185253382, |
| "rewards/cosine_scaled_reward": -0.11505825724452734, |
| "rewards/format_reward": 0.6666666772216558, |
| "step": 161 |
| }, |
| { |
| "completion_length": 2962.1458740234375, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.18190893530845642, |
| "kl": 0.008568763732910156, |
| "learning_rate": 8.693068314414344e-07, |
| "loss": 0.0003, |
| "reward": -0.018576149828732014, |
| "reward_std": 0.6142776571214199, |
| "rewards/cosine_scaled_reward": -0.1759547544643283, |
| "rewards/format_reward": 0.3333333432674408, |
| "step": 162 |
| }, |
| { |
| "completion_length": 2259.770851135254, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.2146298736333847, |
| "kl": 0.004612922668457031, |
| "learning_rate": 8.670853944836176e-07, |
| "loss": 0.0002, |
| "reward": 0.9779267348349094, |
| "reward_std": 0.5618617758154869, |
| "rewards/cosine_scaled_reward": 0.18687998969107866, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 163 |
| }, |
| { |
| "completion_length": 2226.6042098999023, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.2856714725494385, |
| "kl": 0.0065479278564453125, |
| "learning_rate": 8.648485032310144e-07, |
| "loss": 0.0003, |
| "reward": 0.5088155679404736, |
| "reward_std": 0.6196571066975594, |
| "rewards/cosine_scaled_reward": -0.05809224210679531, |
| "rewards/format_reward": 0.6250000167638063, |
| "step": 164 |
| }, |
| { |
| "completion_length": 2562.2083892822266, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.2433558702468872, |
| "kl": 0.0056896209716796875, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0002, |
| "reward": 0.17038288246840239, |
| "reward_std": 0.8414329029619694, |
| "rewards/cosine_scaled_reward": -0.14397523301886395, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 165 |
| }, |
| { |
| "completion_length": 2418.8958892822266, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.1904870867729187, |
| "kl": 0.0036516189575195312, |
| "learning_rate": 8.603287946810513e-07, |
| "loss": 0.0001, |
| "reward": 0.37238264083862305, |
| "reward_std": 0.7455781847238541, |
| "rewards/cosine_scaled_reward": -0.08464201167225838, |
| "rewards/format_reward": 0.5416666772216558, |
| "step": 166 |
| }, |
| { |
| "completion_length": 1871.6042175292969, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.21718546748161316, |
| "kl": 0.003131866455078125, |
| "learning_rate": 8.580461976679099e-07, |
| "loss": 0.0001, |
| "reward": 0.5296564288437366, |
| "reward_std": 0.6462862379848957, |
| "rewards/cosine_scaled_reward": -0.1414217846468091, |
| "rewards/format_reward": 0.8125000204890966, |
| "step": 167 |
| }, |
| { |
| "completion_length": 2663.8750534057617, |
| "epoch": 0.192, |
| "grad_norm": 0.22513794898986816, |
| "kl": 0.0043582916259765625, |
| "learning_rate": 8.557485869176825e-07, |
| "loss": 0.0002, |
| "reward": 0.5514265485107899, |
| "reward_std": 0.9005202539265156, |
| "rewards/cosine_scaled_reward": -0.01595340482890606, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 168 |
| }, |
| { |
| "completion_length": 1646.375015258789, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.26874470710754395, |
| "kl": 0.0048503875732421875, |
| "learning_rate": 8.534360744126753e-07, |
| "loss": 0.0002, |
| "reward": 1.3640152756124735, |
| "reward_std": 0.753107562661171, |
| "rewards/cosine_scaled_reward": 0.2549243066459894, |
| "rewards/format_reward": 0.8541666753590107, |
| "step": 169 |
| }, |
| { |
| "completion_length": 2181.6667289733887, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.2248026579618454, |
| "kl": 0.0046710968017578125, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0002, |
| "reward": 0.5494927279651165, |
| "reward_std": 0.5512229539453983, |
| "rewards/cosine_scaled_reward": -0.006503628566861153, |
| "rewards/format_reward": 0.5625000111758709, |
| "step": 170 |
| }, |
| { |
| "completion_length": 2032.6458587646484, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.2114606499671936, |
| "kl": 0.0031061172485351562, |
| "learning_rate": 8.487667956935087e-07, |
| "loss": 0.0001, |
| "reward": 0.5898782406002283, |
| "reward_std": 0.7169837113469839, |
| "rewards/cosine_scaled_reward": -0.0175608959980309, |
| "rewards/format_reward": 0.6250000037252903, |
| "step": 171 |
| }, |
| { |
| "completion_length": 2407.6875610351562, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.295926034450531, |
| "kl": 0.0061702728271484375, |
| "learning_rate": 8.464102570534061e-07, |
| "loss": 0.0002, |
| "reward": 0.8869556821882725, |
| "reward_std": 0.7517024762928486, |
| "rewards/cosine_scaled_reward": 0.1518111675977707, |
| "rewards/format_reward": 0.583333345130086, |
| "step": 172 |
| }, |
| { |
| "completion_length": 1608.0833702087402, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.28166893124580383, |
| "kl": 0.00473785400390625, |
| "learning_rate": 8.440392717955475e-07, |
| "loss": 0.0002, |
| "reward": 0.49864733405411243, |
| "reward_std": 0.6368578001856804, |
| "rewards/cosine_scaled_reward": -0.12567635625600815, |
| "rewards/format_reward": 0.7500000037252903, |
| "step": 173 |
| }, |
| { |
| "completion_length": 1789.645866394043, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.2832661271095276, |
| "kl": 0.006890296936035156, |
| "learning_rate": 8.416539554784089e-07, |
| "loss": 0.0003, |
| "reward": 0.717628687620163, |
| "reward_std": 0.6769092865288258, |
| "rewards/cosine_scaled_reward": -0.026602333411574364, |
| "rewards/format_reward": 0.7708333414047956, |
| "step": 174 |
| }, |
| { |
| "completion_length": 2314.1042098999023, |
| "epoch": 0.2, |
| "grad_norm": 0.2392614483833313, |
| "kl": 0.004840850830078125, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0002, |
| "reward": 0.7497927155345678, |
| "reward_std": 0.7398151978850365, |
| "rewards/cosine_scaled_reward": 0.0728130005300045, |
| "rewards/format_reward": 0.6041666753590107, |
| "step": 175 |
| }, |
| { |
| "completion_length": 1971.0417213439941, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.23342494666576385, |
| "kl": 0.006039619445800781, |
| "learning_rate": 8.368407953869103e-07, |
| "loss": 0.0002, |
| "reward": 0.5884247794747353, |
| "reward_std": 0.859587823972106, |
| "rewards/cosine_scaled_reward": -0.10162095539271832, |
| "rewards/format_reward": 0.7916666716337204, |
| "step": 176 |
| }, |
| { |
| "completion_length": 2159.5417098999023, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.24055302143096924, |
| "kl": 0.004790306091308594, |
| "learning_rate": 8.344131861991828e-07, |
| "loss": 0.0002, |
| "reward": 0.6115904222242534, |
| "reward_std": 0.7329309619963169, |
| "rewards/cosine_scaled_reward": -0.04837145563215017, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 177 |
| }, |
| { |
| "completion_length": 1962.1875457763672, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.24433988332748413, |
| "kl": 0.007831573486328125, |
| "learning_rate": 8.319717151140072e-07, |
| "loss": 0.0003, |
| "reward": 0.5400892496109009, |
| "reward_std": 0.5809228383004665, |
| "rewards/cosine_scaled_reward": -0.07370538869872689, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 178 |
| }, |
| { |
| "completion_length": 2294.9375228881836, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.21765747666358948, |
| "kl": 0.00481414794921875, |
| "learning_rate": 8.295165011252396e-07, |
| "loss": 0.0002, |
| "reward": 0.3706574998795986, |
| "reward_std": 0.711703471839428, |
| "rewards/cosine_scaled_reward": -0.08550458867102861, |
| "rewards/format_reward": 0.5416666679084301, |
| "step": 179 |
| }, |
| { |
| "completion_length": 1782.8333587646484, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.2759692668914795, |
| "kl": 0.009118080139160156, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0004, |
| "reward": 0.8141425922513008, |
| "reward_std": 0.8202670086175203, |
| "rewards/cosine_scaled_reward": 0.06332127377390862, |
| "rewards/format_reward": 0.6875000018626451, |
| "step": 180 |
| }, |
| { |
| "completion_length": 2508.3750534057617, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.2136908620595932, |
| "kl": 0.005702972412109375, |
| "learning_rate": 8.245653237555705e-07, |
| "loss": 0.0002, |
| "reward": 0.53153170022415, |
| "reward_std": 0.6961524914950132, |
| "rewards/cosine_scaled_reward": -0.02590082644019276, |
| "rewards/format_reward": 0.5833333469927311, |
| "step": 181 |
| }, |
| { |
| "completion_length": 1820.1875228881836, |
| "epoch": 0.208, |
| "grad_norm": 0.1944979727268219, |
| "kl": 0.0024585723876953125, |
| "learning_rate": 8.220696016880687e-07, |
| "loss": 0.0001, |
| "reward": 0.5819809623062611, |
| "reward_std": 0.6004299521446228, |
| "rewards/cosine_scaled_reward": -0.07359286700375378, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 182 |
| }, |
| { |
| "completion_length": 1418.458351135254, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.2479531317949295, |
| "kl": 0.00787353515625, |
| "learning_rate": 8.195606193320136e-07, |
| "loss": 0.0003, |
| "reward": 1.0037191323935986, |
| "reward_std": 0.7833016626536846, |
| "rewards/cosine_scaled_reward": 0.04352622898295522, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 183 |
| }, |
| { |
| "completion_length": 2061.06254196167, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.2752915620803833, |
| "kl": 0.0069942474365234375, |
| "learning_rate": 8.170384989716657e-07, |
| "loss": 0.0003, |
| "reward": 0.39811624586582184, |
| "reward_std": 0.612327728420496, |
| "rewards/cosine_scaled_reward": -0.11344187799841166, |
| "rewards/format_reward": 0.6250000093132257, |
| "step": 184 |
| }, |
| { |
| "completion_length": 1844.6667098999023, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.2239396721124649, |
| "kl": 0.0045299530029296875, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": 0.0002, |
| "reward": 0.32189831510186195, |
| "reward_std": 0.5676768328994513, |
| "rewards/cosine_scaled_reward": -0.18280084058642387, |
| "rewards/format_reward": 0.6875000149011612, |
| "step": 185 |
| }, |
| { |
| "completion_length": 2218.791702270508, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.20360872149467468, |
| "kl": 0.0063877105712890625, |
| "learning_rate": 8.119553365707802e-07, |
| "loss": 0.0003, |
| "reward": 0.48006572760641575, |
| "reward_std": 0.5926500409841537, |
| "rewards/cosine_scaled_reward": -0.030800477601587772, |
| "rewards/format_reward": 0.5416666734963655, |
| "step": 186 |
| }, |
| { |
| "completion_length": 1770.9791870117188, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.23867206275463104, |
| "kl": 0.0055904388427734375, |
| "learning_rate": 8.093945422764069e-07, |
| "loss": 0.0002, |
| "reward": 0.6042033806443214, |
| "reward_std": 0.4485644996166229, |
| "rewards/cosine_scaled_reward": -0.08331498829647899, |
| "rewards/format_reward": 0.7708333395421505, |
| "step": 187 |
| }, |
| { |
| "completion_length": 2462.854202270508, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.24813704192638397, |
| "kl": 0.0071868896484375, |
| "learning_rate": 8.068211054579943e-07, |
| "loss": 0.0003, |
| "reward": 0.014021937269717455, |
| "reward_std": 0.5901096761226654, |
| "rewards/cosine_scaled_reward": -0.22215570323169231, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 188 |
| }, |
| { |
| "completion_length": 1708.5000534057617, |
| "epoch": 0.216, |
| "grad_norm": 0.2578124403953552, |
| "kl": 0.006272315979003906, |
| "learning_rate": 8.04235151541222e-07, |
| "loss": 0.0003, |
| "reward": 0.6445069871842861, |
| "reward_std": 0.7020072378218174, |
| "rewards/cosine_scaled_reward": -0.052746512461453676, |
| "rewards/format_reward": 0.7500000111758709, |
| "step": 189 |
| }, |
| { |
| "completion_length": 1291.5833892822266, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.2704342007637024, |
| "kl": 0.0067882537841796875, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0003, |
| "reward": 0.9361954592168331, |
| "reward_std": 0.6467648409307003, |
| "rewards/cosine_scaled_reward": -0.0006522866897284985, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 190 |
| }, |
| { |
| "completion_length": 1299.5833892822266, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.24868877232074738, |
| "kl": 0.0057735443115234375, |
| "learning_rate": 7.990261971595048e-07, |
| "loss": 0.0002, |
| "reward": 1.0458338633179665, |
| "reward_std": 0.7703647427260876, |
| "rewards/cosine_scaled_reward": 0.08541689871344715, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 191 |
| }, |
| { |
| "completion_length": 1983.1042404174805, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.24434901773929596, |
| "kl": 0.006023406982421875, |
| "learning_rate": 7.964034505716476e-07, |
| "loss": 0.0002, |
| "reward": 0.5690425429493189, |
| "reward_std": 0.7882693633437157, |
| "rewards/cosine_scaled_reward": -0.09047873830422759, |
| "rewards/format_reward": 0.750000013038516, |
| "step": 192 |
| }, |
| { |
| "completion_length": 2579.333366394043, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.26767823100090027, |
| "kl": 0.008405685424804688, |
| "learning_rate": 7.93768694627233e-07, |
| "loss": 0.0003, |
| "reward": 0.08483336865901947, |
| "reward_std": 0.5648258291184902, |
| "rewards/cosine_scaled_reward": -0.19716665521264076, |
| "rewards/format_reward": 0.4791666753590107, |
| "step": 193 |
| }, |
| { |
| "completion_length": 2240.562568664551, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.19335110485553741, |
| "kl": 0.00666046142578125, |
| "learning_rate": 7.911220577405484e-07, |
| "loss": 0.0003, |
| "reward": 1.0503641180694103, |
| "reward_std": 0.5451640971004963, |
| "rewards/cosine_scaled_reward": 0.16059872414916754, |
| "rewards/format_reward": 0.7291666772216558, |
| "step": 194 |
| }, |
| { |
| "completion_length": 1826.2708892822266, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.24450555443763733, |
| "kl": 0.006824493408203125, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.0003, |
| "reward": 0.7163921128958464, |
| "reward_std": 1.0118936747312546, |
| "rewards/cosine_scaled_reward": -0.03763728140620515, |
| "rewards/format_reward": 0.7916666828095913, |
| "step": 195 |
| }, |
| { |
| "completion_length": 2620.041778564453, |
| "epoch": 0.224, |
| "grad_norm": 0.2696687877178192, |
| "kl": 0.00728607177734375, |
| "learning_rate": 7.857936576865356e-07, |
| "loss": 0.0003, |
| "reward": 0.21067072823643684, |
| "reward_std": 0.859990905970335, |
| "rewards/cosine_scaled_reward": -0.13424798846244812, |
| "rewards/format_reward": 0.4791666716337204, |
| "step": 196 |
| }, |
| { |
| "completion_length": 1049.0208587646484, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.27546316385269165, |
| "kl": 0.00786590576171875, |
| "learning_rate": 7.831121542179086e-07, |
| "loss": 0.0003, |
| "reward": 0.9739863537251949, |
| "reward_std": 0.8307300806045532, |
| "rewards/cosine_scaled_reward": 0.01824316382408142, |
| "rewards/format_reward": 0.9375, |
| "step": 197 |
| }, |
| { |
| "completion_length": 1475.4583892822266, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.2635370194911957, |
| "kl": 0.008056640625, |
| "learning_rate": 7.804192891917571e-07, |
| "loss": 0.0003, |
| "reward": 0.8781713657081127, |
| "reward_std": 0.8930405229330063, |
| "rewards/cosine_scaled_reward": 0.012002333998680115, |
| "rewards/format_reward": 0.8541666679084301, |
| "step": 198 |
| }, |
| { |
| "completion_length": 1526.4583587646484, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.29261499643325806, |
| "kl": 0.007879257202148438, |
| "learning_rate": 7.777151938545235e-07, |
| "loss": 0.0003, |
| "reward": 0.42517440766096115, |
| "reward_std": 0.6142569780349731, |
| "rewards/cosine_scaled_reward": -0.22491280548274517, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 199 |
| }, |
| { |
| "completion_length": 1371.458366394043, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.23450589179992676, |
| "kl": 0.0063762664794921875, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0003, |
| "reward": 1.134487384930253, |
| "reward_std": 0.7592474110424519, |
| "rewards/cosine_scaled_reward": 0.10891036130487919, |
| "rewards/format_reward": 0.9166666679084301, |
| "step": 200 |
| }, |
| { |
| "completion_length": 1873.7500686645508, |
| "epoch": 0.2297142857142857, |
| "grad_norm": 0.20057973265647888, |
| "kl": 0.0047245025634765625, |
| "learning_rate": 7.72273839962904e-07, |
| "loss": 0.0002, |
| "reward": 1.3440011143684387, |
| "reward_std": 0.6584898792207241, |
| "rewards/cosine_scaled_reward": 0.2657505361130461, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 201 |
| }, |
| { |
| "completion_length": 1296.6041870117188, |
| "epoch": 0.23085714285714284, |
| "grad_norm": 0.2126869261264801, |
| "kl": 0.006069183349609375, |
| "learning_rate": 7.695368466124296e-07, |
| "loss": 0.0002, |
| "reward": 1.2371138632297516, |
| "reward_std": 0.45892489701509476, |
| "rewards/cosine_scaled_reward": 0.18105687946081161, |
| "rewards/format_reward": 0.8750000055879354, |
| "step": 202 |
| }, |
| { |
| "completion_length": 1661.5000457763672, |
| "epoch": 0.232, |
| "grad_norm": 0.25018230080604553, |
| "kl": 0.008678436279296875, |
| "learning_rate": 7.667891533457718e-07, |
| "loss": 0.0003, |
| "reward": 0.8724356088787317, |
| "reward_std": 0.8623361866921186, |
| "rewards/cosine_scaled_reward": 0.040384437423199415, |
| "rewards/format_reward": 0.7916666939854622, |
| "step": 203 |
| }, |
| { |
| "completion_length": 1537.708381652832, |
| "epoch": 0.23314285714285715, |
| "grad_norm": 0.34546995162963867, |
| "kl": 0.007953643798828125, |
| "learning_rate": 7.640308940816239e-07, |
| "loss": 0.0003, |
| "reward": 0.7893814034759998, |
| "reward_std": 0.7430329732596874, |
| "rewards/cosine_scaled_reward": -0.03239264711737633, |
| "rewards/format_reward": 0.8541666697710752, |
| "step": 204 |
| }, |
| { |
| "completion_length": 1645.1250305175781, |
| "epoch": 0.2342857142857143, |
| "grad_norm": 0.24868625402450562, |
| "kl": 0.0064334869384765625, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0003, |
| "reward": 1.1130757508799434, |
| "reward_std": 0.9551872611045837, |
| "rewards/cosine_scaled_reward": 0.15028784982860088, |
| "rewards/format_reward": 0.8125000055879354, |
| "step": 205 |
| }, |
| { |
| "completion_length": 2075.187545776367, |
| "epoch": 0.23542857142857143, |
| "grad_norm": 0.25428035855293274, |
| "kl": 0.0065708160400390625, |
| "learning_rate": 7.584832158039378e-07, |
| "loss": 0.0003, |
| "reward": 0.233387497253716, |
| "reward_std": 0.5259231552481651, |
| "rewards/cosine_scaled_reward": -0.22705625742673874, |
| "rewards/format_reward": 0.6875000260770321, |
| "step": 206 |
| }, |
| { |
| "completion_length": 1656.3750686645508, |
| "epoch": 0.23657142857142857, |
| "grad_norm": 0.26431310176849365, |
| "kl": 0.0085601806640625, |
| "learning_rate": 7.556940671764124e-07, |
| "loss": 0.0003, |
| "reward": 0.6532822176814079, |
| "reward_std": 0.6114402338862419, |
| "rewards/cosine_scaled_reward": -0.13169224187731743, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 207 |
| }, |
| { |
| "completion_length": 1265.3542022705078, |
| "epoch": 0.2377142857142857, |
| "grad_norm": 0.241045743227005, |
| "kl": 0.00737762451171875, |
| "learning_rate": 7.528948933102438e-07, |
| "loss": 0.0003, |
| "reward": 0.9077612403780222, |
| "reward_std": 0.5354608930647373, |
| "rewards/cosine_scaled_reward": -0.01486940123140812, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 208 |
| }, |
| { |
| "completion_length": 1344.3750305175781, |
| "epoch": 0.23885714285714285, |
| "grad_norm": 0.3023705780506134, |
| "kl": 0.010385513305664062, |
| "learning_rate": 7.500858306332172e-07, |
| "loss": 0.0004, |
| "reward": 1.0339510310441256, |
| "reward_std": 0.7570701129734516, |
| "rewards/cosine_scaled_reward": 0.07947551319375634, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 209 |
| }, |
| { |
| "completion_length": 1667.3541793823242, |
| "epoch": 0.24, |
| "grad_norm": 0.18748997151851654, |
| "kl": 0.006000518798828125, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0002, |
| "reward": 0.666348340921104, |
| "reward_std": 0.5869894400238991, |
| "rewards/cosine_scaled_reward": -0.09390917886048555, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 210 |
| }, |
| { |
| "completion_length": 1571.8958740234375, |
| "epoch": 0.24114285714285713, |
| "grad_norm": 0.25766894221305847, |
| "kl": 0.0088043212890625, |
| "learning_rate": 7.444385869608921e-07, |
| "loss": 0.0004, |
| "reward": 0.9365372620522976, |
| "reward_std": 0.5268308343365788, |
| "rewards/cosine_scaled_reward": 0.07243526913225651, |
| "rewards/format_reward": 0.7916666734963655, |
| "step": 211 |
| }, |
| { |
| "completion_length": 967.6458549499512, |
| "epoch": 0.2422857142857143, |
| "grad_norm": 0.2648831009864807, |
| "kl": 0.00879669189453125, |
| "learning_rate": 7.416006812042827e-07, |
| "loss": 0.0004, |
| "reward": 1.0848271660506725, |
| "reward_std": 0.44052685238420963, |
| "rewards/cosine_scaled_reward": 0.08408024348318577, |
| "rewards/format_reward": 0.9166666679084301, |
| "step": 212 |
| }, |
| { |
| "completion_length": 1431.3959121704102, |
| "epoch": 0.24342857142857144, |
| "grad_norm": 0.33580857515335083, |
| "kl": 0.0103302001953125, |
| "learning_rate": 7.387534371007797e-07, |
| "loss": 0.0004, |
| "reward": 1.239082669839263, |
| "reward_std": 0.9362837933003902, |
| "rewards/cosine_scaled_reward": 0.1820413067471236, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 213 |
| }, |
| { |
| "completion_length": 1762.3125457763672, |
| "epoch": 0.24457142857142858, |
| "grad_norm": 0.24269923567771912, |
| "kl": 0.00745391845703125, |
| "learning_rate": 7.358969934210438e-07, |
| "loss": 0.0003, |
| "reward": 1.0624176412820816, |
| "reward_std": 0.7310810908675194, |
| "rewards/cosine_scaled_reward": 0.11454213625984266, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 214 |
| }, |
| { |
| "completion_length": 1427.3542022705078, |
| "epoch": 0.24571428571428572, |
| "grad_norm": 0.25518307089805603, |
| "kl": 0.0060749053955078125, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0002, |
| "reward": 0.5486622415482998, |
| "reward_std": 0.42411663569509983, |
| "rewards/cosine_scaled_reward": -0.14233557134866714, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 215 |
| }, |
| { |
| "completion_length": 1129.395851135254, |
| "epoch": 0.24685714285714286, |
| "grad_norm": 0.28439247608184814, |
| "kl": 0.007411956787109375, |
| "learning_rate": 7.301570646506027e-07, |
| "loss": 0.0003, |
| "reward": 1.0450292900204659, |
| "reward_std": 0.6436006706207991, |
| "rewards/cosine_scaled_reward": 0.08501462638378143, |
| "rewards/format_reward": 0.8750000055879354, |
| "step": 216 |
| }, |
| { |
| "completion_length": 1375.4791946411133, |
| "epoch": 0.248, |
| "grad_norm": 0.22866062819957733, |
| "kl": 0.0072784423828125, |
| "learning_rate": 7.27273859315928e-07, |
| "loss": 0.0003, |
| "reward": 1.0294950436800718, |
| "reward_std": 0.7613518834114075, |
| "rewards/cosine_scaled_reward": 0.08766416925936937, |
| "rewards/format_reward": 0.8541666679084301, |
| "step": 217 |
| }, |
| { |
| "completion_length": 1546.6041946411133, |
| "epoch": 0.24914285714285714, |
| "grad_norm": 0.2400980442762375, |
| "kl": 0.00803375244140625, |
| "learning_rate": 7.243820139034464e-07, |
| "loss": 0.0003, |
| "reward": 0.6402788404375315, |
| "reward_std": 0.7851009257137775, |
| "rewards/cosine_scaled_reward": -0.10694391108700074, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 218 |
| }, |
| { |
| "completion_length": 1293.1042098999023, |
| "epoch": 0.2502857142857143, |
| "grad_norm": 0.3005291521549225, |
| "kl": 0.0071582794189453125, |
| "learning_rate": 7.214816693576234e-07, |
| "loss": 0.0003, |
| "reward": 1.2072506174445152, |
| "reward_std": 0.8155215214937925, |
| "rewards/cosine_scaled_reward": 0.1452919525327161, |
| "rewards/format_reward": 0.9166666679084301, |
| "step": 219 |
| }, |
| { |
| "completion_length": 1585.4583740234375, |
| "epoch": 0.25142857142857145, |
| "grad_norm": 0.29073673486709595, |
| "kl": 0.009313583374023438, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0004, |
| "reward": 0.25497015565633774, |
| "reward_std": 0.43767623975872993, |
| "rewards/cosine_scaled_reward": -0.2787649389356375, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 220 |
| }, |
| { |
| "completion_length": 1091.9583625793457, |
| "epoch": 0.25257142857142856, |
| "grad_norm": 0.2504833936691284, |
| "kl": 0.006580352783203125, |
| "learning_rate": 7.156560487081051e-07, |
| "loss": 0.0003, |
| "reward": 1.1241591908037663, |
| "reward_std": 0.3971919184550643, |
| "rewards/cosine_scaled_reward": 0.09332958236336708, |
| "rewards/format_reward": 0.9375, |
| "step": 221 |
| }, |
| { |
| "completion_length": 1431.5000305175781, |
| "epoch": 0.2537142857142857, |
| "grad_norm": 0.2770059108734131, |
| "kl": 0.008672714233398438, |
| "learning_rate": 7.127310565369415e-07, |
| "loss": 0.0003, |
| "reward": 1.0303391478955746, |
| "reward_std": 0.711064089089632, |
| "rewards/cosine_scaled_reward": 0.09850288555026054, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 222 |
| }, |
| { |
| "completion_length": 1270.583351135254, |
| "epoch": 0.25485714285714284, |
| "grad_norm": 0.22273501753807068, |
| "kl": 0.00675201416015625, |
| "learning_rate": 7.097981330836616e-07, |
| "loss": 0.0003, |
| "reward": 0.8901128210127354, |
| "reward_std": 0.560223001986742, |
| "rewards/cosine_scaled_reward": 0.0075564137659966946, |
| "rewards/format_reward": 0.875, |
| "step": 223 |
| }, |
| { |
| "completion_length": 1732.1458587646484, |
| "epoch": 0.256, |
| "grad_norm": 0.2018723040819168, |
| "kl": 0.008203506469726562, |
| "learning_rate": 7.068574212948169e-07, |
| "loss": 0.0003, |
| "reward": 0.7687601521611214, |
| "reward_std": 0.7839281968772411, |
| "rewards/cosine_scaled_reward": -0.03228660812601447, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 224 |
| }, |
| { |
| "completion_length": 2117.18758392334, |
| "epoch": 0.2571428571428571, |
| "grad_norm": 0.2617863416671753, |
| "kl": 0.01264190673828125, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0005, |
| "reward": 0.8388488199561834, |
| "reward_std": 0.7986405938863754, |
| "rewards/cosine_scaled_reward": 0.044424411840736866, |
| "rewards/format_reward": 0.750000013038516, |
| "step": 225 |
| }, |
| { |
| "completion_length": 1414.020881652832, |
| "epoch": 0.2582857142857143, |
| "grad_norm": 0.19753257930278778, |
| "kl": 0.0067138671875, |
| "learning_rate": 7.009532063876148e-07, |
| "loss": 0.0003, |
| "reward": 1.1793978083878756, |
| "reward_std": 0.5940953148528934, |
| "rewards/cosine_scaled_reward": 0.11053222604095936, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 226 |
| }, |
| { |
| "completion_length": 1173.1666870117188, |
| "epoch": 0.25942857142857145, |
| "grad_norm": 0.3516033887863159, |
| "kl": 0.01033782958984375, |
| "learning_rate": 6.979899910323624e-07, |
| "loss": 0.0004, |
| "reward": 0.7449045367538929, |
| "reward_std": 0.5567011050879955, |
| "rewards/cosine_scaled_reward": -0.10671441350132227, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 227 |
| }, |
| { |
| "completion_length": 1190.1250305175781, |
| "epoch": 0.26057142857142856, |
| "grad_norm": 0.31133556365966797, |
| "kl": 0.00760650634765625, |
| "learning_rate": 6.950195628537299e-07, |
| "loss": 0.0003, |
| "reward": 1.191257143393159, |
| "reward_std": 0.6371707431972027, |
| "rewards/cosine_scaled_reward": 0.1581285521388054, |
| "rewards/format_reward": 0.875, |
| "step": 228 |
| }, |
| { |
| "completion_length": 1375.6250305175781, |
| "epoch": 0.26171428571428573, |
| "grad_norm": 0.2500562369823456, |
| "kl": 0.0087432861328125, |
| "learning_rate": 6.920420666261961e-07, |
| "loss": 0.0003, |
| "reward": 0.7764038797467947, |
| "reward_std": 0.5216951882466674, |
| "rewards/cosine_scaled_reward": -0.04929806664586067, |
| "rewards/format_reward": 0.8750000055879354, |
| "step": 229 |
| }, |
| { |
| "completion_length": 1970.1458587646484, |
| "epoch": 0.26285714285714284, |
| "grad_norm": 0.23503738641738892, |
| "kl": 0.009944915771484375, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0004, |
| "reward": 0.531811726745218, |
| "reward_std": 0.7926243953406811, |
| "rewards/cosine_scaled_reward": -0.10909414570778608, |
| "rewards/format_reward": 0.7500000204890966, |
| "step": 230 |
| }, |
| { |
| "completion_length": 1569.3750457763672, |
| "epoch": 0.264, |
| "grad_norm": 0.2320818156003952, |
| "kl": 0.008174896240234375, |
| "learning_rate": 6.860664508377001e-07, |
| "loss": 0.0003, |
| "reward": 1.0276812557131052, |
| "reward_std": 0.789770107716322, |
| "rewards/cosine_scaled_reward": 0.05550729110836983, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 231 |
| }, |
| { |
| "completion_length": 1607.0625305175781, |
| "epoch": 0.2651428571428571, |
| "grad_norm": 0.23815037310123444, |
| "kl": 0.008876800537109375, |
| "learning_rate": 6.83068622519821e-07, |
| "loss": 0.0004, |
| "reward": 0.352162616327405, |
| "reward_std": 0.5658496394753456, |
| "rewards/cosine_scaled_reward": -0.2405853734817356, |
| "rewards/format_reward": 0.8333333414047956, |
| "step": 232 |
| }, |
| { |
| "completion_length": 1084.5833740234375, |
| "epoch": 0.2662857142857143, |
| "grad_norm": 0.2646292746067047, |
| "kl": 0.007411956787109375, |
| "learning_rate": 6.800643086250121e-07, |
| "loss": 0.0003, |
| "reward": 0.6558086425065994, |
| "reward_std": 0.7230100817978382, |
| "rewards/cosine_scaled_reward": -0.1408456964418292, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 233 |
| }, |
| { |
| "completion_length": 1680.520881652832, |
| "epoch": 0.2674285714285714, |
| "grad_norm": 0.2598845660686493, |
| "kl": 0.009510040283203125, |
| "learning_rate": 6.770536555792944e-07, |
| "loss": 0.0004, |
| "reward": 0.6849129311740398, |
| "reward_std": 0.5895892381668091, |
| "rewards/cosine_scaled_reward": -0.042960209771990776, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 234 |
| }, |
| { |
| "completion_length": 1212.5208549499512, |
| "epoch": 0.26857142857142857, |
| "grad_norm": 0.281758576631546, |
| "kl": 0.007808685302734375, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0003, |
| "reward": 1.2133875098079443, |
| "reward_std": 0.634877560660243, |
| "rewards/cosine_scaled_reward": 0.14836042001843452, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 235 |
| }, |
| { |
| "completion_length": 2002.2500534057617, |
| "epoch": 0.26971428571428574, |
| "grad_norm": 0.21561594307422638, |
| "kl": 0.008035659790039062, |
| "learning_rate": 6.710139192768694e-07, |
| "loss": 0.0003, |
| "reward": 0.5754090845584869, |
| "reward_std": 0.681129951030016, |
| "rewards/cosine_scaled_reward": -0.07687881146557629, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 236 |
| }, |
| { |
| "completion_length": 1571.7291946411133, |
| "epoch": 0.27085714285714285, |
| "grad_norm": 0.23634596168994904, |
| "kl": 0.006927490234375, |
| "learning_rate": 6.679851303883891e-07, |
| "loss": 0.0003, |
| "reward": 0.8946835342794657, |
| "reward_std": 0.5508107021450996, |
| "rewards/cosine_scaled_reward": 0.03067508526146412, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 237 |
| }, |
| { |
| "completion_length": 1162.2292098999023, |
| "epoch": 0.272, |
| "grad_norm": 0.23457442224025726, |
| "kl": 0.0085906982421875, |
| "learning_rate": 6.649505910711058e-07, |
| "loss": 0.0003, |
| "reward": 1.158431712538004, |
| "reward_std": 0.7775964550673962, |
| "rewards/cosine_scaled_reward": 0.08963251765817404, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 238 |
| }, |
| { |
| "completion_length": 1398.8750457763672, |
| "epoch": 0.27314285714285713, |
| "grad_norm": 0.22179792821407318, |
| "kl": 0.005764007568359375, |
| "learning_rate": 6.619104492241847e-07, |
| "loss": 0.0002, |
| "reward": 1.289793978095986, |
| "reward_std": 0.5327765932306647, |
| "rewards/cosine_scaled_reward": 0.23864697851240635, |
| "rewards/format_reward": 0.8125, |
| "step": 239 |
| }, |
| { |
| "completion_length": 1420.9375381469727, |
| "epoch": 0.2742857142857143, |
| "grad_norm": 0.33549872040748596, |
| "kl": 0.01104736328125, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0004, |
| "reward": 0.3914220330771059, |
| "reward_std": 0.6205890811979771, |
| "rewards/cosine_scaled_reward": -0.2209556633606553, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 240 |
| }, |
| { |
| "completion_length": 1621.7292098999023, |
| "epoch": 0.2754285714285714, |
| "grad_norm": 0.28596657514572144, |
| "kl": 0.010402679443359375, |
| "learning_rate": 6.558139508961654e-07, |
| "loss": 0.0004, |
| "reward": 0.44276779890060425, |
| "reward_std": 0.5154628418385983, |
| "rewards/cosine_scaled_reward": -0.1848661107942462, |
| "rewards/format_reward": 0.8125000111758709, |
| "step": 241 |
| }, |
| { |
| "completion_length": 1138.5000457763672, |
| "epoch": 0.2765714285714286, |
| "grad_norm": 0.4309399724006653, |
| "kl": 0.0121612548828125, |
| "learning_rate": 6.527578915497951e-07, |
| "loss": 0.0005, |
| "reward": 0.7130568971624598, |
| "reward_std": 0.4758261194219813, |
| "rewards/cosine_scaled_reward": -0.12263822788372636, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 242 |
| }, |
| { |
| "completion_length": 1494.687515258789, |
| "epoch": 0.2777142857142857, |
| "grad_norm": 0.21594373881816864, |
| "kl": 0.00745391845703125, |
| "learning_rate": 6.496968239287603e-07, |
| "loss": 0.0003, |
| "reward": 0.7156112641096115, |
| "reward_std": 0.6859831623733044, |
| "rewards/cosine_scaled_reward": -0.06927772145718336, |
| "rewards/format_reward": 0.8541666772216558, |
| "step": 243 |
| }, |
| { |
| "completion_length": 1628.1667137145996, |
| "epoch": 0.27885714285714286, |
| "grad_norm": 0.2714637219905853, |
| "kl": 0.007848739624023438, |
| "learning_rate": 6.466308972251785e-07, |
| "loss": 0.0003, |
| "reward": 0.8751203082501888, |
| "reward_std": 0.9190315119922161, |
| "rewards/cosine_scaled_reward": 0.010476819472387433, |
| "rewards/format_reward": 0.8541666753590107, |
| "step": 244 |
| }, |
| { |
| "completion_length": 1648.875015258789, |
| "epoch": 0.28, |
| "grad_norm": 0.2101169377565384, |
| "kl": 0.0067119598388671875, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0003, |
| "reward": 1.1979175508022308, |
| "reward_std": 0.7641258873045444, |
| "rewards/cosine_scaled_reward": 0.1718754144385457, |
| "rewards/format_reward": 0.8541666697710752, |
| "step": 245 |
| }, |
| { |
| "completion_length": 1389.895881652832, |
| "epoch": 0.28114285714285714, |
| "grad_norm": 0.21795134246349335, |
| "kl": 0.009159088134765625, |
| "learning_rate": 6.404850645156841e-07, |
| "loss": 0.0004, |
| "reward": 0.7879314236342907, |
| "reward_std": 0.6879578605294228, |
| "rewards/cosine_scaled_reward": -0.07478431053459644, |
| "rewards/format_reward": 0.9375, |
| "step": 246 |
| }, |
| { |
| "completion_length": 2069.1458892822266, |
| "epoch": 0.2822857142857143, |
| "grad_norm": 0.2720467150211334, |
| "kl": 0.012157440185546875, |
| "learning_rate": 6.374054580489873e-07, |
| "loss": 0.0005, |
| "reward": 0.3278281930834055, |
| "reward_std": 0.6504724733531475, |
| "rewards/cosine_scaled_reward": -0.16941926488652825, |
| "rewards/format_reward": 0.6666666753590107, |
| "step": 247 |
| }, |
| { |
| "completion_length": 1404.020866394043, |
| "epoch": 0.2834285714285714, |
| "grad_norm": 0.29227957129478455, |
| "kl": 0.009120941162109375, |
| "learning_rate": 6.343215915635761e-07, |
| "loss": 0.0004, |
| "reward": 1.3239782322198153, |
| "reward_std": 0.6399974799714983, |
| "rewards/cosine_scaled_reward": 0.24532245565205812, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 248 |
| }, |
| { |
| "completion_length": 1421.7500610351562, |
| "epoch": 0.2845714285714286, |
| "grad_norm": 0.25874021649360657, |
| "kl": 0.010471343994140625, |
| "learning_rate": 6.31233615362752e-07, |
| "loss": 0.0004, |
| "reward": 1.373143790755421, |
| "reward_std": 0.5737661384046078, |
| "rewards/cosine_scaled_reward": 0.23865519277751446, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 249 |
| }, |
| { |
| "completion_length": 1404.6667022705078, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.6512049436569214, |
| "kl": 0.011989593505859375, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0005, |
| "reward": 0.8663649614900351, |
| "reward_std": 0.6013601124286652, |
| "rewards/cosine_scaled_reward": -0.035567532293498516, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 250 |
| }, |
| { |
| "completion_length": 1101.1250305175781, |
| "epoch": 0.28685714285714287, |
| "grad_norm": 0.40293389558792114, |
| "kl": 0.013065338134765625, |
| "learning_rate": 6.25045936022246e-07, |
| "loss": 0.0005, |
| "reward": 0.7222131416201591, |
| "reward_std": 0.6657759360969067, |
| "rewards/cosine_scaled_reward": -0.07639344967901707, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 251 |
| }, |
| { |
| "completion_length": 1508.4583702087402, |
| "epoch": 0.288, |
| "grad_norm": 0.2395544797182083, |
| "kl": 0.010028839111328125, |
| "learning_rate": 6.219465344613258e-07, |
| "loss": 0.0004, |
| "reward": 0.7061197310686111, |
| "reward_std": 0.5144665259867907, |
| "rewards/cosine_scaled_reward": -0.07402347587049007, |
| "rewards/format_reward": 0.8541666697710752, |
| "step": 252 |
| }, |
| { |
| "completion_length": 1397.2291984558105, |
| "epoch": 0.28914285714285715, |
| "grad_norm": 0.32397717237472534, |
| "kl": 0.013702392578125, |
| "learning_rate": 6.188436263278172e-07, |
| "loss": 0.0005, |
| "reward": 0.6706481170840561, |
| "reward_std": 0.8627937883138657, |
| "rewards/cosine_scaled_reward": -0.09175930079072714, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 253 |
| }, |
| { |
| "completion_length": 1515.0000839233398, |
| "epoch": 0.29028571428571426, |
| "grad_norm": 0.3457793891429901, |
| "kl": 0.010593414306640625, |
| "learning_rate": 6.157373628530852e-07, |
| "loss": 0.0004, |
| "reward": 0.5583736216649413, |
| "reward_std": 0.7457092814147472, |
| "rewards/cosine_scaled_reward": -0.12706319894641638, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 254 |
| }, |
| { |
| "completion_length": 2169.166702270508, |
| "epoch": 0.2914285714285714, |
| "grad_norm": 0.2815548777580261, |
| "kl": 0.0111541748046875, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.0004, |
| "reward": 0.2357357144355774, |
| "reward_std": 0.6888550817966461, |
| "rewards/cosine_scaled_reward": -0.20504882326349616, |
| "rewards/format_reward": 0.6458333358168602, |
| "step": 255 |
| }, |
| { |
| "completion_length": 1219.937515258789, |
| "epoch": 0.2925714285714286, |
| "grad_norm": 0.2393861711025238, |
| "kl": 0.00894927978515625, |
| "learning_rate": 6.095153756157051e-07, |
| "loss": 0.0004, |
| "reward": 1.0311391949653625, |
| "reward_std": 0.6041121408343315, |
| "rewards/cosine_scaled_reward": 0.036402929574251175, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 256 |
| }, |
| { |
| "completion_length": 1972.0000228881836, |
| "epoch": 0.2937142857142857, |
| "grad_norm": 0.18130330741405487, |
| "kl": 0.009616851806640625, |
| "learning_rate": 6.06399955103937e-07, |
| "loss": 0.0004, |
| "reward": 0.9688311172649264, |
| "reward_std": 0.8995218388736248, |
| "rewards/cosine_scaled_reward": 0.10941554605960846, |
| "rewards/format_reward": 0.7500000055879354, |
| "step": 257 |
| }, |
| { |
| "completion_length": 1788.7917098999023, |
| "epoch": 0.2948571428571429, |
| "grad_norm": 0.24320876598358154, |
| "kl": 0.00939178466796875, |
| "learning_rate": 6.032817857379256e-07, |
| "loss": 0.0004, |
| "reward": 0.7169995531439781, |
| "reward_std": 0.7408483605831861, |
| "rewards/cosine_scaled_reward": -0.05816690996289253, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 258 |
| }, |
| { |
| "completion_length": 1297.375015258789, |
| "epoch": 0.296, |
| "grad_norm": 0.30073729157447815, |
| "kl": 0.011287689208984375, |
| "learning_rate": 6.001610194928464e-07, |
| "loss": 0.0005, |
| "reward": 0.8080965355038643, |
| "reward_std": 0.5971200875937939, |
| "rewards/cosine_scaled_reward": -0.0230350773781538, |
| "rewards/format_reward": 0.8541666772216558, |
| "step": 259 |
| }, |
| { |
| "completion_length": 1015.0000190734863, |
| "epoch": 0.29714285714285715, |
| "grad_norm": 0.29709815979003906, |
| "kl": 0.00806427001953125, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0003, |
| "reward": 1.2091553770005703, |
| "reward_std": 0.7044631829485297, |
| "rewards/cosine_scaled_reward": 0.12541100312955678, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 260 |
| }, |
| { |
| "completion_length": 2179.1666870117188, |
| "epoch": 0.29828571428571427, |
| "grad_norm": 0.20508523285388947, |
| "kl": 0.01059722900390625, |
| "learning_rate": 5.939123048916173e-07, |
| "loss": 0.0004, |
| "reward": 0.47496978752315044, |
| "reward_std": 0.5501383896917105, |
| "rewards/cosine_scaled_reward": -0.07501510810106993, |
| "rewards/format_reward": 0.625, |
| "step": 261 |
| }, |
| { |
| "completion_length": 1553.3542175292969, |
| "epoch": 0.29942857142857143, |
| "grad_norm": 0.41438964009284973, |
| "kl": 0.01168060302734375, |
| "learning_rate": 5.907846610890011e-07, |
| "loss": 0.0005, |
| "reward": 0.3764358746702783, |
| "reward_std": 0.6005188822746277, |
| "rewards/cosine_scaled_reward": -0.20761540438979864, |
| "rewards/format_reward": 0.7916666753590107, |
| "step": 262 |
| }, |
| { |
| "completion_length": 1263.2917098999023, |
| "epoch": 0.30057142857142854, |
| "grad_norm": 0.26419222354888916, |
| "kl": 0.007190704345703125, |
| "learning_rate": 5.87655029499542e-07, |
| "loss": 0.0003, |
| "reward": 0.5402636826038361, |
| "reward_std": 0.5199831649661064, |
| "rewards/cosine_scaled_reward": -0.1986181689426303, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 263 |
| }, |
| { |
| "completion_length": 1310.7292404174805, |
| "epoch": 0.3017142857142857, |
| "grad_norm": 0.22596046328544617, |
| "kl": 0.008066177368164062, |
| "learning_rate": 5.845235626570683e-07, |
| "loss": 0.0003, |
| "reward": 0.8366705775260925, |
| "reward_std": 0.7488792147487402, |
| "rewards/cosine_scaled_reward": -0.0504147283063503, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 264 |
| }, |
| { |
| "completion_length": 1436.1458587646484, |
| "epoch": 0.3028571428571429, |
| "grad_norm": 0.24842660129070282, |
| "kl": 0.009662628173828125, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0004, |
| "reward": 1.0776810441166162, |
| "reward_std": 0.6058933921158314, |
| "rewards/cosine_scaled_reward": 0.07009050995111465, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 265 |
| }, |
| { |
| "completion_length": 1513.9167022705078, |
| "epoch": 0.304, |
| "grad_norm": 0.22984397411346436, |
| "kl": 0.0084075927734375, |
| "learning_rate": 5.78255733788191e-07, |
| "loss": 0.0003, |
| "reward": 0.9025900475680828, |
| "reward_std": 0.5533245950937271, |
| "rewards/cosine_scaled_reward": -0.007038334384560585, |
| "rewards/format_reward": 0.9166666679084301, |
| "step": 266 |
| }, |
| { |
| "completion_length": 1806.5000228881836, |
| "epoch": 0.30514285714285716, |
| "grad_norm": 0.34077852964401245, |
| "kl": 0.013782501220703125, |
| "learning_rate": 5.751196772469237e-07, |
| "loss": 0.0006, |
| "reward": 0.22909173252992332, |
| "reward_std": 0.6235861741006374, |
| "rewards/cosine_scaled_reward": -0.2292041452601552, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 267 |
| }, |
| { |
| "completion_length": 1182.5625228881836, |
| "epoch": 0.3062857142857143, |
| "grad_norm": 0.29332637786865234, |
| "kl": 0.01444244384765625, |
| "learning_rate": 5.71982396408026e-07, |
| "loss": 0.0006, |
| "reward": 0.6916986927390099, |
| "reward_std": 0.5937565844506025, |
| "rewards/cosine_scaled_reward": -0.11248400900512934, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 268 |
| }, |
| { |
| "completion_length": 1484.3750228881836, |
| "epoch": 0.30742857142857144, |
| "grad_norm": 0.28696972131729126, |
| "kl": 0.010364532470703125, |
| "learning_rate": 5.688440441781398e-07, |
| "loss": 0.0004, |
| "reward": 0.6794960014522076, |
| "reward_std": 0.5427012406289577, |
| "rewards/cosine_scaled_reward": -0.08733535185456276, |
| "rewards/format_reward": 0.8541666772216558, |
| "step": 269 |
| }, |
| { |
| "completion_length": 1590.645851135254, |
| "epoch": 0.30857142857142855, |
| "grad_norm": 0.22989951074123383, |
| "kl": 0.01123046875, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0004, |
| "reward": 0.8947062492370605, |
| "reward_std": 1.0069415792822838, |
| "rewards/cosine_scaled_reward": 0.0202697841450572, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 270 |
| }, |
| { |
| "completion_length": 1183.0000457763672, |
| "epoch": 0.3097142857142857, |
| "grad_norm": 0.29653027653694153, |
| "kl": 0.0086669921875, |
| "learning_rate": 5.625647374256061e-07, |
| "loss": 0.0003, |
| "reward": 1.1815486252307892, |
| "reward_std": 0.7092056274414062, |
| "rewards/cosine_scaled_reward": 0.11160763050429523, |
| "rewards/format_reward": 0.9583333358168602, |
| "step": 271 |
| }, |
| { |
| "completion_length": 1719.6875457763672, |
| "epoch": 0.31085714285714283, |
| "grad_norm": 0.2264571636915207, |
| "kl": 0.010009765625, |
| "learning_rate": 5.594240889475106e-07, |
| "loss": 0.0004, |
| "reward": 0.6460251696407795, |
| "reward_std": 0.6881718635559082, |
| "rewards/cosine_scaled_reward": -0.0832374356687069, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 272 |
| }, |
| { |
| "completion_length": 1260.9166870117188, |
| "epoch": 0.312, |
| "grad_norm": 0.2590281367301941, |
| "kl": 0.01006317138671875, |
| "learning_rate": 5.562829811526154e-07, |
| "loss": 0.0004, |
| "reward": 1.0608440730720758, |
| "reward_std": 0.49590713158249855, |
| "rewards/cosine_scaled_reward": 0.06167200347408652, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 273 |
| }, |
| { |
| "completion_length": 1077.4583587646484, |
| "epoch": 0.31314285714285717, |
| "grad_norm": 0.29721972346305847, |
| "kl": 0.012844085693359375, |
| "learning_rate": 5.531415671340826e-07, |
| "loss": 0.0005, |
| "reward": 1.1142795570194721, |
| "reward_std": 0.61124661937356, |
| "rewards/cosine_scaled_reward": 0.057139765471220016, |
| "rewards/format_reward": 1.0, |
| "step": 274 |
| }, |
| { |
| "completion_length": 1554.9167175292969, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.23796315491199493, |
| "kl": 0.0102996826171875, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0004, |
| "reward": 0.974131946451962, |
| "reward_std": 0.6788717601448298, |
| "rewards/cosine_scaled_reward": 0.08081597136333585, |
| "rewards/format_reward": 0.8125, |
| "step": 275 |
| }, |
| { |
| "completion_length": 1287.2916870117188, |
| "epoch": 0.31542857142857145, |
| "grad_norm": 0.35233473777770996, |
| "kl": 0.012027740478515625, |
| "learning_rate": 5.468584328659172e-07, |
| "loss": 0.0005, |
| "reward": 1.0395971853286028, |
| "reward_std": 0.770504854619503, |
| "rewards/cosine_scaled_reward": 0.07188189588487148, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 276 |
| }, |
| { |
| "completion_length": 1217.3958549499512, |
| "epoch": 0.31657142857142856, |
| "grad_norm": 0.423069030046463, |
| "kl": 0.01209259033203125, |
| "learning_rate": 5.437170188473847e-07, |
| "loss": 0.0005, |
| "reward": 1.1905713304877281, |
| "reward_std": 0.7023467533290386, |
| "rewards/cosine_scaled_reward": 0.1369522949680686, |
| "rewards/format_reward": 0.9166666865348816, |
| "step": 277 |
| }, |
| { |
| "completion_length": 1309.5417098999023, |
| "epoch": 0.3177142857142857, |
| "grad_norm": 0.28700801730155945, |
| "kl": 0.007678985595703125, |
| "learning_rate": 5.405759110524894e-07, |
| "loss": 0.0003, |
| "reward": 1.08343615103513, |
| "reward_std": 0.47201682440936565, |
| "rewards/cosine_scaled_reward": 0.07296805875375867, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 278 |
| }, |
| { |
| "completion_length": 1422.8958740234375, |
| "epoch": 0.31885714285714284, |
| "grad_norm": 0.48463860154151917, |
| "kl": 0.01280975341796875, |
| "learning_rate": 5.37435262574394e-07, |
| "loss": 0.0005, |
| "reward": 0.8561514802277088, |
| "reward_std": 0.6483948938548565, |
| "rewards/cosine_scaled_reward": -0.04067427571862936, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 279 |
| }, |
| { |
| "completion_length": 1659.3333778381348, |
| "epoch": 0.32, |
| "grad_norm": 0.289205402135849, |
| "kl": 0.016315460205078125, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0007, |
| "reward": 1.3687419444322586, |
| "reward_std": 0.9038915932178497, |
| "rewards/cosine_scaled_reward": 0.27812093193642795, |
| "rewards/format_reward": 0.8125000186264515, |
| "step": 280 |
| }, |
| { |
| "completion_length": 2221.645851135254, |
| "epoch": 0.3211428571428571, |
| "grad_norm": 0.2718718945980072, |
| "kl": 0.015529632568359375, |
| "learning_rate": 5.311559558218603e-07, |
| "loss": 0.0006, |
| "reward": 0.3475176487118006, |
| "reward_std": 0.7998633496463299, |
| "rewards/cosine_scaled_reward": -0.11790786002529785, |
| "rewards/format_reward": 0.5833333414047956, |
| "step": 281 |
| }, |
| { |
| "completion_length": 1343.0625381469727, |
| "epoch": 0.3222857142857143, |
| "grad_norm": 0.28914305567741394, |
| "kl": 0.010540008544921875, |
| "learning_rate": 5.28017603591974e-07, |
| "loss": 0.0004, |
| "reward": 1.117010936141014, |
| "reward_std": 0.6344310864806175, |
| "rewards/cosine_scaled_reward": 0.0897554587572813, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 282 |
| }, |
| { |
| "completion_length": 1945.6250686645508, |
| "epoch": 0.32342857142857145, |
| "grad_norm": 0.1946076899766922, |
| "kl": 0.0126190185546875, |
| "learning_rate": 5.248803227530763e-07, |
| "loss": 0.0005, |
| "reward": 1.111644510179758, |
| "reward_std": 0.7122819889336824, |
| "rewards/cosine_scaled_reward": 0.15998891461640596, |
| "rewards/format_reward": 0.7916666716337204, |
| "step": 283 |
| }, |
| { |
| "completion_length": 1269.0000381469727, |
| "epoch": 0.32457142857142857, |
| "grad_norm": 0.26954689621925354, |
| "kl": 0.006649017333984375, |
| "learning_rate": 5.21744266211809e-07, |
| "loss": 0.0003, |
| "reward": 0.9363188669085503, |
| "reward_std": 0.4825965305790305, |
| "rewards/cosine_scaled_reward": -0.011007236316800117, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 284 |
| }, |
| { |
| "completion_length": 881.7916946411133, |
| "epoch": 0.32571428571428573, |
| "grad_norm": 0.3140123784542084, |
| "kl": 0.01016998291015625, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0004, |
| "reward": 1.102295933291316, |
| "reward_std": 0.8087750803679228, |
| "rewards/cosine_scaled_reward": 0.061564626172184944, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 285 |
| }, |
| { |
| "completion_length": 1551.270881652832, |
| "epoch": 0.32685714285714285, |
| "grad_norm": 0.4618990421295166, |
| "kl": 0.01885223388671875, |
| "learning_rate": 5.154764373429315e-07, |
| "loss": 0.0008, |
| "reward": 0.8153204774716869, |
| "reward_std": 0.8457888886332512, |
| "rewards/cosine_scaled_reward": 0.0014102212153375149, |
| "rewards/format_reward": 0.8125000111758709, |
| "step": 286 |
| }, |
| { |
| "completion_length": 1342.687515258789, |
| "epoch": 0.328, |
| "grad_norm": 0.3015505373477936, |
| "kl": 0.01461029052734375, |
| "learning_rate": 5.123449705004581e-07, |
| "loss": 0.0006, |
| "reward": 0.8659841865301132, |
| "reward_std": 0.7081486638635397, |
| "rewards/cosine_scaled_reward": 0.047575398966728244, |
| "rewards/format_reward": 0.7708333358168602, |
| "step": 287 |
| }, |
| { |
| "completion_length": 1528.8542251586914, |
| "epoch": 0.3291428571428571, |
| "grad_norm": 0.33641308546066284, |
| "kl": 0.014873504638671875, |
| "learning_rate": 5.09215338910999e-07, |
| "loss": 0.0006, |
| "reward": 0.765910281566903, |
| "reward_std": 0.7056640759110451, |
| "rewards/cosine_scaled_reward": -0.07537820562720299, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 288 |
| }, |
| { |
| "completion_length": 1399.9583587646484, |
| "epoch": 0.3302857142857143, |
| "grad_norm": 0.4806149899959564, |
| "kl": 0.017198562622070312, |
| "learning_rate": 5.060876951083828e-07, |
| "loss": 0.0007, |
| "reward": 0.7978585977107286, |
| "reward_std": 0.40477965772151947, |
| "rewards/cosine_scaled_reward": -0.038570704869925976, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 289 |
| }, |
| { |
| "completion_length": 1012.0625228881836, |
| "epoch": 0.3314285714285714, |
| "grad_norm": 0.335376113653183, |
| "kl": 0.009782791137695312, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0004, |
| "reward": 1.1996897123754025, |
| "reward_std": 0.8074947744607925, |
| "rewards/cosine_scaled_reward": 0.09984485851600766, |
| "rewards/format_reward": 1.0, |
| "step": 290 |
| }, |
| { |
| "completion_length": 1318.7500457763672, |
| "epoch": 0.3325714285714286, |
| "grad_norm": 0.24284976720809937, |
| "kl": 0.014574050903320312, |
| "learning_rate": 4.998389805071536e-07, |
| "loss": 0.0006, |
| "reward": 0.9642973355948925, |
| "reward_std": 0.7317453175783157, |
| "rewards/cosine_scaled_reward": 0.034231980331242085, |
| "rewards/format_reward": 0.895833333954215, |
| "step": 291 |
| }, |
| { |
| "completion_length": 1687.9166946411133, |
| "epoch": 0.33371428571428574, |
| "grad_norm": 0.28374290466308594, |
| "kl": 0.017087936401367188, |
| "learning_rate": 4.967182142620745e-07, |
| "loss": 0.0007, |
| "reward": 0.6350435484200716, |
| "reward_std": 0.6143127456307411, |
| "rewards/cosine_scaled_reward": -0.1199782375479117, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 292 |
| }, |
| { |
| "completion_length": 1271.3750381469727, |
| "epoch": 0.33485714285714285, |
| "grad_norm": 0.38170552253723145, |
| "kl": 0.023435592651367188, |
| "learning_rate": 4.93600044896063e-07, |
| "loss": 0.0009, |
| "reward": 0.8408510126173496, |
| "reward_std": 0.6498479042202234, |
| "rewards/cosine_scaled_reward": -0.03790782764554024, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 293 |
| }, |
| { |
| "completion_length": 1540.2708740234375, |
| "epoch": 0.336, |
| "grad_norm": 0.29528045654296875, |
| "kl": 0.012912750244140625, |
| "learning_rate": 4.904846243842949e-07, |
| "loss": 0.0005, |
| "reward": 0.8604166656732559, |
| "reward_std": 0.5733331702649593, |
| "rewards/cosine_scaled_reward": 0.003124975599348545, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 294 |
| }, |
| { |
| "completion_length": 1394.4792098999023, |
| "epoch": 0.33714285714285713, |
| "grad_norm": 0.3902963399887085, |
| "kl": 0.014739990234375, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0006, |
| "reward": 1.0252277310937643, |
| "reward_std": 0.5337934233248234, |
| "rewards/cosine_scaled_reward": 0.06469716504216194, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 295 |
| }, |
| { |
| "completion_length": 1718.8958892822266, |
| "epoch": 0.3382857142857143, |
| "grad_norm": 0.609682559967041, |
| "kl": 0.02099609375, |
| "learning_rate": 4.842626371469149e-07, |
| "loss": 0.0008, |
| "reward": 0.6263405880890787, |
| "reward_std": 0.6606453433632851, |
| "rewards/cosine_scaled_reward": -0.10349638154730201, |
| "rewards/format_reward": 0.833333358168602, |
| "step": 296 |
| }, |
| { |
| "completion_length": 1873.7083892822266, |
| "epoch": 0.3394285714285714, |
| "grad_norm": 0.29421982169151306, |
| "kl": 0.01636505126953125, |
| "learning_rate": 4.811563736721829e-07, |
| "loss": 0.0007, |
| "reward": 0.8774769939482212, |
| "reward_std": 0.8447716347873211, |
| "rewards/cosine_scaled_reward": 0.04290514811873436, |
| "rewards/format_reward": 0.791666679084301, |
| "step": 297 |
| }, |
| { |
| "completion_length": 1471.958381652832, |
| "epoch": 0.3405714285714286, |
| "grad_norm": 0.37182632088661194, |
| "kl": 0.017574310302734375, |
| "learning_rate": 4.780534655386743e-07, |
| "loss": 0.0007, |
| "reward": 0.7408417947590351, |
| "reward_std": 0.7095592878758907, |
| "rewards/cosine_scaled_reward": -0.0462457868270576, |
| "rewards/format_reward": 0.8333333488553762, |
| "step": 298 |
| }, |
| { |
| "completion_length": 1818.8958892822266, |
| "epoch": 0.3417142857142857, |
| "grad_norm": 0.4747914671897888, |
| "kl": 0.03339385986328125, |
| "learning_rate": 4.749540639777539e-07, |
| "loss": 0.0013, |
| "reward": 0.6715311715379357, |
| "reward_std": 0.638892836868763, |
| "rewards/cosine_scaled_reward": -0.02881775365676731, |
| "rewards/format_reward": 0.7291666846722364, |
| "step": 299 |
| }, |
| { |
| "completion_length": 1775.7083740234375, |
| "epoch": 0.34285714285714286, |
| "grad_norm": 0.8008535504341125, |
| "kl": 0.03324127197265625, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0013, |
| "reward": 0.503076022490859, |
| "reward_std": 0.6447076573967934, |
| "rewards/cosine_scaled_reward": -0.08179534040391445, |
| "rewards/format_reward": 0.6666666846722364, |
| "step": 300 |
| }, |
| { |
| "completion_length": 1681.7500457763672, |
| "epoch": 0.344, |
| "grad_norm": 0.6473574042320251, |
| "kl": 0.0345306396484375, |
| "learning_rate": 4.68766384637248e-07, |
| "loss": 0.0014, |
| "reward": 0.47819951456040144, |
| "reward_std": 0.7737650983035564, |
| "rewards/cosine_scaled_reward": -0.16715026053134352, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 301 |
| }, |
| { |
| "completion_length": 1704.6250228881836, |
| "epoch": 0.34514285714285714, |
| "grad_norm": 0.32545921206474304, |
| "kl": 0.03167724609375, |
| "learning_rate": 4.656784084364238e-07, |
| "loss": 0.0013, |
| "reward": 0.9550731834024191, |
| "reward_std": 0.7380726649425924, |
| "rewards/cosine_scaled_reward": 0.11295327357947826, |
| "rewards/format_reward": 0.7291666716337204, |
| "step": 302 |
| }, |
| { |
| "completion_length": 1360.1458740234375, |
| "epoch": 0.3462857142857143, |
| "grad_norm": 0.35140302777290344, |
| "kl": 0.02618408203125, |
| "learning_rate": 4.6259454195101267e-07, |
| "loss": 0.001, |
| "reward": 0.7957173548638821, |
| "reward_std": 0.7143728174269199, |
| "rewards/cosine_scaled_reward": -0.018808012828230858, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 303 |
| }, |
| { |
| "completion_length": 1244.3750381469727, |
| "epoch": 0.3474285714285714, |
| "grad_norm": 0.38197869062423706, |
| "kl": 0.0144195556640625, |
| "learning_rate": 4.59514935484316e-07, |
| "loss": 0.0006, |
| "reward": 0.6606247052550316, |
| "reward_std": 0.5714793428778648, |
| "rewards/cosine_scaled_reward": -0.11760433949530125, |
| "rewards/format_reward": 0.8958333395421505, |
| "step": 304 |
| }, |
| { |
| "completion_length": 1325.6667022705078, |
| "epoch": 0.3485714285714286, |
| "grad_norm": 0.3902123272418976, |
| "kl": 0.016117095947265625, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0006, |
| "reward": 0.6409200690686703, |
| "reward_std": 0.6203774958848953, |
| "rewards/cosine_scaled_reward": -0.1587066389620304, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 305 |
| }, |
| { |
| "completion_length": 1282.3542175292969, |
| "epoch": 0.3497142857142857, |
| "grad_norm": 0.6988428235054016, |
| "kl": 0.026388168334960938, |
| "learning_rate": 4.5336910277482155e-07, |
| "loss": 0.0011, |
| "reward": 1.0727143473923206, |
| "reward_std": 0.6053726552054286, |
| "rewards/cosine_scaled_reward": 0.11969051510095596, |
| "rewards/format_reward": 0.833333333954215, |
| "step": 306 |
| }, |
| { |
| "completion_length": 1716.9583702087402, |
| "epoch": 0.35085714285714287, |
| "grad_norm": 0.36837834119796753, |
| "kl": 0.040142059326171875, |
| "learning_rate": 4.503031760712397e-07, |
| "loss": 0.0016, |
| "reward": 0.8561188094317913, |
| "reward_std": 0.9516715090721846, |
| "rewards/cosine_scaled_reward": 0.011392734944820404, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 307 |
| }, |
| { |
| "completion_length": 2273.4584045410156, |
| "epoch": 0.352, |
| "grad_norm": 0.5795905590057373, |
| "kl": 0.03869056701660156, |
| "learning_rate": 4.4724210845020494e-07, |
| "loss": 0.0015, |
| "reward": 0.7666318230330944, |
| "reward_std": 0.9787959046661854, |
| "rewards/cosine_scaled_reward": -0.002100769430398941, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 308 |
| }, |
| { |
| "completion_length": 2063.3542556762695, |
| "epoch": 0.35314285714285715, |
| "grad_norm": 0.3654479384422302, |
| "kl": 0.05999183654785156, |
| "learning_rate": 4.441860491038345e-07, |
| "loss": 0.0024, |
| "reward": 0.6858750740066171, |
| "reward_std": 0.760811198502779, |
| "rewards/cosine_scaled_reward": -0.0528958085924387, |
| "rewards/format_reward": 0.7916666734963655, |
| "step": 309 |
| }, |
| { |
| "completion_length": 1340.1250305175781, |
| "epoch": 0.35428571428571426, |
| "grad_norm": 0.49283814430236816, |
| "kl": 0.033687591552734375, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0013, |
| "reward": 0.7110094074159861, |
| "reward_std": 0.7351307831704617, |
| "rewards/cosine_scaled_reward": -0.06116198655217886, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 310 |
| }, |
| { |
| "completion_length": 1418.187515258789, |
| "epoch": 0.3554285714285714, |
| "grad_norm": 0.44990670680999756, |
| "kl": 0.04097747802734375, |
| "learning_rate": 4.3808955077581546e-07, |
| "loss": 0.0016, |
| "reward": 1.2232839856296778, |
| "reward_std": 0.9500982649624348, |
| "rewards/cosine_scaled_reward": 0.15330865047872066, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 311 |
| }, |
| { |
| "completion_length": 1471.708381652832, |
| "epoch": 0.3565714285714286, |
| "grad_norm": 0.6391323804855347, |
| "kl": 0.04137420654296875, |
| "learning_rate": 4.350494089288943e-07, |
| "loss": 0.0017, |
| "reward": 1.1680905930697918, |
| "reward_std": 0.37351681664586067, |
| "rewards/cosine_scaled_reward": 0.16737862676382065, |
| "rewards/format_reward": 0.8333333395421505, |
| "step": 312 |
| }, |
| { |
| "completion_length": 2079.6667137145996, |
| "epoch": 0.3577142857142857, |
| "grad_norm": 0.9562482833862305, |
| "kl": 0.07365036010742188, |
| "learning_rate": 4.3201486961161093e-07, |
| "loss": 0.0029, |
| "reward": 0.6807579882442951, |
| "reward_std": 0.8096479428932071, |
| "rewards/cosine_scaled_reward": 0.017462321557104588, |
| "rewards/format_reward": 0.6458333488553762, |
| "step": 313 |
| }, |
| { |
| "completion_length": 1646.2500305175781, |
| "epoch": 0.3588571428571429, |
| "grad_norm": 0.582167387008667, |
| "kl": 0.06851959228515625, |
| "learning_rate": 4.2898608072313045e-07, |
| "loss": 0.0027, |
| "reward": 0.823919128626585, |
| "reward_std": 0.5355783794075251, |
| "rewards/cosine_scaled_reward": 0.0369595680385828, |
| "rewards/format_reward": 0.7500000111758709, |
| "step": 314 |
| }, |
| { |
| "completion_length": 2116.229217529297, |
| "epoch": 0.36, |
| "grad_norm": 0.6685879230499268, |
| "kl": 0.1165008544921875, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0047, |
| "reward": 0.8504741322249174, |
| "reward_std": 0.5491553768515587, |
| "rewards/cosine_scaled_reward": 0.06065371725708246, |
| "rewards/format_reward": 0.7291666734963655, |
| "step": 315 |
| }, |
| { |
| "completion_length": 2242.4375610351562, |
| "epoch": 0.36114285714285715, |
| "grad_norm": 0.7561622262001038, |
| "kl": 0.11125946044921875, |
| "learning_rate": 4.2294634442070553e-07, |
| "loss": 0.0045, |
| "reward": -0.009940480813384056, |
| "reward_std": 0.5052176639437675, |
| "rewards/cosine_scaled_reward": -0.22372024692595005, |
| "rewards/format_reward": 0.43750000931322575, |
| "step": 316 |
| }, |
| { |
| "completion_length": 1841.1875457763672, |
| "epoch": 0.36228571428571427, |
| "grad_norm": 1.0075013637542725, |
| "kl": 0.0679473876953125, |
| "learning_rate": 4.1993569137498776e-07, |
| "loss": 0.0027, |
| "reward": 0.5096511642332189, |
| "reward_std": 0.8513825722038746, |
| "rewards/cosine_scaled_reward": -0.057674430310726166, |
| "rewards/format_reward": 0.625000013038516, |
| "step": 317 |
| }, |
| { |
| "completion_length": 1029.7916870117188, |
| "epoch": 0.36342857142857143, |
| "grad_norm": 0.38250845670700073, |
| "kl": 0.039661407470703125, |
| "learning_rate": 4.1693137748017915e-07, |
| "loss": 0.0016, |
| "reward": 0.8797074742615223, |
| "reward_std": 0.4655795283615589, |
| "rewards/cosine_scaled_reward": -0.0601462684571743, |
| "rewards/format_reward": 1.0, |
| "step": 318 |
| }, |
| { |
| "completion_length": 1612.9167098999023, |
| "epoch": 0.36457142857142855, |
| "grad_norm": 0.7825446724891663, |
| "kl": 0.05585479736328125, |
| "learning_rate": 4.1393354916230005e-07, |
| "loss": 0.0022, |
| "reward": 0.5894158203154802, |
| "reward_std": 0.7856029607355595, |
| "rewards/cosine_scaled_reward": -0.13237543310970068, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 319 |
| }, |
| { |
| "completion_length": 1328.7916831970215, |
| "epoch": 0.3657142857142857, |
| "grad_norm": 0.7866749167442322, |
| "kl": 0.07510757446289062, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.003, |
| "reward": 1.0613620709627867, |
| "reward_std": 0.6303130388259888, |
| "rewards/cosine_scaled_reward": 0.07234767638146877, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 320 |
| }, |
| { |
| "completion_length": 1117.3125457763672, |
| "epoch": 0.3668571428571429, |
| "grad_norm": 0.5064549446105957, |
| "kl": 0.04396820068359375, |
| "learning_rate": 4.079579333738039e-07, |
| "loss": 0.0018, |
| "reward": 1.1430763825774193, |
| "reward_std": 0.6700709462165833, |
| "rewards/cosine_scaled_reward": 0.1340381633490324, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 321 |
| }, |
| { |
| "completion_length": 1828.0417175292969, |
| "epoch": 0.368, |
| "grad_norm": 0.7172619104385376, |
| "kl": 0.133880615234375, |
| "learning_rate": 4.0498043714627006e-07, |
| "loss": 0.0054, |
| "reward": 0.645214811898768, |
| "reward_std": 1.0024632290005684, |
| "rewards/cosine_scaled_reward": -0.04197592940181494, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 322 |
| }, |
| { |
| "completion_length": 1734.2916946411133, |
| "epoch": 0.36914285714285716, |
| "grad_norm": 0.7203247547149658, |
| "kl": 0.1142730712890625, |
| "learning_rate": 4.020100089676376e-07, |
| "loss": 0.0046, |
| "reward": 0.6648443900048733, |
| "reward_std": 0.6985526494681835, |
| "rewards/cosine_scaled_reward": 0.009505534544587135, |
| "rewards/format_reward": 0.6458333414047956, |
| "step": 323 |
| }, |
| { |
| "completion_length": 1589.1667022705078, |
| "epoch": 0.3702857142857143, |
| "grad_norm": 0.8060963153839111, |
| "kl": 0.10352706909179688, |
| "learning_rate": 3.9904679361238526e-07, |
| "loss": 0.0041, |
| "reward": 0.27571332873776555, |
| "reward_std": 0.6113120466470718, |
| "rewards/cosine_scaled_reward": -0.20589334331452847, |
| "rewards/format_reward": 0.6875000167638063, |
| "step": 324 |
| }, |
| { |
| "completion_length": 1642.0625686645508, |
| "epoch": 0.37142857142857144, |
| "grad_norm": 0.603640079498291, |
| "kl": 0.059417724609375, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0024, |
| "reward": 0.9450785778462887, |
| "reward_std": 0.8715181350708008, |
| "rewards/cosine_scaled_reward": 0.045455962885171175, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 325 |
| }, |
| { |
| "completion_length": 1383.208351135254, |
| "epoch": 0.37257142857142855, |
| "grad_norm": 0.8735449314117432, |
| "kl": 0.08353424072265625, |
| "learning_rate": 3.931425787051832e-07, |
| "loss": 0.0033, |
| "reward": 1.0190029181540012, |
| "reward_std": 0.7654371298849583, |
| "rewards/cosine_scaled_reward": 0.07200142601504922, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 326 |
| }, |
| { |
| "completion_length": 1609.208366394043, |
| "epoch": 0.3737142857142857, |
| "grad_norm": 0.40844622254371643, |
| "kl": 0.056499481201171875, |
| "learning_rate": 3.902018669163384e-07, |
| "loss": 0.0023, |
| "reward": 1.1053630914539099, |
| "reward_std": 0.6595817804336548, |
| "rewards/cosine_scaled_reward": 0.1360148610547185, |
| "rewards/format_reward": 0.8333333358168602, |
| "step": 327 |
| }, |
| { |
| "completion_length": 1788.4375534057617, |
| "epoch": 0.37485714285714283, |
| "grad_norm": 0.6934894919395447, |
| "kl": 0.11262893676757812, |
| "learning_rate": 3.872689434630585e-07, |
| "loss": 0.0045, |
| "reward": 0.47454674541950226, |
| "reward_std": 0.7443269528448582, |
| "rewards/cosine_scaled_reward": -0.12730997893959284, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 328 |
| }, |
| { |
| "completion_length": 1103.7083587646484, |
| "epoch": 0.376, |
| "grad_norm": 0.9421954154968262, |
| "kl": 0.05080413818359375, |
| "learning_rate": 3.843439512918949e-07, |
| "loss": 0.002, |
| "reward": 1.2518079336732626, |
| "reward_std": 0.5773205179721117, |
| "rewards/cosine_scaled_reward": 0.15715394588187337, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 329 |
| }, |
| { |
| "completion_length": 1171.9791946411133, |
| "epoch": 0.37714285714285717, |
| "grad_norm": 0.7696932554244995, |
| "kl": 0.10460662841796875, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0042, |
| "reward": 0.5345208197832108, |
| "reward_std": 0.7594005465507507, |
| "rewards/cosine_scaled_reward": -0.15982293151319027, |
| "rewards/format_reward": 0.8541666716337204, |
| "step": 330 |
| }, |
| { |
| "completion_length": 1748.4167098999023, |
| "epoch": 0.3782857142857143, |
| "grad_norm": 1.497854232788086, |
| "kl": 0.12451934814453125, |
| "learning_rate": 3.785183306423767e-07, |
| "loss": 0.005, |
| "reward": 0.5829995409585536, |
| "reward_std": 0.8618629835546017, |
| "rewards/cosine_scaled_reward": -0.0626668983604759, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 331 |
| }, |
| { |
| "completion_length": 1661.1667022705078, |
| "epoch": 0.37942857142857145, |
| "grad_norm": 0.7761502861976624, |
| "kl": 0.08023452758789062, |
| "learning_rate": 3.7561798609655373e-07, |
| "loss": 0.0032, |
| "reward": 0.5321935811080039, |
| "reward_std": 0.5660249888896942, |
| "rewards/cosine_scaled_reward": -0.11931989248842001, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 332 |
| }, |
| { |
| "completion_length": 1305.7500228881836, |
| "epoch": 0.38057142857142856, |
| "grad_norm": 0.7130544185638428, |
| "kl": 0.0748291015625, |
| "learning_rate": 3.72726140684072e-07, |
| "loss": 0.003, |
| "reward": 0.7820851001888514, |
| "reward_std": 0.671183954924345, |
| "rewards/cosine_scaled_reward": -0.06729080062359571, |
| "rewards/format_reward": 0.9166666865348816, |
| "step": 333 |
| }, |
| { |
| "completion_length": 2046.4375610351562, |
| "epoch": 0.38171428571428573, |
| "grad_norm": 0.9568099975585938, |
| "kl": 0.18133544921875, |
| "learning_rate": 3.6984293534939737e-07, |
| "loss": 0.0072, |
| "reward": 0.355112224817276, |
| "reward_std": 0.6977374590933323, |
| "rewards/cosine_scaled_reward": -0.1870272308588028, |
| "rewards/format_reward": 0.7291666828095913, |
| "step": 334 |
| }, |
| { |
| "completion_length": 1462.2292022705078, |
| "epoch": 0.38285714285714284, |
| "grad_norm": 1.2614269256591797, |
| "kl": 0.07938385009765625, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0032, |
| "reward": 0.9349308745004237, |
| "reward_std": 0.6556266993284225, |
| "rewards/cosine_scaled_reward": 0.04038208909332752, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 335 |
| }, |
| { |
| "completion_length": 1641.4375534057617, |
| "epoch": 0.384, |
| "grad_norm": 0.8289753198623657, |
| "kl": 0.11359786987304688, |
| "learning_rate": 3.641030065789562e-07, |
| "loss": 0.0045, |
| "reward": 0.8081842958927155, |
| "reward_std": 0.931253258138895, |
| "rewards/cosine_scaled_reward": 0.08117546886205673, |
| "rewards/format_reward": 0.6458333469927311, |
| "step": 336 |
| }, |
| { |
| "completion_length": 1747.2500534057617, |
| "epoch": 0.3851428571428571, |
| "grad_norm": 1.1650760173797607, |
| "kl": 0.12990570068359375, |
| "learning_rate": 3.612465628992203e-07, |
| "loss": 0.0052, |
| "reward": 0.7834707293659449, |
| "reward_std": 0.8634283617138863, |
| "rewards/cosine_scaled_reward": -0.04576464742422104, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 337 |
| }, |
| { |
| "completion_length": 1280.7708740234375, |
| "epoch": 0.3862857142857143, |
| "grad_norm": 0.5302877426147461, |
| "kl": 0.062366485595703125, |
| "learning_rate": 3.5839931879571725e-07, |
| "loss": 0.0025, |
| "reward": 1.1318869441747665, |
| "reward_std": 0.7834238409996033, |
| "rewards/cosine_scaled_reward": 0.09719344391487539, |
| "rewards/format_reward": 0.9375, |
| "step": 338 |
| }, |
| { |
| "completion_length": 1575.9375381469727, |
| "epoch": 0.38742857142857146, |
| "grad_norm": 3.0021309852600098, |
| "kl": 0.14781951904296875, |
| "learning_rate": 3.555614130391079e-07, |
| "loss": 0.0059, |
| "reward": 0.435087047284469, |
| "reward_std": 0.43532489985227585, |
| "rewards/cosine_scaled_reward": -0.13662315905094147, |
| "rewards/format_reward": 0.7083333563059568, |
| "step": 339 |
| }, |
| { |
| "completion_length": 1496.6250534057617, |
| "epoch": 0.38857142857142857, |
| "grad_norm": 1.7084987163543701, |
| "kl": 0.12076950073242188, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0048, |
| "reward": 0.7966768400510773, |
| "reward_std": 0.5643632663413882, |
| "rewards/cosine_scaled_reward": -0.007911591790616512, |
| "rewards/format_reward": 0.8125000111758709, |
| "step": 340 |
| }, |
| { |
| "completion_length": 1304.9792098999023, |
| "epoch": 0.38971428571428574, |
| "grad_norm": 5.948554039001465, |
| "kl": 0.22769927978515625, |
| "learning_rate": 3.4991416936678276e-07, |
| "loss": 0.0091, |
| "reward": 1.1839299397543073, |
| "reward_std": 0.643942728638649, |
| "rewards/cosine_scaled_reward": 0.175298273563385, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 341 |
| }, |
| { |
| "completion_length": 1663.6875610351562, |
| "epoch": 0.39085714285714285, |
| "grad_norm": 1.0961518287658691, |
| "kl": 0.18028640747070312, |
| "learning_rate": 3.471051066897562e-07, |
| "loss": 0.0072, |
| "reward": 0.8878756612539291, |
| "reward_std": 0.9449864365160465, |
| "rewards/cosine_scaled_reward": 0.06893783865962178, |
| "rewards/format_reward": 0.750000013038516, |
| "step": 342 |
| }, |
| { |
| "completion_length": 1583.0000686645508, |
| "epoch": 0.392, |
| "grad_norm": 1.2760441303253174, |
| "kl": 0.15594482421875, |
| "learning_rate": 3.4430593282358777e-07, |
| "loss": 0.0062, |
| "reward": 0.8547459337860346, |
| "reward_std": 0.6034443583339453, |
| "rewards/cosine_scaled_reward": 0.0002896404330385849, |
| "rewards/format_reward": 0.8541666939854622, |
| "step": 343 |
| }, |
| { |
| "completion_length": 1706.4375457763672, |
| "epoch": 0.3931428571428571, |
| "grad_norm": 1.06587553024292, |
| "kl": 0.281158447265625, |
| "learning_rate": 3.4151678419606233e-07, |
| "loss": 0.0113, |
| "reward": 1.1242186180315912, |
| "reward_std": 0.6546860057860613, |
| "rewards/cosine_scaled_reward": 0.17669263062998652, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 344 |
| }, |
| { |
| "completion_length": 1465.4583587646484, |
| "epoch": 0.3942857142857143, |
| "grad_norm": 1.2281720638275146, |
| "kl": 0.1438140869140625, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0058, |
| "reward": 0.9402041547000408, |
| "reward_std": 0.6865711808204651, |
| "rewards/cosine_scaled_reward": 0.011768726049922407, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 345 |
| }, |
| { |
| "completion_length": 1591.0833740234375, |
| "epoch": 0.3954285714285714, |
| "grad_norm": 0.8186521530151367, |
| "kl": 0.10266876220703125, |
| "learning_rate": 3.359691059183761e-07, |
| "loss": 0.0041, |
| "reward": 0.6260932851582766, |
| "reward_std": 0.5609011054039001, |
| "rewards/cosine_scaled_reward": -0.15570336702512577, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 346 |
| }, |
| { |
| "completion_length": 1574.2083892822266, |
| "epoch": 0.3965714285714286, |
| "grad_norm": 0.7894213795661926, |
| "kl": 0.1241302490234375, |
| "learning_rate": 3.3321084665422803e-07, |
| "loss": 0.005, |
| "reward": 0.5836004763841629, |
| "reward_std": 0.6906316690146923, |
| "rewards/cosine_scaled_reward": -0.15611644479213282, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 347 |
| }, |
| { |
| "completion_length": 1624.1667022705078, |
| "epoch": 0.3977142857142857, |
| "grad_norm": 1.4086353778839111, |
| "kl": 0.22664642333984375, |
| "learning_rate": 3.3046315338757026e-07, |
| "loss": 0.0091, |
| "reward": 0.7276029635686427, |
| "reward_std": 0.6486309953033924, |
| "rewards/cosine_scaled_reward": -0.04244852438569069, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 348 |
| }, |
| { |
| "completion_length": 1256.1875228881836, |
| "epoch": 0.39885714285714285, |
| "grad_norm": 1.4020756483078003, |
| "kl": 0.1046600341796875, |
| "learning_rate": 3.2772616003709616e-07, |
| "loss": 0.0042, |
| "reward": 0.5811667609959841, |
| "reward_std": 0.7433434501290321, |
| "rewards/cosine_scaled_reward": -0.09483329905197024, |
| "rewards/format_reward": 0.7708333469927311, |
| "step": 349 |
| }, |
| { |
| "completion_length": 1057.7291946411133, |
| "epoch": 0.4, |
| "grad_norm": 0.9083346128463745, |
| "kl": 0.0474395751953125, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0019, |
| "reward": 0.8259809445589781, |
| "reward_std": 0.7229878939688206, |
| "rewards/cosine_scaled_reward": -0.06617620773613453, |
| "rewards/format_reward": 0.9583333358168602, |
| "step": 350 |
| }, |
| { |
| "completion_length": 1219.6458740234375, |
| "epoch": 0.40114285714285713, |
| "grad_norm": 1.6471617221832275, |
| "kl": 0.12386322021484375, |
| "learning_rate": 3.222848061454764e-07, |
| "loss": 0.0049, |
| "reward": 0.7785749807953835, |
| "reward_std": 0.8159589394927025, |
| "rewards/cosine_scaled_reward": -0.0377958663739264, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 351 |
| }, |
| { |
| "completion_length": 1422.1875534057617, |
| "epoch": 0.4022857142857143, |
| "grad_norm": 1.3077303171157837, |
| "kl": 0.2323760986328125, |
| "learning_rate": 3.195807108082429e-07, |
| "loss": 0.0093, |
| "reward": 0.9625241123139858, |
| "reward_std": 0.8021371066570282, |
| "rewards/cosine_scaled_reward": 0.06459538266062737, |
| "rewards/format_reward": 0.8333333469927311, |
| "step": 352 |
| }, |
| { |
| "completion_length": 1283.333351135254, |
| "epoch": 0.4034285714285714, |
| "grad_norm": 1.1020612716674805, |
| "kl": 0.125701904296875, |
| "learning_rate": 3.168878457820915e-07, |
| "loss": 0.005, |
| "reward": 1.0344783924520016, |
| "reward_std": 0.8279353678226471, |
| "rewards/cosine_scaled_reward": 0.05890584830194712, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 353 |
| }, |
| { |
| "completion_length": 1127.6667022705078, |
| "epoch": 0.4045714285714286, |
| "grad_norm": 1.216342568397522, |
| "kl": 0.12079620361328125, |
| "learning_rate": 3.142063423134644e-07, |
| "loss": 0.0048, |
| "reward": 1.099512368440628, |
| "reward_std": 0.5979024097323418, |
| "rewards/cosine_scaled_reward": 0.0914228311739862, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 354 |
| }, |
| { |
| "completion_length": 1001.6458549499512, |
| "epoch": 0.4057142857142857, |
| "grad_norm": 0.9325054287910461, |
| "kl": 0.059314727783203125, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0024, |
| "reward": 1.2166800498962402, |
| "reward_std": 0.675847515463829, |
| "rewards/cosine_scaled_reward": 0.12917334213852882, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 355 |
| }, |
| { |
| "completion_length": 1459.2292251586914, |
| "epoch": 0.40685714285714286, |
| "grad_norm": 2.4190444946289062, |
| "kl": 0.2304840087890625, |
| "learning_rate": 3.0887794225945143e-07, |
| "loss": 0.0092, |
| "reward": 0.4463031552731991, |
| "reward_std": 0.6662746425718069, |
| "rewards/cosine_scaled_reward": -0.17268177028745413, |
| "rewards/format_reward": 0.7916666865348816, |
| "step": 356 |
| }, |
| { |
| "completion_length": 1725.6875457763672, |
| "epoch": 0.408, |
| "grad_norm": 2.1936678886413574, |
| "kl": 0.295989990234375, |
| "learning_rate": 3.062313053727671e-07, |
| "loss": 0.0118, |
| "reward": 0.3668034356087446, |
| "reward_std": 0.5282239988446236, |
| "rewards/cosine_scaled_reward": -0.26451496221125126, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 357 |
| }, |
| { |
| "completion_length": 1472.312515258789, |
| "epoch": 0.40914285714285714, |
| "grad_norm": 1.637854814529419, |
| "kl": 0.2021331787109375, |
| "learning_rate": 3.0359654942835247e-07, |
| "loss": 0.0081, |
| "reward": 1.2237652689218521, |
| "reward_std": 0.6800592541694641, |
| "rewards/cosine_scaled_reward": 0.1535492818802595, |
| "rewards/format_reward": 0.9166666716337204, |
| "step": 358 |
| }, |
| { |
| "completion_length": 929.5416793823242, |
| "epoch": 0.4102857142857143, |
| "grad_norm": 1.4736838340759277, |
| "kl": 0.1471099853515625, |
| "learning_rate": 3.0097380284049523e-07, |
| "loss": 0.0059, |
| "reward": 0.8410445712506771, |
| "reward_std": 0.5730547439306974, |
| "rewards/cosine_scaled_reward": -0.06906107859686017, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 359 |
| }, |
| { |
| "completion_length": 1209.8125457763672, |
| "epoch": 0.4114285714285714, |
| "grad_norm": 2.1532845497131348, |
| "kl": 0.35944366455078125, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0144, |
| "reward": 0.769681841135025, |
| "reward_std": 0.8142721727490425, |
| "rewards/cosine_scaled_reward": -0.06307575106620789, |
| "rewards/format_reward": 0.895833358168602, |
| "step": 360 |
| }, |
| { |
| "completion_length": 1173.6041946411133, |
| "epoch": 0.4125714285714286, |
| "grad_norm": 2.002321481704712, |
| "kl": 0.2322998046875, |
| "learning_rate": 2.9576484845877793e-07, |
| "loss": 0.0093, |
| "reward": 0.8964769653975964, |
| "reward_std": 0.7383791692554951, |
| "rewards/cosine_scaled_reward": -0.041344886645674706, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 361 |
| }, |
| { |
| "completion_length": 1014.0625305175781, |
| "epoch": 0.4137142857142857, |
| "grad_norm": 1.2043986320495605, |
| "kl": 0.2470703125, |
| "learning_rate": 2.931788945420058e-07, |
| "loss": 0.0099, |
| "reward": 1.00144612044096, |
| "reward_std": 0.47917743772268295, |
| "rewards/cosine_scaled_reward": 0.03197303228080273, |
| "rewards/format_reward": 0.9375, |
| "step": 362 |
| }, |
| { |
| "completion_length": 896.8541870117188, |
| "epoch": 0.41485714285714287, |
| "grad_norm": 1.703730583190918, |
| "kl": 0.1836700439453125, |
| "learning_rate": 2.9060545772359305e-07, |
| "loss": 0.0074, |
| "reward": 1.373911328613758, |
| "reward_std": 0.7581704575568438, |
| "rewards/cosine_scaled_reward": 0.21820560842752457, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 363 |
| }, |
| { |
| "completion_length": 1474.4167251586914, |
| "epoch": 0.416, |
| "grad_norm": 2.327479839324951, |
| "kl": 0.5921630859375, |
| "learning_rate": 2.8804466342921987e-07, |
| "loss": 0.0237, |
| "reward": 0.48526396974921227, |
| "reward_std": 0.5558421425521374, |
| "rewards/cosine_scaled_reward": -0.20528469607234, |
| "rewards/format_reward": 0.895833358168602, |
| "step": 364 |
| }, |
| { |
| "completion_length": 1866.6875610351562, |
| "epoch": 0.41714285714285715, |
| "grad_norm": 2.2852516174316406, |
| "kl": 0.80853271484375, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0324, |
| "reward": 0.7523469850420952, |
| "reward_std": 0.6609731055796146, |
| "rewards/cosine_scaled_reward": 0.03242346830666065, |
| "rewards/format_reward": 0.6875000111758709, |
| "step": 365 |
| }, |
| { |
| "completion_length": 1085.2083740234375, |
| "epoch": 0.41828571428571426, |
| "grad_norm": 2.795609951019287, |
| "kl": 0.29659271240234375, |
| "learning_rate": 2.829615010283344e-07, |
| "loss": 0.0119, |
| "reward": 0.9424293376505375, |
| "reward_std": 0.763162437826395, |
| "rewards/cosine_scaled_reward": 0.023297980427742004, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 366 |
| }, |
| { |
| "completion_length": 1567.9167175292969, |
| "epoch": 0.41942857142857143, |
| "grad_norm": 1.7122759819030762, |
| "kl": 0.5317840576171875, |
| "learning_rate": 2.8043938066798645e-07, |
| "loss": 0.0213, |
| "reward": 0.8573304824531078, |
| "reward_std": 0.7073214678093791, |
| "rewards/cosine_scaled_reward": 0.01199856773018837, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 367 |
| }, |
| { |
| "completion_length": 1578.2083549499512, |
| "epoch": 0.4205714285714286, |
| "grad_norm": 4.118416786193848, |
| "kl": 0.380859375, |
| "learning_rate": 2.7793039831193133e-07, |
| "loss": 0.0152, |
| "reward": 0.5520367622375488, |
| "reward_std": 0.651011124253273, |
| "rewards/cosine_scaled_reward": -0.13023164262995124, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 368 |
| }, |
| { |
| "completion_length": 1570.8333702087402, |
| "epoch": 0.4217142857142857, |
| "grad_norm": 3.169609785079956, |
| "kl": 0.53338623046875, |
| "learning_rate": 2.7543467624442956e-07, |
| "loss": 0.0213, |
| "reward": 0.7706317435950041, |
| "reward_std": 0.9069001339375973, |
| "rewards/cosine_scaled_reward": -0.020934134838171303, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 369 |
| }, |
| { |
| "completion_length": 1200.7500228881836, |
| "epoch": 0.4228571428571429, |
| "grad_norm": 1.968066930770874, |
| "kl": 0.415557861328125, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0166, |
| "reward": 0.761927604675293, |
| "reward_std": 0.5050818659365177, |
| "rewards/cosine_scaled_reward": -0.05653620883822441, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 370 |
| }, |
| { |
| "completion_length": 638.2708549499512, |
| "epoch": 0.424, |
| "grad_norm": 1.2254022359848022, |
| "kl": 0.0458984375, |
| "learning_rate": 2.7048349887476037e-07, |
| "loss": 0.0018, |
| "reward": 1.2654454093426466, |
| "reward_std": 0.4487613644450903, |
| "rewards/cosine_scaled_reward": 0.13272269815206528, |
| "rewards/format_reward": 1.0, |
| "step": 371 |
| }, |
| { |
| "completion_length": 1610.770896911621, |
| "epoch": 0.42514285714285716, |
| "grad_norm": 1.422675371170044, |
| "kl": 0.4040679931640625, |
| "learning_rate": 2.6802828488599294e-07, |
| "loss": 0.0162, |
| "reward": 0.9225452449172735, |
| "reward_std": 0.8723856098949909, |
| "rewards/cosine_scaled_reward": 0.03418927453458309, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 372 |
| }, |
| { |
| "completion_length": 916.0416870117188, |
| "epoch": 0.42628571428571427, |
| "grad_norm": 4.1732659339904785, |
| "kl": 0.23648834228515625, |
| "learning_rate": 2.655868138008171e-07, |
| "loss": 0.0095, |
| "reward": 0.753505942877382, |
| "reward_std": 0.5106633007526398, |
| "rewards/cosine_scaled_reward": -0.1024136976338923, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 373 |
| }, |
| { |
| "completion_length": 1235.333396911621, |
| "epoch": 0.42742857142857144, |
| "grad_norm": 0.9994511008262634, |
| "kl": 0.1927032470703125, |
| "learning_rate": 2.631592046130896e-07, |
| "loss": 0.0077, |
| "reward": 1.0162720493972301, |
| "reward_std": 0.4486389271914959, |
| "rewards/cosine_scaled_reward": 0.03938601026311517, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 374 |
| }, |
| { |
| "completion_length": 1341.0625305175781, |
| "epoch": 0.42857142857142855, |
| "grad_norm": 2.8108980655670166, |
| "kl": 0.382843017578125, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0153, |
| "reward": 0.9311719611287117, |
| "reward_std": 0.6638787761330605, |
| "rewards/cosine_scaled_reward": 0.038502639159560204, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 375 |
| }, |
| { |
| "completion_length": 1282.5000534057617, |
| "epoch": 0.4297142857142857, |
| "grad_norm": 1.4323093891143799, |
| "kl": 0.21431732177734375, |
| "learning_rate": 2.583460445215911e-07, |
| "loss": 0.0086, |
| "reward": 0.8344197571277618, |
| "reward_std": 0.7551028467714787, |
| "rewards/cosine_scaled_reward": -0.04112347261980176, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 376 |
| }, |
| { |
| "completion_length": 1487.5625495910645, |
| "epoch": 0.4308571428571429, |
| "grad_norm": 2.224385976791382, |
| "kl": 0.4652557373046875, |
| "learning_rate": 2.5596072820445254e-07, |
| "loss": 0.0186, |
| "reward": 0.49483706802129745, |
| "reward_std": 0.7973055504262447, |
| "rewards/cosine_scaled_reward": -0.13799815066158772, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 377 |
| }, |
| { |
| "completion_length": 1158.9791851043701, |
| "epoch": 0.432, |
| "grad_norm": 1.5131416320800781, |
| "kl": 0.283233642578125, |
| "learning_rate": 2.5358974294659373e-07, |
| "loss": 0.0113, |
| "reward": 1.1355108618736267, |
| "reward_std": 0.7891764938831329, |
| "rewards/cosine_scaled_reward": 0.1198387467302382, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 378 |
| }, |
| { |
| "completion_length": 1687.2292213439941, |
| "epoch": 0.43314285714285716, |
| "grad_norm": 2.515700340270996, |
| "kl": 0.5447998046875, |
| "learning_rate": 2.512332043064913e-07, |
| "loss": 0.0218, |
| "reward": 0.488259082660079, |
| "reward_std": 0.6213226802647114, |
| "rewards/cosine_scaled_reward": -0.16212046705186367, |
| "rewards/format_reward": 0.8125000186264515, |
| "step": 379 |
| }, |
| { |
| "completion_length": 1145.1250343322754, |
| "epoch": 0.4342857142857143, |
| "grad_norm": 2.491877555847168, |
| "kl": 0.34210205078125, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0137, |
| "reward": 0.6510128956288099, |
| "reward_std": 0.764140423387289, |
| "rewards/cosine_scaled_reward": -0.09116021171212196, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 380 |
| }, |
| { |
| "completion_length": 1576.9791946411133, |
| "epoch": 0.43542857142857144, |
| "grad_norm": 2.5349020957946777, |
| "kl": 0.4387664794921875, |
| "learning_rate": 2.465639255873246e-07, |
| "loss": 0.0176, |
| "reward": 0.45536880288273096, |
| "reward_std": 0.6865225993096828, |
| "rewards/cosine_scaled_reward": -0.17856561671942472, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 381 |
| }, |
| { |
| "completion_length": 1223.1667022705078, |
| "epoch": 0.43657142857142855, |
| "grad_norm": 3.171522378921509, |
| "kl": 0.3052978515625, |
| "learning_rate": 2.4425141308231765e-07, |
| "loss": 0.0122, |
| "reward": 0.47218240704387426, |
| "reward_std": 0.6197453737258911, |
| "rewards/cosine_scaled_reward": -0.20140881277620792, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 382 |
| }, |
| { |
| "completion_length": 1093.8750343322754, |
| "epoch": 0.4377142857142857, |
| "grad_norm": 1.554950475692749, |
| "kl": 0.307342529296875, |
| "learning_rate": 2.4195380233209006e-07, |
| "loss": 0.0123, |
| "reward": 1.3121628165245056, |
| "reward_std": 0.7447218038141727, |
| "rewards/cosine_scaled_reward": 0.1873313980177045, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 383 |
| }, |
| { |
| "completion_length": 1102.0833778381348, |
| "epoch": 0.43885714285714283, |
| "grad_norm": 3.9803340435028076, |
| "kl": 0.32332611083984375, |
| "learning_rate": 2.3967120531894857e-07, |
| "loss": 0.0129, |
| "reward": 1.2209933521226048, |
| "reward_std": 0.9811634235084057, |
| "rewards/cosine_scaled_reward": 0.1834133416414261, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 384 |
| }, |
| { |
| "completion_length": 1301.0625381469727, |
| "epoch": 0.44, |
| "grad_norm": 1.6203845739364624, |
| "kl": 0.3782196044921875, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0151, |
| "reward": 0.6873492915183306, |
| "reward_std": 0.7231754045933485, |
| "rewards/cosine_scaled_reward": -0.09382536727935076, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 385 |
| }, |
| { |
| "completion_length": 1025.8125305175781, |
| "epoch": 0.44114285714285717, |
| "grad_norm": 2.561079978942871, |
| "kl": 0.351043701171875, |
| "learning_rate": 2.3515149676898552e-07, |
| "loss": 0.014, |
| "reward": 1.1378429010510445, |
| "reward_std": 0.8463675826787949, |
| "rewards/cosine_scaled_reward": 0.14183809887617826, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 386 |
| }, |
| { |
| "completion_length": 1460.0417175292969, |
| "epoch": 0.4422857142857143, |
| "grad_norm": 3.437443494796753, |
| "kl": 0.76416015625, |
| "learning_rate": 2.3291460551638237e-07, |
| "loss": 0.0306, |
| "reward": 0.6168055031448603, |
| "reward_std": 0.6576487310230732, |
| "rewards/cosine_scaled_reward": -0.1290972474962473, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 387 |
| }, |
| { |
| "completion_length": 1255.8958740234375, |
| "epoch": 0.44342857142857145, |
| "grad_norm": 3.211789846420288, |
| "kl": 0.52313232421875, |
| "learning_rate": 2.306931685585657e-07, |
| "loss": 0.0209, |
| "reward": 0.8223461015149951, |
| "reward_std": 0.6507496982812881, |
| "rewards/cosine_scaled_reward": -0.015910295769572258, |
| "rewards/format_reward": 0.8541666939854622, |
| "step": 388 |
| }, |
| { |
| "completion_length": 1253.7917022705078, |
| "epoch": 0.44457142857142856, |
| "grad_norm": 5.535946846008301, |
| "kl": 0.503692626953125, |
| "learning_rate": 2.2848729416523859e-07, |
| "loss": 0.0201, |
| "reward": 0.7248962745070457, |
| "reward_std": 0.5912930071353912, |
| "rewards/cosine_scaled_reward": -0.07505187718197703, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 389 |
| }, |
| { |
| "completion_length": 1278.0417022705078, |
| "epoch": 0.44571428571428573, |
| "grad_norm": 3.257326364517212, |
| "kl": 0.6408615112304688, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0256, |
| "reward": 0.6384771540760994, |
| "reward_std": 0.7342811785638332, |
| "rewards/cosine_scaled_reward": -0.11826143972575665, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 390 |
| }, |
| { |
| "completion_length": 1312.6042022705078, |
| "epoch": 0.44685714285714284, |
| "grad_norm": 5.830494403839111, |
| "kl": 1.293975830078125, |
| "learning_rate": 2.2412266235313973e-07, |
| "loss": 0.0519, |
| "reward": 0.8241331037133932, |
| "reward_std": 0.9150289222598076, |
| "rewards/cosine_scaled_reward": 0.016233190894126892, |
| "rewards/format_reward": 0.791666679084301, |
| "step": 391 |
| }, |
| { |
| "completion_length": 1468.937557220459, |
| "epoch": 0.448, |
| "grad_norm": 1.9504032135009766, |
| "kl": 0.7166290283203125, |
| "learning_rate": 2.2196411766036487e-07, |
| "loss": 0.0287, |
| "reward": 0.5725310668349266, |
| "reward_std": 0.7125820200890303, |
| "rewards/cosine_scaled_reward": -0.1512344698421657, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 392 |
| }, |
| { |
| "completion_length": 1273.1250267028809, |
| "epoch": 0.4491428571428571, |
| "grad_norm": 6.959710597991943, |
| "kl": 1.0224609375, |
| "learning_rate": 2.1982156097370557e-07, |
| "loss": 0.0409, |
| "reward": 0.8387455176562071, |
| "reward_std": 0.8648201785981655, |
| "rewards/cosine_scaled_reward": 0.013122743383519264, |
| "rewards/format_reward": 0.8125000204890966, |
| "step": 393 |
| }, |
| { |
| "completion_length": 1102.4583854675293, |
| "epoch": 0.4502857142857143, |
| "grad_norm": 1.945556402206421, |
| "kl": 0.54296875, |
| "learning_rate": 2.1769509671835223e-07, |
| "loss": 0.0217, |
| "reward": 0.32192777935415506, |
| "reward_std": 0.5174819845706224, |
| "rewards/cosine_scaled_reward": -0.24528613314032555, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 394 |
| }, |
| { |
| "completion_length": 937.8958549499512, |
| "epoch": 0.4514285714285714, |
| "grad_norm": 5.496992588043213, |
| "kl": 0.67437744140625, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.027, |
| "reward": 0.9939035659190267, |
| "reward_std": 0.7032170053571463, |
| "rewards/cosine_scaled_reward": 0.049035104806534946, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 395 |
| }, |
| { |
| "completion_length": 987.7292022705078, |
| "epoch": 0.45257142857142857, |
| "grad_norm": 4.864074230194092, |
| "kl": 0.49527740478515625, |
| "learning_rate": 2.134908592756607e-07, |
| "loss": 0.0198, |
| "reward": 0.7741489135660231, |
| "reward_std": 0.5997458174824715, |
| "rewards/cosine_scaled_reward": -0.06084221974015236, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 396 |
| }, |
| { |
| "completion_length": 1146.437515258789, |
| "epoch": 0.45371428571428574, |
| "grad_norm": 4.235805511474609, |
| "kl": 0.8509521484375, |
| "learning_rate": 2.1141329099692406e-07, |
| "loss": 0.034, |
| "reward": 0.8896873220801353, |
| "reward_std": 0.6099906638264656, |
| "rewards/cosine_scaled_reward": -0.04473969340324402, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 397 |
| }, |
| { |
| "completion_length": 1264.4167022705078, |
| "epoch": 0.45485714285714285, |
| "grad_norm": 2.6962742805480957, |
| "kl": 0.738128662109375, |
| "learning_rate": 2.0935222495670968e-07, |
| "loss": 0.0296, |
| "reward": 0.5152244120836258, |
| "reward_std": 0.8842574842274189, |
| "rewards/cosine_scaled_reward": -0.14863780653104186, |
| "rewards/format_reward": 0.8125000186264515, |
| "step": 398 |
| }, |
| { |
| "completion_length": 1035.7292022705078, |
| "epoch": 0.456, |
| "grad_norm": 2.1586830615997314, |
| "kl": 0.4624786376953125, |
| "learning_rate": 2.0730776160846853e-07, |
| "loss": 0.0185, |
| "reward": 0.9472854379564524, |
| "reward_std": 0.6391190886497498, |
| "rewards/cosine_scaled_reward": 0.025726054795086384, |
| "rewards/format_reward": 0.8958333358168602, |
| "step": 399 |
| }, |
| { |
| "completion_length": 913.2500343322754, |
| "epoch": 0.45714285714285713, |
| "grad_norm": 3.6302056312561035, |
| "kl": 0.26483154296875, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0106, |
| "reward": 1.4475815817713737, |
| "reward_std": 0.7946172095835209, |
| "rewards/cosine_scaled_reward": 0.26545744470786303, |
| "rewards/format_reward": 0.916666679084301, |
| "step": 400 |
| }, |
| { |
| "completion_length": 1466.1667175292969, |
| "epoch": 0.4582857142857143, |
| "grad_norm": 3.406836986541748, |
| "kl": 0.783599853515625, |
| "learning_rate": 2.032690407508949e-07, |
| "loss": 0.0314, |
| "reward": 0.6450461961794645, |
| "reward_std": 0.541100338101387, |
| "rewards/cosine_scaled_reward": -0.1149769127368927, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 401 |
| }, |
| { |
| "completion_length": 1107.4375228881836, |
| "epoch": 0.4594285714285714, |
| "grad_norm": 5.733443737030029, |
| "kl": 0.68023681640625, |
| "learning_rate": 2.0127498008311922e-07, |
| "loss": 0.0272, |
| "reward": 0.7256474066525698, |
| "reward_std": 0.6739897206425667, |
| "rewards/cosine_scaled_reward": -0.07467629760503769, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 402 |
| }, |
| { |
| "completion_length": 961.5000381469727, |
| "epoch": 0.4605714285714286, |
| "grad_norm": 2.457207441329956, |
| "kl": 0.1911468505859375, |
| "learning_rate": 1.9929791578083655e-07, |
| "loss": 0.0076, |
| "reward": 1.1732404585927725, |
| "reward_std": 0.589294470846653, |
| "rewards/cosine_scaled_reward": 0.08662020694464445, |
| "rewards/format_reward": 1.0, |
| "step": 403 |
| }, |
| { |
| "completion_length": 1151.1667175292969, |
| "epoch": 0.4617142857142857, |
| "grad_norm": 4.529026508331299, |
| "kl": 0.5990447998046875, |
| "learning_rate": 1.9733794420337213e-07, |
| "loss": 0.024, |
| "reward": 0.7381823370233178, |
| "reward_std": 0.6230232007801533, |
| "rewards/cosine_scaled_reward": -0.04757549986243248, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 404 |
| }, |
| { |
| "completion_length": 1068.1667251586914, |
| "epoch": 0.46285714285714286, |
| "grad_norm": 7.2086262702941895, |
| "kl": 0.4109954833984375, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0164, |
| "reward": 1.3128372617065907, |
| "reward_std": 0.8704881519079208, |
| "rewards/cosine_scaled_reward": 0.20850196853280067, |
| "rewards/format_reward": 0.895833358168602, |
| "step": 405 |
| }, |
| { |
| "completion_length": 1452.145851135254, |
| "epoch": 0.464, |
| "grad_norm": 6.928232192993164, |
| "kl": 1.5478515625, |
| "learning_rate": 1.934696604901642e-07, |
| "loss": 0.062, |
| "reward": 0.8689113333821297, |
| "reward_std": 1.0427627116441727, |
| "rewards/cosine_scaled_reward": 0.017788991099223495, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 406 |
| }, |
| { |
| "completion_length": 1271.6041984558105, |
| "epoch": 0.46514285714285714, |
| "grad_norm": 2.267540216445923, |
| "kl": 0.8037109375, |
| "learning_rate": 1.915615368891117e-07, |
| "loss": 0.0321, |
| "reward": 0.8099214821122587, |
| "reward_std": 0.6622566767036915, |
| "rewards/cosine_scaled_reward": -0.0012892577797174454, |
| "rewards/format_reward": 0.8125000111758709, |
| "step": 407 |
| }, |
| { |
| "completion_length": 1401.8333625793457, |
| "epoch": 0.4662857142857143, |
| "grad_norm": 2.2123162746429443, |
| "kl": 0.6725006103515625, |
| "learning_rate": 1.8967088307307e-07, |
| "loss": 0.0269, |
| "reward": 1.0615033656358719, |
| "reward_std": 0.8494626618921757, |
| "rewards/cosine_scaled_reward": 0.10366834327578545, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 408 |
| }, |
| { |
| "completion_length": 1703.6875610351562, |
| "epoch": 0.4674285714285714, |
| "grad_norm": 3.3070085048675537, |
| "kl": 1.30078125, |
| "learning_rate": 1.8779779118983867e-07, |
| "loss": 0.052, |
| "reward": 0.682604375295341, |
| "reward_std": 0.7020121356472373, |
| "rewards/cosine_scaled_reward": -0.0649478193372488, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 409 |
| }, |
| { |
| "completion_length": 1503.2500343322754, |
| "epoch": 0.4685714285714286, |
| "grad_norm": 7.976802349090576, |
| "kl": 1.97076416015625, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0788, |
| "reward": 0.46726590394973755, |
| "reward_std": 0.8286273218691349, |
| "rewards/cosine_scaled_reward": -0.07886704918928444, |
| "rewards/format_reward": 0.6250000260770321, |
| "step": 410 |
| }, |
| { |
| "completion_length": 1640.1458778381348, |
| "epoch": 0.4697142857142857, |
| "grad_norm": 3.6351277828216553, |
| "kl": 0.9295196533203125, |
| "learning_rate": 1.8410465752883758e-07, |
| "loss": 0.0372, |
| "reward": 0.6482336856424809, |
| "reward_std": 0.7643021754920483, |
| "rewards/cosine_scaled_reward": -0.0821331706829369, |
| "rewards/format_reward": 0.8125000186264515, |
| "step": 411 |
| }, |
| { |
| "completion_length": 1004.1458587646484, |
| "epoch": 0.47085714285714286, |
| "grad_norm": 2.904054641723633, |
| "kl": 0.413330078125, |
| "learning_rate": 1.822847957491922e-07, |
| "loss": 0.0165, |
| "reward": 0.9712292104959488, |
| "reward_std": 0.9014318101108074, |
| "rewards/cosine_scaled_reward": 0.048114595003426075, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 412 |
| }, |
| { |
| "completion_length": 1190.9583740234375, |
| "epoch": 0.472, |
| "grad_norm": 2.611903667449951, |
| "kl": 0.674652099609375, |
| "learning_rate": 1.804828558898332e-07, |
| "loss": 0.027, |
| "reward": 0.8645992483943701, |
| "reward_std": 0.8248982280492783, |
| "rewards/cosine_scaled_reward": 0.005216277204453945, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 413 |
| }, |
| { |
| "completion_length": 1804.0000228881836, |
| "epoch": 0.47314285714285714, |
| "grad_norm": 3.9129860401153564, |
| "kl": 1.2784423828125, |
| "learning_rate": 1.7869892577476722e-07, |
| "loss": 0.0512, |
| "reward": 0.5207854583859444, |
| "reward_std": 0.8007166758179665, |
| "rewards/cosine_scaled_reward": -0.10419062164146453, |
| "rewards/format_reward": 0.7291666753590107, |
| "step": 414 |
| }, |
| { |
| "completion_length": 1269.1667022705078, |
| "epoch": 0.4742857142857143, |
| "grad_norm": 6.273374080657959, |
| "kl": 0.8948516845703125, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": 0.0358, |
| "reward": 0.6741074454039335, |
| "reward_std": 0.7499744538217783, |
| "rewards/cosine_scaled_reward": -0.05877961404621601, |
| "rewards/format_reward": 0.7916666865348816, |
| "step": 415 |
| }, |
| { |
| "completion_length": 1129.3125534057617, |
| "epoch": 0.4754285714285714, |
| "grad_norm": 1.7560322284698486, |
| "kl": 0.2455902099609375, |
| "learning_rate": 1.7518544168045524e-07, |
| "loss": 0.0098, |
| "reward": 1.1211753264069557, |
| "reward_std": 0.7526138452813029, |
| "rewards/cosine_scaled_reward": 0.07100430876016617, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 416 |
| }, |
| { |
| "completion_length": 1839.1875762939453, |
| "epoch": 0.4765714285714286, |
| "grad_norm": 4.114261627197266, |
| "kl": 1.20947265625, |
| "learning_rate": 1.7345605894346726e-07, |
| "loss": 0.0484, |
| "reward": 0.6709015071392059, |
| "reward_std": 0.874784380197525, |
| "rewards/cosine_scaled_reward": -0.049965920858085155, |
| "rewards/format_reward": 0.7708333544433117, |
| "step": 417 |
| }, |
| { |
| "completion_length": 896.5625305175781, |
| "epoch": 0.4777142857142857, |
| "grad_norm": 4.675693511962891, |
| "kl": 0.6626663208007812, |
| "learning_rate": 1.7174502842694212e-07, |
| "loss": 0.0265, |
| "reward": 1.091725105419755, |
| "reward_std": 0.7902912050485611, |
| "rewards/cosine_scaled_reward": 0.13961253920570016, |
| "rewards/format_reward": 0.8125000186264515, |
| "step": 418 |
| }, |
| { |
| "completion_length": 1493.6250457763672, |
| "epoch": 0.47885714285714287, |
| "grad_norm": 2.2568399906158447, |
| "kl": 0.5430755615234375, |
| "learning_rate": 1.7005243352409333e-07, |
| "loss": 0.0217, |
| "reward": 0.8138027191162109, |
| "reward_std": 0.8418096788227558, |
| "rewards/cosine_scaled_reward": -0.04101531347259879, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 419 |
| }, |
| { |
| "completion_length": 1029.1875228881836, |
| "epoch": 0.48, |
| "grad_norm": 3.587470293045044, |
| "kl": 0.508331298828125, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0204, |
| "reward": 0.7593546295538545, |
| "reward_std": 0.41525958105921745, |
| "rewards/cosine_scaled_reward": -0.057822706177830696, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 420 |
| }, |
| { |
| "completion_length": 1540.0833625793457, |
| "epoch": 0.48114285714285715, |
| "grad_norm": 2.360013484954834, |
| "kl": 0.88397216796875, |
| "learning_rate": 1.6672287963562852e-07, |
| "loss": 0.0354, |
| "reward": 0.4944930747151375, |
| "reward_std": 0.6833166517317295, |
| "rewards/cosine_scaled_reward": -0.19025347288697958, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 421 |
| }, |
| { |
| "completion_length": 1501.7292556762695, |
| "epoch": 0.48228571428571426, |
| "grad_norm": 2.635981798171997, |
| "kl": 0.740234375, |
| "learning_rate": 1.6508608292777203e-07, |
| "loss": 0.0296, |
| "reward": 0.9262944171205163, |
| "reward_std": 0.754150040447712, |
| "rewards/cosine_scaled_reward": 0.025647209025919437, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 422 |
| }, |
| { |
| "completion_length": 1964.8750762939453, |
| "epoch": 0.48342857142857143, |
| "grad_norm": 4.063174247741699, |
| "kl": 1.255462646484375, |
| "learning_rate": 1.6346804638120098e-07, |
| "loss": 0.0503, |
| "reward": 0.3909978587180376, |
| "reward_std": 0.8100734055042267, |
| "rewards/cosine_scaled_reward": -0.13783442322164774, |
| "rewards/format_reward": 0.6666666865348816, |
| "step": 423 |
| }, |
| { |
| "completion_length": 1720.937515258789, |
| "epoch": 0.4845714285714286, |
| "grad_norm": 3.836009979248047, |
| "kl": 0.89697265625, |
| "learning_rate": 1.6186884885673413e-07, |
| "loss": 0.0358, |
| "reward": 0.29376337490975857, |
| "reward_std": 0.5368039496243, |
| "rewards/cosine_scaled_reward": -0.2489516567438841, |
| "rewards/format_reward": 0.7916666865348816, |
| "step": 424 |
| }, |
| { |
| "completion_length": 1298.8125495910645, |
| "epoch": 0.4857142857142857, |
| "grad_norm": 1.9228633642196655, |
| "kl": 0.457244873046875, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": 0.0183, |
| "reward": 1.3913816176354885, |
| "reward_std": 0.6065534967929125, |
| "rewards/cosine_scaled_reward": 0.2477741353213787, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 425 |
| }, |
| { |
| "completion_length": 973.8541870117188, |
| "epoch": 0.4868571428571429, |
| "grad_norm": 2.929983615875244, |
| "kl": 0.32680511474609375, |
| "learning_rate": 1.5872728172265146e-07, |
| "loss": 0.0131, |
| "reward": 0.8921321593225002, |
| "reward_std": 0.4427370298653841, |
| "rewards/cosine_scaled_reward": -0.04351727291941643, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 426 |
| }, |
| { |
| "completion_length": 1596.7708740234375, |
| "epoch": 0.488, |
| "grad_norm": 3.161161422729492, |
| "kl": 0.484588623046875, |
| "learning_rate": 1.5718506522858572e-07, |
| "loss": 0.0194, |
| "reward": 0.973305675201118, |
| "reward_std": 0.8661460429430008, |
| "rewards/cosine_scaled_reward": 0.06998615153133869, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 427 |
| }, |
| { |
| "completion_length": 1573.3125610351562, |
| "epoch": 0.48914285714285716, |
| "grad_norm": 2.7298057079315186, |
| "kl": 0.6797714233398438, |
| "learning_rate": 1.5566199398026147e-07, |
| "loss": 0.0272, |
| "reward": 0.833369608346402, |
| "reward_std": 0.8424164094030857, |
| "rewards/cosine_scaled_reward": -0.020815202966332436, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 428 |
| }, |
| { |
| "completion_length": 972.3958435058594, |
| "epoch": 0.49028571428571427, |
| "grad_norm": 3.271073579788208, |
| "kl": 0.5417633056640625, |
| "learning_rate": 1.5415814221002265e-07, |
| "loss": 0.0216, |
| "reward": 0.6053271126002073, |
| "reward_std": 0.7208438105881214, |
| "rewards/cosine_scaled_reward": -0.15566978510469198, |
| "rewards/format_reward": 0.9166666865348816, |
| "step": 429 |
| }, |
| { |
| "completion_length": 1340.9167098999023, |
| "epoch": 0.49142857142857144, |
| "grad_norm": 4.3969879150390625, |
| "kl": 0.57080078125, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0228, |
| "reward": 0.9481127467006445, |
| "reward_std": 0.8026427961885929, |
| "rewards/cosine_scaled_reward": 0.0365563714876771, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 430 |
| }, |
| { |
| "completion_length": 1008.6250267028809, |
| "epoch": 0.49257142857142855, |
| "grad_norm": 2.9247236251831055, |
| "kl": 0.3984832763671875, |
| "learning_rate": 1.5120838934595337e-07, |
| "loss": 0.016, |
| "reward": 0.7810343587771058, |
| "reward_std": 0.6039218865334988, |
| "rewards/cosine_scaled_reward": -0.06781616434454918, |
| "rewards/format_reward": 0.9166666865348816, |
| "step": 431 |
| }, |
| { |
| "completion_length": 1682.0417022705078, |
| "epoch": 0.4937142857142857, |
| "grad_norm": 3.6658926010131836, |
| "kl": 0.94122314453125, |
| "learning_rate": 1.4976263201891613e-07, |
| "loss": 0.0376, |
| "reward": 0.37208714336156845, |
| "reward_std": 0.7206946834921837, |
| "rewards/cosine_scaled_reward": -0.17853978439234197, |
| "rewards/format_reward": 0.7291666846722364, |
| "step": 432 |
| }, |
| { |
| "completion_length": 1573.8125610351562, |
| "epoch": 0.4948571428571429, |
| "grad_norm": 6.710811138153076, |
| "kl": 0.98748779296875, |
| "learning_rate": 1.483363816965435e-07, |
| "loss": 0.0395, |
| "reward": 0.84003546833992, |
| "reward_std": 0.6314461715519428, |
| "rewards/cosine_scaled_reward": 0.0033510662615299225, |
| "rewards/format_reward": 0.833333358168602, |
| "step": 433 |
| }, |
| { |
| "completion_length": 1738.7292175292969, |
| "epoch": 0.496, |
| "grad_norm": 4.559843063354492, |
| "kl": 1.0400390625, |
| "learning_rate": 1.469297078922642e-07, |
| "loss": 0.0415, |
| "reward": 0.4310444425791502, |
| "reward_std": 0.6072977893054485, |
| "rewards/cosine_scaled_reward": -0.15947778831468895, |
| "rewards/format_reward": 0.7500000223517418, |
| "step": 434 |
| }, |
| { |
| "completion_length": 935.7708511352539, |
| "epoch": 0.49714285714285716, |
| "grad_norm": 1.8925296068191528, |
| "kl": 0.5841522216796875, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.0234, |
| "reward": 0.6217253655195236, |
| "reward_std": 0.5926484689116478, |
| "rewards/cosine_scaled_reward": -0.1578873231774196, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 435 |
| }, |
| { |
| "completion_length": 1107.6875495910645, |
| "epoch": 0.4982857142857143, |
| "grad_norm": 2.4335594177246094, |
| "kl": 0.6590576171875, |
| "learning_rate": 1.4417536311769885e-07, |
| "loss": 0.0264, |
| "reward": 1.1079479418694973, |
| "reward_std": 0.7582566514611244, |
| "rewards/cosine_scaled_reward": 0.11647394431201974, |
| "rewards/format_reward": 0.8750000111758709, |
| "step": 436 |
| }, |
| { |
| "completion_length": 1200.9375381469727, |
| "epoch": 0.49942857142857144, |
| "grad_norm": 3.8739376068115234, |
| "kl": 0.56396484375, |
| "learning_rate": 1.4282782639029128e-07, |
| "loss": 0.0225, |
| "reward": 0.7988617792725563, |
| "reward_std": 0.6751260980963707, |
| "rewards/cosine_scaled_reward": -0.04848578106611967, |
| "rewards/format_reward": 0.895833358168602, |
| "step": 437 |
| }, |
| { |
| "completion_length": 1821.7708892822266, |
| "epoch": 0.5005714285714286, |
| "grad_norm": 3.8501853942871094, |
| "kl": 1.15753173828125, |
| "learning_rate": 1.4150013466019114e-07, |
| "loss": 0.0462, |
| "reward": 0.45935247000306845, |
| "reward_std": 0.6324943359941244, |
| "rewards/cosine_scaled_reward": -0.16615711338818073, |
| "rewards/format_reward": 0.7916666846722364, |
| "step": 438 |
| }, |
| { |
| "completion_length": 1447.7916793823242, |
| "epoch": 0.5017142857142857, |
| "grad_norm": 13.093454360961914, |
| "kl": 1.1785888671875, |
| "learning_rate": 1.4019235263722034e-07, |
| "loss": 0.0471, |
| "reward": 0.6338995918631554, |
| "reward_std": 0.6636701337993145, |
| "rewards/cosine_scaled_reward": -0.12055021477863193, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 439 |
| }, |
| { |
| "completion_length": 1387.4792022705078, |
| "epoch": 0.5028571428571429, |
| "grad_norm": 4.15952205657959, |
| "kl": 1.02880859375, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0411, |
| "reward": 0.5482727251946926, |
| "reward_std": 0.6047806814312935, |
| "rewards/cosine_scaled_reward": -0.15294698532670736, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 440 |
| }, |
| { |
| "completion_length": 1395.708381652832, |
| "epoch": 0.504, |
| "grad_norm": 3.0667598247528076, |
| "kl": 0.73974609375, |
| "learning_rate": 1.3763677169699217e-07, |
| "loss": 0.0296, |
| "reward": 0.9216288132593036, |
| "reward_std": 0.7844664789736271, |
| "rewards/cosine_scaled_reward": 0.04414770007133484, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 441 |
| }, |
| { |
| "completion_length": 1071.7917022705078, |
| "epoch": 0.5051428571428571, |
| "grad_norm": 2.833597421646118, |
| "kl": 0.763397216796875, |
| "learning_rate": 1.3638909733514452e-07, |
| "loss": 0.0305, |
| "reward": 0.886376628652215, |
| "reward_std": 0.8642673939466476, |
| "rewards/cosine_scaled_reward": 0.02652162907179445, |
| "rewards/format_reward": 0.8333333469927311, |
| "step": 442 |
| }, |
| { |
| "completion_length": 1647.8542404174805, |
| "epoch": 0.5062857142857143, |
| "grad_norm": 2.424894332885742, |
| "kl": 1.24749755859375, |
| "learning_rate": 1.351615817851748e-07, |
| "loss": 0.0499, |
| "reward": 0.5579078826121986, |
| "reward_std": 0.6206741183996201, |
| "rewards/cosine_scaled_reward": -0.12729605846107006, |
| "rewards/format_reward": 0.8125000111758709, |
| "step": 443 |
| }, |
| { |
| "completion_length": 1236.9166946411133, |
| "epoch": 0.5074285714285715, |
| "grad_norm": 4.010989665985107, |
| "kl": 0.55133056640625, |
| "learning_rate": 1.3395428487445914e-07, |
| "loss": 0.0221, |
| "reward": 0.581955180503428, |
| "reward_std": 0.7338661774992943, |
| "rewards/cosine_scaled_reward": -0.1569390781223774, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 444 |
| }, |
| { |
| "completion_length": 1259.9166793823242, |
| "epoch": 0.5085714285714286, |
| "grad_norm": 2.583313465118408, |
| "kl": 0.74591064453125, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0298, |
| "reward": 0.5723795741796494, |
| "reward_std": 0.788989819586277, |
| "rewards/cosine_scaled_reward": -0.13047689152881503, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 445 |
| }, |
| { |
| "completion_length": 1332.9792098999023, |
| "epoch": 0.5097142857142857, |
| "grad_norm": 16.281204223632812, |
| "kl": 1.068878173828125, |
| "learning_rate": 1.316005813502869e-07, |
| "loss": 0.0428, |
| "reward": 0.7542771156877279, |
| "reward_std": 0.7587186098098755, |
| "rewards/cosine_scaled_reward": -0.008278121706098318, |
| "rewards/format_reward": 0.770833358168602, |
| "step": 446 |
| }, |
| { |
| "completion_length": 1513.3958930969238, |
| "epoch": 0.5108571428571429, |
| "grad_norm": 3.205157518386841, |
| "kl": 1.26171875, |
| "learning_rate": 1.3045428945301953e-07, |
| "loss": 0.0505, |
| "reward": 0.7465504482388496, |
| "reward_std": 0.57644097879529, |
| "rewards/cosine_scaled_reward": -0.04339144751429558, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 447 |
| }, |
| { |
| "completion_length": 1335.083366394043, |
| "epoch": 0.512, |
| "grad_norm": 3.9793076515197754, |
| "kl": 1.251495361328125, |
| "learning_rate": 1.2932844562179352e-07, |
| "loss": 0.0501, |
| "reward": 0.6771020290179877, |
| "reward_std": 0.7358664702624083, |
| "rewards/cosine_scaled_reward": -0.026032326743006706, |
| "rewards/format_reward": 0.729166679084301, |
| "step": 448 |
| }, |
| { |
| "completion_length": 1115.187515258789, |
| "epoch": 0.5131428571428571, |
| "grad_norm": 3.228578805923462, |
| "kl": 0.47408294677734375, |
| "learning_rate": 1.2822310472864885e-07, |
| "loss": 0.0189, |
| "reward": 0.422957434784621, |
| "reward_std": 0.5728885792195797, |
| "rewards/cosine_scaled_reward": -0.22602130100131035, |
| "rewards/format_reward": 0.8750000223517418, |
| "step": 449 |
| }, |
| { |
| "completion_length": 1225.1458740234375, |
| "epoch": 0.5142857142857142, |
| "grad_norm": 2.933077335357666, |
| "kl": 0.64776611328125, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0259, |
| "reward": 0.790303866029717, |
| "reward_std": 0.622036661952734, |
| "rewards/cosine_scaled_reward": -0.04234808124601841, |
| "rewards/format_reward": 0.8750000149011612, |
| "step": 450 |
| }, |
| { |
| "completion_length": 1283.5208892822266, |
| "epoch": 0.5154285714285715, |
| "grad_norm": 4.34015417098999, |
| "kl": 0.707855224609375, |
| "learning_rate": 1.260741462457165e-07, |
| "loss": 0.0283, |
| "reward": 0.7061118334531784, |
| "reward_std": 0.6641687378287315, |
| "rewards/cosine_scaled_reward": -0.06361076328903437, |
| "rewards/format_reward": 0.833333358168602, |
| "step": 451 |
| }, |
| { |
| "completion_length": 1390.333366394043, |
| "epoch": 0.5165714285714286, |
| "grad_norm": 3.791003942489624, |
| "kl": 0.733154296875, |
| "learning_rate": 1.2503063339313356e-07, |
| "loss": 0.0294, |
| "reward": 0.8051364235579967, |
| "reward_std": 0.9408066868782043, |
| "rewards/cosine_scaled_reward": -0.014098492218181491, |
| "rewards/format_reward": 0.8333333507180214, |
| "step": 452 |
| }, |
| { |
| "completion_length": 1164.7916984558105, |
| "epoch": 0.5177142857142857, |
| "grad_norm": 2.2949931621551514, |
| "kl": 0.513427734375, |
| "learning_rate": 1.2400783294793668e-07, |
| "loss": 0.0205, |
| "reward": 0.8233426138758659, |
| "reward_std": 0.5944336298853159, |
| "rewards/cosine_scaled_reward": -0.06749536748975515, |
| "rewards/format_reward": 0.9583333432674408, |
| "step": 453 |
| }, |
| { |
| "completion_length": 1308.4167022705078, |
| "epoch": 0.5188571428571429, |
| "grad_norm": 3.0252280235290527, |
| "kl": 0.7433929443359375, |
| "learning_rate": 1.2300579475997657e-07, |
| "loss": 0.0298, |
| "reward": 0.5274054184556007, |
| "reward_std": 0.6061475053429604, |
| "rewards/cosine_scaled_reward": -0.14254729636013508, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 454 |
| }, |
| { |
| "completion_length": 1739.1458702087402, |
| "epoch": 0.52, |
| "grad_norm": 4.987695217132568, |
| "kl": 1.2000732421875, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.048, |
| "reward": 0.2408284079283476, |
| "reward_std": 0.6372124627232552, |
| "rewards/cosine_scaled_reward": -0.27541913744062185, |
| "rewards/format_reward": 0.7916666865348816, |
| "step": 455 |
| }, |
| { |
| "completion_length": 1663.187557220459, |
| "epoch": 0.5211428571428571, |
| "grad_norm": 3.197329044342041, |
| "kl": 0.941253662109375, |
| "learning_rate": 1.2106419949317388e-07, |
| "loss": 0.0376, |
| "reward": 0.47842175513505936, |
| "reward_std": 0.7478823028504848, |
| "rewards/cosine_scaled_reward": -0.15662246476858854, |
| "rewards/format_reward": 0.791666679084301, |
| "step": 456 |
| }, |
| { |
| "completion_length": 1417.3542289733887, |
| "epoch": 0.5222857142857142, |
| "grad_norm": 2.964496612548828, |
| "kl": 0.938079833984375, |
| "learning_rate": 1.2012473704494537e-07, |
| "loss": 0.0375, |
| "reward": 0.6622494223265676, |
| "reward_std": 0.5692349448800087, |
| "rewards/cosine_scaled_reward": -0.03345862403512001, |
| "rewards/format_reward": 0.7291666828095913, |
| "step": 457 |
| }, |
| { |
| "completion_length": 1527.2916946411133, |
| "epoch": 0.5234285714285715, |
| "grad_norm": 4.469851493835449, |
| "kl": 1.12396240234375, |
| "learning_rate": 1.1920622611056974e-07, |
| "loss": 0.0449, |
| "reward": 0.5578488986939192, |
| "reward_std": 0.693300411105156, |
| "rewards/cosine_scaled_reward": -0.10649222880601883, |
| "rewards/format_reward": 0.7708333469927311, |
| "step": 458 |
| }, |
| { |
| "completion_length": 1051.7708740234375, |
| "epoch": 0.5245714285714286, |
| "grad_norm": 3.3350539207458496, |
| "kl": 0.466766357421875, |
| "learning_rate": 1.1830871145697412e-07, |
| "loss": 0.0187, |
| "reward": 0.9517181403934956, |
| "reward_std": 0.8380660861730576, |
| "rewards/cosine_scaled_reward": 0.0071090515702962875, |
| "rewards/format_reward": 0.9375000074505806, |
| "step": 459 |
| }, |
| { |
| "completion_length": 1639.1250610351562, |
| "epoch": 0.5257142857142857, |
| "grad_norm": 7.242498874664307, |
| "kl": 1.00091552734375, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.04, |
| "reward": 0.3730992656201124, |
| "reward_std": 0.726275160908699, |
| "rewards/cosine_scaled_reward": -0.16761704441159964, |
| "rewards/format_reward": 0.7083333432674408, |
| "step": 460 |
| }, |
| { |
| "completion_length": 1452.6250305175781, |
| "epoch": 0.5268571428571428, |
| "grad_norm": 3.2622721195220947, |
| "kl": 1.1546630859375, |
| "learning_rate": 1.1657684494105386e-07, |
| "loss": 0.0461, |
| "reward": 0.8234957940876484, |
| "reward_std": 0.6311179846525192, |
| "rewards/cosine_scaled_reward": -0.0049187901604454964, |
| "rewards/format_reward": 0.8333333488553762, |
| "step": 461 |
| }, |
| { |
| "completion_length": 1285.4166946411133, |
| "epoch": 0.528, |
| "grad_norm": 3.8556363582611084, |
| "kl": 0.8689117431640625, |
| "learning_rate": 1.1574257748745986e-07, |
| "loss": 0.0348, |
| "reward": 0.4587779585272074, |
| "reward_std": 0.6436084322631359, |
| "rewards/cosine_scaled_reward": -0.17686102783773094, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 462 |
| }, |
| { |
| "completion_length": 1802.8125381469727, |
| "epoch": 0.5291428571428571, |
| "grad_norm": 5.279976844787598, |
| "kl": 0.799560546875, |
| "learning_rate": 1.1492947512799328e-07, |
| "loss": 0.032, |
| "reward": 0.6291724583134055, |
| "reward_std": 0.8651260025799274, |
| "rewards/cosine_scaled_reward": -0.04999712225981057, |
| "rewards/format_reward": 0.7291666939854622, |
| "step": 463 |
| }, |
| { |
| "completion_length": 1047.145851135254, |
| "epoch": 0.5302857142857142, |
| "grad_norm": 2.4638097286224365, |
| "kl": 0.605743408203125, |
| "learning_rate": 1.1413757749211602e-07, |
| "loss": 0.0242, |
| "reward": 1.302383467555046, |
| "reward_std": 0.4087425358593464, |
| "rewards/cosine_scaled_reward": 0.1616083886474371, |
| "rewards/format_reward": 0.9791666716337204, |
| "step": 464 |
| }, |
| { |
| "completion_length": 1426.8542022705078, |
| "epoch": 0.5314285714285715, |
| "grad_norm": 2.563239336013794, |
| "kl": 0.8856201171875, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0354, |
| "reward": 0.514048907905817, |
| "reward_std": 0.7893399521708488, |
| "rewards/cosine_scaled_reward": -0.14922555815428495, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 465 |
| }, |
| { |
| "completion_length": 1289.5625381469727, |
| "epoch": 0.5325714285714286, |
| "grad_norm": 2.4319827556610107, |
| "kl": 0.561614990234375, |
| "learning_rate": 1.1261754973965422e-07, |
| "loss": 0.0225, |
| "reward": 1.006679143756628, |
| "reward_std": 0.8896586894989014, |
| "rewards/cosine_scaled_reward": 0.045006227446720004, |
| "rewards/format_reward": 0.9166666865348816, |
| "step": 466 |
| }, |
| { |
| "completion_length": 1679.6666870117188, |
| "epoch": 0.5337142857142857, |
| "grad_norm": 3.1560564041137695, |
| "kl": 1.129669189453125, |
| "learning_rate": 1.1188949370707787e-07, |
| "loss": 0.0453, |
| "reward": 0.34113773331046104, |
| "reward_std": 0.6508898884057999, |
| "rewards/cosine_scaled_reward": -0.19401447381824255, |
| "rewards/format_reward": 0.7291666846722364, |
| "step": 467 |
| }, |
| { |
| "completion_length": 1565.2083778381348, |
| "epoch": 0.5348571428571428, |
| "grad_norm": 181.5196990966797, |
| "kl": 7.007781982421875, |
| "learning_rate": 1.1118279056249653e-07, |
| "loss": 0.2808, |
| "reward": 0.6900721359997988, |
| "reward_std": 0.8350103311240673, |
| "rewards/cosine_scaled_reward": -0.07163060246966779, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 468 |
| }, |
| { |
| "completion_length": 1385.6458892822266, |
| "epoch": 0.536, |
| "grad_norm": 3.4314520359039307, |
| "kl": 0.941375732421875, |
| "learning_rate": 1.1049747474962444e-07, |
| "loss": 0.0377, |
| "reward": 0.5675666080787778, |
| "reward_std": 0.6128499489277601, |
| "rewards/cosine_scaled_reward": -0.12246670690365136, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 469 |
| }, |
| { |
| "completion_length": 1621.4375305175781, |
| "epoch": 0.5371428571428571, |
| "grad_norm": 4.546476364135742, |
| "kl": 1.376953125, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0551, |
| "reward": 0.30739592137979344, |
| "reward_std": 0.725900623947382, |
| "rewards/cosine_scaled_reward": -0.15880204178392887, |
| "rewards/format_reward": 0.625000013038516, |
| "step": 470 |
| }, |
| { |
| "completion_length": 1291.0208587646484, |
| "epoch": 0.5382857142857143, |
| "grad_norm": 2.4413740634918213, |
| "kl": 0.525848388671875, |
| "learning_rate": 1.0919113768029517e-07, |
| "loss": 0.021, |
| "reward": 0.7892501968890429, |
| "reward_std": 0.7734898887574673, |
| "rewards/cosine_scaled_reward": -0.03245823457837105, |
| "rewards/format_reward": 0.8541666939854622, |
| "step": 471 |
| }, |
| { |
| "completion_length": 1400.1667022705078, |
| "epoch": 0.5394285714285715, |
| "grad_norm": 3.6039347648620605, |
| "kl": 0.58624267578125, |
| "learning_rate": 1.0857018009286381e-07, |
| "loss": 0.0234, |
| "reward": 0.3975699208676815, |
| "reward_std": 0.6745503656566143, |
| "rewards/cosine_scaled_reward": -0.22829839028418064, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 472 |
| }, |
| { |
| "completion_length": 1205.7500381469727, |
| "epoch": 0.5405714285714286, |
| "grad_norm": 2.652244806289673, |
| "kl": 0.7998046875, |
| "learning_rate": 1.0797073717209013e-07, |
| "loss": 0.032, |
| "reward": 0.5399867701344192, |
| "reward_std": 0.6538946256041527, |
| "rewards/cosine_scaled_reward": -0.17792330123484135, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 473 |
| }, |
| { |
| "completion_length": 1451.083351135254, |
| "epoch": 0.5417142857142857, |
| "grad_norm": 4.507306098937988, |
| "kl": 0.9193878173828125, |
| "learning_rate": 1.0739283813397639e-07, |
| "loss": 0.0368, |
| "reward": 1.300966864451766, |
| "reward_std": 0.745520330965519, |
| "rewards/cosine_scaled_reward": 0.2546501159667969, |
| "rewards/format_reward": 0.7916666828095913, |
| "step": 474 |
| }, |
| { |
| "completion_length": 1491.0000686645508, |
| "epoch": 0.5428571428571428, |
| "grad_norm": 2.09909987449646, |
| "kl": 0.9030914306640625, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0361, |
| "reward": 0.6428233720362186, |
| "reward_std": 0.8779257349669933, |
| "rewards/cosine_scaled_reward": -0.06400499166920781, |
| "rewards/format_reward": 0.7708333469927311, |
| "step": 475 |
| }, |
| { |
| "completion_length": 1502.0208740234375, |
| "epoch": 0.544, |
| "grad_norm": 3.001065969467163, |
| "kl": 0.85467529296875, |
| "learning_rate": 1.063017833182728e-07, |
| "loss": 0.0342, |
| "reward": 0.9168616086244583, |
| "reward_std": 0.9465513862669468, |
| "rewards/cosine_scaled_reward": 0.020930795930325985, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 476 |
| }, |
| { |
| "completion_length": 1313.4791946411133, |
| "epoch": 0.5451428571428572, |
| "grad_norm": 4.386512756347656, |
| "kl": 0.9460906982421875, |
| "learning_rate": 1.0578868071715544e-07, |
| "loss": 0.0379, |
| "reward": 0.8154587242752314, |
| "reward_std": 0.8511052504181862, |
| "rewards/cosine_scaled_reward": 0.022312658838927746, |
| "rewards/format_reward": 0.7708333544433117, |
| "step": 477 |
| }, |
| { |
| "completion_length": 1496.3125305175781, |
| "epoch": 0.5462857142857143, |
| "grad_norm": 3.565664768218994, |
| "kl": 0.8566741943359375, |
| "learning_rate": 1.0529722834905125e-07, |
| "loss": 0.0342, |
| "reward": 0.8052469007670879, |
| "reward_std": 0.7007171474397182, |
| "rewards/cosine_scaled_reward": -0.003626542165875435, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 478 |
| }, |
| { |
| "completion_length": 1708.4167098999023, |
| "epoch": 0.5474285714285714, |
| "grad_norm": 2.6371641159057617, |
| "kl": 1.12823486328125, |
| "learning_rate": 1.0482745016665526e-07, |
| "loss": 0.0451, |
| "reward": 0.5828341413289309, |
| "reward_std": 0.6526912562549114, |
| "rewards/cosine_scaled_reward": -0.1148329358547926, |
| "rewards/format_reward": 0.8125000074505806, |
| "step": 479 |
| }, |
| { |
| "completion_length": 1512.4166946411133, |
| "epoch": 0.5485714285714286, |
| "grad_norm": 4.293049335479736, |
| "kl": 0.984771728515625, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0394, |
| "reward": 0.8314212337136269, |
| "reward_std": 0.5310982428491116, |
| "rewards/cosine_scaled_reward": -0.03220607154071331, |
| "rewards/format_reward": 0.895833358168602, |
| "step": 480 |
| }, |
| { |
| "completion_length": 1671.5417022705078, |
| "epoch": 0.5497142857142857, |
| "grad_norm": 5.241688251495361, |
| "kl": 0.995025634765625, |
| "learning_rate": 1.0395300688680625e-07, |
| "loss": 0.0397, |
| "reward": 0.5629774704575539, |
| "reward_std": 0.6768053583800793, |
| "rewards/cosine_scaled_reward": -0.1351779391989112, |
| "rewards/format_reward": 0.8333333432674408, |
| "step": 481 |
| }, |
| { |
| "completion_length": 1726.7500267028809, |
| "epoch": 0.5508571428571428, |
| "grad_norm": 4.3753886222839355, |
| "kl": 1.4754638671875, |
| "learning_rate": 1.0354838440848501e-07, |
| "loss": 0.059, |
| "reward": 0.5926886834204197, |
| "reward_std": 0.6844369061291218, |
| "rewards/cosine_scaled_reward": -0.01615567714907229, |
| "rewards/format_reward": 0.6250000279396772, |
| "step": 482 |
| }, |
| { |
| "completion_length": 1887.2500610351562, |
| "epoch": 0.552, |
| "grad_norm": 3.9196157455444336, |
| "kl": 1.394287109375, |
| "learning_rate": 1.0316552135205837e-07, |
| "loss": 0.0558, |
| "reward": 0.5968103185296059, |
| "reward_std": 0.8982166573405266, |
| "rewards/cosine_scaled_reward": -0.08701153006404638, |
| "rewards/format_reward": 0.770833358168602, |
| "step": 483 |
| }, |
| { |
| "completion_length": 1416.1458930969238, |
| "epoch": 0.5531428571428572, |
| "grad_norm": 4.831650257110596, |
| "kl": 0.6915283203125, |
| "learning_rate": 1.0280443637773163e-07, |
| "loss": 0.0276, |
| "reward": 0.6882726345211267, |
| "reward_std": 0.781242698431015, |
| "rewards/cosine_scaled_reward": -0.06211370480014011, |
| "rewards/format_reward": 0.8125000223517418, |
| "step": 484 |
| }, |
| { |
| "completion_length": 1175.083351135254, |
| "epoch": 0.5542857142857143, |
| "grad_norm": 2.7525644302368164, |
| "kl": 0.708404541015625, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0283, |
| "reward": 0.5069214683026075, |
| "reward_std": 0.4130475576967001, |
| "rewards/cosine_scaled_reward": -0.16320594353601336, |
| "rewards/format_reward": 0.8333333414047956, |
| "step": 485 |
| }, |
| { |
| "completion_length": 945.3750343322754, |
| "epoch": 0.5554285714285714, |
| "grad_norm": 1.641010046005249, |
| "kl": 0.423370361328125, |
| "learning_rate": 1.0214767000817596e-07, |
| "loss": 0.0169, |
| "reward": 0.7414621282368898, |
| "reward_std": 0.5828515980392694, |
| "rewards/cosine_scaled_reward": -0.0771856140345335, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 486 |
| }, |
| { |
| "completion_length": 1054.7708625793457, |
| "epoch": 0.5565714285714286, |
| "grad_norm": 2.9195919036865234, |
| "kl": 0.4309539794921875, |
| "learning_rate": 1.0185202062281336e-07, |
| "loss": 0.0173, |
| "reward": 1.2239743052050471, |
| "reward_std": 0.5242457445710897, |
| "rewards/cosine_scaled_reward": 0.14323717169463634, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 487 |
| }, |
| { |
| "completion_length": 1278.1667022705078, |
| "epoch": 0.5577142857142857, |
| "grad_norm": 2.495598316192627, |
| "kl": 0.9759674072265625, |
| "learning_rate": 1.0157821333772304e-07, |
| "loss": 0.039, |
| "reward": 0.6595983803272247, |
| "reward_std": 0.5137137211859226, |
| "rewards/cosine_scaled_reward": -0.08686749078333378, |
| "rewards/format_reward": 0.8333333544433117, |
| "step": 488 |
| }, |
| { |
| "completion_length": 1564.6250381469727, |
| "epoch": 0.5588571428571428, |
| "grad_norm": 3.082076072692871, |
| "kl": 1.235137939453125, |
| "learning_rate": 1.013262614978859e-07, |
| "loss": 0.0494, |
| "reward": 0.09155605779960752, |
| "reward_std": 0.5432869009673595, |
| "rewards/cosine_scaled_reward": -0.28755532018840313, |
| "rewards/format_reward": 0.6666666828095913, |
| "step": 489 |
| }, |
| { |
| "completion_length": 1278.1250381469727, |
| "epoch": 0.56, |
| "grad_norm": 1.8155015707015991, |
| "kl": 0.6814727783203125, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0273, |
| "reward": 0.6778741236776114, |
| "reward_std": 0.7777648419141769, |
| "rewards/cosine_scaled_reward": -0.08814628981053829, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 490 |
| }, |
| { |
| "completion_length": 1448.9375381469727, |
| "epoch": 0.5611428571428572, |
| "grad_norm": 2.183570623397827, |
| "kl": 0.358306884765625, |
| "learning_rate": 1.0088797220727779e-07, |
| "loss": 0.0143, |
| "reward": 1.1492541544139385, |
| "reward_std": 1.02406694740057, |
| "rewards/cosine_scaled_reward": 0.12671040603891015, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 491 |
| }, |
| { |
| "completion_length": 1325.7708740234375, |
| "epoch": 0.5622857142857143, |
| "grad_norm": 3.939805030822754, |
| "kl": 0.6011505126953125, |
| "learning_rate": 1.0070165611810855e-07, |
| "loss": 0.024, |
| "reward": 0.8053325146902353, |
| "reward_std": 0.5690103769302368, |
| "rewards/cosine_scaled_reward": -0.045250434428453445, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 492 |
| }, |
| { |
| "completion_length": 1136.2291946411133, |
| "epoch": 0.5634285714285714, |
| "grad_norm": 3.2057812213897705, |
| "kl": 0.6187744140625, |
| "learning_rate": 1.005372381963547e-07, |
| "loss": 0.0248, |
| "reward": 0.8366872314363718, |
| "reward_std": 0.8825880065560341, |
| "rewards/cosine_scaled_reward": -0.019156392896547914, |
| "rewards/format_reward": 0.8750000074505806, |
| "step": 493 |
| }, |
| { |
| "completion_length": 1201.3125305175781, |
| "epoch": 0.5645714285714286, |
| "grad_norm": 3.475240707397461, |
| "kl": 0.5660400390625, |
| "learning_rate": 1.0039472645551372e-07, |
| "loss": 0.0226, |
| "reward": 0.8446337506175041, |
| "reward_std": 0.9212733060121536, |
| "rewards/cosine_scaled_reward": -0.025599811924621463, |
| "rewards/format_reward": 0.8958333432674408, |
| "step": 494 |
| }, |
| { |
| "completion_length": 1859.8333892822266, |
| "epoch": 0.5657142857142857, |
| "grad_norm": 5.380873680114746, |
| "kl": 1.025390625, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0411, |
| "reward": 0.7559722196310759, |
| "reward_std": 1.0365862026810646, |
| "rewards/cosine_scaled_reward": -0.007430561818182468, |
| "rewards/format_reward": 0.7708333656191826, |
| "step": 495 |
| }, |
| { |
| "completion_length": 1412.4375610351562, |
| "epoch": 0.5668571428571428, |
| "grad_norm": 3.017749547958374, |
| "kl": 1.0056610107421875, |
| "learning_rate": 1.0017544823184055e-07, |
| "loss": 0.0403, |
| "reward": 0.8477246034890413, |
| "reward_std": 0.7844705618917942, |
| "rewards/cosine_scaled_reward": 0.038445642217993736, |
| "rewards/format_reward": 0.7708333488553762, |
| "step": 496 |
| }, |
| { |
| "completion_length": 1259.4583702087402, |
| "epoch": 0.568, |
| "grad_norm": 3.237330436706543, |
| "kl": 0.850830078125, |
| "learning_rate": 1.0009869243631952e-07, |
| "loss": 0.034, |
| "reward": 1.0958917308598757, |
| "reward_std": 0.5949588976800442, |
| "rewards/cosine_scaled_reward": 0.12086252495646477, |
| "rewards/format_reward": 0.854166679084301, |
| "step": 497 |
| }, |
| { |
| "completion_length": 1587.083351135254, |
| "epoch": 0.5691428571428572, |
| "grad_norm": 4.424108028411865, |
| "kl": 1.174072265625, |
| "learning_rate": 1.000438641958131e-07, |
| "loss": 0.047, |
| "reward": 0.5771038420498371, |
| "reward_std": 0.7162478044629097, |
| "rewards/cosine_scaled_reward": -0.08644808363169432, |
| "rewards/format_reward": 0.7500000149011612, |
| "step": 498 |
| }, |
| { |
| "completion_length": 1515.1667022705078, |
| "epoch": 0.5702857142857143, |
| "grad_norm": 1.4978034496307373, |
| "kl": 0.6841812133789062, |
| "learning_rate": 1.0001096618257236e-07, |
| "loss": 0.0274, |
| "reward": 0.8714520921930671, |
| "reward_std": 0.7796698864549398, |
| "rewards/cosine_scaled_reward": -0.012190633453428745, |
| "rewards/format_reward": 0.8958333507180214, |
| "step": 499 |
| }, |
| { |
| "completion_length": 1229.7083740234375, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 2.3225278854370117, |
| "kl": 0.565399169921875, |
| "learning_rate": 1e-07, |
| "loss": 0.0227, |
| "reward": 0.6700912415981293, |
| "reward_std": 0.7157665528357029, |
| "rewards/cosine_scaled_reward": -0.1337043906096369, |
| "rewards/format_reward": 0.9375000149011612, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": 0.009290046402068698, |
| "train_runtime": 55313.9986, |
| "train_samples_per_second": 0.434, |
| "train_steps_per_second": 0.009 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|