Instructions to use kangdawei/MMR-Adaptive-Smooth-GRPO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use kangdawei/MMR-Adaptive-Smooth-GRPO with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("kangdawei/MMR-Adaptive-Smooth-GRPO", dtype="auto") - Notebooks
- Google Colab
- Kaggle
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2571.2083587646484, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.19696776568889618, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0, | |
| "reward": 0.4897647276520729, | |
| "reward_std": 0.8290339708328247, | |
| "rewards/cosine_scaled_reward": -0.015534311532974243, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2804.395881652832, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.1806372106075287, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": 0.27539755776524544, | |
| "reward_std": 0.42092563211917877, | |
| "rewards/cosine_scaled_reward": -0.04980122856795788, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3339.625015258789, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.1699189692735672, | |
| "kl": 4.197657108306885e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": -0.24649023730307817, | |
| "reward_std": 0.7038179924711585, | |
| "rewards/cosine_scaled_reward": -0.18574512389022857, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2276.2708892822266, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.2699170410633087, | |
| "kl": 3.144703805446625e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.37421327200718224, | |
| "reward_std": 0.6797358561307192, | |
| "rewards/cosine_scaled_reward": -0.09414338041096926, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3310.1041870117188, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.17548619210720062, | |
| "kl": 4.331022500991821e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": -0.07299477732158266, | |
| "reward_std": 0.7602944187819958, | |
| "rewards/cosine_scaled_reward": -0.19274738454259932, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3136.104217529297, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.20332929491996765, | |
| "kl": 4.7653913497924805e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.02561904746107757, | |
| "reward_std": 1.0297068133950233, | |
| "rewards/cosine_scaled_reward": -0.13780953222885728, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3268.2500915527344, | |
| "epoch": 0.008, | |
| "grad_norm": 0.14488613605499268, | |
| "kl": 2.9772520065307617e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.23762857168912888, | |
| "reward_std": 1.0299683846533298, | |
| "rewards/cosine_scaled_reward": -0.09993573231622577, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2672.9791870117188, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.16940173506736755, | |
| "kl": 2.1375715732574463e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.6125958878546953, | |
| "reward_std": 0.5378385670483112, | |
| "rewards/cosine_scaled_reward": 0.07713126111775637, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3104.3334045410156, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.2231517881155014, | |
| "kl": 3.183633089065552e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.09738215431571007, | |
| "reward_std": 0.6578879225999117, | |
| "rewards/cosine_scaled_reward": -0.128392253711354, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2790.8541717529297, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.19208469986915588, | |
| "kl": 3.1501054763793945e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.008928850293159485, | |
| "reward_std": 0.5447255074977875, | |
| "rewards/cosine_scaled_reward": -0.18303558183833957, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3341.437530517578, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.19327814877033234, | |
| "kl": 2.907589077949524e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": -0.39456131402403116, | |
| "reward_std": 0.5775978416204453, | |
| "rewards/cosine_scaled_reward": -0.24936398956924677, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2766.7083740234375, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.23883244395256042, | |
| "kl": 3.895256668329239e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.3529932126402855, | |
| "reward_std": 0.861735014244914, | |
| "rewards/cosine_scaled_reward": -0.08392009884119034, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2890.750030517578, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.20623275637626648, | |
| "kl": 2.8409063816070557e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.39041033387184143, | |
| "reward_std": 0.7538400888442993, | |
| "rewards/cosine_scaled_reward": -0.023544855881482363, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2856.187530517578, | |
| "epoch": 0.016, | |
| "grad_norm": 0.16158564388751984, | |
| "kl": 2.251937985420227e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.11729078739881516, | |
| "reward_std": 0.7728225328028202, | |
| "rewards/cosine_scaled_reward": -0.13927128538489342, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2816.333354949951, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.1757153868675232, | |
| "kl": 3.409385681152344e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": 0.5039311908185482, | |
| "reward_std": 0.6081470809876919, | |
| "rewards/cosine_scaled_reward": 0.03321555629372597, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3518.9166870117188, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.16117794811725616, | |
| "kl": 4.215538501739502e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.3141332839149982, | |
| "reward_std": 0.5307199694216251, | |
| "rewards/cosine_scaled_reward": -0.1987333269789815, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2329.0208587646484, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.2838650941848755, | |
| "kl": 4.272162914276123e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.39212552830576897, | |
| "reward_std": 0.791872002184391, | |
| "rewards/cosine_scaled_reward": -0.07477057632058859, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2841.3125534057617, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.15529987215995789, | |
| "kl": 2.863258123397827e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.2834795080125332, | |
| "reward_std": 0.7086473144590855, | |
| "rewards/cosine_scaled_reward": -0.10826026648283005, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3026.9583740234375, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.16150601208209991, | |
| "kl": 2.719089388847351e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.6184139084070921, | |
| "reward_std": 0.9973310008645058, | |
| "rewards/cosine_scaled_reward": 0.11129027768038213, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2519.5000228881836, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.22576642036437988, | |
| "kl": 2.012774348258972e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.6895501036196947, | |
| "reward_std": 0.7981752147898078, | |
| "rewards/cosine_scaled_reward": 0.021858368068933487, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2594.7916870117188, | |
| "epoch": 0.024, | |
| "grad_norm": 0.23808935284614563, | |
| "kl": 3.5569071769714355e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.33955152705311775, | |
| "reward_std": 0.5852350238710642, | |
| "rewards/cosine_scaled_reward": -0.048974241130054, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1815.8125457763672, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.297273725271225, | |
| "kl": 3.524869680404663e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.9210781529545784, | |
| "reward_std": 0.6531081832945347, | |
| "rewards/cosine_scaled_reward": 0.03345571830868721, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2265.041702270508, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.2135019600391388, | |
| "kl": 2.358853816986084e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.46448952704668045, | |
| "reward_std": 0.6991389617323875, | |
| "rewards/cosine_scaled_reward": -0.06983858160674572, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2737.4584197998047, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.20275644958019257, | |
| "kl": 2.9724091291427612e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5923267342150211, | |
| "reward_std": 0.9929416831582785, | |
| "rewards/cosine_scaled_reward": 0.025330022268462926, | |
| "rewards/format_reward": 0.5416666809469461, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2758.7083740234375, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.2127925157546997, | |
| "kl": 3.1773000955581665e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.32758328318595886, | |
| "reward_std": 0.7631605602800846, | |
| "rewards/cosine_scaled_reward": -0.06537504983134568, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3101.4166870117188, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.1526872217655182, | |
| "kl": 3.3758580684661865e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.2650221809744835, | |
| "reward_std": 0.6115698590874672, | |
| "rewards/cosine_scaled_reward": -0.07582224532961845, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2927.1458740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.22624598443508148, | |
| "kl": 4.612654447555542e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.18323302548378706, | |
| "reward_std": 0.7548771295696497, | |
| "rewards/cosine_scaled_reward": -0.09588348306715488, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2783.541679382324, | |
| "epoch": 0.032, | |
| "grad_norm": 0.19066381454467773, | |
| "kl": 3.3406540751457214e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.3194316625595093, | |
| "reward_std": 0.5980064831674099, | |
| "rewards/cosine_scaled_reward": -0.048617489635944366, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3331.8125610351562, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.24491053819656372, | |
| "kl": 2.5831162929534912e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.3617453798651695, | |
| "reward_std": 0.5833318009972572, | |
| "rewards/cosine_scaled_reward": -0.25378935784101486, | |
| "rewards/format_reward": 0.14583333767950535, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2860.062545776367, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.17712661623954773, | |
| "kl": 1.9449740648269653e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.5695777088403702, | |
| "reward_std": 0.9141397215425968, | |
| "rewards/cosine_scaled_reward": 0.03478884696960449, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2967.625045776367, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.19428333640098572, | |
| "kl": 3.3482909202575684e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.05395581666380167, | |
| "reward_std": 0.726109255105257, | |
| "rewards/cosine_scaled_reward": -0.12927209632471204, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3092.8750610351562, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.18985038995742798, | |
| "kl": 2.6285648345947266e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.18355572840664536, | |
| "reward_std": 0.8745833523571491, | |
| "rewards/cosine_scaled_reward": -0.0853054765611887, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3396.3750610351562, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.1381026655435562, | |
| "kl": 3.3989548683166504e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.2385600535199046, | |
| "reward_std": 0.840669609606266, | |
| "rewards/cosine_scaled_reward": -0.03696998115628958, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2426.937545776367, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.3463960886001587, | |
| "kl": 2.298876643180847e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.8328317422419786, | |
| "reward_std": 0.8242906630039215, | |
| "rewards/cosine_scaled_reward": 0.12474916968494654, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3079.8750534057617, | |
| "epoch": 0.04, | |
| "grad_norm": 0.22398288547992706, | |
| "kl": 4.157423973083496e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": -0.05081530287861824, | |
| "reward_std": 0.9191469214856625, | |
| "rewards/cosine_scaled_reward": -0.16082432121038437, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3372.6458740234375, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.17480096220970154, | |
| "kl": 3.078579902648926e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.3681298622395843, | |
| "reward_std": 0.6958093121647835, | |
| "rewards/cosine_scaled_reward": -0.2673982698470354, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3249.5208435058594, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.16790439188480377, | |
| "kl": 2.954155206680298e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.22375414264388382, | |
| "reward_std": 0.5829485245049, | |
| "rewards/cosine_scaled_reward": -0.22646040935069323, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3267.0208435058594, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.15718603134155273, | |
| "kl": 2.9595568776130676e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.2248917780816555, | |
| "reward_std": 0.3576042940840125, | |
| "rewards/cosine_scaled_reward": -0.17494588904082775, | |
| "rewards/format_reward": 0.125, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2847.5208587646484, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.24797889590263367, | |
| "kl": 1.7508864402770996e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.3429147396236658, | |
| "reward_std": 0.48750871582888067, | |
| "rewards/cosine_scaled_reward": -0.03687598556280136, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2490.791702270508, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.18541350960731506, | |
| "kl": 1.5174038708209991e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.3607976003549993, | |
| "reward_std": 0.40943362936377525, | |
| "rewards/cosine_scaled_reward": -0.11126788146793842, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3017.6250610351562, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.1669527143239975, | |
| "kl": 2.2130087018013e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.09058963833376765, | |
| "reward_std": 0.6663156133145094, | |
| "rewards/cosine_scaled_reward": -0.22237816639244556, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2780.5208587646484, | |
| "epoch": 0.048, | |
| "grad_norm": 0.2479742020368576, | |
| "kl": 5.4270029067993164e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.20546918595209718, | |
| "reward_std": 0.47696714848279953, | |
| "rewards/cosine_scaled_reward": -0.26940126344561577, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 2762.8541870117188, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.1957981437444687, | |
| "kl": 2.5795772671699524e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.1344442442059517, | |
| "reward_std": 0.6109267733991146, | |
| "rewards/cosine_scaled_reward": -0.1098612155765295, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2733.7708740234375, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.3306754231452942, | |
| "kl": 5.932897329330444e-05, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.36940332502126694, | |
| "reward_std": 0.7888948805630207, | |
| "rewards/cosine_scaled_reward": -0.044465010054409504, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3472.312530517578, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.14603589475154877, | |
| "kl": 3.156810998916626e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": -0.042608221992850304, | |
| "reward_std": 0.5772924721240997, | |
| "rewards/cosine_scaled_reward": -0.10463745961897075, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3173.9791870117188, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.21064509451389313, | |
| "kl": 3.273040056228638e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.29001128301024437, | |
| "reward_std": 0.4347268417477608, | |
| "rewards/cosine_scaled_reward": -0.22833896055817604, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2814.500030517578, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.2205515205860138, | |
| "kl": 1.9498169422149658e-05, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.5555428601801395, | |
| "reward_std": 0.9152592048048973, | |
| "rewards/cosine_scaled_reward": 0.027771430788561702, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2843.6250610351562, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.1848597675561905, | |
| "kl": 8.176267147064209e-05, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.15840810351073742, | |
| "reward_std": 0.8359496407210827, | |
| "rewards/cosine_scaled_reward": -0.09787929493177217, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2283.8541946411133, | |
| "epoch": 0.056, | |
| "grad_norm": 0.20812320709228516, | |
| "kl": 3.5099685192108154e-05, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5328511632978916, | |
| "reward_std": 0.8218232821673155, | |
| "rewards/cosine_scaled_reward": -0.025241072289645672, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2871.3125534057617, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.17633675038814545, | |
| "kl": 5.5462121963500977e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.425536647439003, | |
| "reward_std": 0.6618794202804565, | |
| "rewards/cosine_scaled_reward": 0.025268293917179108, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2325.1458740234375, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.24762925505638123, | |
| "kl": 0.00016194581985473633, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.2191239856183529, | |
| "reward_std": 0.5495061241090298, | |
| "rewards/cosine_scaled_reward": -0.16127135697752237, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2865.750045776367, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.2184467613697052, | |
| "kl": 8.131936192512512e-05, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.49965737760066986, | |
| "reward_std": 1.128704745322466, | |
| "rewards/cosine_scaled_reward": 0.031078664120286703, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2812.062530517578, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.23506155610084534, | |
| "kl": 0.0001082010567188263, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 0.17906612996011972, | |
| "reward_std": 0.767060749232769, | |
| "rewards/cosine_scaled_reward": -0.13963361305650324, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2882.104232788086, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.20438477396965027, | |
| "kl": 4.882924258708954e-05, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.8570227089803666, | |
| "reward_std": 1.0325051695108414, | |
| "rewards/cosine_scaled_reward": 0.1576780043542385, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2910.7291870117188, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.19835922122001648, | |
| "kl": 6.431713700294495e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.25683262944221497, | |
| "reward_std": 0.8695680163800716, | |
| "rewards/cosine_scaled_reward": -0.04866702202707529, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2741.3750534057617, | |
| "epoch": 0.064, | |
| "grad_norm": 0.1814289689064026, | |
| "kl": 3.975629806518555e-05, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.47193842101842165, | |
| "reward_std": 0.5631829425692558, | |
| "rewards/cosine_scaled_reward": -0.003614123910665512, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3241.8334045410156, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.1280871331691742, | |
| "kl": 2.622697502374649e-05, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": 0.1633035959675908, | |
| "reward_std": 0.93094053119421, | |
| "rewards/cosine_scaled_reward": -0.09543154633138329, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2383.1667251586914, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.2606426179409027, | |
| "kl": 0.000448569655418396, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.46104544680565596, | |
| "reward_std": 0.8501972369849682, | |
| "rewards/cosine_scaled_reward": -0.07156064454466105, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2776.9167098999023, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.1767275035381317, | |
| "kl": 4.192814230918884e-05, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.1411968027241528, | |
| "reward_std": 0.6273871827870607, | |
| "rewards/cosine_scaled_reward": -0.08565161540172994, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3010.2708435058594, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.16747058928012848, | |
| "kl": 4.213489592075348e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": -0.10470366384834051, | |
| "reward_std": 0.5668917782604694, | |
| "rewards/cosine_scaled_reward": -0.20860183122567832, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2930.125045776367, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.18895801901817322, | |
| "kl": 0.00015696324408054352, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.31786643620580435, | |
| "reward_std": 0.8822249062359333, | |
| "rewards/cosine_scaled_reward": -0.08065013960003853, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2585.104232788086, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.19783088564872742, | |
| "kl": 0.00045157596468925476, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.5959182996302843, | |
| "reward_std": 0.8203712180256844, | |
| "rewards/cosine_scaled_reward": -0.014540859963744879, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2321.2708587646484, | |
| "epoch": 0.072, | |
| "grad_norm": 0.19266927242279053, | |
| "kl": 0.00032773613929748535, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0, | |
| "reward": 0.9606147482991219, | |
| "reward_std": 0.7731334287673235, | |
| "rewards/cosine_scaled_reward": 0.13655737973749638, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2989.8958587646484, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.21102704107761383, | |
| "kl": 0.0001429772237315774, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.30724555626511574, | |
| "reward_std": 0.8989091999828815, | |
| "rewards/cosine_scaled_reward": -0.054710563155822456, | |
| "rewards/format_reward": 0.41666668467223644, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2789.541690826416, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.2767220735549927, | |
| "kl": 0.0001298440620303154, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.10150328651070595, | |
| "reward_std": 0.5617774492129683, | |
| "rewards/cosine_scaled_reward": -0.14716504141688347, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2099.3333435058594, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.24364213645458221, | |
| "kl": 0.00016443058848381042, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.5610158704221249, | |
| "reward_std": 0.46347122080624104, | |
| "rewards/cosine_scaled_reward": 0.03050791099667549, | |
| "rewards/format_reward": 0.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3478.562530517578, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.14123453199863434, | |
| "kl": 0.00021456927061080933, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": -0.4186356011778116, | |
| "reward_std": 0.4762955382466316, | |
| "rewards/cosine_scaled_reward": -0.26140114292502403, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1994.0000534057617, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.24095165729522705, | |
| "kl": 0.0009656250476837158, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0, | |
| "reward": 0.5732477158308029, | |
| "reward_std": 0.779868096113205, | |
| "rewards/cosine_scaled_reward": -0.04670950397849083, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2451.6875228881836, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.2385924905538559, | |
| "kl": 0.0006657838821411133, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": 0.06796575582120568, | |
| "reward_std": 0.5406197272241116, | |
| "rewards/cosine_scaled_reward": -0.19518379587680101, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3113.541702270508, | |
| "epoch": 0.08, | |
| "grad_norm": 0.1962832510471344, | |
| "kl": 0.0009769648313522339, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.0411946764215827, | |
| "reward_std": 0.5727978125214577, | |
| "rewards/cosine_scaled_reward": -0.1773193427361548, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2725.270851135254, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.18360581994056702, | |
| "kl": 0.0005005262792110443, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.38103893026709557, | |
| "reward_std": 0.6639891043305397, | |
| "rewards/cosine_scaled_reward": 0.003019465133547783, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2805.8333587646484, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.20988361537456512, | |
| "kl": 0.0007044821977615356, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": 0.25359107134863734, | |
| "reward_std": 0.811398807913065, | |
| "rewards/cosine_scaled_reward": -0.10237114643678069, | |
| "rewards/format_reward": 0.45833335258066654, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3307.854217529297, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.1682441234588623, | |
| "kl": 0.0001322627067565918, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": 0.15133656188845634, | |
| "reward_std": 0.8552793562412262, | |
| "rewards/cosine_scaled_reward": -0.04933173581957817, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3133.104248046875, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.15636852383613586, | |
| "kl": 0.00043823570013046265, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.3009439923334867, | |
| "reward_std": 0.9593954384326935, | |
| "rewards/cosine_scaled_reward": -0.005778005812317133, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2883.166717529297, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.15218932926654816, | |
| "kl": 0.0008656233549118042, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": 0.32558897137641907, | |
| "reward_std": 0.5437379367649555, | |
| "rewards/cosine_scaled_reward": -0.03512220270931721, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2863.3125228881836, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.15872140228748322, | |
| "kl": 9.638071060180664e-05, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": 0.18390853703022003, | |
| "reward_std": 0.5108423344790936, | |
| "rewards/cosine_scaled_reward": -0.13721241243183613, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3153.354217529297, | |
| "epoch": 0.088, | |
| "grad_norm": 0.15251125395298004, | |
| "kl": 0.00015661679208278656, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": 0.06627489440143108, | |
| "reward_std": 0.4633176252245903, | |
| "rewards/cosine_scaled_reward": -0.14394589699804783, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3143.916702270508, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.16262713074684143, | |
| "kl": 0.0005014901980757713, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.38807435147464275, | |
| "reward_std": 0.8590060994029045, | |
| "rewards/cosine_scaled_reward": 0.016953811049461365, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2234.041675567627, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.22237282991409302, | |
| "kl": 0.0009625256061553955, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": 0.4828355088829994, | |
| "reward_std": 0.6572117768228054, | |
| "rewards/cosine_scaled_reward": -0.06066559627652168, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3146.1250610351562, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.16743233799934387, | |
| "kl": 0.00040898716542869806, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": 0.3415686395019293, | |
| "reward_std": 0.9433976151049137, | |
| "rewards/cosine_scaled_reward": -0.016715684439986944, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 2960.333366394043, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.31833556294441223, | |
| "kl": 0.0011633634567260742, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0, | |
| "reward": 0.0825501810759306, | |
| "reward_std": 0.7551630176603794, | |
| "rewards/cosine_scaled_reward": -0.1253915773704648, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2789.666679382324, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.1964961588382721, | |
| "kl": 0.0009930580854415894, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.32206014543771744, | |
| "reward_std": 0.8189963661134243, | |
| "rewards/cosine_scaled_reward": -0.05771994253154844, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2697.8541870117188, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.23213206231594086, | |
| "kl": 0.0010943468660116196, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": 0.06216884404420853, | |
| "reward_std": 0.5933049730956554, | |
| "rewards/cosine_scaled_reward": -0.14599892310798168, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3034.8958587646484, | |
| "epoch": 0.096, | |
| "grad_norm": 0.16874635219573975, | |
| "kl": 0.0005750656127929688, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.3496646843268536, | |
| "reward_std": 0.9294291753321886, | |
| "rewards/cosine_scaled_reward": -0.02308431826531887, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3031.625045776367, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.14502210915088654, | |
| "kl": 0.0002237856388092041, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.11518191546201706, | |
| "reward_std": 0.844723105430603, | |
| "rewards/cosine_scaled_reward": -0.15074237808585167, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2953.500045776367, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.17785117030143738, | |
| "kl": 0.0008893311023712158, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.19509585201740265, | |
| "reward_std": 0.6929262951016426, | |
| "rewards/cosine_scaled_reward": -0.08995212335139513, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2735.4375762939453, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.23141460120677948, | |
| "kl": 0.0007982850074768066, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0, | |
| "reward": 0.3154503256082535, | |
| "reward_std": 0.7814465276896954, | |
| "rewards/cosine_scaled_reward": -0.10269152000546455, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2611.250068664551, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.21885579824447632, | |
| "kl": 0.004000961780548096, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0002, | |
| "reward": 0.700496843084693, | |
| "reward_std": 0.977291576564312, | |
| "rewards/cosine_scaled_reward": 0.04816507982468465, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3090.6250610351562, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.18484243750572205, | |
| "kl": 0.0012155771255493164, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0, | |
| "reward": 0.1298842504620552, | |
| "reward_std": 0.7731972448527813, | |
| "rewards/cosine_scaled_reward": -0.1121412068605423, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2396.8125228881836, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.26242735981941223, | |
| "kl": 0.0012450218200683594, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "reward": -0.022300932556390762, | |
| "reward_std": 0.4295784495770931, | |
| "rewards/cosine_scaled_reward": -0.2715671341866255, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3109.687530517578, | |
| "epoch": 0.104, | |
| "grad_norm": 0.15659227967262268, | |
| "kl": 0.0008578300476074219, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0, | |
| "reward": 0.3678978532552719, | |
| "reward_std": 0.8310167863965034, | |
| "rewards/cosine_scaled_reward": -0.024384415162785444, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2683.479202270508, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.2791576683521271, | |
| "kl": 0.0041623711585998535, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0002, | |
| "reward": 0.17558906483463943, | |
| "reward_std": 0.667485423386097, | |
| "rewards/cosine_scaled_reward": -0.1622054846957326, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3554.875, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.17273500561714172, | |
| "kl": 0.001106351613998413, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0, | |
| "reward": -0.46894520218484104, | |
| "reward_std": 0.36610983312129974, | |
| "rewards/cosine_scaled_reward": -0.2553059346973896, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 2966.3958435058594, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.19159966707229614, | |
| "kl": 0.0015701055526733398, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0001, | |
| "reward": -0.010549061640631407, | |
| "reward_std": 0.39640795812010765, | |
| "rewards/cosine_scaled_reward": -0.16152450628578663, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3374.4375610351562, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.15439394116401672, | |
| "kl": 0.00039564818143844604, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": -0.28152387839509174, | |
| "reward_std": 0.6863285079598427, | |
| "rewards/cosine_scaled_reward": -0.23451193794608116, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2657.4791870117188, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.17443199455738068, | |
| "kl": 0.00177721306681633, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09489981457591057, | |
| "reward_std": 0.5061229234561324, | |
| "rewards/cosine_scaled_reward": -0.14005008898675442, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3091.729248046875, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.18888260424137115, | |
| "kl": 0.0008979141712188721, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.05433476809412241, | |
| "reward_std": 0.6424587089568377, | |
| "rewards/cosine_scaled_reward": -0.14991594851016998, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2351.4375381469727, | |
| "epoch": 0.112, | |
| "grad_norm": 0.2002606987953186, | |
| "kl": 0.0006032586097717285, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.4714247789233923, | |
| "reward_std": 0.7434845846146345, | |
| "rewards/cosine_scaled_reward": -0.04553762264549732, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2837.7291717529297, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.21085548400878906, | |
| "kl": 0.000676274299621582, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0, | |
| "reward": 0.2813050076365471, | |
| "reward_std": 0.6411089226603508, | |
| "rewards/cosine_scaled_reward": -0.03643083991482854, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2754.875045776367, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.17745624482631683, | |
| "kl": 0.0011057853698730469, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "reward": 0.6976646836847067, | |
| "reward_std": 0.9069891199469566, | |
| "rewards/cosine_scaled_reward": 0.11966563505120575, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2905.625045776367, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.22573921084403992, | |
| "kl": 0.0009785890579223633, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0, | |
| "reward": 0.06032659858465195, | |
| "reward_std": 0.503840334713459, | |
| "rewards/cosine_scaled_reward": -0.13650337606668472, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2533.3959045410156, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.21088238060474396, | |
| "kl": 0.002618730068206787, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5906962184235454, | |
| "reward_std": 1.0304273441433907, | |
| "rewards/cosine_scaled_reward": -0.017151910811662674, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2815.187530517578, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.23541951179504395, | |
| "kl": 0.0009691715240478516, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0, | |
| "reward": 0.09393319487571716, | |
| "reward_std": 0.8980946093797684, | |
| "rewards/cosine_scaled_reward": -0.16136674140579998, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2650.6250228881836, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.3019506335258484, | |
| "kl": 0.007655918598175049, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0003, | |
| "reward": 0.03867958951741457, | |
| "reward_std": 0.49825899116694927, | |
| "rewards/cosine_scaled_reward": -0.16816022247076035, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2774.0833892822266, | |
| "epoch": 0.12, | |
| "grad_norm": 0.18828085064888, | |
| "kl": 0.0009101927280426025, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0, | |
| "reward": 0.413003945723176, | |
| "reward_std": 0.9300272315740585, | |
| "rewards/cosine_scaled_reward": 0.008585309609770775, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2155.4584045410156, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.1837824583053589, | |
| "kl": 0.002248704433441162, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0001, | |
| "reward": 0.945452444255352, | |
| "reward_std": 0.6177145391702652, | |
| "rewards/cosine_scaled_reward": 0.0977262444794178, | |
| "rewards/format_reward": 0.7500000037252903, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2785.7708740234375, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.20877282321453094, | |
| "kl": 0.001208662986755371, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.41627691127359867, | |
| "reward_std": 0.7068986408412457, | |
| "rewards/cosine_scaled_reward": -0.05227821506559849, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2584.791702270508, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.2157466560602188, | |
| "kl": 0.0009077191352844238, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0, | |
| "reward": 0.5254465593025088, | |
| "reward_std": 0.6625101678073406, | |
| "rewards/cosine_scaled_reward": 0.0023066122084856033, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3017.6041870117188, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.16122227907180786, | |
| "kl": 0.000370025634765625, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 0.18975364477955736, | |
| "reward_std": 0.49849717505276203, | |
| "rewards/cosine_scaled_reward": -0.061373173259198666, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2612.8542404174805, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.21259663999080658, | |
| "kl": 0.000641578808426857, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0, | |
| "reward": 0.32054631412029266, | |
| "reward_std": 0.7353306971490383, | |
| "rewards/cosine_scaled_reward": -0.10014353273436427, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3100.354217529297, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.18133312463760376, | |
| "kl": 0.0013672113418579102, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07442984823137522, | |
| "reward_std": 0.9228987656533718, | |
| "rewards/cosine_scaled_reward": -0.10861842148005962, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3039.8959045410156, | |
| "epoch": 0.128, | |
| "grad_norm": 0.15633469820022583, | |
| "kl": 0.0005895942449569702, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.5644901357591152, | |
| "reward_std": 0.8792787380516529, | |
| "rewards/cosine_scaled_reward": 0.021828406490385532, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2473.229202270508, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.2744481861591339, | |
| "kl": 0.0017359256744384766, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0001, | |
| "reward": 0.17924559116363525, | |
| "reward_std": 0.607318002730608, | |
| "rewards/cosine_scaled_reward": -0.1603772146627307, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2619.4167289733887, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.2416525036096573, | |
| "kl": 0.0021146535873413086, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3448843713849783, | |
| "reward_std": 0.5230822302401066, | |
| "rewards/cosine_scaled_reward": -0.12964116781949997, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2841.2916870117188, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.20283274352550507, | |
| "kl": 0.0020351409912109375, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0001, | |
| "reward": 0.21423707529902458, | |
| "reward_std": 0.7165789231657982, | |
| "rewards/cosine_scaled_reward": -0.08038146048784256, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3370.125030517578, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.15726514160633087, | |
| "kl": 0.001503288745880127, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0001, | |
| "reward": 0.014336531981825829, | |
| "reward_std": 0.5990661717951298, | |
| "rewards/cosine_scaled_reward": -0.07616507587954402, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3045.937530517578, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.1993873417377472, | |
| "kl": 0.0015254020690917969, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0001, | |
| "reward": -0.07220430299639702, | |
| "reward_std": 0.667652253061533, | |
| "rewards/cosine_scaled_reward": -0.20276882825419307, | |
| "rewards/format_reward": 0.3333333469927311, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2959.562530517578, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.18038718402385712, | |
| "kl": 0.0012336969375610352, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0, | |
| "reward": 0.5795598048716784, | |
| "reward_std": 1.1043038107454777, | |
| "rewards/cosine_scaled_reward": 0.07102989172562957, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2178.750030517578, | |
| "epoch": 0.136, | |
| "grad_norm": 0.24167504906654358, | |
| "kl": 0.002427428960800171, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0001, | |
| "reward": 0.812275217846036, | |
| "reward_std": 0.6674736551940441, | |
| "rewards/cosine_scaled_reward": 0.08322094567120075, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2230.562568664551, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.26925644278526306, | |
| "kl": 0.0018974542617797852, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0001, | |
| "reward": 0.43469466688111424, | |
| "reward_std": 0.5474803410470486, | |
| "rewards/cosine_scaled_reward": -0.0951526677235961, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1692.5000534057617, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.28395897150039673, | |
| "kl": 0.005714893341064453, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7201991314068437, | |
| "reward_std": 0.6400695294141769, | |
| "rewards/cosine_scaled_reward": -0.02531713293865323, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2879.75004196167, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.18640156090259552, | |
| "kl": 0.0010684728622436523, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0, | |
| "reward": 0.4576802644878626, | |
| "reward_std": 0.934948768466711, | |
| "rewards/cosine_scaled_reward": 0.020506808534264565, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2707.9375610351562, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.17253972589969635, | |
| "kl": 0.0014038681983947754, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0001, | |
| "reward": 0.35453951358795166, | |
| "reward_std": 0.7643875367939472, | |
| "rewards/cosine_scaled_reward": -0.09356357716023922, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2144.5416946411133, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.2417304664850235, | |
| "kl": 0.0027747154235839844, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0001, | |
| "reward": 0.32306639989838004, | |
| "reward_std": 0.8119018785655499, | |
| "rewards/cosine_scaled_reward": -0.11971682743751444, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2816.479202270508, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.20347152650356293, | |
| "kl": 0.0017933845520019531, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0001, | |
| "reward": 0.15680650994181633, | |
| "reward_std": 0.4326724670827389, | |
| "rewards/cosine_scaled_reward": -0.06743010319769382, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2907.7500534057617, | |
| "epoch": 0.144, | |
| "grad_norm": 0.16710162162780762, | |
| "kl": 0.0011298656463623047, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0, | |
| "reward": 0.24374442547559738, | |
| "reward_std": 0.5884060095995665, | |
| "rewards/cosine_scaled_reward": -0.07604445889592171, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2909.5625, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.1726786196231842, | |
| "kl": 0.0017578601837158203, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0001, | |
| "reward": -0.006642797961831093, | |
| "reward_std": 0.5724836103618145, | |
| "rewards/cosine_scaled_reward": -0.1908214169088751, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2919.9166946411133, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.18813887238502502, | |
| "kl": 0.0023109018802642822, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5732522960752249, | |
| "reward_std": 0.8493749275803566, | |
| "rewards/cosine_scaled_reward": 0.08870946802198887, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3364.8334045410156, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.17843031883239746, | |
| "kl": 0.0023398399353027344, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0001, | |
| "reward": 0.12374773435294628, | |
| "reward_std": 0.6275974959135056, | |
| "rewards/cosine_scaled_reward": -0.07354282308369875, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2752.5416946411133, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.1666869968175888, | |
| "kl": 0.0013623237609863281, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07455501146614552, | |
| "reward_std": 0.8575821630656719, | |
| "rewards/cosine_scaled_reward": -0.15022250125184655, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2932.8958740234375, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.2144927680492401, | |
| "kl": 0.0027085542678833008, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5810204441659153, | |
| "reward_std": 0.7529679946601391, | |
| "rewards/cosine_scaled_reward": 0.09259352087974548, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2482.7916870117188, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.2159230262041092, | |
| "kl": 0.0015616416931152344, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3923495952039957, | |
| "reward_std": 0.6323644928634167, | |
| "rewards/cosine_scaled_reward": -0.02257518842816353, | |
| "rewards/format_reward": 0.4375, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3183.0416870117188, | |
| "epoch": 0.152, | |
| "grad_norm": 0.21430212259292603, | |
| "kl": 0.00218963623046875, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0001, | |
| "reward": -0.07401247788220644, | |
| "reward_std": 0.4380334075540304, | |
| "rewards/cosine_scaled_reward": -0.17242290638387203, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2434.312515258789, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.21282276511192322, | |
| "kl": 0.0028487443923950195, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6068310905247927, | |
| "reward_std": 0.8159074038267136, | |
| "rewards/cosine_scaled_reward": 0.01174885593354702, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2001.7917137145996, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.3072402775287628, | |
| "kl": 0.0029096603393554688, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9695501551032066, | |
| "reward_std": 0.6822597435675561, | |
| "rewards/cosine_scaled_reward": 0.17227506916970015, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2730.0625610351562, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.2851649224758148, | |
| "kl": 0.002216339111328125, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4177038297057152, | |
| "reward_std": 1.0492460913956165, | |
| "rewards/cosine_scaled_reward": -0.020314730005338788, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2912.0000381469727, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.2026607245206833, | |
| "kl": 0.0021758079528808594, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0001, | |
| "reward": -0.09136633202433586, | |
| "reward_std": 0.5394793637096882, | |
| "rewards/cosine_scaled_reward": -0.19151651859283447, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2472.25004196167, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.2178824096918106, | |
| "kl": 0.0017549991607666016, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3587841596454382, | |
| "reward_std": 0.9164101183414459, | |
| "rewards/cosine_scaled_reward": -0.10185793554410338, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 3295.791717529297, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.17741361260414124, | |
| "kl": 0.00331878662109375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0001, | |
| "reward": 0.006225615739822388, | |
| "reward_std": 0.7490637376904488, | |
| "rewards/cosine_scaled_reward": -0.15313720237463713, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2845.375030517578, | |
| "epoch": 0.16, | |
| "grad_norm": 0.27644607424736023, | |
| "kl": 0.005204200744628906, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2643515709787607, | |
| "reward_std": 0.7205736935138702, | |
| "rewards/cosine_scaled_reward": -0.04490754520520568, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2522.1458740234375, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.21788166463375092, | |
| "kl": 0.003261566162109375, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3097852375358343, | |
| "reward_std": 0.7691731601953506, | |
| "rewards/cosine_scaled_reward": -0.14719070680439472, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2841.3958740234375, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.18084634840488434, | |
| "kl": 0.0034284591674804688, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0001, | |
| "reward": 0.39247775822877884, | |
| "reward_std": 0.9289789237082005, | |
| "rewards/cosine_scaled_reward": -0.0745944594964385, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2528.7708740234375, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.31566375494003296, | |
| "kl": 0.004278659820556641, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0002, | |
| "reward": 0.16241500061005354, | |
| "reward_std": 0.6862597297877073, | |
| "rewards/cosine_scaled_reward": -0.15837583474058192, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2870.937545776367, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.20246466994285583, | |
| "kl": 0.002933502197265625, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4816696117632091, | |
| "reward_std": 0.7971302233636379, | |
| "rewards/cosine_scaled_reward": 0.05333479621913284, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2188.9166946411133, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.21244628727436066, | |
| "kl": 0.003267526626586914, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6690180394798517, | |
| "reward_std": 0.6978322137147188, | |
| "rewards/cosine_scaled_reward": 0.042842356488108635, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2133.7917404174805, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.22922900319099426, | |
| "kl": 0.0025424957275390625, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.276001513004303, | |
| "reward_std": 0.6235547289252281, | |
| "rewards/cosine_scaled_reward": -0.1849159342236817, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2469.604202270508, | |
| "epoch": 0.168, | |
| "grad_norm": 0.25116774439811707, | |
| "kl": 0.004103660583496094, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0002, | |
| "reward": 0.588484600186348, | |
| "reward_std": 0.85157061368227, | |
| "rewards/cosine_scaled_reward": 0.03382563544437289, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2214.6042251586914, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.18739475309848785, | |
| "kl": 0.0034880638122558594, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5079192472621799, | |
| "reward_std": 0.7314141802489758, | |
| "rewards/cosine_scaled_reward": -0.06895705359056592, | |
| "rewards/format_reward": 0.6458333376795053, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2570.416763305664, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.22657646238803864, | |
| "kl": 0.004002571105957031, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0002, | |
| "reward": 0.33003329299390316, | |
| "reward_std": 1.0237452127039433, | |
| "rewards/cosine_scaled_reward": -0.0849833432585001, | |
| "rewards/format_reward": 0.5000000167638063, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2632.166732788086, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.24574832618236542, | |
| "kl": 0.0053920745849609375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0002, | |
| "reward": 0.27089521041489206, | |
| "reward_std": 0.8753956761211157, | |
| "rewards/cosine_scaled_reward": -0.08330239914357662, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2311.416702270508, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.21789635717868805, | |
| "kl": 0.0057392120361328125, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6955740721896291, | |
| "reward_std": 0.7741780783981085, | |
| "rewards/cosine_scaled_reward": 0.024870369350537658, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2757.125030517578, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.2523602545261383, | |
| "kl": 0.003414630889892578, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1108561996370554, | |
| "reward_std": 0.6132981330156326, | |
| "rewards/cosine_scaled_reward": -0.13207191228866577, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2993.416702270508, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.20782147347927094, | |
| "kl": 0.006793975830078125, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0003, | |
| "reward": -0.006021241657435894, | |
| "reward_std": 0.6891133151948452, | |
| "rewards/cosine_scaled_reward": -0.19051063433289528, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3374.416717529297, | |
| "epoch": 0.176, | |
| "grad_norm": 0.1451537162065506, | |
| "kl": 0.0027971267700195312, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.28345590829849243, | |
| "reward_std": 0.9266153089702129, | |
| "rewards/cosine_scaled_reward": -0.014522044686600566, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2701.187545776367, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.21468013525009155, | |
| "kl": 0.003292083740234375, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0001, | |
| "reward": 0.32251251488924026, | |
| "reward_std": 0.7347648032009602, | |
| "rewards/cosine_scaled_reward": -0.04707707092165947, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2688.3333892822266, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.19141636788845062, | |
| "kl": 0.003067493438720703, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0001, | |
| "reward": 0.23405547067523003, | |
| "reward_std": 0.8397131189703941, | |
| "rewards/cosine_scaled_reward": -0.08088893629610538, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2820.937515258789, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.21651938557624817, | |
| "kl": 0.004750490188598633, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0002, | |
| "reward": 0.18686847016215324, | |
| "reward_std": 0.7123733684420586, | |
| "rewards/cosine_scaled_reward": -0.11489910446107388, | |
| "rewards/format_reward": 0.41666668467223644, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2965.2709045410156, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.1956688016653061, | |
| "kl": 0.0041599273681640625, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6479230020195246, | |
| "reward_std": 0.7748183347284794, | |
| "rewards/cosine_scaled_reward": 0.11562815494835377, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 2539.500015258789, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.21064119040966034, | |
| "kl": 0.003520965576171875, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0001, | |
| "reward": 0.017007697373628616, | |
| "reward_std": 0.5202154843136668, | |
| "rewards/cosine_scaled_reward": -0.23107950016856194, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2526.562545776367, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.2099539339542389, | |
| "kl": 0.007557868957519531, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5310798175632954, | |
| "reward_std": 0.8000563234090805, | |
| "rewards/cosine_scaled_reward": 0.015539903659373522, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2373.6458740234375, | |
| "epoch": 0.184, | |
| "grad_norm": 0.23696200549602509, | |
| "kl": 0.0062770843505859375, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3601957531645894, | |
| "reward_std": 0.735405445098877, | |
| "rewards/cosine_scaled_reward": -0.08031879365444183, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 2870.604217529297, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.20365777611732483, | |
| "kl": 0.00856781005859375, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2685950770974159, | |
| "reward_std": 0.7880549058318138, | |
| "rewards/cosine_scaled_reward": -0.06361913960427046, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2309.8958740234375, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.2089148312807083, | |
| "kl": 0.004642963409423828, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6906282119452953, | |
| "reward_std": 0.49390939995646477, | |
| "rewards/cosine_scaled_reward": 0.04323074035346508, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2178.3750381469727, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.26612454652786255, | |
| "kl": 0.007852554321289062, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5011991895735264, | |
| "reward_std": 0.5780392102897167, | |
| "rewards/cosine_scaled_reward": -0.020233748480677605, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2490.666702270508, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.25059667229652405, | |
| "kl": 0.005207061767578125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0002, | |
| "reward": 0.056412231642752886, | |
| "reward_std": 0.6775014959275723, | |
| "rewards/cosine_scaled_reward": -0.2009605555795133, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2813.187515258789, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.20505455136299133, | |
| "kl": 0.004026889801025391, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0002, | |
| "reward": -0.00409979373216629, | |
| "reward_std": 0.6606614142656326, | |
| "rewards/cosine_scaled_reward": -0.18954989779740572, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2199.562568664551, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.2305985391139984, | |
| "kl": 0.003949165344238281, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0002, | |
| "reward": 0.34335924312472343, | |
| "reward_std": 0.8879089429974556, | |
| "rewards/cosine_scaled_reward": -0.151237060315907, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2649.583396911621, | |
| "epoch": 0.192, | |
| "grad_norm": 0.2810652256011963, | |
| "kl": 0.00469207763671875, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3732623625546694, | |
| "reward_std": 0.9529096595942974, | |
| "rewards/cosine_scaled_reward": -0.05295216618105769, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1590.3542022705078, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.19241072237491608, | |
| "kl": 0.004169940948486328, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0002, | |
| "reward": 1.4370996933430433, | |
| "reward_std": 0.7862771563231945, | |
| "rewards/cosine_scaled_reward": 0.260216549038887, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2502.375045776367, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.1910967230796814, | |
| "kl": 0.004596710205078125, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3506653420627117, | |
| "reward_std": 0.43432530108839273, | |
| "rewards/cosine_scaled_reward": -0.04341734014451504, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2321.041702270508, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.18431736528873444, | |
| "kl": 0.0033516883850097656, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0001, | |
| "reward": 0.40528116561472416, | |
| "reward_std": 0.5608017090708017, | |
| "rewards/cosine_scaled_reward": -0.057776106521487236, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2714.541732788086, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.22830112278461456, | |
| "kl": 0.007214546203613281, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7660160614177585, | |
| "reward_std": 1.0001494381576777, | |
| "rewards/cosine_scaled_reward": 0.14342465763911605, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1561.3333587646484, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.2993278205394745, | |
| "kl": 0.0068264007568359375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0003, | |
| "reward": 0.46238506003282964, | |
| "reward_std": 0.7197185447439551, | |
| "rewards/cosine_scaled_reward": -0.1333908117376268, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 1839.541732788086, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.274972140789032, | |
| "kl": 0.008336067199707031, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8871902981773019, | |
| "reward_std": 0.7998310215771198, | |
| "rewards/cosine_scaled_reward": 0.04776177019812167, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2464.145866394043, | |
| "epoch": 0.2, | |
| "grad_norm": 0.20217549800872803, | |
| "kl": 0.004665374755859375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4925261875614524, | |
| "reward_std": 0.5973605997860432, | |
| "rewards/cosine_scaled_reward": 0.006679709069430828, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1838.6667175292969, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.24335332214832306, | |
| "kl": 0.0049304962158203125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9096714407205582, | |
| "reward_std": 1.0707368738949299, | |
| "rewards/cosine_scaled_reward": 0.059002356603741646, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2511.875030517578, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.23394758999347687, | |
| "kl": 0.0048389434814453125, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6948387259617448, | |
| "reward_std": 0.757513590157032, | |
| "rewards/cosine_scaled_reward": 0.05575268715620041, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2130.1250534057617, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.28063294291496277, | |
| "kl": 0.00698089599609375, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5327047700993717, | |
| "reward_std": 0.8441273644566536, | |
| "rewards/cosine_scaled_reward": -0.035730951465666294, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2611.0416717529297, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.25523021817207336, | |
| "kl": 0.004611968994140625, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0002, | |
| "reward": -0.09883294254541397, | |
| "reward_std": 0.4169027265161276, | |
| "rewards/cosine_scaled_reward": -0.25774980895221233, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 1505.0833892822266, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.2882588505744934, | |
| "kl": 0.0065937042236328125, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1103055961430073, | |
| "reward_std": 0.8654880682006478, | |
| "rewards/cosine_scaled_reward": 0.15931943291798234, | |
| "rewards/format_reward": 0.7916666697710752, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2787.2708892822266, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.24275778234004974, | |
| "kl": 0.00591278076171875, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1754364101216197, | |
| "reward_std": 0.575681222602725, | |
| "rewards/cosine_scaled_reward": -0.09978180378675461, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2012.4375076293945, | |
| "epoch": 0.208, | |
| "grad_norm": 0.17373254895210266, | |
| "kl": 0.00243377685546875, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.48792053386569023, | |
| "reward_std": 0.6339404806494713, | |
| "rewards/cosine_scaled_reward": -0.07895641587674618, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1676.2916946411133, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.26158607006073, | |
| "kl": 0.007886886596679688, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9213535767048597, | |
| "reward_std": 0.7604264169931412, | |
| "rewards/cosine_scaled_reward": 0.054426767863333225, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2223.9167137145996, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.25392112135887146, | |
| "kl": 0.00632476806640625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5227387230843306, | |
| "reward_std": 0.5629259529523551, | |
| "rewards/cosine_scaled_reward": -0.07196396728977561, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 1681.7500305175781, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.2521141469478607, | |
| "kl": 0.005031585693359375, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2217017700895667, | |
| "reward_std": 0.572849384509027, | |
| "rewards/cosine_scaled_reward": -0.24331580009311438, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2321.8541870117188, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.17236538231372833, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5415416369214654, | |
| "reward_std": 0.6540890671312809, | |
| "rewards/cosine_scaled_reward": -6.251875311136246e-05, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1878.7083587646484, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.24200907349586487, | |
| "kl": 0.0061244964599609375, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5931817078962922, | |
| "reward_std": 0.44314784556627274, | |
| "rewards/cosine_scaled_reward": -0.08882584050297737, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 2685.875015258789, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.2027624398469925, | |
| "kl": 0.0059528350830078125, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0002, | |
| "reward": 0.0031664990819990635, | |
| "reward_std": 0.44171422626823187, | |
| "rewards/cosine_scaled_reward": -0.19633342884480953, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2072.4166870117188, | |
| "epoch": 0.216, | |
| "grad_norm": 0.27338477969169617, | |
| "kl": 0.0058345794677734375, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4580115145072341, | |
| "reward_std": 0.6696468777954578, | |
| "rewards/cosine_scaled_reward": -0.11474426090717316, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 1406.2708740234375, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.2341710776090622, | |
| "kl": 0.00482940673828125, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0002, | |
| "reward": 0.892716939561069, | |
| "reward_std": 0.6160999666899443, | |
| "rewards/cosine_scaled_reward": 0.019275141414254904, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1371.333381652832, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.2618865966796875, | |
| "kl": 0.0059223175048828125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0002, | |
| "reward": 1.064841603860259, | |
| "reward_std": 0.8721816278994083, | |
| "rewards/cosine_scaled_reward": 0.12617080115524004, | |
| "rewards/format_reward": 0.8125000018626451, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2194.729217529297, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.22514396905899048, | |
| "kl": 0.00543975830078125, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0002, | |
| "reward": 0.42392623238265514, | |
| "reward_std": 0.6634904891252518, | |
| "rewards/cosine_scaled_reward": -0.11095356999430805, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2677.687545776367, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.24128860235214233, | |
| "kl": 0.0064220428466796875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0003, | |
| "reward": 0.0647691236808896, | |
| "reward_std": 0.7267616987228394, | |
| "rewards/cosine_scaled_reward": -0.1655321167781949, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2442.937545776367, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.19651657342910767, | |
| "kl": 0.005771636962890625, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9742466537281871, | |
| "reward_std": 1.0655523668974638, | |
| "rewards/cosine_scaled_reward": 0.14337329752743244, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1858.8542175292969, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.25950801372528076, | |
| "kl": 0.005645751953125, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6131953801959753, | |
| "reward_std": 0.8148518763482571, | |
| "rewards/cosine_scaled_reward": -0.08923565968871117, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2861.3958587646484, | |
| "epoch": 0.224, | |
| "grad_norm": 0.26293885707855225, | |
| "kl": 0.006893157958984375, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2952362271025777, | |
| "reward_std": 0.7602911423891783, | |
| "rewards/cosine_scaled_reward": -0.09196521900594234, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1365.2500381469727, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.2826971709728241, | |
| "kl": 0.006282806396484375, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9446334131062031, | |
| "reward_std": 0.7611507624387741, | |
| "rewards/cosine_scaled_reward": 0.04523337911814451, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 1636.1667098999023, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.25861766934394836, | |
| "kl": 0.007457733154296875, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9376457966864109, | |
| "reward_std": 0.8799612354487181, | |
| "rewards/cosine_scaled_reward": 0.052156222984194756, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 1516.4583892822266, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.21497201919555664, | |
| "kl": 0.006725311279296875, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6696783006191254, | |
| "reward_std": 0.5459328815340996, | |
| "rewards/cosine_scaled_reward": -0.14432752039283514, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1373.7291946411133, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.22486717998981476, | |
| "kl": 0.006084442138671875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0002, | |
| "reward": 1.1582428617402911, | |
| "reward_std": 0.6559928604401648, | |
| "rewards/cosine_scaled_reward": 0.11037139501422644, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1785.0833587646484, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.2287452071905136, | |
| "kl": 0.0050449371337890625, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0002, | |
| "reward": 1.4045181944966316, | |
| "reward_std": 0.7379098236560822, | |
| "rewards/cosine_scaled_reward": 0.2855923995375633, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1509.4167098999023, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.21152691543102264, | |
| "kl": 0.0052967071533203125, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0002, | |
| "reward": 1.1190969496965408, | |
| "reward_std": 0.41684375517070293, | |
| "rewards/cosine_scaled_reward": 0.1637151322211139, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1581.0208892822266, | |
| "epoch": 0.232, | |
| "grad_norm": 0.25626081228256226, | |
| "kl": 0.0076751708984375, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6776624768972397, | |
| "reward_std": 0.659516304731369, | |
| "rewards/cosine_scaled_reward": -0.07783541223034263, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1655.520881652832, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.36918991804122925, | |
| "kl": 0.012666702270507812, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0005, | |
| "reward": 0.8364788740873337, | |
| "reward_std": 0.8245303072035313, | |
| "rewards/cosine_scaled_reward": -0.008843917399644852, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1911.3542098999023, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.31795355677604675, | |
| "kl": 0.0064849853515625, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0839662365615368, | |
| "reward_std": 0.9466215074062347, | |
| "rewards/cosine_scaled_reward": 0.15656641125679016, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2088.812526702881, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.21683187782764435, | |
| "kl": 0.0050067901611328125, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2100734617561102, | |
| "reward_std": 0.4785574749112129, | |
| "rewards/cosine_scaled_reward": -0.24912995658814907, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1592.0208587646484, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.3095583915710449, | |
| "kl": 0.0069713592529296875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7113522440195084, | |
| "reward_std": 0.7786879017949104, | |
| "rewards/cosine_scaled_reward": -0.08182388916611671, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1388.1250305175781, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.22331999242305756, | |
| "kl": 0.006237030029296875, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8039972663391382, | |
| "reward_std": 0.817696575075388, | |
| "rewards/cosine_scaled_reward": -0.035501367412507534, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1191.458366394043, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.30164894461631775, | |
| "kl": 0.01032257080078125, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0684526707045734, | |
| "reward_std": 0.7270883917808533, | |
| "rewards/cosine_scaled_reward": 0.0758929792791605, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1736.8750228881836, | |
| "epoch": 0.24, | |
| "grad_norm": 0.2290075421333313, | |
| "kl": 0.005527496337890625, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7679805280640721, | |
| "reward_std": 0.45000065956264734, | |
| "rewards/cosine_scaled_reward": -0.011843102052807808, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1724.3125534057617, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.2615811824798584, | |
| "kl": 0.00815582275390625, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8917091085459106, | |
| "reward_std": 0.6099780108779669, | |
| "rewards/cosine_scaled_reward": 0.07085455022752285, | |
| "rewards/format_reward": 0.7500000037252903, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1340.4583892822266, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.2789488732814789, | |
| "kl": 0.008213043212890625, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1324417181313038, | |
| "reward_std": 0.835416778922081, | |
| "rewards/cosine_scaled_reward": 0.15997080132365227, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1215.229206085205, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.29332393407821655, | |
| "kl": 0.008787155151367188, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1256815567612648, | |
| "reward_std": 0.7665913105010986, | |
| "rewards/cosine_scaled_reward": 0.10450743697583675, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 1867.020866394043, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.320909321308136, | |
| "kl": 0.0081024169921875, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7646492760013643, | |
| "reward_std": 0.8905698489397764, | |
| "rewards/cosine_scaled_reward": 0.017741285264492035, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1342.937515258789, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.25344061851501465, | |
| "kl": 0.0052165985107421875, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5551245659589767, | |
| "reward_std": 0.3942173570394516, | |
| "rewards/cosine_scaled_reward": -0.15993775241076946, | |
| "rewards/format_reward": 0.875, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1305.9791717529297, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.25426533818244934, | |
| "kl": 0.007297515869140625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0224211241584271, | |
| "reward_std": 0.8492502048611641, | |
| "rewards/cosine_scaled_reward": 0.09454387426376343, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1446.4166946411133, | |
| "epoch": 0.248, | |
| "grad_norm": 0.23595848679542542, | |
| "kl": 0.0071563720703125, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0635684814769775, | |
| "reward_std": 0.8497135229408741, | |
| "rewards/cosine_scaled_reward": 0.10470088990405202, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1458.9167022705078, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.23519130051136017, | |
| "kl": 0.007747650146484375, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5358998300507665, | |
| "reward_std": 0.8287533260881901, | |
| "rewards/cosine_scaled_reward": -0.1799667701125145, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1337.0000381469727, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.23715050518512726, | |
| "kl": 0.0068759918212890625, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0003, | |
| "reward": 1.036239080131054, | |
| "reward_std": 0.7431819960474968, | |
| "rewards/cosine_scaled_reward": 0.0910361991263926, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1415.9166831970215, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.28580737113952637, | |
| "kl": 0.008144378662109375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3407529406249523, | |
| "reward_std": 0.472426300868392, | |
| "rewards/cosine_scaled_reward": -0.2462902208790183, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1271.3958587646484, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.2041638046503067, | |
| "kl": 0.0054531097412109375, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9021317735314369, | |
| "reward_std": 0.49666406959295273, | |
| "rewards/cosine_scaled_reward": 0.013565851375460625, | |
| "rewards/format_reward": 0.875, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1608.208366394043, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.24548400938510895, | |
| "kl": 0.006893157958984375, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9311691895127296, | |
| "reward_std": 0.7908135317265987, | |
| "rewards/cosine_scaled_reward": 0.0489178872667253, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1477.6041870117188, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.2579537034034729, | |
| "kl": 0.007343292236328125, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6923428289592266, | |
| "reward_std": 0.5176126770675182, | |
| "rewards/cosine_scaled_reward": -0.03924527019262314, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2049.041702270508, | |
| "epoch": 0.256, | |
| "grad_norm": 0.18312445282936096, | |
| "kl": 0.0058765411376953125, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6417203694581985, | |
| "reward_std": 0.7141322754323483, | |
| "rewards/cosine_scaled_reward": -0.07497315760701895, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2165.0000915527344, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.3362411558628082, | |
| "kl": 0.018795013427734375, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0008, | |
| "reward": 0.4098323443904519, | |
| "reward_std": 0.903562568128109, | |
| "rewards/cosine_scaled_reward": -0.12841718492563814, | |
| "rewards/format_reward": 0.6666666883975267, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1517.020866394043, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.20612592995166779, | |
| "kl": 0.0065460205078125, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0003, | |
| "reward": 1.2147440239787102, | |
| "reward_std": 0.8476376309990883, | |
| "rewards/cosine_scaled_reward": 0.15945532266050577, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1337.7917098999023, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.29225802421569824, | |
| "kl": 0.010477066040039062, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8250191137194633, | |
| "reward_std": 0.7360146790742874, | |
| "rewards/cosine_scaled_reward": -0.06665713712573051, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1411.6666984558105, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.2814786732196808, | |
| "kl": 0.00687408447265625, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0481851011281833, | |
| "reward_std": 0.7632241751998663, | |
| "rewards/cosine_scaled_reward": 0.11784252151846886, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1429.1666870117188, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.26221880316734314, | |
| "kl": 0.008052825927734375, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7309114846866578, | |
| "reward_std": 0.46567995101213455, | |
| "rewards/cosine_scaled_reward": -0.06162761617451906, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 2006.1667175292969, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.24703645706176758, | |
| "kl": 0.008714675903320312, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3682295884937048, | |
| "reward_std": 0.6135462559759617, | |
| "rewards/cosine_scaled_reward": -0.17005188344046474, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1523.9375305175781, | |
| "epoch": 0.264, | |
| "grad_norm": 0.21581248939037323, | |
| "kl": 0.00759124755859375, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0537027399986982, | |
| "reward_std": 0.791351318359375, | |
| "rewards/cosine_scaled_reward": 0.058101359754800797, | |
| "rewards/format_reward": 0.9375, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 1894.0833587646484, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.23024173080921173, | |
| "kl": 0.0105438232421875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3341958597302437, | |
| "reward_std": 0.666959872469306, | |
| "rewards/cosine_scaled_reward": -0.2079020773526281, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1035.2500228881836, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.2399219274520874, | |
| "kl": 0.006580352783203125, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7500397872645408, | |
| "reward_std": 0.6901755221188068, | |
| "rewards/cosine_scaled_reward": -0.11456345673650503, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 1623.7083892822266, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.25343385338783264, | |
| "kl": 0.0087127685546875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5469427425414324, | |
| "reward_std": 0.6777470409870148, | |
| "rewards/cosine_scaled_reward": -0.12236198072787374, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1307.1458587646484, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.2843048870563507, | |
| "kl": 0.010669708251953125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1278857234865427, | |
| "reward_std": 0.7025517448782921, | |
| "rewards/cosine_scaled_reward": 0.13685949333012104, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1815.020866394043, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.21867913007736206, | |
| "kl": 0.0071773529052734375, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7069348245859146, | |
| "reward_std": 0.9415349438786507, | |
| "rewards/cosine_scaled_reward": -0.031949267257004976, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1392.4167022705078, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.23793146014213562, | |
| "kl": 0.0063190460205078125, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8629608564078808, | |
| "reward_std": 0.4519681539386511, | |
| "rewards/cosine_scaled_reward": -0.006019574124366045, | |
| "rewards/format_reward": 0.875, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1157.6667098999023, | |
| "epoch": 0.272, | |
| "grad_norm": 0.23921653628349304, | |
| "kl": 0.00856781005859375, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1124465055763721, | |
| "reward_std": 0.6771237980574369, | |
| "rewards/cosine_scaled_reward": 0.07705656159669161, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1381.0208778381348, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.21902450919151306, | |
| "kl": 0.005985260009765625, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0002, | |
| "reward": 1.3559186151251197, | |
| "reward_std": 0.47896091267466545, | |
| "rewards/cosine_scaled_reward": 0.2821259554475546, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1726.4583740234375, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.32664069533348083, | |
| "kl": 0.012561798095703125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0005, | |
| "reward": 0.33731127402279526, | |
| "reward_std": 0.5951757170259953, | |
| "rewards/cosine_scaled_reward": -0.19592771586030722, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1596.1667098999023, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.2804366648197174, | |
| "kl": 0.009317398071289062, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3176069939509034, | |
| "reward_std": 0.5079055763781071, | |
| "rewards/cosine_scaled_reward": -0.24744650442153215, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1209.8750381469727, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.4020167887210846, | |
| "kl": 0.01398468017578125, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7724186182022095, | |
| "reward_std": 0.48072919889818877, | |
| "rewards/cosine_scaled_reward": -0.08254071744158864, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1556.895866394043, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.2093649059534073, | |
| "kl": 0.0073986053466796875, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7962484285235405, | |
| "reward_std": 0.7191448770463467, | |
| "rewards/cosine_scaled_reward": -0.018542497418820858, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1644.9792022705078, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.25173234939575195, | |
| "kl": 0.008056640625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0003, | |
| "reward": 1.000451274216175, | |
| "reward_std": 0.8420288115739822, | |
| "rewards/cosine_scaled_reward": 0.08355897013098001, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1743.6250457763672, | |
| "epoch": 0.28, | |
| "grad_norm": 0.24979065358638763, | |
| "kl": 0.0075702667236328125, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6352044828236103, | |
| "reward_std": 0.9859512895345688, | |
| "rewards/cosine_scaled_reward": -0.03656444209627807, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1504.3542251586914, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.2286195158958435, | |
| "kl": 0.00997161865234375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6435268502682447, | |
| "reward_std": 0.7222435772418976, | |
| "rewards/cosine_scaled_reward": -0.11573660443536937, | |
| "rewards/format_reward": 0.875, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 1978.8959045410156, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.2635015547275543, | |
| "kl": 0.009313583374023438, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0004, | |
| "reward": 0.2780038760975003, | |
| "reward_std": 0.4847924932837486, | |
| "rewards/cosine_scaled_reward": -0.19433141965419054, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1440.7083358764648, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.29032102227211, | |
| "kl": 0.010227203369140625, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1146562099456787, | |
| "reward_std": 0.5486158076673746, | |
| "rewards/cosine_scaled_reward": 0.18232804723083973, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1401.3541946411133, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.25068965554237366, | |
| "kl": 0.012468338012695312, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0005, | |
| "reward": 1.2556766234338284, | |
| "reward_std": 0.6144618764519691, | |
| "rewards/cosine_scaled_reward": 0.21117158699780703, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1540.958381652832, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.27915748953819275, | |
| "kl": 0.013950347900390625, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8930262625217438, | |
| "reward_std": 0.6684892605990171, | |
| "rewards/cosine_scaled_reward": 0.00901312252972275, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1162.5416984558105, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.3440181314945221, | |
| "kl": 0.013868331909179688, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8275420293211937, | |
| "reward_std": 0.6679180320352316, | |
| "rewards/cosine_scaled_reward": -0.0445623523555696, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1791.7083740234375, | |
| "epoch": 0.288, | |
| "grad_norm": 0.25717219710350037, | |
| "kl": 0.011442184448242188, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0005, | |
| "reward": 0.5102682635188103, | |
| "reward_std": 0.5698316707275808, | |
| "rewards/cosine_scaled_reward": -0.1406992208212614, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1678.583381652832, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.2983105778694153, | |
| "kl": 0.013790130615234375, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0006, | |
| "reward": 0.6175373089499772, | |
| "reward_std": 0.8155984878540039, | |
| "rewards/cosine_scaled_reward": -0.07664802484214306, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1746.1667289733887, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.36004897952079773, | |
| "kl": 0.013019561767578125, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0005, | |
| "reward": 0.724624989379663, | |
| "reward_std": 0.8076631315052509, | |
| "rewards/cosine_scaled_reward": 0.008145819883793592, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 1871.0833892822266, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.21529468894004822, | |
| "kl": 0.01010894775390625, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0004, | |
| "reward": 0.23611869476735592, | |
| "reward_std": 0.5141230337321758, | |
| "rewards/cosine_scaled_reward": -0.2673573372885585, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1561.4375686645508, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.2703370749950409, | |
| "kl": 0.011600494384765625, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0005, | |
| "reward": 0.8081665523350239, | |
| "reward_std": 0.6887451633810997, | |
| "rewards/cosine_scaled_reward": -0.033416735008358955, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 1959.0000686645508, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.23733022809028625, | |
| "kl": 0.0087432861328125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0470282435417175, | |
| "reward_std": 0.8330266922712326, | |
| "rewards/cosine_scaled_reward": 0.12768075708299875, | |
| "rewards/format_reward": 0.7916666697710752, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1762.2083892822266, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.26813915371894836, | |
| "kl": 0.008697509765625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9685253538191319, | |
| "reward_std": 0.8117741793394089, | |
| "rewards/cosine_scaled_reward": 0.03634601645171642, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1260.0416946411133, | |
| "epoch": 0.296, | |
| "grad_norm": 0.3185918927192688, | |
| "kl": 0.010593414306640625, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0911660604178905, | |
| "reward_std": 0.5376893002539873, | |
| "rewards/cosine_scaled_reward": 0.09766635159030557, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1093.0000228881836, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.27099287509918213, | |
| "kl": 0.0073299407958984375, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0003, | |
| "reward": 1.3254448138177395, | |
| "reward_std": 0.5788962724618614, | |
| "rewards/cosine_scaled_reward": 0.1731390468776226, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2024.1041717529297, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.20667137205600739, | |
| "kl": 0.01210784912109375, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0005, | |
| "reward": 0.3676602290943265, | |
| "reward_std": 0.42088818456977606, | |
| "rewards/cosine_scaled_reward": -0.12866988312453032, | |
| "rewards/format_reward": 0.625, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1456.5417098999023, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.3141776919364929, | |
| "kl": 0.01071929931640625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4814309529028833, | |
| "reward_std": 0.6453931964933872, | |
| "rewards/cosine_scaled_reward": -0.16553453914821148, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1237.5417098999023, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.2402588427066803, | |
| "kl": 0.007175445556640625, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7148367003537714, | |
| "reward_std": 0.6169497780501842, | |
| "rewards/cosine_scaled_reward": -0.10091499425470829, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1231.3750228881836, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.24805474281311035, | |
| "kl": 0.007282257080078125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6490657306276262, | |
| "reward_std": 0.6103183180093765, | |
| "rewards/cosine_scaled_reward": -0.1338004870340228, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1342.270881652832, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.2401592880487442, | |
| "kl": 0.00934600830078125, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1422894559800625, | |
| "reward_std": 0.7356252074241638, | |
| "rewards/cosine_scaled_reward": 0.09197801724076271, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1498.458366394043, | |
| "epoch": 0.304, | |
| "grad_norm": 0.2557384967803955, | |
| "kl": 0.00916290283203125, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6136485729366541, | |
| "reward_std": 0.7062950804829597, | |
| "rewards/cosine_scaled_reward": -0.13067573634907603, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2083.458351135254, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.24998563528060913, | |
| "kl": 0.016796112060546875, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0007, | |
| "reward": 0.18751574913039804, | |
| "reward_std": 0.5016277078539133, | |
| "rewards/cosine_scaled_reward": -0.20832546008750796, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1216.0000534057617, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.3383597433567047, | |
| "kl": 0.012638092041015625, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0005, | |
| "reward": 0.7704824209213257, | |
| "reward_std": 0.6409645788371563, | |
| "rewards/cosine_scaled_reward": -0.07309213420376182, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1684.208381652832, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.2874971032142639, | |
| "kl": 0.009983062744140625, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6531256909947842, | |
| "reward_std": 0.661817979067564, | |
| "rewards/cosine_scaled_reward": -0.058853823225945234, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1621.5000762939453, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.20542392134666443, | |
| "kl": 0.010227203369140625, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9151211036369205, | |
| "reward_std": 0.7712918408215046, | |
| "rewards/cosine_scaled_reward": 0.00964385224506259, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1346.1042175292969, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.258817583322525, | |
| "kl": 0.01062774658203125, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0004, | |
| "reward": 1.3745552115142345, | |
| "reward_std": 0.6398672200739384, | |
| "rewards/cosine_scaled_reward": 0.23936093723023077, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 1769.3958740234375, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.2116776555776596, | |
| "kl": 0.011043548583984375, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0004, | |
| "reward": 0.827501755207777, | |
| "reward_std": 0.5647195130586624, | |
| "rewards/cosine_scaled_reward": -0.0029158147517591715, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1371.8958740234375, | |
| "epoch": 0.312, | |
| "grad_norm": 0.2546485960483551, | |
| "kl": 0.010175704956054688, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9816151913255453, | |
| "reward_std": 0.7047781832516193, | |
| "rewards/cosine_scaled_reward": 0.06372424028813839, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1070.8333702087402, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.29803934693336487, | |
| "kl": 0.012859344482421875, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0005, | |
| "reward": 1.2880802303552628, | |
| "reward_std": 0.7172955796122551, | |
| "rewards/cosine_scaled_reward": 0.1544567703604116, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 1502.5417098999023, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.19667911529541016, | |
| "kl": 0.0100860595703125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1533876582980156, | |
| "reward_std": 0.7076623123139143, | |
| "rewards/cosine_scaled_reward": 0.14961049146950245, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1537.7916946411133, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.27277708053588867, | |
| "kl": 0.016254425048828125, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8316274359822273, | |
| "reward_std": 0.9768596217036247, | |
| "rewards/cosine_scaled_reward": 0.019980370067059994, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1527.833381652832, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.451528936624527, | |
| "kl": 0.01665496826171875, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0007, | |
| "reward": 0.9556568302214146, | |
| "reward_std": 0.8683530241250992, | |
| "rewards/cosine_scaled_reward": 0.09241172997280955, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1412.8542175292969, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.4396210312843323, | |
| "kl": 0.011110305786132812, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1146881133317947, | |
| "reward_std": 0.6263987999409437, | |
| "rewards/cosine_scaled_reward": 0.11984403152018785, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1693.0000610351562, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.4070056974887848, | |
| "kl": 0.016384124755859375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0007, | |
| "reward": 0.6888991240411997, | |
| "reward_std": 0.8856545202434063, | |
| "rewards/cosine_scaled_reward": -0.05138378031551838, | |
| "rewards/format_reward": 0.7916666939854622, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1793.3333892822266, | |
| "epoch": 0.32, | |
| "grad_norm": 0.3356776535511017, | |
| "kl": 0.01377105712890625, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1871663890779018, | |
| "reward_std": 1.0228201150894165, | |
| "rewards/cosine_scaled_reward": 0.21858319267630577, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2391.250015258789, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.20177552103996277, | |
| "kl": 0.01556396484375, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0006, | |
| "reward": 0.35207285173237324, | |
| "reward_std": 0.8479718416929245, | |
| "rewards/cosine_scaled_reward": -0.10521360114216805, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1301.4167098999023, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.3195320665836334, | |
| "kl": 0.010005950927734375, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8210245547816157, | |
| "reward_std": 0.5980393732897937, | |
| "rewards/cosine_scaled_reward": -0.016571074724197388, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2050.916732788086, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.2939506769180298, | |
| "kl": 0.0125579833984375, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0005, | |
| "reward": 0.9690306037664413, | |
| "reward_std": 1.016658142209053, | |
| "rewards/cosine_scaled_reward": 0.10951527790166438, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1105.9583587646484, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.2861945629119873, | |
| "kl": 0.008968353271484375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7402211502194405, | |
| "reward_std": 0.7317556664347649, | |
| "rewards/cosine_scaled_reward": -0.11947278678417206, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1011.2917060852051, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.2673451602458954, | |
| "kl": 0.010982513427734375, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0782419480383396, | |
| "reward_std": 0.5894357562065125, | |
| "rewards/cosine_scaled_reward": 0.05995429493486881, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1348.2916870117188, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.28173941373825073, | |
| "kl": 0.0141448974609375, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7097440287470818, | |
| "reward_std": 0.824833694845438, | |
| "rewards/cosine_scaled_reward": -0.08262801356613636, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1311.8750457763672, | |
| "epoch": 0.328, | |
| "grad_norm": 0.3286299407482147, | |
| "kl": 0.015102386474609375, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7719477713108063, | |
| "reward_std": 0.6392118521034718, | |
| "rewards/cosine_scaled_reward": 0.021390528418123722, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 1523.4167251586914, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.3453865349292755, | |
| "kl": 0.01168060302734375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0005, | |
| "reward": 0.7673552545456914, | |
| "reward_std": 0.8511558324098587, | |
| "rewards/cosine_scaled_reward": -0.06423905096016824, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 1484.6875343322754, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.6707473993301392, | |
| "kl": 0.01996612548828125, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0008, | |
| "reward": 0.7081136375200003, | |
| "reward_std": 0.5418885201215744, | |
| "rewards/cosine_scaled_reward": -0.03135988023132086, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1023.958366394043, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.28844523429870605, | |
| "kl": 0.009992599487304688, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0698336996138096, | |
| "reward_std": 0.7318692095577717, | |
| "rewards/cosine_scaled_reward": 0.05575018119998276, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1331.9583740234375, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.2491942197084427, | |
| "kl": 0.011157989501953125, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9108222480863333, | |
| "reward_std": 0.8276476263999939, | |
| "rewards/cosine_scaled_reward": -0.013338901073439047, | |
| "rewards/format_reward": 0.9375, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 1538.854232788086, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.2686867117881775, | |
| "kl": 0.011173248291015625, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6097328625619411, | |
| "reward_std": 0.5690296031534672, | |
| "rewards/cosine_scaled_reward": -0.15346692875027657, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 1027.208351135254, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.26298969984054565, | |
| "kl": 0.00969696044921875, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8816844671964645, | |
| "reward_std": 0.501637440174818, | |
| "rewards/cosine_scaled_reward": -0.05915777012705803, | |
| "rewards/format_reward": 1.0, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 1876.2291793823242, | |
| "epoch": 0.336, | |
| "grad_norm": 0.2713903784751892, | |
| "kl": 0.015285491943359375, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8239198550581932, | |
| "reward_std": 0.8811575844883919, | |
| "rewards/cosine_scaled_reward": -0.004706733860075474, | |
| "rewards/format_reward": 0.833333333954215, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 1408.4792022705078, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.357388973236084, | |
| "kl": 0.0163726806640625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0007, | |
| "reward": 1.1488236412405968, | |
| "reward_std": 0.6806459426879883, | |
| "rewards/cosine_scaled_reward": 0.1160784661769867, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 1524.8333702087402, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.3012818694114685, | |
| "kl": 0.01403045654296875, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0006, | |
| "reward": 0.6681146812625229, | |
| "reward_std": 0.5912859179079533, | |
| "rewards/cosine_scaled_reward": -0.11385933961719275, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2066.229202270508, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.3773079514503479, | |
| "kl": 0.02051544189453125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0008, | |
| "reward": 0.6007692717248574, | |
| "reward_std": 0.8569255843758583, | |
| "rewards/cosine_scaled_reward": -0.022532058879733086, | |
| "rewards/format_reward": 0.6458333544433117, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 1489.8541946411133, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.28374582529067993, | |
| "kl": 0.011333465576171875, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0005, | |
| "reward": 0.8617406606208533, | |
| "reward_std": 0.6877223812043667, | |
| "rewards/cosine_scaled_reward": 0.05587031855247915, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 1407.5417022705078, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.35291406512260437, | |
| "kl": 0.013866424560546875, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8531467970460653, | |
| "reward_std": 0.6483141556382179, | |
| "rewards/cosine_scaled_reward": -0.021343314554542303, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 1619.1042137145996, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.3786706030368805, | |
| "kl": 0.01715850830078125, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0007, | |
| "reward": 0.6408427469432354, | |
| "reward_std": 0.5885756351053715, | |
| "rewards/cosine_scaled_reward": -0.08582865633070469, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 1336.9375228881836, | |
| "epoch": 0.344, | |
| "grad_norm": 0.34971511363983154, | |
| "kl": 0.01776885986328125, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0007, | |
| "reward": 0.6967362184077501, | |
| "reward_std": 0.5865316716954112, | |
| "rewards/cosine_scaled_reward": -0.12038189405575395, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1755.6458854675293, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.4290507137775421, | |
| "kl": 0.027614593505859375, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0011, | |
| "reward": 0.9262686646543443, | |
| "reward_std": 0.850765410810709, | |
| "rewards/cosine_scaled_reward": 0.08813432417809963, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1268.6250457763672, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.34461113810539246, | |
| "kl": 0.0146026611328125, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7479919455945492, | |
| "reward_std": 0.7392286397516727, | |
| "rewards/cosine_scaled_reward": -0.06350404699333012, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1241.6250305175781, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.34643009305000305, | |
| "kl": 0.012691497802734375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6787547403946519, | |
| "reward_std": 0.644066970795393, | |
| "rewards/cosine_scaled_reward": -0.10853931680321693, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1368.2708587646484, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.27775081992149353, | |
| "kl": 0.01265716552734375, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6714181881397963, | |
| "reward_std": 0.6255205329507589, | |
| "rewards/cosine_scaled_reward": -0.13304092781618237, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1242.1667175292969, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.3250581622123718, | |
| "kl": 0.02001190185546875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0008, | |
| "reward": 1.175108065828681, | |
| "reward_std": 0.6523913107812405, | |
| "rewards/cosine_scaled_reward": 0.16047068312764168, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1049.9791831970215, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.27137529850006104, | |
| "kl": 0.00885009765625, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7276175357401371, | |
| "reward_std": 0.7274761945009232, | |
| "rewards/cosine_scaled_reward": -0.08410793542861938, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2084.3333892822266, | |
| "epoch": 0.352, | |
| "grad_norm": 0.3471006155014038, | |
| "kl": 0.02095794677734375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0008, | |
| "reward": 0.6365162413567305, | |
| "reward_std": 0.7663756646215916, | |
| "rewards/cosine_scaled_reward": -0.025491908192634583, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 1840.1667022705078, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.2802172899246216, | |
| "kl": 0.019718170166015625, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0008, | |
| "reward": 0.56682463362813, | |
| "reward_std": 0.7632914148271084, | |
| "rewards/cosine_scaled_reward": -0.11242103201220743, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1546.208396911621, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.6738027930259705, | |
| "kl": 0.025562286376953125, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.001, | |
| "reward": 0.5175626166164875, | |
| "reward_std": 0.6167891919612885, | |
| "rewards/cosine_scaled_reward": -0.16830204287543893, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1196.2708892822266, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.3456050455570221, | |
| "kl": 0.013538360595703125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0005, | |
| "reward": 1.0239670518785715, | |
| "reward_std": 0.618289714679122, | |
| "rewards/cosine_scaled_reward": 0.04323352035135031, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1250.6042022705078, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.3434150218963623, | |
| "kl": 0.02196502685546875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0009, | |
| "reward": 1.3225273452699184, | |
| "reward_std": 0.4847991270944476, | |
| "rewards/cosine_scaled_reward": 0.2133469949476421, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 1696.4791831970215, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.2879071533679962, | |
| "kl": 0.02275848388671875, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0009, | |
| "reward": 0.9180621989071369, | |
| "reward_std": 0.7070650856476277, | |
| "rewards/cosine_scaled_reward": 0.094447772949934, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 1686.1250305175781, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.26152166724205017, | |
| "kl": 0.030666351318359375, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0012, | |
| "reward": 0.8729692408815026, | |
| "reward_std": 0.4160381481051445, | |
| "rewards/cosine_scaled_reward": 0.0823179455474019, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 1898.3125305175781, | |
| "epoch": 0.36, | |
| "grad_norm": 0.48658284544944763, | |
| "kl": 0.04524993896484375, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0018, | |
| "reward": 0.8393008848652244, | |
| "reward_std": 0.5817160941660404, | |
| "rewards/cosine_scaled_reward": 0.03423376381397247, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 2036.4583892822266, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.43442559242248535, | |
| "kl": 0.045146942138671875, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0018, | |
| "reward": 0.39593026289367117, | |
| "reward_std": 0.6670566536486149, | |
| "rewards/cosine_scaled_reward": -0.1874515525996685, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 1617.8542022705078, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.4115068018436432, | |
| "kl": 0.02794647216796875, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0011, | |
| "reward": 0.5094092178624123, | |
| "reward_std": 0.5757773853838444, | |
| "rewards/cosine_scaled_reward": -0.09946207702159882, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1241.1875228881836, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.5369545221328735, | |
| "kl": 0.02693939208984375, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0011, | |
| "reward": 0.561584232840687, | |
| "reward_std": 0.5914283934980631, | |
| "rewards/cosine_scaled_reward": -0.1775412478018552, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 1330.9375457763672, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.2861523926258087, | |
| "kl": 0.016010284423828125, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0006, | |
| "reward": 0.45444004982709885, | |
| "reward_std": 0.46621063351631165, | |
| "rewards/cosine_scaled_reward": -0.22069666720926762, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1153.9792022705078, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.7957829236984253, | |
| "kl": 0.04431343078613281, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0018, | |
| "reward": 0.8477955907583237, | |
| "reward_std": 0.5735172219574451, | |
| "rewards/cosine_scaled_reward": -0.055268908850848675, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 939.2291946411133, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.34620678424835205, | |
| "kl": 0.014583587646484375, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0206873081624508, | |
| "reward_std": 0.5673208367079496, | |
| "rewards/cosine_scaled_reward": 0.031176931224763393, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 1747.8750228881836, | |
| "epoch": 0.368, | |
| "grad_norm": 0.6450039744377136, | |
| "kl": 0.06949234008789062, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0028, | |
| "reward": 0.44154978170990944, | |
| "reward_std": 0.7411049008369446, | |
| "rewards/cosine_scaled_reward": -0.12297512916848063, | |
| "rewards/format_reward": 0.6875000037252903, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 1514.3125457763672, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.43106693029403687, | |
| "kl": 0.040225982666015625, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0016, | |
| "reward": 0.5237643220461905, | |
| "reward_std": 0.7173379920423031, | |
| "rewards/cosine_scaled_reward": -0.11311787366867065, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 1323.6875228881836, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.3708251416683197, | |
| "kl": 0.024990081787109375, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.001, | |
| "reward": 0.5681585122365505, | |
| "reward_std": 0.7323919646441936, | |
| "rewards/cosine_scaled_reward": -0.14300409331917763, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 1668.4583587646484, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.4666387140750885, | |
| "kl": 0.03742218017578125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0015, | |
| "reward": 0.8824842711910605, | |
| "reward_std": 0.6758766230195761, | |
| "rewards/cosine_scaled_reward": 0.03499212674796581, | |
| "rewards/format_reward": 0.8125000037252903, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 1059.020851135254, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.34716206789016724, | |
| "kl": 0.019435882568359375, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0008, | |
| "reward": 0.9567670961841941, | |
| "reward_std": 0.6891666799783707, | |
| "rewards/cosine_scaled_reward": 0.020050194929353893, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 1584.8125381469727, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.4085078537464142, | |
| "kl": 0.02321624755859375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0009, | |
| "reward": 0.8718113675713539, | |
| "reward_std": 0.9188407957553864, | |
| "rewards/cosine_scaled_reward": 0.02965566364582628, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 1545.1041870117188, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.49370771646499634, | |
| "kl": 0.027523040771484375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0011, | |
| "reward": 0.4031808990985155, | |
| "reward_std": 0.7104494869709015, | |
| "rewards/cosine_scaled_reward": -0.20465956535190344, | |
| "rewards/format_reward": 0.8125000204890966, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 898.4375228881836, | |
| "epoch": 0.376, | |
| "grad_norm": 0.4591941833496094, | |
| "kl": 0.01458740234375, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0006, | |
| "reward": 0.96208087913692, | |
| "reward_std": 0.4870151877403259, | |
| "rewards/cosine_scaled_reward": 0.012290460988879204, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 1459.0000457763672, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.645241379737854, | |
| "kl": 0.0567169189453125, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0023, | |
| "reward": 0.4072363208979368, | |
| "reward_std": 0.7110217055305839, | |
| "rewards/cosine_scaled_reward": -0.17138185133808292, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 1496.1250381469727, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.6915489435195923, | |
| "kl": 0.037700653076171875, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0015, | |
| "reward": 0.7328273691236973, | |
| "reward_std": 0.8054426815360785, | |
| "rewards/cosine_scaled_reward": -0.050252995104528964, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 1504.270881652832, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.4096035361289978, | |
| "kl": 0.03113555908203125, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0012, | |
| "reward": 0.6928669670596719, | |
| "reward_std": 0.5561900734901428, | |
| "rewards/cosine_scaled_reward": -0.1014832123182714, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1285.3333587646484, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.34817183017730713, | |
| "kl": 0.03202056884765625, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0013, | |
| "reward": 0.9648001305758953, | |
| "reward_std": 0.7506307512521744, | |
| "rewards/cosine_scaled_reward": 0.0032333843410015106, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 1845.9375610351562, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.6175907850265503, | |
| "kl": 0.06725311279296875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0027, | |
| "reward": 0.17733338894322515, | |
| "reward_std": 0.5020285062491894, | |
| "rewards/cosine_scaled_reward": -0.2654999643564224, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 1298.895866394043, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.28870290517807007, | |
| "kl": 0.02230072021484375, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0009, | |
| "reward": 0.8596498997649178, | |
| "reward_std": 0.7761356085538864, | |
| "rewards/cosine_scaled_reward": -0.018091744743287563, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 1496.7500381469727, | |
| "epoch": 0.384, | |
| "grad_norm": 0.43910446763038635, | |
| "kl": 0.038936614990234375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0016, | |
| "reward": 0.9042559079825878, | |
| "reward_std": 0.7216421309858561, | |
| "rewards/cosine_scaled_reward": 0.04587792372331023, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 1552.6250610351562, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.5697144865989685, | |
| "kl": 0.0431671142578125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0017, | |
| "reward": 0.5564747964963317, | |
| "reward_std": 0.7052949294447899, | |
| "rewards/cosine_scaled_reward": -0.15926262829452753, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1273.7500457763672, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.6811116337776184, | |
| "kl": 0.03277587890625, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0013, | |
| "reward": 0.6950095873326063, | |
| "reward_std": 0.5781379528343678, | |
| "rewards/cosine_scaled_reward": -0.08999521844089031, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 1495.3750305175781, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.4718823730945587, | |
| "kl": 0.0449981689453125, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0018, | |
| "reward": 0.4238283894956112, | |
| "reward_std": 0.58335055783391, | |
| "rewards/cosine_scaled_reward": -0.17350250110030174, | |
| "rewards/format_reward": 0.770833345130086, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 1411.208366394043, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.38769999146461487, | |
| "kl": 0.033802032470703125, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0014, | |
| "reward": 0.5338742323219776, | |
| "reward_std": 0.5776225738227367, | |
| "rewards/cosine_scaled_reward": -0.17056290060281754, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 1547.5417251586914, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.7106971144676208, | |
| "kl": 0.05750274658203125, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0023, | |
| "reward": 1.1556610856205225, | |
| "reward_std": 0.6674655824899673, | |
| "rewards/cosine_scaled_reward": 0.16116386279463768, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 1866.6250228881836, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.8300604820251465, | |
| "kl": 0.08011245727539062, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0032, | |
| "reward": 0.7030050987377763, | |
| "reward_std": 1.0421876087784767, | |
| "rewards/cosine_scaled_reward": -0.013080822303891182, | |
| "rewards/format_reward": 0.7291666809469461, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 1520.3958587646484, | |
| "epoch": 0.392, | |
| "grad_norm": 0.41595587134361267, | |
| "kl": 0.0402679443359375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0016, | |
| "reward": 1.10817926004529, | |
| "reward_std": 0.7470837235450745, | |
| "rewards/cosine_scaled_reward": 0.12700629979372025, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 1331.5417022705078, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.5218416452407837, | |
| "kl": 0.045536041259765625, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0018, | |
| "reward": 1.484528623521328, | |
| "reward_std": 0.6807431867346168, | |
| "rewards/cosine_scaled_reward": 0.27351428056135774, | |
| "rewards/format_reward": 0.9375, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 1333.645866394043, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.5664793848991394, | |
| "kl": 0.0484161376953125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0019, | |
| "reward": 1.0393912829458714, | |
| "reward_std": 0.7020993046462536, | |
| "rewards/cosine_scaled_reward": 0.06136229634284973, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 1467.8750381469727, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.4097067713737488, | |
| "kl": 0.028484344482421875, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0011, | |
| "reward": 0.8963276408612728, | |
| "reward_std": 0.6313250102102757, | |
| "rewards/cosine_scaled_reward": -0.031002862378954887, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1563.0417022705078, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.39142462611198425, | |
| "kl": 0.02931976318359375, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0012, | |
| "reward": 0.4781823381781578, | |
| "reward_std": 0.5552436150610447, | |
| "rewards/cosine_scaled_reward": -0.2088255239650607, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 1574.6042251586914, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.4851846694946289, | |
| "kl": 0.08718490600585938, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0035, | |
| "reward": 0.7802252694964409, | |
| "reward_std": 0.684166319668293, | |
| "rewards/cosine_scaled_reward": -0.0369707178324461, | |
| "rewards/format_reward": 0.8541666697710752, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1409.9583892822266, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.5291475057601929, | |
| "kl": 0.079132080078125, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0032, | |
| "reward": 0.9304023738950491, | |
| "reward_std": 0.6878850013017654, | |
| "rewards/cosine_scaled_reward": 0.03811782307457179, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 1006.1458587646484, | |
| "epoch": 0.4, | |
| "grad_norm": 0.5923649072647095, | |
| "kl": 0.02787017822265625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0011, | |
| "reward": 0.6561646163463593, | |
| "reward_std": 0.7408818565309048, | |
| "rewards/cosine_scaled_reward": -0.14066770486533642, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1188.145866394043, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.7286760210990906, | |
| "kl": 0.0440216064453125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0018, | |
| "reward": 0.8188043851405382, | |
| "reward_std": 0.5987816452980042, | |
| "rewards/cosine_scaled_reward": -0.048931147903203964, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 1407.7292022705078, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.4982418119907379, | |
| "kl": 0.07025146484375, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0028, | |
| "reward": 0.7963543608784676, | |
| "reward_std": 0.7775063142180443, | |
| "rewards/cosine_scaled_reward": -0.02890616189688444, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1118.7083549499512, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.35043564438819885, | |
| "kl": 0.023746490478515625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.001, | |
| "reward": 1.168786108493805, | |
| "reward_std": 0.5713744387030602, | |
| "rewards/cosine_scaled_reward": 0.08439303282648325, | |
| "rewards/format_reward": 1.0, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1018.5000228881836, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.33321675658226013, | |
| "kl": 0.014415740966796875, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1353367045521736, | |
| "reward_std": 0.5776836480945349, | |
| "rewards/cosine_scaled_reward": 0.06766833364963531, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 943.4375228881836, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.4005463123321533, | |
| "kl": 0.013919830322265625, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0006, | |
| "reward": 1.04657375626266, | |
| "reward_std": 0.7807271406054497, | |
| "rewards/cosine_scaled_reward": 0.044120170176029205, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 1453.2500534057617, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 5.388972759246826, | |
| "kl": 0.15525436401367188, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0062, | |
| "reward": 0.8825728315860033, | |
| "reward_std": 0.692312303930521, | |
| "rewards/cosine_scaled_reward": 0.003786402754485607, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 1924.916732788086, | |
| "epoch": 0.408, | |
| "grad_norm": 2.25702166557312, | |
| "kl": 0.13177490234375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0053, | |
| "reward": 0.580949871102348, | |
| "reward_std": 0.7843025289475918, | |
| "rewards/cosine_scaled_reward": -0.12619174644351006, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 1465.8958740234375, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.3147200644016266, | |
| "kl": 0.048160552978515625, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0019, | |
| "reward": 1.2739417422562838, | |
| "reward_std": 0.7031947523355484, | |
| "rewards/cosine_scaled_reward": 0.16822083480656147, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 967.7291946411133, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.4690834879875183, | |
| "kl": 0.01666259765625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0007, | |
| "reward": 0.8219893537461758, | |
| "reward_std": 0.4733723718672991, | |
| "rewards/cosine_scaled_reward": -0.07858868315815926, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 1542.4792098999023, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.809581995010376, | |
| "kl": 0.09832000732421875, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0039, | |
| "reward": 0.7635386185720563, | |
| "reward_std": 0.8262498266994953, | |
| "rewards/cosine_scaled_reward": -0.03489737829659134, | |
| "rewards/format_reward": 0.8333333488553762, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1216.2500228881836, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.6465409398078918, | |
| "kl": 0.032878875732421875, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0013, | |
| "reward": 0.7554627768695354, | |
| "reward_std": 0.66816546022892, | |
| "rewards/cosine_scaled_reward": -0.09101861796807498, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1013.7083587646484, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.6028101444244385, | |
| "kl": 0.0688018798828125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0028, | |
| "reward": 0.8830434214323759, | |
| "reward_std": 0.4408119870349765, | |
| "rewards/cosine_scaled_reward": -0.016811609268188477, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 1001.5208587646484, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.4196496605873108, | |
| "kl": 0.02587890625, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.001, | |
| "reward": 1.3822015225887299, | |
| "reward_std": 0.5926420330069959, | |
| "rewards/cosine_scaled_reward": 0.22235074604395777, | |
| "rewards/format_reward": 0.9375, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1178.6042022705078, | |
| "epoch": 0.416, | |
| "grad_norm": 0.4955011308193207, | |
| "kl": 0.03900909423828125, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0016, | |
| "reward": 0.4972789268940687, | |
| "reward_std": 0.4248249903321266, | |
| "rewards/cosine_scaled_reward": -0.2096938779577613, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1883.5833740234375, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.9444948434829712, | |
| "kl": 0.13416671752929688, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0054, | |
| "reward": 0.682358652818948, | |
| "reward_std": 0.7634237520396709, | |
| "rewards/cosine_scaled_reward": -0.033820681273937225, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1313.6875457763672, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.5134904980659485, | |
| "kl": 0.040302276611328125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0016, | |
| "reward": 0.9970972370356321, | |
| "reward_std": 0.6578602809458971, | |
| "rewards/cosine_scaled_reward": 0.05063193337991834, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1459.270896911621, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.6990901827812195, | |
| "kl": 0.06932830810546875, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0028, | |
| "reward": 0.752178119495511, | |
| "reward_std": 0.6665377467870712, | |
| "rewards/cosine_scaled_reward": -0.061410948634147644, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1939.5625915527344, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.9561004638671875, | |
| "kl": 0.117095947265625, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0047, | |
| "reward": 0.6907948858570307, | |
| "reward_std": 0.7415912002325058, | |
| "rewards/cosine_scaled_reward": -0.05043588951230049, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1556.6042289733887, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 1.0411858558654785, | |
| "kl": 0.10003662109375, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.004, | |
| "reward": 0.6202979227527976, | |
| "reward_std": 0.921792209148407, | |
| "rewards/cosine_scaled_reward": -0.09610107401385903, | |
| "rewards/format_reward": 0.8125000223517418, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 1518.3542022705078, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.8758165240287781, | |
| "kl": 0.12267684936523438, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0049, | |
| "reward": 0.565900880843401, | |
| "reward_std": 0.40963491424918175, | |
| "rewards/cosine_scaled_reward": -0.09204956982284784, | |
| "rewards/format_reward": 0.7500000167638063, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 785.8125152587891, | |
| "epoch": 0.424, | |
| "grad_norm": 0.8005909323692322, | |
| "kl": 0.043704986572265625, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0017, | |
| "reward": 1.155183486174792, | |
| "reward_std": 0.4847665010020137, | |
| "rewards/cosine_scaled_reward": 0.10884173773229122, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 1529.9375610351562, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.9557836055755615, | |
| "kl": 0.06856536865234375, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0027, | |
| "reward": 1.173981536179781, | |
| "reward_std": 0.5648104697465897, | |
| "rewards/cosine_scaled_reward": 0.1599073875695467, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 908.5625228881836, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.6306917667388916, | |
| "kl": 0.0537872314453125, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0022, | |
| "reward": 0.7425720803439617, | |
| "reward_std": 0.5991219412535429, | |
| "rewards/cosine_scaled_reward": -0.10788064636290073, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1134.7708702087402, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.3005812168121338, | |
| "kl": 0.034290313720703125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0014, | |
| "reward": 0.9842969626188278, | |
| "reward_std": 0.48715431056916714, | |
| "rewards/cosine_scaled_reward": 0.0025651296600699425, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1738.6875457763672, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 1.1354238986968994, | |
| "kl": 0.20126724243164062, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0081, | |
| "reward": 1.1611301894299686, | |
| "reward_std": 0.66292554885149, | |
| "rewards/cosine_scaled_reward": 0.1951484135352075, | |
| "rewards/format_reward": 0.770833345130086, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1445.0416946411133, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.8109616041183472, | |
| "kl": 0.1262969970703125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0051, | |
| "reward": 0.7195739368908107, | |
| "reward_std": 0.644221305847168, | |
| "rewards/cosine_scaled_reward": -0.07771303225308657, | |
| "rewards/format_reward": 0.8750000037252903, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 1666.6042175292969, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.8677497506141663, | |
| "kl": 0.14620208740234375, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0059, | |
| "reward": 0.5706351515837014, | |
| "reward_std": 0.5533648394048214, | |
| "rewards/cosine_scaled_reward": -0.14176576025784016, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1371.50004196167, | |
| "epoch": 0.432, | |
| "grad_norm": 0.544824481010437, | |
| "kl": 0.06582260131835938, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0026, | |
| "reward": 1.024425177834928, | |
| "reward_std": 0.6592462100088596, | |
| "rewards/cosine_scaled_reward": 0.04346257133875042, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1651.2708740234375, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.9048022031784058, | |
| "kl": 0.14910507202148438, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.006, | |
| "reward": 0.6398802241310477, | |
| "reward_std": 0.6580366250127554, | |
| "rewards/cosine_scaled_reward": -0.11755990888923407, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 1450.5209121704102, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.9721732139587402, | |
| "kl": 0.1341705322265625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0054, | |
| "reward": 0.9563853230793029, | |
| "reward_std": 0.5542362295091152, | |
| "rewards/cosine_scaled_reward": 0.05110928136855364, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1318.2500305175781, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.7490306496620178, | |
| "kl": 0.09369659423828125, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0037, | |
| "reward": 0.5720363007858396, | |
| "reward_std": 0.4669734900817275, | |
| "rewards/cosine_scaled_reward": -0.1618985361419618, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1104.6458587646484, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.47110554575920105, | |
| "kl": 0.06568145751953125, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0026, | |
| "reward": 0.7494204398244619, | |
| "reward_std": 0.6730066798627377, | |
| "rewards/cosine_scaled_reward": -0.0732064712792635, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1225.208366394043, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.8109216690063477, | |
| "kl": 0.13178634643554688, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0053, | |
| "reward": 1.1812428515404463, | |
| "reward_std": 0.8560024872422218, | |
| "rewards/cosine_scaled_reward": 0.12187140854075551, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1087.770866394043, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.5764302015304565, | |
| "kl": 0.05832672119140625, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0023, | |
| "reward": 1.539357453584671, | |
| "reward_std": 0.942340662702918, | |
| "rewards/cosine_scaled_reward": 0.28009538841433823, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1719.9375457763672, | |
| "epoch": 0.44, | |
| "grad_norm": 1.024683952331543, | |
| "kl": 0.16792678833007812, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0067, | |
| "reward": 0.597355630248785, | |
| "reward_std": 0.8098498787730932, | |
| "rewards/cosine_scaled_reward": -0.12840551760746166, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1323.0625228881836, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.8986222147941589, | |
| "kl": 0.13629150390625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0054, | |
| "reward": 0.970568162156269, | |
| "reward_std": 0.6673476994037628, | |
| "rewards/cosine_scaled_reward": 0.026950686238706112, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 1617.6458740234375, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 1.9691290855407715, | |
| "kl": 0.1884002685546875, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0075, | |
| "reward": 0.8362433174625039, | |
| "reward_std": 0.7687919363379478, | |
| "rewards/cosine_scaled_reward": 0.022288329899311066, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1484.0417098999023, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.9472360014915466, | |
| "kl": 0.1433868408203125, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0057, | |
| "reward": 0.9805102795362473, | |
| "reward_std": 0.7974189594388008, | |
| "rewards/cosine_scaled_reward": 0.03192179277539253, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1409.9583740234375, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.6685540080070496, | |
| "kl": 0.14338302612304688, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0057, | |
| "reward": 0.8139473758637905, | |
| "reward_std": 0.602177394554019, | |
| "rewards/cosine_scaled_reward": -0.07219298463314772, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1431.5416831970215, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.8498513698577881, | |
| "kl": 0.12875747680664062, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0051, | |
| "reward": 0.8707941449247301, | |
| "reward_std": 0.7655657678842545, | |
| "rewards/cosine_scaled_reward": 0.018730382435023785, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 1266.5417098999023, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 1.7778056859970093, | |
| "kl": 0.24418258666992188, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0098, | |
| "reward": 1.0020010322332382, | |
| "reward_std": 0.7098470069468021, | |
| "rewards/cosine_scaled_reward": 0.04266716237179935, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1210.6250381469727, | |
| "epoch": 0.448, | |
| "grad_norm": 0.9239005446434021, | |
| "kl": 0.12252044677734375, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0049, | |
| "reward": 0.6644403500249609, | |
| "reward_std": 0.6213442989974283, | |
| "rewards/cosine_scaled_reward": -0.12611316796392202, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1505.7500457763672, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 1.1656075716018677, | |
| "kl": 0.14298248291015625, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0057, | |
| "reward": 0.6966553591191769, | |
| "reward_std": 0.8095338940620422, | |
| "rewards/cosine_scaled_reward": -0.06833898182958364, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 1672.270881652832, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 1.8255363702774048, | |
| "kl": 0.25844573974609375, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0104, | |
| "reward": 0.39489989215508103, | |
| "reward_std": 0.6209751814603806, | |
| "rewards/cosine_scaled_reward": -0.17755005788058043, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 1502.5625381469727, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 1.6484956741333008, | |
| "kl": 0.34644317626953125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0138, | |
| "reward": 0.8687735805287957, | |
| "reward_std": 0.6579391695559025, | |
| "rewards/cosine_scaled_reward": 0.038553440012037754, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1279.6250457763672, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.7100761532783508, | |
| "kl": 0.11377334594726562, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0046, | |
| "reward": 0.9277213364839554, | |
| "reward_std": 0.6887175664305687, | |
| "rewards/cosine_scaled_reward": 0.005527290515601635, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 1162.8958587646484, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 1.176890254020691, | |
| "kl": 0.1617584228515625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0065, | |
| "reward": 0.6928375033894554, | |
| "reward_std": 0.7507635410875082, | |
| "rewards/cosine_scaled_reward": -0.049414592678658664, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1423.0000534057617, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 1.1243730783462524, | |
| "kl": 0.36114501953125, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0144, | |
| "reward": 0.6900735814124346, | |
| "reward_std": 0.6959933899343014, | |
| "rewards/cosine_scaled_reward": -0.08204657444730401, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 1099.6875305175781, | |
| "epoch": 0.456, | |
| "grad_norm": 0.6219501495361328, | |
| "kl": 0.048126220703125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0019, | |
| "reward": 0.9560099802911282, | |
| "reward_std": 0.6817612200975418, | |
| "rewards/cosine_scaled_reward": -0.0011616908013820648, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1025.3541793823242, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 1.3574082851409912, | |
| "kl": 0.11576461791992188, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0046, | |
| "reward": 1.4993134140968323, | |
| "reward_std": 0.6285623479634523, | |
| "rewards/cosine_scaled_reward": 0.2913233733997913, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 1738.6459045410156, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 2.3035199642181396, | |
| "kl": 0.446929931640625, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0179, | |
| "reward": 0.588189116679132, | |
| "reward_std": 0.7383468300104141, | |
| "rewards/cosine_scaled_reward": -0.10173879377543926, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 1228.0000305175781, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 2.5062694549560547, | |
| "kl": 0.3095245361328125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0124, | |
| "reward": 0.8380769728682935, | |
| "reward_std": 0.5672764517366886, | |
| "rewards/cosine_scaled_reward": -0.03929485194385052, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1121.166690826416, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 1.9036198854446411, | |
| "kl": 0.18898773193359375, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0076, | |
| "reward": 0.9163239896297455, | |
| "reward_std": 0.4906519539654255, | |
| "rewards/cosine_scaled_reward": -0.00017135590314865112, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1184.1666831970215, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 1.1318503618240356, | |
| "kl": 0.2597007751464844, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0104, | |
| "reward": 0.7345974240452051, | |
| "reward_std": 0.3557329196482897, | |
| "rewards/cosine_scaled_reward": -0.09103463962674141, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 1083.6041984558105, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 1.148695707321167, | |
| "kl": 0.1588592529296875, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0064, | |
| "reward": 1.164491715535405, | |
| "reward_std": 0.7675561746582389, | |
| "rewards/cosine_scaled_reward": 0.11349584814161062, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1347.4792022705078, | |
| "epoch": 0.464, | |
| "grad_norm": 1.5972338914871216, | |
| "kl": 0.30425262451171875, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0122, | |
| "reward": 1.0750425793230534, | |
| "reward_std": 0.8302016435191035, | |
| "rewards/cosine_scaled_reward": 0.07918795384466648, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 1360.416732788086, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 2.243136405944824, | |
| "kl": 0.40606689453125, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0162, | |
| "reward": 0.8054503360763192, | |
| "reward_std": 0.43949691019952297, | |
| "rewards/cosine_scaled_reward": -0.03477485757321119, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 1446.395851135254, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 2.1331191062927246, | |
| "kl": 0.3451957702636719, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0138, | |
| "reward": 0.933131798170507, | |
| "reward_std": 0.7964614983648062, | |
| "rewards/cosine_scaled_reward": 0.03948255442082882, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 1716.7083740234375, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 1.9532040357589722, | |
| "kl": 0.445892333984375, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0178, | |
| "reward": 0.6957469061017036, | |
| "reward_std": 0.6721529066562653, | |
| "rewards/cosine_scaled_reward": -0.08962656743824482, | |
| "rewards/format_reward": 0.8750000037252903, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 1500.3750343322754, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 1.127901315689087, | |
| "kl": 0.49494171142578125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0198, | |
| "reward": 0.7732762107625604, | |
| "reward_std": 0.8983776066452265, | |
| "rewards/cosine_scaled_reward": -0.0300285741686821, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 2009.000057220459, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 2.1997931003570557, | |
| "kl": 0.7542343139648438, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0302, | |
| "reward": 0.2233330551534891, | |
| "reward_std": 0.5823363810777664, | |
| "rewards/cosine_scaled_reward": -0.24250016640871763, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 1173.583366394043, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 8.122756004333496, | |
| "kl": 0.3453559875488281, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0138, | |
| "reward": 1.0573125034570694, | |
| "reward_std": 0.6750743202865124, | |
| "rewards/cosine_scaled_reward": 0.0390729159116745, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1252.1667022705078, | |
| "epoch": 0.472, | |
| "grad_norm": 107.0315933227539, | |
| "kl": 3.7126235961914062, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.1485, | |
| "reward": 1.0909956084797159, | |
| "reward_std": 0.6099100448191166, | |
| "rewards/cosine_scaled_reward": 0.07674776995554566, | |
| "rewards/format_reward": 0.9375, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 1720.8333740234375, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 1.0315011739730835, | |
| "kl": 0.302764892578125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0121, | |
| "reward": 0.6044954676181078, | |
| "reward_std": 0.5638450682163239, | |
| "rewards/cosine_scaled_reward": -0.13525228761136532, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 1495.4375457763672, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 1.6685491800308228, | |
| "kl": 0.31848907470703125, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0128, | |
| "reward": 0.8473802506923676, | |
| "reward_std": 0.8426450192928314, | |
| "rewards/cosine_scaled_reward": -0.03464323375374079, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1131.7291870117188, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.9927107095718384, | |
| "kl": 0.10882568359375, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0043, | |
| "reward": 1.3657895848155022, | |
| "reward_std": 0.7458459995687008, | |
| "rewards/cosine_scaled_reward": 0.18289476446807384, | |
| "rewards/format_reward": 1.0, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 1397.6041946411133, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 1.2048169374465942, | |
| "kl": 0.18719482421875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0075, | |
| "reward": 0.7268392583355308, | |
| "reward_std": 0.5356767289340496, | |
| "rewards/cosine_scaled_reward": -0.09491373039782047, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1385.5417022705078, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 2.015775203704834, | |
| "kl": 0.3655548095703125, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0146, | |
| "reward": 1.1464654430747032, | |
| "reward_std": 0.8867814308032393, | |
| "rewards/cosine_scaled_reward": 0.13573270197957754, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 1347.2708892822266, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 1.3547542095184326, | |
| "kl": 0.3565025329589844, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0143, | |
| "reward": 1.226757968775928, | |
| "reward_std": 0.6351639665663242, | |
| "rewards/cosine_scaled_reward": 0.18629562947899103, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 994.354190826416, | |
| "epoch": 0.48, | |
| "grad_norm": 3.8445253372192383, | |
| "kl": 0.1680145263671875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0067, | |
| "reward": 0.7386327926069498, | |
| "reward_std": 0.5644268691539764, | |
| "rewards/cosine_scaled_reward": -0.08901696337852627, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 1412.833381652832, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 1.4594179391860962, | |
| "kl": 0.4204254150390625, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0168, | |
| "reward": 0.3792721191421151, | |
| "reward_std": 0.7107202988117933, | |
| "rewards/cosine_scaled_reward": -0.22703061811625957, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 1531.9583892822266, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 1.9154317378997803, | |
| "kl": 0.529266357421875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0212, | |
| "reward": 0.7085785139352083, | |
| "reward_std": 0.9169136509299278, | |
| "rewards/cosine_scaled_reward": -0.051960770739242435, | |
| "rewards/format_reward": 0.8125000260770321, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 1723.458366394043, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 2.2272391319274902, | |
| "kl": 0.667022705078125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0267, | |
| "reward": 0.39935479685664177, | |
| "reward_std": 0.7014467380940914, | |
| "rewards/cosine_scaled_reward": -0.1440726025030017, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 1572.1250457763672, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 7.855838298797607, | |
| "kl": 0.6064910888671875, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0242, | |
| "reward": 0.4374563042074442, | |
| "reward_std": 0.614509429782629, | |
| "rewards/cosine_scaled_reward": -0.21877187490463257, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1239.1667022705078, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.9158412218093872, | |
| "kl": 0.13330078125, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0053, | |
| "reward": 1.5343235712498426, | |
| "reward_std": 0.6198269496671855, | |
| "rewards/cosine_scaled_reward": 0.298411812633276, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1250.6666946411133, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 1.3155421018600464, | |
| "kl": 0.32469940185546875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.013, | |
| "reward": 0.8722767792642117, | |
| "reward_std": 0.6620613150298595, | |
| "rewards/cosine_scaled_reward": -0.0013616248033940792, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 1530.8333892822266, | |
| "epoch": 0.488, | |
| "grad_norm": 1.546027421951294, | |
| "kl": 0.19110107421875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0077, | |
| "reward": 0.8462803540751338, | |
| "reward_std": 0.8315089084208012, | |
| "rewards/cosine_scaled_reward": -0.02477649785578251, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 1438.7917098999023, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 1.3113563060760498, | |
| "kl": 0.44814300537109375, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.018, | |
| "reward": 0.8119768351316452, | |
| "reward_std": 0.8527404256165028, | |
| "rewards/cosine_scaled_reward": -0.041928261518478394, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 1113.0625381469727, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 8.569297790527344, | |
| "kl": 0.5591049194335938, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0224, | |
| "reward": 0.8809511847794056, | |
| "reward_std": 0.7602541670203209, | |
| "rewards/cosine_scaled_reward": -0.0074410997331142426, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 1167.4792213439941, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.5055890083312988, | |
| "kl": 0.28092193603515625, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0112, | |
| "reward": 1.0165742300450802, | |
| "reward_std": 0.7622268162667751, | |
| "rewards/cosine_scaled_reward": 0.039537094067782164, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 1308.2917022705078, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 1.3274399042129517, | |
| "kl": 0.4292926788330078, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0172, | |
| "reward": 0.5780141645809636, | |
| "reward_std": 0.6294812802225351, | |
| "rewards/cosine_scaled_reward": -0.13807626301422715, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 1741.9375381469727, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 2.412412166595459, | |
| "kl": 0.7193603515625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0288, | |
| "reward": 0.5436302255839109, | |
| "reward_std": 0.5460153482854366, | |
| "rewards/cosine_scaled_reward": -0.092768220230937, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 1384.5625305175781, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.9878861904144287, | |
| "kl": 0.18514251708984375, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0074, | |
| "reward": 0.8780094515532255, | |
| "reward_std": 0.44247524719685316, | |
| "rewards/cosine_scaled_reward": -0.00891195610165596, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 1242.6875305175781, | |
| "epoch": 0.496, | |
| "grad_norm": 1.256845474243164, | |
| "kl": 0.20133209228515625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.008, | |
| "reward": 0.516814824193716, | |
| "reward_std": 0.48154355585575104, | |
| "rewards/cosine_scaled_reward": -0.21034259721636772, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 925.0000228881836, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 2.2059106826782227, | |
| "kl": 0.3611564636230469, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0145, | |
| "reward": 0.4291147319599986, | |
| "reward_std": 0.3988812826573849, | |
| "rewards/cosine_scaled_reward": -0.2541926633566618, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1295.1875381469727, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 1.634010672569275, | |
| "kl": 0.45708465576171875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0183, | |
| "reward": 1.1145056758541614, | |
| "reward_std": 0.6509739924222231, | |
| "rewards/cosine_scaled_reward": 0.1510028038173914, | |
| "rewards/format_reward": 0.8125000037252903, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 1247.437515258789, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.8101176619529724, | |
| "kl": 0.2787322998046875, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0112, | |
| "reward": 0.8161080442368984, | |
| "reward_std": 0.5009766146540642, | |
| "rewards/cosine_scaled_reward": -0.060695987194776535, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 1704.9583740234375, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 2.317962169647217, | |
| "kl": 0.5898284912109375, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0236, | |
| "reward": 0.8052712418138981, | |
| "reward_std": 0.7321378365159035, | |
| "rewards/cosine_scaled_reward": -0.04528105817735195, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 1185.708366394043, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 2.9166922569274902, | |
| "kl": 0.2282562255859375, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0091, | |
| "reward": 0.7440173244103789, | |
| "reward_std": 0.822649909183383, | |
| "rewards/cosine_scaled_reward": -0.05507467477582395, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 1375.2917022705078, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 1.8194392919540405, | |
| "kl": 0.3829345703125, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0153, | |
| "reward": 0.44826740212738514, | |
| "reward_std": 0.573487613350153, | |
| "rewards/cosine_scaled_reward": -0.21336631546728313, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 1298.4166793823242, | |
| "epoch": 0.504, | |
| "grad_norm": 1.6758705377578735, | |
| "kl": 0.220458984375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0088, | |
| "reward": 0.9705498463008553, | |
| "reward_std": 0.4990251734852791, | |
| "rewards/cosine_scaled_reward": 0.02694154903292656, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 1117.8750228881836, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 1.8469866514205933, | |
| "kl": 0.151519775390625, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0061, | |
| "reward": 0.9607795821502805, | |
| "reward_std": 0.8334435187280178, | |
| "rewards/cosine_scaled_reward": 0.022056451067328453, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 1698.020866394043, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 1.6352139711380005, | |
| "kl": 0.6801528930664062, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0272, | |
| "reward": 0.5505319386720657, | |
| "reward_std": 0.514994228258729, | |
| "rewards/cosine_scaled_reward": -0.11015073349699378, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 1436.145881652832, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 1.2693414688110352, | |
| "kl": 0.5347976684570312, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0214, | |
| "reward": 0.5698400810360909, | |
| "reward_std": 0.8037937507033348, | |
| "rewards/cosine_scaled_reward": -0.14216329460032284, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1327.7916946411133, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 1.9523547887802124, | |
| "kl": 0.2998390197753906, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.012, | |
| "reward": 0.6349876510794275, | |
| "reward_std": 0.6037604101002216, | |
| "rewards/cosine_scaled_reward": -0.13042284222319722, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 1364.520881652832, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 1.9374631643295288, | |
| "kl": 0.27101898193359375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0109, | |
| "reward": 0.7476022280752659, | |
| "reward_std": 0.4615443851798773, | |
| "rewards/cosine_scaled_reward": -0.06369888596236706, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1311.3542098999023, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 1.7980448007583618, | |
| "kl": 0.39284515380859375, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0157, | |
| "reward": 0.7899397425353527, | |
| "reward_std": 0.5542605184018612, | |
| "rewards/cosine_scaled_reward": -0.06336347293108702, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1308.7708587646484, | |
| "epoch": 0.512, | |
| "grad_norm": 3.7581255435943604, | |
| "kl": 0.5269927978515625, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0211, | |
| "reward": 0.7952553946524858, | |
| "reward_std": 0.5684080645442009, | |
| "rewards/cosine_scaled_reward": -0.039872318506240845, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1423.8750076293945, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 2.130291700363159, | |
| "kl": 0.5636138916015625, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0226, | |
| "reward": 0.5615764576941729, | |
| "reward_std": 0.6229820605367422, | |
| "rewards/cosine_scaled_reward": -0.13587846513837576, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 1162.8750610351562, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 1.1003129482269287, | |
| "kl": 0.37982177734375, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0152, | |
| "reward": 0.6501909224316478, | |
| "reward_std": 0.46628283709287643, | |
| "rewards/cosine_scaled_reward": -0.12282122112810612, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1113.7500228881836, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 1.0367095470428467, | |
| "kl": 0.35689544677734375, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0142, | |
| "reward": 0.7563045807182789, | |
| "reward_std": 0.48215247690677643, | |
| "rewards/cosine_scaled_reward": -0.10101438034325838, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 1709.3333740234375, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 1.715613842010498, | |
| "kl": 0.7028999328613281, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0281, | |
| "reward": 0.6673658769577742, | |
| "reward_std": 0.7115018367767334, | |
| "rewards/cosine_scaled_reward": -0.06215040449751541, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 1630.354263305664, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 1.8834996223449707, | |
| "kl": 0.749481201171875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.03, | |
| "reward": 0.5653771113138646, | |
| "reward_std": 0.7798537351191044, | |
| "rewards/cosine_scaled_reward": -0.08189477771520615, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 1188.0625457763672, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 1.7752057313919067, | |
| "kl": 0.3389015197753906, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0136, | |
| "reward": 0.5443481418769807, | |
| "reward_std": 0.537891261279583, | |
| "rewards/cosine_scaled_reward": -0.16532594605814666, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 1413.8541870117188, | |
| "epoch": 0.52, | |
| "grad_norm": 2.040367603302002, | |
| "kl": 0.5235443115234375, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.021, | |
| "reward": 0.45831110049039125, | |
| "reward_std": 0.5242529977113008, | |
| "rewards/cosine_scaled_reward": -0.2083444595336914, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 1710.875057220459, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 2.0157127380371094, | |
| "kl": 0.6534805297851562, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0261, | |
| "reward": 0.7503673816099763, | |
| "reward_std": 0.8190008737146854, | |
| "rewards/cosine_scaled_reward": -0.07273299805819988, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 1534.6250228881836, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 2.5126774311065674, | |
| "kl": 0.7040786743164062, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0281, | |
| "reward": 0.7014026306569576, | |
| "reward_std": 0.6737043410539627, | |
| "rewards/cosine_scaled_reward": -0.045132044702768326, | |
| "rewards/format_reward": 0.791666692122817, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 1155.5000381469727, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.7595032453536987, | |
| "kl": 0.328765869140625, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0132, | |
| "reward": 0.5051953579531983, | |
| "reward_std": 0.5484701581299305, | |
| "rewards/cosine_scaled_reward": -0.19531898852437735, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1033.8750457763672, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 1.7773371934890747, | |
| "kl": 0.11635589599609375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0047, | |
| "reward": 0.8539259545505047, | |
| "reward_std": 0.7942902967333794, | |
| "rewards/cosine_scaled_reward": -0.03137038787826896, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 1757.2083549499512, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 2.649414539337158, | |
| "kl": 0.716064453125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0286, | |
| "reward": 0.7232505343854427, | |
| "reward_std": 0.7622859515249729, | |
| "rewards/cosine_scaled_reward": -0.03420809283852577, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 1713.5625305175781, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 2.396477699279785, | |
| "kl": 0.8402938842773438, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0336, | |
| "reward": 0.6444814316928387, | |
| "reward_std": 0.48170122131705284, | |
| "rewards/cosine_scaled_reward": -0.05275928042829037, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 1440.1458740234375, | |
| "epoch": 0.528, | |
| "grad_norm": 1.3917316198349, | |
| "kl": 0.4964752197265625, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0199, | |
| "reward": 0.4077282305806875, | |
| "reward_std": 0.6107278726994991, | |
| "rewards/cosine_scaled_reward": -0.21280256658792496, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 1800.666748046875, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 2.1574342250823975, | |
| "kl": 0.4113349914550781, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0164, | |
| "reward": 0.7166567414533347, | |
| "reward_std": 0.9189217295497656, | |
| "rewards/cosine_scaled_reward": -0.02708831927157007, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 986.208366394043, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 1.0322054624557495, | |
| "kl": 0.27919769287109375, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0112, | |
| "reward": 1.186152020469308, | |
| "reward_std": 0.33294933661818504, | |
| "rewards/cosine_scaled_reward": 0.103492621332407, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 1653.8750762939453, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 1.5671355724334717, | |
| "kl": 0.7370033264160156, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0295, | |
| "reward": 0.7522133439779282, | |
| "reward_std": 0.8285946622490883, | |
| "rewards/cosine_scaled_reward": -0.06139334570616484, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 1548.2708892822266, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 2.2751619815826416, | |
| "kl": 0.5921897888183594, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0237, | |
| "reward": 0.9642188400030136, | |
| "reward_std": 0.776534590870142, | |
| "rewards/cosine_scaled_reward": 0.023776067420840263, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 1739.5625610351562, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 1.564422845840454, | |
| "kl": 0.6026535034179688, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0241, | |
| "reward": 0.34736107289791107, | |
| "reward_std": 0.6108816284686327, | |
| "rewards/cosine_scaled_reward": -0.22215282171964645, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 1602.4166984558105, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 2.7220699787139893, | |
| "kl": 0.8326644897460938, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0333, | |
| "reward": 0.6653020847588778, | |
| "reward_std": 0.9241620562970638, | |
| "rewards/cosine_scaled_reward": -0.03193228365853429, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 1568.229206085205, | |
| "epoch": 0.536, | |
| "grad_norm": 3.782336950302124, | |
| "kl": 0.8657150268554688, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0346, | |
| "reward": 0.6384231373667717, | |
| "reward_std": 0.5810450203716755, | |
| "rewards/cosine_scaled_reward": -0.055788458324968815, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2107.854248046875, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 2.1429548263549805, | |
| "kl": 1.3319091796875, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0533, | |
| "reward": 0.3757573022157885, | |
| "reward_std": 0.6251861937344074, | |
| "rewards/cosine_scaled_reward": -0.1662880228832364, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 1868.4375381469727, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 2.449512243270874, | |
| "kl": 1.098663330078125, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0439, | |
| "reward": 0.7246365919709206, | |
| "reward_std": 0.8856858089566231, | |
| "rewards/cosine_scaled_reward": -0.0022650789469480515, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 1710.812515258789, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 1.708894968032837, | |
| "kl": 0.6365203857421875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0254, | |
| "reward": 0.5696725435554981, | |
| "reward_std": 0.7982751838862896, | |
| "rewards/cosine_scaled_reward": -0.11099707769608358, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 1787.8125381469727, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 2.3143343925476074, | |
| "kl": 0.6291122436523438, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0252, | |
| "reward": 0.5162299545481801, | |
| "reward_std": 0.7424188554286957, | |
| "rewards/cosine_scaled_reward": -0.1377183818258345, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 1814.4167213439941, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 2.590423822402954, | |
| "kl": 0.9863052368164062, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0394, | |
| "reward": 1.190386475995183, | |
| "reward_std": 0.7150390669703484, | |
| "rewards/cosine_scaled_reward": 0.20977654308080673, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 1373.0417098999023, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.9969754815101624, | |
| "kl": 0.34333038330078125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0137, | |
| "reward": 0.6799025642685592, | |
| "reward_std": 0.7420720048248768, | |
| "rewards/cosine_scaled_reward": -0.1183820916339755, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 1700.4792175292969, | |
| "epoch": 0.544, | |
| "grad_norm": 1.2897610664367676, | |
| "kl": 0.5266647338867188, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0211, | |
| "reward": 0.872494999319315, | |
| "reward_std": 0.8203131221234798, | |
| "rewards/cosine_scaled_reward": -0.001252486981684342, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1266.5000534057617, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 1.5318264961242676, | |
| "kl": 0.49987030029296875, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.02, | |
| "reward": 1.0228368118405342, | |
| "reward_std": 1.1473434120416641, | |
| "rewards/cosine_scaled_reward": 0.09475172049133107, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 1734.5417098999023, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 1.782239317893982, | |
| "kl": 0.7471237182617188, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0299, | |
| "reward": 0.6132550844922662, | |
| "reward_std": 0.5443017482757568, | |
| "rewards/cosine_scaled_reward": -0.0892058244207874, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 1831.3959045410156, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 2.2087714672088623, | |
| "kl": 0.98297119140625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0393, | |
| "reward": 0.7758033736608922, | |
| "reward_std": 0.8804528266191483, | |
| "rewards/cosine_scaled_reward": -0.028764987364411354, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 1431.06254196167, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.5771631002426147, | |
| "kl": 0.810302734375, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0325, | |
| "reward": 0.7137194634415209, | |
| "reward_std": 0.9008150100708008, | |
| "rewards/cosine_scaled_reward": -0.03897361445706338, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 1595.729232788086, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 1.9218149185180664, | |
| "kl": 0.5342254638671875, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0214, | |
| "reward": 0.4544407380744815, | |
| "reward_std": 0.6266775503754616, | |
| "rewards/cosine_scaled_reward": -0.19986298400908709, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 1623.625015258789, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 1.3348276615142822, | |
| "kl": 0.8123359680175781, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0325, | |
| "reward": 0.7579959752038121, | |
| "reward_std": 0.8174431677907705, | |
| "rewards/cosine_scaled_reward": -0.016835355083458126, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 1678.958396911621, | |
| "epoch": 0.552, | |
| "grad_norm": 1.4332455396652222, | |
| "kl": 0.5729446411132812, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0229, | |
| "reward": 0.38456033915281296, | |
| "reward_std": 0.7770704664289951, | |
| "rewards/cosine_scaled_reward": -0.19313650764524937, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 1232.3750457763672, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.994523286819458, | |
| "kl": 0.33402252197265625, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0134, | |
| "reward": 0.7572063701227307, | |
| "reward_std": 0.6004737913608551, | |
| "rewards/cosine_scaled_reward": -0.027646828442811966, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1286.6875457763672, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 3.306351900100708, | |
| "kl": 0.5061264038085938, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0203, | |
| "reward": 0.6613044207915664, | |
| "reward_std": 0.6431709341704845, | |
| "rewards/cosine_scaled_reward": -0.08601447567343712, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 727.2708473205566, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 1.239498496055603, | |
| "kl": 0.13980865478515625, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0056, | |
| "reward": 0.857852790504694, | |
| "reward_std": 0.4787697456777096, | |
| "rewards/cosine_scaled_reward": -0.03982360428199172, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1126.6042137145996, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 3.5425333976745605, | |
| "kl": 0.3272552490234375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0131, | |
| "reward": 1.3010212499648333, | |
| "reward_std": 0.5992186656221747, | |
| "rewards/cosine_scaled_reward": 0.19217727705836296, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 1328.5000228881836, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 4.340129852294922, | |
| "kl": 0.7839202880859375, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0314, | |
| "reward": 0.4630023818463087, | |
| "reward_std": 0.5009241513907909, | |
| "rewards/cosine_scaled_reward": -0.1539154672063887, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 1523.2916946411133, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 2.2074553966522217, | |
| "kl": 0.5087432861328125, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0204, | |
| "reward": 0.4663016349077225, | |
| "reward_std": 0.5114475898444653, | |
| "rewards/cosine_scaled_reward": -0.20434920396655798, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1465.1458930969238, | |
| "epoch": 0.56, | |
| "grad_norm": 1.6939737796783447, | |
| "kl": 0.6732406616210938, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0269, | |
| "reward": 0.7020680662244558, | |
| "reward_std": 0.6934347413480282, | |
| "rewards/cosine_scaled_reward": -0.06563264457508922, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 1713.2500381469727, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 1.7025130987167358, | |
| "kl": 0.6433792114257812, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0257, | |
| "reward": 0.8764896970242262, | |
| "reward_std": 0.9352022185921669, | |
| "rewards/cosine_scaled_reward": 0.02157815732061863, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 1347.4791946411133, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 2.898775815963745, | |
| "kl": 0.4826812744140625, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0193, | |
| "reward": 0.711312476079911, | |
| "reward_std": 0.5596998631954193, | |
| "rewards/cosine_scaled_reward": -0.040177132934331894, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 1267.5416946411133, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 1.8216196298599243, | |
| "kl": 0.35247039794921875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0141, | |
| "reward": 0.9863412100821733, | |
| "reward_std": 0.7628876008093357, | |
| "rewards/cosine_scaled_reward": 0.03483725246042013, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1147.5416946411133, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 1.519400715827942, | |
| "kl": 0.529205322265625, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0211, | |
| "reward": 1.0907946676015854, | |
| "reward_std": 0.8422017935663462, | |
| "rewards/cosine_scaled_reward": 0.08706398599315435, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 1579.0417098999023, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 2.1514978408813477, | |
| "kl": 0.5824813842773438, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0233, | |
| "reward": 0.7147043733857572, | |
| "reward_std": 0.9280455261468887, | |
| "rewards/cosine_scaled_reward": -0.028064499609172344, | |
| "rewards/format_reward": 0.770833333954215, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1475.2916717529297, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 2.9298958778381348, | |
| "kl": 0.7912979125976562, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0317, | |
| "reward": 0.7844320052536204, | |
| "reward_std": 0.7428931668400764, | |
| "rewards/cosine_scaled_reward": 0.006799314171075821, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1518.9792022705078, | |
| "epoch": 0.568, | |
| "grad_norm": 1.90547513961792, | |
| "kl": 0.940093994140625, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0376, | |
| "reward": 1.0220248233526945, | |
| "reward_std": 0.6859058924019337, | |
| "rewards/cosine_scaled_reward": 0.1360124358907342, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 1380.9375228881836, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 1.8308056592941284, | |
| "kl": 0.4208221435546875, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0168, | |
| "reward": 0.4634520011022687, | |
| "reward_std": 0.6670360751450062, | |
| "rewards/cosine_scaled_reward": -0.12244068086147308, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 1510.9792022705078, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 2.027233123779297, | |
| "kl": 0.42650604248046875, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.017, | |
| "reward": 0.6949386205524206, | |
| "reward_std": 0.7280914960429072, | |
| "rewards/cosine_scaled_reward": -0.09003069484606385, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 1275.8125534057617, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 1.3618121147155762, | |
| "kl": 0.5210762023925781, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0208, | |
| "reward": 0.670590927824378, | |
| "reward_std": 0.7502868715673685, | |
| "rewards/cosine_scaled_reward": -0.08137122076004744, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.004884798622434991, | |
| "train_runtime": 57171.6753, | |
| "train_samples_per_second": 0.42, | |
| "train_steps_per_second": 0.009 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |