| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.012329886102677127, | |
| "eval_steps": 500, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 3.082471525669282e-05, | |
| "grad_norm": 0.4096410633951961, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.27293290942907333, | |
| "reward_std": 0.0449131247587502, | |
| "rewards/clip_reward": 0.27293290942907333, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 6.164943051338563e-05, | |
| "grad_norm": 2.7610875519018947, | |
| "kl": 0.048065185546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.27704446762800217, | |
| "reward_std": 0.04381310846656561, | |
| "rewards/clip_reward": 0.27704446762800217, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 9.247414577007844e-05, | |
| "grad_norm": 2.6854813853406663, | |
| "kl": 0.08209228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.24470487236976624, | |
| "reward_std": 0.039878456853330135, | |
| "rewards/clip_reward": 0.24470487236976624, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00012329886102677127, | |
| "grad_norm": 1.844139759481107, | |
| "kl": 0.094390869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0038, | |
| "reward": 0.2730746790766716, | |
| "reward_std": 0.03929354250431061, | |
| "rewards/clip_reward": 0.2730746790766716, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00015412357628346408, | |
| "grad_norm": 2.648057613593892, | |
| "kl": 0.149658203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.006, | |
| "reward": 0.22251639142632484, | |
| "reward_std": 0.035196226090192795, | |
| "rewards/clip_reward": 0.22251639142632484, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0001849482915401569, | |
| "grad_norm": 2320.3210476556847, | |
| "kl": 35.0732421875, | |
| "learning_rate": 1e-05, | |
| "loss": 1.408, | |
| "reward": 0.26333311200141907, | |
| "reward_std": 0.043474785052239895, | |
| "rewards/clip_reward": 0.26333311200141907, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00021577300679684973, | |
| "grad_norm": 7.940267210806191, | |
| "kl": 0.358489990234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0144, | |
| "reward": 0.26793035864830017, | |
| "reward_std": 0.03859096858650446, | |
| "rewards/clip_reward": 0.26793035864830017, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00024659772205354254, | |
| "grad_norm": 1.2528619108757855, | |
| "kl": 0.093505859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0037, | |
| "reward": 0.25467605516314507, | |
| "reward_std": 0.0469845924526453, | |
| "rewards/clip_reward": 0.25467605516314507, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0002774224373102353, | |
| "grad_norm": 0.3609296984986853, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2549675516784191, | |
| "reward_std": 0.050234788097441196, | |
| "rewards/clip_reward": 0.2549675516784191, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00030824715256692816, | |
| "grad_norm": 0.340167108350015, | |
| "kl": 0.026275634765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2656533867120743, | |
| "reward_std": 0.041638208553195, | |
| "rewards/clip_reward": 0.2656533867120743, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000339071867823621, | |
| "grad_norm": 1.2337998822225458, | |
| "kl": 0.038787841796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.23507770523428917, | |
| "reward_std": 0.03971440531313419, | |
| "rewards/clip_reward": 0.23507770523428917, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0003698965830803138, | |
| "grad_norm": 0.4192393740192319, | |
| "kl": 0.027679443359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.28236085921525955, | |
| "reward_std": 0.047151658684015274, | |
| "rewards/clip_reward": 0.28236085921525955, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0004007212983370066, | |
| "grad_norm": 0.44693100447868084, | |
| "kl": 0.04705810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.2532140128314495, | |
| "reward_std": 0.045407955069094896, | |
| "rewards/clip_reward": 0.2532140128314495, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00043154601359369945, | |
| "grad_norm": 0.4438582117717643, | |
| "kl": 0.031341552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2620070278644562, | |
| "reward_std": 0.044824035838246346, | |
| "rewards/clip_reward": 0.2620070278644562, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00046237072885039224, | |
| "grad_norm": 0.4582616987800618, | |
| "kl": 0.029937744140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2698071673512459, | |
| "reward_std": 0.044542488642036915, | |
| "rewards/clip_reward": 0.2698071673512459, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0004931954441070851, | |
| "grad_norm": 1.7444707877124468, | |
| "kl": 0.171722412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0069, | |
| "reward": 0.25084393844008446, | |
| "reward_std": 0.03859854955226183, | |
| "rewards/clip_reward": 0.25084393844008446, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005240201593637779, | |
| "grad_norm": 0.9397492604252154, | |
| "kl": 0.03289794921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2502448856830597, | |
| "reward_std": 0.046720145270228386, | |
| "rewards/clip_reward": 0.2502448856830597, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005548448746204706, | |
| "grad_norm": 0.2795480658769616, | |
| "kl": 0.02972412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.25642987713217735, | |
| "reward_std": 0.040699112229049206, | |
| "rewards/clip_reward": 0.25642987713217735, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005856695898771635, | |
| "grad_norm": 0.42906609124114997, | |
| "kl": 0.030792236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.26084326952695847, | |
| "reward_std": 0.046253882348537445, | |
| "rewards/clip_reward": 0.26084326952695847, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0006164943051338563, | |
| "grad_norm": 0.27464154300178234, | |
| "kl": 0.034576416015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2642279714345932, | |
| "reward_std": 0.04200063832104206, | |
| "rewards/clip_reward": 0.2642279714345932, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0006473190203905491, | |
| "grad_norm": 0.28947614961512397, | |
| "kl": 0.030029296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.25506409257650375, | |
| "reward_std": 0.0439683748409152, | |
| "rewards/clip_reward": 0.25506409257650375, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000678143735647242, | |
| "grad_norm": 0.3388326949584448, | |
| "kl": 0.0302734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2455521523952484, | |
| "reward_std": 0.04773388337343931, | |
| "rewards/clip_reward": 0.2455521523952484, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007089684509039348, | |
| "grad_norm": 0.251274728050997, | |
| "kl": 0.031219482421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.26682381331920624, | |
| "reward_std": 0.045767911709845066, | |
| "rewards/clip_reward": 0.26682381331920624, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007397931661606276, | |
| "grad_norm": 0.3330664131200773, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.27093350887298584, | |
| "reward_std": 0.04017470218241215, | |
| "rewards/clip_reward": 0.27093350887298584, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007706178814173204, | |
| "grad_norm": 0.2855775494556483, | |
| "kl": 0.030181884765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2470133751630783, | |
| "reward_std": 0.045277868397533894, | |
| "rewards/clip_reward": 0.2470133751630783, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008014425966740132, | |
| "grad_norm": 0.23359940849678845, | |
| "kl": 0.03143310546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2605663165450096, | |
| "reward_std": 0.036497367545962334, | |
| "rewards/clip_reward": 0.2605663165450096, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000832267311930706, | |
| "grad_norm": 0.2565562945436733, | |
| "kl": 0.031494140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.25371479243040085, | |
| "reward_std": 0.04583714855834842, | |
| "rewards/clip_reward": 0.25371479243040085, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008630920271873989, | |
| "grad_norm": 0.37271054820762656, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2469511665403843, | |
| "reward_std": 0.035832284949719906, | |
| "rewards/clip_reward": 0.2469511665403843, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008939167424440917, | |
| "grad_norm": 0.24495908441078176, | |
| "kl": 0.03009033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2665953040122986, | |
| "reward_std": 0.0440530339255929, | |
| "rewards/clip_reward": 0.2665953040122986, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009247414577007845, | |
| "grad_norm": 0.27952316749339523, | |
| "kl": 0.029510498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2593040131032467, | |
| "reward_std": 0.04028139542788267, | |
| "rewards/clip_reward": 0.2593040131032467, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009555661729574773, | |
| "grad_norm": 0.29685109479945, | |
| "kl": 0.03131103515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2597034126520157, | |
| "reward_std": 0.045073311775922775, | |
| "rewards/clip_reward": 0.2597034126520157, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009863908882141701, | |
| "grad_norm": 0.2788133739842884, | |
| "kl": 0.029815673828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.26862068474292755, | |
| "reward_std": 0.04331609793007374, | |
| "rewards/clip_reward": 0.26862068474292755, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001017215603470863, | |
| "grad_norm": 0.2793158814370596, | |
| "kl": 0.032684326171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2581031806766987, | |
| "reward_std": 0.04781654942780733, | |
| "rewards/clip_reward": 0.2581031806766987, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0010480403187275557, | |
| "grad_norm": 0.2410712843439065, | |
| "kl": 0.031158447265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.25184522196650505, | |
| "reward_std": 0.04620496183633804, | |
| "rewards/clip_reward": 0.25184522196650505, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0010788650339842486, | |
| "grad_norm": 0.6426585983726603, | |
| "kl": 0.034698486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.26873790100216866, | |
| "reward_std": 0.03604905540123582, | |
| "rewards/clip_reward": 0.26873790100216866, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0011096897492409413, | |
| "grad_norm": 0.30049643240824964, | |
| "kl": 0.028167724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.28773902356624603, | |
| "reward_std": 0.04903798084706068, | |
| "rewards/clip_reward": 0.28773902356624603, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0011405144644976342, | |
| "grad_norm": 0.286651911122606, | |
| "kl": 0.03125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2746337428689003, | |
| "reward_std": 0.039260104298591614, | |
| "rewards/clip_reward": 0.2746337428689003, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001171339179754327, | |
| "grad_norm": 0.23526530529450262, | |
| "kl": 0.03240966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2732752785086632, | |
| "reward_std": 0.03741883207112551, | |
| "rewards/clip_reward": 0.2732752785086632, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012021638950110197, | |
| "grad_norm": 0.23423409222245298, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.24805432185530663, | |
| "reward_std": 0.03564950963482261, | |
| "rewards/clip_reward": 0.24805432185530663, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012329886102677126, | |
| "grad_norm": 0.24360760682907664, | |
| "kl": 0.03375244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2747276946902275, | |
| "reward_std": 0.04160565137863159, | |
| "rewards/clip_reward": 0.2747276946902275, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012638133255244055, | |
| "grad_norm": 0.24537882409018658, | |
| "kl": 0.0321044921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2780498266220093, | |
| "reward_std": 0.036946577951312065, | |
| "rewards/clip_reward": 0.2780498266220093, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012946380407810982, | |
| "grad_norm": 0.40977788179371805, | |
| "kl": 0.03350830078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2628883123397827, | |
| "reward_std": 0.04087233170866966, | |
| "rewards/clip_reward": 0.2628883123397827, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001325462756037791, | |
| "grad_norm": 0.3840340108983825, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.27363643795251846, | |
| "reward_std": 0.03607825469225645, | |
| "rewards/clip_reward": 0.27363643795251846, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001356287471294484, | |
| "grad_norm": 0.2581672136879213, | |
| "kl": 0.03271484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2834562137722969, | |
| "reward_std": 0.04080742411315441, | |
| "rewards/clip_reward": 0.2834562137722969, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0013871121865511767, | |
| "grad_norm": 0.22063460755244194, | |
| "kl": 0.03253173828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2772428020834923, | |
| "reward_std": 0.04003895306959748, | |
| "rewards/clip_reward": 0.2772428020834923, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014179369018078695, | |
| "grad_norm": 0.2565525371064913, | |
| "kl": 0.03271484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2706209644675255, | |
| "reward_std": 0.04194391146302223, | |
| "rewards/clip_reward": 0.2706209644675255, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014487616170645624, | |
| "grad_norm": 0.2490150749271912, | |
| "kl": 0.03497314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2823899984359741, | |
| "reward_std": 0.04330089082941413, | |
| "rewards/clip_reward": 0.2823899984359741, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014795863323212551, | |
| "grad_norm": 0.31942389183248127, | |
| "kl": 0.033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.27612780779600143, | |
| "reward_std": 0.03909243643283844, | |
| "rewards/clip_reward": 0.27612780779600143, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001510411047577948, | |
| "grad_norm": 0.25176125337143185, | |
| "kl": 0.0323486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.27114395797252655, | |
| "reward_std": 0.043553344905376434, | |
| "rewards/clip_reward": 0.27114395797252655, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001541235762834641, | |
| "grad_norm": 0.25140342833503415, | |
| "kl": 0.03302001953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2741701751947403, | |
| "reward_std": 0.04216139670461416, | |
| "rewards/clip_reward": 0.2741701751947403, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0015720604780913336, | |
| "grad_norm": 0.2712169111225742, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2718612402677536, | |
| "reward_std": 0.0403730683028698, | |
| "rewards/clip_reward": 0.2718612402677536, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0016028851933480265, | |
| "grad_norm": 0.24185377611726078, | |
| "kl": 0.03265380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2797749862074852, | |
| "reward_std": 0.035524442326277494, | |
| "rewards/clip_reward": 0.2797749862074852, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0016337099086047194, | |
| "grad_norm": 0.2925738372367075, | |
| "kl": 0.031890869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.27509263157844543, | |
| "reward_std": 0.03820755332708359, | |
| "rewards/clip_reward": 0.27509263157844543, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001664534623861412, | |
| "grad_norm": 0.2243087092506723, | |
| "kl": 0.037109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2797403186559677, | |
| "reward_std": 0.03433629125356674, | |
| "rewards/clip_reward": 0.2797403186559677, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001695359339118105, | |
| "grad_norm": 0.5392048466375525, | |
| "kl": 0.034881591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2881343886256218, | |
| "reward_std": 0.034057734068483114, | |
| "rewards/clip_reward": 0.2881343886256218, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017261840543747978, | |
| "grad_norm": 0.3005202554746241, | |
| "kl": 0.03155517578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.28284603357315063, | |
| "reward_std": 0.035686352755874395, | |
| "rewards/clip_reward": 0.28284603357315063, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017570087696314905, | |
| "grad_norm": 0.4427830267310237, | |
| "kl": 0.03265380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2999447137117386, | |
| "reward_std": 0.037478470243513584, | |
| "rewards/clip_reward": 0.2999447137117386, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017878334848881834, | |
| "grad_norm": 0.3046115937307048, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2802000045776367, | |
| "reward_std": 0.042125691659748554, | |
| "rewards/clip_reward": 0.2802000045776367, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001818658200144876, | |
| "grad_norm": 0.2405611837420283, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.29059184342622757, | |
| "reward_std": 0.037164853885769844, | |
| "rewards/clip_reward": 0.29059184342622757, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001849482915401569, | |
| "grad_norm": 0.24105929492457603, | |
| "kl": 0.03192138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2885151877999306, | |
| "reward_std": 0.03311763470992446, | |
| "rewards/clip_reward": 0.2885151877999306, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0018803076306582618, | |
| "grad_norm": 0.3666040943996562, | |
| "kl": 0.035308837890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2791602984070778, | |
| "reward_std": 0.04032097943127155, | |
| "rewards/clip_reward": 0.2791602984070778, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019111323459149545, | |
| "grad_norm": 0.30249506470602205, | |
| "kl": 0.0343017578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2850872874259949, | |
| "reward_std": 0.04306080937385559, | |
| "rewards/clip_reward": 0.2850872874259949, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019419570611716474, | |
| "grad_norm": 0.2627870943706208, | |
| "kl": 0.0323486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.28769853711128235, | |
| "reward_std": 0.03929570922628045, | |
| "rewards/clip_reward": 0.28769853711128235, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019727817764283403, | |
| "grad_norm": 0.302176588700791, | |
| "kl": 0.03369140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.27818652242422104, | |
| "reward_std": 0.03506180923432112, | |
| "rewards/clip_reward": 0.27818652242422104, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002003606491685033, | |
| "grad_norm": 0.24687044072190295, | |
| "kl": 0.0352783203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30166226625442505, | |
| "reward_std": 0.03524617711082101, | |
| "rewards/clip_reward": 0.30166226625442505, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002034431206941726, | |
| "grad_norm": 0.30419214057266597, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2780046612024307, | |
| "reward_std": 0.039600692223757505, | |
| "rewards/clip_reward": 0.2780046612024307, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0020652559221984185, | |
| "grad_norm": 0.27119766585139016, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.294754721224308, | |
| "reward_std": 0.034248299431055784, | |
| "rewards/clip_reward": 0.294754721224308, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0020960806374551114, | |
| "grad_norm": 0.24490582535286695, | |
| "kl": 0.0340576171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.29758618772029877, | |
| "reward_std": 0.03921351861208677, | |
| "rewards/clip_reward": 0.29758618772029877, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0021269053527118043, | |
| "grad_norm": 0.2906006315520449, | |
| "kl": 0.0374755859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28643497824668884, | |
| "reward_std": 0.03893239703029394, | |
| "rewards/clip_reward": 0.28643497824668884, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002157730067968497, | |
| "grad_norm": 0.2861214644137581, | |
| "kl": 0.0352783203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2825808748602867, | |
| "reward_std": 0.03427910804748535, | |
| "rewards/clip_reward": 0.2825808748602867, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00218855478322519, | |
| "grad_norm": 0.37011498013054983, | |
| "kl": 0.03814697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2854940667748451, | |
| "reward_std": 0.034281593747437, | |
| "rewards/clip_reward": 0.2854940667748451, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022193794984818826, | |
| "grad_norm": 0.24442639828837878, | |
| "kl": 0.0372314453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28134068101644516, | |
| "reward_std": 0.03528659883886576, | |
| "rewards/clip_reward": 0.28134068101644516, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022502042137385755, | |
| "grad_norm": 0.4685706452282796, | |
| "kl": 0.04559326171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2905724048614502, | |
| "reward_std": 0.040434951428323984, | |
| "rewards/clip_reward": 0.2905724048614502, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022810289289952683, | |
| "grad_norm": 0.2978744286423797, | |
| "kl": 0.03948974609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2837352082133293, | |
| "reward_std": 0.037585786543786526, | |
| "rewards/clip_reward": 0.2837352082133293, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0023118536442519612, | |
| "grad_norm": 0.2642233216051261, | |
| "kl": 0.03631591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2846062183380127, | |
| "reward_std": 0.032358222641050816, | |
| "rewards/clip_reward": 0.2846062183380127, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002342678359508654, | |
| "grad_norm": 0.38580930898434906, | |
| "kl": 0.03814697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2916061654686928, | |
| "reward_std": 0.03999630082398653, | |
| "rewards/clip_reward": 0.2916061654686928, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002373503074765347, | |
| "grad_norm": 0.2926547008151926, | |
| "kl": 0.03857421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.295952245593071, | |
| "reward_std": 0.03780945483595133, | |
| "rewards/clip_reward": 0.295952245593071, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024043277900220395, | |
| "grad_norm": 0.25208665362075805, | |
| "kl": 0.0377197265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2766958400607109, | |
| "reward_std": 0.034185357857495546, | |
| "rewards/clip_reward": 0.2766958400607109, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024351525052787324, | |
| "grad_norm": 0.24422171158940864, | |
| "kl": 0.0357666015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2962762862443924, | |
| "reward_std": 0.037650241516530514, | |
| "rewards/clip_reward": 0.2962762862443924, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024659772205354253, | |
| "grad_norm": 0.22603888808512623, | |
| "kl": 0.03509521484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30113714188337326, | |
| "reward_std": 0.03855917416512966, | |
| "rewards/clip_reward": 0.30113714188337326, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002496801935792118, | |
| "grad_norm": 0.37733238977424605, | |
| "kl": 0.0335693359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.28931064158678055, | |
| "reward_std": 0.041936729568988085, | |
| "rewards/clip_reward": 0.28931064158678055, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002527626651048811, | |
| "grad_norm": 2397.9127472616574, | |
| "kl": 2.27880859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0912, | |
| "reward": 0.2881544306874275, | |
| "reward_std": 0.03951821103692055, | |
| "rewards/clip_reward": 0.2881544306874275, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002558451366305504, | |
| "grad_norm": 0.2551335788444245, | |
| "kl": 0.035888671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.29982026666402817, | |
| "reward_std": 0.041388670448213816, | |
| "rewards/clip_reward": 0.29982026666402817, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0025892760815621964, | |
| "grad_norm": 0.2368918105634009, | |
| "kl": 0.0340576171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.29101932793855667, | |
| "reward_std": 0.02901851013302803, | |
| "rewards/clip_reward": 0.29101932793855667, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0026201007968188893, | |
| "grad_norm": 0.37688170465955695, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28874391317367554, | |
| "reward_std": 0.03887001145631075, | |
| "rewards/clip_reward": 0.28874391317367554, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002650925512075582, | |
| "grad_norm": 0.2519378074219347, | |
| "kl": 0.033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.30362261086702347, | |
| "reward_std": 0.03999508544802666, | |
| "rewards/clip_reward": 0.30362261086702347, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002681750227332275, | |
| "grad_norm": 0.35429225967137656, | |
| "kl": 0.03558349609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.28408853709697723, | |
| "reward_std": 0.038170519284904, | |
| "rewards/clip_reward": 0.28408853709697723, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002712574942588968, | |
| "grad_norm": 0.24826760619770735, | |
| "kl": 0.03509521484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2828948646783829, | |
| "reward_std": 0.030402940697968006, | |
| "rewards/clip_reward": 0.2828948646783829, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002743399657845661, | |
| "grad_norm": 0.24050960559813894, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28854893147945404, | |
| "reward_std": 0.031570473685860634, | |
| "rewards/clip_reward": 0.28854893147945404, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0027742243731023533, | |
| "grad_norm": 0.3157405419841437, | |
| "kl": 0.03387451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.3040951117873192, | |
| "reward_std": 0.03156033856794238, | |
| "rewards/clip_reward": 0.3040951117873192, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002805049088359046, | |
| "grad_norm": 0.255465711049786, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28177835047245026, | |
| "reward_std": 0.03183179069310427, | |
| "rewards/clip_reward": 0.28177835047245026, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002835873803615739, | |
| "grad_norm": 0.2754084224378065, | |
| "kl": 0.0357666015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.28247682750225067, | |
| "reward_std": 0.04544441122561693, | |
| "rewards/clip_reward": 0.28247682750225067, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002866698518872432, | |
| "grad_norm": 0.30917890933321285, | |
| "kl": 0.03411865234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30449671298265457, | |
| "reward_std": 0.038043808192014694, | |
| "rewards/clip_reward": 0.30449671298265457, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002897523234129125, | |
| "grad_norm": 0.2739772980532729, | |
| "kl": 0.0352783203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2972172349691391, | |
| "reward_std": 0.04181887488812208, | |
| "rewards/clip_reward": 0.2972172349691391, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0029283479493858173, | |
| "grad_norm": 0.4882887735659522, | |
| "kl": 0.0369873046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.30260052531957626, | |
| "reward_std": 0.03962193429470062, | |
| "rewards/clip_reward": 0.30260052531957626, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0029591726646425102, | |
| "grad_norm": 0.2758869488382661, | |
| "kl": 0.03765869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.273670993745327, | |
| "reward_std": 0.034096458461135626, | |
| "rewards/clip_reward": 0.273670993745327, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002989997379899203, | |
| "grad_norm": 0.3218139742013513, | |
| "kl": 0.0357666015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2953087314963341, | |
| "reward_std": 0.03857262898236513, | |
| "rewards/clip_reward": 0.2953087314963341, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003020822095155896, | |
| "grad_norm": 0.27436643426318075, | |
| "kl": 0.03802490234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2942623794078827, | |
| "reward_std": 0.04214351158589125, | |
| "rewards/clip_reward": 0.2942623794078827, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003051646810412589, | |
| "grad_norm": 0.46160645575216935, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2718002200126648, | |
| "reward_std": 0.03470963425934315, | |
| "rewards/clip_reward": 0.2718002200126648, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003082471525669282, | |
| "grad_norm": 0.2710130788062584, | |
| "kl": 0.0394287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.30108004808425903, | |
| "reward_std": 0.03474955866113305, | |
| "rewards/clip_reward": 0.30108004808425903, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0031132962409259743, | |
| "grad_norm": 0.49951830790663787, | |
| "kl": 0.0382080078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2938985303044319, | |
| "reward_std": 0.03756955498829484, | |
| "rewards/clip_reward": 0.2938985303044319, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003144120956182667, | |
| "grad_norm": 0.309853879858268, | |
| "kl": 0.03790283203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2770190164446831, | |
| "reward_std": 0.04118606820702553, | |
| "rewards/clip_reward": 0.2770190164446831, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00317494567143936, | |
| "grad_norm": 0.303540839737249, | |
| "kl": 0.04150390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3010944053530693, | |
| "reward_std": 0.03948027174919844, | |
| "rewards/clip_reward": 0.3010944053530693, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003205770386696053, | |
| "grad_norm": 0.4242493698059439, | |
| "kl": 0.04302978515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30070605874061584, | |
| "reward_std": 0.03413457376882434, | |
| "rewards/clip_reward": 0.30070605874061584, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003236595101952746, | |
| "grad_norm": 0.2893333008136438, | |
| "kl": 0.04876708984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.30694443732500076, | |
| "reward_std": 0.03970211138948798, | |
| "rewards/clip_reward": 0.30694443732500076, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0032674198172094387, | |
| "grad_norm": 0.9533572888059393, | |
| "kl": 0.05682373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3035873845219612, | |
| "reward_std": 0.03094083722680807, | |
| "rewards/clip_reward": 0.3035873845219612, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003298244532466131, | |
| "grad_norm": 0.6987792496689551, | |
| "kl": 0.0474853515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.26798417791724205, | |
| "reward_std": 0.0339839537627995, | |
| "rewards/clip_reward": 0.26798417791724205, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003329069247722824, | |
| "grad_norm": 0.2695872192954547, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2790006026625633, | |
| "reward_std": 0.040701782796531916, | |
| "rewards/clip_reward": 0.2790006026625633, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003359893962979517, | |
| "grad_norm": 2.392119350950025, | |
| "kl": 0.04656982421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.29154668748378754, | |
| "reward_std": 0.03984204959124327, | |
| "rewards/clip_reward": 0.29154668748378754, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00339071867823621, | |
| "grad_norm": 0.30051030904173764, | |
| "kl": 0.0389404296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.28251485526561737, | |
| "reward_std": 0.03768534865230322, | |
| "rewards/clip_reward": 0.28251485526561737, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0034215433934929027, | |
| "grad_norm": 0.33966462863188296, | |
| "kl": 0.04290771484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.28882861137390137, | |
| "reward_std": 0.030829247552901506, | |
| "rewards/clip_reward": 0.28882861137390137, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0034523681087495956, | |
| "grad_norm": 0.27353141186845165, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2902967408299446, | |
| "reward_std": 0.03842968260869384, | |
| "rewards/clip_reward": 0.2902967408299446, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003483192824006288, | |
| "grad_norm": 0.31909208905959036, | |
| "kl": 0.039794921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.30095263570547104, | |
| "reward_std": 0.03839526232331991, | |
| "rewards/clip_reward": 0.30095263570547104, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003514017539262981, | |
| "grad_norm": 0.2628770760125084, | |
| "kl": 0.04241943359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30361922085285187, | |
| "reward_std": 0.03204685868695378, | |
| "rewards/clip_reward": 0.30361922085285187, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003544842254519674, | |
| "grad_norm": 0.5077902647628189, | |
| "kl": 0.04266357421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.28539516031742096, | |
| "reward_std": 0.03661313094198704, | |
| "rewards/clip_reward": 0.28539516031742096, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0035756669697763668, | |
| "grad_norm": 0.2750571920299247, | |
| "kl": 0.04144287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.28498194366693497, | |
| "reward_std": 0.033483162987977266, | |
| "rewards/clip_reward": 0.28498194366693497, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0036064916850330597, | |
| "grad_norm": 0.4090948836642478, | |
| "kl": 0.05047607421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.2884545177221298, | |
| "reward_std": 0.03879341948777437, | |
| "rewards/clip_reward": 0.2884545177221298, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003637316400289752, | |
| "grad_norm": 0.2747726497219782, | |
| "kl": 0.0418701171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2886419966816902, | |
| "reward_std": 0.03201331151649356, | |
| "rewards/clip_reward": 0.2886419966816902, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003668141115546445, | |
| "grad_norm": 0.24335488340779768, | |
| "kl": 0.041015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.29286789894104004, | |
| "reward_std": 0.03851965814828873, | |
| "rewards/clip_reward": 0.29286789894104004, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003698965830803138, | |
| "grad_norm": 0.3272604578026692, | |
| "kl": 0.0421142578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3037740960717201, | |
| "reward_std": 0.04023708775639534, | |
| "rewards/clip_reward": 0.3037740960717201, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003729790546059831, | |
| "grad_norm": 0.2573037344425057, | |
| "kl": 0.042724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3032229617238045, | |
| "reward_std": 0.03995621297508478, | |
| "rewards/clip_reward": 0.3032229617238045, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0037606152613165237, | |
| "grad_norm": 0.23884237904332833, | |
| "kl": 0.040771484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2992851510643959, | |
| "reward_std": 0.03295802231878042, | |
| "rewards/clip_reward": 0.2992851510643959, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0037914399765732166, | |
| "grad_norm": 0.2563902015046439, | |
| "kl": 0.03961181640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.29188019037246704, | |
| "reward_std": 0.03674457548186183, | |
| "rewards/clip_reward": 0.29188019037246704, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003822264691829909, | |
| "grad_norm": 0.2983101831944227, | |
| "kl": 0.0421142578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2967969849705696, | |
| "reward_std": 0.035267810337245464, | |
| "rewards/clip_reward": 0.2967969849705696, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003853089407086602, | |
| "grad_norm": 0.24729135355433982, | |
| "kl": 0.04266357421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2964273914694786, | |
| "reward_std": 0.03762258403003216, | |
| "rewards/clip_reward": 0.2964273914694786, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003883914122343295, | |
| "grad_norm": 0.36172960084474337, | |
| "kl": 0.0379638671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2893693521618843, | |
| "reward_std": 0.030456844717264175, | |
| "rewards/clip_reward": 0.2893693521618843, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003914738837599988, | |
| "grad_norm": 1.3235252455659776, | |
| "kl": 0.06842041015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.28936315327882767, | |
| "reward_std": 0.032889596186578274, | |
| "rewards/clip_reward": 0.28936315327882767, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003945563552856681, | |
| "grad_norm": 0.33282790631669, | |
| "kl": 0.04095458984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.3032195121049881, | |
| "reward_std": 0.03782738745212555, | |
| "rewards/clip_reward": 0.3032195121049881, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0039763882681133735, | |
| "grad_norm": 0.26941621679510896, | |
| "kl": 0.0401611328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.29521265625953674, | |
| "reward_std": 0.028109443373978138, | |
| "rewards/clip_reward": 0.29521265625953674, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004007212983370066, | |
| "grad_norm": 0.6011583420828783, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.29040753841400146, | |
| "reward_std": 0.03746784944087267, | |
| "rewards/clip_reward": 0.29040753841400146, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004038037698626759, | |
| "grad_norm": 0.28850122675094336, | |
| "kl": 0.03900146484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.298222653567791, | |
| "reward_std": 0.032059306744486094, | |
| "rewards/clip_reward": 0.298222653567791, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004068862413883452, | |
| "grad_norm": 0.297623960565589, | |
| "kl": 0.04571533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.28365931659936905, | |
| "reward_std": 0.03448425652459264, | |
| "rewards/clip_reward": 0.28365931659936905, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004099687129140144, | |
| "grad_norm": 0.33237851102895244, | |
| "kl": 0.04266357421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30908743292093277, | |
| "reward_std": 0.03715136833488941, | |
| "rewards/clip_reward": 0.30908743292093277, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004130511844396837, | |
| "grad_norm": 0.28432833313640343, | |
| "kl": 0.04046630859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.28690189868211746, | |
| "reward_std": 0.036837459076195955, | |
| "rewards/clip_reward": 0.28690189868211746, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00416133655965353, | |
| "grad_norm": 0.289418120507171, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.27922137826681137, | |
| "reward_std": 0.04220917448401451, | |
| "rewards/clip_reward": 0.27922137826681137, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004192161274910223, | |
| "grad_norm": 0.443250812892132, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.29268399626016617, | |
| "reward_std": 0.042575713247060776, | |
| "rewards/clip_reward": 0.29268399626016617, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004222985990166916, | |
| "grad_norm": 0.2936447251353246, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3000137060880661, | |
| "reward_std": 0.03598225861787796, | |
| "rewards/clip_reward": 0.3000137060880661, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004253810705423609, | |
| "grad_norm": 0.2638651763481444, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.29852742701768875, | |
| "reward_std": 0.03497701371088624, | |
| "rewards/clip_reward": 0.29852742701768875, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0042846354206803015, | |
| "grad_norm": 0.6490276376018281, | |
| "kl": 0.04449462890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2969634085893631, | |
| "reward_std": 0.03716578893363476, | |
| "rewards/clip_reward": 0.2969634085893631, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004315460135936994, | |
| "grad_norm": 0.4290585504001145, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.30021680146455765, | |
| "reward_std": 0.04055328294634819, | |
| "rewards/clip_reward": 0.30021680146455765, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004346284851193687, | |
| "grad_norm": 0.2880701541000658, | |
| "kl": 0.046142578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2995045334100723, | |
| "reward_std": 0.03819493483752012, | |
| "rewards/clip_reward": 0.2995045334100723, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00437710956645038, | |
| "grad_norm": 0.8593874213184524, | |
| "kl": 0.0513916015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.30462899804115295, | |
| "reward_std": 0.03424446424469352, | |
| "rewards/clip_reward": 0.30462899804115295, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004407934281707073, | |
| "grad_norm": 0.21911976066768077, | |
| "kl": 0.04931640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3081146106123924, | |
| "reward_std": 0.03195359604433179, | |
| "rewards/clip_reward": 0.3081146106123924, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004438758996963765, | |
| "grad_norm": 0.34265255923839133, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2893141508102417, | |
| "reward_std": 0.03520893771201372, | |
| "rewards/clip_reward": 0.2893141508102417, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004469583712220458, | |
| "grad_norm": 0.3221549984597468, | |
| "kl": 0.05474853515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3002786561846733, | |
| "reward_std": 0.033627052791416645, | |
| "rewards/clip_reward": 0.3002786561846733, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004500408427477151, | |
| "grad_norm": 0.35731934724740755, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.2964288890361786, | |
| "reward_std": 0.03710630023851991, | |
| "rewards/clip_reward": 0.2964288890361786, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004531233142733844, | |
| "grad_norm": 0.498796371247507, | |
| "kl": 0.063232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.2942727655172348, | |
| "reward_std": 0.034661782905459404, | |
| "rewards/clip_reward": 0.2942727655172348, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004562057857990537, | |
| "grad_norm": 0.35040193973506645, | |
| "kl": 0.04986572265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3076755926012993, | |
| "reward_std": 0.038345606066286564, | |
| "rewards/clip_reward": 0.3076755926012993, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00459288257324723, | |
| "grad_norm": 0.284116312840595, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.2897453159093857, | |
| "reward_std": 0.03225108701735735, | |
| "rewards/clip_reward": 0.2897453159093857, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0046237072885039225, | |
| "grad_norm": 2.510868241115512, | |
| "kl": 0.0657958984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.28433967381715775, | |
| "reward_std": 0.03440393693745136, | |
| "rewards/clip_reward": 0.28433967381715775, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004654532003760615, | |
| "grad_norm": 0.2919131449174652, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2939991131424904, | |
| "reward_std": 0.040119947865605354, | |
| "rewards/clip_reward": 0.2939991131424904, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004685356719017308, | |
| "grad_norm": 0.2728005479353481, | |
| "kl": 0.0621337890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31283795088529587, | |
| "reward_std": 0.04251454817131162, | |
| "rewards/clip_reward": 0.31283795088529587, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004716181434274001, | |
| "grad_norm": 0.3046425258813171, | |
| "kl": 0.056884765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3088100552558899, | |
| "reward_std": 0.03973885904997587, | |
| "rewards/clip_reward": 0.3088100552558899, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004747006149530694, | |
| "grad_norm": 0.36647907531053076, | |
| "kl": 0.06378173828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.2990795224905014, | |
| "reward_std": 0.03181237168610096, | |
| "rewards/clip_reward": 0.2990795224905014, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004777830864787387, | |
| "grad_norm": 0.2874990294452977, | |
| "kl": 0.05413818359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.29652759432792664, | |
| "reward_std": 0.034958623349666595, | |
| "rewards/clip_reward": 0.29652759432792664, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004808655580044079, | |
| "grad_norm": 0.2571961957720778, | |
| "kl": 0.04888916015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.30016767233610153, | |
| "reward_std": 0.03672470944002271, | |
| "rewards/clip_reward": 0.30016767233610153, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004839480295300772, | |
| "grad_norm": 0.2488580090168899, | |
| "kl": 0.05389404296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3036653697490692, | |
| "reward_std": 0.03518738830462098, | |
| "rewards/clip_reward": 0.3036653697490692, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004870305010557465, | |
| "grad_norm": 0.3740970067478605, | |
| "kl": 0.0546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3022882342338562, | |
| "reward_std": 0.040427629835903645, | |
| "rewards/clip_reward": 0.3022882342338562, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004901129725814158, | |
| "grad_norm": 0.27811303327109416, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.29319214075803757, | |
| "reward_std": 0.03445435827597976, | |
| "rewards/clip_reward": 0.29319214075803757, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0049319544410708505, | |
| "grad_norm": 0.3118292624775214, | |
| "kl": 0.0531005859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.29692448675632477, | |
| "reward_std": 0.02798191551119089, | |
| "rewards/clip_reward": 0.29692448675632477, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004962779156327543, | |
| "grad_norm": 0.24174045123252508, | |
| "kl": 0.05987548828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.2794330157339573, | |
| "reward_std": 0.033141561318188906, | |
| "rewards/clip_reward": 0.2794330157339573, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004993603871584236, | |
| "grad_norm": 0.3249638089165546, | |
| "kl": 0.0574951171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3030798211693764, | |
| "reward_std": 0.03338931826874614, | |
| "rewards/clip_reward": 0.3030798211693764, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005024428586840929, | |
| "grad_norm": 0.2633110889874991, | |
| "kl": 0.05413818359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.29485588520765305, | |
| "reward_std": 0.03062314447015524, | |
| "rewards/clip_reward": 0.29485588520765305, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005055253302097622, | |
| "grad_norm": 0.2803956547691715, | |
| "kl": 0.05316162109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2970665916800499, | |
| "reward_std": 0.036208128556609154, | |
| "rewards/clip_reward": 0.2970665916800499, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005086078017354315, | |
| "grad_norm": 0.30407819707590145, | |
| "kl": 0.0552978515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.28191495686769485, | |
| "reward_std": 0.035574122332036495, | |
| "rewards/clip_reward": 0.28191495686769485, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005116902732611008, | |
| "grad_norm": 0.7069970857165138, | |
| "kl": 0.05682373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3111490234732628, | |
| "reward_std": 0.03303992748260498, | |
| "rewards/clip_reward": 0.3111490234732628, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0051477274478677, | |
| "grad_norm": 0.5033892459063507, | |
| "kl": 0.09039306640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0036, | |
| "reward": 0.2889687344431877, | |
| "reward_std": 0.030694663524627686, | |
| "rewards/clip_reward": 0.2889687344431877, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005178552163124393, | |
| "grad_norm": 0.41528612307548435, | |
| "kl": 0.056884765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3075498342514038, | |
| "reward_std": 0.03620789339765906, | |
| "rewards/clip_reward": 0.3075498342514038, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005209376878381086, | |
| "grad_norm": 0.32166704594267, | |
| "kl": 0.0555419921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.29549096524715424, | |
| "reward_std": 0.037705546244978905, | |
| "rewards/clip_reward": 0.29549096524715424, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005240201593637779, | |
| "grad_norm": 0.21631365869879202, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2998160049319267, | |
| "reward_std": 0.03276947420090437, | |
| "rewards/clip_reward": 0.2998160049319267, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0052710263088944715, | |
| "grad_norm": 0.2516096372371072, | |
| "kl": 0.05267333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.31134357303380966, | |
| "reward_std": 0.03470080904662609, | |
| "rewards/clip_reward": 0.31134357303380966, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005301851024151164, | |
| "grad_norm": 0.29419080361317934, | |
| "kl": 0.0538330078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3061481937766075, | |
| "reward_std": 0.03180012432858348, | |
| "rewards/clip_reward": 0.3061481937766075, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005332675739407857, | |
| "grad_norm": 0.2498996223533549, | |
| "kl": 0.0516357421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2943173050880432, | |
| "reward_std": 0.037327985279262066, | |
| "rewards/clip_reward": 0.2943173050880432, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00536350045466455, | |
| "grad_norm": 0.2978051890631816, | |
| "kl": 0.05413818359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3035401850938797, | |
| "reward_std": 0.0322981933131814, | |
| "rewards/clip_reward": 0.3035401850938797, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005394325169921243, | |
| "grad_norm": 0.24015029890010034, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.30325891077518463, | |
| "reward_std": 0.04060996416956186, | |
| "rewards/clip_reward": 0.30325891077518463, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005425149885177936, | |
| "grad_norm": 0.246629237118055, | |
| "kl": 0.05426025390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.296851709485054, | |
| "reward_std": 0.03895132802426815, | |
| "rewards/clip_reward": 0.296851709485054, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005455974600434629, | |
| "grad_norm": 0.26568277866945256, | |
| "kl": 0.05633544921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3115716502070427, | |
| "reward_std": 0.037875589448958635, | |
| "rewards/clip_reward": 0.3115716502070427, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005486799315691322, | |
| "grad_norm": 0.2868547511774212, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.31741077452898026, | |
| "reward_std": 0.03210792690515518, | |
| "rewards/clip_reward": 0.31741077452898026, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005517624030948014, | |
| "grad_norm": 0.2559585074400544, | |
| "kl": 0.05120849609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3083617463707924, | |
| "reward_std": 0.0392221063375473, | |
| "rewards/clip_reward": 0.3083617463707924, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005548448746204707, | |
| "grad_norm": 0.24542077699632728, | |
| "kl": 0.05108642578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.30339157581329346, | |
| "reward_std": 0.03507227078080177, | |
| "rewards/clip_reward": 0.30339157581329346, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0055792734614613995, | |
| "grad_norm": 0.2540319224053049, | |
| "kl": 0.05084228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.30484145134687424, | |
| "reward_std": 0.036596270743757486, | |
| "rewards/clip_reward": 0.30484145134687424, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005610098176718092, | |
| "grad_norm": 0.37332314299297165, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.30233796685934067, | |
| "reward_std": 0.030493673402816057, | |
| "rewards/clip_reward": 0.30233796685934067, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005640922891974785, | |
| "grad_norm": 0.2303993900918174, | |
| "kl": 0.05133056640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3098205327987671, | |
| "reward_std": 0.041695406660437584, | |
| "rewards/clip_reward": 0.3098205327987671, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005671747607231478, | |
| "grad_norm": 0.2435717876843093, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.29874222725629807, | |
| "reward_std": 0.029267210513353348, | |
| "rewards/clip_reward": 0.29874222725629807, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005702572322488171, | |
| "grad_norm": 0.31961072153806486, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.29779237508773804, | |
| "reward_std": 0.035278864204883575, | |
| "rewards/clip_reward": 0.29779237508773804, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005733397037744864, | |
| "grad_norm": 0.21282193347830292, | |
| "kl": 0.0513916015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3018389344215393, | |
| "reward_std": 0.03779670037329197, | |
| "rewards/clip_reward": 0.3018389344215393, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005764221753001557, | |
| "grad_norm": 0.23426135318521582, | |
| "kl": 0.052001953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.30514706671237946, | |
| "reward_std": 0.03661506250500679, | |
| "rewards/clip_reward": 0.30514706671237946, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00579504646825825, | |
| "grad_norm": 0.35531046478807454, | |
| "kl": 0.0570068359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.2925054803490639, | |
| "reward_std": 0.03318297350779176, | |
| "rewards/clip_reward": 0.2925054803490639, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005825871183514943, | |
| "grad_norm": 0.26126815879357707, | |
| "kl": 0.05340576171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.30776098370552063, | |
| "reward_std": 0.03303914796561003, | |
| "rewards/clip_reward": 0.30776098370552063, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005856695898771635, | |
| "grad_norm": 0.3913467133913027, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3134502246975899, | |
| "reward_std": 0.033362182322889566, | |
| "rewards/clip_reward": 0.3134502246975899, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005887520614028328, | |
| "grad_norm": 0.5481147865863106, | |
| "kl": 0.05731201171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3062956631183624, | |
| "reward_std": 0.03714533941820264, | |
| "rewards/clip_reward": 0.3062956631183624, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0059183453292850205, | |
| "grad_norm": 0.38319922904187664, | |
| "kl": 0.0501708984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.29817763715982437, | |
| "reward_std": 0.031161442399024963, | |
| "rewards/clip_reward": 0.29817763715982437, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005949170044541713, | |
| "grad_norm": 0.7525966257009459, | |
| "kl": 0.05181884765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.2918746843934059, | |
| "reward_std": 0.04155853856354952, | |
| "rewards/clip_reward": 0.2918746843934059, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005979994759798406, | |
| "grad_norm": 0.28730243211205864, | |
| "kl": 0.0511474609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.2828526049852371, | |
| "reward_std": 0.03443083353340626, | |
| "rewards/clip_reward": 0.2828526049852371, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006010819475055099, | |
| "grad_norm": 0.25755366719945655, | |
| "kl": 0.05511474609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.30451397597789764, | |
| "reward_std": 0.030547112692147493, | |
| "rewards/clip_reward": 0.30451397597789764, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006041644190311792, | |
| "grad_norm": 0.28409469826599004, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3040374740958214, | |
| "reward_std": 0.03506700927391648, | |
| "rewards/clip_reward": 0.3040374740958214, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006072468905568485, | |
| "grad_norm": 0.24631301394880073, | |
| "kl": 0.05291748046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3064323738217354, | |
| "reward_std": 0.03699143324047327, | |
| "rewards/clip_reward": 0.3064323738217354, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006103293620825178, | |
| "grad_norm": 0.4776380300716616, | |
| "kl": 0.06048583984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.31653470546007156, | |
| "reward_std": 0.03443007543683052, | |
| "rewards/clip_reward": 0.31653470546007156, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006134118336081871, | |
| "grad_norm": 0.2717998039709889, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.2982766330242157, | |
| "reward_std": 0.030803233850747347, | |
| "rewards/clip_reward": 0.2982766330242157, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006164943051338564, | |
| "grad_norm": 0.28573048152617825, | |
| "kl": 0.05755615234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.2895183190703392, | |
| "reward_std": 0.031674451660364866, | |
| "rewards/clip_reward": 0.2895183190703392, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0061957677665952565, | |
| "grad_norm": 0.3012206807393158, | |
| "kl": 0.055908203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3014395534992218, | |
| "reward_std": 0.03418161487206817, | |
| "rewards/clip_reward": 0.3014395534992218, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0062265924818519485, | |
| "grad_norm": 0.24480682812310384, | |
| "kl": 0.057373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.28224068135023117, | |
| "reward_std": 0.03003824595361948, | |
| "rewards/clip_reward": 0.28224068135023117, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006257417197108641, | |
| "grad_norm": 0.2531535146591285, | |
| "kl": 0.05511474609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.30891023576259613, | |
| "reward_std": 0.03753681713715196, | |
| "rewards/clip_reward": 0.30891023576259613, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006288241912365334, | |
| "grad_norm": 0.23120591809436197, | |
| "kl": 0.05841064453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3163115456700325, | |
| "reward_std": 0.02788963308557868, | |
| "rewards/clip_reward": 0.3163115456700325, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006319066627622027, | |
| "grad_norm": 0.23022883641873151, | |
| "kl": 0.056884765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.2952466085553169, | |
| "reward_std": 0.03618847485631704, | |
| "rewards/clip_reward": 0.2952466085553169, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00634989134287872, | |
| "grad_norm": 0.2604310735559812, | |
| "kl": 0.05767822265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.30279867351055145, | |
| "reward_std": 0.03358080657199025, | |
| "rewards/clip_reward": 0.30279867351055145, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006380716058135413, | |
| "grad_norm": 0.3065404532343023, | |
| "kl": 0.06097412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.2905031070113182, | |
| "reward_std": 0.03163592051714659, | |
| "rewards/clip_reward": 0.2905031070113182, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006411540773392106, | |
| "grad_norm": 0.2302581935455144, | |
| "kl": 0.06036376953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.3036247417330742, | |
| "reward_std": 0.04002736788243055, | |
| "rewards/clip_reward": 0.3036247417330742, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006442365488648799, | |
| "grad_norm": 0.22939030410663466, | |
| "kl": 0.0615234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3065522834658623, | |
| "reward_std": 0.035838291980326176, | |
| "rewards/clip_reward": 0.3065522834658623, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006473190203905492, | |
| "grad_norm": 1.1535119363825117, | |
| "kl": 0.171630859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0068, | |
| "reward": 0.3034718856215477, | |
| "reward_std": 0.028684359975159168, | |
| "rewards/clip_reward": 0.3034718856215477, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0065040149191621845, | |
| "grad_norm": 0.475789174353479, | |
| "kl": 0.06304931640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3188106268644333, | |
| "reward_std": 0.03359405370429158, | |
| "rewards/clip_reward": 0.3188106268644333, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006534839634418877, | |
| "grad_norm": 0.30958280497728724, | |
| "kl": 0.06011962890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.31080956757068634, | |
| "reward_std": 0.03438552375882864, | |
| "rewards/clip_reward": 0.31080956757068634, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0065656643496755695, | |
| "grad_norm": 0.2945478977489792, | |
| "kl": 0.06329345703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.29854634404182434, | |
| "reward_std": 0.03470074059441686, | |
| "rewards/clip_reward": 0.29854634404182434, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006596489064932262, | |
| "grad_norm": 0.311983746378144, | |
| "kl": 0.061279296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.310188353061676, | |
| "reward_std": 0.04189980635419488, | |
| "rewards/clip_reward": 0.310188353061676, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006627313780188955, | |
| "grad_norm": 0.26157966652434694, | |
| "kl": 0.06146240234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31515590101480484, | |
| "reward_std": 0.0373789188452065, | |
| "rewards/clip_reward": 0.31515590101480484, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006658138495445648, | |
| "grad_norm": 0.3427812618986616, | |
| "kl": 0.06329345703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31115034967660904, | |
| "reward_std": 0.03764009568840265, | |
| "rewards/clip_reward": 0.31115034967660904, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006688963210702341, | |
| "grad_norm": 0.31645931728689874, | |
| "kl": 0.05767822265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.28984929621219635, | |
| "reward_std": 0.03542311815544963, | |
| "rewards/clip_reward": 0.28984929621219635, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006719787925959034, | |
| "grad_norm": 0.2228608743429633, | |
| "kl": 0.05902099609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.3080795630812645, | |
| "reward_std": 0.03324572555720806, | |
| "rewards/clip_reward": 0.3080795630812645, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006750612641215727, | |
| "grad_norm": 0.6337063521534685, | |
| "kl": 0.06610107421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3004284203052521, | |
| "reward_std": 0.030279683880507946, | |
| "rewards/clip_reward": 0.3004284203052521, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00678143735647242, | |
| "grad_norm": 0.35877724546468126, | |
| "kl": 0.0599365234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.2890964075922966, | |
| "reward_std": 0.03512955084443092, | |
| "rewards/clip_reward": 0.2890964075922966, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006812262071729113, | |
| "grad_norm": 0.28709775498334184, | |
| "kl": 0.06353759765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3098203092813492, | |
| "reward_std": 0.03407229436561465, | |
| "rewards/clip_reward": 0.3098203092813492, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0068430867869858055, | |
| "grad_norm": 0.2587966892355202, | |
| "kl": 0.0631103515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.2910105660557747, | |
| "reward_std": 0.03544299304485321, | |
| "rewards/clip_reward": 0.2910105660557747, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006873911502242498, | |
| "grad_norm": 0.20375336506340044, | |
| "kl": 0.06231689453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3012443706393242, | |
| "reward_std": 0.03373287199065089, | |
| "rewards/clip_reward": 0.3012443706393242, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006904736217499191, | |
| "grad_norm": 0.22796858972291945, | |
| "kl": 0.058349609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.27249932289123535, | |
| "reward_std": 0.034992088098078966, | |
| "rewards/clip_reward": 0.27249932289123535, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006935560932755883, | |
| "grad_norm": 0.23923560288044327, | |
| "kl": 0.05914306640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.30393005907535553, | |
| "reward_std": 0.04046585503965616, | |
| "rewards/clip_reward": 0.30393005907535553, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006966385648012576, | |
| "grad_norm": 0.24781024716109332, | |
| "kl": 0.05828857421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.29162784665822983, | |
| "reward_std": 0.03501830715686083, | |
| "rewards/clip_reward": 0.29162784665822983, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006997210363269269, | |
| "grad_norm": 0.2709603773452262, | |
| "kl": 0.0609130859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.2914000228047371, | |
| "reward_std": 0.030948857311159372, | |
| "rewards/clip_reward": 0.2914000228047371, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007028035078525962, | |
| "grad_norm": 0.21567469316732074, | |
| "kl": 0.0567626953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3019692078232765, | |
| "reward_std": 0.038113043643534184, | |
| "rewards/clip_reward": 0.3019692078232765, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007058859793782655, | |
| "grad_norm": 0.223865539956382, | |
| "kl": 0.05474853515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.276552177965641, | |
| "reward_std": 0.03583432175219059, | |
| "rewards/clip_reward": 0.276552177965641, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007089684509039348, | |
| "grad_norm": 0.23022368412719968, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.30329927057027817, | |
| "reward_std": 0.035789184272289276, | |
| "rewards/clip_reward": 0.30329927057027817, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007120509224296041, | |
| "grad_norm": 0.2872615168609158, | |
| "kl": 0.05938720703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.3115493804216385, | |
| "reward_std": 0.03708712290972471, | |
| "rewards/clip_reward": 0.3115493804216385, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0071513339395527335, | |
| "grad_norm": 0.34806552807362773, | |
| "kl": 0.06298828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3054001107811928, | |
| "reward_std": 0.034296643920242786, | |
| "rewards/clip_reward": 0.3054001107811928, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007182158654809426, | |
| "grad_norm": 0.286918714200751, | |
| "kl": 0.06072998046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.299840584397316, | |
| "reward_std": 0.029763209633529186, | |
| "rewards/clip_reward": 0.299840584397316, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007212983370066119, | |
| "grad_norm": 0.2246601984953947, | |
| "kl": 0.0570068359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3116656318306923, | |
| "reward_std": 0.038989217951893806, | |
| "rewards/clip_reward": 0.3116656318306923, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007243808085322812, | |
| "grad_norm": 0.2300241445857201, | |
| "kl": 0.06207275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.3119165748357773, | |
| "reward_std": 0.030884731095284224, | |
| "rewards/clip_reward": 0.3119165748357773, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007274632800579504, | |
| "grad_norm": 0.37606313252297296, | |
| "kl": 0.06292724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.2938763126730919, | |
| "reward_std": 0.03854981064796448, | |
| "rewards/clip_reward": 0.2938763126730919, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007305457515836197, | |
| "grad_norm": 0.27390469202393175, | |
| "kl": 0.06268310546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30878350138664246, | |
| "reward_std": 0.0367919635027647, | |
| "rewards/clip_reward": 0.30878350138664246, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00733628223109289, | |
| "grad_norm": 0.3923664539501469, | |
| "kl": 0.06011962890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.31253431737422943, | |
| "reward_std": 0.03306168969720602, | |
| "rewards/clip_reward": 0.31253431737422943, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007367106946349583, | |
| "grad_norm": 19.999839999172977, | |
| "kl": 0.11572265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0046, | |
| "reward": 0.2926497831940651, | |
| "reward_std": 0.03439700556918979, | |
| "rewards/clip_reward": 0.2926497831940651, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007397931661606276, | |
| "grad_norm": 0.3274619779549125, | |
| "kl": 0.0623779296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31169167160987854, | |
| "reward_std": 0.03383385669440031, | |
| "rewards/clip_reward": 0.31169167160987854, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007428756376862969, | |
| "grad_norm": 0.3930715184021957, | |
| "kl": 0.064453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3149360120296478, | |
| "reward_std": 0.036022236570715904, | |
| "rewards/clip_reward": 0.3149360120296478, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007459581092119662, | |
| "grad_norm": 0.23471411051429383, | |
| "kl": 0.0631103515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.2915840148925781, | |
| "reward_std": 0.04799613729119301, | |
| "rewards/clip_reward": 0.2915840148925781, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0074904058073763545, | |
| "grad_norm": 0.25074369126631646, | |
| "kl": 0.06268310546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30563782155513763, | |
| "reward_std": 0.036503950599581, | |
| "rewards/clip_reward": 0.30563782155513763, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007521230522633047, | |
| "grad_norm": 0.25106966232242506, | |
| "kl": 0.06280517578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31641364842653275, | |
| "reward_std": 0.042671964969486, | |
| "rewards/clip_reward": 0.31641364842653275, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00755205523788974, | |
| "grad_norm": 0.24863565809024027, | |
| "kl": 0.06585693359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.29705505073070526, | |
| "reward_std": 0.02980767609551549, | |
| "rewards/clip_reward": 0.29705505073070526, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007582879953146433, | |
| "grad_norm": 0.3215906875694069, | |
| "kl": 0.0626220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.2966236397624016, | |
| "reward_std": 0.032666172832250595, | |
| "rewards/clip_reward": 0.2966236397624016, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007613704668403126, | |
| "grad_norm": 0.25802005781105997, | |
| "kl": 0.0655517578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.29460806399583817, | |
| "reward_std": 0.03424982167780399, | |
| "rewards/clip_reward": 0.29460806399583817, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007644529383659818, | |
| "grad_norm": 0.41480007000983, | |
| "kl": 0.06292724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30160878598690033, | |
| "reward_std": 0.03246263647451997, | |
| "rewards/clip_reward": 0.30160878598690033, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007675354098916511, | |
| "grad_norm": 0.38518441595490016, | |
| "kl": 0.07415771484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.2882128059864044, | |
| "reward_std": 0.03637995757162571, | |
| "rewards/clip_reward": 0.2882128059864044, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007706178814173204, | |
| "grad_norm": 0.2296837125573533, | |
| "kl": 0.06439208984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.31887152045965195, | |
| "reward_std": 0.031468767672777176, | |
| "rewards/clip_reward": 0.31887152045965195, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007737003529429897, | |
| "grad_norm": 0.2771302981384482, | |
| "kl": 0.06256103515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.29794805496931076, | |
| "reward_std": 0.03718498535454273, | |
| "rewards/clip_reward": 0.29794805496931076, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00776782824468659, | |
| "grad_norm": 0.26425224371180456, | |
| "kl": 0.0660400390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.31550634652376175, | |
| "reward_std": 0.03392920456826687, | |
| "rewards/clip_reward": 0.31550634652376175, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0077986529599432825, | |
| "grad_norm": 0.3039933530405912, | |
| "kl": 0.06451416015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.31224576383829117, | |
| "reward_std": 0.03579189581796527, | |
| "rewards/clip_reward": 0.31224576383829117, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007829477675199975, | |
| "grad_norm": 0.21264552398464287, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30356816947460175, | |
| "reward_std": 0.03573842905461788, | |
| "rewards/clip_reward": 0.30356816947460175, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007860302390456668, | |
| "grad_norm": 0.23510191594709257, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.29867023229599, | |
| "reward_std": 0.03537313872948289, | |
| "rewards/clip_reward": 0.29867023229599, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007891127105713361, | |
| "grad_norm": 0.8798380521542469, | |
| "kl": 0.0672607421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2907034158706665, | |
| "reward_std": 0.03144656075164676, | |
| "rewards/clip_reward": 0.2907034158706665, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007921951820970054, | |
| "grad_norm": 0.30489147633127095, | |
| "kl": 0.06390380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.2994539961218834, | |
| "reward_std": 0.036147382110357285, | |
| "rewards/clip_reward": 0.2994539961218834, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007952776536226747, | |
| "grad_norm": 0.2724891171334766, | |
| "kl": 0.07373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.29950564354658127, | |
| "reward_std": 0.031049554236233234, | |
| "rewards/clip_reward": 0.29950564354658127, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00798360125148344, | |
| "grad_norm": 0.2257138587782837, | |
| "kl": 0.0606689453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.31028740108013153, | |
| "reward_std": 0.03820298006758094, | |
| "rewards/clip_reward": 0.31028740108013153, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008014425966740133, | |
| "grad_norm": 0.23904222780230097, | |
| "kl": 0.06512451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.30369560420513153, | |
| "reward_std": 0.034888788126409054, | |
| "rewards/clip_reward": 0.30369560420513153, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008045250681996826, | |
| "grad_norm": 0.26551280919013587, | |
| "kl": 0.069091796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3005470857024193, | |
| "reward_std": 0.03615998663008213, | |
| "rewards/clip_reward": 0.3005470857024193, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008076075397253519, | |
| "grad_norm": 0.2641814708454766, | |
| "kl": 0.067138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30573634058237076, | |
| "reward_std": 0.03119312133640051, | |
| "rewards/clip_reward": 0.30573634058237076, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008106900112510211, | |
| "grad_norm": 0.2538072823054899, | |
| "kl": 0.067626953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.28952275961637497, | |
| "reward_std": 0.02620957838371396, | |
| "rewards/clip_reward": 0.28952275961637497, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008137724827766904, | |
| "grad_norm": 0.27840725784727466, | |
| "kl": 0.0692138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.31542008370161057, | |
| "reward_std": 0.037005496211349964, | |
| "rewards/clip_reward": 0.31542008370161057, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008168549543023595, | |
| "grad_norm": 0.3168534549683751, | |
| "kl": 0.069091796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2891938388347626, | |
| "reward_std": 0.032484165858477354, | |
| "rewards/clip_reward": 0.2891938388347626, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008199374258280288, | |
| "grad_norm": 0.34875471295741794, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3205078914761543, | |
| "reward_std": 0.042449533008039, | |
| "rewards/clip_reward": 0.3205078914761543, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008230198973536981, | |
| "grad_norm": 0.2913604348839751, | |
| "kl": 0.0697021484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3089185729622841, | |
| "reward_std": 0.03769417991861701, | |
| "rewards/clip_reward": 0.3089185729622841, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008261023688793674, | |
| "grad_norm": 0.2408505354217034, | |
| "kl": 0.06768798828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30584635585546494, | |
| "reward_std": 0.03537123091518879, | |
| "rewards/clip_reward": 0.30584635585546494, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008291848404050367, | |
| "grad_norm": 0.23962011506896905, | |
| "kl": 0.0709228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2843191996216774, | |
| "reward_std": 0.030163435731083155, | |
| "rewards/clip_reward": 0.2843191996216774, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00832267311930706, | |
| "grad_norm": 0.267565227028639, | |
| "kl": 0.072021484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30715377628803253, | |
| "reward_std": 0.03210371499881148, | |
| "rewards/clip_reward": 0.30715377628803253, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008353497834563753, | |
| "grad_norm": 0.26478125349454257, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3105710372328758, | |
| "reward_std": 0.03365481458604336, | |
| "rewards/clip_reward": 0.3105710372328758, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008384322549820446, | |
| "grad_norm": 0.25310448605992364, | |
| "kl": 0.0711669921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2988099977374077, | |
| "reward_std": 0.03356699598953128, | |
| "rewards/clip_reward": 0.2988099977374077, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008415147265077139, | |
| "grad_norm": 0.6415597863522857, | |
| "kl": 0.0704345703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3161829188466072, | |
| "reward_std": 0.03346774587407708, | |
| "rewards/clip_reward": 0.3161829188466072, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008445971980333832, | |
| "grad_norm": 0.2963215633498198, | |
| "kl": 0.0755615234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.29340942949056625, | |
| "reward_std": 0.029817141592502594, | |
| "rewards/clip_reward": 0.29340942949056625, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008476796695590524, | |
| "grad_norm": 0.28624937293920455, | |
| "kl": 0.07177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.29413383454084396, | |
| "reward_std": 0.03951750136911869, | |
| "rewards/clip_reward": 0.29413383454084396, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008507621410847217, | |
| "grad_norm": 0.3584737945543695, | |
| "kl": 0.065185546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.30792876332998276, | |
| "reward_std": 0.03842892590910196, | |
| "rewards/clip_reward": 0.30792876332998276, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00853844612610391, | |
| "grad_norm": 0.41014371409185196, | |
| "kl": 0.086669921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 0.30880793929100037, | |
| "reward_std": 0.03384514432400465, | |
| "rewards/clip_reward": 0.30880793929100037, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008569270841360603, | |
| "grad_norm": 0.6923907529205436, | |
| "kl": 0.07275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30738433450460434, | |
| "reward_std": 0.030436881817877293, | |
| "rewards/clip_reward": 0.30738433450460434, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008600095556617296, | |
| "grad_norm": 0.2508894379559603, | |
| "kl": 0.06707763671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.29732104390859604, | |
| "reward_std": 0.03281824570149183, | |
| "rewards/clip_reward": 0.29732104390859604, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008630920271873989, | |
| "grad_norm": 0.21788541287282853, | |
| "kl": 0.066650390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3087487816810608, | |
| "reward_std": 0.03466750681400299, | |
| "rewards/clip_reward": 0.3087487816810608, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008661744987130682, | |
| "grad_norm": 0.3677827474941058, | |
| "kl": 0.077880859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.3030627593398094, | |
| "reward_std": 0.0359897562302649, | |
| "rewards/clip_reward": 0.3030627593398094, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008692569702387375, | |
| "grad_norm": 0.314089431725954, | |
| "kl": 0.07061767578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.30142877995967865, | |
| "reward_std": 0.032457209192216396, | |
| "rewards/clip_reward": 0.30142877995967865, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008723394417644068, | |
| "grad_norm": 0.2690848375206651, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.29926204681396484, | |
| "reward_std": 0.046972066164016724, | |
| "rewards/clip_reward": 0.29926204681396484, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00875421913290076, | |
| "grad_norm": 0.5301348946984951, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3038446083664894, | |
| "reward_std": 0.03355177864432335, | |
| "rewards/clip_reward": 0.3038446083664894, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008785043848157453, | |
| "grad_norm": 0.3640215089911138, | |
| "kl": 0.0740966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.29393448680639267, | |
| "reward_std": 0.029715597163885832, | |
| "rewards/clip_reward": 0.29393448680639267, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008815868563414146, | |
| "grad_norm": 0.25955791764443836, | |
| "kl": 0.0677490234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.29612670093774796, | |
| "reward_std": 0.03200107906013727, | |
| "rewards/clip_reward": 0.29612670093774796, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008846693278670839, | |
| "grad_norm": 0.2840084484574733, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.28986190259456635, | |
| "reward_std": 0.03536898456513882, | |
| "rewards/clip_reward": 0.28986190259456635, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00887751799392753, | |
| "grad_norm": 0.2504845781925923, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.28574925661087036, | |
| "reward_std": 0.029350985307246447, | |
| "rewards/clip_reward": 0.28574925661087036, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008908342709184223, | |
| "grad_norm": 1.9538978684731851, | |
| "kl": 0.073486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.2989484444260597, | |
| "reward_std": 0.037056506145745516, | |
| "rewards/clip_reward": 0.2989484444260597, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008939167424440916, | |
| "grad_norm": 0.29595148973162033, | |
| "kl": 0.068359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2984117269515991, | |
| "reward_std": 0.04048395995050669, | |
| "rewards/clip_reward": 0.2984117269515991, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.008969992139697609, | |
| "grad_norm": 0.46786991142607903, | |
| "kl": 0.0726318359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.29787522554397583, | |
| "reward_std": 0.03701747301965952, | |
| "rewards/clip_reward": 0.29787522554397583, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009000816854954302, | |
| "grad_norm": 0.2415371991232022, | |
| "kl": 0.0673828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2959031015634537, | |
| "reward_std": 0.03375337179750204, | |
| "rewards/clip_reward": 0.2959031015634537, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009031641570210995, | |
| "grad_norm": 0.2509472132536602, | |
| "kl": 0.0736083984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.3059312403202057, | |
| "reward_std": 0.029257553163915873, | |
| "rewards/clip_reward": 0.3059312403202057, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009062466285467688, | |
| "grad_norm": 0.27142301376417255, | |
| "kl": 0.0682373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.302873820066452, | |
| "reward_std": 0.03212234936654568, | |
| "rewards/clip_reward": 0.302873820066452, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00909329100072438, | |
| "grad_norm": 0.6714767650621216, | |
| "kl": 0.0770263671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.302773617208004, | |
| "reward_std": 0.037844753824174404, | |
| "rewards/clip_reward": 0.302773617208004, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009124115715981073, | |
| "grad_norm": 0.3313051439529105, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3004399463534355, | |
| "reward_std": 0.03778542298823595, | |
| "rewards/clip_reward": 0.3004399463534355, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009154940431237766, | |
| "grad_norm": 0.2861755625116893, | |
| "kl": 0.0765380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.2957393676042557, | |
| "reward_std": 0.03911779401823878, | |
| "rewards/clip_reward": 0.2957393676042557, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00918576514649446, | |
| "grad_norm": 0.3643143782064657, | |
| "kl": 0.0721435546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3204945772886276, | |
| "reward_std": 0.04190400242805481, | |
| "rewards/clip_reward": 0.3204945772886276, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009216589861751152, | |
| "grad_norm": 0.3506675665500401, | |
| "kl": 0.0740966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.30117396265268326, | |
| "reward_std": 0.03484671004116535, | |
| "rewards/clip_reward": 0.30117396265268326, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009247414577007845, | |
| "grad_norm": 0.4873394391686336, | |
| "kl": 0.0721435546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3100610002875328, | |
| "reward_std": 0.03472679574042559, | |
| "rewards/clip_reward": 0.3100610002875328, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009278239292264538, | |
| "grad_norm": 0.4157803977878561, | |
| "kl": 0.072021484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30167850106954575, | |
| "reward_std": 0.031223418191075325, | |
| "rewards/clip_reward": 0.30167850106954575, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00930906400752123, | |
| "grad_norm": 0.3797545814324745, | |
| "kl": 0.0731201171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.31539446860551834, | |
| "reward_std": 0.041049075312912464, | |
| "rewards/clip_reward": 0.31539446860551834, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009339888722777924, | |
| "grad_norm": 0.2743776739836363, | |
| "kl": 0.0791015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.30482209473848343, | |
| "reward_std": 0.03603475447744131, | |
| "rewards/clip_reward": 0.30482209473848343, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009370713438034617, | |
| "grad_norm": 0.3102618921636113, | |
| "kl": 0.0723876953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3093208372592926, | |
| "reward_std": 0.03316945396363735, | |
| "rewards/clip_reward": 0.3093208372592926, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00940153815329131, | |
| "grad_norm": 0.22184784039477515, | |
| "kl": 0.0692138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3097628057003021, | |
| "reward_std": 0.034655320923775434, | |
| "rewards/clip_reward": 0.3097628057003021, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009432362868548002, | |
| "grad_norm": 0.2679648313292432, | |
| "kl": 0.0689697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.31062986701726913, | |
| "reward_std": 0.0346312508918345, | |
| "rewards/clip_reward": 0.31062986701726913, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009463187583804695, | |
| "grad_norm": 0.2091622067188512, | |
| "kl": 0.068115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2891889810562134, | |
| "reward_std": 0.03427890222519636, | |
| "rewards/clip_reward": 0.2891889810562134, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009494012299061388, | |
| "grad_norm": 0.380322887324008, | |
| "kl": 0.077392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.2787666954100132, | |
| "reward_std": 0.030210648197680712, | |
| "rewards/clip_reward": 0.2787666954100132, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009524837014318081, | |
| "grad_norm": 0.2639105229044439, | |
| "kl": 0.06744384765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30674491077661514, | |
| "reward_std": 0.026672531850636005, | |
| "rewards/clip_reward": 0.30674491077661514, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009555661729574774, | |
| "grad_norm": 0.24148098115752395, | |
| "kl": 0.0694580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2937857508659363, | |
| "reward_std": 0.03315737470984459, | |
| "rewards/clip_reward": 0.2937857508659363, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009586486444831465, | |
| "grad_norm": 0.2751403655433819, | |
| "kl": 0.0654296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3029419779777527, | |
| "reward_std": 0.037644670344889164, | |
| "rewards/clip_reward": 0.3029419779777527, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009617311160088158, | |
| "grad_norm": 0.43265267199518154, | |
| "kl": 0.067626953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3041282668709755, | |
| "reward_std": 0.036585441790521145, | |
| "rewards/clip_reward": 0.3041282668709755, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00964813587534485, | |
| "grad_norm": 0.23342772151709762, | |
| "kl": 0.0709228515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.281108133494854, | |
| "reward_std": 0.032666866201907396, | |
| "rewards/clip_reward": 0.281108133494854, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009678960590601544, | |
| "grad_norm": 0.2544379988905811, | |
| "kl": 0.071044921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.31978850811719894, | |
| "reward_std": 0.035083431750535965, | |
| "rewards/clip_reward": 0.31978850811719894, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009709785305858237, | |
| "grad_norm": 0.2804560271375766, | |
| "kl": 0.068115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3023409992456436, | |
| "reward_std": 0.039583622477948666, | |
| "rewards/clip_reward": 0.3023409992456436, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00974061002111493, | |
| "grad_norm": 0.3308718322184164, | |
| "kl": 0.065673828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3051457703113556, | |
| "reward_std": 0.030992726795375347, | |
| "rewards/clip_reward": 0.3051457703113556, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009771434736371622, | |
| "grad_norm": 0.28901404215876114, | |
| "kl": 0.0743408203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.30402371287345886, | |
| "reward_std": 0.029232127591967583, | |
| "rewards/clip_reward": 0.30402371287345886, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009802259451628315, | |
| "grad_norm": 0.35413328500218205, | |
| "kl": 0.072021484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30789412558078766, | |
| "reward_std": 0.03223106591030955, | |
| "rewards/clip_reward": 0.30789412558078766, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009833084166885008, | |
| "grad_norm": 0.2340787790478016, | |
| "kl": 0.0662841796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3076978102326393, | |
| "reward_std": 0.026181872468441725, | |
| "rewards/clip_reward": 0.3076978102326393, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009863908882141701, | |
| "grad_norm": 0.2160311104140781, | |
| "kl": 0.06451416015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3071356937289238, | |
| "reward_std": 0.03830281179398298, | |
| "rewards/clip_reward": 0.3071356937289238, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009894733597398394, | |
| "grad_norm": 0.2377449652657432, | |
| "kl": 0.069580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2980581670999527, | |
| "reward_std": 0.04127457272261381, | |
| "rewards/clip_reward": 0.2980581670999527, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009925558312655087, | |
| "grad_norm": 0.24180517546337557, | |
| "kl": 0.06591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.3178565129637718, | |
| "reward_std": 0.03440323146060109, | |
| "rewards/clip_reward": 0.3178565129637718, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00995638302791178, | |
| "grad_norm": 0.2316875481005505, | |
| "kl": 0.0665283203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30603795498609543, | |
| "reward_std": 0.03252642601728439, | |
| "rewards/clip_reward": 0.30603795498609543, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.009987207743168473, | |
| "grad_norm": 0.3112158582767738, | |
| "kl": 0.0694580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2933913469314575, | |
| "reward_std": 0.03322177054360509, | |
| "rewards/clip_reward": 0.2933913469314575, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010018032458425166, | |
| "grad_norm": 0.2406318070865998, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2959599047899246, | |
| "reward_std": 0.04275544732809067, | |
| "rewards/clip_reward": 0.2959599047899246, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010048857173681858, | |
| "grad_norm": 0.25585002828007836, | |
| "kl": 0.06561279296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.306929275393486, | |
| "reward_std": 0.03498923219740391, | |
| "rewards/clip_reward": 0.306929275393486, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010079681888938551, | |
| "grad_norm": 0.2860165243450835, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3039538636803627, | |
| "reward_std": 0.0350054414011538, | |
| "rewards/clip_reward": 0.3039538636803627, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010110506604195244, | |
| "grad_norm": 0.2915143244249602, | |
| "kl": 0.064697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.30542658269405365, | |
| "reward_std": 0.03288308531045914, | |
| "rewards/clip_reward": 0.30542658269405365, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010141331319451937, | |
| "grad_norm": 0.2132282152683129, | |
| "kl": 0.06341552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30350538343191147, | |
| "reward_std": 0.03533684695139527, | |
| "rewards/clip_reward": 0.30350538343191147, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01017215603470863, | |
| "grad_norm": 0.2229730055471249, | |
| "kl": 0.0667724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3181814104318619, | |
| "reward_std": 0.04055699147284031, | |
| "rewards/clip_reward": 0.3181814104318619, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010202980749965323, | |
| "grad_norm": 0.29467120899645105, | |
| "kl": 0.078369140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.3067881911993027, | |
| "reward_std": 0.03262139391154051, | |
| "rewards/clip_reward": 0.3067881911993027, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010233805465222016, | |
| "grad_norm": 0.2626359604798955, | |
| "kl": 0.0689697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3152911067008972, | |
| "reward_std": 0.033262383192777634, | |
| "rewards/clip_reward": 0.3152911067008972, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010264630180478709, | |
| "grad_norm": 0.2557414678170153, | |
| "kl": 0.065185546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.2922421917319298, | |
| "reward_std": 0.03196657868102193, | |
| "rewards/clip_reward": 0.2922421917319298, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0102954548957354, | |
| "grad_norm": 0.46611276904322874, | |
| "kl": 0.0941162109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0038, | |
| "reward": 0.3090960830450058, | |
| "reward_std": 0.03677598666399717, | |
| "rewards/clip_reward": 0.3090960830450058, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010326279610992093, | |
| "grad_norm": 0.2309027643372804, | |
| "kl": 0.0677490234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3062281683087349, | |
| "reward_std": 0.02914317138493061, | |
| "rewards/clip_reward": 0.3062281683087349, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010357104326248786, | |
| "grad_norm": 0.24277503406886997, | |
| "kl": 0.064208984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.30345892161130905, | |
| "reward_std": 0.03988218680024147, | |
| "rewards/clip_reward": 0.30345892161130905, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010387929041505478, | |
| "grad_norm": 0.24331709055222814, | |
| "kl": 0.06549072265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0026, | |
| "reward": 0.28554844856262207, | |
| "reward_std": 0.03036739816889167, | |
| "rewards/clip_reward": 0.28554844856262207, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010418753756762171, | |
| "grad_norm": 0.3529980048921207, | |
| "kl": 0.06640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.30094072222709656, | |
| "reward_std": 0.03639454022049904, | |
| "rewards/clip_reward": 0.30094072222709656, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010449578472018864, | |
| "grad_norm": 0.4648925688267997, | |
| "kl": 0.070068359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3127727806568146, | |
| "reward_std": 0.037409001495689154, | |
| "rewards/clip_reward": 0.3127727806568146, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010480403187275557, | |
| "grad_norm": 0.21891573660402955, | |
| "kl": 0.0687255859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2772018723189831, | |
| "reward_std": 0.02981334552168846, | |
| "rewards/clip_reward": 0.2772018723189831, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01051122790253225, | |
| "grad_norm": 0.22903622425258144, | |
| "kl": 0.0693359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.30165959894657135, | |
| "reward_std": 0.03327458165585995, | |
| "rewards/clip_reward": 0.30165959894657135, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010542052617788943, | |
| "grad_norm": 0.2563729286323683, | |
| "kl": 0.0703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.29880737513303757, | |
| "reward_std": 0.03046613559126854, | |
| "rewards/clip_reward": 0.29880737513303757, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010572877333045636, | |
| "grad_norm": 0.23890758116942548, | |
| "kl": 0.07080078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.30336084961891174, | |
| "reward_std": 0.03497588029131293, | |
| "rewards/clip_reward": 0.30336084961891174, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010603702048302329, | |
| "grad_norm": 0.3415925254166041, | |
| "kl": 0.0689697265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.31711456179618835, | |
| "reward_std": 0.03901077900081873, | |
| "rewards/clip_reward": 0.31711456179618835, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010634526763559022, | |
| "grad_norm": 0.28715059266697696, | |
| "kl": 0.0751953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.2894616648554802, | |
| "reward_std": 0.03519205283373594, | |
| "rewards/clip_reward": 0.2894616648554802, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010665351478815715, | |
| "grad_norm": 0.21252915417396734, | |
| "kl": 0.072265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3071034103631973, | |
| "reward_std": 0.03865744452923536, | |
| "rewards/clip_reward": 0.3071034103631973, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010696176194072407, | |
| "grad_norm": 0.2213810998809183, | |
| "kl": 0.0711669921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.30335263907909393, | |
| "reward_std": 0.03155648289248347, | |
| "rewards/clip_reward": 0.30335263907909393, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0107270009093291, | |
| "grad_norm": 0.22688649328130808, | |
| "kl": 0.0701904296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.30418138951063156, | |
| "reward_std": 0.03234159108251333, | |
| "rewards/clip_reward": 0.30418138951063156, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010757825624585793, | |
| "grad_norm": 0.29663311793497593, | |
| "kl": 0.06951904296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3119578883051872, | |
| "reward_std": 0.035748135298490524, | |
| "rewards/clip_reward": 0.3119578883051872, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010788650339842486, | |
| "grad_norm": 0.21987020789836037, | |
| "kl": 0.0726318359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.2886430397629738, | |
| "reward_std": 0.028050173074007034, | |
| "rewards/clip_reward": 0.2886430397629738, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010819475055099179, | |
| "grad_norm": 0.25690691289795353, | |
| "kl": 0.0743408203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.3163754343986511, | |
| "reward_std": 0.03696467122063041, | |
| "rewards/clip_reward": 0.3163754343986511, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010850299770355872, | |
| "grad_norm": 0.20820549385701986, | |
| "kl": 0.068603515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.2914758771657944, | |
| "reward_std": 0.036506949458271265, | |
| "rewards/clip_reward": 0.2914758771657944, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010881124485612565, | |
| "grad_norm": 0.23881025300895642, | |
| "kl": 0.0703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.305471271276474, | |
| "reward_std": 0.03957742918282747, | |
| "rewards/clip_reward": 0.305471271276474, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010911949200869258, | |
| "grad_norm": 0.23403139952741436, | |
| "kl": 0.073974609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.29349175840616226, | |
| "reward_std": 0.029270636849105358, | |
| "rewards/clip_reward": 0.29349175840616226, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01094277391612595, | |
| "grad_norm": 0.29623954478847825, | |
| "kl": 0.0679931640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3115154132246971, | |
| "reward_std": 0.03451331192627549, | |
| "rewards/clip_reward": 0.3115154132246971, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.010973598631382643, | |
| "grad_norm": 0.22254248972166946, | |
| "kl": 0.07177734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.2945594787597656, | |
| "reward_std": 0.037838808726519346, | |
| "rewards/clip_reward": 0.2945594787597656, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011004423346639335, | |
| "grad_norm": 0.31078887072255357, | |
| "kl": 0.069580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.2983166500926018, | |
| "reward_std": 0.038911592215299606, | |
| "rewards/clip_reward": 0.2983166500926018, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011035248061896027, | |
| "grad_norm": 0.21415415508214233, | |
| "kl": 0.0740966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.2967897802591324, | |
| "reward_std": 0.034322294406592846, | |
| "rewards/clip_reward": 0.2967897802591324, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01106607277715272, | |
| "grad_norm": 0.22868327148604967, | |
| "kl": 0.073486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30835631489753723, | |
| "reward_std": 0.030591858085244894, | |
| "rewards/clip_reward": 0.30835631489753723, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011096897492409413, | |
| "grad_norm": 0.2536865522371228, | |
| "kl": 0.0755615234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.3081829249858856, | |
| "reward_std": 0.03304952848702669, | |
| "rewards/clip_reward": 0.3081829249858856, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011127722207666106, | |
| "grad_norm": 0.28703592507172904, | |
| "kl": 0.0833740234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.31137142330408096, | |
| "reward_std": 0.036431248765438795, | |
| "rewards/clip_reward": 0.31137142330408096, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011158546922922799, | |
| "grad_norm": 0.25966729830990576, | |
| "kl": 0.072265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.32117273658514023, | |
| "reward_std": 0.03095503943040967, | |
| "rewards/clip_reward": 0.32117273658514023, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011189371638179492, | |
| "grad_norm": 0.25023628654317165, | |
| "kl": 0.072509765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.31916752457618713, | |
| "reward_std": 0.03393755853176117, | |
| "rewards/clip_reward": 0.31916752457618713, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011220196353436185, | |
| "grad_norm": 0.23509232771567978, | |
| "kl": 0.0703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3117026165127754, | |
| "reward_std": 0.02881152043119073, | |
| "rewards/clip_reward": 0.3117026165127754, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011251021068692878, | |
| "grad_norm": 0.25193424249683816, | |
| "kl": 0.0743408203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.3166244179010391, | |
| "reward_std": 0.03036302560940385, | |
| "rewards/clip_reward": 0.3166244179010391, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01128184578394957, | |
| "grad_norm": 0.33920942950993677, | |
| "kl": 0.078369140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.30771925300359726, | |
| "reward_std": 0.03633692301809788, | |
| "rewards/clip_reward": 0.30771925300359726, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011312670499206263, | |
| "grad_norm": 0.21836890161975292, | |
| "kl": 0.0755615234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.2963472232222557, | |
| "reward_std": 0.039488581009209156, | |
| "rewards/clip_reward": 0.2963472232222557, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011343495214462956, | |
| "grad_norm": 0.5069167126879947, | |
| "kl": 0.0767822265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.2941269427537918, | |
| "reward_std": 0.0324277700856328, | |
| "rewards/clip_reward": 0.2941269427537918, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01137431992971965, | |
| "grad_norm": 0.23131621578167488, | |
| "kl": 0.072509765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3116966411471367, | |
| "reward_std": 0.035368080250918865, | |
| "rewards/clip_reward": 0.3116966411471367, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011405144644976342, | |
| "grad_norm": 0.2722113704455896, | |
| "kl": 0.071533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3093196973204613, | |
| "reward_std": 0.0321941040456295, | |
| "rewards/clip_reward": 0.3093196973204613, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011435969360233035, | |
| "grad_norm": 0.28730230026079845, | |
| "kl": 0.0716552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.30720267444849014, | |
| "reward_std": 0.03148469375446439, | |
| "rewards/clip_reward": 0.30720267444849014, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011466794075489728, | |
| "grad_norm": 0.2195587993801826, | |
| "kl": 0.0721435546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.31224188208580017, | |
| "reward_std": 0.03507778514176607, | |
| "rewards/clip_reward": 0.31224188208580017, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01149761879074642, | |
| "grad_norm": 0.22359313280470233, | |
| "kl": 0.07666015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.3193345367908478, | |
| "reward_std": 0.03153304476290941, | |
| "rewards/clip_reward": 0.3193345367908478, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011528443506003114, | |
| "grad_norm": 0.24587352736332965, | |
| "kl": 0.07958984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3059887960553169, | |
| "reward_std": 0.03858480043709278, | |
| "rewards/clip_reward": 0.3059887960553169, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011559268221259807, | |
| "grad_norm": 0.20861176096611583, | |
| "kl": 0.0777587890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.31156501173973083, | |
| "reward_std": 0.03071795403957367, | |
| "rewards/clip_reward": 0.31156501173973083, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0115900929365165, | |
| "grad_norm": 0.3384083028926716, | |
| "kl": 0.0843505859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0034, | |
| "reward": 0.2961001992225647, | |
| "reward_std": 0.03549221996217966, | |
| "rewards/clip_reward": 0.2961001992225647, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011620917651773192, | |
| "grad_norm": 0.25505451962723275, | |
| "kl": 0.0787353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3040069565176964, | |
| "reward_std": 0.030104033648967743, | |
| "rewards/clip_reward": 0.3040069565176964, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011651742367029885, | |
| "grad_norm": 0.28066405899218805, | |
| "kl": 0.0821533203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.3125178590416908, | |
| "reward_std": 0.03651127126067877, | |
| "rewards/clip_reward": 0.3125178590416908, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011682567082286578, | |
| "grad_norm": 0.27665197939767366, | |
| "kl": 0.079345703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3094138279557228, | |
| "reward_std": 0.038227169774472713, | |
| "rewards/clip_reward": 0.3094138279557228, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01171339179754327, | |
| "grad_norm": 0.2597327901842019, | |
| "kl": 0.080810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3057212755084038, | |
| "reward_std": 0.03503256104886532, | |
| "rewards/clip_reward": 0.3057212755084038, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011744216512799962, | |
| "grad_norm": 0.32226220932101085, | |
| "kl": 0.0845947265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0034, | |
| "reward": 0.3279699385166168, | |
| "reward_std": 0.0381966782733798, | |
| "rewards/clip_reward": 0.3279699385166168, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011775041228056655, | |
| "grad_norm": 0.2956679464457214, | |
| "kl": 0.081787109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.30723875761032104, | |
| "reward_std": 0.03168717911466956, | |
| "rewards/clip_reward": 0.30723875761032104, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011805865943313348, | |
| "grad_norm": 0.5191055044770609, | |
| "kl": 0.091552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0037, | |
| "reward": 0.31639552116394043, | |
| "reward_std": 0.03826092462986708, | |
| "rewards/clip_reward": 0.31639552116394043, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011836690658570041, | |
| "grad_norm": 0.3776553446735074, | |
| "kl": 0.091552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0037, | |
| "reward": 0.3041198328137398, | |
| "reward_std": 0.026013361755758524, | |
| "rewards/clip_reward": 0.3041198328137398, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011867515373826734, | |
| "grad_norm": 0.2572694780557513, | |
| "kl": 0.08837890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 0.2923019975423813, | |
| "reward_std": 0.03231561556458473, | |
| "rewards/clip_reward": 0.2923019975423813, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011898340089083427, | |
| "grad_norm": 15.002263434692885, | |
| "kl": 0.1844482421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0074, | |
| "reward": 0.31261809915304184, | |
| "reward_std": 0.033471152652055025, | |
| "rewards/clip_reward": 0.31261809915304184, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01192916480434012, | |
| "grad_norm": 34.74334790635388, | |
| "kl": 0.585205078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0234, | |
| "reward": 0.31164050102233887, | |
| "reward_std": 0.03602212620899081, | |
| "rewards/clip_reward": 0.31164050102233887, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011959989519596812, | |
| "grad_norm": 0.2788251418304219, | |
| "kl": 0.082275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.2998761534690857, | |
| "reward_std": 0.03641665726900101, | |
| "rewards/clip_reward": 0.2998761534690857, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.011990814234853505, | |
| "grad_norm": 0.23027728154942576, | |
| "kl": 0.086669921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 0.30939532816410065, | |
| "reward_std": 0.033424354158341885, | |
| "rewards/clip_reward": 0.30939532816410065, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012021638950110198, | |
| "grad_norm": 0.25021193487381366, | |
| "kl": 0.091552734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0037, | |
| "reward": 0.3133769854903221, | |
| "reward_std": 0.034206162206828594, | |
| "rewards/clip_reward": 0.3133769854903221, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012052463665366891, | |
| "grad_norm": 0.23903302345332506, | |
| "kl": 0.0802001953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3138531744480133, | |
| "reward_std": 0.03734522592276335, | |
| "rewards/clip_reward": 0.3138531744480133, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012083288380623584, | |
| "grad_norm": 0.2598108600282988, | |
| "kl": 0.083251953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.32199686020612717, | |
| "reward_std": 0.03402840159833431, | |
| "rewards/clip_reward": 0.32199686020612717, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012114113095880277, | |
| "grad_norm": 0.23642883730734804, | |
| "kl": 0.08056640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3051422908902168, | |
| "reward_std": 0.03256607661023736, | |
| "rewards/clip_reward": 0.3051422908902168, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.01214493781113697, | |
| "grad_norm": 0.22592478949720543, | |
| "kl": 0.0802001953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3141862004995346, | |
| "reward_std": 0.03128534881398082, | |
| "rewards/clip_reward": 0.3141862004995346, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012175762526393663, | |
| "grad_norm": 0.27814956809020164, | |
| "kl": 0.080810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.30130794644355774, | |
| "reward_std": 0.028409887570887804, | |
| "rewards/clip_reward": 0.30130794644355774, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012206587241650356, | |
| "grad_norm": 0.21400571540908192, | |
| "kl": 0.086669921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 0.30789605528116226, | |
| "reward_std": 0.034056794829666615, | |
| "rewards/clip_reward": 0.30789605528116226, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012237411956907049, | |
| "grad_norm": 0.21349466740102746, | |
| "kl": 0.080810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3021450638771057, | |
| "reward_std": 0.035551701206713915, | |
| "rewards/clip_reward": 0.3021450638771057, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012268236672163741, | |
| "grad_norm": 0.21264064002989824, | |
| "kl": 0.080322265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.3078059256076813, | |
| "reward_std": 0.03562043095007539, | |
| "rewards/clip_reward": 0.3078059256076813, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012299061387420434, | |
| "grad_norm": 0.2898591178898953, | |
| "kl": 0.08642578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0035, | |
| "reward": 0.3178284019231796, | |
| "reward_std": 0.03499773098155856, | |
| "rewards/clip_reward": 0.3178284019231796, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.012329886102677127, | |
| "grad_norm": 0.5180005174036416, | |
| "kl": 0.08935546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0036, | |
| "reward": 0.32223907113075256, | |
| "reward_std": 0.0368791688233614, | |
| "rewards/clip_reward": 0.32223907113075256, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 32441, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |