| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 3001.9584350585938, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.029622705653309822, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0088, | |
| "reward": -0.0029319413006305695, | |
| "reward_std": 0.12454631552100182, | |
| "rewards/cosine_scaled_reward": -0.1928562317043543, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2822.541717529297, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.04606308043003082, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0281, | |
| "reward": 0.11451426180428825, | |
| "reward_std": 0.2134026400744915, | |
| "rewards/cosine_scaled_reward": -0.009885392151772976, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 2916.2916870117188, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.013514544814825058, | |
| "kl": 2.8304755687713623e-05, | |
| "learning_rate": 6e-08, | |
| "loss": -0.0004, | |
| "reward": -0.08783391118049622, | |
| "reward_std": 0.06906898692250252, | |
| "rewards/cosine_scaled_reward": -0.29415931552648544, | |
| "rewards/format_reward": 0.25, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 3141.6666870117188, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.025566047057509422, | |
| "kl": 3.069639205932617e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0065, | |
| "reward": -0.06564766296651214, | |
| "reward_std": 0.12986623123288155, | |
| "rewards/cosine_scaled_reward": -0.23140252009034157, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 2664.0834350585938, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.04862683638930321, | |
| "kl": 2.962350845336914e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0221, | |
| "reward": 0.18510469514876604, | |
| "reward_std": 0.21939480304718018, | |
| "rewards/cosine_scaled_reward": 0.09575768932700157, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 2857.3750610351562, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.038511428982019424, | |
| "kl": 4.437565803527832e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0043, | |
| "reward": 0.1307828463613987, | |
| "reward_std": 0.19989176094532013, | |
| "rewards/cosine_scaled_reward": 0.01211475022137165, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2418.7500915527344, | |
| "epoch": 0.008, | |
| "grad_norm": 0.029459422454237938, | |
| "kl": 3.2007694244384766e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.02, | |
| "reward": 0.1962461294606328, | |
| "reward_std": 0.17905803676694632, | |
| "rewards/cosine_scaled_reward": 0.09094582125544548, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2767.2500610351562, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.037820421159267426, | |
| "kl": 3.612041473388672e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0142, | |
| "reward": 0.029022796778008342, | |
| "reward_std": 0.1923000067472458, | |
| "rewards/cosine_scaled_reward": -0.18395604193210602, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3081.791748046875, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.03357498720288277, | |
| "kl": 4.169344902038574e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0118, | |
| "reward": -0.014055831357836723, | |
| "reward_std": 0.17392348684370518, | |
| "rewards/cosine_scaled_reward": -0.16225658729672432, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2440.9375915527344, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.026523390784859657, | |
| "kl": 2.504885196685791e-05, | |
| "learning_rate": 2e-07, | |
| "loss": -0.0028, | |
| "reward": 0.11565085151232779, | |
| "reward_std": 0.18074453435838223, | |
| "rewards/cosine_scaled_reward": -0.0493972972035408, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 2641.0834350585938, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.03831341862678528, | |
| "kl": 3.775954246520996e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0122, | |
| "reward": 0.14405631087720394, | |
| "reward_std": 0.27509575337171555, | |
| "rewards/cosine_scaled_reward": 0.019670148380100727, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2534.7709350585938, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.043128155171871185, | |
| "kl": 2.823770046234131e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0201, | |
| "reward": 0.14107368607074022, | |
| "reward_std": 0.1852603331208229, | |
| "rewards/cosine_scaled_reward": -0.01594308251515031, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2283.645965576172, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.02288592979311943, | |
| "kl": 1.7851591110229492e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0015, | |
| "reward": 0.14897193014621735, | |
| "reward_std": 0.1799948439002037, | |
| "rewards/cosine_scaled_reward": 0.01838136836886406, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2973.7083740234375, | |
| "epoch": 0.016, | |
| "grad_norm": 0.038055043667554855, | |
| "kl": 3.159046173095703e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0089, | |
| "reward": 0.03552349330857396, | |
| "reward_std": 0.1976378969848156, | |
| "rewards/cosine_scaled_reward": -0.1335146650671959, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 3087.9791870117188, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.03934551402926445, | |
| "kl": 3.743171691894531e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0161, | |
| "reward": 0.05057712458074093, | |
| "reward_std": 0.24239419028162956, | |
| "rewards/cosine_scaled_reward": -0.06737793236970901, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 2387.6459350585938, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.031069820746779442, | |
| "kl": 1.7777085304260254e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.009, | |
| "reward": 0.17460840800777078, | |
| "reward_std": 0.17416980862617493, | |
| "rewards/cosine_scaled_reward": 0.08253358863294125, | |
| "rewards/format_reward": 0.5, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 3202.7084350585938, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.025070542469620705, | |
| "kl": 4.374980926513672e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0137, | |
| "reward": -0.05761076137423515, | |
| "reward_std": 0.15629566833376884, | |
| "rewards/cosine_scaled_reward": -0.22459998354315758, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2976.2709350585938, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.05030415952205658, | |
| "kl": 5.245208740234375e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0251, | |
| "reward": -0.041604380938224494, | |
| "reward_std": 0.1566980741918087, | |
| "rewards/cosine_scaled_reward": -0.21686899568885565, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3212.2500610351562, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.03290052339434624, | |
| "kl": 2.6613473892211914e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.01, | |
| "reward": 0.10608524829149246, | |
| "reward_std": 0.21430233865976334, | |
| "rewards/cosine_scaled_reward": -0.003221757709980011, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2790.166748046875, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.03484330698847771, | |
| "kl": 4.336237907409668e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0037, | |
| "reward": 0.08177977707237005, | |
| "reward_std": 0.18684325739741325, | |
| "rewards/cosine_scaled_reward": -0.062129486352205276, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2680.0625610351562, | |
| "epoch": 0.024, | |
| "grad_norm": 0.014080243185162544, | |
| "kl": 2.5466084480285645e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.002, | |
| "reward": 0.0883646197617054, | |
| "reward_std": 0.09925123862922192, | |
| "rewards/cosine_scaled_reward": -0.048521531745791435, | |
| "rewards/format_reward": 0.4375, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 3361.1666870117188, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.04737177491188049, | |
| "kl": 2.92360782623291e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0134, | |
| "reward": 0.014684513211250305, | |
| "reward_std": 0.2061120942234993, | |
| "rewards/cosine_scaled_reward": -0.09447065740823746, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 3197.2291870117188, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.05002724751830101, | |
| "kl": 4.026293754577637e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0212, | |
| "reward": -0.007510215509682894, | |
| "reward_std": 0.18628919124603271, | |
| "rewards/cosine_scaled_reward": -0.17245101649314165, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2225.2500610351562, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.03594079613685608, | |
| "kl": 1.3820827007293701e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0095, | |
| "reward": 0.12880078703165054, | |
| "reward_std": 0.18869203887879848, | |
| "rewards/cosine_scaled_reward": -0.015991515130735934, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2843.25, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.013156503438949585, | |
| "kl": 1.4103949069976807e-05, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0004, | |
| "reward": 0.09813834726810455, | |
| "reward_std": 0.10752132721245289, | |
| "rewards/cosine_scaled_reward": 0.0026400238275527954, | |
| "rewards/format_reward": 0.375, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3197.2500610351562, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.023827390745282173, | |
| "kl": 1.9887462258338928e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.001, | |
| "reward": 0.01849794015288353, | |
| "reward_std": 0.16981271095573902, | |
| "rewards/cosine_scaled_reward": -0.0721497293561697, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 3171.1458740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.03777868300676346, | |
| "kl": 1.5690922737121582e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0128, | |
| "reward": 0.005505116190761328, | |
| "reward_std": 0.14036701992154121, | |
| "rewards/cosine_scaled_reward": -0.10327141731977463, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 3171.1250610351562, | |
| "epoch": 0.032, | |
| "grad_norm": 0.0430736169219017, | |
| "kl": 3.170967102050781e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0143, | |
| "reward": 0.05259059742093086, | |
| "reward_std": 0.21468006633222103, | |
| "rewards/cosine_scaled_reward": -0.05536823160946369, | |
| "rewards/format_reward": 0.3125000149011612, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3201.604248046875, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.027703089639544487, | |
| "kl": 1.7091631889343262e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.007, | |
| "reward": -0.00044601038098335266, | |
| "reward_std": 0.2002517506480217, | |
| "rewards/cosine_scaled_reward": -0.12858079187572002, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3259.854248046875, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.06692618131637573, | |
| "kl": 3.02046537399292e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0137, | |
| "reward": 0.08933593705296516, | |
| "reward_std": 0.23386941105127335, | |
| "rewards/cosine_scaled_reward": -0.0027976278215646744, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2404.8333740234375, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.017105773091316223, | |
| "kl": 3.102421760559082e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0061, | |
| "reward": 0.06446321186376736, | |
| "reward_std": 0.1481246892362833, | |
| "rewards/cosine_scaled_reward": -0.12002913188189268, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3512.2708740234375, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.031214792281389236, | |
| "kl": 3.38628888130188e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0065, | |
| "reward": -0.0885147238150239, | |
| "reward_std": 0.14898579940199852, | |
| "rewards/cosine_scaled_reward": -0.235802935436368, | |
| "rewards/format_reward": 0.125, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3107.0834350585938, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.03168352693319321, | |
| "kl": 1.749396324157715e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0107, | |
| "reward": 0.016022177413105965, | |
| "reward_std": 0.20783882588148117, | |
| "rewards/cosine_scaled_reward": -0.12545084208250046, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 3184.2083740234375, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.02989567629992962, | |
| "kl": 1.3366341590881348e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0011, | |
| "reward": -0.06510111520765349, | |
| "reward_std": 0.17171696946024895, | |
| "rewards/cosine_scaled_reward": -0.2292976714670658, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 2767.666748046875, | |
| "epoch": 0.04, | |
| "grad_norm": 0.025272978469729424, | |
| "kl": 5.745887756347656e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0118, | |
| "reward": 0.06672569224610925, | |
| "reward_std": 0.2056892216205597, | |
| "rewards/cosine_scaled_reward": -0.10747915878891945, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 2590.3750610351562, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.033443205058574677, | |
| "kl": 0.00014869868755340576, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0071, | |
| "reward": 0.20523509569466114, | |
| "reward_std": 0.10853379778563976, | |
| "rewards/cosine_scaled_reward": 0.13563355803489685, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 2841.0209350585938, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.03333444148302078, | |
| "kl": 0.0001373291015625, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0119, | |
| "reward": 0.05081416107714176, | |
| "reward_std": 0.1740802899003029, | |
| "rewards/cosine_scaled_reward": -0.13561652414500713, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3074.7083740234375, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.03237065300345421, | |
| "kl": 0.0002734661102294922, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0074, | |
| "reward": 0.014112615492194891, | |
| "reward_std": 0.18836599960923195, | |
| "rewards/cosine_scaled_reward": -0.18090857629431412, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2810.916717529297, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.031698137521743774, | |
| "kl": 0.0002893805503845215, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0138, | |
| "reward": 0.09018783806823194, | |
| "reward_std": 0.2195709180086851, | |
| "rewards/cosine_scaled_reward": -0.0747589748352766, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 3247.666748046875, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.02582962065935135, | |
| "kl": 0.00012274831533432007, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0036, | |
| "reward": -0.0030554812401533127, | |
| "reward_std": 0.1791013963520527, | |
| "rewards/cosine_scaled_reward": -0.12213349156081676, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3199.0000610351562, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.021616114303469658, | |
| "kl": 5.43445348739624e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0115, | |
| "reward": 0.045759374275803566, | |
| "reward_std": 0.1401870008558035, | |
| "rewards/cosine_scaled_reward": -0.04788116551935673, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2042.1666870117188, | |
| "epoch": 0.048, | |
| "grad_norm": 0.024571280926465988, | |
| "kl": 0.0006382614374160767, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0049, | |
| "reward": 0.19183703139424324, | |
| "reward_std": 0.2046101875603199, | |
| "rewards/cosine_scaled_reward": 0.016680144472047687, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3039.8126220703125, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.0402773953974247, | |
| "kl": 0.00012445449829101562, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0125, | |
| "reward": 0.051199857611209154, | |
| "reward_std": 0.21126097440719604, | |
| "rewards/cosine_scaled_reward": -0.08657800313085318, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2994.9375610351562, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.02581031620502472, | |
| "kl": 0.0009569525718688965, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0059, | |
| "reward": 0.07120304740965366, | |
| "reward_std": 0.13958723843097687, | |
| "rewards/cosine_scaled_reward": -0.08261704817414284, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 2736.604248046875, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.03486928716301918, | |
| "kl": 9.66489315032959e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0078, | |
| "reward": 0.20642845891416073, | |
| "reward_std": 0.22643720731139183, | |
| "rewards/cosine_scaled_reward": 0.09864461561664939, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3089.5209045410156, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.026618344709277153, | |
| "kl": 0.00014385581016540527, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0051, | |
| "reward": 0.03118098562117666, | |
| "reward_std": 0.13040862046182156, | |
| "rewards/cosine_scaled_reward": -0.09654540452174842, | |
| "rewards/format_reward": 0.3125, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2578.041748046875, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.03371405601501465, | |
| "kl": 0.00024366378784179688, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0229, | |
| "reward": 0.12485338561236858, | |
| "reward_std": 0.2258769404143095, | |
| "rewards/cosine_scaled_reward": 0.0087130106985569, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2776.0625610351562, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.04292596876621246, | |
| "kl": 0.0001995563507080078, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0356, | |
| "reward": 0.025183293968439102, | |
| "reward_std": 0.2486670520156622, | |
| "rewards/cosine_scaled_reward": -0.13703527487814426, | |
| "rewards/format_reward": 0.3750000223517418, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2329.666732788086, | |
| "epoch": 0.056, | |
| "grad_norm": 0.04059920459985733, | |
| "kl": 0.00020240247249603271, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0221, | |
| "reward": 0.133682232350111, | |
| "reward_std": 0.24662534147500992, | |
| "rewards/cosine_scaled_reward": -0.0048737190663814545, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2363.9375610351562, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.020762886852025986, | |
| "kl": 0.0009932518005371094, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 0.12870622240006924, | |
| "reward_std": 0.15025716833770275, | |
| "rewards/cosine_scaled_reward": 0.008234108798205853, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 3336.6250610351562, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.032446328550577164, | |
| "kl": 0.00037729740142822266, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0037, | |
| "reward": -0.04826304782181978, | |
| "reward_std": 0.20281681418418884, | |
| "rewards/cosine_scaled_reward": -0.21985225845128298, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 3487.7708740234375, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.03511150926351547, | |
| "kl": 0.00035959482192993164, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0078, | |
| "reward": -0.02223890647292137, | |
| "reward_std": 0.17184382304549217, | |
| "rewards/cosine_scaled_reward": -0.11892467923462391, | |
| "rewards/format_reward": 0.14583333767950535, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2940.854248046875, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.029133958742022514, | |
| "kl": 0.0005671381950378418, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0156, | |
| "reward": 0.023026579525321722, | |
| "reward_std": 0.18880045041441917, | |
| "rewards/cosine_scaled_reward": -0.11227694898843765, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2591.2291870117188, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.019278600811958313, | |
| "kl": 0.0012264251708984375, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": -0.0001, | |
| "reward": -0.03725188039243221, | |
| "reward_std": 0.10213315486907959, | |
| "rewards/cosine_scaled_reward": -0.2596123218536377, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3093.7708740234375, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.02859681472182274, | |
| "kl": 0.00033676624298095703, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0074, | |
| "reward": -0.03638360369950533, | |
| "reward_std": 0.19044485688209534, | |
| "rewards/cosine_scaled_reward": -0.20511764287948608, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 3074.479248046875, | |
| "epoch": 0.064, | |
| "grad_norm": 0.019804302603006363, | |
| "kl": 0.0012226104736328125, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": -0.002, | |
| "reward": 0.045626044273376465, | |
| "reward_std": 0.14548260904848576, | |
| "rewards/cosine_scaled_reward": -0.05084723234176636, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 2896.666748046875, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.24302555620670319, | |
| "kl": 0.012219905853271484, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0065, | |
| "reward": 0.031026489101350307, | |
| "reward_std": 0.1758255548775196, | |
| "rewards/cosine_scaled_reward": -0.12736675469204783, | |
| "rewards/format_reward": 0.375, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 3172.2708740234375, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.03474734351038933, | |
| "kl": 0.0015817880630493164, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0038, | |
| "reward": -0.054731689393520355, | |
| "reward_std": 0.1664622612297535, | |
| "rewards/cosine_scaled_reward": -0.23088792711496353, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3034.291748046875, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.025463465601205826, | |
| "kl": 0.0010409355163574219, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0064, | |
| "reward": 0.09089667443186045, | |
| "reward_std": 0.13595283962786198, | |
| "rewards/cosine_scaled_reward": 0.04182465001940727, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2733.5625610351562, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.016339808702468872, | |
| "kl": 0.0006044507026672363, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0001, | |
| "reward": 0.11181952990591526, | |
| "reward_std": 0.12588856369256973, | |
| "rewards/cosine_scaled_reward": 0.0365639328956604, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2489.125030517578, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.03386380895972252, | |
| "kl": 0.0002994537353515625, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0193, | |
| "reward": 0.22211400046944618, | |
| "reward_std": 0.18361974507570267, | |
| "rewards/cosine_scaled_reward": 0.18788734078407288, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 3384.479248046875, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.03864703327417374, | |
| "kl": 0.0004179924726486206, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0143, | |
| "reward": 0.016021604649722576, | |
| "reward_std": 0.24887718260288239, | |
| "rewards/cosine_scaled_reward": -0.10296530975028872, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2070.104202270508, | |
| "epoch": 0.072, | |
| "grad_norm": 0.031416576355695724, | |
| "kl": 0.0014023780822753906, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0161, | |
| "reward": 0.10677355155348778, | |
| "reward_std": 0.21287357807159424, | |
| "rewards/cosine_scaled_reward": -0.08884532982483506, | |
| "rewards/format_reward": 0.5833333544433117, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2983.3125, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.03912929818034172, | |
| "kl": 0.00029218196868896484, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.023, | |
| "reward": 0.1314917542040348, | |
| "reward_std": 0.2512434311211109, | |
| "rewards/cosine_scaled_reward": 0.035050878301262856, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2392.0833740234375, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.03131681680679321, | |
| "kl": 0.0009235143661499023, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0145, | |
| "reward": 0.16075028525665402, | |
| "reward_std": 0.1973189115524292, | |
| "rewards/cosine_scaled_reward": 0.028246548026800156, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2986.979248046875, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.026520250365138054, | |
| "kl": 0.0007143020629882812, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0042, | |
| "reward": 0.06894214591011405, | |
| "reward_std": 0.17750076204538345, | |
| "rewards/cosine_scaled_reward": -0.07307655829936266, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3122.5416870117188, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.1043124571442604, | |
| "kl": 0.0006694793701171875, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0199, | |
| "reward": 0.1262295600026846, | |
| "reward_std": 0.2161055188626051, | |
| "rewards/cosine_scaled_reward": 0.054833441972732544, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2514.8125915527344, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.01841914653778076, | |
| "kl": 0.0015196800231933594, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0012, | |
| "reward": 0.022539040073752403, | |
| "reward_std": 0.1281664203852415, | |
| "rewards/cosine_scaled_reward": -0.17603367939591408, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2365.0625610351562, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.028957482427358627, | |
| "kl": 0.0005636215209960938, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0077, | |
| "reward": 0.14998421107884496, | |
| "reward_std": 0.21046538464725018, | |
| "rewards/cosine_scaled_reward": 0.01922529563307762, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3070.3125610351562, | |
| "epoch": 0.08, | |
| "grad_norm": 0.04615132138133049, | |
| "kl": 0.0005130767822265625, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0216, | |
| "reward": 0.07898092828691006, | |
| "reward_std": 0.24917711317539215, | |
| "rewards/cosine_scaled_reward": -0.05244798678904772, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2762.4584350585938, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.04430554807186127, | |
| "kl": 0.0019788742065429688, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.015, | |
| "reward": 0.09884738456457853, | |
| "reward_std": 0.18450742959976196, | |
| "rewards/cosine_scaled_reward": -0.06116265989840031, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2907.229278564453, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.03771476075053215, | |
| "kl": 0.001185297966003418, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0099, | |
| "reward": 0.1490677747351583, | |
| "reward_std": 0.2114352360367775, | |
| "rewards/cosine_scaled_reward": 0.05688696587458253, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 1758.7292175292969, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.05024447292089462, | |
| "kl": 0.002407073974609375, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0237, | |
| "reward": 0.23553117364645004, | |
| "reward_std": 0.267602801322937, | |
| "rewards/cosine_scaled_reward": 0.06681013852357864, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2650.7500610351562, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.03339356556534767, | |
| "kl": 0.001583099365234375, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.005, | |
| "reward": 0.0968302907422185, | |
| "reward_std": 0.212956627830863, | |
| "rewards/cosine_scaled_reward": -0.06551082618534565, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2749.6250610351562, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.050951387733221054, | |
| "kl": 0.0025119781494140625, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0204, | |
| "reward": 0.01578517910093069, | |
| "reward_std": 0.19141812063753605, | |
| "rewards/cosine_scaled_reward": -0.16943419352173805, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2956.166748046875, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.03326231986284256, | |
| "kl": 0.0010066032409667969, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0099, | |
| "reward": 0.08347196271643043, | |
| "reward_std": 0.2046191357076168, | |
| "rewards/cosine_scaled_reward": -0.06869178358465433, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3100.0000610351562, | |
| "epoch": 0.088, | |
| "grad_norm": 0.03300923481583595, | |
| "kl": 0.0013952255249023438, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0109, | |
| "reward": 0.058561597019433975, | |
| "reward_std": 0.16000331193208694, | |
| "rewards/cosine_scaled_reward": -0.01991163194179535, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2844.5208740234375, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.02408880740404129, | |
| "kl": 0.0008955001831054688, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": -0.0039, | |
| "reward": 0.1578286881558597, | |
| "reward_std": 0.16523915342986584, | |
| "rewards/cosine_scaled_reward": 0.05352177284657955, | |
| "rewards/format_reward": 0.5, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2869.9168090820312, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.031349629163742065, | |
| "kl": 0.0007748603820800781, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0061, | |
| "reward": 0.048253659158945084, | |
| "reward_std": 0.1529301442205906, | |
| "rewards/cosine_scaled_reward": -0.1452452540397644, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3500.6459350585938, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.026259545236825943, | |
| "kl": 0.0009374618530273438, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0065, | |
| "reward": -0.06878593238070607, | |
| "reward_std": 0.15760904923081398, | |
| "rewards/cosine_scaled_reward": -0.18420086801052094, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3155.9375, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.029411010444164276, | |
| "kl": 0.00327301025390625, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0049, | |
| "reward": 0.014610872138291597, | |
| "reward_std": 0.17320290207862854, | |
| "rewards/cosine_scaled_reward": -0.10505690053105354, | |
| "rewards/format_reward": 0.27083334513008595, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 3102.1250610351562, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.05089900642633438, | |
| "kl": 0.0019311904907226562, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0278, | |
| "reward": 0.02155976463109255, | |
| "reward_std": 0.2168351523578167, | |
| "rewards/cosine_scaled_reward": -0.1256045438349247, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 3404.291748046875, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.035792965441942215, | |
| "kl": 0.0009613037109375, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0068, | |
| "reward": -0.054995930986478925, | |
| "reward_std": 0.1765495426952839, | |
| "rewards/cosine_scaled_reward": -0.2007056437432766, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2769.916748046875, | |
| "epoch": 0.096, | |
| "grad_norm": 0.07247019559144974, | |
| "kl": 0.0014896392822265625, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0483, | |
| "reward": 0.2181787397712469, | |
| "reward_std": 0.298471137881279, | |
| "rewards/cosine_scaled_reward": 0.14841599948704243, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3060.5416870117188, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.02733037807047367, | |
| "kl": 0.0017032623291015625, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0037, | |
| "reward": 0.11667349748313427, | |
| "reward_std": 0.18902140855789185, | |
| "rewards/cosine_scaled_reward": 0.017972776666283607, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2869.0418090820312, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.025132693350315094, | |
| "kl": 0.0016460418701171875, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0015, | |
| "reward": 0.004191828717011958, | |
| "reward_std": 0.1505029760301113, | |
| "rewards/cosine_scaled_reward": -0.17764563113451004, | |
| "rewards/format_reward": 0.375, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 3099.104248046875, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.059636954218149185, | |
| "kl": 0.004062652587890625, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0128, | |
| "reward": -0.008709938578249421, | |
| "reward_std": 0.15343913435935974, | |
| "rewards/cosine_scaled_reward": -0.15361853811191395, | |
| "rewards/format_reward": 0.27083334513008595, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2932.2709350585938, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.04077547416090965, | |
| "kl": 0.003711700439453125, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0113, | |
| "reward": 0.12602957291528583, | |
| "reward_std": 0.19421576336026192, | |
| "rewards/cosine_scaled_reward": 0.005262332037091255, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3258.0208740234375, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.045014455914497375, | |
| "kl": 0.0016326904296875, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0199, | |
| "reward": 0.05050719529390335, | |
| "reward_std": 0.23298067785799503, | |
| "rewards/cosine_scaled_reward": -0.03483690321445465, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 3226.8125, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.03448318690061569, | |
| "kl": 0.002132415771484375, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0151, | |
| "reward": 0.011773815378546715, | |
| "reward_std": 0.20811042189598083, | |
| "rewards/cosine_scaled_reward": -0.10306294728070498, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3281.7291870117188, | |
| "epoch": 0.104, | |
| "grad_norm": 0.02660350129008293, | |
| "kl": 0.0030241012573242188, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0099, | |
| "reward": 0.013969017774797976, | |
| "reward_std": 0.1608537659049034, | |
| "rewards/cosine_scaled_reward": -0.10871240869164467, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2845.229248046875, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.05236576870083809, | |
| "kl": 0.00365447998046875, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0083, | |
| "reward": 0.13860167190432549, | |
| "reward_std": 0.18116620555520058, | |
| "rewards/cosine_scaled_reward": 0.028314366936683655, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 2523.0625610351562, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.014723028987646103, | |
| "kl": 0.0022420883178710938, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0078, | |
| "reward": 0.09603097569197416, | |
| "reward_std": 0.13689620420336723, | |
| "rewards/cosine_scaled_reward": -0.04340764880180359, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3406.3333740234375, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.04505663737654686, | |
| "kl": 0.0037384033203125, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0146, | |
| "reward": 0.05643160891486332, | |
| "reward_std": 0.26544057205319405, | |
| "rewards/cosine_scaled_reward": -0.024257462471723557, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3115.4584350585938, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.023418016731739044, | |
| "kl": 0.0017099380493164062, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0086, | |
| "reward": -0.035079272696748376, | |
| "reward_std": 0.16244838573038578, | |
| "rewards/cosine_scaled_reward": -0.1955430954694748, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2528.9584350585938, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.03457753732800484, | |
| "kl": 0.0033416748046875, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0193, | |
| "reward": 0.10984421521425247, | |
| "reward_std": 0.19435735791921616, | |
| "rewards/cosine_scaled_reward": -0.04057668708264828, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3069.1458740234375, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.031795185059309006, | |
| "kl": 0.004277229309082031, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0134, | |
| "reward": 0.01800657995045185, | |
| "reward_std": 0.2002384178340435, | |
| "rewards/cosine_scaled_reward": -0.10626722499728203, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2833.416717529297, | |
| "epoch": 0.112, | |
| "grad_norm": 0.038945551961660385, | |
| "kl": 0.005168914794921875, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.01, | |
| "reward": 0.037136003375053406, | |
| "reward_std": 0.14658785611391068, | |
| "rewards/cosine_scaled_reward": -0.12575949728488922, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2793.166717529297, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.026777496561408043, | |
| "kl": 0.0022182464599609375, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0071, | |
| "reward": 0.09772413596510887, | |
| "reward_std": 0.17552867345511913, | |
| "rewards/cosine_scaled_reward": -0.04167799465358257, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2965.8334350585938, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.04113907739520073, | |
| "kl": 0.00261688232421875, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0123, | |
| "reward": 0.06349668698385358, | |
| "reward_std": 0.21920687705278397, | |
| "rewards/cosine_scaled_reward": -0.08436030335724354, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2955.4375, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.03473270311951637, | |
| "kl": 0.0041179656982421875, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0187, | |
| "reward": 0.06792283244431019, | |
| "reward_std": 0.19342709705233574, | |
| "rewards/cosine_scaled_reward": -0.06693907268345356, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2753.7501220703125, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.026932649314403534, | |
| "kl": 0.003070831298828125, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0054, | |
| "reward": 0.06435079779475927, | |
| "reward_std": 0.17049439996480942, | |
| "rewards/cosine_scaled_reward": -0.08317361772060394, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 3143.541748046875, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.03425557166337967, | |
| "kl": 0.003162384033203125, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0113, | |
| "reward": 0.06312601827085018, | |
| "reward_std": 0.21065171249210835, | |
| "rewards/cosine_scaled_reward": -0.0576252955943346, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2831.7918090820312, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.034013714641332626, | |
| "kl": 0.0024318695068359375, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0121, | |
| "reward": 0.1945239887572825, | |
| "reward_std": 0.205114483833313, | |
| "rewards/cosine_scaled_reward": 0.15984268113970757, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2831.7291870117188, | |
| "epoch": 0.12, | |
| "grad_norm": 0.03120482712984085, | |
| "kl": 0.004947662353515625, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.015, | |
| "reward": -0.00098506297217682, | |
| "reward_std": 0.16559923999011517, | |
| "rewards/cosine_scaled_reward": -0.16364420438185334, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 3012.3958740234375, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.0529262013733387, | |
| "kl": 0.004772186279296875, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0305, | |
| "reward": 0.0871598981320858, | |
| "reward_std": 0.23246358335018158, | |
| "rewards/cosine_scaled_reward": 0.01361493207514286, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2419.500030517578, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.038417600095272064, | |
| "kl": 0.0029621124267578125, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0205, | |
| "reward": 0.18521913886070251, | |
| "reward_std": 0.25690846890211105, | |
| "rewards/cosine_scaled_reward": 0.05618660245090723, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2302.0208740234375, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.03406599164009094, | |
| "kl": 0.0060882568359375, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0102, | |
| "reward": 0.2518926318734884, | |
| "reward_std": 0.16866662353277206, | |
| "rewards/cosine_scaled_reward": 0.16351257264614105, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2855.0416870117188, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.02359769493341446, | |
| "kl": 0.003265380859375, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0062, | |
| "reward": 0.07727389223873615, | |
| "reward_std": 0.13795860763639212, | |
| "rewards/cosine_scaled_reward": -0.08188081672415137, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2375.2083740234375, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.05410967394709587, | |
| "kl": 0.006961822509765625, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0198, | |
| "reward": 0.34625594690442085, | |
| "reward_std": 0.23936187848448753, | |
| "rewards/cosine_scaled_reward": 0.2613089978694916, | |
| "rewards/format_reward": 0.8125, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 1960.8125610351562, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.025244763121008873, | |
| "kl": 0.005954742431640625, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0028, | |
| "reward": 0.18395254015922546, | |
| "reward_std": 0.18541419506072998, | |
| "rewards/cosine_scaled_reward": 0.008121738210320473, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 2942.354248046875, | |
| "epoch": 0.128, | |
| "grad_norm": 0.04222922772169113, | |
| "kl": 0.004268646240234375, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0058, | |
| "reward": 0.09395160432904959, | |
| "reward_std": 0.19945074431598186, | |
| "rewards/cosine_scaled_reward": -0.03249247372150421, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2683.75, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.01863986626267433, | |
| "kl": 0.009937286376953125, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0008, | |
| "reward": 0.15153072029352188, | |
| "reward_std": 0.11458084732294083, | |
| "rewards/cosine_scaled_reward": 0.04880138114094734, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 1857.2709045410156, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.022669972851872444, | |
| "kl": 0.003658294677734375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.006, | |
| "reward": 0.20419701468199492, | |
| "reward_std": 0.12225367687642574, | |
| "rewards/cosine_scaled_reward": 0.046822127886116505, | |
| "rewards/format_reward": 0.6875, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2935.6458740234375, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.07691904902458191, | |
| "kl": 0.00464630126953125, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0076, | |
| "reward": 0.2052596650319174, | |
| "reward_std": 0.25133959483355284, | |
| "rewards/cosine_scaled_reward": 0.15499752014875412, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 2854.166748046875, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.033508677035570145, | |
| "kl": 0.00675201416015625, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0178, | |
| "reward": 0.011383330449461937, | |
| "reward_std": 0.1784326285123825, | |
| "rewards/cosine_scaled_reward": -0.13474150374531746, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3279.541748046875, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.024763749912381172, | |
| "kl": 0.00537872314453125, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0181, | |
| "reward": -0.08736757189035416, | |
| "reward_std": 0.15839183516800404, | |
| "rewards/cosine_scaled_reward": -0.2729394882917404, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2247.2083740234375, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.021144714206457138, | |
| "kl": 0.00655364990234375, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0061, | |
| "reward": 0.16257739253342152, | |
| "reward_std": 0.14550958015024662, | |
| "rewards/cosine_scaled_reward": -0.02890787273645401, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2390.3750610351562, | |
| "epoch": 0.136, | |
| "grad_norm": 0.017760999500751495, | |
| "kl": 0.00719451904296875, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0077, | |
| "reward": 0.08260644786059856, | |
| "reward_std": 0.1435317825525999, | |
| "rewards/cosine_scaled_reward": -0.09051773697137833, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2852.1666870117188, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.05494442954659462, | |
| "kl": 0.004840850830078125, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0235, | |
| "reward": 0.027354625213774852, | |
| "reward_std": 0.22287173569202423, | |
| "rewards/cosine_scaled_reward": -0.1646816898137331, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 3042.6458435058594, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.03370098024606705, | |
| "kl": 0.00531005859375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0066, | |
| "reward": 0.07949573546648026, | |
| "reward_std": 0.1635687043890357, | |
| "rewards/cosine_scaled_reward": -0.03458750061690807, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 3073.3126220703125, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.02363232709467411, | |
| "kl": 0.01202392578125, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0063, | |
| "reward": 0.0004803319461643696, | |
| "reward_std": 0.13274768367409706, | |
| "rewards/cosine_scaled_reward": -0.15389488637447357, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2623.8958740234375, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.3587798774242401, | |
| "kl": 0.15277862548828125, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0171, | |
| "reward": 0.0874692378565669, | |
| "reward_std": 0.20633260160684586, | |
| "rewards/cosine_scaled_reward": -0.04201580956578255, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2558.8750610351562, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.042803164571523666, | |
| "kl": 0.004398345947265625, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0198, | |
| "reward": 0.1571647897362709, | |
| "reward_std": 0.254299595952034, | |
| "rewards/cosine_scaled_reward": 0.03907129663275555, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 3026.5834350585938, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.03110657073557377, | |
| "kl": 0.00452423095703125, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0103, | |
| "reward": 0.06283934926614165, | |
| "reward_std": 0.18192237615585327, | |
| "rewards/cosine_scaled_reward": -0.0575394481420517, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2433.8125610351562, | |
| "epoch": 0.144, | |
| "grad_norm": 0.06285406649112701, | |
| "kl": 0.0075016021728515625, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0121, | |
| "reward": 0.18290065601468086, | |
| "reward_std": 0.2167891375720501, | |
| "rewards/cosine_scaled_reward": 0.039275931660085917, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2855.8750610351562, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.028601203113794327, | |
| "kl": 0.0057621002197265625, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0097, | |
| "reward": 0.013635631650686264, | |
| "reward_std": 0.19056180864572525, | |
| "rewards/cosine_scaled_reward": -0.1619595978409052, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2293.5001220703125, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.03725991025567055, | |
| "kl": 0.00472259521484375, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0051, | |
| "reward": 0.12247474305331707, | |
| "reward_std": 0.16250892356038094, | |
| "rewards/cosine_scaled_reward": -0.04633878357708454, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 2054.7916870117188, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.0803324356675148, | |
| "kl": 0.006214141845703125, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.043, | |
| "reward": 0.14053261652588844, | |
| "reward_std": 0.2526179291307926, | |
| "rewards/cosine_scaled_reward": -0.0758588750322815, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2986.5834350585938, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.02577091194689274, | |
| "kl": 0.004703521728515625, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0029, | |
| "reward": 0.08058308716863394, | |
| "reward_std": 0.16763485223054886, | |
| "rewards/cosine_scaled_reward": -0.023030042182654142, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2618.8126220703125, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.06951504200696945, | |
| "kl": 0.00390625, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0146, | |
| "reward": 0.19235911988653243, | |
| "reward_std": 0.18755021132528782, | |
| "rewards/cosine_scaled_reward": 0.07008487358689308, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2662.4168090820312, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.06918520480394363, | |
| "kl": 0.00621795654296875, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0217, | |
| "reward": 0.024238456040620804, | |
| "reward_std": 0.1943957842886448, | |
| "rewards/cosine_scaled_reward": -0.16076999064534903, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 2992.2501220703125, | |
| "epoch": 0.152, | |
| "grad_norm": 0.040292322635650635, | |
| "kl": 0.0078582763671875, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0287, | |
| "reward": -0.008192204404622316, | |
| "reward_std": 0.20025834068655968, | |
| "rewards/cosine_scaled_reward": -0.18358655413612723, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2066.666717529297, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.03623603284358978, | |
| "kl": 0.00725555419921875, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0111, | |
| "reward": 0.13319938629865646, | |
| "reward_std": 0.206376563757658, | |
| "rewards/cosine_scaled_reward": -0.07876455131918192, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2176.8959045410156, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.01887073926627636, | |
| "kl": 0.003795623779296875, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0126, | |
| "reward": 0.05480904504656792, | |
| "reward_std": 0.16762719117105007, | |
| "rewards/cosine_scaled_reward": -0.1592541355639696, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2238.1041870117188, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.02975233644247055, | |
| "kl": 0.00510406494140625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0072, | |
| "reward": 0.20237390510737896, | |
| "reward_std": 0.1938447467982769, | |
| "rewards/cosine_scaled_reward": 0.053038330748677254, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2681.3750610351562, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.04092983901500702, | |
| "kl": 0.0087890625, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.027, | |
| "reward": 0.019164174795150757, | |
| "reward_std": 0.17843515612185, | |
| "rewards/cosine_scaled_reward": -0.1895993035286665, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 1762.2708740234375, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.03509270027279854, | |
| "kl": 0.0040435791015625, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0217, | |
| "reward": 0.26725751906633377, | |
| "reward_std": 0.20763983204960823, | |
| "rewards/cosine_scaled_reward": 0.15629079565405846, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2035.2916870117188, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.04347836971282959, | |
| "kl": 0.00453948974609375, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0227, | |
| "reward": 0.18224887805990875, | |
| "reward_std": 0.24457651004195213, | |
| "rewards/cosine_scaled_reward": 0.014037903398275375, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2067.604217529297, | |
| "epoch": 0.16, | |
| "grad_norm": 0.015685537829995155, | |
| "kl": 0.005207061767578125, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0043, | |
| "reward": 0.16747045516967773, | |
| "reward_std": 0.14560667239129543, | |
| "rewards/cosine_scaled_reward": -0.0546467499807477, | |
| "rewards/format_reward": 0.75, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2262.0208740234375, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.043918903917074203, | |
| "kl": 0.00701141357421875, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0142, | |
| "reward": 0.1650593876838684, | |
| "reward_std": 0.21372229792177677, | |
| "rewards/cosine_scaled_reward": -0.0056916698813438416, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2207.2291870117188, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.047901369631290436, | |
| "kl": 0.007045745849609375, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.03, | |
| "reward": 0.21545884013175964, | |
| "reward_std": 0.25139569491147995, | |
| "rewards/cosine_scaled_reward": 0.10615230980329216, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2436.166748046875, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.016415616497397423, | |
| "kl": 0.006572723388671875, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0098, | |
| "reward": 0.12664190726354718, | |
| "reward_std": 0.15973762422800064, | |
| "rewards/cosine_scaled_reward": -0.03559926152229309, | |
| "rewards/format_reward": 0.5625, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2151.7916870117188, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.02632884681224823, | |
| "kl": 0.00556182861328125, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0049, | |
| "reward": 0.21057682996615767, | |
| "reward_std": 0.24973290413618088, | |
| "rewards/cosine_scaled_reward": 0.04760269448161125, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2325.8334045410156, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.030833307653665543, | |
| "kl": 0.008785247802734375, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0073, | |
| "reward": 0.09153417311608791, | |
| "reward_std": 0.2068365514278412, | |
| "rewards/cosine_scaled_reward": -0.12232109159231186, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 1995.6875610351562, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.02223813906311989, | |
| "kl": 0.005832672119140625, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0068, | |
| "reward": 0.11374105885624886, | |
| "reward_std": 0.16233688965439796, | |
| "rewards/cosine_scaled_reward": -0.1749850958585739, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 1958.3125915527344, | |
| "epoch": 0.168, | |
| "grad_norm": 0.04420648515224457, | |
| "kl": 0.004932403564453125, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0177, | |
| "reward": 0.18589897733181715, | |
| "reward_std": 0.21643339842557907, | |
| "rewards/cosine_scaled_reward": 0.012172549962997437, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 1863.854232788086, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.019762450829148293, | |
| "kl": 0.006500244140625, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0128, | |
| "reward": 0.04669401329010725, | |
| "reward_std": 0.12893508188426495, | |
| "rewards/cosine_scaled_reward": -0.277628555893898, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2386.7501220703125, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.024140046909451485, | |
| "kl": 0.00780487060546875, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0021, | |
| "reward": 0.2633550156606361, | |
| "reward_std": 0.16658334992825985, | |
| "rewards/cosine_scaled_reward": 0.1946883723139763, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2230.604217529297, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.03733440488576889, | |
| "kl": 0.01021575927734375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.02, | |
| "reward": 0.0389388483017683, | |
| "reward_std": 0.1488077249377966, | |
| "rewards/cosine_scaled_reward": -0.2300075776875019, | |
| "rewards/format_reward": 0.604166692122817, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2027.541748046875, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.023991534486413002, | |
| "kl": 0.00460052490234375, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0145, | |
| "reward": 0.09001021273434162, | |
| "reward_std": 0.1386792566627264, | |
| "rewards/cosine_scaled_reward": -0.1297392025589943, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2920.6876220703125, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.0439627505838871, | |
| "kl": 0.01219940185546875, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0108, | |
| "reward": 0.05928703024983406, | |
| "reward_std": 0.23661532253026962, | |
| "rewards/cosine_scaled_reward": -0.07486184407025576, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2236.187530517578, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.03838758543133736, | |
| "kl": 0.005950927734375, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.015, | |
| "reward": 0.15822238428518176, | |
| "reward_std": 0.18816383555531502, | |
| "rewards/cosine_scaled_reward": 0.015029110945761204, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2730.3750915527344, | |
| "epoch": 0.176, | |
| "grad_norm": 0.04663749411702156, | |
| "kl": 0.007442474365234375, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.017, | |
| "reward": 0.17066853493452072, | |
| "reward_std": 0.25703447312116623, | |
| "rewards/cosine_scaled_reward": 0.11062700673937798, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 3020.5209350585938, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.04400014132261276, | |
| "kl": 0.008758544921875, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0208, | |
| "reward": 0.026316032744944096, | |
| "reward_std": 0.19365801848471165, | |
| "rewards/cosine_scaled_reward": -0.13478738628327847, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2578.3958740234375, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.021931642666459084, | |
| "kl": 0.009639739990234375, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.009, | |
| "reward": 0.05528542585670948, | |
| "reward_std": 0.13396807853132486, | |
| "rewards/cosine_scaled_reward": -0.13394572399556637, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2065.2084045410156, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.03533341363072395, | |
| "kl": 0.0070343017578125, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.002, | |
| "reward": 0.26897287741303444, | |
| "reward_std": 0.23403197899460793, | |
| "rewards/cosine_scaled_reward": 0.1861739861778915, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2713.5416870117188, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.02935076504945755, | |
| "kl": 0.01041412353515625, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0129, | |
| "reward": 0.043941982788965106, | |
| "reward_std": 0.1749372985213995, | |
| "rewards/cosine_scaled_reward": -0.18872354179620743, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 1839.1876220703125, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.01745816320180893, | |
| "kl": 0.006927490234375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0084, | |
| "reward": 0.24551017116755247, | |
| "reward_std": 0.15837855637073517, | |
| "rewards/cosine_scaled_reward": 0.06777001172304153, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2165.9584045410156, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.042395975440740585, | |
| "kl": 0.00711822509765625, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0074, | |
| "reward": 0.24138486292213202, | |
| "reward_std": 0.2575872726738453, | |
| "rewards/cosine_scaled_reward": 0.10766421753214672, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2732.5834350585938, | |
| "epoch": 0.184, | |
| "grad_norm": 0.05092175304889679, | |
| "kl": 0.00798797607421875, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.028, | |
| "reward": 0.046656528022140265, | |
| "reward_std": 0.21346447244286537, | |
| "rewards/cosine_scaled_reward": -0.13846495002508163, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 2430.3125, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.04415441304445267, | |
| "kl": 0.008209228515625, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0252, | |
| "reward": 0.0746684002224356, | |
| "reward_std": 0.2177988886833191, | |
| "rewards/cosine_scaled_reward": -0.13921628706157207, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2038.6250610351562, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.028816239908337593, | |
| "kl": 0.009674072265625, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0086, | |
| "reward": 0.12819508381653577, | |
| "reward_std": 0.18912328407168388, | |
| "rewards/cosine_scaled_reward": -0.05834662541747093, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2344.666717529297, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.08988065272569656, | |
| "kl": 0.0113677978515625, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.025, | |
| "reward": 0.10471067111939192, | |
| "reward_std": 0.21184637024998665, | |
| "rewards/cosine_scaled_reward": -0.03762231674045324, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2217.916748046875, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.04993058741092682, | |
| "kl": 0.00753021240234375, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0086, | |
| "reward": 0.1478974660858512, | |
| "reward_std": 0.2265885230153799, | |
| "rewards/cosine_scaled_reward": -0.029885948926676065, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 1693.2708587646484, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.02690989524126053, | |
| "kl": 0.006378173828125, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0197, | |
| "reward": 0.12319543864578009, | |
| "reward_std": 0.16218674182891846, | |
| "rewards/cosine_scaled_reward": -0.12016977928578854, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2558.1041870117188, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.03383956104516983, | |
| "kl": 0.0082550048828125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.001, | |
| "reward": 0.06310033053159714, | |
| "reward_std": 0.21703927963972092, | |
| "rewards/cosine_scaled_reward": -0.14907445572316647, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 1769.8959045410156, | |
| "epoch": 0.192, | |
| "grad_norm": 0.1372595876455307, | |
| "kl": 0.01055145263671875, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0043, | |
| "reward": 0.18378751166164875, | |
| "reward_std": 0.19114972278475761, | |
| "rewards/cosine_scaled_reward": -0.08523260988295078, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2148.8959350585938, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.03248792514204979, | |
| "kl": 0.00789642333984375, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.004, | |
| "reward": 0.24832079000771046, | |
| "reward_std": 0.22351360693573952, | |
| "rewards/cosine_scaled_reward": 0.13557656295597553, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2277.1875610351562, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.026655089110136032, | |
| "kl": 0.00982666015625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0091, | |
| "reward": 0.0747418599203229, | |
| "reward_std": 0.15386026352643967, | |
| "rewards/cosine_scaled_reward": -0.12871773913502693, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 1648.1250457763672, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.045482926070690155, | |
| "kl": 0.00757598876953125, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0187, | |
| "reward": 0.16472867783159018, | |
| "reward_std": 0.21136175841093063, | |
| "rewards/cosine_scaled_reward": -0.06283853575587273, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2006.0833740234375, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.04429690167307854, | |
| "kl": 0.012279510498046875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0215, | |
| "reward": 0.12829256599070504, | |
| "reward_std": 0.21427210047841072, | |
| "rewards/cosine_scaled_reward": -0.07882313127629459, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2454.1041870117188, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.043881192803382874, | |
| "kl": 0.01338958740234375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0201, | |
| "reward": 0.16097562294453382, | |
| "reward_std": 0.23505596444010735, | |
| "rewards/cosine_scaled_reward": 0.046693362295627594, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2534.0209350585938, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.019798004999756813, | |
| "kl": 0.01529693603515625, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0028, | |
| "reward": 0.008384472224861383, | |
| "reward_std": 0.10880408622324467, | |
| "rewards/cosine_scaled_reward": -0.20048344880342484, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2614.041717529297, | |
| "epoch": 0.2, | |
| "grad_norm": 0.04128558933734894, | |
| "kl": 0.02044677734375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0092, | |
| "reward": 0.2010949682444334, | |
| "reward_std": 0.20921537093818188, | |
| "rewards/cosine_scaled_reward": 0.1154983602464199, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1933.1250305175781, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.03816467523574829, | |
| "kl": 0.0102081298828125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0121, | |
| "reward": 0.19207825139164925, | |
| "reward_std": 0.24206193909049034, | |
| "rewards/cosine_scaled_reward": 0.0065815625712275505, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 1727.2708587646484, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.018576249480247498, | |
| "kl": 0.0099945068359375, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0088, | |
| "reward": 0.1630729604512453, | |
| "reward_std": 0.18059775605797768, | |
| "rewards/cosine_scaled_reward": -0.053607773035764694, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 1683.8959045410156, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.022960882633924484, | |
| "kl": 0.01275634765625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0076, | |
| "reward": 0.22537659853696823, | |
| "reward_std": 0.13952878676354885, | |
| "rewards/cosine_scaled_reward": 0.06942565552890301, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2164.8334045410156, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.03156055137515068, | |
| "kl": 0.0158538818359375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": -0.0013, | |
| "reward": 0.11537174182012677, | |
| "reward_std": 0.14489713683724403, | |
| "rewards/cosine_scaled_reward": -0.07945736683905125, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2393.729248046875, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.04348559305071831, | |
| "kl": 0.01876068115234375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0102, | |
| "reward": 0.17879832163453102, | |
| "reward_std": 0.2527993433177471, | |
| "rewards/cosine_scaled_reward": -0.014543931931257248, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2375.1251220703125, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.02833203598856926, | |
| "kl": 0.0088958740234375, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0162, | |
| "reward": 0.11990400403738022, | |
| "reward_std": 0.20802215114235878, | |
| "rewards/cosine_scaled_reward": -0.08764380216598511, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2159.3541870117188, | |
| "epoch": 0.208, | |
| "grad_norm": 0.0430837906897068, | |
| "kl": 0.01611328125, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.028, | |
| "reward": 0.11969278380274773, | |
| "reward_std": 0.18250374868512154, | |
| "rewards/cosine_scaled_reward": -0.07830735668540001, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 2478.2709045410156, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.03559846431016922, | |
| "kl": 0.016357421875, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0079, | |
| "reward": 0.0919726500287652, | |
| "reward_std": 0.17250213399529457, | |
| "rewards/cosine_scaled_reward": -0.1711025983095169, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 1863.6458435058594, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.058719128370285034, | |
| "kl": 0.010097503662109375, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0139, | |
| "reward": 0.3048729207366705, | |
| "reward_std": 0.2621423527598381, | |
| "rewards/cosine_scaled_reward": 0.2180614322423935, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2742.9584350585938, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.0417308583855629, | |
| "kl": 0.0185394287109375, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0162, | |
| "reward": 0.0587873995937116, | |
| "reward_std": 0.14735205844044685, | |
| "rewards/cosine_scaled_reward": -0.104416124522686, | |
| "rewards/format_reward": 0.4375, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2691.3541870117188, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.029037240892648697, | |
| "kl": 0.014801025390625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0102, | |
| "reward": 0.17423243075609207, | |
| "reward_std": 0.19241954199969769, | |
| "rewards/cosine_scaled_reward": 0.05360138416290283, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2572.9375915527344, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.04996544122695923, | |
| "kl": 0.0216522216796875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0219, | |
| "reward": 0.21490501356311142, | |
| "reward_std": 0.18972796946763992, | |
| "rewards/cosine_scaled_reward": 0.12983741983771324, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 1724.5833740234375, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.06050784885883331, | |
| "kl": 0.01337432861328125, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0229, | |
| "reward": 0.2116590766236186, | |
| "reward_std": 0.28142325207591057, | |
| "rewards/cosine_scaled_reward": 0.05214736983180046, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2055.2500610351562, | |
| "epoch": 0.216, | |
| "grad_norm": 0.1007910668849945, | |
| "kl": 0.02374267578125, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0355, | |
| "reward": 0.0947279091924429, | |
| "reward_std": 0.21198182925581932, | |
| "rewards/cosine_scaled_reward": -0.16024301457218826, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2751.854217529297, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.0479561872780323, | |
| "kl": 0.02215576171875, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0141, | |
| "reward": 0.057090925984084606, | |
| "reward_std": 0.24028980359435081, | |
| "rewards/cosine_scaled_reward": -0.08829662017524242, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2191.604248046875, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.05289386212825775, | |
| "kl": 0.0192413330078125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0168, | |
| "reward": 0.11481184232980013, | |
| "reward_std": 0.23359360545873642, | |
| "rewards/cosine_scaled_reward": -0.03005097433924675, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2692.4584350585938, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.041296400129795074, | |
| "kl": 0.027435302734375, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0113, | |
| "reward": 0.01905603031627834, | |
| "reward_std": 0.14613191038370132, | |
| "rewards/cosine_scaled_reward": -0.19530250132083893, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2397.3959350585938, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.04257694631814957, | |
| "kl": 0.031463623046875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0172, | |
| "reward": 0.1044243611395359, | |
| "reward_std": 0.21159589290618896, | |
| "rewards/cosine_scaled_reward": -0.12644829344935715, | |
| "rewards/format_reward": 0.6458333656191826, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1164.8750305175781, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.026060882955789566, | |
| "kl": 0.01049041748046875, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0009, | |
| "reward": 0.3720349445939064, | |
| "reward_std": 0.1574680171906948, | |
| "rewards/cosine_scaled_reward": 0.2316273289034143, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1602.8958892822266, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.10243961960077286, | |
| "kl": 0.0208282470703125, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0278, | |
| "reward": 0.18980997614562511, | |
| "reward_std": 0.20258211717009544, | |
| "rewards/cosine_scaled_reward": 0.03090360015630722, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2781.0833740234375, | |
| "epoch": 0.224, | |
| "grad_norm": 0.03952959179878235, | |
| "kl": 0.026519775390625, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0114, | |
| "reward": 0.020335078239440918, | |
| "reward_std": 0.1450938880443573, | |
| "rewards/cosine_scaled_reward": -0.12913516722619534, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2601.1666870117188, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.037638694047927856, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0153, | |
| "reward": -0.03209148999303579, | |
| "reward_std": 0.15572769939899445, | |
| "rewards/cosine_scaled_reward": -0.2395016998052597, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2882.2084350585938, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.05307415872812271, | |
| "kl": 0.0274658203125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0121, | |
| "reward": 0.07938438219389354, | |
| "reward_std": 0.23124578595161438, | |
| "rewards/cosine_scaled_reward": -0.0630751991411671, | |
| "rewards/format_reward": 0.4375000186264515, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2432.6251220703125, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.042278122156858444, | |
| "kl": 0.029205322265625, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0084, | |
| "reward": 0.2322162203490734, | |
| "reward_std": 0.23782150447368622, | |
| "rewards/cosine_scaled_reward": 0.17751576751470566, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1937.5000457763672, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.045655835419893265, | |
| "kl": 0.021839141845703125, | |
| "learning_rate": 7.75e-07, | |
| "loss": -0.0039, | |
| "reward": 0.18519231583923101, | |
| "reward_std": 0.15890146791934967, | |
| "rewards/cosine_scaled_reward": 0.06046340987086296, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2087.979248046875, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.054094862192869186, | |
| "kl": 0.027435302734375, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": -0.001, | |
| "reward": 0.06794596090912819, | |
| "reward_std": 0.15832657739520073, | |
| "rewards/cosine_scaled_reward": -0.13955808524042368, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1476.7708587646484, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.05633439123630524, | |
| "kl": 0.01910400390625, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0179, | |
| "reward": 0.18051820993423462, | |
| "reward_std": 0.2778208777308464, | |
| "rewards/cosine_scaled_reward": -0.025857534259557724, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 2240.604217529297, | |
| "epoch": 0.232, | |
| "grad_norm": 0.07334030419588089, | |
| "kl": 0.034423828125, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0164, | |
| "reward": 0.22217239439487457, | |
| "reward_std": 0.20295510441064835, | |
| "rewards/cosine_scaled_reward": 0.08585923444479704, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1666.5000762939453, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.08228877186775208, | |
| "kl": 0.0199737548828125, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0299, | |
| "reward": 0.21982263028621674, | |
| "reward_std": 0.21554231271147728, | |
| "rewards/cosine_scaled_reward": 0.13159801624715328, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2317.541748046875, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.1319306343793869, | |
| "kl": 0.029937744140625, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0392, | |
| "reward": 0.18830347340554, | |
| "reward_std": 0.31425701081752777, | |
| "rewards/cosine_scaled_reward": 0.11326558981090784, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2953.9375610351562, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.07008088380098343, | |
| "kl": 0.0404052734375, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0203, | |
| "reward": 0.07447466999292374, | |
| "reward_std": 0.22944918274879456, | |
| "rewards/cosine_scaled_reward": -0.04474049434065819, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 2368.729248046875, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.04053330048918724, | |
| "kl": 0.02538299560546875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0108, | |
| "reward": 0.17779894266277552, | |
| "reward_std": 0.23435930907726288, | |
| "rewards/cosine_scaled_reward": 0.022668661549687386, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2806.5833740234375, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.07727314531803131, | |
| "kl": 0.0537109375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0016, | |
| "reward": -0.07132546463981271, | |
| "reward_std": 0.09164782986044884, | |
| "rewards/cosine_scaled_reward": -0.24160564318299294, | |
| "rewards/format_reward": 0.2083333432674408, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1692.8125610351562, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.031584516167640686, | |
| "kl": 0.037841796875, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0021, | |
| "reward": 0.08228785474784672, | |
| "reward_std": 0.17406276986002922, | |
| "rewards/cosine_scaled_reward": -0.0839727008715272, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1505.5833587646484, | |
| "epoch": 0.24, | |
| "grad_norm": 0.1278965026140213, | |
| "kl": 0.0312042236328125, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0239, | |
| "reward": 0.2997470572590828, | |
| "reward_std": 0.17756814323365688, | |
| "rewards/cosine_scaled_reward": 0.20885192044079304, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2646.0209350585938, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.0754714235663414, | |
| "kl": 0.04766845703125, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": -0.003, | |
| "reward": -0.029848297126591206, | |
| "reward_std": 0.16529761254787445, | |
| "rewards/cosine_scaled_reward": -0.22342086024582386, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 2258.3958892822266, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.060578618198633194, | |
| "kl": 0.05767822265625, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0015, | |
| "reward": 0.1782614178955555, | |
| "reward_std": 0.20231148600578308, | |
| "rewards/cosine_scaled_reward": 0.03856219723820686, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 2688.104217529297, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.09611926227807999, | |
| "kl": 0.046875, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0065, | |
| "reward": 0.01836175099015236, | |
| "reward_std": 0.13345054909586906, | |
| "rewards/cosine_scaled_reward": -0.1522515956312418, | |
| "rewards/format_reward": 0.375, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2466.3125610351562, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.052475061267614365, | |
| "kl": 0.03021240234375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0083, | |
| "reward": 0.12450084672309458, | |
| "reward_std": 0.2435830906033516, | |
| "rewards/cosine_scaled_reward": 0.006652882322669029, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 2190.5626220703125, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.07757747173309326, | |
| "kl": 0.0418701171875, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": -0.0015, | |
| "reward": 0.17446245718747377, | |
| "reward_std": 0.2479591779410839, | |
| "rewards/cosine_scaled_reward": 0.057695697993040085, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 2751.0625610351562, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.08603978157043457, | |
| "kl": 0.0460205078125, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0215, | |
| "reward": 0.0640587080270052, | |
| "reward_std": 0.23771962150931358, | |
| "rewards/cosine_scaled_reward": -0.0884516267105937, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 2253.0208740234375, | |
| "epoch": 0.248, | |
| "grad_norm": 0.06187213584780693, | |
| "kl": 0.04681396484375, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": -0.0022, | |
| "reward": 0.1055583581328392, | |
| "reward_std": 0.18638527393341064, | |
| "rewards/cosine_scaled_reward": -0.04735219478607178, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2078.7083740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.04890087991952896, | |
| "kl": 0.03765869140625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0155, | |
| "reward": 0.08832587394863367, | |
| "reward_std": 0.18256075587123632, | |
| "rewards/cosine_scaled_reward": -0.01882084831595421, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1733.8125305175781, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.037329934537410736, | |
| "kl": 0.035614013671875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0119, | |
| "reward": 0.1338009461760521, | |
| "reward_std": 0.1712276004254818, | |
| "rewards/cosine_scaled_reward": -0.014061834663152695, | |
| "rewards/format_reward": 0.5208333544433117, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1988.854248046875, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.05641601234674454, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0038, | |
| "reward": 0.1055915541946888, | |
| "reward_std": 0.1940936055034399, | |
| "rewards/cosine_scaled_reward": -0.03501408232841641, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2915.812530517578, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.04835179075598717, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0108, | |
| "reward": 0.04782783659175038, | |
| "reward_std": 0.2072219979017973, | |
| "rewards/cosine_scaled_reward": -0.05986208841204643, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 2822.3750610351562, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.04979720711708069, | |
| "kl": 0.0433349609375, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0168, | |
| "reward": 0.0005729561671614647, | |
| "reward_std": 0.12532713450491428, | |
| "rewards/cosine_scaled_reward": -0.22823826782405376, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 2277.4376068115234, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.07067270576953888, | |
| "kl": 0.0441741943359375, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.011, | |
| "reward": 0.13097468297928572, | |
| "reward_std": 0.25622726790606976, | |
| "rewards/cosine_scaled_reward": -0.08305801264941692, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2630.5625610351562, | |
| "epoch": 0.256, | |
| "grad_norm": 0.06098279356956482, | |
| "kl": 0.05035400390625, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0103, | |
| "reward": 0.14421742223203182, | |
| "reward_std": 0.20859143882989883, | |
| "rewards/cosine_scaled_reward": 0.06943614780902863, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 1573.1458892822266, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.08091147989034653, | |
| "kl": 0.02947998046875, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0101, | |
| "reward": 0.30303100124001503, | |
| "reward_std": 0.26837362349033356, | |
| "rewards/cosine_scaled_reward": 0.25287530571222305, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2329.8334350585938, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.1142406165599823, | |
| "kl": 0.05145263671875, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0094, | |
| "reward": 0.024633352644741535, | |
| "reward_std": 0.15207960829138756, | |
| "rewards/cosine_scaled_reward": -0.19333514384925365, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1921.1250457763672, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.07364730536937714, | |
| "kl": 0.044921875, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0127, | |
| "reward": 0.07605094433529302, | |
| "reward_std": 0.19391180202364922, | |
| "rewards/cosine_scaled_reward": -0.10878950078040361, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 2333.4376220703125, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.06939440965652466, | |
| "kl": 0.0438232421875, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0129, | |
| "reward": 0.06750311236828566, | |
| "reward_std": 0.18399767391383648, | |
| "rewards/cosine_scaled_reward": -0.1196965891867876, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 2393.6251220703125, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.07279952615499496, | |
| "kl": 0.0634765625, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0102, | |
| "reward": 0.055809149984270334, | |
| "reward_std": 0.2374669834971428, | |
| "rewards/cosine_scaled_reward": -0.12200246378779411, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 2134.8333892822266, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.09543006867170334, | |
| "kl": 0.04742431640625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0208, | |
| "reward": 0.1400698386132717, | |
| "reward_std": 0.23821917176246643, | |
| "rewards/cosine_scaled_reward": 0.059078078251332045, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2927.3334350585938, | |
| "epoch": 0.264, | |
| "grad_norm": 0.06220847740769386, | |
| "kl": 0.05975341796875, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0123, | |
| "reward": 0.09411982633173466, | |
| "reward_std": 0.19875338301062584, | |
| "rewards/cosine_scaled_reward": -0.04482337925583124, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 2269.5208740234375, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.09923657774925232, | |
| "kl": 0.05657958984375, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": -0.0013, | |
| "reward": 0.02260537422262132, | |
| "reward_std": 0.17237377539277077, | |
| "rewards/cosine_scaled_reward": -0.1425788849592209, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2754.9584045410156, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.05900200456380844, | |
| "kl": 0.05712890625, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0139, | |
| "reward": 0.15356310084462166, | |
| "reward_std": 0.2466350980103016, | |
| "rewards/cosine_scaled_reward": 0.03038211865350604, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2702.8125610351562, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.142472505569458, | |
| "kl": 0.056396484375, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0186, | |
| "reward": 0.1253685262054205, | |
| "reward_std": 0.21780013293027878, | |
| "rewards/cosine_scaled_reward": -0.007253678515553474, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1911.2709197998047, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.08689363300800323, | |
| "kl": 0.0460205078125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": -0.0011, | |
| "reward": 0.15196546725928783, | |
| "reward_std": 0.1561927441507578, | |
| "rewards/cosine_scaled_reward": -0.00399697944521904, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2878.4375610351562, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.08461257815361023, | |
| "kl": 0.08282470703125, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0105, | |
| "reward": -0.00937534449622035, | |
| "reward_std": 0.17156489565968513, | |
| "rewards/cosine_scaled_reward": -0.1730695739388466, | |
| "rewards/format_reward": 0.3125, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1936.8959197998047, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.0630769282579422, | |
| "kl": 0.04522705078125, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0055, | |
| "reward": 0.09487669391091913, | |
| "reward_std": 0.19474418088793755, | |
| "rewards/cosine_scaled_reward": -0.07726177107542753, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 2472.4375915527344, | |
| "epoch": 0.272, | |
| "grad_norm": 0.06509706377983093, | |
| "kl": 0.074462890625, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.012, | |
| "reward": 0.07986411359161139, | |
| "reward_std": 0.22248198464512825, | |
| "rewards/cosine_scaled_reward": -0.09880928695201874, | |
| "rewards/format_reward": 0.5000000204890966, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 2941.6250610351562, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.07112760096788406, | |
| "kl": 0.0775146484375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0158, | |
| "reward": -0.018236166331917048, | |
| "reward_std": 0.184496458619833, | |
| "rewards/cosine_scaled_reward": -0.15759910643100739, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2500.6458435058594, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.08548548817634583, | |
| "kl": 0.07989501953125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0183, | |
| "reward": 0.20236056856811047, | |
| "reward_std": 0.2052462175488472, | |
| "rewards/cosine_scaled_reward": 0.09307753993198276, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 2152.0625610351562, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.08038657158613205, | |
| "kl": 0.068572998046875, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.01, | |
| "reward": 0.11309333913959563, | |
| "reward_std": 0.18571311049163342, | |
| "rewards/cosine_scaled_reward": -0.015796410385519266, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 2310.0001220703125, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.09080269187688828, | |
| "kl": 0.06890869140625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0043, | |
| "reward": 0.07082781102508307, | |
| "reward_std": 0.20456375181674957, | |
| "rewards/cosine_scaled_reward": -0.12692169286310673, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1948.7291870117188, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.06289907544851303, | |
| "kl": 0.0518798828125, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0089, | |
| "reward": 0.1241726630833, | |
| "reward_std": 0.17663927376270294, | |
| "rewards/cosine_scaled_reward": -0.10734192654490471, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 2412.0000610351562, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.11397552490234375, | |
| "kl": 0.0631103515625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0222, | |
| "reward": 0.13477333387709223, | |
| "reward_std": 0.26352939009666443, | |
| "rewards/cosine_scaled_reward": 0.03421788662672043, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 2034.3959045410156, | |
| "epoch": 0.28, | |
| "grad_norm": 0.08152657002210617, | |
| "kl": 0.0673980712890625, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0116, | |
| "reward": 0.10570267424918711, | |
| "reward_std": 0.16765698045492172, | |
| "rewards/cosine_scaled_reward": -0.09992778301239014, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1894.1667175292969, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.19043473899364471, | |
| "kl": 0.0638427734375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0245, | |
| "reward": 0.178838558960706, | |
| "reward_std": 0.2002551555633545, | |
| "rewards/cosine_scaled_reward": 0.01489507406949997, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 1726.0833435058594, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.10450342297554016, | |
| "kl": 0.068115234375, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": -0.0053, | |
| "reward": 0.2457539178431034, | |
| "reward_std": 0.19251924008131027, | |
| "rewards/cosine_scaled_reward": 0.11216052249073982, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 2217.916748046875, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.054874151945114136, | |
| "kl": 0.07550048828125, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0152, | |
| "reward": 0.14268504013307393, | |
| "reward_std": 0.19669616222381592, | |
| "rewards/cosine_scaled_reward": -0.014341482892632484, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 2881.104248046875, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.08497622609138489, | |
| "kl": 0.091796875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.008, | |
| "reward": 0.03709301119670272, | |
| "reward_std": 0.259086437523365, | |
| "rewards/cosine_scaled_reward": -0.1157736461609602, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2183.0000610351562, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.09207236766815186, | |
| "kl": 0.090087890625, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0152, | |
| "reward": 0.21901344694197178, | |
| "reward_std": 0.28776915371418, | |
| "rewards/cosine_scaled_reward": 0.09477716172114015, | |
| "rewards/format_reward": 0.6458333544433117, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 3025.3959350585938, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.1507013738155365, | |
| "kl": 0.119384765625, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0195, | |
| "reward": 0.13459591940045357, | |
| "reward_std": 0.3239840231835842, | |
| "rewards/cosine_scaled_reward": 0.09318311512470245, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2491.8959350585938, | |
| "epoch": 0.288, | |
| "grad_norm": 0.08385124802589417, | |
| "kl": 0.108642578125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0249, | |
| "reward": 0.07829362154006958, | |
| "reward_std": 0.15910264290869236, | |
| "rewards/cosine_scaled_reward": -0.1019208341022022, | |
| "rewards/format_reward": 0.5000000204890966, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2506.3334350585938, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.09299149364233017, | |
| "kl": 0.1217041015625, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0212, | |
| "reward": 0.04096110351383686, | |
| "reward_std": 0.14594158343970776, | |
| "rewards/cosine_scaled_reward": -0.1423955336213112, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1781.5834045410156, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.1694076508283615, | |
| "kl": 0.07427978515625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0104, | |
| "reward": 0.22348729986697435, | |
| "reward_std": 0.2563650719821453, | |
| "rewards/cosine_scaled_reward": 0.09331563860177994, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2293.1250610351562, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.09922631084918976, | |
| "kl": 0.0946044921875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0086, | |
| "reward": 0.1782954093068838, | |
| "reward_std": 0.1498996578156948, | |
| "rewards/cosine_scaled_reward": 0.10829710960388184, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1945.0000610351562, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.12147188186645508, | |
| "kl": 0.0772705078125, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0257, | |
| "reward": 0.1625408570689615, | |
| "reward_std": 0.19596706703305244, | |
| "rewards/cosine_scaled_reward": -0.04615425318479538, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 2569.979248046875, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.05752284452319145, | |
| "kl": 0.128173828125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0199, | |
| "reward": 0.007399971596896648, | |
| "reward_std": 0.15406970493495464, | |
| "rewards/cosine_scaled_reward": -0.25781896710395813, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 2116.541748046875, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.08095219731330872, | |
| "kl": 0.06207275390625, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0097, | |
| "reward": 0.1491424711421132, | |
| "reward_std": 0.2224278338253498, | |
| "rewards/cosine_scaled_reward": -0.028103190823458135, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1488.937515258789, | |
| "epoch": 0.296, | |
| "grad_norm": 0.08785142749547958, | |
| "kl": 0.07330322265625, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0166, | |
| "reward": 0.10606439970433712, | |
| "reward_std": 0.1880865916609764, | |
| "rewards/cosine_scaled_reward": -0.17137941345572472, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 2588.3958740234375, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.16577839851379395, | |
| "kl": 0.11627197265625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0032, | |
| "reward": -0.03611325612291694, | |
| "reward_std": 0.17280695401132107, | |
| "rewards/cosine_scaled_reward": -0.21710992977023125, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2016.0625610351562, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.1131332591176033, | |
| "kl": 0.110107421875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.009, | |
| "reward": 0.09557801228947937, | |
| "reward_std": 0.16974378004670143, | |
| "rewards/cosine_scaled_reward": -0.16340252819645684, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 2428.6250610351562, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.10569147765636444, | |
| "kl": 0.12786865234375, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.01, | |
| "reward": 0.0037733479402959347, | |
| "reward_std": 0.17411128245294094, | |
| "rewards/cosine_scaled_reward": -0.21108321473002434, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2327.375, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.1667339950799942, | |
| "kl": 0.149658203125, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0127, | |
| "reward": 0.03647384233772755, | |
| "reward_std": 0.16158639267086983, | |
| "rewards/cosine_scaled_reward": -0.215880973264575, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2331.0416870117188, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.10305790603160858, | |
| "kl": 0.112060546875, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0215, | |
| "reward": 0.0681858491152525, | |
| "reward_std": 0.23381152004003525, | |
| "rewards/cosine_scaled_reward": -0.13254176545888186, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 2921.4375610351562, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.1127183809876442, | |
| "kl": 0.115966796875, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0131, | |
| "reward": 0.021314379759132862, | |
| "reward_std": 0.1555755902081728, | |
| "rewards/cosine_scaled_reward": -0.15679632499814034, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 2978.5834350585938, | |
| "epoch": 0.304, | |
| "grad_norm": 0.1250232309103012, | |
| "kl": 0.1484375, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0172, | |
| "reward": -0.050182152073830366, | |
| "reward_std": 0.1501956842839718, | |
| "rewards/cosine_scaled_reward": -0.20140841230750084, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2326.979217529297, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.13584452867507935, | |
| "kl": 0.080810546875, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.001, | |
| "reward": 0.10708324424922466, | |
| "reward_std": 0.16897061094641685, | |
| "rewards/cosine_scaled_reward": -0.06354878284037113, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2667.2709045410156, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.11531220376491547, | |
| "kl": 0.092041015625, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0192, | |
| "reward": 0.07101344061084092, | |
| "reward_std": 0.2250448651611805, | |
| "rewards/cosine_scaled_reward": -0.11485025100409985, | |
| "rewards/format_reward": 0.500000013038516, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 2202.0625915527344, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.07493577152490616, | |
| "kl": 0.111328125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0192, | |
| "reward": 0.08426341554149985, | |
| "reward_std": 0.17346534691751003, | |
| "rewards/cosine_scaled_reward": -0.1418045349419117, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1065.2708587646484, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.05601108819246292, | |
| "kl": 0.031768798828125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0029, | |
| "reward": 0.21699802950024605, | |
| "reward_std": 0.21899626031517982, | |
| "rewards/cosine_scaled_reward": -0.06081180274486542, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 2505.1459045410156, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.1140054240822792, | |
| "kl": 0.07342529296875, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0076, | |
| "reward": 0.14475935138761997, | |
| "reward_std": 0.21346264705061913, | |
| "rewards/cosine_scaled_reward": 0.04791983589529991, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 3327.1875610351562, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.13161110877990723, | |
| "kl": 0.11572265625, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0067, | |
| "reward": -0.08763871155679226, | |
| "reward_std": 0.15068581700325012, | |
| "rewards/cosine_scaled_reward": -0.21180569753050804, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 3108.5208740234375, | |
| "epoch": 0.312, | |
| "grad_norm": 0.07448291778564453, | |
| "kl": 0.1080322265625, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.018, | |
| "reward": -0.023340489715337753, | |
| "reward_std": 0.22315971367061138, | |
| "rewards/cosine_scaled_reward": -0.21390293538570404, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 2636.916748046875, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.06865213811397552, | |
| "kl": 0.07769775390625, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0098, | |
| "reward": 0.10937354527413845, | |
| "reward_std": 0.13424846902489662, | |
| "rewards/cosine_scaled_reward": -0.01630665734410286, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2996.416748046875, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.07241214066743851, | |
| "kl": 0.1007080078125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0203, | |
| "reward": 0.04834304626274388, | |
| "reward_std": 0.22332904487848282, | |
| "rewards/cosine_scaled_reward": -0.05854946281760931, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2250.041748046875, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.08157075196504593, | |
| "kl": 0.07476806640625, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0015, | |
| "reward": 0.16906505031511188, | |
| "reward_std": 0.16975216940045357, | |
| "rewards/cosine_scaled_reward": -0.0014951229095458984, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2464.3958435058594, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.12354958057403564, | |
| "kl": 0.08636474609375, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0273, | |
| "reward": 0.07450119964778423, | |
| "reward_std": 0.22049247100949287, | |
| "rewards/cosine_scaled_reward": -0.12796138040721416, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 2643.666717529297, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.08799438923597336, | |
| "kl": 0.06854248046875, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0025, | |
| "reward": 0.08634543046355247, | |
| "reward_std": 0.20207853987812996, | |
| "rewards/cosine_scaled_reward": -0.07213485613465309, | |
| "rewards/format_reward": 0.47916669212281704, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 2666.3959045410156, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.07479758560657501, | |
| "kl": 0.0810546875, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0134, | |
| "reward": 0.03228483721613884, | |
| "reward_std": 0.234229177236557, | |
| "rewards/cosine_scaled_reward": -0.14610924664884806, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2667.0000915527344, | |
| "epoch": 0.32, | |
| "grad_norm": 0.07344148308038712, | |
| "kl": 0.07598876953125, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0099, | |
| "reward": 0.156514388974756, | |
| "reward_std": 0.2009916752576828, | |
| "rewards/cosine_scaled_reward": -0.02287590131163597, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2200.5416870117188, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.07045651227235794, | |
| "kl": 0.0682373046875, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0167, | |
| "reward": 0.16042753960937262, | |
| "reward_std": 0.16341171227395535, | |
| "rewards/cosine_scaled_reward": 0.004844233393669128, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2644.0625915527344, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.13839997351169586, | |
| "kl": 0.064208984375, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0228, | |
| "reward": 0.07122778613120317, | |
| "reward_std": 0.2150486335158348, | |
| "rewards/cosine_scaled_reward": -0.10076148808002472, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2083.3125610351562, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.06165618449449539, | |
| "kl": 0.07025146484375, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0058, | |
| "reward": 0.0954576376825571, | |
| "reward_std": 0.14573124423623085, | |
| "rewards/cosine_scaled_reward": -0.135349091142416, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2546.5000610351562, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.11243577301502228, | |
| "kl": 0.0726318359375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": -0.0007, | |
| "reward": 0.023500393144786358, | |
| "reward_std": 0.14628888107836246, | |
| "rewards/cosine_scaled_reward": -0.1865788884460926, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 3013.8958740234375, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.10980677604675293, | |
| "kl": 0.0928955078125, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0068, | |
| "reward": -0.011876898352056742, | |
| "reward_std": 0.11095650680363178, | |
| "rewards/cosine_scaled_reward": -0.17136260541155934, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2541.0834350585938, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.16674117743968964, | |
| "kl": 0.0716552734375, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0297, | |
| "reward": 0.17524816654622555, | |
| "reward_std": 0.32492974400520325, | |
| "rewards/cosine_scaled_reward": 0.03504812903702259, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 2969.9375, | |
| "epoch": 0.328, | |
| "grad_norm": 0.08299952745437622, | |
| "kl": 0.088623046875, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0053, | |
| "reward": 0.02080897823907435, | |
| "reward_std": 0.15813075937330723, | |
| "rewards/cosine_scaled_reward": -0.11745025217533112, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 2234.8750915527344, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.06079524755477905, | |
| "kl": 0.0732421875, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.012, | |
| "reward": 0.1391735766083002, | |
| "reward_std": 0.2059701792895794, | |
| "rewards/cosine_scaled_reward": -0.06576585536822677, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2588.2916870117188, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.12241410464048386, | |
| "kl": 0.0830078125, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0238, | |
| "reward": 0.05997318652225658, | |
| "reward_std": 0.19843794405460358, | |
| "rewards/cosine_scaled_reward": -0.10321231558918953, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1879.5209045410156, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.1251208782196045, | |
| "kl": 0.05615234375, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.014, | |
| "reward": 0.2796472981572151, | |
| "reward_std": 0.26721834763884544, | |
| "rewards/cosine_scaled_reward": 0.18595721386373043, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1793.8333740234375, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.05456700548529625, | |
| "kl": 0.05328369140625, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0147, | |
| "reward": 0.10706852684961632, | |
| "reward_std": 0.18388917297124863, | |
| "rewards/cosine_scaled_reward": -0.10759443510323763, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 2827.5209045410156, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.1125926524400711, | |
| "kl": 0.09375, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0008, | |
| "reward": 0.062235129065811634, | |
| "reward_std": 0.1880720667541027, | |
| "rewards/cosine_scaled_reward": -0.11061838595196605, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2978.729248046875, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.054279159754514694, | |
| "kl": 0.1051025390625, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0164, | |
| "reward": 0.025744864717125893, | |
| "reward_std": 0.1544373817741871, | |
| "rewards/cosine_scaled_reward": -0.1682923138141632, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 2108.791748046875, | |
| "epoch": 0.336, | |
| "grad_norm": 0.08422739058732986, | |
| "kl": 0.04925537109375, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0156, | |
| "reward": 0.24648608081042767, | |
| "reward_std": 0.2907153554260731, | |
| "rewards/cosine_scaled_reward": 0.11026226915419102, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2940.479248046875, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.04961990937590599, | |
| "kl": 0.06982421875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0095, | |
| "reward": 0.04029355011880398, | |
| "reward_std": 0.15207207575440407, | |
| "rewards/cosine_scaled_reward": -0.080223947763443, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 2132.0000610351562, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.2023567259311676, | |
| "kl": 0.0758056640625, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0329, | |
| "reward": 0.2100120633840561, | |
| "reward_std": 0.24636650830507278, | |
| "rewards/cosine_scaled_reward": 0.06347820605151355, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2687.479248046875, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.086825892329216, | |
| "kl": 0.0836181640625, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.011, | |
| "reward": -0.017197752371430397, | |
| "reward_std": 0.14185519330203533, | |
| "rewards/cosine_scaled_reward": -0.23157930374145508, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2682.8959350585938, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.05833464860916138, | |
| "kl": 0.096435546875, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0181, | |
| "reward": 0.044168648310005665, | |
| "reward_std": 0.1994057223200798, | |
| "rewards/cosine_scaled_reward": -0.1331064086407423, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 2765.2083740234375, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.08351943641901016, | |
| "kl": 0.0958251953125, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0122, | |
| "reward": 0.03238721820525825, | |
| "reward_std": 0.18395740166306496, | |
| "rewards/cosine_scaled_reward": -0.1890297271311283, | |
| "rewards/format_reward": 0.5000000204890966, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 2204.6876220703125, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.12468023598194122, | |
| "kl": 0.056915283203125, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0156, | |
| "reward": 0.2512501869350672, | |
| "reward_std": 0.21316121704876423, | |
| "rewards/cosine_scaled_reward": 0.17804886028170586, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2694.2084350585938, | |
| "epoch": 0.344, | |
| "grad_norm": 0.09867414832115173, | |
| "kl": 0.08111572265625, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0039, | |
| "reward": 0.07058700546622276, | |
| "reward_std": 0.18468532338738441, | |
| "rewards/cosine_scaled_reward": -0.08327716588973999, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 2437.4791870117188, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.14097610116004944, | |
| "kl": 0.07562255859375, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": -0.0006, | |
| "reward": 0.08986304583959281, | |
| "reward_std": 0.14975667744874954, | |
| "rewards/cosine_scaled_reward": -0.11123824678361416, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 2241.604278564453, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.14365330338478088, | |
| "kl": 0.080322265625, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0183, | |
| "reward": 0.1795977670699358, | |
| "reward_std": 0.19656018540263176, | |
| "rewards/cosine_scaled_reward": -0.05982725415378809, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1963.7500915527344, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.06420152634382248, | |
| "kl": 0.06317138671875, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0106, | |
| "reward": 0.3228389322757721, | |
| "reward_std": 0.225132018327713, | |
| "rewards/cosine_scaled_reward": 0.18825304741039872, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 3088.7708740234375, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.081441231071949, | |
| "kl": 0.09967041015625, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0104, | |
| "reward": 0.11018868116661906, | |
| "reward_std": 0.24773859977722168, | |
| "rewards/cosine_scaled_reward": 0.014070218428969383, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 2378.0208740234375, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.09298369288444519, | |
| "kl": 0.0804443359375, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0269, | |
| "reward": 0.11538790445774794, | |
| "reward_std": 0.15940915420651436, | |
| "rewards/cosine_scaled_reward": 0.004322787746787071, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 2622.854248046875, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.10793717950582504, | |
| "kl": 0.0858154296875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0173, | |
| "reward": 0.22971613216213882, | |
| "reward_std": 0.22900176048278809, | |
| "rewards/cosine_scaled_reward": 0.16193752427352592, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2606.354248046875, | |
| "epoch": 0.352, | |
| "grad_norm": 0.13069109618663788, | |
| "kl": 0.1005859375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0042, | |
| "reward": -0.004278823267668486, | |
| "reward_std": 0.13516085781157017, | |
| "rewards/cosine_scaled_reward": -0.15489922184497118, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2249.541717529297, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.06607223302125931, | |
| "kl": 0.058929443359375, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0051, | |
| "reward": 0.20314783230423927, | |
| "reward_std": 0.20742812659591436, | |
| "rewards/cosine_scaled_reward": 0.01900293491780758, | |
| "rewards/format_reward": 0.75, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 2335.7708740234375, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.05518461763858795, | |
| "kl": 0.0732421875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.012, | |
| "reward": 0.11887777596712112, | |
| "reward_std": 0.16622768342494965, | |
| "rewards/cosine_scaled_reward": -0.06383237708359957, | |
| "rewards/format_reward": 0.5833333488553762, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 2655.166717529297, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.10505408048629761, | |
| "kl": 0.0928955078125, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0216, | |
| "reward": 0.1411935742944479, | |
| "reward_std": 0.18642779998481274, | |
| "rewards/cosine_scaled_reward": 0.03841886296868324, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 2526.7083740234375, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.05685188248753548, | |
| "kl": 0.08819580078125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0114, | |
| "reward": 0.10656812787055969, | |
| "reward_std": 0.21104633808135986, | |
| "rewards/cosine_scaled_reward": -0.06582731753587723, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2139.4375610351562, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.10757915675640106, | |
| "kl": 0.06536865234375, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0003, | |
| "reward": 0.19683823641389608, | |
| "reward_std": 0.18648846447467804, | |
| "rewards/cosine_scaled_reward": -0.0015433002263307571, | |
| "rewards/format_reward": 0.75, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2516.2291870117188, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.1524633765220642, | |
| "kl": 0.0789794921875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0287, | |
| "reward": 0.12881793431006372, | |
| "reward_std": 0.20116973295807838, | |
| "rewards/cosine_scaled_reward": -0.03406868875026703, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2014.3750610351562, | |
| "epoch": 0.36, | |
| "grad_norm": 0.11337730288505554, | |
| "kl": 0.06561279296875, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0187, | |
| "reward": 0.12829207256436348, | |
| "reward_std": 0.21666407957673073, | |
| "rewards/cosine_scaled_reward": -0.06101685017347336, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 2774.791748046875, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.09875557571649551, | |
| "kl": 0.104248046875, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0122, | |
| "reward": -0.0629839962348342, | |
| "reward_std": 0.12521347776055336, | |
| "rewards/cosine_scaled_reward": -0.22717295214533806, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 2848.0000610351562, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.10147333145141602, | |
| "kl": 0.099517822265625, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0094, | |
| "reward": 0.05106092058122158, | |
| "reward_std": 0.1844017505645752, | |
| "rewards/cosine_scaled_reward": -0.09010570449754596, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 2637.541748046875, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.10713284462690353, | |
| "kl": 0.09912109375, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0094, | |
| "reward": 0.09471104294061661, | |
| "reward_std": 0.1826849989593029, | |
| "rewards/cosine_scaled_reward": -0.07070518797263503, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 2840.0000915527344, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.07892774045467377, | |
| "kl": 0.105712890625, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0126, | |
| "reward": 0.0851662028580904, | |
| "reward_std": 0.22858255915343761, | |
| "rewards/cosine_scaled_reward": -0.07701923698186874, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 2254.6250610351562, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.2352837324142456, | |
| "kl": 0.08685302734375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0307, | |
| "reward": 0.16311589442193508, | |
| "reward_std": 0.27095119282603264, | |
| "rewards/cosine_scaled_reward": 0.004516004119068384, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 2928.8334350585938, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.08478543162345886, | |
| "kl": 0.1251220703125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0232, | |
| "reward": 0.08492608368396759, | |
| "reward_std": 0.167075764387846, | |
| "rewards/cosine_scaled_reward": -0.025654247030615807, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2225.104278564453, | |
| "epoch": 0.368, | |
| "grad_norm": 0.11332879960536957, | |
| "kl": 0.0806884765625, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0184, | |
| "reward": 0.08830842701718211, | |
| "reward_std": 0.20369251817464828, | |
| "rewards/cosine_scaled_reward": -0.13531437516212463, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2561.3750610351562, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.06960780918598175, | |
| "kl": 0.089141845703125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0111, | |
| "reward": 0.19159602496074513, | |
| "reward_std": 0.11802363023161888, | |
| "rewards/cosine_scaled_reward": 0.10042979568243027, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 2930.3125, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.07716625183820724, | |
| "kl": 0.139892578125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0271, | |
| "reward": -0.004628020571544766, | |
| "reward_std": 0.16172604821622372, | |
| "rewards/cosine_scaled_reward": -0.18698952719569206, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 2325.2709350585938, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.06906182318925858, | |
| "kl": 0.083160400390625, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0119, | |
| "reward": 0.11972847464494407, | |
| "reward_std": 0.20582032948732376, | |
| "rewards/cosine_scaled_reward": -0.04426138522103429, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2836.6875610351562, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.09085791558027267, | |
| "kl": 0.1112060546875, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0156, | |
| "reward": 0.06654205825179815, | |
| "reward_std": 0.17035862803459167, | |
| "rewards/cosine_scaled_reward": -0.1297546997666359, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2249.9375915527344, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.1248980239033699, | |
| "kl": 0.0697021484375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0172, | |
| "reward": 0.20194087480194867, | |
| "reward_std": 0.1706233825534582, | |
| "rewards/cosine_scaled_reward": 0.06923118606209755, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 2495.3334350585938, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.2801321744918823, | |
| "kl": 0.1103515625, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0402, | |
| "reward": 0.10911815395957092, | |
| "reward_std": 0.22883542627096176, | |
| "rewards/cosine_scaled_reward": -0.07236796617507935, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2529.9375915527344, | |
| "epoch": 0.376, | |
| "grad_norm": 0.14899227023124695, | |
| "kl": 0.088897705078125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0034, | |
| "reward": 0.03729677852243185, | |
| "reward_std": 0.14246170036494732, | |
| "rewards/cosine_scaled_reward": -0.21106715127825737, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2541.604248046875, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.16933830082416534, | |
| "kl": 0.1004638671875, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0251, | |
| "reward": 0.04205328272655606, | |
| "reward_std": 0.20854125544428825, | |
| "rewards/cosine_scaled_reward": -0.14864550344645977, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2726.7709350585938, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.07858636230230331, | |
| "kl": 0.1015625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0117, | |
| "reward": 0.08857154892757535, | |
| "reward_std": 0.17733018845319748, | |
| "rewards/cosine_scaled_reward": -0.08738526329398155, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2786.354248046875, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.1536131203174591, | |
| "kl": 0.1419677734375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0137, | |
| "reward": 0.16590226627886295, | |
| "reward_std": 0.18895284831523895, | |
| "rewards/cosine_scaled_reward": 0.06244879716541618, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 2466.1041870117188, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.09280456602573395, | |
| "kl": 0.12158203125, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0148, | |
| "reward": -0.015152640640735626, | |
| "reward_std": 0.12078623287379742, | |
| "rewards/cosine_scaled_reward": -0.21639476716518402, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 3137.8125610351562, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.11281560361385345, | |
| "kl": 0.144775390625, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0219, | |
| "reward": 0.07791803404688835, | |
| "reward_std": 0.17335851676762104, | |
| "rewards/cosine_scaled_reward": -0.06684091314673424, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2393.229278564453, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.1157144233584404, | |
| "kl": 0.1038818359375, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0108, | |
| "reward": 0.0938750933855772, | |
| "reward_std": 0.15610762871801853, | |
| "rewards/cosine_scaled_reward": -0.11221005395054817, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 2907.8126220703125, | |
| "epoch": 0.384, | |
| "grad_norm": 0.11133909225463867, | |
| "kl": 0.17236328125, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0289, | |
| "reward": 0.004658656194806099, | |
| "reward_std": 0.16840993613004684, | |
| "rewards/cosine_scaled_reward": -0.2090268861502409, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 2235.1459350585938, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.13448435068130493, | |
| "kl": 0.118408203125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0248, | |
| "reward": 0.15862991195172071, | |
| "reward_std": 0.21291902288794518, | |
| "rewards/cosine_scaled_reward": 0.03115895204246044, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 2317.166732788086, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.10800933092832565, | |
| "kl": 0.10296630859375, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0089, | |
| "reward": 0.07435005577281117, | |
| "reward_std": 0.1550746913999319, | |
| "rewards/cosine_scaled_reward": -0.13654402550309896, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 2712.5208740234375, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.10078069567680359, | |
| "kl": 0.11083984375, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0219, | |
| "reward": 0.04339126427657902, | |
| "reward_std": 0.18619941733777523, | |
| "rewards/cosine_scaled_reward": -0.11608852446079254, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2437.2084045410156, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.42870640754699707, | |
| "kl": 0.0941162109375, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0399, | |
| "reward": 0.14150222577154636, | |
| "reward_std": 0.28494541347026825, | |
| "rewards/cosine_scaled_reward": -0.019709208339918405, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2479.8125610351562, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.29453757405281067, | |
| "kl": 0.09765625, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0254, | |
| "reward": 0.30123581551015377, | |
| "reward_std": 0.30215100571513176, | |
| "rewards/cosine_scaled_reward": 0.21321824193000793, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2396.6875, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.15700876712799072, | |
| "kl": 0.10491943359375, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.006, | |
| "reward": 0.06362471543252468, | |
| "reward_std": 0.1603417694568634, | |
| "rewards/cosine_scaled_reward": -0.13952013570815325, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 2603.104248046875, | |
| "epoch": 0.392, | |
| "grad_norm": 0.3471648097038269, | |
| "kl": 0.140869140625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0314, | |
| "reward": 0.16107061505317688, | |
| "reward_std": 0.2604887783527374, | |
| "rewards/cosine_scaled_reward": 0.032809881027787924, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2791.416748046875, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.12738864123821259, | |
| "kl": 0.1568603515625, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0167, | |
| "reward": 0.08531437814235687, | |
| "reward_std": 0.1130725909024477, | |
| "rewards/cosine_scaled_reward": -0.0646376833319664, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2063.1875610351562, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.18870621919631958, | |
| "kl": 0.1322021484375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0217, | |
| "reward": 0.15728350915014744, | |
| "reward_std": 0.14874129742383957, | |
| "rewards/cosine_scaled_reward": -0.019591101678088307, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 2851.3958740234375, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.09028314799070358, | |
| "kl": 0.1865234375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.026, | |
| "reward": 0.0704011912457645, | |
| "reward_std": 0.22107306122779846, | |
| "rewards/cosine_scaled_reward": -0.07064785808324814, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 2225.4791870117188, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.10854969918727875, | |
| "kl": 0.11883544921875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0262, | |
| "reward": 0.2618616446852684, | |
| "reward_std": 0.23568999022245407, | |
| "rewards/cosine_scaled_reward": 0.16030552051961422, | |
| "rewards/format_reward": 0.6875, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2106.7084045410156, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.09777455031871796, | |
| "kl": 0.1402587890625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.023, | |
| "reward": 0.07345362403430045, | |
| "reward_std": 0.17969585955142975, | |
| "rewards/cosine_scaled_reward": -0.1748974435031414, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1646.5833740234375, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.053399790078401566, | |
| "kl": 0.073974609375, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0073, | |
| "reward": 0.263054633513093, | |
| "reward_std": 0.1963588111102581, | |
| "rewards/cosine_scaled_reward": 0.08115220349282026, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 1968.1459350585938, | |
| "epoch": 0.4, | |
| "grad_norm": 0.15918481349945068, | |
| "kl": 0.1402587890625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.028, | |
| "reward": 0.22315800064825453, | |
| "reward_std": 0.20856844261288643, | |
| "rewards/cosine_scaled_reward": 0.08611712232232094, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1509.2291870117188, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.12013411521911621, | |
| "kl": 0.06610107421875, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.013, | |
| "reward": 0.11616317229345441, | |
| "reward_std": 0.13754734210669994, | |
| "rewards/cosine_scaled_reward": -0.1139280404895544, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2333.854248046875, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.12231642752885818, | |
| "kl": 0.1798095703125, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0191, | |
| "reward": 0.13605477567762136, | |
| "reward_std": 0.16071481630206108, | |
| "rewards/cosine_scaled_reward": -0.09295779466629028, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 2585.791717529297, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.21827702224254608, | |
| "kl": 0.18603515625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0146, | |
| "reward": 0.11951232142746449, | |
| "reward_std": 0.22043934278190136, | |
| "rewards/cosine_scaled_reward": -0.07062636315822601, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 2889.1459350585938, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.3793615996837616, | |
| "kl": 0.251220703125, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0162, | |
| "reward": 0.012035491410642862, | |
| "reward_std": 0.17300541140139103, | |
| "rewards/cosine_scaled_reward": -0.18593342415988445, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 1715.5208587646484, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.1848497837781906, | |
| "kl": 0.14276885986328125, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0103, | |
| "reward": 0.13455185294151306, | |
| "reward_std": 0.12003207392990589, | |
| "rewards/cosine_scaled_reward": -0.07982492633163929, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2946.8333740234375, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.3567591905593872, | |
| "kl": 0.2039794921875, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0121, | |
| "reward": 0.061174651607871056, | |
| "reward_std": 0.12309401016682386, | |
| "rewards/cosine_scaled_reward": -0.05049964040517807, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 2964.6875610351562, | |
| "epoch": 0.408, | |
| "grad_norm": 0.18627581000328064, | |
| "kl": 0.250244140625, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.026, | |
| "reward": -0.01721569336950779, | |
| "reward_std": 0.1486299056559801, | |
| "rewards/cosine_scaled_reward": -0.2300855666399002, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2763.291717529297, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.33291012048721313, | |
| "kl": 0.1829833984375, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0105, | |
| "reward": 0.14173107966780663, | |
| "reward_std": 0.17784210667014122, | |
| "rewards/cosine_scaled_reward": 0.04421359859406948, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 2299.1876220703125, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.1993589997291565, | |
| "kl": 0.1390380859375, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0087, | |
| "reward": 0.11581943836063147, | |
| "reward_std": 0.15780886262655258, | |
| "rewards/cosine_scaled_reward": -0.04972851276397705, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 2084.604248046875, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.1641249805688858, | |
| "kl": 0.1513671875, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0122, | |
| "reward": 0.28426261618733406, | |
| "reward_std": 0.13949369452893734, | |
| "rewards/cosine_scaled_reward": 0.18073145393282175, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 2283.3125610351562, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.17498940229415894, | |
| "kl": 0.0999755859375, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0019, | |
| "reward": 0.12389247631654143, | |
| "reward_std": 0.157631978392601, | |
| "rewards/cosine_scaled_reward": -0.05891747213900089, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 2442.166748046875, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.11291036009788513, | |
| "kl": 0.1329345703125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0131, | |
| "reward": 0.12254431564360857, | |
| "reward_std": 0.2619587108492851, | |
| "rewards/cosine_scaled_reward": -0.015398215502500534, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 2656.4791870117188, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.15177766978740692, | |
| "kl": 0.112060546875, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0173, | |
| "reward": 0.16184489242732525, | |
| "reward_std": 0.20883875899016857, | |
| "rewards/cosine_scaled_reward": 0.034410827327519655, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 2809.5416870117188, | |
| "epoch": 0.416, | |
| "grad_norm": 0.10156676173210144, | |
| "kl": 0.149658203125, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0198, | |
| "reward": 0.03160530608147383, | |
| "reward_std": 0.18550903722643852, | |
| "rewards/cosine_scaled_reward": -0.13592858472838998, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 3105.5000610351562, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.09864702075719833, | |
| "kl": 0.1112060546875, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0106, | |
| "reward": 0.179584838449955, | |
| "reward_std": 0.20714671537280083, | |
| "rewards/cosine_scaled_reward": 0.036294665187597275, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 2408.8958740234375, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.06630592793226242, | |
| "kl": 0.085052490234375, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0102, | |
| "reward": 0.16667877696454525, | |
| "reward_std": 0.2439602054655552, | |
| "rewards/cosine_scaled_reward": -0.007324194069951773, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1786.0833740234375, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.3039757311344147, | |
| "kl": 0.0784912109375, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0258, | |
| "reward": 0.23519797436892986, | |
| "reward_std": 0.25861874781548977, | |
| "rewards/cosine_scaled_reward": 0.03339534252882004, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1936.1250915527344, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.20097126066684723, | |
| "kl": 0.07049560546875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0093, | |
| "reward": 0.18592305853962898, | |
| "reward_std": 0.20696640759706497, | |
| "rewards/cosine_scaled_reward": -0.01663225144147873, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 2522.541732788086, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.08981841057538986, | |
| "kl": 0.125885009765625, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0145, | |
| "reward": 0.11640452593564987, | |
| "reward_std": 0.18782025203108788, | |
| "rewards/cosine_scaled_reward": -0.03775274008512497, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 3076.7708740234375, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.12859641015529633, | |
| "kl": 0.1231689453125, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0067, | |
| "reward": 0.060016830917447805, | |
| "reward_std": 0.1693633273243904, | |
| "rewards/cosine_scaled_reward": -0.11036420240998268, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 2782.8125610351562, | |
| "epoch": 0.424, | |
| "grad_norm": 0.14062629640102386, | |
| "kl": 0.13232421875, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0138, | |
| "reward": 0.03680555988103151, | |
| "reward_std": 0.1516097765415907, | |
| "rewards/cosine_scaled_reward": -0.13098294660449028, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 2777.604278564453, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.08935054391622543, | |
| "kl": 0.115478515625, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0168, | |
| "reward": 0.09560818038880825, | |
| "reward_std": 0.18122290819883347, | |
| "rewards/cosine_scaled_reward": -0.03500601276755333, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 2812.5209350585938, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.14136241376399994, | |
| "kl": 0.112060546875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0075, | |
| "reward": 0.03981833381112665, | |
| "reward_std": 0.12803484313189983, | |
| "rewards/cosine_scaled_reward": -0.13392143324017525, | |
| "rewards/format_reward": 0.4166666865348816, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 2644.6041870117188, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.1431395560503006, | |
| "kl": 0.0850830078125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0042, | |
| "reward": 0.01895416807383299, | |
| "reward_std": 0.12385790795087814, | |
| "rewards/cosine_scaled_reward": -0.1493915691971779, | |
| "rewards/format_reward": 0.375, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 3007.2708740234375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.10009056329727173, | |
| "kl": 0.120849609375, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0166, | |
| "reward": 0.04848789609968662, | |
| "reward_std": 0.18610725738108158, | |
| "rewards/cosine_scaled_reward": -0.09433378279209137, | |
| "rewards/format_reward": 0.375, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 2730.0625610351562, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.08271081000566483, | |
| "kl": 0.0869140625, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0144, | |
| "reward": 0.22470496874302626, | |
| "reward_std": 0.1960140261799097, | |
| "rewards/cosine_scaled_reward": 0.12158859148621559, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 2607.7708435058594, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.14988483488559723, | |
| "kl": 0.103271484375, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0207, | |
| "reward": 0.11803832091391087, | |
| "reward_std": 0.190413236618042, | |
| "rewards/cosine_scaled_reward": 0.016210654750466347, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 2483.1459350585938, | |
| "epoch": 0.432, | |
| "grad_norm": 0.0689978078007698, | |
| "kl": 0.063720703125, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0149, | |
| "reward": 0.0701227942481637, | |
| "reward_std": 0.13309755828231573, | |
| "rewards/cosine_scaled_reward": -0.13521984964609146, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 2686.666748046875, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.12755092978477478, | |
| "kl": 0.104248046875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0134, | |
| "reward": 0.1154003469273448, | |
| "reward_std": 0.2216249257326126, | |
| "rewards/cosine_scaled_reward": -0.02049895841628313, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 3167.541748046875, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.10074777156114578, | |
| "kl": 0.1197509765625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0172, | |
| "reward": -0.012974897399544716, | |
| "reward_std": 0.20581556484103203, | |
| "rewards/cosine_scaled_reward": -0.13217015098780394, | |
| "rewards/format_reward": 0.20833334513008595, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 2868.0209350585938, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.14523713290691376, | |
| "kl": 0.09814453125, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0179, | |
| "reward": 0.1726588997989893, | |
| "reward_std": 0.23441699147224426, | |
| "rewards/cosine_scaled_reward": 0.06420985609292984, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 2559.666717529297, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.07162333279848099, | |
| "kl": 0.0784912109375, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0048, | |
| "reward": 0.0915819201618433, | |
| "reward_std": 0.12614005245268345, | |
| "rewards/cosine_scaled_reward": -0.07447902113199234, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 2769.5209350585938, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.09530221670866013, | |
| "kl": 0.07958984375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0065, | |
| "reward": 0.15348075050860643, | |
| "reward_std": 0.1659582406282425, | |
| "rewards/cosine_scaled_reward": 0.06943665817379951, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 2584.0208587646484, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.09211739152669907, | |
| "kl": 0.08917236328125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.013, | |
| "reward": 0.017119793221354485, | |
| "reward_std": 0.1543099209666252, | |
| "rewards/cosine_scaled_reward": -0.22795158438384533, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 2603.479248046875, | |
| "epoch": 0.44, | |
| "grad_norm": 0.08993380516767502, | |
| "kl": 0.0845947265625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0105, | |
| "reward": 0.1502766478806734, | |
| "reward_std": 0.1914801448583603, | |
| "rewards/cosine_scaled_reward": -0.02175597846508026, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 2500.041748046875, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.12333329766988754, | |
| "kl": 0.08642578125, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.007, | |
| "reward": 0.043349082581698895, | |
| "reward_std": 0.13621691428124905, | |
| "rewards/cosine_scaled_reward": -0.10375059582293034, | |
| "rewards/format_reward": 0.375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 2772.1666870117188, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.07059130817651749, | |
| "kl": 0.08355712890625, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0061, | |
| "reward": 0.0104325320571661, | |
| "reward_std": 0.18962214328348637, | |
| "rewards/cosine_scaled_reward": -0.1913592740893364, | |
| "rewards/format_reward": 0.41666669212281704, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 2410.104217529297, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.0690821185708046, | |
| "kl": 0.0565338134765625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0033, | |
| "reward": 0.10766912996768951, | |
| "reward_std": 0.12889442220330238, | |
| "rewards/cosine_scaled_reward": -0.06408504024147987, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 2317.8334045410156, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.26250460743904114, | |
| "kl": 0.0724639892578125, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0202, | |
| "reward": 0.18487818632274866, | |
| "reward_std": 0.26243601739406586, | |
| "rewards/cosine_scaled_reward": 0.06075235269963741, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 2417.2708740234375, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.19676780700683594, | |
| "kl": 0.080078125, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0278, | |
| "reward": 0.11565285408869386, | |
| "reward_std": 0.17923235893249512, | |
| "rewards/cosine_scaled_reward": -0.05105341598391533, | |
| "rewards/format_reward": 0.5416666939854622, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 2586.8125915527344, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.05162471905350685, | |
| "kl": 0.08984375, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0139, | |
| "reward": 0.036330622620880604, | |
| "reward_std": 0.13833574950695038, | |
| "rewards/cosine_scaled_reward": -0.22553806751966476, | |
| "rewards/format_reward": 0.5833333544433117, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 2698.4584045410156, | |
| "epoch": 0.448, | |
| "grad_norm": 0.2445743978023529, | |
| "kl": 0.0880126953125, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0207, | |
| "reward": 0.17882265825755894, | |
| "reward_std": 0.22073433548212051, | |
| "rewards/cosine_scaled_reward": 0.0916006825864315, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 2704.8541870117188, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.11717414110898972, | |
| "kl": 0.100830078125, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0131, | |
| "reward": 0.15226414240896702, | |
| "reward_std": 0.13359035179018974, | |
| "rewards/cosine_scaled_reward": 0.0009656660258769989, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 2959.166748046875, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.14118987321853638, | |
| "kl": 0.10443115234375, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0099, | |
| "reward": 0.004329454503022134, | |
| "reward_std": 0.13238821923732758, | |
| "rewards/cosine_scaled_reward": -0.17126433784142137, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 3017.6668090820312, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.07346347719430923, | |
| "kl": 0.0919189453125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0098, | |
| "reward": 0.01728492250549607, | |
| "reward_std": 0.13531852420419455, | |
| "rewards/cosine_scaled_reward": -0.15793361514806747, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 2521.166748046875, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.049210913479328156, | |
| "kl": 0.101806640625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0131, | |
| "reward": 0.09421654045581818, | |
| "reward_std": 0.1612901221960783, | |
| "rewards/cosine_scaled_reward": -0.09077206254005432, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2914.104248046875, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.14572396874427795, | |
| "kl": 0.11083984375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0111, | |
| "reward": 0.10872509330511093, | |
| "reward_std": 0.12936695851385593, | |
| "rewards/cosine_scaled_reward": 0.06242356216534972, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 2380.2083740234375, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.09944668412208557, | |
| "kl": 0.0888671875, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0156, | |
| "reward": 0.1966765783727169, | |
| "reward_std": 0.21381664834916592, | |
| "rewards/cosine_scaled_reward": 0.0753667987883091, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 2756.229248046875, | |
| "epoch": 0.456, | |
| "grad_norm": 0.09621509909629822, | |
| "kl": 0.1190948486328125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.014, | |
| "reward": 0.026204954832792282, | |
| "reward_std": 0.12728692777454853, | |
| "rewards/cosine_scaled_reward": -0.11903030052781105, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 2053.3125610351562, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.07081139087677002, | |
| "kl": 0.066070556640625, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0042, | |
| "reward": 0.1593646313995123, | |
| "reward_std": 0.12290738895535469, | |
| "rewards/cosine_scaled_reward": -0.028231848031282425, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 2574.3750610351562, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.1228204220533371, | |
| "kl": 0.1038818359375, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0083, | |
| "reward": 0.11001632362604141, | |
| "reward_std": 0.17932304367423058, | |
| "rewards/cosine_scaled_reward": -0.11328526982106268, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 3217.5208740234375, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.08836724609136581, | |
| "kl": 0.10791015625, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0135, | |
| "reward": 0.0018205083906650543, | |
| "reward_std": 0.15246334299445152, | |
| "rewards/cosine_scaled_reward": -0.16242793202400208, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 3098.166748046875, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.1709667444229126, | |
| "kl": 0.104248046875, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0222, | |
| "reward": 0.02667865250259638, | |
| "reward_std": 0.208634614944458, | |
| "rewards/cosine_scaled_reward": -0.14999525004532188, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 2807.666717529297, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.2069401741027832, | |
| "kl": 0.11279296875, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.027, | |
| "reward": 0.12614703457802534, | |
| "reward_std": 0.2216541450470686, | |
| "rewards/cosine_scaled_reward": 0.013826873153448105, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 2927.541748046875, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.10408933460712433, | |
| "kl": 0.100341796875, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0088, | |
| "reward": 0.1473550908267498, | |
| "reward_std": 0.22264716401696205, | |
| "rewards/cosine_scaled_reward": 0.02332792617380619, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 2602.5416870117188, | |
| "epoch": 0.464, | |
| "grad_norm": 0.4432380497455597, | |
| "kl": 0.0810546875, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0365, | |
| "reward": 0.18797563947737217, | |
| "reward_std": 0.26618560403585434, | |
| "rewards/cosine_scaled_reward": 0.07161715440452099, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2659.2291870117188, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.16761444509029388, | |
| "kl": 0.13720703125, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0088, | |
| "reward": 0.08081456902436912, | |
| "reward_std": 0.10845427960157394, | |
| "rewards/cosine_scaled_reward": -0.0889170840382576, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2809.4375610351562, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.16567623615264893, | |
| "kl": 0.1326904296875, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0075, | |
| "reward": 0.0682485131546855, | |
| "reward_std": 0.12854525819420815, | |
| "rewards/cosine_scaled_reward": -0.10783641040325165, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 2382.2708435058594, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.19722118973731995, | |
| "kl": 0.15478515625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0165, | |
| "reward": 0.1888794545084238, | |
| "reward_std": 0.2560318484902382, | |
| "rewards/cosine_scaled_reward": 0.07878298778086901, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2792.104248046875, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.13789673149585724, | |
| "kl": 0.0892333984375, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0167, | |
| "reward": 0.08483636751770973, | |
| "reward_std": 0.16801734641194344, | |
| "rewards/cosine_scaled_reward": -0.09572408557869494, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 3008.5833740234375, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.165581613779068, | |
| "kl": 0.1275634765625, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0078, | |
| "reward": -0.034305301029235125, | |
| "reward_std": 0.11271641030907631, | |
| "rewards/cosine_scaled_reward": -0.23245973512530327, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2866.3750915527344, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.11941103637218475, | |
| "kl": 0.1312255859375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0093, | |
| "reward": 0.049833407159894705, | |
| "reward_std": 0.13003908470273018, | |
| "rewards/cosine_scaled_reward": -0.11439976841211319, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 3091.0001220703125, | |
| "epoch": 0.472, | |
| "grad_norm": 0.14630669355392456, | |
| "kl": 0.1300048828125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0141, | |
| "reward": -0.018676850322663086, | |
| "reward_std": 0.182958560064435, | |
| "rewards/cosine_scaled_reward": -0.1924271397292614, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2818.229248046875, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.06924273818731308, | |
| "kl": 0.11798095703125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0202, | |
| "reward": 0.08587748650461435, | |
| "reward_std": 0.1340707242488861, | |
| "rewards/cosine_scaled_reward": -0.01012038066983223, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 3325.9375610351562, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 0.19681856036186218, | |
| "kl": 0.141845703125, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0195, | |
| "reward": -0.012051378726027906, | |
| "reward_std": 0.18657904118299484, | |
| "rewards/cosine_scaled_reward": -0.14156150445342064, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 2779.854248046875, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.18646536767482758, | |
| "kl": 0.10223388671875, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0013, | |
| "reward": 0.12834218982607126, | |
| "reward_std": 0.1757796686142683, | |
| "rewards/cosine_scaled_reward": -0.032867638394236565, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 3148.0833740234375, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.1434994637966156, | |
| "kl": 0.13720703125, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0076, | |
| "reward": 0.11033766658511013, | |
| "reward_std": 0.1882594134658575, | |
| "rewards/cosine_scaled_reward": -0.014765150845050812, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 3250.416748046875, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.20610202848911285, | |
| "kl": 0.16943359375, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0114, | |
| "reward": 0.004000701010227203, | |
| "reward_std": 0.17842144519090652, | |
| "rewards/cosine_scaled_reward": -0.11929273931309581, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 3022.854248046875, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.12843535840511322, | |
| "kl": 0.1234130859375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0153, | |
| "reward": 0.11016191449016333, | |
| "reward_std": 0.19738462194800377, | |
| "rewards/cosine_scaled_reward": 0.015342913568019867, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 2693.0000915527344, | |
| "epoch": 0.48, | |
| "grad_norm": 0.14908377826213837, | |
| "kl": 0.139892578125, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0237, | |
| "reward": 0.18104586005210876, | |
| "reward_std": 0.27537010610103607, | |
| "rewards/cosine_scaled_reward": 0.08193270675837994, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 2401.5208435058594, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.18677739799022675, | |
| "kl": 0.07977294921875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0188, | |
| "reward": 0.10364878876134753, | |
| "reward_std": 0.26506754755973816, | |
| "rewards/cosine_scaled_reward": -0.07407497242093086, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2945.4583740234375, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.10596109926700592, | |
| "kl": 0.173095703125, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0237, | |
| "reward": 0.09782602824270725, | |
| "reward_std": 0.16709819808602333, | |
| "rewards/cosine_scaled_reward": -0.02714484930038452, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2630.5833587646484, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.1338430792093277, | |
| "kl": 0.09613037109375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0092, | |
| "reward": 0.11964964680373669, | |
| "reward_std": 0.21679277531802654, | |
| "rewards/cosine_scaled_reward": -0.040930287912487984, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 2916.0208740234375, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.09180784225463867, | |
| "kl": 0.10906982421875, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0124, | |
| "reward": 0.08189435419626534, | |
| "reward_std": 0.13574443012475967, | |
| "rewards/cosine_scaled_reward": -0.10579311475157738, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 3309.3125610351562, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.13504928350448608, | |
| "kl": 0.162109375, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0175, | |
| "reward": 0.05675757909193635, | |
| "reward_std": 0.26599225401878357, | |
| "rewards/cosine_scaled_reward": -0.07623798586428165, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 2787.8959350585938, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.3096926212310791, | |
| "kl": 0.1478271484375, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0322, | |
| "reward": 0.1879144348204136, | |
| "reward_std": 0.2544500045478344, | |
| "rewards/cosine_scaled_reward": 0.10508503206074238, | |
| "rewards/format_reward": 0.5000000223517418, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 2851.1251220703125, | |
| "epoch": 0.488, | |
| "grad_norm": 0.19224634766578674, | |
| "kl": 0.0966796875, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0332, | |
| "reward": 0.09959506615996361, | |
| "reward_std": 0.22977428324520588, | |
| "rewards/cosine_scaled_reward": -0.00887887051794678, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 3091.6041870117188, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.1674949824810028, | |
| "kl": 0.137939453125, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0103, | |
| "reward": -0.02589015825651586, | |
| "reward_std": 0.11072252877056599, | |
| "rewards/cosine_scaled_reward": -0.190176859498024, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 3006.9584350585938, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.2110871821641922, | |
| "kl": 0.120361328125, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0232, | |
| "reward": 0.10162217915058136, | |
| "reward_std": 0.2770567834377289, | |
| "rewards/cosine_scaled_reward": -0.00024698610650375485, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 3136.354248046875, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.19881188869476318, | |
| "kl": 0.165283203125, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0161, | |
| "reward": 0.058886402286589146, | |
| "reward_std": 0.18321863561868668, | |
| "rewards/cosine_scaled_reward": -0.04400887340307236, | |
| "rewards/format_reward": 0.3125000149011612, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 3038.416748046875, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.14902929961681366, | |
| "kl": 0.208251953125, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0327, | |
| "reward": 0.054865699261426926, | |
| "reward_std": 0.21466796472668648, | |
| "rewards/cosine_scaled_reward": -0.07096906809601933, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 3104.4375610351562, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.1337394416332245, | |
| "kl": 0.25244140625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0247, | |
| "reward": 0.005363433388993144, | |
| "reward_std": 0.20017725601792336, | |
| "rewards/cosine_scaled_reward": -0.15636316500604153, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 3364.5000610351562, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.1730562001466751, | |
| "kl": 0.1552734375, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0277, | |
| "reward": -0.03153383359313011, | |
| "reward_std": 0.2260677069425583, | |
| "rewards/cosine_scaled_reward": -0.15686456114053726, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2117.500030517578, | |
| "epoch": 0.496, | |
| "grad_norm": 0.26296311616897583, | |
| "kl": 0.12432861328125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0189, | |
| "reward": 0.29069491755217314, | |
| "reward_std": 0.2671359069645405, | |
| "rewards/cosine_scaled_reward": 0.19368860870599747, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 2974.854248046875, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.17621202766895294, | |
| "kl": 0.1923828125, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.019, | |
| "reward": 8.131563663482666e-05, | |
| "reward_std": 0.1405728980898857, | |
| "rewards/cosine_scaled_reward": -0.1765117086470127, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 3210.9375610351562, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.20731380581855774, | |
| "kl": 0.1595458984375, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0149, | |
| "reward": -0.035835981369018555, | |
| "reward_std": 0.14877472072839737, | |
| "rewards/cosine_scaled_reward": -0.21610296051949263, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2836.979248046875, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.21571239829063416, | |
| "kl": 0.1517333984375, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0104, | |
| "reward": 0.0753198517486453, | |
| "reward_std": 0.18916064128279686, | |
| "rewards/cosine_scaled_reward": -0.10683454759418964, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2800.791748046875, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.2671648859977722, | |
| "kl": 0.1373291015625, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": -0.0019, | |
| "reward": 0.10509323189035058, | |
| "reward_std": 0.22916123643517494, | |
| "rewards/cosine_scaled_reward": -0.10928571410477161, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2471.5625610351562, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.11148757487535477, | |
| "kl": 0.107666015625, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0124, | |
| "reward": 0.11210739426314831, | |
| "reward_std": 0.1411634124815464, | |
| "rewards/cosine_scaled_reward": -0.07587022334337234, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 2824.625, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.1120753064751625, | |
| "kl": 0.107421875, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0224, | |
| "reward": 0.08505485765635967, | |
| "reward_std": 0.20817279256880283, | |
| "rewards/cosine_scaled_reward": -0.09994211653247476, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2719.979248046875, | |
| "epoch": 0.504, | |
| "grad_norm": 0.18487463891506195, | |
| "kl": 0.09619140625, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0244, | |
| "reward": 0.07016580831259489, | |
| "reward_std": 0.1472882442176342, | |
| "rewards/cosine_scaled_reward": -0.08464274555444717, | |
| "rewards/format_reward": 0.43750002048909664, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2590.354217529297, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.11660459637641907, | |
| "kl": 0.1036376953125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0132, | |
| "reward": 0.1844820436090231, | |
| "reward_std": 0.11101518012583256, | |
| "rewards/cosine_scaled_reward": 0.09033233672380447, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 2885.416748046875, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.09919751435518265, | |
| "kl": 0.105224609375, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0167, | |
| "reward": 0.04893940966576338, | |
| "reward_std": 0.16348855942487717, | |
| "rewards/cosine_scaled_reward": -0.09520457684993744, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 3022.479248046875, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.3206675350666046, | |
| "kl": 0.1644287109375, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0076, | |
| "reward": 0.12928162515163422, | |
| "reward_std": 0.13619288429617882, | |
| "rewards/cosine_scaled_reward": 0.06258754059672356, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 3131.5001220703125, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.05568554624915123, | |
| "kl": 0.09429931640625, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0129, | |
| "reward": 0.06112617999315262, | |
| "reward_std": 0.1616399548947811, | |
| "rewards/cosine_scaled_reward": -0.08714022114872932, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2526.3333740234375, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.22921255230903625, | |
| "kl": 0.10003662109375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0258, | |
| "reward": 0.2065272876061499, | |
| "reward_std": 0.22883396595716476, | |
| "rewards/cosine_scaled_reward": 0.07993827015161514, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 2746.2500610351562, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.1966109722852707, | |
| "kl": 0.21258544921875, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0188, | |
| "reward": 0.18417136743664742, | |
| "reward_std": 0.20083166286349297, | |
| "rewards/cosine_scaled_reward": 0.0021979063749313354, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 2765.666748046875, | |
| "epoch": 0.512, | |
| "grad_norm": 0.1699213981628418, | |
| "kl": 0.14111328125, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0054, | |
| "reward": 0.25757382810115814, | |
| "reward_std": 0.28643427789211273, | |
| "rewards/cosine_scaled_reward": 0.13410454744007438, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 2790.4583740234375, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.10593433678150177, | |
| "kl": 0.10107421875, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0052, | |
| "reward": 0.2120291143655777, | |
| "reward_std": 0.2164306379854679, | |
| "rewards/cosine_scaled_reward": 0.1502765268087387, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2983.3125610351562, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.10273667424917221, | |
| "kl": 0.1207275390625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0164, | |
| "reward": 0.12469350174069405, | |
| "reward_std": 0.16605908796191216, | |
| "rewards/cosine_scaled_reward": 0.041525671957060695, | |
| "rewards/format_reward": 0.39583334885537624, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 2936.166748046875, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 0.09980158507823944, | |
| "kl": 0.1236572265625, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0111, | |
| "reward": 0.04728453326970339, | |
| "reward_std": 0.19829624518752098, | |
| "rewards/cosine_scaled_reward": -0.1785262580960989, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 3068.5209350585938, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 0.09439096599817276, | |
| "kl": 0.1202392578125, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0198, | |
| "reward": -0.02472090208902955, | |
| "reward_std": 0.19751409068703651, | |
| "rewards/cosine_scaled_reward": -0.18450694406055845, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 3248.1458740234375, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.20403756201267242, | |
| "kl": 0.10986328125, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0185, | |
| "reward": 0.19421018287539482, | |
| "reward_std": 0.28973371908068657, | |
| "rewards/cosine_scaled_reward": 0.1276424676179886, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 3284.2083740234375, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.20986713469028473, | |
| "kl": 0.181396484375, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0154, | |
| "reward": -0.014905142597854137, | |
| "reward_std": 0.1936473622918129, | |
| "rewards/cosine_scaled_reward": -0.1740622464567423, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 2563.0208740234375, | |
| "epoch": 0.52, | |
| "grad_norm": 0.3010261058807373, | |
| "kl": 0.1158447265625, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0177, | |
| "reward": 0.1574697308242321, | |
| "reward_std": 0.2700018286705017, | |
| "rewards/cosine_scaled_reward": 0.01733357459306717, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 2628.9375610351562, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.08745517581701279, | |
| "kl": 0.10638427734375, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.014, | |
| "reward": 0.13036726391874254, | |
| "reward_std": 0.1483678873628378, | |
| "rewards/cosine_scaled_reward": -0.020923439413309097, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 2737.8334350585938, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.16678327322006226, | |
| "kl": 0.12060546875, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0209, | |
| "reward": 0.08800210431218147, | |
| "reward_std": 0.19280700013041496, | |
| "rewards/cosine_scaled_reward": -0.03003532299771905, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2646.625030517578, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 0.09565078467130661, | |
| "kl": 0.1390380859375, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0179, | |
| "reward": 0.19407072104513645, | |
| "reward_std": 0.24042771011590958, | |
| "rewards/cosine_scaled_reward": 0.09138227254152298, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 3175.541748046875, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.2203829437494278, | |
| "kl": 0.155517578125, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0287, | |
| "reward": -0.0014233086258172989, | |
| "reward_std": 0.21528013050556183, | |
| "rewards/cosine_scaled_reward": -0.1592085063457489, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2788.854248046875, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.21179553866386414, | |
| "kl": 0.12823486328125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0147, | |
| "reward": 0.17923392355442047, | |
| "reward_std": 0.24272285774350166, | |
| "rewards/cosine_scaled_reward": 0.09720991738140583, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2897.7708740234375, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 0.13827277719974518, | |
| "kl": 0.190673828125, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0209, | |
| "reward": 0.11659316718578339, | |
| "reward_std": 0.18873300403356552, | |
| "rewards/cosine_scaled_reward": -0.05558823235332966, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 3041.1875610351562, | |
| "epoch": 0.528, | |
| "grad_norm": 0.32120588421821594, | |
| "kl": 0.11181640625, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0286, | |
| "reward": 0.06565053202211857, | |
| "reward_std": 0.1737687923014164, | |
| "rewards/cosine_scaled_reward": -0.019681161269545555, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2623.875030517578, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.2696680426597595, | |
| "kl": 0.12872314453125, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0063, | |
| "reward": 0.17500283475965261, | |
| "reward_std": 0.17704222910106182, | |
| "rewards/cosine_scaled_reward": 0.08128289505839348, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 3145.6876220703125, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.10782798379659653, | |
| "kl": 0.17041015625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0122, | |
| "reward": 0.05478057137224823, | |
| "reward_std": 0.20212148874998093, | |
| "rewards/cosine_scaled_reward": -0.1439938172698021, | |
| "rewards/format_reward": 0.5, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 3028.3958740234375, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.1276625692844391, | |
| "kl": 0.1417236328125, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0152, | |
| "reward": 0.01958487369120121, | |
| "reward_std": 0.13132779486477375, | |
| "rewards/cosine_scaled_reward": -0.10825883969664574, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 3143.479248046875, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.1533503532409668, | |
| "kl": 0.154052734375, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0154, | |
| "reward": 0.03453392535448074, | |
| "reward_std": 0.1747850365936756, | |
| "rewards/cosine_scaled_reward": -0.10015170648694038, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 2970.6875610351562, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.1531020849943161, | |
| "kl": 0.1981201171875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0193, | |
| "reward": 0.11466073244810104, | |
| "reward_std": 0.18171420320868492, | |
| "rewards/cosine_scaled_reward": -0.04780617356300354, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2798.6666870117188, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.49003785848617554, | |
| "kl": 0.1258544921875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.041, | |
| "reward": 0.15389825403690338, | |
| "reward_std": 0.21318360045552254, | |
| "rewards/cosine_scaled_reward": 0.09784095920622349, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2521.1250915527344, | |
| "epoch": 0.536, | |
| "grad_norm": 0.1207413300871849, | |
| "kl": 0.12939453125, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0144, | |
| "reward": 0.12459285743534565, | |
| "reward_std": 0.24095628038048744, | |
| "rewards/cosine_scaled_reward": -0.07370727881789207, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2418.5834350585938, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 0.09924568980932236, | |
| "kl": 0.1207275390625, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0181, | |
| "reward": 0.15776699036359787, | |
| "reward_std": 0.181232038885355, | |
| "rewards/cosine_scaled_reward": 0.0003161989152431488, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 3219.354248046875, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.20474353432655334, | |
| "kl": 0.195556640625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.028, | |
| "reward": -0.021314775571227074, | |
| "reward_std": 0.15102575160562992, | |
| "rewards/cosine_scaled_reward": -0.1376272514462471, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2824.8959350585938, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.11642623692750931, | |
| "kl": 0.1668701171875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0169, | |
| "reward": 0.19659322872757912, | |
| "reward_std": 0.22429415583610535, | |
| "rewards/cosine_scaled_reward": 0.09102061949670315, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2985.979248046875, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.10728507488965988, | |
| "kl": 0.14892578125, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0189, | |
| "reward": 0.05383403552696109, | |
| "reward_std": 0.16374297812581062, | |
| "rewards/cosine_scaled_reward": -0.1464824303984642, | |
| "rewards/format_reward": 0.5000000223517418, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2839.1250610351562, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.28098025918006897, | |
| "kl": 0.178466796875, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0074, | |
| "reward": 0.1635891911573708, | |
| "reward_std": 0.15074511989951134, | |
| "rewards/cosine_scaled_reward": 0.0024009905755519867, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 3077.8959350585938, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.12044571340084076, | |
| "kl": 0.1651611328125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0192, | |
| "reward": 0.11944649741053581, | |
| "reward_std": 0.2596472129225731, | |
| "rewards/cosine_scaled_reward": -0.0492430180311203, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 2893.8750610351562, | |
| "epoch": 0.544, | |
| "grad_norm": 0.2212853878736496, | |
| "kl": 0.11370849609375, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0211, | |
| "reward": 0.009671430103480816, | |
| "reward_std": 0.15169707685709, | |
| "rewards/cosine_scaled_reward": -0.12774943560361862, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1890.6250305175781, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.15935790538787842, | |
| "kl": 0.1428985595703125, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0107, | |
| "reward": 0.18408689834177494, | |
| "reward_std": 0.15035339817404747, | |
| "rewards/cosine_scaled_reward": 0.007445871829986572, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 2822.3126220703125, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.3059462308883667, | |
| "kl": 0.20526123046875, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0144, | |
| "reward": 0.11325427703559399, | |
| "reward_std": 0.15025078132748604, | |
| "rewards/cosine_scaled_reward": -0.10386009886860847, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2604.791748046875, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.24236957728862762, | |
| "kl": 0.1407470703125, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0044, | |
| "reward": 0.1388232481549494, | |
| "reward_std": 0.18972506280988455, | |
| "rewards/cosine_scaled_reward": -0.03663571737706661, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2933.5000610351562, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.22101300954818726, | |
| "kl": 0.1507568359375, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0311, | |
| "reward": 0.08109640050679445, | |
| "reward_std": 0.2291203737258911, | |
| "rewards/cosine_scaled_reward": -0.07463090680539608, | |
| "rewards/format_reward": 0.458333358168602, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 3166.6666870117188, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.16636833548545837, | |
| "kl": 0.144287109375, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0218, | |
| "reward": 0.023469681851565838, | |
| "reward_std": 0.18229573033750057, | |
| "rewards/cosine_scaled_reward": -0.11214667744934559, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2436.9584045410156, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.08238783478736877, | |
| "kl": 0.1317138671875, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0179, | |
| "reward": 0.18019351363182068, | |
| "reward_std": 0.15379005670547485, | |
| "rewards/cosine_scaled_reward": 0.033062346279621124, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2706.3959350585938, | |
| "epoch": 0.552, | |
| "grad_norm": 0.21499210596084595, | |
| "kl": 0.161865234375, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0252, | |
| "reward": 0.15977857820689678, | |
| "reward_std": 0.26758527010679245, | |
| "rewards/cosine_scaled_reward": 0.023867271142080426, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2953.2084350585938, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 0.1139960065484047, | |
| "kl": 0.154541015625, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0171, | |
| "reward": 0.11853593308478594, | |
| "reward_std": 0.12842319905757904, | |
| "rewards/cosine_scaled_reward": 0.04137469828128815, | |
| "rewards/format_reward": 0.375, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 2684.1458740234375, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.11851796507835388, | |
| "kl": 0.1370849609375, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0185, | |
| "reward": 0.09991752170026302, | |
| "reward_std": 0.16255312226712704, | |
| "rewards/cosine_scaled_reward": -0.04461744986474514, | |
| "rewards/format_reward": 0.4791666902601719, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 3032.9583740234375, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.2593059539794922, | |
| "kl": 0.1429443359375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0261, | |
| "reward": 0.16791810281574726, | |
| "reward_std": 0.2469303011894226, | |
| "rewards/cosine_scaled_reward": 0.12874023243784904, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 2827.0209350585938, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.18724219501018524, | |
| "kl": 0.16845703125, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.026, | |
| "reward": 0.10195019654929638, | |
| "reward_std": 0.1869851052761078, | |
| "rewards/cosine_scaled_reward": -0.07432833686470985, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 3153.3751220703125, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.08905702829360962, | |
| "kl": 0.14208984375, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0194, | |
| "reward": -0.05124050285667181, | |
| "reward_std": 0.14969679713249207, | |
| "rewards/cosine_scaled_reward": -0.24744544178247452, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 2122.3333435058594, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.15485012531280518, | |
| "kl": 0.0798492431640625, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0027, | |
| "reward": 0.2875633258372545, | |
| "reward_std": 0.15192867070436478, | |
| "rewards/cosine_scaled_reward": 0.22544683888554573, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 2718.6875610351562, | |
| "epoch": 0.56, | |
| "grad_norm": 0.2762245237827301, | |
| "kl": 0.1544189453125, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0379, | |
| "reward": 0.0434777345508337, | |
| "reward_std": 0.16402246057987213, | |
| "rewards/cosine_scaled_reward": -0.14828698337078094, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2711.041748046875, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.07242274284362793, | |
| "kl": 0.0966796875, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.01, | |
| "reward": 0.1103372648358345, | |
| "reward_std": 0.10681618191301823, | |
| "rewards/cosine_scaled_reward": -0.014811493456363678, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2689.437530517578, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.08541396260261536, | |
| "kl": 0.18218994140625, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0214, | |
| "reward": 0.11053950618952513, | |
| "reward_std": 0.1750403195619583, | |
| "rewards/cosine_scaled_reward": -0.028363492339849472, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2740.6250610351562, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.15947282314300537, | |
| "kl": 0.1612548828125, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0163, | |
| "reward": 0.1346584465354681, | |
| "reward_std": 0.19408046826720238, | |
| "rewards/cosine_scaled_reward": -0.012477652169764042, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 2409.5209045410156, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.11221902072429657, | |
| "kl": 0.1192626953125, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0104, | |
| "reward": 0.0992423826828599, | |
| "reward_std": 0.17658364586532116, | |
| "rewards/cosine_scaled_reward": -0.11829135566949844, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 3080.166748046875, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.1691495031118393, | |
| "kl": 0.1607666015625, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0157, | |
| "reward": 0.14231530111283064, | |
| "reward_std": 0.2310621291399002, | |
| "rewards/cosine_scaled_reward": 0.048745833337306976, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 3018.1458740234375, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.16187846660614014, | |
| "kl": 0.15899658203125, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0172, | |
| "reward": 0.07744702510535717, | |
| "reward_std": 0.25520757399499416, | |
| "rewards/cosine_scaled_reward": -0.05461839772760868, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 2448.1250610351562, | |
| "epoch": 0.568, | |
| "grad_norm": 0.08442384004592896, | |
| "kl": 0.09490966796875, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0084, | |
| "reward": 0.20716202352195978, | |
| "reward_std": 0.1509286481887102, | |
| "rewards/cosine_scaled_reward": 0.11599862575531006, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2926.937530517578, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.12125472724437714, | |
| "kl": 0.14453125, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0152, | |
| "reward": 0.021100876852869987, | |
| "reward_std": 0.18005427531898022, | |
| "rewards/cosine_scaled_reward": -0.1802983209490776, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 3116.791748046875, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.13562169671058655, | |
| "kl": 0.128173828125, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0182, | |
| "reward": 0.03628665814176202, | |
| "reward_std": 0.19937744550406933, | |
| "rewards/cosine_scaled_reward": -0.1179479444399476, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 2521.5209045410156, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.1849382072687149, | |
| "kl": 0.126220703125, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0256, | |
| "reward": 0.12248995155096054, | |
| "reward_std": 0.22993995621800423, | |
| "rewards/cosine_scaled_reward": -0.06692719738930464, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.003466512419981882, | |
| "train_runtime": 9401.5521, | |
| "train_samples_per_second": 2.553, | |
| "train_steps_per_second": 0.053 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |