diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54074 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3860, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 74.75, + "epoch": 0.0025906735751295338, + "grad_norm": 23.654455434537084, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": -0.0, + "reward": 0.7312208712100983, + "reward_std": 0.3606285899877548, + "rewards/format_reward_rec": 0.625, + "rewards/point_reward": 0.41872087121009827, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5, + "epoch": 0.0051813471502590676, + "grad_norm": 116.16598588589672, + "kl": 0.0008373260498046875, + "learning_rate": 9.99740932642487e-07, + "loss": 0.0, + "reward": 1.4451560974121094, + "reward_std": 0.5504895597696304, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9451561868190765, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.5625, + "epoch": 0.007772020725388601, + "grad_norm": 25.86309876662035, + "kl": 0.0008907318115234375, + "learning_rate": 9.99481865284974e-07, + "loss": 0.0, + "reward": 1.7835919260978699, + "reward_std": 0.2786169648170471, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2835918068885803, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 53.625, + "epoch": 0.010362694300518135, + "grad_norm": 10.949192686118256, + "kl": 0.0010128021240234375, + "learning_rate": 9.992227979274612e-07, + "loss": 0.0, + "reward": 2.124643087387085, + "reward_std": 0.518008291721344, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.624643325805664, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.012953367875647668, + "grad_norm": 4.030706300181012, + "kl": 0.0002632737159729004, + "learning_rate": 9.989637305699482e-07, + "loss": 0.0, + "reward": 2.3121966123580933, + "reward_std": 0.2590037997251784, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8121966123580933, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.25, + "epoch": 0.015544041450777202, + "grad_norm": 22.143321089193286, + "kl": 0.0044403076171875, + "learning_rate": 9.987046632124352e-07, + "loss": 0.0, + "reward": 1.305199384689331, + "reward_std": 0.7584892809391022, + "rewards/format_reward_rec": 0.8125, + "rewards/point_reward": 0.8989493250846863, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.125, + "epoch": 0.018134715025906734, + "grad_norm": 58.97024600782642, + "kl": 0.01251220703125, + "learning_rate": 9.984455958549224e-07, + "loss": 0.0, + "reward": 1.372310757637024, + "reward_std": 0.2639093187171966, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 0.9035606682300568, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.1875, + "epoch": 0.02072538860103627, + "grad_norm": 11.096583407786104, + "kl": 0.0140533447265625, + "learning_rate": 9.981865284974092e-07, + "loss": 0.0008, + "reward": 2.3708999156951904, + "reward_std": 0.23891268002080324, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8708999156951904, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 0.023316062176165803, + "grad_norm": 31.15085797685794, + "kl": 0.0009918212890625, + "learning_rate": 9.979274611398964e-07, + "loss": 0.0002, + "reward": 2.277616500854492, + "reward_std": 0.3077518731145119, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7776165008544922, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.25, + "epoch": 0.025906735751295335, + "grad_norm": 11.944424418473368, + "kl": 0.0010204315185546875, + "learning_rate": 9.976683937823834e-07, + "loss": 0.0, + "reward": 1.9366641640663147, + "reward_std": 0.5268431305885315, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.43666410446167, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.02849740932642487, + "grad_norm": 1.9135861015702393, + "kl": 0.00021076202392578125, + "learning_rate": 9.974093264248704e-07, + "loss": -0.0, + "reward": 2.4998377561569214, + "reward_std": 0.00018063169272863888, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999837875366211, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 0.031088082901554404, + "grad_norm": 40.10387354729789, + "kl": 0.0004906654357910156, + "learning_rate": 9.971502590673576e-07, + "loss": 0.0, + "reward": 1.9681448936462402, + "reward_std": 0.37788383662700653, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4681448340415955, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.25, + "epoch": 0.03367875647668394, + "grad_norm": 24.163294004736, + "kl": 0.00337982177734375, + "learning_rate": 9.968911917098446e-07, + "loss": 0.0, + "reward": 1.4076377749443054, + "reward_std": 0.12149995937943459, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9076377749443054, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.4375, + "epoch": 0.03626943005181347, + "grad_norm": 11.714799652216705, + "kl": 0.00435638427734375, + "learning_rate": 9.966321243523316e-07, + "loss": 0.0, + "reward": 1.733027458190918, + "reward_std": 0.5577484518289566, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.2642774283885956, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.038860103626943004, + "grad_norm": 50.83535728252055, + "kl": 0.0007543563842773438, + "learning_rate": 9.963730569948186e-07, + "loss": -0.0001, + "reward": 2.0652049779891968, + "reward_std": 0.275846501095657, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5652050375938416, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.04145077720207254, + "grad_norm": 4.241215850948085, + "kl": 0.0006628036499023438, + "learning_rate": 9.961139896373056e-07, + "loss": -0.0004, + "reward": 1.9996973872184753, + "reward_std": 0.00032231332988885697, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996973872184753, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.04404145077720207, + "grad_norm": 7.52715415012867, + "kl": 0.010478973388671875, + "learning_rate": 9.958549222797928e-07, + "loss": 0.0, + "reward": 1.852938175201416, + "reward_std": 0.20573098585009575, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3529380559921265, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 0.046632124352331605, + "grad_norm": 35.13336574259517, + "kl": 0.00276947021484375, + "learning_rate": 9.955958549222798e-07, + "loss": 0.0, + "reward": 1.7542864084243774, + "reward_std": 0.30061637982726097, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.254286527633667, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.04922279792746114, + "grad_norm": 5.882698374562189, + "kl": 0.0012226104736328125, + "learning_rate": 9.953367875647668e-07, + "loss": 0.0004, + "reward": 2.3103703260421753, + "reward_std": 0.2616851614402549, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.810370147228241, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.05181347150259067, + "grad_norm": 7.897835461185853, + "kl": 0.001800537109375, + "learning_rate": 9.950777202072538e-07, + "loss": 0.0, + "reward": 1.8443708419799805, + "reward_std": 0.25012848898768425, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.344370722770691, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.054404145077720206, + "grad_norm": 89.63936792842128, + "kl": 0.0028400421142578125, + "learning_rate": 9.948186528497408e-07, + "loss": 0.0, + "reward": 2.170512616634369, + "reward_std": 0.27396823112940183, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6705125570297241, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0, + "epoch": 0.05699481865284974, + "grad_norm": 17.490461875054127, + "kl": 0.00286102294921875, + "learning_rate": 9.94559585492228e-07, + "loss": 0.0, + "reward": 1.7859807014465332, + "reward_std": 0.3069179803133011, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.285980761051178, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75, + "epoch": 0.05958549222797927, + "grad_norm": 26.625503930558537, + "kl": 0.0762939453125, + "learning_rate": 9.94300518134715e-07, + "loss": 0.0003, + "reward": 1.976489543914795, + "reward_std": 0.5621593296527863, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.476489543914795, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.06217616580310881, + "grad_norm": 29.10365061874257, + "kl": 0.0052947998046875, + "learning_rate": 9.94041450777202e-07, + "loss": 0.0, + "reward": 1.4800074696540833, + "reward_std": 0.21575853787362576, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9800075888633728, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 0.06476683937823834, + "grad_norm": 9.194639911178484, + "kl": 0.0052337646484375, + "learning_rate": 9.937823834196892e-07, + "loss": 0.0, + "reward": 1.7446673512458801, + "reward_std": 0.4378291368484497, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2446672916412354, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.06735751295336788, + "grad_norm": 23.585911751853935, + "kl": 0.002048492431640625, + "learning_rate": 9.93523316062176e-07, + "loss": 0.0, + "reward": 1.5376622080802917, + "reward_std": 0.1939506810158491, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0376621782779694, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.06994818652849741, + "grad_norm": 42.03244741480483, + "kl": 0.01849365234375, + "learning_rate": 9.932642487046632e-07, + "loss": 0.0001, + "reward": 1.9917437434196472, + "reward_std": 0.3577324002981186, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4917437434196472, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 0.07253886010362694, + "grad_norm": 7.6125781540856075, + "kl": 0.01397705078125, + "learning_rate": 9.930051813471502e-07, + "loss": 0.0003, + "reward": 2.1210156679153442, + "reward_std": 0.2339139638661436, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6210156679153442, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 0.07512953367875648, + "grad_norm": 14.594557537197575, + "kl": 0.0027618408203125, + "learning_rate": 9.927461139896372e-07, + "loss": 0.0, + "reward": 2.1707316040992737, + "reward_std": 0.4666369557380676, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6707317233085632, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.9375, + "epoch": 0.07772020725388601, + "grad_norm": 78.64780253448063, + "kl": 0.01177978515625, + "learning_rate": 9.924870466321244e-07, + "loss": 0.0, + "reward": 2.1214077472686768, + "reward_std": 0.2336481891979929, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6214078664779663, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 0.08031088082901554, + "grad_norm": 15.346949280838764, + "kl": 0.020843505859375, + "learning_rate": 9.922279792746114e-07, + "loss": -0.0, + "reward": 2.105500817298889, + "reward_std": 0.24920060485601425, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.605500876903534, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.125, + "epoch": 0.08290155440414508, + "grad_norm": 28.799379071515, + "kl": 0.013671875, + "learning_rate": 9.919689119170984e-07, + "loss": 0.0001, + "reward": 1.8799359798431396, + "reward_std": 0.20554822124540806, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3799359798431396, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.08549222797927461, + "grad_norm": 30.874748179392196, + "kl": 0.055267333984375, + "learning_rate": 9.917098445595854e-07, + "loss": 0.0001, + "reward": 2.1147689819335938, + "reward_std": 0.23852075126254135, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.614769160747528, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.0, + "epoch": 0.08808290155440414, + "grad_norm": 21.413339376847578, + "kl": 0.037353515625, + "learning_rate": 9.914507772020724e-07, + "loss": -0.0001, + "reward": 2.4374749660491943, + "reward_std": 0.1768060198804733, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374751448631287, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.09067357512953368, + "grad_norm": 11.735518776968457, + "kl": 0.01519775390625, + "learning_rate": 9.911917098445596e-07, + "loss": 0.0001, + "reward": 2.1221890449523926, + "reward_std": 0.4398344159126282, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6221890449523926, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.09326424870466321, + "grad_norm": 19.228717380259603, + "kl": 0.047607421875, + "learning_rate": 9.909326424870466e-07, + "loss": 0.0002, + "reward": 1.8104677200317383, + "reward_std": 0.2632200005464256, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3104676604270935, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 0.09585492227979274, + "grad_norm": 40.120842681212686, + "kl": 0.01470184326171875, + "learning_rate": 9.906735751295336e-07, + "loss": 0.0001, + "reward": 2.0619872212409973, + "reward_std": 0.5265199542045593, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5619871020317078, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.75, + "epoch": 0.09844559585492228, + "grad_norm": 22.11093878199278, + "kl": 0.01458740234375, + "learning_rate": 9.904145077720206e-07, + "loss": 0.0001, + "reward": 2.207720160484314, + "reward_std": 0.5386560559272766, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7077201008796692, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.10103626943005181, + "grad_norm": 14.788404143155422, + "kl": 0.00701904296875, + "learning_rate": 9.901554404145076e-07, + "loss": -0.0007, + "reward": 2.310261368751526, + "reward_std": 0.26141046830311154, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8102614879608154, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.10362694300518134, + "grad_norm": 1.7337144036341858, + "kl": 0.0750732421875, + "learning_rate": 9.898963730569949e-07, + "loss": 0.0008, + "reward": 2.4999849796295166, + "reward_std": 1.147488831065857e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999849200248718, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.375, + "epoch": 0.10621761658031088, + "grad_norm": 7.857568618104513, + "kl": 0.011962890625, + "learning_rate": 9.896373056994819e-07, + "loss": 0.0, + "reward": 2.2498103380203247, + "reward_std": 0.4357842653989792, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7498104572296143, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.10880829015544041, + "grad_norm": 26.42632814430377, + "kl": 0.0374755859375, + "learning_rate": 9.893782383419688e-07, + "loss": 0.0002, + "reward": 1.9356898665428162, + "reward_std": 0.17737593466881663, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4356898367404938, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 0.11139896373056994, + "grad_norm": 19.882775926206442, + "kl": 0.0291748046875, + "learning_rate": 9.89119170984456e-07, + "loss": 0.0001, + "reward": 1.6723498702049255, + "reward_std": 0.4186931699514389, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1723498702049255, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 0.11398963730569948, + "grad_norm": 25.952155665863092, + "kl": 0.10906982421875, + "learning_rate": 9.888601036269428e-07, + "loss": 0.0004, + "reward": 1.4955262541770935, + "reward_std": 0.003271562047302723, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9955263733863831, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.11658031088082901, + "grad_norm": 440.21761847167636, + "kl": 0.021759033203125, + "learning_rate": 9.8860103626943e-07, + "loss": -0.0007, + "reward": 2.3124141693115234, + "reward_std": 0.2588567412449265, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124142289161682, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.0625, + "epoch": 0.11917098445595854, + "grad_norm": 12.536078202267825, + "kl": 0.05230712890625, + "learning_rate": 9.88341968911917e-07, + "loss": 0.0, + "reward": 1.990914225578308, + "reward_std": 0.022514314281579573, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4909144341945648, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.4375, + "epoch": 0.12176165803108809, + "grad_norm": 223.592433529062, + "kl": 0.07373046875, + "learning_rate": 9.88082901554404e-07, + "loss": 0.0003, + "reward": 2.2822933197021484, + "reward_std": 0.4742845743894577, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7822932600975037, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 0.12435233160621761, + "grad_norm": 32.36471243650293, + "kl": 0.011932373046875, + "learning_rate": 9.878238341968913e-07, + "loss": -0.0001, + "reward": 2.3654046058654785, + "reward_std": 0.2500063071856857, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8654048442840576, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.12694300518134716, + "grad_norm": 71.24853896041392, + "kl": 0.011688232421875, + "learning_rate": 9.875647668393783e-07, + "loss": 0.0003, + "reward": 2.1613352298736572, + "reward_std": 0.28758263627048564, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6613351106643677, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.12953367875647667, + "grad_norm": 21.11629785246926, + "kl": 0.0303955078125, + "learning_rate": 9.873056994818653e-07, + "loss": 0.0002, + "reward": 2.2455525398254395, + "reward_std": 0.272227193647268, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7455525994300842, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 0.13212435233160622, + "grad_norm": 18.664911497214902, + "kl": 0.02557373046875, + "learning_rate": 9.870466321243523e-07, + "loss": -0.0003, + "reward": 1.7885136008262634, + "reward_std": 0.2233308469039912, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.3197636902332306, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.13471502590673576, + "grad_norm": 99.49804222052887, + "kl": 0.0521240234375, + "learning_rate": 9.867875647668393e-07, + "loss": 0.0004, + "reward": 1.9976779222488403, + "reward_std": 0.006293868353168364, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497677743434906, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.13730569948186527, + "grad_norm": 17.244449800342725, + "kl": 0.010955810546875, + "learning_rate": 9.865284974093265e-07, + "loss": -0.0, + "reward": 2.4999852180480957, + "reward_std": 1.7544890738463437e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850988388062, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 0.13989637305699482, + "grad_norm": 32.37944592695846, + "kl": 0.027099609375, + "learning_rate": 9.862694300518135e-07, + "loss": -0.0, + "reward": 1.8870325088500977, + "reward_std": 0.1153705872293358, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3870326280593872, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.14248704663212436, + "grad_norm": 19.480430245726204, + "kl": 0.02496337890625, + "learning_rate": 9.860103626943005e-07, + "loss": -0.0, + "reward": 2.4373843669891357, + "reward_std": 0.17695820160315634, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373842477798462, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.14507772020725387, + "grad_norm": 16.989833103971012, + "kl": 0.12823486328125, + "learning_rate": 9.857512953367875e-07, + "loss": 0.0005, + "reward": 1.8051514029502869, + "reward_std": 0.45061106979846954, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3051514625549316, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 0.14766839378238342, + "grad_norm": 31.02175686915555, + "kl": 0.01904296875, + "learning_rate": 9.854922279792745e-07, + "loss": 0.0001, + "reward": 1.969981074333191, + "reward_std": 0.6297044456005096, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4699811935424805, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.15025906735751296, + "grad_norm": 10.235665746059144, + "kl": 0.0089874267578125, + "learning_rate": 9.852331606217617e-07, + "loss": -0.0004, + "reward": 2.0624454021453857, + "reward_std": 0.1768003513025178, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.56244558095932, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.15284974093264247, + "grad_norm": 2.272577397842421, + "kl": 0.0142364501953125, + "learning_rate": 9.849740932642487e-07, + "loss": -0.0006, + "reward": 1.9982001185417175, + "reward_std": 3.705472454385017e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4982001781463623, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.15544041450777202, + "grad_norm": 9.843036544453048, + "kl": 0.0830078125, + "learning_rate": 9.847150259067357e-07, + "loss": -0.0001, + "reward": 2.437375068664551, + "reward_std": 0.17711830667053619, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373750686645508, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.15803108808290156, + "grad_norm": 158.25531896737053, + "kl": 0.0689697265625, + "learning_rate": 9.844559585492227e-07, + "loss": 0.0003, + "reward": 1.7639578580856323, + "reward_std": 0.3546312153339386, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2639578580856323, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.16062176165803108, + "grad_norm": 3.0447356599273157, + "kl": 0.03656005859375, + "learning_rate": 9.841968911917097e-07, + "loss": 0.0004, + "reward": 2.4999831914901733, + "reward_std": 1.480638820794411e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831914901733, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.16321243523316062, + "grad_norm": 4.393337788792737, + "kl": 0.0108184814453125, + "learning_rate": 9.83937823834197e-07, + "loss": 0.0001, + "reward": 2.4374507665634155, + "reward_std": 0.17682846984826028, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374507069587708, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 0.16580310880829016, + "grad_norm": 27.899095141519734, + "kl": 0.066162109375, + "learning_rate": 9.83678756476684e-07, + "loss": 0.001, + "reward": 2.168538749217987, + "reward_std": 0.2762052078041961, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6685386896133423, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.16839378238341968, + "grad_norm": 5.751242569952515, + "kl": 0.03411865234375, + "learning_rate": 9.83419689119171e-07, + "loss": 0.0004, + "reward": 2.499942421913147, + "reward_std": 5.2554929425241426e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999422430992126, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.17098445595854922, + "grad_norm": 477.3372362698823, + "kl": 0.074462890625, + "learning_rate": 9.831606217616581e-07, + "loss": 0.0003, + "reward": 1.779470443725586, + "reward_std": 0.3095453269779682, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.279470443725586, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.17357512953367876, + "grad_norm": 6.576840283054051, + "kl": 0.02679443359375, + "learning_rate": 9.829015544041451e-07, + "loss": 0.0001, + "reward": 1.9997954368591309, + "reward_std": 0.0003149642052449053, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997954964637756, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.17616580310880828, + "grad_norm": 2.520279174644408, + "kl": 0.022705078125, + "learning_rate": 9.826424870466321e-07, + "loss": 0.0001, + "reward": 2.4999767541885376, + "reward_std": 1.9422466266405536e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999768733978271, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.4375, + "epoch": 0.17875647668393782, + "grad_norm": 8.888223161773142, + "kl": 0.0467529296875, + "learning_rate": 9.823834196891191e-07, + "loss": 0.0008, + "reward": 2.3098913431167603, + "reward_std": 0.2624187994371141, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8098912835121155, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.18134715025906736, + "grad_norm": 16.12291142201368, + "kl": 0.18121337890625, + "learning_rate": 9.821243523316061e-07, + "loss": 0.0012, + "reward": 2.37497341632843, + "reward_std": 0.2314961755520244, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.874973475933075, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.18393782383419688, + "grad_norm": 48.154326938669676, + "kl": 0.01806640625, + "learning_rate": 9.818652849740933e-07, + "loss": 0.0009, + "reward": 2.4999828338623047, + "reward_std": 1.1697421086864779e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999827146530151, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.18652849740932642, + "grad_norm": 0.45065452719536897, + "kl": 0.04071807861328125, + "learning_rate": 9.816062176165803e-07, + "loss": 0.0, + "reward": 2.4999879598617554, + "reward_std": 3.2788135513328598e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.18911917098445596, + "grad_norm": 8.04305170993719, + "kl": 0.04095458984375, + "learning_rate": 9.813471502590673e-07, + "loss": -0.0001, + "reward": 2.437377095222473, + "reward_std": 0.17711324256833905, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373770952224731, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.0, + "epoch": 0.19170984455958548, + "grad_norm": 10.295445088160703, + "kl": 0.073486328125, + "learning_rate": 9.810880829015543e-07, + "loss": -0.0001, + "reward": 2.4372644424438477, + "reward_std": 0.1773603390867038, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9372645020484924, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.19430051813471502, + "grad_norm": 45.24804202411124, + "kl": 0.01751708984375, + "learning_rate": 9.808290155440413e-07, + "loss": 0.0002, + "reward": 2.1242417097091675, + "reward_std": 0.23192374470909272, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6242417097091675, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.19689119170984457, + "grad_norm": 7.3772506622853244, + "kl": 0.03271484375, + "learning_rate": 9.805699481865285e-07, + "loss": -0.0005, + "reward": 1.9985507726669312, + "reward_std": 3.881182465192978e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985509514808655, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.19948186528497408, + "grad_norm": 9.351526793083345, + "kl": 0.0634765625, + "learning_rate": 9.803108808290155e-07, + "loss": 0.0001, + "reward": 2.4997767210006714, + "reward_std": 5.034307181972508e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9997768998146057, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 0.20207253886010362, + "grad_norm": 52.616566611429114, + "kl": 0.061767578125, + "learning_rate": 9.800518134715025e-07, + "loss": 0.0005, + "reward": 2.124193847179413, + "reward_std": 0.23194783757702453, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.624193787574768, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.20466321243523317, + "grad_norm": 3.326113527311215, + "kl": 0.064453125, + "learning_rate": 9.797927461139895e-07, + "loss": -0.001, + "reward": 2.4999313354492188, + "reward_std": 3.320495090974873e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999314546585083, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 0.20725388601036268, + "grad_norm": 42.73536840883849, + "kl": 0.0462646484375, + "learning_rate": 9.795336787564765e-07, + "loss": 0.0006, + "reward": 1.9181513786315918, + "reward_std": 0.03343612557546294, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4181513786315918, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.9375, + "epoch": 0.20984455958549222, + "grad_norm": 36.79017488467275, + "kl": 0.06671142578125, + "learning_rate": 9.792746113989637e-07, + "loss": 0.0003, + "reward": 1.7636473178863525, + "reward_std": 0.3487439304590225, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2636472582817078, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.21243523316062177, + "grad_norm": 32.742835637572625, + "kl": 0.04803466796875, + "learning_rate": 9.790155440414507e-07, + "loss": -0.0, + "reward": 1.9988058805465698, + "reward_std": 0.0005170259537408128, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988059401512146, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.21502590673575128, + "grad_norm": 3.1685855541401837, + "kl": 0.04400634765625, + "learning_rate": 9.787564766839377e-07, + "loss": -0.0004, + "reward": 2.499987483024597, + "reward_std": 1.1137093110846763e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987542629242, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.875, + "epoch": 0.21761658031088082, + "grad_norm": 11.364834215005686, + "kl": 0.02508544921875, + "learning_rate": 9.784974093264247e-07, + "loss": -0.0002, + "reward": 2.4373650550842285, + "reward_std": 0.1770545343403569, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373650550842285, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.22020725388601037, + "grad_norm": 15.226201692955948, + "kl": 0.075439453125, + "learning_rate": 9.78238341968912e-07, + "loss": 0.0003, + "reward": 2.4999794960021973, + "reward_std": 1.823106958909193e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979555606842, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.22279792746113988, + "grad_norm": 28.408544891966834, + "kl": 0.1396484375, + "learning_rate": 9.77979274611399e-07, + "loss": 0.0003, + "reward": 2.312247633934021, + "reward_std": 0.2590706118817252, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8122477531433105, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.22538860103626943, + "grad_norm": 4.878505935956497, + "kl": 0.1602783203125, + "learning_rate": 9.77720207253886e-07, + "loss": 0.0009, + "reward": 2.499765992164612, + "reward_std": 2.0208295097745577e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999765932559967, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.375, + "epoch": 0.22797927461139897, + "grad_norm": 97.4598946672193, + "kl": 0.08349609375, + "learning_rate": 9.77461139896373e-07, + "loss": 0.0003, + "reward": 2.046730399131775, + "reward_std": 0.5622504651546478, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5467304587364197, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.23056994818652848, + "grad_norm": 128.12050745642023, + "kl": 0.15771484375, + "learning_rate": 9.772020725388602e-07, + "loss": 0.0007, + "reward": 1.9192970991134644, + "reward_std": 0.016997356389765628, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4192971885204315, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.23316062176165803, + "grad_norm": 7.872235002892737, + "kl": 0.076171875, + "learning_rate": 9.769430051813472e-07, + "loss": 0.0007, + "reward": 1.4971758127212524, + "reward_std": 0.0002577869486231066, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.99717578291893, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.23575129533678757, + "grad_norm": 4.245160239550034, + "kl": 0.04241943359375, + "learning_rate": 9.766839378238342e-07, + "loss": -0.0008, + "reward": 2.4997618198394775, + "reward_std": 8.034832262637792e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999761939048767, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.23834196891191708, + "grad_norm": 19.856309903921574, + "kl": 0.140869140625, + "learning_rate": 9.764248704663212e-07, + "loss": 0.0004, + "reward": 2.4374531507492065, + "reward_std": 0.1768775479206397, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374531507492065, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.24093264248704663, + "grad_norm": 1.1579663764472719, + "kl": 0.0400390625, + "learning_rate": 9.761658031088082e-07, + "loss": -0.0004, + "reward": 2.4999840259552, + "reward_std": 5.818617864861153e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999842047691345, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.24352331606217617, + "grad_norm": 1.204567997031168, + "kl": 0.088134765625, + "learning_rate": 9.759067357512954e-07, + "loss": 0.0006, + "reward": 2.499993324279785, + "reward_std": 5.7656898206914775e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.24611398963730569, + "grad_norm": 21.05544770088383, + "kl": 0.126983642578125, + "learning_rate": 9.756476683937824e-07, + "loss": 0.0003, + "reward": 1.9998608827590942, + "reward_std": 4.4506206535288584e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998611509799957, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.24870466321243523, + "grad_norm": 32.751306013317446, + "kl": 0.10589599609375, + "learning_rate": 9.753886010362694e-07, + "loss": 0.0004, + "reward": 2.1240326166152954, + "reward_std": 0.5003382712602615, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6240326166152954, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.375, + "epoch": 0.25129533678756477, + "grad_norm": 79.12597534103841, + "kl": 0.04833984375, + "learning_rate": 9.751295336787564e-07, + "loss": 0.0005, + "reward": 1.9821822047233582, + "reward_std": 0.003102937228504743, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4821821451187134, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.625, + "epoch": 0.2538860103626943, + "grad_norm": 27.473384832506696, + "kl": 0.093017578125, + "learning_rate": 9.748704663212434e-07, + "loss": 0.0004, + "reward": 1.7839117050170898, + "reward_std": 0.462258443236351, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2839117050170898, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.25647668393782386, + "grad_norm": 0.5434623478158119, + "kl": 0.028076171875, + "learning_rate": 9.746113989637306e-07, + "loss": 0.0002, + "reward": 2.4999914169311523, + "reward_std": 5.65391962936701e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 0.25906735751295334, + "grad_norm": 38.41368275142002, + "kl": 0.05322265625, + "learning_rate": 9.743523316062176e-07, + "loss": -0.0002, + "reward": 1.8973362445831299, + "reward_std": 0.24990517766491394, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.397336184978485, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.2616580310880829, + "grad_norm": 7.520692088913739, + "kl": 0.037841796875, + "learning_rate": 9.740932642487046e-07, + "loss": -0.0002, + "reward": 2.4999399185180664, + "reward_std": 4.300871478335466e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999399185180664, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.26424870466321243, + "grad_norm": 16.04694790274661, + "kl": 0.03875732421875, + "learning_rate": 9.738341968911916e-07, + "loss": -0.0003, + "reward": 2.3124669790267944, + "reward_std": 0.25882471234763216, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.812467098236084, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.3125, + "epoch": 0.266839378238342, + "grad_norm": 45.86080375277014, + "kl": 0.0531005859375, + "learning_rate": 9.735751295336788e-07, + "loss": 0.0002, + "reward": 1.9918814897537231, + "reward_std": 0.005455598112348525, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4918816089630127, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.2694300518134715, + "grad_norm": 29.67608753082564, + "kl": 0.02471923828125, + "learning_rate": 9.733160621761658e-07, + "loss": -0.0008, + "reward": 2.437479257583618, + "reward_std": 0.1768319597942991, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374792575836182, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.27202072538860106, + "grad_norm": 27.205285721754063, + "kl": 0.0614013671875, + "learning_rate": 9.730569948186528e-07, + "loss": 0.0002, + "reward": 1.999179720878601, + "reward_std": 6.279310655088466e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499179720878601, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.27461139896373055, + "grad_norm": 23.54038740688223, + "kl": 0.0628662109375, + "learning_rate": 9.727979274611398e-07, + "loss": 0.0001, + "reward": 2.499894142150879, + "reward_std": 0.00012795658221875783, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999894142150879, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 0.2772020725388601, + "grad_norm": 26.079557447642664, + "kl": 0.04766845703125, + "learning_rate": 9.725388601036268e-07, + "loss": -0.0001, + "reward": 2.499938726425171, + "reward_std": 5.921874435443897e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999386668205261, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.27979274611398963, + "grad_norm": 3.2653955008843685, + "kl": 0.042724609375, + "learning_rate": 9.72279792746114e-07, + "loss": 0.0006, + "reward": 2.4999871253967285, + "reward_std": 1.7469743397668935e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999871253967285, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.2823834196891192, + "grad_norm": 0.77165706731983, + "kl": 0.0784912109375, + "learning_rate": 9.72020725388601e-07, + "loss": 0.0, + "reward": 2.4999911785125732, + "reward_std": 4.967317295268003e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.2849740932642487, + "grad_norm": 32.97546419525307, + "kl": 0.04766845703125, + "learning_rate": 9.71761658031088e-07, + "loss": -0.0, + "reward": 2.4998066425323486, + "reward_std": 0.0001222240025526844, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998065829277039, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.28756476683937826, + "grad_norm": 26.18065551862498, + "kl": 0.4599609375, + "learning_rate": 9.71502590673575e-07, + "loss": 0.0018, + "reward": 1.5519845485687256, + "reward_std": 0.5864746570587158, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.051984578371048, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 0.29015544041450775, + "grad_norm": 11.987697989962049, + "kl": 0.0703125, + "learning_rate": 9.712435233160622e-07, + "loss": 0.0003, + "reward": 2.3124321699142456, + "reward_std": 0.4082608222961426, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124321699142456, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.2927461139896373, + "grad_norm": 14.93133180939127, + "kl": 0.076171875, + "learning_rate": 9.709844559585492e-07, + "loss": 0.0003, + "reward": 1.4983919262886047, + "reward_std": 0.0004074995667906478, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9983920156955719, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.29533678756476683, + "grad_norm": 17.387245286485452, + "kl": 0.1103515625, + "learning_rate": 9.707253886010362e-07, + "loss": 0.0003, + "reward": 1.9763047695159912, + "reward_std": 0.0003290466993348673, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.476304829120636, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.2979274611398964, + "grad_norm": 1.0608114568618325, + "kl": 0.0572509765625, + "learning_rate": 9.704663212435232e-07, + "loss": -0.0001, + "reward": 2.499966025352478, + "reward_std": 7.64698111765938e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999966025352478, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.3005181347150259, + "grad_norm": 3.8357029735696653, + "kl": 0.0596923828125, + "learning_rate": 9.702072538860102e-07, + "loss": -0.0006, + "reward": 2.499981641769409, + "reward_std": 1.4318110288513708e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818205833435, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.30310880829015546, + "grad_norm": 28.50617927919992, + "kl": 0.11328125, + "learning_rate": 9.699481865284974e-07, + "loss": 0.0009, + "reward": 2.0620652437210083, + "reward_std": 0.17692725664164755, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5620651841163635, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.30569948186528495, + "grad_norm": 58.630081061738984, + "kl": 0.249908447265625, + "learning_rate": 9.696891191709844e-07, + "loss": 0.0011, + "reward": 2.437382221221924, + "reward_std": 0.17709060247125308, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373822808265686, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.3082901554404145, + "grad_norm": 22.45252294313057, + "kl": 0.0509033203125, + "learning_rate": 9.694300518134714e-07, + "loss": 0.0005, + "reward": 2.374982476234436, + "reward_std": 0.23147946750691517, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749824166297913, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.31088082901554404, + "grad_norm": 14.221096544137483, + "kl": 0.07177734375, + "learning_rate": 9.691709844559584e-07, + "loss": 0.0003, + "reward": 2.249211013317108, + "reward_std": 0.2680665226096153, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749211072921753, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.3134715025906736, + "grad_norm": 0.4332777968107589, + "kl": 0.0465087890625, + "learning_rate": 9.689119170984456e-07, + "loss": -0.0001, + "reward": 2.4999940395355225, + "reward_std": 3.8427937170126825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.3160621761658031, + "grad_norm": 4.081538918236511, + "kl": 0.06640625, + "learning_rate": 9.686528497409326e-07, + "loss": 0.0004, + "reward": 2.4999542236328125, + "reward_std": 3.70945485883567e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999954342842102, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.31865284974093266, + "grad_norm": 3.057592866617649, + "kl": 0.16015625, + "learning_rate": 9.683937823834196e-07, + "loss": 0.0005, + "reward": 2.499969244003296, + "reward_std": 2.595815294625936e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999693632125854, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.375, + "epoch": 0.32124352331606215, + "grad_norm": 30.824703542855783, + "kl": 0.062255859375, + "learning_rate": 9.681347150259066e-07, + "loss": 0.0002, + "reward": 1.3506267666816711, + "reward_std": 0.17951567331328988, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8506268262863159, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.8125, + "epoch": 0.3238341968911917, + "grad_norm": 1.7148905769763336, + "kl": 0.283203125, + "learning_rate": 9.678756476683936e-07, + "loss": 0.0018, + "reward": 2.4999903440475464, + "reward_std": 4.849738161283312e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.32642487046632124, + "grad_norm": 2.8495573272255075, + "kl": 0.068359375, + "learning_rate": 9.676165803108809e-07, + "loss": 0.0008, + "reward": 2.499986410140991, + "reward_std": 2.3382104700431228e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999862909317017, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.0, + "epoch": 0.3290155440414508, + "grad_norm": 0.18756382569230234, + "kl": 0.125, + "learning_rate": 9.673575129533679e-07, + "loss": 0.0001, + "reward": 2.4999979734420776, + "reward_std": 8.522496557361592e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.3316062176165803, + "grad_norm": 778.9687555253842, + "kl": 0.0472412109375, + "learning_rate": 9.670984455958549e-07, + "loss": 0.0003, + "reward": 2.4368550777435303, + "reward_std": 0.1785650884781944, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9368551969528198, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.33419689119170987, + "grad_norm": 7.71128449770404, + "kl": 0.063232421875, + "learning_rate": 9.668393782383419e-07, + "loss": 0.0005, + "reward": 1.6834460496902466, + "reward_std": 0.00025988063225668157, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1834460347890854, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.33678756476683935, + "grad_norm": 0.12321829598157957, + "kl": 0.1025390625, + "learning_rate": 9.665803108808289e-07, + "loss": -0.0003, + "reward": 2.4999972581863403, + "reward_std": 1.781640165177123e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.9375, + "epoch": 0.3393782383419689, + "grad_norm": 20.68392232619108, + "kl": 0.2828369140625, + "learning_rate": 9.66321243523316e-07, + "loss": 0.0016, + "reward": 2.1773712038993835, + "reward_std": 0.2682817817618002, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6773712635040283, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.34196891191709844, + "grad_norm": 6.78757121716985, + "kl": 0.0501708984375, + "learning_rate": 9.66062176165803e-07, + "loss": 0.0007, + "reward": 2.4999715089797974, + "reward_std": 1.4924235529178986e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999715089797974, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.75, + "epoch": 0.344559585492228, + "grad_norm": 0.6309569083353512, + "kl": 0.11572265625, + "learning_rate": 9.6580310880829e-07, + "loss": 0.0011, + "reward": 2.4999911785125732, + "reward_std": 8.16447891338612e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.0, + "epoch": 0.3471502590673575, + "grad_norm": 17.051855534402925, + "kl": 0.278076171875, + "learning_rate": 9.655440414507773e-07, + "loss": 0.0011, + "reward": 1.4844253063201904, + "reward_std": 0.0008330822111020098, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9844253659248352, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.34974093264248707, + "grad_norm": 0.8087106122070162, + "kl": 0.0784912109375, + "learning_rate": 9.652849740932643e-07, + "loss": 0.0002, + "reward": 2.499993324279785, + "reward_std": 2.6888594675256172e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.35233160621761656, + "grad_norm": 2.00244009573784, + "kl": 0.050537109375, + "learning_rate": 9.650259067357513e-07, + "loss": 0.0009, + "reward": 2.499998092651367, + "reward_std": 1.2544903142952535e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.875, + "epoch": 0.3549222797927461, + "grad_norm": 86.40512913043219, + "kl": 0.15673828125, + "learning_rate": 9.647668393782383e-07, + "loss": 0.0006, + "reward": 1.9180601835250854, + "reward_std": 0.23149480670690536, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.418060064315796, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.25, + "epoch": 0.35751295336787564, + "grad_norm": 1.0172748127032223, + "kl": 0.08349609375, + "learning_rate": 9.645077720207253e-07, + "loss": 0.0001, + "reward": 2.4999914169311523, + "reward_std": 8.198011016702367e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.3601036269430052, + "grad_norm": 9.077370178758427, + "kl": 0.0606689453125, + "learning_rate": 9.642487046632125e-07, + "loss": 0.0011, + "reward": 2.4999910593032837, + "reward_std": 1.1874815385226611e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.25, + "epoch": 0.3626943005181347, + "grad_norm": 13.330073377494674, + "kl": 0.075439453125, + "learning_rate": 9.639896373056995e-07, + "loss": -0.0001, + "reward": 1.8144383430480957, + "reward_std": 0.03691703302320093, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.314438372850418, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.36528497409326427, + "grad_norm": 1.4454602214883316, + "kl": 0.104736328125, + "learning_rate": 9.637305699481865e-07, + "loss": 0.0006, + "reward": 1.9923112392425537, + "reward_std": 4.800811608163258e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4923112392425537, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.36787564766839376, + "grad_norm": 3.462416950574483, + "kl": 0.0382080078125, + "learning_rate": 9.634715025906735e-07, + "loss": 0.0011, + "reward": 2.499909281730652, + "reward_std": 2.855958700820338e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999091625213623, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.3704663212435233, + "grad_norm": 60.28321024389314, + "kl": 0.236328125, + "learning_rate": 9.632124352331605e-07, + "loss": 0.0009, + "reward": 2.290057420730591, + "reward_std": 0.47163158655166626, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7900574207305908, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.37305699481865284, + "grad_norm": 54.576736762263884, + "kl": 0.15625, + "learning_rate": 9.629533678756477e-07, + "loss": 0.0006, + "reward": 1.8109760880470276, + "reward_std": 0.4447221904993057, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3109761476516724, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.3756476683937824, + "grad_norm": 44.47467040389592, + "kl": 0.0899658203125, + "learning_rate": 9.626943005181347e-07, + "loss": 0.0004, + "reward": 1.9187846779823303, + "reward_std": 0.22809253074228764, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4187846779823303, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.37823834196891193, + "grad_norm": 96.56119524359266, + "kl": 0.1026611328125, + "learning_rate": 9.624352331606217e-07, + "loss": 0.0003, + "reward": 1.6752909421920776, + "reward_std": 0.26892834760656115, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1752910017967224, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 0.38082901554404147, + "grad_norm": 1.301215497093169, + "kl": 0.100830078125, + "learning_rate": 9.621761658031087e-07, + "loss": 0.0004, + "reward": 1.4999969005584717, + "reward_std": 2.882935859815916e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999970197677612, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.38341968911917096, + "grad_norm": 0.1716691461971046, + "kl": 0.079345703125, + "learning_rate": 9.619170984455957e-07, + "loss": 0.0008, + "reward": 2.499995470046997, + "reward_std": 1.2729808815947763e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.3860103626943005, + "grad_norm": 4.61720601482355, + "kl": 0.091064453125, + "learning_rate": 9.61658031088083e-07, + "loss": -0.0002, + "reward": 2.499967575073242, + "reward_std": 1.9026708287128713e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999967634677887, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.38860103626943004, + "grad_norm": 2.7234764612443025, + "kl": 0.106689453125, + "learning_rate": 9.6139896373057e-07, + "loss": 0.0004, + "reward": 1.99959796667099, + "reward_std": 3.200235278200125e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995981454849243, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.3911917098445596, + "grad_norm": 27.998260941191134, + "kl": 0.22216796875, + "learning_rate": 9.61139896373057e-07, + "loss": 0.0014, + "reward": 2.4323630332946777, + "reward_std": 0.19104300345202319, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.932362973690033, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 0.39378238341968913, + "grad_norm": 25.226219173819224, + "kl": 0.0477294921875, + "learning_rate": 9.608808290155441e-07, + "loss": 0.0005, + "reward": 2.0568660497665405, + "reward_std": 0.17905486353720335, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5568661093711853, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.3963730569948187, + "grad_norm": 0.3675034917975679, + "kl": 0.1015625, + "learning_rate": 9.60621761658031e-07, + "loss": 0.001, + "reward": 2.499988555908203, + "reward_std": 4.78217737054365e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999884366989136, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.39896373056994816, + "grad_norm": 2.1595096987913247, + "kl": 0.1748046875, + "learning_rate": 9.603626943005181e-07, + "loss": 0.0004, + "reward": 2.4999899864196777, + "reward_std": 9.273415173538524e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989926815033, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.4015544041450777, + "grad_norm": 9.850022537964898, + "kl": 0.07366943359375, + "learning_rate": 9.601036269430051e-07, + "loss": -0.0003, + "reward": 2.4999306201934814, + "reward_std": 6.255799416976515e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999307990074158, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.40414507772020725, + "grad_norm": 2.86835444425271, + "kl": 0.060791015625, + "learning_rate": 9.598445595854921e-07, + "loss": 0.0004, + "reward": 1.9994065761566162, + "reward_std": 1.8171605915995315e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994065165519714, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.4067357512953368, + "grad_norm": 38.0932528609861, + "kl": 0.1790771484375, + "learning_rate": 9.595854922279793e-07, + "loss": 0.0001, + "reward": 2.3749828338623047, + "reward_std": 0.23148394337079026, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749828338623047, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.40932642487046633, + "grad_norm": 1.357234341081675, + "kl": 0.0810546875, + "learning_rate": 9.593264248704663e-07, + "loss": 0.0003, + "reward": 2.499979019165039, + "reward_std": 7.719876748524257e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999790787696838, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.4119170984455959, + "grad_norm": 27.267310751150433, + "kl": 0.119873046875, + "learning_rate": 9.590673575129533e-07, + "loss": 0.0006, + "reward": 1.298931360244751, + "reward_std": 0.052430289819767495, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7989313006401062, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.41450777202072536, + "grad_norm": 88.60653699281134, + "kl": 0.059326171875, + "learning_rate": 9.588082901554403e-07, + "loss": 0.0002, + "reward": 1.8697761297225952, + "reward_std": 0.029779866188619053, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3697762191295624, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.4170984455958549, + "grad_norm": 131.16604901041597, + "kl": 0.28564453125, + "learning_rate": 9.585492227979273e-07, + "loss": 0.001, + "reward": 1.811118245124817, + "reward_std": 0.259682998766948, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3111181557178497, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.41968911917098445, + "grad_norm": 16.611391592845404, + "kl": 0.059326171875, + "learning_rate": 9.582901554404145e-07, + "loss": 0.0002, + "reward": 1.9990431666374207, + "reward_std": 0.0003346551386584906, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499043047428131, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.422279792746114, + "grad_norm": 3.8076856023797783, + "kl": 0.0833740234375, + "learning_rate": 9.580310880829015e-07, + "loss": 0.001, + "reward": 1.999815583229065, + "reward_std": 2.8820414001984318e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998155236244202, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.42487046632124353, + "grad_norm": 0.22775821733372068, + "kl": 0.0638427734375, + "learning_rate": 9.577720207253885e-07, + "loss": -0.0009, + "reward": 2.499998092651367, + "reward_std": 7.127880792268115e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.4274611398963731, + "grad_norm": 43.12820905419924, + "kl": 0.165771484375, + "learning_rate": 9.575129533678755e-07, + "loss": 0.0007, + "reward": 1.300038456916809, + "reward_std": 0.3335072639383725, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8000384271144867, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.43005181347150256, + "grad_norm": 80.73777443500676, + "kl": 0.1455078125, + "learning_rate": 9.572538860103625e-07, + "loss": 0.0005, + "reward": 2.1822879910469055, + "reward_std": 0.26322523210546933, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6822880506515503, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4326424870466321, + "grad_norm": 1.3935043950254644, + "kl": 0.097412109375, + "learning_rate": 9.569948186528497e-07, + "loss": 0.0001, + "reward": 2.499988317489624, + "reward_std": 1.0444003009979497e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988317489624, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 0.43523316062176165, + "grad_norm": 34.34007157339578, + "kl": 0.066162109375, + "learning_rate": 9.567357512953367e-07, + "loss": -0.0005, + "reward": 2.2498570680618286, + "reward_std": 0.2673697262735004, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7498571276664734, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.375, + "epoch": 0.4378238341968912, + "grad_norm": 94.54455842469612, + "kl": 0.1337890625, + "learning_rate": 9.564766839378237e-07, + "loss": 0.0005, + "reward": 1.9985234141349792, + "reward_std": 0.48838643729686737, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498523473739624, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.44041450777202074, + "grad_norm": 6.9718706237515224, + "kl": 0.048095703125, + "learning_rate": 9.56217616580311e-07, + "loss": 0.0003, + "reward": 2.4999825954437256, + "reward_std": 1.2279092800326907e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999827146530151, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4430051813471503, + "grad_norm": 0.5937469623650423, + "kl": 0.086669921875, + "learning_rate": 9.559585492227977e-07, + "loss": 0.0018, + "reward": 2.4999914169311523, + "reward_std": 5.022113668928796e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.44559585492227977, + "grad_norm": 2.9376354932508058, + "kl": 0.105224609375, + "learning_rate": 9.55699481865285e-07, + "loss": 0.0016, + "reward": 2.4999964237213135, + "reward_std": 3.2443115287605906e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 0.4481865284974093, + "grad_norm": 193.67818409560059, + "kl": 0.077880859375, + "learning_rate": 9.55440414507772e-07, + "loss": 0.0009, + "reward": 2.0455445051193237, + "reward_std": 0.1841201360700211, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5455445051193237, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.45077720207253885, + "grad_norm": 6.210457951400512, + "kl": 0.078857421875, + "learning_rate": 9.55181347150259e-07, + "loss": -0.0004, + "reward": 1.9924424290657043, + "reward_std": 6.803595104543092e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.492442548274994, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.4533678756476684, + "grad_norm": 169.98982672715994, + "kl": 0.0977783203125, + "learning_rate": 9.549222797927462e-07, + "loss": 0.0001, + "reward": 2.49991774559021, + "reward_std": 6.647250665992033e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999178647994995, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.45595854922279794, + "grad_norm": 39.87688544866592, + "kl": 0.064453125, + "learning_rate": 9.546632124352332e-07, + "loss": 0.0004, + "reward": 2.436307191848755, + "reward_std": 0.17962476573848107, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9363073706626892, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4585492227979275, + "grad_norm": 9.503614910000897, + "kl": 0.03948974609375, + "learning_rate": 9.544041450777202e-07, + "loss": -0.0003, + "reward": 1.998598575592041, + "reward_std": 0.00019181770039722323, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498598724603653, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.46113989637305697, + "grad_norm": 5.92166567877022, + "kl": 0.0601806640625, + "learning_rate": 9.541450777202072e-07, + "loss": 0.0006, + "reward": 2.4999635219573975, + "reward_std": 3.4532062727521406e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999634623527527, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 0.4637305699481865, + "grad_norm": 62.87930005310863, + "kl": 0.109375, + "learning_rate": 9.538860103626942e-07, + "loss": 0.0003, + "reward": 2.1871920824050903, + "reward_std": 0.2590183729181206, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687192142009735, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.46632124352331605, + "grad_norm": 5.077639078676416, + "kl": 0.1060791015625, + "learning_rate": 9.536269430051813e-07, + "loss": -0.0004, + "reward": 2.4999769926071167, + "reward_std": 4.302172237657942e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999770522117615, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4689119170984456, + "grad_norm": 12.874689131272246, + "kl": 0.0732421875, + "learning_rate": 9.533678756476683e-07, + "loss": 0.0004, + "reward": 2.374955177307129, + "reward_std": 0.23149113605359162, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749550580978394, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.47150259067357514, + "grad_norm": 3.2785029112945425, + "kl": 0.068359375, + "learning_rate": 9.531088082901554e-07, + "loss": 0.0007, + "reward": 2.499966263771057, + "reward_std": 1.5721283034508815e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999662041664124, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.4740932642487047, + "grad_norm": 11.298488720107546, + "kl": 0.024169921875, + "learning_rate": 9.528497409326425e-07, + "loss": 0.0001, + "reward": 1.4906189441680908, + "reward_std": 0.008024983624636661, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9906189739704132, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.47668393782383417, + "grad_norm": 1.0958165073828254, + "kl": 0.17236328125, + "learning_rate": 9.525906735751295e-07, + "loss": 0.0004, + "reward": 2.4999895095825195, + "reward_std": 1.201426994157373e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895691871643, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4792746113989637, + "grad_norm": 39.416709054286926, + "kl": 0.076904296875, + "learning_rate": 9.523316062176166e-07, + "loss": 0.0007, + "reward": 2.37497878074646, + "reward_std": 0.23148569122906792, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.87497878074646, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.48186528497409326, + "grad_norm": 0.02431102659126464, + "kl": 0.0242919921875, + "learning_rate": 9.520725388601036e-07, + "loss": 0.0007, + "reward": 2.499998927116394, + "reward_std": 5.839314098921022e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.4844559585492228, + "grad_norm": 52.11459878771558, + "kl": 0.040985107421875, + "learning_rate": 9.518134715025906e-07, + "loss": 0.0005, + "reward": 2.2339224815368652, + "reward_std": 0.3672207622189205, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.73392254114151, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.48704663212435234, + "grad_norm": 63.02844422973685, + "kl": 0.06390380859375, + "learning_rate": 9.515544041450777e-07, + "loss": -0.0002, + "reward": 1.997809886932373, + "reward_std": 0.0007182962772276369, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978099763393402, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.4896373056994819, + "grad_norm": 4.561146785274667, + "kl": 0.031494140625, + "learning_rate": 9.512953367875647e-07, + "loss": 0.0, + "reward": 2.499797224998474, + "reward_std": 7.290885696420446e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9997972249984741, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 0.49222797927461137, + "grad_norm": 15.794183433983163, + "kl": 0.107421875, + "learning_rate": 9.510362694300518e-07, + "loss": -0.0004, + "reward": 1.775613248348236, + "reward_std": 0.07763574089835856, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2756133675575256, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.4948186528497409, + "grad_norm": 53.794635804377414, + "kl": 0.0758056640625, + "learning_rate": 9.507772020725389e-07, + "loss": 0.0003, + "reward": 2.4373844861984253, + "reward_std": 0.17706387546706992, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373846054077148, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.49740932642487046, + "grad_norm": 13.339812350333533, + "kl": 0.156494140625, + "learning_rate": 9.505181347150258e-07, + "loss": 0.0004, + "reward": 2.4374749660491943, + "reward_std": 0.17682266998508567, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374749064445496, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5, + "grad_norm": 7.709688807481148, + "kl": 0.05029296875, + "learning_rate": 9.502590673575129e-07, + "loss": 0.0011, + "reward": 2.4999923706054688, + "reward_std": 8.9534473772801e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5025906735751295, + "grad_norm": 12.292379892079113, + "kl": 0.03564453125, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0006, + "reward": 2.4999698400497437, + "reward_std": 3.4964389669767115e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999698400497437, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.3125, + "epoch": 0.5051813471502591, + "grad_norm": 30.23930669613842, + "kl": 0.2294921875, + "learning_rate": 9.49740932642487e-07, + "loss": 0.0012, + "reward": 1.9902034997940063, + "reward_std": 0.007483473378670169, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4902033805847168, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5077720207253886, + "grad_norm": 23.608654428483174, + "kl": 0.06787109375, + "learning_rate": 9.494818652849741e-07, + "loss": 0.0003, + "reward": 2.1249433755874634, + "reward_std": 0.23149006214407564, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6249432563781738, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5103626943005182, + "grad_norm": 1.3304126719372689, + "kl": 0.033935546875, + "learning_rate": 9.492227979274611e-07, + "loss": 0.0003, + "reward": 2.4999794960021973, + "reward_std": 4.905148443867802e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979555606842, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5129533678756477, + "grad_norm": 3.506059228911601, + "kl": 0.132080078125, + "learning_rate": 9.489637305699481e-07, + "loss": 0.0, + "reward": 2.499938488006592, + "reward_std": 2.3219374085670097e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999384880065918, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 0.5155440414507773, + "grad_norm": 40.38588040361022, + "kl": 0.0552978515625, + "learning_rate": 9.487046632124351e-07, + "loss": -0.0005, + "reward": 2.0623568296432495, + "reward_std": 0.41730798021671944, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5623570084571838, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.5181347150259067, + "grad_norm": 14.54155105301667, + "kl": 0.1279296875, + "learning_rate": 9.484455958549222e-07, + "loss": 0.0011, + "reward": 2.0565385818481445, + "reward_std": 0.17906291717235945, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5565386414527893, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.5207253886010362, + "grad_norm": 3.00118081433762, + "kl": 0.05029296875, + "learning_rate": 9.481865284974093e-07, + "loss": 0.0003, + "reward": 2.4999457597732544, + "reward_std": 2.9745723395535606e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999459385871887, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5233160621761658, + "grad_norm": 116.19563637018368, + "kl": 0.153564453125, + "learning_rate": 9.479274611398963e-07, + "loss": 0.0, + "reward": 2.12401682138443, + "reward_std": 0.23206061124801636, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.62401682138443, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.5259067357512953, + "grad_norm": 0.7202451802861973, + "kl": 0.0352783203125, + "learning_rate": 9.476683937823834e-07, + "loss": -0.0006, + "reward": 2.4999974966049194, + "reward_std": 3.306661483293283e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5284974093264249, + "grad_norm": 7.46545829789412, + "kl": 0.0384521484375, + "learning_rate": 9.474093264248703e-07, + "loss": -0.0, + "reward": 2.499893307685852, + "reward_std": 8.678888480062596e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998934268951416, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.5310880829015544, + "grad_norm": 22.99799119427596, + "kl": 0.0867919921875, + "learning_rate": 9.471502590673574e-07, + "loss": 0.0008, + "reward": 2.030556797981262, + "reward_std": 0.4455295194460689, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.561806857585907, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.533678756476684, + "grad_norm": 62.826438018148934, + "kl": 0.134765625, + "learning_rate": 9.468911917098445e-07, + "loss": 0.0011, + "reward": 2.434236764907837, + "reward_std": 0.1859855586524759, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9342365264892578, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5362694300518135, + "grad_norm": 33.98111377019834, + "kl": 0.076416015625, + "learning_rate": 9.466321243523315e-07, + "loss": 0.0006, + "reward": 1.9978904724121094, + "reward_std": 0.00040473006356478436, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978904128074646, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.538860103626943, + "grad_norm": 3.9583399192142825, + "kl": 0.0478515625, + "learning_rate": 9.463730569948186e-07, + "loss": 0.0, + "reward": 1.9968501925468445, + "reward_std": 6.582876631000545e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4968501925468445, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5414507772020726, + "grad_norm": 0.2445416189198319, + "kl": 0.0552978515625, + "learning_rate": 9.461139896373057e-07, + "loss": 0.0002, + "reward": 2.499974489212036, + "reward_std": 2.6608826431129273e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999743700027466, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.5440414507772021, + "grad_norm": 13.848213173762856, + "kl": 0.1322021484375, + "learning_rate": 9.458549222797926e-07, + "loss": 0.0004, + "reward": 2.097993493080139, + "reward_std": 0.24885972872152706, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5979933738708496, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5466321243523317, + "grad_norm": 28.14177244645597, + "kl": 0.07958984375, + "learning_rate": 9.455958549222797e-07, + "loss": 0.0011, + "reward": 1.9448403716087341, + "reward_std": 0.020275432882044697, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4448402523994446, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5492227979274611, + "grad_norm": 79.27496052370239, + "kl": 0.111083984375, + "learning_rate": 9.453367875647667e-07, + "loss": 0.0006, + "reward": 1.8673059940338135, + "reward_std": 0.09618017942284496, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3673060834407806, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.5518134715025906, + "grad_norm": 77.89653881866029, + "kl": 0.0592041015625, + "learning_rate": 9.450777202072539e-07, + "loss": -0.0002, + "reward": 2.4999282360076904, + "reward_std": 3.5500268495525233e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999928057193756, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5544041450777202, + "grad_norm": 3.2375878032635215, + "kl": 0.0303955078125, + "learning_rate": 9.44818652849741e-07, + "loss": 0.0002, + "reward": 2.4998496770858765, + "reward_std": 1.796979404389276e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999849796295166, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 0.5569948186528497, + "grad_norm": 103.02440756730583, + "kl": 0.1485595703125, + "learning_rate": 9.44559585492228e-07, + "loss": -0.0006, + "reward": 1.681231051683426, + "reward_std": 0.11251461816493702, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1812311708927155, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.5595854922279793, + "grad_norm": 0.8614838950564742, + "kl": 0.16357421875, + "learning_rate": 9.44300518134715e-07, + "loss": 0.0009, + "reward": 2.499993681907654, + "reward_std": 5.377873947054468e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5621761658031088, + "grad_norm": 26.26413385590952, + "kl": 0.067626953125, + "learning_rate": 9.44041450777202e-07, + "loss": 0.0003, + "reward": 1.4598759412765503, + "reward_std": 0.011151136626722291, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9598759412765503, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5647668393782384, + "grad_norm": 0.4612586134457508, + "kl": 0.12109375, + "learning_rate": 9.437823834196891e-07, + "loss": 0.0002, + "reward": 2.499996542930603, + "reward_std": 1.909071158934239e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.5673575129533679, + "grad_norm": 46.14937279464682, + "kl": 0.1630859375, + "learning_rate": 9.435233160621762e-07, + "loss": 0.0007, + "reward": 1.9984716176986694, + "reward_std": 0.35650962591171265, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498471736907959, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5699481865284974, + "grad_norm": 61.21426229992077, + "kl": 0.160888671875, + "learning_rate": 9.432642487046632e-07, + "loss": 0.0005, + "reward": 1.975698173046112, + "reward_std": 0.00016388069985850962, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4756982028484344, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.572538860103627, + "grad_norm": 63.00321654029741, + "kl": 0.0494384765625, + "learning_rate": 9.430051813471503e-07, + "loss": -0.0004, + "reward": 2.4999773502349854, + "reward_std": 2.2917482510820264e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999977469444275, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5751295336787565, + "grad_norm": 46.67434402950752, + "kl": 0.10546875, + "learning_rate": 9.427461139896372e-07, + "loss": 0.0004, + "reward": 1.7007672786712646, + "reward_std": 0.29875028878450394, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.200767308473587, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5777202072538861, + "grad_norm": 19.6623794446999, + "kl": 0.10791015625, + "learning_rate": 9.424870466321243e-07, + "loss": 0.001, + "reward": 1.9999017715454102, + "reward_std": 6.802845064157736e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499901831150055, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.5803108808290155, + "grad_norm": 14.095627736677576, + "kl": 0.1168212890625, + "learning_rate": 9.422279792746114e-07, + "loss": 0.0008, + "reward": 2.4999054670333862, + "reward_std": 7.989784353412688e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999054670333862, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.582901554404145, + "grad_norm": 42.1967457306829, + "kl": 0.07763671875, + "learning_rate": 9.419689119170984e-07, + "loss": 0.0007, + "reward": 2.1219332218170166, + "reward_std": 0.2333798172276147, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6219332218170166, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.5854922279792746, + "grad_norm": 0.8337090023946528, + "kl": 0.0643310546875, + "learning_rate": 9.417098445595855e-07, + "loss": -0.0, + "reward": 2.4999881982803345, + "reward_std": 5.433427077150554e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881982803345, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5880829015544041, + "grad_norm": 6.560086811318631, + "kl": 0.0528564453125, + "learning_rate": 9.414507772020725e-07, + "loss": 0.0001, + "reward": 1.9985014200210571, + "reward_std": 6.0344076473484165e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985015094280243, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.5906735751295337, + "grad_norm": 13.593222108662275, + "kl": 0.03106689453125, + "learning_rate": 9.411917098445595e-07, + "loss": 0.0006, + "reward": 2.4998584985733032, + "reward_std": 9.021737014336395e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999858319759369, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.5932642487046632, + "grad_norm": 21.91224493587665, + "kl": 0.252197265625, + "learning_rate": 9.409326424870466e-07, + "loss": 0.0017, + "reward": 1.997519612312317, + "reward_std": 0.0008209343400267244, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975194931030273, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.5958549222797928, + "grad_norm": 17.488410459500873, + "kl": 0.099609375, + "learning_rate": 9.406735751295336e-07, + "loss": 0.0009, + "reward": 2.2499552965164185, + "reward_std": 0.26730949130433146, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749955177307129, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.5984455958549223, + "grad_norm": 41.68772485061926, + "kl": 2.037109375, + "learning_rate": 9.404145077720207e-07, + "loss": 0.0087, + "reward": 1.8484117984771729, + "reward_std": 0.00011486069325883363, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3484117984771729, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6010362694300518, + "grad_norm": 1.106281771503951, + "kl": 0.09521484375, + "learning_rate": 9.401554404145078e-07, + "loss": 0.0009, + "reward": 1.9999032020568848, + "reward_std": 1.5472111954295542e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999030530452728, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.6036269430051814, + "grad_norm": 14.44885688858764, + "kl": 0.080078125, + "learning_rate": 9.398963730569948e-07, + "loss": 0.0009, + "reward": 1.8022709488868713, + "reward_std": 0.0014438453572438448, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3022708296775818, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6062176165803109, + "grad_norm": 0.11546445822861898, + "kl": 0.1259765625, + "learning_rate": 9.396373056994819e-07, + "loss": 0.0009, + "reward": 2.499995470046997, + "reward_std": 1.2460781704248802e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6088082901554405, + "grad_norm": 41.4292529400018, + "kl": 0.1171875, + "learning_rate": 9.393782383419688e-07, + "loss": 0.0005, + "reward": 1.686115801334381, + "reward_std": 0.25914314383408055, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.186115801334381, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6113989637305699, + "grad_norm": 32.05389070781718, + "kl": 0.03466796875, + "learning_rate": 9.391191709844559e-07, + "loss": -0.0002, + "reward": 2.436464309692383, + "reward_std": 0.17833977332188056, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9364644885063171, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 0.6139896373056994, + "grad_norm": 23.54191375273367, + "kl": 0.115478515625, + "learning_rate": 9.38860103626943e-07, + "loss": 0.0006, + "reward": 1.9444739818572998, + "reward_std": 0.019849272669233642, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4444738030433655, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.616580310880829, + "grad_norm": 39.32229435436085, + "kl": 0.0594482421875, + "learning_rate": 9.3860103626943e-07, + "loss": -0.0002, + "reward": 2.1870386600494385, + "reward_std": 0.2591146485837612, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687038779258728, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.6191709844559585, + "grad_norm": 25.469284876150283, + "kl": 0.0361328125, + "learning_rate": 9.383419689119171e-07, + "loss": 0.0001, + "reward": 2.49897837638855, + "reward_std": 0.0004042975363063306, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9989783763885498, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6217616580310881, + "grad_norm": 5.608123288148271, + "kl": 0.0927734375, + "learning_rate": 9.38082901554404e-07, + "loss": 0.0009, + "reward": 2.4998337030410767, + "reward_std": 4.3772628799843005e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998336434364319, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6243523316062176, + "grad_norm": 0.5900102682819381, + "kl": 0.0611572265625, + "learning_rate": 9.378238341968911e-07, + "loss": -0.0006, + "reward": 2.499992847442627, + "reward_std": 3.861457003040414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6269430051813472, + "grad_norm": 15.233115618658832, + "kl": 0.086669921875, + "learning_rate": 9.375647668393782e-07, + "loss": -0.0003, + "reward": 1.9760212898254395, + "reward_std": 0.0007327220682782354, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.476021409034729, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6295336787564767, + "grad_norm": 89.63523003423197, + "kl": 0.15966796875, + "learning_rate": 9.373056994818652e-07, + "loss": 0.0003, + "reward": 1.9901127815246582, + "reward_std": 0.001632723069860731, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4901129603385925, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6321243523316062, + "grad_norm": 6.017951740763496, + "kl": 0.7403564453125, + "learning_rate": 9.370466321243523e-07, + "loss": 0.0038, + "reward": 2.4999749660491943, + "reward_std": 7.510585191994323e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999749660491943, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.6347150259067358, + "grad_norm": 0.33611723178730435, + "kl": 0.03533935546875, + "learning_rate": 9.367875647668393e-07, + "loss": -0.0009, + "reward": 2.4999945163726807, + "reward_std": 3.2188998773108324e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994695186615, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6373056994818653, + "grad_norm": 1.6092350101526522, + "kl": 0.05596923828125, + "learning_rate": 9.365284974093264e-07, + "loss": 0.0002, + "reward": 2.4999927282333374, + "reward_std": 4.545140654954594e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6398963730569949, + "grad_norm": 2.5295088538671244, + "kl": 0.079833984375, + "learning_rate": 9.362694300518134e-07, + "loss": 0.0003, + "reward": 2.4999828338623047, + "reward_std": 1.1938268286826315e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999828934669495, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6424870466321243, + "grad_norm": 1.859501078280743, + "kl": 0.0389404296875, + "learning_rate": 9.360103626943004e-07, + "loss": 0.0005, + "reward": 2.4999754428863525, + "reward_std": 2.1986423050179837e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999975562095642, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.6450777202072538, + "grad_norm": 2.697440450301362, + "kl": 0.0865478515625, + "learning_rate": 9.357512953367875e-07, + "loss": 0.0001, + "reward": 2.499985933303833, + "reward_std": 2.1488343918463215e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999858736991882, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6476683937823834, + "grad_norm": 39.343466209925005, + "kl": 0.080810546875, + "learning_rate": 9.354922279792745e-07, + "loss": 0.0003, + "reward": 2.2499629259109497, + "reward_std": 0.26729267278676616, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499629259109497, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.6502590673575129, + "grad_norm": 38.7713187585943, + "kl": 0.066009521484375, + "learning_rate": 9.352331606217616e-07, + "loss": -0.0001, + "reward": 2.499959111213684, + "reward_std": 4.292089155910617e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999591708183289, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6528497409326425, + "grad_norm": 33.689628441748106, + "kl": 0.0758056640625, + "learning_rate": 9.349740932642487e-07, + "loss": 0.0002, + "reward": 2.2461527585983276, + "reward_std": 0.2713722139735637, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7461528778076172, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.655440414507772, + "grad_norm": 0.2080987426799511, + "kl": 0.091156005859375, + "learning_rate": 9.347150259067356e-07, + "loss": 0.0001, + "reward": 2.4999970197677612, + "reward_std": 3.7653973095075344e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 0.6580310880829016, + "grad_norm": 15.77335774260603, + "kl": 0.0772705078125, + "learning_rate": 9.344559585492227e-07, + "loss": 0.0, + "reward": 2.0636579394340515, + "reward_std": 0.2693043718799828, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5636579394340515, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.6606217616580311, + "grad_norm": 102.94629211528023, + "kl": 0.056884765625, + "learning_rate": 9.341968911917099e-07, + "loss": 0.0002, + "reward": 2.025260090827942, + "reward_std": 0.39463518345746706, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.525260090827942, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6632124352331606, + "grad_norm": 24.710556727036387, + "kl": 0.1859130859375, + "learning_rate": 9.339378238341969e-07, + "loss": 0.0002, + "reward": 2.1874470710754395, + "reward_std": 0.25881719600437236, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687447190284729, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6658031088082902, + "grad_norm": 1.5521769698253913, + "kl": 0.052734375, + "learning_rate": 9.33678756476684e-07, + "loss": 0.0012, + "reward": 2.499988555908203, + "reward_std": 1.2101602123948396e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988317489624, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6683937823834197, + "grad_norm": 63.022558527622344, + "kl": 0.07275390625, + "learning_rate": 9.33419689119171e-07, + "loss": 0.0007, + "reward": 1.99865061044693, + "reward_std": 0.0002238482548762022, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986506700515747, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 0.6709844559585493, + "grad_norm": 46.7628067020373, + "kl": 0.111572265625, + "learning_rate": 9.33160621761658e-07, + "loss": 0.0004, + "reward": 1.8104197978973389, + "reward_std": 0.6800253987312317, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3104197978973389, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 0.6735751295336787, + "grad_norm": 30.072863939071013, + "kl": 0.118896484375, + "learning_rate": 9.329015544041451e-07, + "loss": 0.0008, + "reward": 2.057658076286316, + "reward_std": 0.17922852502466924, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5576579570770264, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6761658031088082, + "grad_norm": 172.33709551413486, + "kl": 0.1025390625, + "learning_rate": 9.326424870466321e-07, + "loss": 0.0004, + "reward": 1.2484757900238037, + "reward_std": 0.003200551262125373, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7484757602214813, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.6787564766839378, + "grad_norm": 54.55324273751358, + "kl": 0.068603515625, + "learning_rate": 9.323834196891192e-07, + "loss": 0.0004, + "reward": 2.078890562057495, + "reward_std": 0.2600107304310768, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5788904428482056, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6813471502590673, + "grad_norm": 14.233670491997772, + "kl": 0.15234375, + "learning_rate": 9.321243523316062e-07, + "loss": 0.0007, + "reward": 2.1872631311416626, + "reward_std": 0.2588548979751977, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6872629523277283, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.6839378238341969, + "grad_norm": 50.997230760582205, + "kl": 0.0169677734375, + "learning_rate": 9.318652849740933e-07, + "loss": -0.0008, + "reward": 2.312475085258484, + "reward_std": 0.25880399473894045, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124749660491943, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.6865284974093264, + "grad_norm": 0.33892982180117925, + "kl": 0.10888671875, + "learning_rate": 9.316062176165803e-07, + "loss": 0.0004, + "reward": 1.9999109506607056, + "reward_std": 5.484715074999258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999110102653503, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.689119170984456, + "grad_norm": 1.3070331629686442, + "kl": 0.03485107421875, + "learning_rate": 9.313471502590673e-07, + "loss": 0.0013, + "reward": 2.4999672174453735, + "reward_std": 1.4274331988417543e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999967098236084, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6917098445595855, + "grad_norm": 1.5513637564228338, + "kl": 0.05224609375, + "learning_rate": 9.310880829015544e-07, + "loss": -0.0002, + "reward": 2.49995756149292, + "reward_std": 1.746729623164356e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999576210975647, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.694300518134715, + "grad_norm": 112.77053232255466, + "kl": 0.04449462890625, + "learning_rate": 9.308290155440414e-07, + "loss": 0.0005, + "reward": 2.49994158744812, + "reward_std": 0.0001064716659016085, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999941647052765, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.6968911917098446, + "grad_norm": 52.051932382301246, + "kl": 0.069580078125, + "learning_rate": 9.305699481865285e-07, + "loss": 0.0004, + "reward": 1.8965779542922974, + "reward_std": 0.003717041016841449, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3965781033039093, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.6994818652849741, + "grad_norm": 105.24779156133805, + "kl": 0.0260467529296875, + "learning_rate": 9.303108808290156e-07, + "loss": 0.0001, + "reward": 1.99981689453125, + "reward_std": 0.00018113442831690918, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49981689453125, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.7020725388601037, + "grad_norm": 49.51360356628827, + "kl": 0.058837890625, + "learning_rate": 9.300518134715025e-07, + "loss": 0.0002, + "reward": 1.2127676010131836, + "reward_std": 0.2958464545663446, + "rewards/format_reward_rec": 0.875, + "rewards/point_reward": 0.7752676904201508, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 0.7046632124352331, + "grad_norm": 26.210604022267137, + "kl": 0.03125, + "learning_rate": 9.297927461139896e-07, + "loss": 0.0001, + "reward": 2.4371660947799683, + "reward_std": 0.17677150324652757, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9371660947799683, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7072538860103627, + "grad_norm": 5.61723189884251, + "kl": 0.099365234375, + "learning_rate": 9.295336787564766e-07, + "loss": 0.0002, + "reward": 2.499822735786438, + "reward_std": 8.499576142639853e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998226165771484, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 0.7098445595854922, + "grad_norm": 6.112321727623437, + "kl": 0.27496337890625, + "learning_rate": 9.292746113989637e-07, + "loss": 0.0006, + "reward": 1.9968271851539612, + "reward_std": 0.0001886479646486805, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4968271255493164, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.7124352331606217, + "grad_norm": 9.319562290667678, + "kl": 0.093994140625, + "learning_rate": 9.290155440414508e-07, + "loss": 0.0011, + "reward": 2.4994486570358276, + "reward_std": 0.00023547241016785847, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9994484186172485, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7150259067357513, + "grad_norm": 13.749548931709745, + "kl": 0.107666015625, + "learning_rate": 9.287564766839378e-07, + "loss": 0.0004, + "reward": 1.7446227669715881, + "reward_std": 0.2328637728933245, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2446226477622986, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7176165803108808, + "grad_norm": 0.29424144760711574, + "kl": 0.010528564453125, + "learning_rate": 9.284974093264248e-07, + "loss": 0.0005, + "reward": 2.4999974966049194, + "reward_std": 2.2441956843977096e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7202072538860104, + "grad_norm": 0.9127220665936261, + "kl": 0.09619140625, + "learning_rate": 9.282383419689118e-07, + "loss": 0.0005, + "reward": 2.4999959468841553, + "reward_std": 5.73918987356592e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7227979274611399, + "grad_norm": 2.7414774325603277, + "kl": 0.05029296875, + "learning_rate": 9.279792746113989e-07, + "loss": -0.0002, + "reward": 2.4994759559631348, + "reward_std": 4.502509227677365e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9994759559631348, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7253886010362695, + "grad_norm": 224.78166497058007, + "kl": 0.168701171875, + "learning_rate": 9.27720207253886e-07, + "loss": 0.0007, + "reward": 1.4996460676193237, + "reward_std": 0.5346071789172129, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9996460974216461, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.727979274611399, + "grad_norm": 29.767806976453638, + "kl": 0.093505859375, + "learning_rate": 9.27461139896373e-07, + "loss": 0.0005, + "reward": 2.499904751777649, + "reward_std": 0.00014851183732389472, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999048709869385, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.7305699481865285, + "grad_norm": 2.451734746053496, + "kl": 0.058837890625, + "learning_rate": 9.272020725388601e-07, + "loss": 0.001, + "reward": 2.4999754428863525, + "reward_std": 1.5983257071638945e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999753832817078, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7331606217616581, + "grad_norm": 1.8951235023993949, + "kl": 0.0263671875, + "learning_rate": 9.269430051813471e-07, + "loss": -0.0001, + "reward": 2.4999849796295166, + "reward_std": 1.7635909728141996e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850392341614, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 0.7357512953367875, + "grad_norm": 20.1055651487715, + "kl": 0.1324462890625, + "learning_rate": 9.266839378238341e-07, + "loss": 0.0004, + "reward": 2.029202103614807, + "reward_std": 0.19023123945163434, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5292021036148071, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7383419689119171, + "grad_norm": 2.3221891552741885, + "kl": 0.0855712890625, + "learning_rate": 9.264248704663212e-07, + "loss": -0.0003, + "reward": 2.4999667406082153, + "reward_std": 1.8513579334467067e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999666810035706, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.7409326424870466, + "grad_norm": 0.16696801434953953, + "kl": 0.062744140625, + "learning_rate": 9.261658031088082e-07, + "loss": -0.0005, + "reward": 2.4999759197235107, + "reward_std": 2.578143323717086e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999759793281555, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7435233160621761, + "grad_norm": 13.229863535225716, + "kl": 0.16259765625, + "learning_rate": 9.259067357512953e-07, + "loss": 0.0005, + "reward": 1.9019799828529358, + "reward_std": 0.00041689237696118653, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.401980072259903, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7461139896373057, + "grad_norm": 0.7634927553439628, + "kl": 0.0386962890625, + "learning_rate": 9.256476683937824e-07, + "loss": 0.001, + "reward": 2.499953866004944, + "reward_std": 4.253313250046631e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999536275863647, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7487046632124352, + "grad_norm": 97.37085161469062, + "kl": 0.0625, + "learning_rate": 9.253886010362693e-07, + "loss": 0.0006, + "reward": 1.9984523057937622, + "reward_std": 0.0002610503869391323, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984521865844727, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7512953367875648, + "grad_norm": 1.2218454461480925, + "kl": 0.0859375, + "learning_rate": 9.251295336787564e-07, + "loss": 0.0004, + "reward": 2.499990701675415, + "reward_std": 1.2970205034434912e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990701675415, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7538860103626943, + "grad_norm": 12.851982334525493, + "kl": 0.126220703125, + "learning_rate": 9.248704663212434e-07, + "loss": 0.0006, + "reward": 2.0624111890792847, + "reward_std": 0.17680646463577432, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624110698699951, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7564766839378239, + "grad_norm": 2.206207729074567, + "kl": 0.0709228515625, + "learning_rate": 9.246113989637305e-07, + "loss": -0.0004, + "reward": 2.4999241828918457, + "reward_std": 3.138162901450414e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999243021011353, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7590673575129534, + "grad_norm": 0.31928147397504814, + "kl": 0.0975341796875, + "learning_rate": 9.243523316062176e-07, + "loss": 0.0005, + "reward": 2.499974846839905, + "reward_std": 4.535467724053888e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999749660491943, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 0.7616580310880829, + "grad_norm": 30.018005918619796, + "kl": 0.1025390625, + "learning_rate": 9.240932642487046e-07, + "loss": 0.0, + "reward": 1.9774422645568848, + "reward_std": 0.007441002624545945, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4774422645568848, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 0.7642487046632125, + "grad_norm": 33.47510945410008, + "kl": 0.037841796875, + "learning_rate": 9.238341968911916e-07, + "loss": 0.0, + "reward": 1.8465708494186401, + "reward_std": 0.02236446195274766, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3465709686279297, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7668393782383419, + "grad_norm": 9.842075489774253, + "kl": 0.125, + "learning_rate": 9.235751295336786e-07, + "loss": 0.0001, + "reward": 2.499991536140442, + "reward_std": 7.91910986208677e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999915957450867, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7694300518134715, + "grad_norm": 0.3672338026132165, + "kl": 0.117431640625, + "learning_rate": 9.233160621761657e-07, + "loss": 0.0013, + "reward": 1.4999854564666748, + "reward_std": 3.936485882150009e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999852180480957, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.772020725388601, + "grad_norm": 0.817556391913545, + "kl": 0.0640869140625, + "learning_rate": 9.230569948186529e-07, + "loss": -0.0015, + "reward": 2.4999568462371826, + "reward_std": 9.18282648854074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999569058418274, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7746113989637305, + "grad_norm": 19.612597597260386, + "kl": 0.0623779296875, + "learning_rate": 9.227979274611399e-07, + "loss": 0.0005, + "reward": 2.3749282360076904, + "reward_std": 0.23157261063533952, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749281764030457, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7772020725388601, + "grad_norm": 16.9686411351817, + "kl": 0.037353515625, + "learning_rate": 9.22538860103627e-07, + "loss": -0.0002, + "reward": 2.0622934103012085, + "reward_std": 0.17680109746561357, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5622934699058533, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.7797927461139896, + "grad_norm": 0.38592383620459214, + "kl": 0.072021484375, + "learning_rate": 9.222797927461139e-07, + "loss": 0.0012, + "reward": 2.499995470046997, + "reward_std": 2.571936761341931e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.7823834196891192, + "grad_norm": 9.529719471129177, + "kl": 0.120361328125, + "learning_rate": 9.22020725388601e-07, + "loss": 0.0006, + "reward": 2.4999918937683105, + "reward_std": 1.1086000029081333e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999917149543762, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.7849740932642487, + "grad_norm": 3.5491984364146902, + "kl": 0.070068359375, + "learning_rate": 9.217616580310881e-07, + "loss": 0.0009, + "reward": 2.4999492168426514, + "reward_std": 2.715591938340367e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999490976333618, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 0.7875647668393783, + "grad_norm": 142.82475624809567, + "kl": 0.166259765625, + "learning_rate": 9.215025906735751e-07, + "loss": 0.0008, + "reward": 1.7831117510795593, + "reward_std": 0.08312539157242327, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2831116318702698, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.7901554404145078, + "grad_norm": 2.689398487678334, + "kl": 0.061767578125, + "learning_rate": 9.212435233160622e-07, + "loss": 0.0005, + "reward": 1.9999322295188904, + "reward_std": 1.3019260563851276e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999322593212128, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7927461139896373, + "grad_norm": 3.6373240126760895, + "kl": 0.0777587890625, + "learning_rate": 9.209844559585493e-07, + "loss": 0.0012, + "reward": 2.4999879598617554, + "reward_std": 1.44535373749477e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999876618385315, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7953367875647669, + "grad_norm": 5.630702097725107, + "kl": 0.09228515625, + "learning_rate": 9.207253886010362e-07, + "loss": -0.0004, + "reward": 2.4999611377716064, + "reward_std": 7.7118709214119e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999961256980896, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.7979274611398963, + "grad_norm": 17.864836840406202, + "kl": 0.0830078125, + "learning_rate": 9.204663212435233e-07, + "loss": 0.0011, + "reward": 2.499875545501709, + "reward_std": 8.189815252990229e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998753070831299, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.8005181347150259, + "grad_norm": 70.47198735069966, + "kl": 0.08544921875, + "learning_rate": 9.202072538860103e-07, + "loss": 0.0003, + "reward": 1.9197113513946533, + "reward_std": 0.04908056743443012, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4197113513946533, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8031088082901554, + "grad_norm": 36.96407934017639, + "kl": 0.063232421875, + "learning_rate": 9.199481865284974e-07, + "loss": -0.0003, + "reward": 1.8900312185287476, + "reward_std": 0.06787262001910221, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3900312185287476, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.805699481865285, + "grad_norm": 16.32845384721268, + "kl": 0.0328369140625, + "learning_rate": 9.196891191709845e-07, + "loss": -0.0008, + "reward": 2.4999873638153076, + "reward_std": 2.0436304765780733e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987542629242, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.8082901554404145, + "grad_norm": 0.6457197661261482, + "kl": 0.0504150390625, + "learning_rate": 9.194300518134715e-07, + "loss": 0.0002, + "reward": 2.499995708465576, + "reward_std": 4.167168214053163e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 0.810880829015544, + "grad_norm": 20.680251075732304, + "kl": 0.0853271484375, + "learning_rate": 9.191709844559585e-07, + "loss": 0.0001, + "reward": 1.0502718091011047, + "reward_std": 0.1444510220644588, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.5502718463540077, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8134715025906736, + "grad_norm": 20.340583633613686, + "kl": 0.052978515625, + "learning_rate": 9.189119170984455e-07, + "loss": -0.0011, + "reward": 2.4999191761016846, + "reward_std": 4.008891755802324e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999192357063293, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8160621761658031, + "grad_norm": 4.878211534311483, + "kl": 0.0787353515625, + "learning_rate": 9.186528497409326e-07, + "loss": -0.0003, + "reward": 2.4999196529388428, + "reward_std": 2.0191217117826454e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999196529388428, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8186528497409327, + "grad_norm": 82.92206731218685, + "kl": 0.080322265625, + "learning_rate": 9.183937823834197e-07, + "loss": -0.0002, + "reward": 2.2498987317085266, + "reward_std": 0.2673699298443353, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7498987913131714, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8212435233160622, + "grad_norm": 19.816342034729935, + "kl": 0.120361328125, + "learning_rate": 9.181347150259067e-07, + "loss": 0.0012, + "reward": 2.4374876022338867, + "reward_std": 0.17680068753747946, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374876618385315, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8238341968911918, + "grad_norm": 11.442529464702275, + "kl": 0.054168701171875, + "learning_rate": 9.178756476683938e-07, + "loss": 0.0006, + "reward": 1.9945263862609863, + "reward_std": 0.01433517888881397, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4945263266563416, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8264248704663213, + "grad_norm": 15.37448789211719, + "kl": 0.097900390625, + "learning_rate": 9.176165803108807e-07, + "loss": 0.0005, + "reward": 1.9372231364250183, + "reward_std": 0.17718149179563625, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.437223196029663, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.8290155440414507, + "grad_norm": 49.80289094742806, + "kl": 0.205078125, + "learning_rate": 9.173575129533678e-07, + "loss": 0.0007, + "reward": 1.5752655267715454, + "reward_std": 0.23202923552889843, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0752655863761902, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8316062176165803, + "grad_norm": 47.59199698919065, + "kl": 0.0540771484375, + "learning_rate": 9.170984455958549e-07, + "loss": 0.0001, + "reward": 2.3747940063476562, + "reward_std": 0.23171583090010017, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8747938871383667, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 0.8341968911917098, + "grad_norm": 24.09727537643488, + "kl": 0.1103515625, + "learning_rate": 9.168393782383419e-07, + "loss": -0.0001, + "reward": 2.4130406379699707, + "reward_std": 0.2459504969729096, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.913040816783905, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 0.8367875647668394, + "grad_norm": 288.91013853276496, + "kl": 0.079833984375, + "learning_rate": 9.16580310880829e-07, + "loss": 0.0005, + "reward": 1.9372283816337585, + "reward_std": 0.17721589557436346, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.437228262424469, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 0.8393782383419689, + "grad_norm": 0.2791848515241179, + "kl": 0.193359375, + "learning_rate": 9.16321243523316e-07, + "loss": 0.0009, + "reward": 2.499991536140442, + "reward_std": 3.674611775750236e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999917149543762, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8419689119170984, + "grad_norm": 0.3806673610876601, + "kl": 0.1790771484375, + "learning_rate": 9.16062176165803e-07, + "loss": 0.0013, + "reward": 2.4999841451644897, + "reward_std": 4.476777007766941e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839067459106, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.844559585492228, + "grad_norm": 1.3217356707992471, + "kl": 0.0582275390625, + "learning_rate": 9.158031088082901e-07, + "loss": 0.0005, + "reward": 2.4999953508377075, + "reward_std": 5.780481842521112e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.8471502590673575, + "grad_norm": 5.226186957656574, + "kl": 0.084716796875, + "learning_rate": 9.155440414507771e-07, + "loss": 0.0006, + "reward": 2.4999561309814453, + "reward_std": 3.9707473519001724e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99995619058609, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.8497409326424871, + "grad_norm": 8.03176763596765, + "kl": 0.103515625, + "learning_rate": 9.152849740932642e-07, + "loss": -0.0003, + "reward": 1.9965597987174988, + "reward_std": 0.0002733978952846883, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4965597987174988, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8523316062176166, + "grad_norm": 3.67055638756576, + "kl": 0.05072021484375, + "learning_rate": 9.150259067357513e-07, + "loss": 0.0002, + "reward": 2.4999865293502808, + "reward_std": 1.6127025901369052e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999865293502808, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8549222797927462, + "grad_norm": 1.789042247604925, + "kl": 0.0400390625, + "learning_rate": 9.147668393782383e-07, + "loss": 0.0005, + "reward": 2.4999849796295166, + "reward_std": 1.2387885362841189e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850392341614, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8575129533678757, + "grad_norm": 1.0957686274891045, + "kl": 0.0782470703125, + "learning_rate": 9.145077720207253e-07, + "loss": -0.0005, + "reward": 2.4999908208847046, + "reward_std": 5.092277120866129e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8601036269430051, + "grad_norm": 0.29619015825042827, + "kl": 0.073974609375, + "learning_rate": 9.142487046632123e-07, + "loss": -0.0003, + "reward": 2.499997615814209, + "reward_std": 1.081261942204037e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8626943005181347, + "grad_norm": 32.1185168344218, + "kl": 0.100341796875, + "learning_rate": 9.139896373056994e-07, + "loss": 0.0006, + "reward": 2.437234878540039, + "reward_std": 0.17700892945867963, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937234878540039, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8652849740932642, + "grad_norm": 16.46751519181484, + "kl": 0.1240234375, + "learning_rate": 9.137305699481865e-07, + "loss": 0.0003, + "reward": 1.9995706677436829, + "reward_std": 0.0005153576767042978, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995706379413605, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8678756476683938, + "grad_norm": 0.11411298308077232, + "kl": 0.09136962890625, + "learning_rate": 9.134715025906735e-07, + "loss": 0.0005, + "reward": 2.4999969005584717, + "reward_std": 1.47363857649907e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8704663212435233, + "grad_norm": 8.194679909853198, + "kl": 0.0601806640625, + "learning_rate": 9.132124352331606e-07, + "loss": 0.0002, + "reward": 1.9984994530677795, + "reward_std": 7.842617560527287e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984994530677795, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8730569948186528, + "grad_norm": 23.164207991034793, + "kl": 0.07598876953125, + "learning_rate": 9.129533678756475e-07, + "loss": 0.0001, + "reward": 2.4998586177825928, + "reward_std": 0.00011591546832789845, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998587369918823, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.8756476683937824, + "grad_norm": 1.472658752182045, + "kl": 0.267578125, + "learning_rate": 9.126943005181346e-07, + "loss": 0.0006, + "reward": 2.4999876022338867, + "reward_std": 5.665144087174667e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999876022338867, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.8782383419689119, + "grad_norm": 30.440232194276742, + "kl": 0.0848388671875, + "learning_rate": 9.124352331606217e-07, + "loss": 0.0005, + "reward": 1.4549660682678223, + "reward_std": 0.00027229699480812997, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9549659788608551, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.8808290155440415, + "grad_norm": 6.091295098475844, + "kl": 0.078125, + "learning_rate": 9.121761658031087e-07, + "loss": 0.0003, + "reward": 2.4999663829803467, + "reward_std": 3.496112094580894e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999966561794281, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.883419689119171, + "grad_norm": 1.9190445297757928, + "kl": 0.130859375, + "learning_rate": 9.119170984455959e-07, + "loss": -0.0, + "reward": 2.499989867210388, + "reward_std": 6.505336699547115e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989926815033, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.8860103626943006, + "grad_norm": 14.081115069683012, + "kl": 0.125, + "learning_rate": 9.116580310880829e-07, + "loss": 0.0002, + "reward": 2.4998468160629272, + "reward_std": 0.00034713813329290133, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998469352722168, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 0.8886010362694301, + "grad_norm": 171.41683522292834, + "kl": 0.1173095703125, + "learning_rate": 9.113989637305699e-07, + "loss": 0.001, + "reward": 2.0622345209121704, + "reward_std": 0.17686631905849026, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5622344613075256, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8911917098445595, + "grad_norm": 7.446769607232678, + "kl": 0.1229248046875, + "learning_rate": 9.11139896373057e-07, + "loss": 0.0008, + "reward": 1.7495030164718628, + "reward_std": 0.00013970469404966934, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.249502956867218, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.8937823834196891, + "grad_norm": 8.91402227900581, + "kl": 0.1009521484375, + "learning_rate": 9.10880829015544e-07, + "loss": 0.0006, + "reward": 2.435795307159424, + "reward_std": 0.18159472515071684, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9357953071594238, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.8963730569948186, + "grad_norm": 29.332694146258852, + "kl": 0.08294677734375, + "learning_rate": 9.106217616580311e-07, + "loss": 0.0004, + "reward": 1.9988747239112854, + "reward_std": 7.921733703142309e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498874545097351, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.8989637305699482, + "grad_norm": 51.401515984558365, + "kl": 0.070068359375, + "learning_rate": 9.103626943005181e-07, + "loss": -0.0007, + "reward": 2.1312029361724854, + "reward_std": 0.312970283900313, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6312029957771301, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 0.9015544041450777, + "grad_norm": 17.99775269493687, + "kl": 0.075439453125, + "learning_rate": 9.101036269430052e-07, + "loss": 0.0002, + "reward": 1.5596943497657776, + "reward_std": 0.17774493167962646, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0596943497657776, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.9041450777202072, + "grad_norm": 1.6418529985520844, + "kl": 0.3291015625, + "learning_rate": 9.098445595854922e-07, + "loss": 0.0003, + "reward": 1.999531626701355, + "reward_std": 3.672952215083569e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995318055152893, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9067357512953368, + "grad_norm": 22.791540540252825, + "kl": 0.19677734375, + "learning_rate": 9.095854922279792e-07, + "loss": 0.0008, + "reward": 2.0622240900993347, + "reward_std": 0.1768865605378096, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.56222403049469, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 0.9093264248704663, + "grad_norm": 2.296782992922078, + "kl": 0.1171875, + "learning_rate": 9.093264248704663e-07, + "loss": -0.0002, + "reward": 2.4999877214431763, + "reward_std": 7.551020416940446e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9119170984455959, + "grad_norm": 3.3214137122798286, + "kl": 0.045867919921875, + "learning_rate": 9.090673575129534e-07, + "loss": 0.0004, + "reward": 2.4999700784683228, + "reward_std": 1.883913378719626e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999700784683228, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.9145077720207254, + "grad_norm": 56.831647032202035, + "kl": 0.2109375, + "learning_rate": 9.088082901554404e-07, + "loss": 0.0011, + "reward": 1.492597222328186, + "reward_std": 0.000533302802068647, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9925971627235413, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.917098445595855, + "grad_norm": 0.5821065240202035, + "kl": 0.05194091796875, + "learning_rate": 9.085492227979275e-07, + "loss": -0.0001, + "reward": 2.499998688697815, + "reward_std": 1.1890253972524079e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9196891191709845, + "grad_norm": 1.0197640860720332, + "kl": 0.108001708984375, + "learning_rate": 9.082901554404144e-07, + "loss": 0.0007, + "reward": 2.4999961853027344, + "reward_std": 5.204545459491783e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9222797927461139, + "grad_norm": 0.6594272411473232, + "kl": 0.03082275390625, + "learning_rate": 9.080310880829015e-07, + "loss": 0.0007, + "reward": 2.499996066093445, + "reward_std": 3.3537564831931377e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9248704663212435, + "grad_norm": 16.270326827422107, + "kl": 0.136474609375, + "learning_rate": 9.077720207253886e-07, + "loss": 0.0005, + "reward": 2.4999510049819946, + "reward_std": 3.0186531603249023e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999508261680603, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.927461139896373, + "grad_norm": 3.13298576907092, + "kl": 0.05780029296875, + "learning_rate": 9.075129533678756e-07, + "loss": 0.0008, + "reward": 2.4994637966156006, + "reward_std": 1.5863175576669164e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999463677406311, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 0.9300518134715026, + "grad_norm": 72.89154611616695, + "kl": 0.160888671875, + "learning_rate": 9.072538860103627e-07, + "loss": 0.0006, + "reward": 1.580840289592743, + "reward_std": 0.20294279605150223, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0808402746915817, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.9326424870466321, + "grad_norm": 2.6762654987413073, + "kl": 0.09521484375, + "learning_rate": 9.069948186528497e-07, + "loss": 0.0009, + "reward": 1.4995691776275635, + "reward_std": 2.641398987179855e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9995691776275635, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 0.9352331606217616, + "grad_norm": 3.7400608704217895, + "kl": 0.117431640625, + "learning_rate": 9.067357512953367e-07, + "loss": -0.0, + "reward": 1.9623454809188843, + "reward_std": 0.00015472333325305954, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4623453915119171, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.9378238341968912, + "grad_norm": 0.08400091985170403, + "kl": 0.07373046875, + "learning_rate": 9.064766839378238e-07, + "loss": 0.0006, + "reward": 2.4999977350234985, + "reward_std": 9.096793291973881e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9404145077720207, + "grad_norm": 89.9617485294265, + "kl": 0.1248779296875, + "learning_rate": 9.062176165803108e-07, + "loss": 0.0, + "reward": 1.9987398982048035, + "reward_std": 0.0006881913602967415, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498740017414093, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 0.9430051813471503, + "grad_norm": 50.30079137772382, + "kl": 0.40478515625, + "learning_rate": 9.059585492227979e-07, + "loss": 0.0018, + "reward": 1.937167227268219, + "reward_std": 0.17718254558712943, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4371671676635742, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 0.9455958549222798, + "grad_norm": 104.06780316144935, + "kl": 0.07373046875, + "learning_rate": 9.056994818652849e-07, + "loss": 0.0012, + "reward": 1.8179743885993958, + "reward_std": 0.1400307110416179, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3179743885993958, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 0.9481865284974094, + "grad_norm": 83.27529841185802, + "kl": 0.11328125, + "learning_rate": 9.05440414507772e-07, + "loss": 0.0011, + "reward": 2.1393807530403137, + "reward_std": 0.29861046785254075, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6393807530403137, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9507772020725389, + "grad_norm": 21.582388339329956, + "kl": 0.0723876953125, + "learning_rate": 9.051813471502591e-07, + "loss": -0.0006, + "reward": 1.9987910985946655, + "reward_std": 1.44862519846356e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987912774085999, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9533678756476683, + "grad_norm": 1.5179477933274999, + "kl": 0.0594482421875, + "learning_rate": 9.04922279792746e-07, + "loss": 0.0, + "reward": 1.9997722506523132, + "reward_std": 8.410090231336653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997723698616028, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9559585492227979, + "grad_norm": 29.422458287839596, + "kl": 0.05364990234375, + "learning_rate": 9.046632124352331e-07, + "loss": 0.0009, + "reward": 1.9995509386062622, + "reward_std": 0.0003094946463306769, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995508790016174, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9585492227979274, + "grad_norm": 0.5454563402653053, + "kl": 0.1219482421875, + "learning_rate": 9.044041450777201e-07, + "loss": -0.0004, + "reward": 2.499989628791809, + "reward_std": 3.767214707295352e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999898076057434, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.961139896373057, + "grad_norm": 0.9719703172323875, + "kl": 0.0689697265625, + "learning_rate": 9.041450777202072e-07, + "loss": 0.0004, + "reward": 2.4999566078186035, + "reward_std": 1.0819015187735204e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999566078186035, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9637305699481865, + "grad_norm": 0.48035294193102046, + "kl": 0.1015625, + "learning_rate": 9.038860103626943e-07, + "loss": -0.0002, + "reward": 2.4999879598617554, + "reward_std": 3.2277424111271102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.966321243523316, + "grad_norm": 14.174581515389617, + "kl": 0.149658203125, + "learning_rate": 9.036269430051813e-07, + "loss": -0.0012, + "reward": 2.499988317489624, + "reward_std": 1.088481485567172e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999885559082031, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9689119170984456, + "grad_norm": 12.685892717933179, + "kl": 0.0614013671875, + "learning_rate": 9.033678756476683e-07, + "loss": -0.0001, + "reward": 1.9979270696640015, + "reward_std": 3.1579536880599335e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497927188873291, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.9715025906735751, + "grad_norm": 2.778538104881977, + "kl": 0.075439453125, + "learning_rate": 9.031088082901554e-07, + "loss": 0.0006, + "reward": 2.4998711347579956, + "reward_std": 2.0717866846098332e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998711347579956, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 0.9740932642487047, + "grad_norm": 7.72037507535947, + "kl": 0.0986328125, + "learning_rate": 9.028497409326424e-07, + "loss": 0.0001, + "reward": 2.4999849796295166, + "reward_std": 5.301843089000613e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850392341614, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9766839378238342, + "grad_norm": 60.81711602963703, + "kl": 0.136962890625, + "learning_rate": 9.025906735751295e-07, + "loss": 0.0007, + "reward": 2.3437013626098633, + "reward_std": 0.4419448544445004, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.874951422214508, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.9792746113989638, + "grad_norm": 0.3299355142878522, + "kl": 0.1153564453125, + "learning_rate": 9.023316062176165e-07, + "loss": 0.0005, + "reward": 2.499995708465576, + "reward_std": 1.985798633086233e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 0.9818652849740933, + "grad_norm": 0.31302396124451454, + "kl": 0.1279296875, + "learning_rate": 9.020725388601036e-07, + "loss": 0.0005, + "reward": 2.4999958276748657, + "reward_std": 1.7543504782224772e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.9844559585492227, + "grad_norm": 24.325849383190917, + "kl": 0.11181640625, + "learning_rate": 9.018134715025906e-07, + "loss": 0.0005, + "reward": 2.437013030052185, + "reward_std": 0.17696355968655553, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9370129108428955, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.9870466321243523, + "grad_norm": 7.892168497896422, + "kl": 0.134765625, + "learning_rate": 9.015544041450776e-07, + "loss": 0.0008, + "reward": 1.9984101057052612, + "reward_std": 2.598165337985847e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984100759029388, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 0.9896373056994818, + "grad_norm": 0.12639145848943747, + "kl": 0.03680419921875, + "learning_rate": 9.012953367875647e-07, + "loss": -0.0004, + "reward": 2.4999992847442627, + "reward_std": 7.902503398327099e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999995231628418, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 0.9922279792746114, + "grad_norm": 6.604066305838429, + "kl": 0.064208984375, + "learning_rate": 9.010362694300517e-07, + "loss": -0.0002, + "reward": 2.49996554851532, + "reward_std": 2.8886501922897878e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999655485153198, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.25, + "epoch": 0.9948186528497409, + "grad_norm": 35.23091864141904, + "kl": 0.14068603515625, + "learning_rate": 9.007772020725389e-07, + "loss": 0.0009, + "reward": 1.995453953742981, + "reward_std": 0.0029676412481194347, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4954538643360138, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 0.9974093264248705, + "grad_norm": 77.43882101210683, + "kl": 0.08740234375, + "learning_rate": 9.00518134715026e-07, + "loss": 0.0004, + "reward": 1.9996147155761719, + "reward_std": 0.3537828028202057, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996147751808167, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.0, + "grad_norm": 0.39455606315651964, + "kl": 0.14013671875, + "learning_rate": 9.002590673575129e-07, + "loss": 0.0004, + "reward": 2.4999988079071045, + "reward_std": 1.5925973571029317e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0025906735751295, + "grad_norm": 27.93992197213771, + "kl": 0.1181640625, + "learning_rate": 9e-07, + "loss": 0.0005, + "reward": 1.8049081563949585, + "reward_std": 0.26289142668247223, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.304908275604248, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.005181347150259, + "grad_norm": 78.09932939848163, + "kl": 0.15283203125, + "learning_rate": 8.99740932642487e-07, + "loss": 0.0012, + "reward": 2.062077045440674, + "reward_std": 0.1768815812278035, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5620769262313843, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0077720207253886, + "grad_norm": 0.42449608013010753, + "kl": 0.115478515625, + "learning_rate": 8.994818652849741e-07, + "loss": 0.0005, + "reward": 2.4999442100524902, + "reward_std": 4.055633780808421e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999443292617798, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0103626943005182, + "grad_norm": 5.91364886020048, + "kl": 0.08642578125, + "learning_rate": 8.992227979274612e-07, + "loss": -0.0004, + "reward": 2.499927043914795, + "reward_std": 1.4638307163750142e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999269843101501, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0129533678756477, + "grad_norm": 407.99797627916894, + "kl": 0.09033203125, + "learning_rate": 8.989637305699482e-07, + "loss": -0.0005, + "reward": 2.4367836713790894, + "reward_std": 0.17879251341669544, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9367839097976685, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.0155440414507773, + "grad_norm": 1.5800756359407757, + "kl": 0.0679931640625, + "learning_rate": 8.987046632124352e-07, + "loss": 0.0008, + "reward": 2.4999600648880005, + "reward_std": 1.0461450870025146e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999601244926453, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.0181347150259068, + "grad_norm": 0.30021282422788814, + "kl": 0.115478515625, + "learning_rate": 8.984455958549222e-07, + "loss": 0.0006, + "reward": 2.499990463256836, + "reward_std": 3.2136514391822857e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0207253886010363, + "grad_norm": 4.4393182473073285, + "kl": 0.049560546875, + "learning_rate": 8.981865284974093e-07, + "loss": -0.0001, + "reward": 2.499978542327881, + "reward_std": 9.545959187562403e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999785423278809, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0233160621761659, + "grad_norm": 4.048947856971386, + "kl": 0.08056640625, + "learning_rate": 8.979274611398964e-07, + "loss": -0.0002, + "reward": 2.4999908208847046, + "reward_std": 9.642950658417249e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0259067357512954, + "grad_norm": 0.10696531565404, + "kl": 0.08447265625, + "learning_rate": 8.976683937823834e-07, + "loss": 0.0009, + "reward": 2.4999988079071045, + "reward_std": 9.770348015081254e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.028497409326425, + "grad_norm": 28.26823879348509, + "kl": 4.21484375, + "learning_rate": 8.974093264248705e-07, + "loss": 0.0161, + "reward": 2.12465238571167, + "reward_std": 0.5670378761615211, + "rewards/format_reward_rec": 0.875, + "rewards/point_reward": 1.6871524453163147, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 1.0310880829015545, + "grad_norm": 44.52193042461309, + "kl": 0.4208984375, + "learning_rate": 8.971502590673574e-07, + "loss": 0.0009, + "reward": 2.020863175392151, + "reward_std": 0.19359701108623995, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5208631753921509, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.0336787564766838, + "grad_norm": 15.466613267417598, + "kl": 0.132080078125, + "learning_rate": 8.968911917098445e-07, + "loss": 0.0008, + "reward": 2.2499775886535645, + "reward_std": 0.26726902516281825, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499774098396301, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.0362694300518134, + "grad_norm": 31.74878005086349, + "kl": 0.1572265625, + "learning_rate": 8.966321243523316e-07, + "loss": 0.0008, + "reward": 2.4999704360961914, + "reward_std": 4.443721172719961e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999704360961914, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.038860103626943, + "grad_norm": 0.7896260532318355, + "kl": 0.48193359375, + "learning_rate": 8.963730569948186e-07, + "loss": 0.0021, + "reward": 2.4999923706054688, + "reward_std": 1.165152184512408e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924302101135, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0414507772020725, + "grad_norm": 0.07177787029553469, + "kl": 0.0709228515625, + "learning_rate": 8.961139896373057e-07, + "loss": -0.0007, + "reward": 2.4999982118606567, + "reward_std": 1.2793494192919752e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.044041450777202, + "grad_norm": 2.6256143014851956, + "kl": 0.12548828125, + "learning_rate": 8.958549222797928e-07, + "loss": 0.0002, + "reward": 2.499992847442627, + "reward_std": 1.1741248272301164e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0466321243523315, + "grad_norm": 10.497894106375623, + "kl": 0.1708984375, + "learning_rate": 8.955958549222797e-07, + "loss": 0.0002, + "reward": 2.4303700923919678, + "reward_std": 0.19693740084039746, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9303700923919678, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.049222797927461, + "grad_norm": 6.896369178768509, + "kl": 0.051513671875, + "learning_rate": 8.953367875647668e-07, + "loss": 0.0002, + "reward": 2.0565009713172913, + "reward_std": 0.17914094313570672, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5565009117126465, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0518134715025906, + "grad_norm": 43.64665359616527, + "kl": 0.105712890625, + "learning_rate": 8.950777202072538e-07, + "loss": 0.001, + "reward": 1.9859212636947632, + "reward_std": 0.00043847410643138574, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4859214425086975, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.0544041450777202, + "grad_norm": 3.7514510778873214, + "kl": 0.140625, + "learning_rate": 8.948186528497409e-07, + "loss": 0.0009, + "reward": 2.4999618530273438, + "reward_std": 1.215601493242957e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999618530273438, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0569948186528497, + "grad_norm": 0.33350622045235956, + "kl": 0.092041015625, + "learning_rate": 8.94559585492228e-07, + "loss": 0.0014, + "reward": 2.499992847442627, + "reward_std": 2.475089104336803e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926090240479, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0595854922279793, + "grad_norm": 0.22041465229516474, + "kl": 0.12158203125, + "learning_rate": 8.94300518134715e-07, + "loss": 0.0002, + "reward": 2.4999959468841553, + "reward_std": 1.6094693933155213e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0621761658031088, + "grad_norm": 81.53732926487953, + "kl": 0.0968017578125, + "learning_rate": 8.94041450777202e-07, + "loss": 0.001, + "reward": 1.9990376234054565, + "reward_std": 0.0004494970630730677, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4990374445915222, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0647668393782384, + "grad_norm": 417.94326039059274, + "kl": 0.150390625, + "learning_rate": 8.93782383419689e-07, + "loss": 0.0007, + "reward": 1.739211082458496, + "reward_std": 0.2786605658211556, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2392111420631409, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.067357512953368, + "grad_norm": 12.622790924108282, + "kl": 0.170166015625, + "learning_rate": 8.935233160621761e-07, + "loss": 0.001, + "reward": 1.998087465763092, + "reward_std": 3.326684196736096e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4980872869491577, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.0699481865284974, + "grad_norm": 58.75209329973357, + "kl": 0.115478515625, + "learning_rate": 8.932642487046632e-07, + "loss": 0.0006, + "reward": 2.400259017944336, + "reward_std": 0.2820926400289636, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9002589583396912, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.072538860103627, + "grad_norm": 9.20560961431424, + "kl": 0.0828857421875, + "learning_rate": 8.930051813471502e-07, + "loss": 0.0004, + "reward": 2.4999626874923706, + "reward_std": 2.5570667162355676e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999626874923706, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 1.0751295336787565, + "grad_norm": 22.774684195504737, + "kl": 0.13720703125, + "learning_rate": 8.927461139896373e-07, + "loss": 0.0003, + "reward": 1.9740850925445557, + "reward_std": 0.025580175279174, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4740851521492004, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.077720207253886, + "grad_norm": 0.4648953456206473, + "kl": 0.0804443359375, + "learning_rate": 8.924870466321242e-07, + "loss": 0.0002, + "reward": 2.499995470046997, + "reward_std": 2.4615105758130085e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0803108808290156, + "grad_norm": 26.40377335945731, + "kl": 0.10107421875, + "learning_rate": 8.922279792746113e-07, + "loss": -0.0003, + "reward": 1.9838183522224426, + "reward_std": 0.015582584572257474, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4838182926177979, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0829015544041452, + "grad_norm": 90.75785815686979, + "kl": 0.155029296875, + "learning_rate": 8.919689119170984e-07, + "loss": 0.0008, + "reward": 1.9772456884384155, + "reward_std": 0.008828636850012117, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4772456884384155, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0854922279792747, + "grad_norm": 257.53600374977816, + "kl": 0.07568359375, + "learning_rate": 8.917098445595854e-07, + "loss": 0.0003, + "reward": 1.7357445359230042, + "reward_std": 0.2677098226849921, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.235744595527649, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.0880829015544042, + "grad_norm": 0.29778990538808714, + "kl": 0.0621337890625, + "learning_rate": 8.914507772020725e-07, + "loss": 0.0005, + "reward": 2.4999656677246094, + "reward_std": 3.783144506996905e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999656081199646, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0906735751295338, + "grad_norm": 72.02617062099873, + "kl": 0.09375, + "learning_rate": 8.911917098445595e-07, + "loss": 0.0005, + "reward": 2.3123912811279297, + "reward_std": 0.2589216949972979, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8123913407325745, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.093264248704663, + "grad_norm": 7.580179253156264, + "kl": 0.1044921875, + "learning_rate": 8.909326424870465e-07, + "loss": 0.0006, + "reward": 1.9999219179153442, + "reward_std": 1.1002060546161374e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49992173910141, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.0958549222797926, + "grad_norm": 2.4919894278984396, + "kl": 0.0460205078125, + "learning_rate": 8.906735751295336e-07, + "loss": -0.0002, + "reward": 2.49999463558197, + "reward_std": 7.017026547373462e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.0984455958549222, + "grad_norm": 13.513076731364253, + "kl": 0.092041015625, + "learning_rate": 8.904145077720206e-07, + "loss": 0.0003, + "reward": 1.7202502489089966, + "reward_std": 0.0008438759050477529, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2202502489089966, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.1010362694300517, + "grad_norm": 0.6303763190684515, + "kl": 0.157958984375, + "learning_rate": 8.901554404145077e-07, + "loss": 0.0006, + "reward": 2.499995470046997, + "reward_std": 4.150169672811899e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 1.1036269430051813, + "grad_norm": 130.92885317581852, + "kl": 0.07666015625, + "learning_rate": 8.898963730569949e-07, + "loss": 0.001, + "reward": 1.8611122965812683, + "reward_std": 0.2581528257969694, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.361112117767334, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1062176165803108, + "grad_norm": 2.327998252567789, + "kl": 0.077880859375, + "learning_rate": 8.896373056994819e-07, + "loss": 0.0001, + "reward": 2.4999914169311523, + "reward_std": 8.450096345313796e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991536140442, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 1.1088082901554404, + "grad_norm": 4.498881975499488, + "kl": 0.16650390625, + "learning_rate": 8.893782383419689e-07, + "loss": 0.0004, + "reward": 2.499961018562317, + "reward_std": 2.298750814588857e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999611377716064, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.11139896373057, + "grad_norm": 0.1715745423148994, + "kl": 0.0966796875, + "learning_rate": 8.891191709844559e-07, + "loss": 0.0004, + "reward": 2.4999970197677612, + "reward_std": 2.584576861863752e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.1139896373056994, + "grad_norm": 0.2587929534288508, + "kl": 0.127197265625, + "learning_rate": 8.88860103626943e-07, + "loss": 0.0001, + "reward": 2.4999966621398926, + "reward_std": 1.0830531067540505e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.116580310880829, + "grad_norm": 3.392796252003056, + "kl": 0.09521484375, + "learning_rate": 8.886010362694301e-07, + "loss": -0.0002, + "reward": 2.4999715089797974, + "reward_std": 1.6701092931725725e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999715685844421, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 1.1191709844559585, + "grad_norm": 66.24620245667927, + "kl": 0.29638671875, + "learning_rate": 8.883419689119171e-07, + "loss": 0.0012, + "reward": 1.4852866530418396, + "reward_std": 0.013451165985316038, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9852865636348724, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 1.121761658031088, + "grad_norm": 0.8980414182681852, + "kl": 0.0533447265625, + "learning_rate": 8.880829015544042e-07, + "loss": -0.0003, + "reward": 2.49997615814209, + "reward_std": 8.914175850804895e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999761581420898, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1243523316062176, + "grad_norm": 0.10466006766160264, + "kl": 0.0859375, + "learning_rate": 8.878238341968911e-07, + "loss": 0.0003, + "reward": 2.499997138977051, + "reward_std": 1.197195842905785e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1269430051813472, + "grad_norm": 56.64892976696522, + "kl": 0.0986328125, + "learning_rate": 8.875647668393782e-07, + "loss": 0.0002, + "reward": 1.892016589641571, + "reward_std": 0.3051476856244335, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3920166790485382, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.1295336787564767, + "grad_norm": 86.73903645096914, + "kl": 0.156005859375, + "learning_rate": 8.873056994818653e-07, + "loss": 0.0006, + "reward": 1.9997649192810059, + "reward_std": 9.56018175202189e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499765008687973, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1321243523316062, + "grad_norm": 0.6464813836860527, + "kl": 0.097412109375, + "learning_rate": 8.870466321243523e-07, + "loss": 0.0013, + "reward": 2.4999921321868896, + "reward_std": 3.5397972624195972e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1347150259067358, + "grad_norm": 0.24719754877821865, + "kl": 0.068359375, + "learning_rate": 8.867875647668394e-07, + "loss": 0.0012, + "reward": 2.499998092651367, + "reward_std": 2.3967554625414778e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.1373056994818653, + "grad_norm": 13.79375585716135, + "kl": 0.09375, + "learning_rate": 8.865284974093264e-07, + "loss": 0.0008, + "reward": 1.9852967262268066, + "reward_std": 0.001258290941677842, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4852966964244843, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1398963730569949, + "grad_norm": 4.905607975061321, + "kl": 0.067138671875, + "learning_rate": 8.862694300518134e-07, + "loss": 0.0007, + "reward": 1.9001922607421875, + "reward_std": 0.0001536047930130735, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.400191992521286, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1424870466321244, + "grad_norm": 2.1023873650679272, + "kl": 0.054443359375, + "learning_rate": 8.860103626943005e-07, + "loss": 0.0003, + "reward": 2.499992847442627, + "reward_std": 6.464971988862089e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.145077720207254, + "grad_norm": 0.13724521017587, + "kl": 0.0723876953125, + "learning_rate": 8.857512953367875e-07, + "loss": -0.0006, + "reward": 2.4999979734420776, + "reward_std": 9.475184441498641e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1476683937823835, + "grad_norm": 34.30903798601372, + "kl": 0.1748046875, + "learning_rate": 8.854922279792746e-07, + "loss": 0.0019, + "reward": 2.4999849796295166, + "reward_std": 2.4904329166020034e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999848008155823, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.150259067357513, + "grad_norm": 4.45419009327568, + "kl": 0.103759765625, + "learning_rate": 8.852331606217616e-07, + "loss": 0.0014, + "reward": 2.499996542930603, + "reward_std": 4.7254719390821265e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.1528497409326426, + "grad_norm": 0.6394649645583231, + "kl": 0.08172607421875, + "learning_rate": 8.849740932642487e-07, + "loss": 0.0015, + "reward": 2.4999897480010986, + "reward_std": 1.7348398841932067e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895691871643, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 1.1554404145077721, + "grad_norm": 73.74568146481987, + "kl": 0.094970703125, + "learning_rate": 8.847150259067357e-07, + "loss": 0.0004, + "reward": 1.674540638923645, + "reward_std": 0.3161500170826912, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1745406091213226, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 1.1580310880829017, + "grad_norm": 90.57154886616541, + "kl": 0.0751953125, + "learning_rate": 8.844559585492227e-07, + "loss": 0.0003, + "reward": 1.3530938029289246, + "reward_std": 0.18028530236915685, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8530937731266022, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.160621761658031, + "grad_norm": 0.17610403103940198, + "kl": 0.0701904296875, + "learning_rate": 8.841968911917098e-07, + "loss": 0.0003, + "reward": 2.4999970197677612, + "reward_std": 1.3332831656498456e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1632124352331605, + "grad_norm": 0.6657898278232692, + "kl": 0.21649169921875, + "learning_rate": 8.839378238341969e-07, + "loss": -0.0006, + "reward": 2.499995708465576, + "reward_std": 4.492713230774825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.16580310880829, + "grad_norm": 13.036435514484477, + "kl": 0.09765625, + "learning_rate": 8.836787564766839e-07, + "loss": 0.0002, + "reward": 2.4374842643737793, + "reward_std": 0.17680859067291976, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374842047691345, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1683937823834196, + "grad_norm": 1.1405571470651923, + "kl": 0.101318359375, + "learning_rate": 8.83419689119171e-07, + "loss": 0.0008, + "reward": 2.4999808073043823, + "reward_std": 6.84925453242613e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999808073043823, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.1709844559585492, + "grad_norm": 0.21700780087226967, + "kl": 0.11767578125, + "learning_rate": 8.831606217616579e-07, + "loss": 0.0008, + "reward": 2.4999958276748657, + "reward_std": 2.805690996865451e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1735751295336787, + "grad_norm": 28.29878720243635, + "kl": 0.0584716796875, + "learning_rate": 8.82901554404145e-07, + "loss": 0.0005, + "reward": 1.8530709147453308, + "reward_std": 0.02618713528443095, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3530707955360413, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.1761658031088082, + "grad_norm": 23.0520703043227, + "kl": 0.1270751953125, + "learning_rate": 8.826424870466321e-07, + "loss": 0.0012, + "reward": 2.0300532579421997, + "reward_std": 0.18988699556376787, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5300532579421997, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.1787564766839378, + "grad_norm": 42.59667017674438, + "kl": 0.2431640625, + "learning_rate": 8.823834196891191e-07, + "loss": 0.001, + "reward": 1.9351577162742615, + "reward_std": 0.1813949552597478, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4351578652858734, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1813471502590673, + "grad_norm": 3.1976299761041713, + "kl": 0.097564697265625, + "learning_rate": 8.821243523316062e-07, + "loss": 0.0008, + "reward": 2.4999502897262573, + "reward_std": 2.2653130599792348e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999950349330902, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.1839378238341969, + "grad_norm": 1.7616212918628507, + "kl": 0.14697265625, + "learning_rate": 8.818652849740932e-07, + "loss": -0.0005, + "reward": 2.499990940093994, + "reward_std": 7.570316370220098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911189079285, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 1.1865284974093264, + "grad_norm": 17.806489766423955, + "kl": 0.357177734375, + "learning_rate": 8.816062176165802e-07, + "loss": 0.0009, + "reward": 2.436825752258301, + "reward_std": 0.17811651074771362, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9368258714675903, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 1.189119170984456, + "grad_norm": 45.06925648484607, + "kl": 0.16015625, + "learning_rate": 8.813471502590673e-07, + "loss": -0.0002, + "reward": 2.3625733852386475, + "reward_std": 0.2544594280141155, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.862573504447937, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.1917098445595855, + "grad_norm": 0.2893301664645233, + "kl": 0.0760498046875, + "learning_rate": 8.810880829015543e-07, + "loss": 0.0016, + "reward": 2.4999966621398926, + "reward_std": 3.371640161731193e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.194300518134715, + "grad_norm": 0.3122314874399316, + "kl": 0.06396484375, + "learning_rate": 8.808290155440414e-07, + "loss": 0.0001, + "reward": 2.49999463558197, + "reward_std": 5.715547217732819e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.1968911917098446, + "grad_norm": 0.9788897999342062, + "kl": 0.0633544921875, + "learning_rate": 8.805699481865284e-07, + "loss": 0.0002, + "reward": 2.4999680519104004, + "reward_std": 6.617727194679901e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999680519104004, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.1994818652849741, + "grad_norm": 0.18570239241635855, + "kl": 0.087158203125, + "learning_rate": 8.803108808290155e-07, + "loss": -0.0008, + "reward": 2.499997854232788, + "reward_std": 1.1072355903252173e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.2020725388601037, + "grad_norm": 0.10444988339313646, + "kl": 0.011444091796875, + "learning_rate": 8.800518134715025e-07, + "loss": 0.0018, + "reward": 2.4999985694885254, + "reward_std": 1.4295797541308275e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2046632124352332, + "grad_norm": 38.26699080789075, + "kl": 0.1170654296875, + "learning_rate": 8.797927461139895e-07, + "loss": 0.0004, + "reward": 1.9998422861099243, + "reward_std": 2.3600002492685235e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998424053192139, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.2072538860103628, + "grad_norm": 17.141246542347844, + "kl": 0.09912109375, + "learning_rate": 8.795336787564766e-07, + "loss": 0.0003, + "reward": 1.979422926902771, + "reward_std": 0.00020118385555178975, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4794228076934814, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2098445595854923, + "grad_norm": 0.2526893822344503, + "kl": 0.220703125, + "learning_rate": 8.792746113989636e-07, + "loss": 0.0015, + "reward": 2.499997854232788, + "reward_std": 1.947460702922399e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.125, + "epoch": 1.2124352331606219, + "grad_norm": 31.852021532818366, + "kl": 0.14569091796875, + "learning_rate": 8.790155440414507e-07, + "loss": 0.0013, + "reward": 1.9521268010139465, + "reward_std": 0.0022770423521478733, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4521267116069794, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2150259067357512, + "grad_norm": 1.0281744349665374, + "kl": 0.103271484375, + "learning_rate": 8.787564766839379e-07, + "loss": 0.001, + "reward": 1.9993879795074463, + "reward_std": 1.2945804314767884e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993878901004791, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.2176165803108807, + "grad_norm": 0.43410273744995626, + "kl": 0.0716552734375, + "learning_rate": 8.784974093264247e-07, + "loss": 0.0004, + "reward": 2.4999778270721436, + "reward_std": 7.111955937944003e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999777674674988, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2202072538860103, + "grad_norm": 50.59271171474758, + "kl": 0.077392578125, + "learning_rate": 8.782383419689119e-07, + "loss": 0.0003, + "reward": 2.3740792274475098, + "reward_std": 0.3561514914035797, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.874079406261444, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2227979274611398, + "grad_norm": 0.7713101538824298, + "kl": 0.0872802734375, + "learning_rate": 8.77979274611399e-07, + "loss": 0.0006, + "reward": 2.4999958276748657, + "reward_std": 3.7912550112650933e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2253886010362693, + "grad_norm": 21.082820584773735, + "kl": 0.17462158203125, + "learning_rate": 8.77720207253886e-07, + "loss": -0.0002, + "reward": 1.9570937156677246, + "reward_std": 0.025583247080476212, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4570938348770142, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2279792746113989, + "grad_norm": 12.332295672892807, + "kl": 0.12548828125, + "learning_rate": 8.774611398963731e-07, + "loss": 0.0004, + "reward": 1.9994518756866455, + "reward_std": 2.252896024401707e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994519352912903, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2305699481865284, + "grad_norm": 12.085946387998836, + "kl": 0.074951171875, + "learning_rate": 8.772020725388601e-07, + "loss": 0.0, + "reward": 2.499893546104431, + "reward_std": 4.6301764086820185e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999893605709076, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.233160621761658, + "grad_norm": 8.910241412807926, + "kl": 0.0848388671875, + "learning_rate": 8.769430051813471e-07, + "loss": 0.0005, + "reward": 2.499955177307129, + "reward_std": 6.667927027592668e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999550580978394, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2357512953367875, + "grad_norm": 18.492399334575982, + "kl": 0.1121826171875, + "learning_rate": 8.766839378238342e-07, + "loss": 0.0007, + "reward": 2.437414765357971, + "reward_std": 0.1769445626981394, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374147057533264, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 1.238341968911917, + "grad_norm": 15.299927589119523, + "kl": 0.13818359375, + "learning_rate": 8.764248704663212e-07, + "loss": 0.0007, + "reward": 1.8968449831008911, + "reward_std": 0.04106860855790728, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3968449234962463, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.6875, + "epoch": 1.2409326424870466, + "grad_norm": 77.00274354527039, + "kl": 0.1083984375, + "learning_rate": 8.761658031088083e-07, + "loss": 0.0004, + "reward": 2.3105857372283936, + "reward_std": 0.26142529569824546, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8105856776237488, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2435233160621761, + "grad_norm": 0.6616120982092444, + "kl": 0.089599609375, + "learning_rate": 8.759067357512953e-07, + "loss": 0.0009, + "reward": 2.4999921321868896, + "reward_std": 4.195728251943365e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.2461139896373057, + "grad_norm": 3.2939424001045627, + "kl": 0.082275390625, + "learning_rate": 8.756476683937824e-07, + "loss": 0.001, + "reward": 2.4999918937683105, + "reward_std": 2.6570181432816753e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2487046632124352, + "grad_norm": 28.58802241475212, + "kl": 0.072998046875, + "learning_rate": 8.753886010362695e-07, + "loss": 0.0006, + "reward": 1.7892868518829346, + "reward_std": 0.00017940342308975232, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.289286881685257, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2512953367875648, + "grad_norm": 6.292983622135817, + "kl": 0.1826171875, + "learning_rate": 8.751295336787564e-07, + "loss": 0.0008, + "reward": 1.4994452595710754, + "reward_std": 5.78963736188598e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9994453489780426, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2538860103626943, + "grad_norm": 2.064790038323725, + "kl": 0.0653076171875, + "learning_rate": 8.748704663212435e-07, + "loss": 0.0005, + "reward": 2.499991297721863, + "reward_std": 1.1068362482546945e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2564766839378239, + "grad_norm": 38.42058225974973, + "kl": 0.2515869140625, + "learning_rate": 8.746113989637305e-07, + "loss": 0.0014, + "reward": 2.3749624490737915, + "reward_std": 0.23151093343540197, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.874962329864502, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.2590673575129534, + "grad_norm": 21.885704126952792, + "kl": 0.218505859375, + "learning_rate": 8.743523316062176e-07, + "loss": 0.0003, + "reward": 1.7940752506256104, + "reward_std": 0.03148522444280388, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2940754890441895, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.261658031088083, + "grad_norm": 1.1747920062815884, + "kl": 0.13525390625, + "learning_rate": 8.740932642487047e-07, + "loss": -0.0007, + "reward": 2.4999890327453613, + "reward_std": 1.1638317744200322e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999891519546509, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 1.2642487046632125, + "grad_norm": 24.412740604956966, + "kl": 0.1484375, + "learning_rate": 8.738341968911916e-07, + "loss": 0.0006, + "reward": 2.3748098611831665, + "reward_std": 0.35388536751270294, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748098015785217, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.266839378238342, + "grad_norm": 32.23039537284305, + "kl": 0.139892578125, + "learning_rate": 8.735751295336787e-07, + "loss": 0.0001, + "reward": 2.312269926071167, + "reward_std": 0.25883071099815425, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8122699856758118, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2694300518134716, + "grad_norm": 3.342467593195098, + "kl": 0.10400390625, + "learning_rate": 8.733160621761657e-07, + "loss": 0.0014, + "reward": 2.499990940093994, + "reward_std": 1.0768087861379172e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.2720207253886011, + "grad_norm": 6.411567545415011, + "kl": 0.0732421875, + "learning_rate": 8.730569948186528e-07, + "loss": 0.0003, + "reward": 2.4999295473098755, + "reward_std": 5.353304868549458e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999294877052307, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.2746113989637307, + "grad_norm": 154.2222808395924, + "kl": 0.11083984375, + "learning_rate": 8.727979274611399e-07, + "loss": 0.0009, + "reward": 2.374971866607666, + "reward_std": 0.23148871652870184, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749717473983765, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 1.2772020725388602, + "grad_norm": 35.835700660601134, + "kl": 0.0828857421875, + "learning_rate": 8.725388601036269e-07, + "loss": 0.0, + "reward": 2.1736336946487427, + "reward_std": 0.27024298158335114, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6736337542533875, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2797927461139897, + "grad_norm": 2.676829314252972, + "kl": 0.15869140625, + "learning_rate": 8.72279792746114e-07, + "loss": 0.0004, + "reward": 2.499984860420227, + "reward_std": 1.3475339756041649e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999849200248718, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2823834196891193, + "grad_norm": 7.331169443033464, + "kl": 0.14501953125, + "learning_rate": 8.720207253886009e-07, + "loss": 0.0002, + "reward": 2.4999111890792847, + "reward_std": 3.078954341617646e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99991112947464, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2849740932642488, + "grad_norm": 45.42929805913324, + "kl": 0.1204833984375, + "learning_rate": 8.71761658031088e-07, + "loss": 0.0006, + "reward": 1.4907369017601013, + "reward_std": 0.00031247303559212014, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9907369017601013, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2875647668393784, + "grad_norm": 32.718935169936096, + "kl": 0.135498046875, + "learning_rate": 8.715025906735751e-07, + "loss": 0.0005, + "reward": 1.4990538954734802, + "reward_std": 0.00022890909895068035, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9990538656711578, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.2901554404145077, + "grad_norm": 4.111037862460617, + "kl": 0.1314697265625, + "learning_rate": 8.712435233160621e-07, + "loss": 0.0007, + "reward": 1.9187270402908325, + "reward_std": 0.0003245865591452457, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4187270402908325, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.2927461139896372, + "grad_norm": 0.21846995165919592, + "kl": 0.072998046875, + "learning_rate": 8.709844559585492e-07, + "loss": -0.0004, + "reward": 2.4999977350234985, + "reward_std": 1.2924656971335935e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.2953367875647668, + "grad_norm": 0.22255898956838358, + "kl": 0.14990234375, + "learning_rate": 8.707253886010363e-07, + "loss": 0.0002, + "reward": 2.499997138977051, + "reward_std": 1.551056698190223e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.2979274611398963, + "grad_norm": 19.646926760605268, + "kl": 0.15966796875, + "learning_rate": 8.704663212435232e-07, + "loss": 0.0006, + "reward": 2.4998767375946045, + "reward_std": 5.471971962833777e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998767375946045, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3005181347150259, + "grad_norm": 91.19680000980189, + "kl": 0.086669921875, + "learning_rate": 8.702072538860103e-07, + "loss": -0.0002, + "reward": 2.374948740005493, + "reward_std": 0.2315441130643876, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749488592147827, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.3031088082901554, + "grad_norm": 1.6592695638499924, + "kl": 0.052490234375, + "learning_rate": 8.699481865284973e-07, + "loss": -0.0017, + "reward": 2.499985456466675, + "reward_std": 8.230225830629934e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985694885254, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.305699481865285, + "grad_norm": 63.17525684276914, + "kl": 0.12225341796875, + "learning_rate": 8.696891191709844e-07, + "loss": 0.0009, + "reward": 2.249861478805542, + "reward_std": 0.2674036819310004, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7498613595962524, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3082901554404145, + "grad_norm": 4.3793934638945995, + "kl": 0.1055908203125, + "learning_rate": 8.694300518134715e-07, + "loss": 0.0001, + "reward": 2.499991536140442, + "reward_std": 9.018640184876858e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991536140442, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.310880829015544, + "grad_norm": 14.794700219470856, + "kl": 0.105224609375, + "learning_rate": 8.691709844559585e-07, + "loss": 0.0001, + "reward": 2.4374682903289795, + "reward_std": 0.17682419877246502, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374682903289795, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.3134715025906736, + "grad_norm": 237.4234872129574, + "kl": 0.1923828125, + "learning_rate": 8.689119170984455e-07, + "loss": 0.0008, + "reward": 1.8867421746253967, + "reward_std": 0.27173711359500885, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3867421448230743, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.3160621761658031, + "grad_norm": 112.39196712868807, + "kl": 0.085205078125, + "learning_rate": 8.686528497409325e-07, + "loss": 0.0003, + "reward": 2.437467336654663, + "reward_std": 0.1768370179406702, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374674558639526, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3186528497409327, + "grad_norm": 1.2578278527512097, + "kl": 0.1103515625, + "learning_rate": 8.683937823834196e-07, + "loss": 0.0006, + "reward": 2.499987840652466, + "reward_std": 5.7642498632048955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999878406524658, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3212435233160622, + "grad_norm": 0.43174782510600457, + "kl": 0.0540771484375, + "learning_rate": 8.681347150259068e-07, + "loss": 0.0008, + "reward": 2.4999935626983643, + "reward_std": 5.187495048630808e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3238341968911918, + "grad_norm": 5.038778187731034, + "kl": 0.114501953125, + "learning_rate": 8.678756476683938e-07, + "loss": 0.0003, + "reward": 1.8212202191352844, + "reward_std": 0.00020774168297066353, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3212201595306396, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.3264248704663213, + "grad_norm": 52.99931971886971, + "kl": 0.1025390625, + "learning_rate": 8.676165803108809e-07, + "loss": 0.0005, + "reward": 1.4864550828933716, + "reward_std": 0.017443951954192016, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9864550232887268, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.3290155440414508, + "grad_norm": 0.9693997196557682, + "kl": 0.0501708984375, + "learning_rate": 8.673575129533677e-07, + "loss": 0.0008, + "reward": 2.499989151954651, + "reward_std": 7.076597967170528e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989092350006, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3316062176165804, + "grad_norm": 2.1328251997079617, + "kl": 0.14794921875, + "learning_rate": 8.670984455958549e-07, + "loss": 0.0005, + "reward": 1.9984451532363892, + "reward_std": 3.9576490735271364e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984452426433563, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.33419689119171, + "grad_norm": 1.03276895354779, + "kl": 0.0906982421875, + "learning_rate": 8.66839378238342e-07, + "loss": -0.0004, + "reward": 2.499991297721863, + "reward_std": 2.4609428805888456e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.3367875647668392, + "grad_norm": 137.73385327658656, + "kl": 0.0927734375, + "learning_rate": 8.66580310880829e-07, + "loss": 0.0004, + "reward": 1.8059165477752686, + "reward_std": 0.0012951576063642278, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.305916428565979, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3393782383419688, + "grad_norm": 2.007081933441151, + "kl": 0.121337890625, + "learning_rate": 8.663212435233161e-07, + "loss": 0.0009, + "reward": 2.4999887943267822, + "reward_std": 1.0784114465423045e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988853931427, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.3419689119170983, + "grad_norm": 1.8378451582766666, + "kl": 0.078369140625, + "learning_rate": 8.660621761658031e-07, + "loss": -0.0001, + "reward": 2.49996280670166, + "reward_std": 1.911813257038375e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999628067016602, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.3445595854922279, + "grad_norm": 0.3779511835398549, + "kl": 0.06488037109375, + "learning_rate": 8.658031088082901e-07, + "loss": 0.0005, + "reward": 2.499882221221924, + "reward_std": 7.627893637618399e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999882161617279, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3471502590673574, + "grad_norm": 134.17842802835654, + "kl": 0.06591796875, + "learning_rate": 8.655440414507772e-07, + "loss": 0.0009, + "reward": 1.999657690525055, + "reward_std": 8.30407136049871e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996576309204102, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.349740932642487, + "grad_norm": 17.932654112913696, + "kl": 0.17724609375, + "learning_rate": 8.652849740932642e-07, + "loss": 0.0007, + "reward": 1.9529168605804443, + "reward_std": 0.0004509269642767322, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4529170393943787, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3523316062176165, + "grad_norm": 0.8370015530703874, + "kl": 0.100341796875, + "learning_rate": 8.650259067357513e-07, + "loss": 0.0003, + "reward": 2.4999746084213257, + "reward_std": 3.4588146036185208e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999748468399048, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.354922279792746, + "grad_norm": 1.1187172745983656, + "kl": 0.0562744140625, + "learning_rate": 8.647668393782384e-07, + "loss": 0.0003, + "reward": 2.4999914169311523, + "reward_std": 5.636387641061447e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991536140442, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 1.3575129533678756, + "grad_norm": 5.475814554760311, + "kl": 0.111083984375, + "learning_rate": 8.645077720207254e-07, + "loss": -0.0006, + "reward": 1.921807050704956, + "reward_std": 0.04307932459528274, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4218071699142456, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.9375, + "epoch": 1.3601036269430051, + "grad_norm": 27.31737565998964, + "kl": 0.34423828125, + "learning_rate": 8.642487046632124e-07, + "loss": 0.0015, + "reward": 1.0622394680976868, + "reward_std": 0.5786689094893518, + "rewards/format_reward_rec": 0.625, + "rewards/point_reward": 0.7497394382953644, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.6875, + "epoch": 1.3626943005181347, + "grad_norm": 18.04920430476347, + "kl": 0.218505859375, + "learning_rate": 8.639896373056994e-07, + "loss": 0.0013, + "reward": 2.366421103477478, + "reward_std": 0.24732503924860794, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.866421103477478, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.3652849740932642, + "grad_norm": 1.5128707554351777, + "kl": 0.06903076171875, + "learning_rate": 8.637305699481865e-07, + "loss": 0.0002, + "reward": 1.9997016191482544, + "reward_std": 1.0818416740221437e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997015297412872, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3678756476683938, + "grad_norm": 17.892273651923272, + "kl": 0.150146484375, + "learning_rate": 8.634715025906736e-07, + "loss": 0.0003, + "reward": 1.9563751816749573, + "reward_std": 0.08006607417064515, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4563751220703125, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3704663212435233, + "grad_norm": 49.49759300623319, + "kl": 0.121826171875, + "learning_rate": 8.632124352331606e-07, + "loss": 0.0004, + "reward": 1.9985511898994446, + "reward_std": 9.44735438679345e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985513389110565, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.625, + "epoch": 1.3730569948186528, + "grad_norm": 79.33228042959712, + "kl": 0.102294921875, + "learning_rate": 8.629533678756477e-07, + "loss": 0.0006, + "reward": 2.041730046272278, + "reward_std": 0.18517519126544357, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.541729986667633, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3756476683937824, + "grad_norm": 1.6022727320632366, + "kl": 0.061279296875, + "learning_rate": 8.626943005181346e-07, + "loss": 0.0005, + "reward": 2.499955415725708, + "reward_std": 1.3513888006855268e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999955177307129, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.378238341968912, + "grad_norm": 0.10301912494261409, + "kl": 0.15478515625, + "learning_rate": 8.624352331606217e-07, + "loss": 0.0003, + "reward": 2.4999966621398926, + "reward_std": 1.4547185287483444e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3808290155440415, + "grad_norm": 16.777498631738734, + "kl": 0.1572265625, + "learning_rate": 8.621761658031088e-07, + "loss": 0.0007, + "reward": 1.9369465112686157, + "reward_std": 0.17685012157380697, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4369465112686157, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.383419689119171, + "grad_norm": 0.10887296905314073, + "kl": 0.09130859375, + "learning_rate": 8.619170984455958e-07, + "loss": 0.0002, + "reward": 2.4999756813049316, + "reward_std": 2.958052277790557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999756217002869, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.375, + "epoch": 1.3860103626943006, + "grad_norm": 124.90872629360128, + "kl": 0.1058349609375, + "learning_rate": 8.616580310880829e-07, + "loss": 0.0004, + "reward": 2.2836925983428955, + "reward_std": 0.4896298348903656, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7836925983428955, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.38860103626943, + "grad_norm": 197.87930575897707, + "kl": 0.09130859375, + "learning_rate": 8.613989637305699e-07, + "loss": 0.0004, + "reward": 1.623317539691925, + "reward_std": 0.23245796479341152, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1233174204826355, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.3911917098445596, + "grad_norm": 0.7908674683704721, + "kl": 0.0897216796875, + "learning_rate": 8.611398963730569e-07, + "loss": 0.0009, + "reward": 2.4999709129333496, + "reward_std": 5.016916929889703e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999708533287048, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3937823834196892, + "grad_norm": 0.5765236754049301, + "kl": 0.080810546875, + "learning_rate": 8.60880829015544e-07, + "loss": 0.0004, + "reward": 2.4999920129776, + "reward_std": 4.623262384484406e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 1.3963730569948187, + "grad_norm": 28.323006954809713, + "kl": 0.0325927734375, + "learning_rate": 8.60621761658031e-07, + "loss": 0.0002, + "reward": 2.1874371767044067, + "reward_std": 0.2588396147421008, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687437117099762, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.3989637305699483, + "grad_norm": 0.5693864540919378, + "kl": 0.13818359375, + "learning_rate": 8.603626943005181e-07, + "loss": 0.0008, + "reward": 2.4999932050704956, + "reward_std": 4.561453579299268e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4015544041450778, + "grad_norm": 3.5784122996785745, + "kl": 0.047607421875, + "learning_rate": 8.601036269430051e-07, + "loss": 0.0005, + "reward": 2.4999213218688965, + "reward_std": 1.5988063751137815e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999213218688965, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 1.4041450777202074, + "grad_norm": 48.66807689118439, + "kl": 0.093017578125, + "learning_rate": 8.598445595854922e-07, + "loss": -0.0005, + "reward": 2.0504003763198853, + "reward_std": 0.2775210708193754, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.55040043592453, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.406735751295337, + "grad_norm": 0.17729972908932326, + "kl": 0.1243896484375, + "learning_rate": 8.595854922279792e-07, + "loss": 0.0002, + "reward": 2.499998092651367, + "reward_std": 1.2256335253368889e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4093264248704664, + "grad_norm": 1.031157083983844, + "kl": 0.057373046875, + "learning_rate": 8.593264248704662e-07, + "loss": -0.0002, + "reward": 2.4999918937683105, + "reward_std": 5.470992618938908e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.411917098445596, + "grad_norm": 0.19099110947313502, + "kl": 0.17041015625, + "learning_rate": 8.590673575129533e-07, + "loss": 0.0011, + "reward": 2.4999969005584717, + "reward_std": 2.149495315961758e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.4145077720207253, + "grad_norm": 15.449388787526583, + "kl": 0.20703125, + "learning_rate": 8.588082901554404e-07, + "loss": 0.0005, + "reward": 1.7858158349990845, + "reward_std": 0.2423164664542128, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.285815954208374, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 1.4170984455958548, + "grad_norm": 21.75149074064552, + "kl": 0.28759765625, + "learning_rate": 8.585492227979274e-07, + "loss": 0.0007, + "reward": 1.917125940322876, + "reward_std": 0.257729121552984, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.417125940322876, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4196891191709844, + "grad_norm": 2.9702775269444937, + "kl": 0.0458984375, + "learning_rate": 8.582901554404145e-07, + "loss": 0.0, + "reward": 2.499992251396179, + "reward_std": 5.324892072167131e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.422279792746114, + "grad_norm": 3.810246069028726, + "kl": 0.1162109375, + "learning_rate": 8.580310880829014e-07, + "loss": 0.0, + "reward": 2.499987483024597, + "reward_std": 1.6177102395431575e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999874234199524, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 1.4248704663212435, + "grad_norm": 15.54805209625986, + "kl": 0.81787109375, + "learning_rate": 8.577720207253885e-07, + "loss": 0.0027, + "reward": 2.2348156571388245, + "reward_std": 0.28371294361329547, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7348155975341797, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 1.427461139896373, + "grad_norm": 22.946519579451778, + "kl": 0.10302734375, + "learning_rate": 8.575129533678756e-07, + "loss": 0.0001, + "reward": 1.9609906673431396, + "reward_std": 0.09214628321743135, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4609908163547516, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4300518134715026, + "grad_norm": 20.4646282492878, + "kl": 0.0438232421875, + "learning_rate": 8.572538860103626e-07, + "loss": 0.0005, + "reward": 2.374976634979248, + "reward_std": 0.23149483580868946, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.874976634979248, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.432642487046632, + "grad_norm": 0.39088450896059135, + "kl": 0.05615234375, + "learning_rate": 8.569948186528498e-07, + "loss": 0.0002, + "reward": 2.4999959468841553, + "reward_std": 2.5724719421305053e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4352331606217616, + "grad_norm": 10.509556079498923, + "kl": 0.0721435546875, + "learning_rate": 8.567357512953368e-07, + "loss": 0.0006, + "reward": 1.9992655515670776, + "reward_std": 3.6277227991377003e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992654919624329, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4378238341968912, + "grad_norm": 0.6360938765933905, + "kl": 0.10546875, + "learning_rate": 8.564766839378238e-07, + "loss": 0.0008, + "reward": 2.4999654293060303, + "reward_std": 7.2180823735834565e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999655485153198, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4404145077720207, + "grad_norm": 2.6689085455051393, + "kl": 0.117431640625, + "learning_rate": 8.562176165803109e-07, + "loss": 0.0012, + "reward": 2.4999722242355347, + "reward_std": 9.116790636198857e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99997216463089, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4430051813471503, + "grad_norm": 1.457392585775066, + "kl": 0.0732421875, + "learning_rate": 8.559585492227979e-07, + "loss": -0.0005, + "reward": 1.999876618385315, + "reward_std": 6.7578973812487675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998767375946045, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4455958549222798, + "grad_norm": 96.80446261646995, + "kl": 0.0946044921875, + "learning_rate": 8.55699481865285e-07, + "loss": -0.0002, + "reward": 2.296552300453186, + "reward_std": 0.2808211346227836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7965522408485413, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4481865284974094, + "grad_norm": 14.899907420105606, + "kl": 0.205078125, + "learning_rate": 8.55440414507772e-07, + "loss": 0.0013, + "reward": 2.437370538711548, + "reward_std": 0.17711095710717473, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373704195022583, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.450777202072539, + "grad_norm": 54.26997364778638, + "kl": 0.06689453125, + "learning_rate": 8.551813471502591e-07, + "loss": 0.0005, + "reward": 1.9200925827026367, + "reward_std": 0.03077354779952657, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4200924038887024, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4533678756476685, + "grad_norm": 170.31019157117908, + "kl": 0.1650390625, + "learning_rate": 8.549222797927461e-07, + "loss": 0.0007, + "reward": 1.8056821823120117, + "reward_std": 0.0030451994288682727, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3056823015213013, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.455958549222798, + "grad_norm": 0.9094885943424723, + "kl": 0.125244140625, + "learning_rate": 8.546632124352331e-07, + "loss": 0.0021, + "reward": 2.499968409538269, + "reward_std": 1.0164353852815111e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999680519104004, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4585492227979275, + "grad_norm": 13.893688669120996, + "kl": 0.0482177734375, + "learning_rate": 8.544041450777202e-07, + "loss": 0.0002, + "reward": 1.9989761114120483, + "reward_std": 6.966816772546736e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498976081609726, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4611398963730569, + "grad_norm": 0.24691892359347203, + "kl": 0.0596923828125, + "learning_rate": 8.541450777202072e-07, + "loss": 0.0008, + "reward": 2.4999735355377197, + "reward_std": 4.274718094166019e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973475933075, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4637305699481864, + "grad_norm": 80.70132538685037, + "kl": 0.2451171875, + "learning_rate": 8.538860103626943e-07, + "loss": 0.001, + "reward": 2.119768977165222, + "reward_std": 0.23466921336444102, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6197689771652222, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.466321243523316, + "grad_norm": 2.2833243782909993, + "kl": 0.161376953125, + "learning_rate": 8.536269430051814e-07, + "loss": 0.0013, + "reward": 2.4999818801879883, + "reward_std": 6.4477480350433325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999981701374054, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.4689119170984455, + "grad_norm": 0.17769710559204552, + "kl": 0.0533447265625, + "learning_rate": 8.533678756476683e-07, + "loss": 0.0014, + "reward": 2.499998927116394, + "reward_std": 8.398082229632564e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.471502590673575, + "grad_norm": 8.834662862773344, + "kl": 0.11328125, + "learning_rate": 8.531088082901554e-07, + "loss": -0.0001, + "reward": 1.8848623037338257, + "reward_std": 0.0004728440103463072, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3848623931407928, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.4740932642487046, + "grad_norm": 1.118903441198404, + "kl": 0.0631103515625, + "learning_rate": 8.528497409326425e-07, + "loss": 0.0009, + "reward": 2.499974250793457, + "reward_std": 5.876585788655575e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999974250793457, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4766839378238341, + "grad_norm": 0.24970816334285134, + "kl": 0.088134765625, + "learning_rate": 8.525906735751295e-07, + "loss": 0.0006, + "reward": 2.4999921321868896, + "reward_std": 2.663360589849617e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920725822449, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4792746113989637, + "grad_norm": 25.24810264907173, + "kl": 0.130859375, + "learning_rate": 8.523316062176166e-07, + "loss": 0.0012, + "reward": 2.499987244606018, + "reward_std": 5.787460963802005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999870657920837, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 1.4818652849740932, + "grad_norm": 0.7292134978182698, + "kl": 0.072998046875, + "learning_rate": 8.520725388601036e-07, + "loss": 0.0004, + "reward": 2.499993681907654, + "reward_std": 3.617406207467866e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4844559585492227, + "grad_norm": 10.11262971199148, + "kl": 0.0626220703125, + "learning_rate": 8.518134715025906e-07, + "loss": 0.0008, + "reward": 1.9956601858139038, + "reward_std": 0.0004383593468446634, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.495660126209259, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4870466321243523, + "grad_norm": 6.923407931749802, + "kl": 0.100830078125, + "learning_rate": 8.515544041450777e-07, + "loss": 0.0013, + "reward": 1.9345734119415283, + "reward_std": 0.0016179480317077832, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4345735013484955, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.4896373056994818, + "grad_norm": 3.210986057446249, + "kl": 0.1279296875, + "learning_rate": 8.512953367875647e-07, + "loss": 0.0002, + "reward": 1.998543381690979, + "reward_std": 4.682441522163572e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985434114933014, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.375, + "epoch": 1.4922279792746114, + "grad_norm": 118.22210432319872, + "kl": 0.4593505859375, + "learning_rate": 8.510362694300518e-07, + "loss": 0.002, + "reward": 2.3113245964050293, + "reward_std": 0.26039084413665137, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8113245964050293, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.494818652849741, + "grad_norm": 16.97391608660472, + "kl": 0.05487060546875, + "learning_rate": 8.507772020725388e-07, + "loss": -0.0001, + "reward": 2.475416660308838, + "reward_std": 0.010014679694904771, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9754165410995483, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.4974093264248705, + "grad_norm": 115.60404174454798, + "kl": 0.08074951171875, + "learning_rate": 8.505181347150259e-07, + "loss": 0.0003, + "reward": 1.9988738298416138, + "reward_std": 6.560475759442852e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498873770236969, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5, + "grad_norm": 16.39430906170559, + "kl": 0.159423828125, + "learning_rate": 8.502590673575129e-07, + "loss": 0.0011, + "reward": 2.4999295473098755, + "reward_std": 3.901746913470561e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999294877052307, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5025906735751295, + "grad_norm": 3.709042866453429, + "kl": 0.15106201171875, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0016, + "reward": 2.4999505281448364, + "reward_std": 4.016945126750215e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999504089355469, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.505181347150259, + "grad_norm": 0.5414945430223663, + "kl": 0.12841796875, + "learning_rate": 8.49740932642487e-07, + "loss": 0.0009, + "reward": 2.499996542930603, + "reward_std": 4.480836423681467e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 1.5077720207253886, + "grad_norm": 0.04387608481162616, + "kl": 0.0703125, + "learning_rate": 8.49481865284974e-07, + "loss": 0.0003, + "reward": 2.4999992847442627, + "reward_std": 5.225200823133491e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999463558197, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5103626943005182, + "grad_norm": 15.262042537624769, + "kl": 0.119384765625, + "learning_rate": 8.492227979274611e-07, + "loss": 0.0001, + "reward": 2.4998769760131836, + "reward_std": 7.171521201598807e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998770952224731, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 1.5129533678756477, + "grad_norm": 49.153036232209615, + "kl": 0.180419921875, + "learning_rate": 8.489637305699482e-07, + "loss": 0.0007, + "reward": 1.92996084690094, + "reward_std": 0.18613753374665976, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4299608170986176, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5155440414507773, + "grad_norm": 1.1414134842639188, + "kl": 0.06103515625, + "learning_rate": 8.487046632124351e-07, + "loss": 0.0007, + "reward": 2.499955654144287, + "reward_std": 7.617499704792863e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999555945396423, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.5181347150259068, + "grad_norm": 0.1102302623226846, + "kl": 0.08251953125, + "learning_rate": 8.484455958549222e-07, + "loss": -0.0003, + "reward": 2.499996304512024, + "reward_std": 1.9968337028331007e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5207253886010363, + "grad_norm": 0.1853898651319823, + "kl": 0.0579833984375, + "learning_rate": 8.481865284974092e-07, + "loss": 0.0004, + "reward": 2.499997854232788, + "reward_std": 1.4827681411588856e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5233160621761659, + "grad_norm": 64.06439151411772, + "kl": 0.091064453125, + "learning_rate": 8.479274611398963e-07, + "loss": -0.0001, + "reward": 2.4995049238204956, + "reward_std": 6.929871960892342e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99950510263443, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.5259067357512954, + "grad_norm": 0.257982181347981, + "kl": 0.22802734375, + "learning_rate": 8.476683937823834e-07, + "loss": 0.0015, + "reward": 2.4999969005584717, + "reward_std": 4.665788935653836e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.528497409326425, + "grad_norm": 0.6143889215696653, + "kl": 0.1083984375, + "learning_rate": 8.474093264248704e-07, + "loss": -0.0003, + "reward": 2.499996304512024, + "reward_std": 2.9716395886225655e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.5310880829015545, + "grad_norm": 33.29279444262581, + "kl": 0.108154296875, + "learning_rate": 8.471502590673574e-07, + "loss": 0.0002, + "reward": 2.3748679161071777, + "reward_std": 0.23151225593392155, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748680353164673, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.533678756476684, + "grad_norm": 68.05081380146056, + "kl": 0.108154296875, + "learning_rate": 8.468911917098444e-07, + "loss": 0.0011, + "reward": 2.498900055885315, + "reward_std": 0.00018090060984832235, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9988999366760254, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 1.5362694300518136, + "grad_norm": 0.5801958685987406, + "kl": 0.04302978515625, + "learning_rate": 8.466321243523315e-07, + "loss": -0.0007, + "reward": 2.499996304512024, + "reward_std": 3.393692395547987e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.5388601036269431, + "grad_norm": 5.008065130916608, + "kl": 0.100341796875, + "learning_rate": 8.463730569948186e-07, + "loss": -0.0006, + "reward": 1.9089406728744507, + "reward_std": 0.0002627334563385375, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4089407920837402, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.1875, + "epoch": 1.5414507772020727, + "grad_norm": 20.83614436655994, + "kl": 0.09765625, + "learning_rate": 8.461139896373056e-07, + "loss": 0.0001, + "reward": 2.1044222116470337, + "reward_std": 0.24415511500592402, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6044222116470337, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.5440414507772022, + "grad_norm": 21.50165598420066, + "kl": 0.1143798828125, + "learning_rate": 8.458549222797928e-07, + "loss": 0.0009, + "reward": 2.499922513961792, + "reward_std": 8.435450786237197e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999922513961792, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.5466321243523318, + "grad_norm": 21.63090716391322, + "kl": 0.14208984375, + "learning_rate": 8.455958549222799e-07, + "loss": 0.0014, + "reward": 2.4271273612976074, + "reward_std": 0.20610876871108985, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9271273016929626, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.549222797927461, + "grad_norm": 5.453637215366586, + "kl": 0.099853515625, + "learning_rate": 8.453367875647668e-07, + "loss": 0.0001, + "reward": 1.9999055862426758, + "reward_std": 2.6277070901414845e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999056458473206, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5518134715025906, + "grad_norm": 15.384719875272125, + "kl": 0.116455078125, + "learning_rate": 8.450777202072539e-07, + "loss": 0.0004, + "reward": 1.8122720122337341, + "reward_std": 0.2587905696236703, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3122718930244446, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.5544041450777202, + "grad_norm": 0.616532429867594, + "kl": 0.0955810546875, + "learning_rate": 8.448186528497409e-07, + "loss": 0.0004, + "reward": 2.499997854232788, + "reward_std": 1.8724124970503908e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5569948186528497, + "grad_norm": 4.341940647647111, + "kl": 0.080810546875, + "learning_rate": 8.44559585492228e-07, + "loss": 0.0013, + "reward": 2.4999802112579346, + "reward_std": 1.88822182281001e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980092048645, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5595854922279793, + "grad_norm": 40.68992921359032, + "kl": 0.17626953125, + "learning_rate": 8.443005181347151e-07, + "loss": 0.0004, + "reward": 1.9371178150177002, + "reward_std": 0.1775104302305408, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4371178448200226, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5621761658031088, + "grad_norm": 0.5326278638232943, + "kl": 0.0965576171875, + "learning_rate": 8.44041450777202e-07, + "loss": -0.0006, + "reward": 2.4999624490737915, + "reward_std": 3.241361184791458e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999962568283081, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5647668393782384, + "grad_norm": 15.690180864985187, + "kl": 0.1494140625, + "learning_rate": 8.437823834196891e-07, + "loss": 0.0001, + "reward": 2.499652147293091, + "reward_std": 8.124888370275585e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996520280838013, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.567357512953368, + "grad_norm": 24.444172968130243, + "kl": 0.5166015625, + "learning_rate": 8.435233160621761e-07, + "loss": 0.0019, + "reward": 2.2499672174453735, + "reward_std": 0.2672818519795328, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749967098236084, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5699481865284974, + "grad_norm": 13.404009997872135, + "kl": 0.0596923828125, + "learning_rate": 8.432642487046632e-07, + "loss": -0.0, + "reward": 2.4999488592147827, + "reward_std": 3.0938746022002306e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999488592147827, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.572538860103627, + "grad_norm": 77.39581247110588, + "kl": 0.0791015625, + "learning_rate": 8.430051813471503e-07, + "loss": 0.0003, + "reward": 2.36248779296875, + "reward_std": 0.25471037908980065, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8624878525733948, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.5751295336787565, + "grad_norm": 1.9008766491155655, + "kl": 0.102783203125, + "learning_rate": 8.427461139896373e-07, + "loss": 0.0004, + "reward": 1.9998862147331238, + "reward_std": 1.2667527585108473e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998860359191895, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.577720207253886, + "grad_norm": 1.9828803876834276, + "kl": 0.12109375, + "learning_rate": 8.424870466321244e-07, + "loss": -0.0008, + "reward": 2.4999622106552124, + "reward_std": 8.237246333919757e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999626278877258, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5803108808290154, + "grad_norm": 2.79710111954267, + "kl": 0.0574951171875, + "learning_rate": 8.422279792746113e-07, + "loss": -0.0006, + "reward": 1.9982621669769287, + "reward_std": 5.059276645624777e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4982622861862183, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.582901554404145, + "grad_norm": 39.1047303736378, + "kl": 0.0518798828125, + "learning_rate": 8.419689119170984e-07, + "loss": 0.0005, + "reward": 2.062355160713196, + "reward_std": 0.17683401459413517, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5623551607131958, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5854922279792745, + "grad_norm": 9.460467075561967, + "kl": 0.1142578125, + "learning_rate": 8.417098445595855e-07, + "loss": 0.0011, + "reward": 2.4999747276306152, + "reward_std": 4.3438417833385756e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999746680259705, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.588082901554404, + "grad_norm": 0.9294845620488548, + "kl": 0.0755615234375, + "learning_rate": 8.414507772020725e-07, + "loss": 0.0013, + "reward": 2.499990463256836, + "reward_std": 7.726110425210209e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.5906735751295336, + "grad_norm": 12.931335461271505, + "kl": 0.090576171875, + "learning_rate": 8.411917098445596e-07, + "loss": 0.0004, + "reward": 1.6791605949401855, + "reward_std": 0.2087959760101512, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1791605949401855, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.593264248704663, + "grad_norm": 16.92869409317597, + "kl": 0.0999755859375, + "learning_rate": 8.409326424870465e-07, + "loss": -0.0001, + "reward": 2.499935746192932, + "reward_std": 2.266797218908323e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999355673789978, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 1.5958549222797926, + "grad_norm": 2.6611294623667967, + "kl": 0.10400390625, + "learning_rate": 8.406735751295336e-07, + "loss": 0.0014, + "reward": 1.999908208847046, + "reward_std": 7.677721214349731e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499908059835434, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.5984455958549222, + "grad_norm": 55.501815930975496, + "kl": 0.039794921875, + "learning_rate": 8.404145077720207e-07, + "loss": -0.0003, + "reward": 2.3749834299087524, + "reward_std": 0.23147836945463496, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.874983549118042, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6010362694300517, + "grad_norm": 5.879550773391461, + "kl": 0.124267578125, + "learning_rate": 8.401554404145077e-07, + "loss": 0.0011, + "reward": 2.4999661445617676, + "reward_std": 2.994358374053263e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999660849571228, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.6036269430051813, + "grad_norm": 5.685433488905754, + "kl": 0.043212890625, + "learning_rate": 8.398963730569948e-07, + "loss": -0.0005, + "reward": 2.4999927282333374, + "reward_std": 8.184799071386806e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6062176165803108, + "grad_norm": 0.28759157221521486, + "kl": 0.0986328125, + "learning_rate": 8.396373056994819e-07, + "loss": -0.0011, + "reward": 2.4999990463256836, + "reward_std": 5.998403480589332e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999993443489075, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 1.6088082901554404, + "grad_norm": 1.9602929056740013, + "kl": 0.0538330078125, + "learning_rate": 8.393782383419689e-07, + "loss": 0.0009, + "reward": 2.49999463558197, + "reward_std": 5.903515841509943e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999944567680359, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.61139896373057, + "grad_norm": 49.769841735673076, + "kl": 0.03765869140625, + "learning_rate": 8.391191709844559e-07, + "loss": -0.0003, + "reward": 2.3749756813049316, + "reward_std": 0.23147984719605574, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749755024909973, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.6139896373056994, + "grad_norm": 5.227372879468587, + "kl": 0.077392578125, + "learning_rate": 8.388601036269429e-07, + "loss": 0.0007, + "reward": 2.2499500513076782, + "reward_std": 0.26726438726723245, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499499917030334, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.616580310880829, + "grad_norm": 19.784274226086808, + "kl": 0.161865234375, + "learning_rate": 8.3860103626943e-07, + "loss": 0.0002, + "reward": 2.3733383417129517, + "reward_std": 0.23447314692953114, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8733383417129517, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6191709844559585, + "grad_norm": 0.7808235863543724, + "kl": 0.115478515625, + "learning_rate": 8.383419689119171e-07, + "loss": 0.0006, + "reward": 2.4999964237213135, + "reward_std": 2.9871059723518556e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.621761658031088, + "grad_norm": 27.70381217077696, + "kl": 0.145263671875, + "learning_rate": 8.380829015544041e-07, + "loss": 0.0004, + "reward": 1.9996799230575562, + "reward_std": 4.7849290695012314e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996800422668457, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6243523316062176, + "grad_norm": 32.867843698002496, + "kl": 0.0699462890625, + "learning_rate": 8.378238341968912e-07, + "loss": -0.0005, + "reward": 1.9922881126403809, + "reward_std": 4.9706817662809044e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4922881126403809, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6269430051813472, + "grad_norm": 16.520671273835518, + "kl": 0.0830078125, + "learning_rate": 8.375647668393781e-07, + "loss": 0.0001, + "reward": 2.1247638463974, + "reward_std": 0.23160198168898205, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6247639060020447, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6295336787564767, + "grad_norm": 237.61248171008873, + "kl": 0.06488037109375, + "learning_rate": 8.373056994818652e-07, + "loss": 0.0006, + "reward": 1.9715783596038818, + "reward_std": 0.032443345795968526, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4715781807899475, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.6321243523316062, + "grad_norm": 0.15631348461910247, + "kl": 0.08984375, + "learning_rate": 8.370466321243523e-07, + "loss": 0.0009, + "reward": 2.4999732971191406, + "reward_std": 2.459369682128454e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999732375144958, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6347150259067358, + "grad_norm": 7.374614574113729, + "kl": 0.099609375, + "learning_rate": 8.367875647668393e-07, + "loss": 0.0005, + "reward": 2.4999386072158813, + "reward_std": 3.54635722032981e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999386072158813, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.6373056994818653, + "grad_norm": 2.5733541818184524, + "kl": 0.15673828125, + "learning_rate": 8.365284974093264e-07, + "loss": 0.0009, + "reward": 1.9963146448135376, + "reward_std": 3.914493510137618e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4963144659996033, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.6398963730569949, + "grad_norm": 3.3167630613463173, + "kl": 0.0364990234375, + "learning_rate": 8.362694300518134e-07, + "loss": 0.0001, + "reward": 2.499943733215332, + "reward_std": 1.5723659146260616e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999943733215332, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 1.6424870466321244, + "grad_norm": 293.017130043985, + "kl": 0.48486328125, + "learning_rate": 8.360103626943004e-07, + "loss": 0.0019, + "reward": 1.5701483488082886, + "reward_std": 0.3352852957032155, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0701484084129333, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.645077720207254, + "grad_norm": 0.15602157212016882, + "kl": 0.083251953125, + "learning_rate": 8.357512953367875e-07, + "loss": -0.0001, + "reward": 2.4999769926071167, + "reward_std": 1.806196820552941e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999771118164062, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.6476683937823835, + "grad_norm": 14.406833113026893, + "kl": 0.1533203125, + "learning_rate": 8.354922279792745e-07, + "loss": 0.0003, + "reward": 1.9809051752090454, + "reward_std": 0.00011348181942594238, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4809053242206573, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.650259067357513, + "grad_norm": 5.98867345829897, + "kl": 0.0699462890625, + "learning_rate": 8.352331606217616e-07, + "loss": 0.0005, + "reward": 2.49954617023468, + "reward_std": 6.855746374867522e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995462894439697, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.6528497409326426, + "grad_norm": 0.38712691535412386, + "kl": 0.113037109375, + "learning_rate": 8.349740932642486e-07, + "loss": 0.0005, + "reward": 2.4999964237213135, + "reward_std": 1.8679312177027896e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6554404145077721, + "grad_norm": 23.64713542225016, + "kl": 0.1207275390625, + "learning_rate": 8.347150259067358e-07, + "loss": -0.0001, + "reward": 1.9969990849494934, + "reward_std": 0.00024986168754992377, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4969991743564606, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.6580310880829017, + "grad_norm": 0.1307402912355439, + "kl": 0.062255859375, + "learning_rate": 8.344559585492228e-07, + "loss": 0.0008, + "reward": 2.4999979734420776, + "reward_std": 1.503826325688351e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6606217616580312, + "grad_norm": 17.10243951191007, + "kl": 0.0982666015625, + "learning_rate": 8.341968911917098e-07, + "loss": 0.0001, + "reward": 1.9117478132247925, + "reward_std": 0.0007526968944375767, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4117478728294373, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6632124352331608, + "grad_norm": 42.48667614447326, + "kl": 0.0953369140625, + "learning_rate": 8.339378238341969e-07, + "loss": -0.0003, + "reward": 1.993226706981659, + "reward_std": 0.0017277612910220341, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4932267665863037, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6658031088082903, + "grad_norm": 87.54259575410254, + "kl": 0.06396484375, + "learning_rate": 8.33678756476684e-07, + "loss": 0.0004, + "reward": 2.499666690826416, + "reward_std": 9.813916403800249e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996667504310608, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6683937823834198, + "grad_norm": 0.13406492653191412, + "kl": 0.0870361328125, + "learning_rate": 8.33419689119171e-07, + "loss": 0.0006, + "reward": 2.4999979734420776, + "reward_std": 2.1362269535529776e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 1.6709844559585494, + "grad_norm": 25.02333946235965, + "kl": 0.0830078125, + "learning_rate": 8.331606217616581e-07, + "loss": 0.0003, + "reward": 1.4643511176109314, + "reward_std": 0.0011783490153902676, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9643511474132538, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6735751295336787, + "grad_norm": 4.837086925046459, + "kl": 0.0745849609375, + "learning_rate": 8.32901554404145e-07, + "loss": 0.0011, + "reward": 2.499936103820801, + "reward_std": 1.708796690991221e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999359846115112, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6761658031088082, + "grad_norm": 24.50331106738798, + "kl": 0.070068359375, + "learning_rate": 8.326424870466321e-07, + "loss": 0.0005, + "reward": 1.8760124444961548, + "reward_std": 0.0016876516601769254, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3760126233100891, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.6787564766839378, + "grad_norm": 1494.805228643778, + "kl": 86.022216796875, + "learning_rate": 8.323834196891192e-07, + "loss": 0.3466, + "reward": 2.437464952468872, + "reward_std": 0.17683711373479127, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374647736549377, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6813471502590673, + "grad_norm": 0.18292799218872652, + "kl": 0.121826171875, + "learning_rate": 8.321243523316062e-07, + "loss": 0.0009, + "reward": 2.4999966621398926, + "reward_std": 1.514225971277483e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.6839378238341969, + "grad_norm": 6.872451887462959, + "kl": 0.070556640625, + "learning_rate": 8.318652849740933e-07, + "loss": 0.0002, + "reward": 2.499855875968933, + "reward_std": 3.7510570109589025e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998559355735779, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 1.6865284974093264, + "grad_norm": 103.94521945653051, + "kl": 0.128662109375, + "learning_rate": 8.316062176165803e-07, + "loss": 0.0005, + "reward": 1.380523443222046, + "reward_std": 0.29092546921310714, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8805235028266907, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.689119170984456, + "grad_norm": 14.243931413628529, + "kl": 0.0758056640625, + "learning_rate": 8.313471502590673e-07, + "loss": 0.0005, + "reward": 1.9886736869812012, + "reward_std": 0.00021353523194989066, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4886736571788788, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6917098445595855, + "grad_norm": 6.032961669977156, + "kl": 0.19146728515625, + "learning_rate": 8.310880829015544e-07, + "loss": 0.0009, + "reward": 1.9924865365028381, + "reward_std": 6.780139489137582e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4924865365028381, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.694300518134715, + "grad_norm": 1.4330065175801183, + "kl": 0.0816650390625, + "learning_rate": 8.308290155440414e-07, + "loss": 0.0004, + "reward": 1.9978421330451965, + "reward_std": 5.885148004836083e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978420734405518, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.6968911917098446, + "grad_norm": 3.9100450515237894, + "kl": 0.083740234375, + "learning_rate": 8.305699481865285e-07, + "loss": -0.0006, + "reward": 2.499984622001648, + "reward_std": 1.4000762348587159e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.6994818652849741, + "grad_norm": 0.24514908651419975, + "kl": 0.0926513671875, + "learning_rate": 8.303108808290155e-07, + "loss": 0.0011, + "reward": 2.4999905824661255, + "reward_std": 3.2575424313563417e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7020725388601037, + "grad_norm": 0.296463991573155, + "kl": 0.07708740234375, + "learning_rate": 8.300518134715026e-07, + "loss": 0.0006, + "reward": 2.499985456466675, + "reward_std": 3.853004614029487e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.704663212435233, + "grad_norm": 4.1706814690545775, + "kl": 0.03399658203125, + "learning_rate": 8.297927461139896e-07, + "loss": -0.0003, + "reward": 2.499993681907654, + "reward_std": 7.77241325522482e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993622303009, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7072538860103625, + "grad_norm": 0.17933129044869325, + "kl": 0.069091796875, + "learning_rate": 8.295336787564766e-07, + "loss": 0.0003, + "reward": 2.4999818801879883, + "reward_std": 2.1753508860911097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818205833435, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.709844559585492, + "grad_norm": 7.0882663586124774, + "kl": 0.154052734375, + "learning_rate": 8.292746113989637e-07, + "loss": 0.0005, + "reward": 2.4999516010284424, + "reward_std": 3.499470352608114e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999516606330872, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7124352331606216, + "grad_norm": 3.200128249666342, + "kl": 0.278564453125, + "learning_rate": 8.290155440414507e-07, + "loss": 0.0018, + "reward": 1.9994218349456787, + "reward_std": 2.1387141259765485e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994217157363892, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7150259067357512, + "grad_norm": 1.6315295872825968, + "kl": 0.077392578125, + "learning_rate": 8.287564766839378e-07, + "loss": 0.0014, + "reward": 2.499991297721863, + "reward_std": 9.72662110143574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7176165803108807, + "grad_norm": 0.3336101169691649, + "kl": 0.0509033203125, + "learning_rate": 8.284974093264249e-07, + "loss": 0.0005, + "reward": 2.499987006187439, + "reward_std": 2.834276415342174e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999869465827942, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7202072538860103, + "grad_norm": 0.18875959208137574, + "kl": 0.076416015625, + "learning_rate": 8.282383419689118e-07, + "loss": 0.0013, + "reward": 2.499985456466675, + "reward_std": 2.901064817706356e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7227979274611398, + "grad_norm": 8.723644921781322, + "kl": 0.0517578125, + "learning_rate": 8.279792746113989e-07, + "loss": 0.0001, + "reward": 2.4999836683273315, + "reward_std": 1.4262079162108421e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999834895133972, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.7253886010362693, + "grad_norm": 100.82072995520302, + "kl": 0.1234130859375, + "learning_rate": 8.27720207253886e-07, + "loss": 0.0005, + "reward": 1.739893913269043, + "reward_std": 0.2734133796184324, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2398938536643982, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7279792746113989, + "grad_norm": 4.886117296355842, + "kl": 0.078125, + "learning_rate": 8.27461139896373e-07, + "loss": -0.0006, + "reward": 1.9969100952148438, + "reward_std": 4.0762413732409186e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.496910274028778, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7305699481865284, + "grad_norm": 0.6578307152263883, + "kl": 0.16357421875, + "learning_rate": 8.272020725388601e-07, + "loss": 0.0002, + "reward": 2.4999512434005737, + "reward_std": 1.1279806813035975e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999512434005737, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.733160621761658, + "grad_norm": 1.2918469594506938, + "kl": 0.0655517578125, + "learning_rate": 8.269430051813471e-07, + "loss": 0.0002, + "reward": 2.4999805688858032, + "reward_std": 6.839307445716258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999805688858032, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.7357512953367875, + "grad_norm": 1.7111373096069442, + "kl": 0.0906982421875, + "learning_rate": 8.266839378238341e-07, + "loss": 0.0008, + "reward": 2.499992251396179, + "reward_std": 8.681702638568822e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.738341968911917, + "grad_norm": 169.42935989989192, + "kl": 0.1690673828125, + "learning_rate": 8.264248704663212e-07, + "loss": 0.0006, + "reward": 2.436815023422241, + "reward_std": 0.17759628169733332, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9368152022361755, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7409326424870466, + "grad_norm": 3.601261270302685, + "kl": 0.099609375, + "learning_rate": 8.261658031088082e-07, + "loss": 0.0005, + "reward": 1.4999629259109497, + "reward_std": 1.3058762306172866e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999629259109497, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7435233160621761, + "grad_norm": 0.5871349378967271, + "kl": 0.068695068359375, + "learning_rate": 8.259067357512953e-07, + "loss": 0.0004, + "reward": 2.4999938011169434, + "reward_std": 4.084634610990179e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.125, + "epoch": 1.7461139896373057, + "grad_norm": 8.252815464613972, + "kl": 0.092529296875, + "learning_rate": 8.256476683937823e-07, + "loss": 0.0002, + "reward": 2.499964952468872, + "reward_std": 2.659755870126901e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999964952468872, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7487046632124352, + "grad_norm": 0.8702348833019585, + "kl": 0.10546875, + "learning_rate": 8.253886010362694e-07, + "loss": 0.0003, + "reward": 2.4999786615371704, + "reward_std": 4.621068995902533e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999786615371704, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7512953367875648, + "grad_norm": 2.3631254993898363, + "kl": 0.117431640625, + "learning_rate": 8.251295336787564e-07, + "loss": 0.0009, + "reward": 2.4999940395355225, + "reward_std": 3.846087565761991e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7538860103626943, + "grad_norm": 0.7390971626284993, + "kl": 0.043701171875, + "learning_rate": 8.248704663212434e-07, + "loss": -0.0001, + "reward": 2.499966263771057, + "reward_std": 4.501367087073049e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999664425849915, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7564766839378239, + "grad_norm": 7.13467884184763, + "kl": 0.02764892578125, + "learning_rate": 8.246113989637305e-07, + "loss": 0.0004, + "reward": 1.9999184608459473, + "reward_std": 1.642768256715499e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999184310436249, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7590673575129534, + "grad_norm": 2.2789329105342073, + "kl": 0.0408935546875, + "learning_rate": 8.243523316062175e-07, + "loss": 0.0002, + "reward": 1.9997382164001465, + "reward_std": 1.3057464570920274e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997382462024689, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.761658031088083, + "grad_norm": 25.47869924499872, + "kl": 0.0543212890625, + "learning_rate": 8.240932642487046e-07, + "loss": 0.0002, + "reward": 2.2499643564224243, + "reward_std": 0.267280383847492, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499642968177795, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7642487046632125, + "grad_norm": 0.7338014517634445, + "kl": 0.1080322265625, + "learning_rate": 8.238341968911918e-07, + "loss": 0.0009, + "reward": 2.499995470046997, + "reward_std": 2.1286450646584854e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.766839378238342, + "grad_norm": 5.6103634092675225, + "kl": 0.13671875, + "learning_rate": 8.235751295336786e-07, + "loss": 0.0004, + "reward": 1.4999582767486572, + "reward_std": 2.030848372669425e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.999958336353302, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7694300518134716, + "grad_norm": 2.875896173650571, + "kl": 0.0655517578125, + "learning_rate": 8.233160621761658e-07, + "loss": -0.0001, + "reward": 2.499988079071045, + "reward_std": 1.4423132824958884e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999882578849792, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7720207253886011, + "grad_norm": 1.2964359348906216, + "kl": 0.109375, + "learning_rate": 8.230569948186528e-07, + "loss": 0.0009, + "reward": 1.9980382919311523, + "reward_std": 2.3528160681962618e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4980381727218628, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7746113989637307, + "grad_norm": 5.6102330524598845, + "kl": 0.10028076171875, + "learning_rate": 8.227979274611399e-07, + "loss": -0.0001, + "reward": 2.4999724626541138, + "reward_std": 1.0118180000517896e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999725818634033, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7772020725388602, + "grad_norm": 18.868082070745036, + "kl": 0.080810546875, + "learning_rate": 8.22538860103627e-07, + "loss": 0.0011, + "reward": 2.437487006187439, + "reward_std": 0.1767846374967803, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374868273735046, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7797927461139897, + "grad_norm": 2.7496592487582143, + "kl": 0.4375, + "learning_rate": 8.22279792746114e-07, + "loss": 0.0002, + "reward": 2.4999945163726807, + "reward_std": 4.082221380485862e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7823834196891193, + "grad_norm": 0.35821479804678263, + "kl": 0.06494140625, + "learning_rate": 8.22020725388601e-07, + "loss": 0.0008, + "reward": 2.49999737739563, + "reward_std": 1.715442664362854e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.7849740932642488, + "grad_norm": 1.4850700490124686, + "kl": 0.0338134765625, + "learning_rate": 8.217616580310881e-07, + "loss": 0.0018, + "reward": 2.4999871253967285, + "reward_std": 4.870042914717487e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999869465827942, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 1.7875647668393784, + "grad_norm": 27.31555528505093, + "kl": 0.196044921875, + "learning_rate": 8.215025906735751e-07, + "loss": 0.0008, + "reward": 1.8023179769515991, + "reward_std": 0.0009480309454374947, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3023179769515991, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.790155440414508, + "grad_norm": 6.709943353290983, + "kl": 0.0533447265625, + "learning_rate": 8.212435233160622e-07, + "loss": 0.0009, + "reward": 1.7894837260246277, + "reward_std": 0.00020543435039144242, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2894836068153381, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.7927461139896375, + "grad_norm": 11.944606816843935, + "kl": 0.15283203125, + "learning_rate": 8.209844559585492e-07, + "loss": 0.0005, + "reward": 2.4999181032180786, + "reward_std": 1.5325039839808596e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999182224273682, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.795336787564767, + "grad_norm": 42.41986173925618, + "kl": 0.0887451171875, + "learning_rate": 8.207253886010363e-07, + "loss": 0.0006, + "reward": 2.4374067783355713, + "reward_std": 0.1769486431730911, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374067783355713, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.7979274611398963, + "grad_norm": 0.0689479256162065, + "kl": 0.09796142578125, + "learning_rate": 8.204663212435233e-07, + "loss": 0.0004, + "reward": 2.499999165534973, + "reward_std": 9.549086570359577e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999994039535522, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8005181347150259, + "grad_norm": 7.528320000260137, + "kl": 0.0830078125, + "learning_rate": 8.202072538860103e-07, + "loss": 0.0009, + "reward": 2.499980330467224, + "reward_std": 1.7278460802572226e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999803304672241, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 1.8031088082901554, + "grad_norm": 8.356065382587703, + "kl": 0.131591796875, + "learning_rate": 8.199481865284974e-07, + "loss": 0.001, + "reward": 2.343741536140442, + "reward_std": 0.44194080871261576, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.8749914765357971, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.805699481865285, + "grad_norm": 1.7944457368315816, + "kl": 0.227294921875, + "learning_rate": 8.196891191709844e-07, + "loss": 0.0012, + "reward": 2.499993920326233, + "reward_std": 2.733200744842179e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.8082901554404145, + "grad_norm": 3.0144286949801637, + "kl": 0.075439453125, + "learning_rate": 8.194300518134715e-07, + "loss": -0.0001, + "reward": 1.9976152181625366, + "reward_std": 5.661829597158885e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497615396976471, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.810880829015544, + "grad_norm": 0.2759789092248253, + "kl": 0.10595703125, + "learning_rate": 8.191709844559586e-07, + "loss": 0.0006, + "reward": 2.4999958276748657, + "reward_std": 2.1654390423009318e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75, + "epoch": 1.8134715025906736, + "grad_norm": 42.67759353045895, + "kl": 0.087890625, + "learning_rate": 8.189119170984455e-07, + "loss": 0.0005, + "reward": 1.9869170188903809, + "reward_std": 0.22009831677132752, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4869168996810913, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8160621761658031, + "grad_norm": 32.656202050911965, + "kl": 0.107666015625, + "learning_rate": 8.186528497409326e-07, + "loss": 0.0003, + "reward": 1.5609017610549927, + "reward_std": 0.17689743958544568, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0609017610549927, + "step": 701 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8186528497409327, + "grad_norm": 0.47028814155421916, + "kl": 0.10107421875, + "learning_rate": 8.183937823834196e-07, + "loss": 0.0011, + "reward": 2.499952554702759, + "reward_std": 5.4636123252294055e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999526143074036, + "step": 702 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 1.8212435233160622, + "grad_norm": 37.75444367838349, + "kl": 0.14404296875, + "learning_rate": 8.181347150259067e-07, + "loss": 0.0012, + "reward": 1.7973414659500122, + "reward_std": 0.28486732493939826, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2973413467407227, + "step": 703 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 1.8238341968911918, + "grad_norm": 149.79450039519122, + "kl": 0.176025390625, + "learning_rate": 8.178756476683938e-07, + "loss": 0.0007, + "reward": 1.9040983319282532, + "reward_std": 0.21884164586663246, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4040984511375427, + "step": 704 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.8264248704663213, + "grad_norm": 4.607706106641456, + "kl": 0.0474853515625, + "learning_rate": 8.176165803108808e-07, + "loss": -0.0002, + "reward": 2.499961495399475, + "reward_std": 2.263770451804703e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999615550041199, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8290155440414506, + "grad_norm": 8.759813240442615, + "kl": 0.255126953125, + "learning_rate": 8.173575129533678e-07, + "loss": 0.0013, + "reward": 2.498636484146118, + "reward_std": 0.00018744168619377888, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.998636543750763, + "step": 706 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.8316062176165802, + "grad_norm": 5.432585947575143, + "kl": 0.087158203125, + "learning_rate": 8.170984455958548e-07, + "loss": 0.0006, + "reward": 1.4982079863548279, + "reward_std": 3.8477059206343256e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9982079267501831, + "step": 707 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.8341968911917097, + "grad_norm": 2.3682903525699808, + "kl": 0.06787109375, + "learning_rate": 8.168393782383419e-07, + "loss": 0.0002, + "reward": 1.9979745149612427, + "reward_std": 4.085007344656333e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4979746341705322, + "step": 708 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.8367875647668392, + "grad_norm": 1.2874349952757198, + "kl": 0.105224609375, + "learning_rate": 8.16580310880829e-07, + "loss": 0.0005, + "reward": 2.499969720840454, + "reward_std": 8.705598474989529e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999694228172302, + "step": 709 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.8393782383419688, + "grad_norm": 3.9606911970202816, + "kl": 0.0487060546875, + "learning_rate": 8.16321243523316e-07, + "loss": -0.0003, + "reward": 2.4999043941497803, + "reward_std": 1.8315811303182272e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999045729637146, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.8419689119170983, + "grad_norm": 2.921244370159192, + "kl": 0.040283203125, + "learning_rate": 8.160621761658031e-07, + "loss": 0.0003, + "reward": 2.499955415725708, + "reward_std": 2.1065489818283822e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999554753303528, + "step": 711 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8445595854922279, + "grad_norm": 0.4919493489252933, + "kl": 0.096435546875, + "learning_rate": 8.1580310880829e-07, + "loss": 0.0016, + "reward": 2.4999988079071045, + "reward_std": 2.1324171086689603e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 712 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8471502590673574, + "grad_norm": 75.2656319553385, + "kl": 0.0369873046875, + "learning_rate": 8.155440414507771e-07, + "loss": 0.0001, + "reward": 1.9998453259468079, + "reward_std": 3.05382527585607e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998455345630646, + "step": 713 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.849740932642487, + "grad_norm": 4.7017640442430775, + "kl": 0.088623046875, + "learning_rate": 8.152849740932642e-07, + "loss": 0.0004, + "reward": 1.9984892010688782, + "reward_std": 2.6042447871077457e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984891414642334, + "step": 714 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8523316062176165, + "grad_norm": 6.222395499694789, + "kl": 0.0528564453125, + "learning_rate": 8.150259067357512e-07, + "loss": -0.0004, + "reward": 2.4998987913131714, + "reward_std": 2.3090570721251424e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999898910522461, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.854922279792746, + "grad_norm": 24.538056060265394, + "kl": 0.1240234375, + "learning_rate": 8.147668393782383e-07, + "loss": 0.0007, + "reward": 1.9928725957870483, + "reward_std": 0.0006328953459160402, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4928724765777588, + "step": 716 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.8575129533678756, + "grad_norm": 14.278827744369918, + "kl": 0.1180419921875, + "learning_rate": 8.145077720207254e-07, + "loss": 0.0001, + "reward": 2.3446788787841797, + "reward_std": 0.2946911964281753, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8446791172027588, + "step": 717 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8601036269430051, + "grad_norm": 1.3381890223202713, + "kl": 0.044677734375, + "learning_rate": 8.142487046632123e-07, + "loss": -0.0002, + "reward": 2.49999463558197, + "reward_std": 5.366977916310134e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945163726807, + "step": 718 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8626943005181347, + "grad_norm": 21.335956197159533, + "kl": 0.146484375, + "learning_rate": 8.139896373056994e-07, + "loss": 0.0003, + "reward": 2.499861240386963, + "reward_std": 6.220497755293763e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998611807823181, + "step": 719 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.8652849740932642, + "grad_norm": 12.952669851343646, + "kl": 0.16259765625, + "learning_rate": 8.137305699481864e-07, + "loss": 0.0007, + "reward": 1.9994430541992188, + "reward_std": 3.526075306581333e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994431734085083, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8678756476683938, + "grad_norm": 0.30068170664250105, + "kl": 0.042236328125, + "learning_rate": 8.134715025906735e-07, + "loss": -0.0004, + "reward": 2.499974489212036, + "reward_std": 4.600924171427323e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999744296073914, + "step": 721 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8704663212435233, + "grad_norm": 3.6812761982765556, + "kl": 0.1201171875, + "learning_rate": 8.132124352331606e-07, + "loss": 0.0004, + "reward": 2.4999918937683105, + "reward_std": 6.380602542321867e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918937683105, + "step": 722 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8730569948186528, + "grad_norm": 11.405570312476135, + "kl": 0.08984375, + "learning_rate": 8.129533678756476e-07, + "loss": 0.0012, + "reward": 1.8103687167167664, + "reward_std": 0.001791583761473703, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3103685975074768, + "step": 723 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8756476683937824, + "grad_norm": 66.8366672331642, + "kl": 0.055419921875, + "learning_rate": 8.126943005181348e-07, + "loss": 0.0005, + "reward": 2.3749492168426514, + "reward_std": 0.2315013756715416, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749491572380066, + "step": 724 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.878238341968912, + "grad_norm": 3.2126439837115353, + "kl": 0.40869140625, + "learning_rate": 8.124352331606216e-07, + "loss": 0.0022, + "reward": 2.499987840652466, + "reward_std": 8.392602012463612e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8808290155440415, + "grad_norm": 1.6722275786840681, + "kl": 0.11480712890625, + "learning_rate": 8.121761658031088e-07, + "loss": 0.002, + "reward": 2.499995708465576, + "reward_std": 5.694277945167414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 726 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.883419689119171, + "grad_norm": 1.5742542434766549, + "kl": 0.1109619140625, + "learning_rate": 8.119170984455959e-07, + "loss": 0.0006, + "reward": 2.4999829530715942, + "reward_std": 7.644510560567142e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999983012676239, + "step": 727 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.8860103626943006, + "grad_norm": 1.8219085341067554, + "kl": 0.0302276611328125, + "learning_rate": 8.116580310880829e-07, + "loss": 0.0, + "reward": 2.499990701675415, + "reward_std": 1.2698666324695296e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990701675415, + "step": 728 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.88860103626943, + "grad_norm": 1.663450928179118, + "kl": 0.090087890625, + "learning_rate": 8.1139896373057e-07, + "loss": 0.0017, + "reward": 2.499966025352478, + "reward_std": 1.1797412298619747e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999658465385437, + "step": 729 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.8911917098445596, + "grad_norm": 0.6994851516255558, + "kl": 0.149658203125, + "learning_rate": 8.111398963730569e-07, + "loss": 0.0, + "reward": 2.499577283859253, + "reward_std": 1.5459396195183217e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995773434638977, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 1.8937823834196892, + "grad_norm": 25.27344217750221, + "kl": 0.13525390625, + "learning_rate": 8.10880829015544e-07, + "loss": 0.0007, + "reward": 1.896426498889923, + "reward_std": 0.16527703722675824, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.396426498889923, + "step": 731 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.8963730569948187, + "grad_norm": 3.9910486466040282, + "kl": 0.09130859375, + "learning_rate": 8.106217616580311e-07, + "loss": 0.0008, + "reward": 2.499955177307129, + "reward_std": 2.780619524855865e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999552965164185, + "step": 732 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.8989637305699483, + "grad_norm": 0.7242378921876792, + "kl": 0.1339111328125, + "learning_rate": 8.103626943005181e-07, + "loss": 0.0002, + "reward": 1.9998878240585327, + "reward_std": 8.981161045085173e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499887853860855, + "step": 733 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.9015544041450778, + "grad_norm": 112.58840386757183, + "kl": 0.09375, + "learning_rate": 8.101036269430052e-07, + "loss": 0.0002, + "reward": 2.499970316886902, + "reward_std": 9.514840883184661e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999704360961914, + "step": 734 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9041450777202074, + "grad_norm": 4.212007998783463, + "kl": 0.131591796875, + "learning_rate": 8.098445595854922e-07, + "loss": 0.0009, + "reward": 1.9555780291557312, + "reward_std": 0.00015077465263857448, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4555780291557312, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.906735751295337, + "grad_norm": 34.98546349999838, + "kl": 0.0682373046875, + "learning_rate": 8.095854922279793e-07, + "loss": -0.0008, + "reward": 1.9997453093528748, + "reward_std": 6.642441468329707e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997454285621643, + "step": 736 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9093264248704664, + "grad_norm": 1.1272303075796912, + "kl": 0.0859375, + "learning_rate": 8.093264248704663e-07, + "loss": 0.0002, + "reward": 2.4999903440475464, + "reward_std": 6.589457939298882e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 737 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.911917098445596, + "grad_norm": 13.276057693911596, + "kl": 0.064208984375, + "learning_rate": 8.090673575129533e-07, + "loss": 0.0005, + "reward": 2.4999653100967407, + "reward_std": 2.632743871799903e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999965250492096, + "step": 738 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.9145077720207255, + "grad_norm": 90.23450325866312, + "kl": 0.062255859375, + "learning_rate": 8.088082901554404e-07, + "loss": -0.0002, + "reward": 1.9986023902893066, + "reward_std": 0.003414489360409334, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986023902893066, + "step": 739 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 1.917098445595855, + "grad_norm": 371.11278429509724, + "kl": 0.54541015625, + "learning_rate": 8.085492227979275e-07, + "loss": 0.0022, + "reward": 1.5157282948493958, + "reward_std": 0.5100982487201691, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0157283544540405, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.9196891191709846, + "grad_norm": 1.6043536164560124, + "kl": 0.09814453125, + "learning_rate": 8.082901554404145e-07, + "loss": -0.0001, + "reward": 2.499995708465576, + "reward_std": 6.733513032486371e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 741 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.922279792746114, + "grad_norm": 0.11167726296119958, + "kl": 0.040771484375, + "learning_rate": 8.080310880829016e-07, + "loss": 0.0002, + "reward": 2.4999977350234985, + "reward_std": 1.6510326474872272e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 742 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9248704663212435, + "grad_norm": 11.803440694600411, + "kl": 0.20703125, + "learning_rate": 8.077720207253885e-07, + "loss": 0.001, + "reward": 1.3101013898849487, + "reward_std": 0.0008895274622773286, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8101013898849487, + "step": 743 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.927461139896373, + "grad_norm": 1.080930120226716, + "kl": 0.03631591796875, + "learning_rate": 8.075129533678756e-07, + "loss": -0.0006, + "reward": 2.4999934434890747, + "reward_std": 5.392120669966971e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 744 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9300518134715026, + "grad_norm": 2.5205744223709647, + "kl": 0.084228515625, + "learning_rate": 8.072538860103627e-07, + "loss": 0.0007, + "reward": 2.499979257583618, + "reward_std": 1.0524852768867277e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791383743286, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.932642487046632, + "grad_norm": 5.2551240199063285, + "kl": 0.0430908203125, + "learning_rate": 8.069948186528497e-07, + "loss": 0.0005, + "reward": 1.9996366500854492, + "reward_std": 3.7261974739521975e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996365010738373, + "step": 746 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 1.9352331606217616, + "grad_norm": 1.0978098646150014, + "kl": 0.0548095703125, + "learning_rate": 8.067357512953368e-07, + "loss": -0.001, + "reward": 2.4999940395355225, + "reward_std": 6.356756273362407e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 747 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.9378238341968912, + "grad_norm": 2.4030625070168123, + "kl": 0.114501953125, + "learning_rate": 8.064766839378238e-07, + "loss": 0.0009, + "reward": 2.4999942779541016, + "reward_std": 1.060055092239054e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 748 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.9404145077720207, + "grad_norm": 0.5733309245884627, + "kl": 0.0908203125, + "learning_rate": 8.062176165803108e-07, + "loss": 0.0002, + "reward": 2.499983787536621, + "reward_std": 6.805420184718969e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839663505554, + "step": 749 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.9430051813471503, + "grad_norm": 2.399209736219041, + "kl": 0.1552734375, + "learning_rate": 8.059585492227979e-07, + "loss": 0.0012, + "reward": 1.9999147057533264, + "reward_std": 9.375180297865882e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999146461486816, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9455958549222798, + "grad_norm": 1.2292729366081792, + "kl": 0.13037109375, + "learning_rate": 8.056994818652849e-07, + "loss": 0.0001, + "reward": 1.99888014793396, + "reward_std": 3.06091478705639e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988802671432495, + "step": 751 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9481865284974094, + "grad_norm": 5.435845218415476, + "kl": 0.032928466796875, + "learning_rate": 8.05440414507772e-07, + "loss": -0.0008, + "reward": 2.4999637603759766, + "reward_std": 2.1075928771097097e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999637603759766, + "step": 752 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.950777202072539, + "grad_norm": 0.6488012633788783, + "kl": 0.190185546875, + "learning_rate": 8.05181347150259e-07, + "loss": -0.0002, + "reward": 2.499993324279785, + "reward_std": 6.480698630184634e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 753 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9533678756476682, + "grad_norm": 0.07686287574520377, + "kl": 0.0789794921875, + "learning_rate": 8.049222797927461e-07, + "loss": -0.001, + "reward": 2.499997854232788, + "reward_std": 1.4234396985557396e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 754 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9559585492227978, + "grad_norm": 3.5824155609827226, + "kl": 0.0550537109375, + "learning_rate": 8.046632124352331e-07, + "loss": 0.0001, + "reward": 2.4999858140945435, + "reward_std": 6.218678777258901e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985694885254, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 1.9585492227979273, + "grad_norm": 197.39829672714714, + "kl": 0.6048583984375, + "learning_rate": 8.044041450777201e-07, + "loss": 0.0023, + "reward": 1.9229825735092163, + "reward_std": 0.007150859762987238, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4229826629161835, + "step": 756 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9611398963730569, + "grad_norm": 0.8367198565567382, + "kl": 0.213623046875, + "learning_rate": 8.041450777202072e-07, + "loss": 0.0004, + "reward": 2.499998092651367, + "reward_std": 1.9431875557529565e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 757 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9637305699481864, + "grad_norm": 32.22400377472626, + "kl": 0.12646484375, + "learning_rate": 8.038860103626942e-07, + "loss": 0.0011, + "reward": 2.3746838569641113, + "reward_std": 0.23150423855076951, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8746837973594666, + "step": 758 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.966321243523316, + "grad_norm": 0.11304688660412425, + "kl": 0.062469482421875, + "learning_rate": 8.036269430051813e-07, + "loss": 0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.46941570733361e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 759 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.9689119170984455, + "grad_norm": 1.6148992685408439, + "kl": 0.117431640625, + "learning_rate": 8.033678756476684e-07, + "loss": 0.0008, + "reward": 1.9999431371688843, + "reward_std": 5.542067810893059e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999430775642395, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 1.971502590673575, + "grad_norm": 0.2475517092385738, + "kl": 0.073486328125, + "learning_rate": 8.031088082901553e-07, + "loss": -0.0011, + "reward": 2.4999730587005615, + "reward_std": 4.552717655315064e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999732375144958, + "step": 761 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9740932642487046, + "grad_norm": 1.8326122249000079, + "kl": 0.088623046875, + "learning_rate": 8.028497409326424e-07, + "loss": 0.0005, + "reward": 2.499992609024048, + "reward_std": 7.946578989503905e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 762 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9766839378238341, + "grad_norm": 0.11875450687150292, + "kl": 0.05914306640625, + "learning_rate": 8.025906735751295e-07, + "loss": 0.0001, + "reward": 2.4999966621398926, + "reward_std": 1.966403488040669e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 763 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 1.9792746113989637, + "grad_norm": 5.165333128227207, + "kl": 0.04052734375, + "learning_rate": 8.023316062176165e-07, + "loss": 0.0007, + "reward": 2.4999712705612183, + "reward_std": 1.2886929425803828e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999714493751526, + "step": 764 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9818652849740932, + "grad_norm": 2.9431804361431197, + "kl": 0.072998046875, + "learning_rate": 8.020725388601036e-07, + "loss": -0.0003, + "reward": 2.4999146461486816, + "reward_std": 2.159702012249909e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999147653579712, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 1.9844559585492227, + "grad_norm": 59.74835066178494, + "kl": 0.08935546875, + "learning_rate": 8.018134715025906e-07, + "loss": 0.0011, + "reward": 2.374890089035034, + "reward_std": 0.23161013678759446, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748899698257446, + "step": 766 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9870466321243523, + "grad_norm": 0.08220585831197426, + "kl": 0.1021728515625, + "learning_rate": 8.015544041450776e-07, + "loss": -0.0009, + "reward": 2.4999961853027344, + "reward_std": 1.474141697599407e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 767 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 1.9896373056994818, + "grad_norm": 71.57515418933694, + "kl": 0.10943603515625, + "learning_rate": 8.012953367875648e-07, + "loss": 0.0003, + "reward": 1.9380744695663452, + "reward_std": 0.08540706582425628, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.438074678182602, + "step": 768 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 1.9922279792746114, + "grad_norm": 1.0831466786758148, + "kl": 0.05633544921875, + "learning_rate": 8.010362694300518e-07, + "loss": 0.0, + "reward": 2.4999940395355225, + "reward_std": 5.40015429351115e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 769 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 1.994818652849741, + "grad_norm": 1.8830742937355367, + "kl": 0.084228515625, + "learning_rate": 8.007772020725389e-07, + "loss": 0.001, + "reward": 1.999828815460205, + "reward_std": 1.2673829701270733e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499828815460205, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 1.9974093264248705, + "grad_norm": 24.603749511656364, + "kl": 0.14599609375, + "learning_rate": 8.005181347150259e-07, + "loss": 0.0008, + "reward": 1.9988928437232971, + "reward_std": 0.0001699219587862899, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988930523395538, + "step": 771 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0, + "grad_norm": 18.225488387914933, + "kl": 0.12353515625, + "learning_rate": 8.00259067357513e-07, + "loss": 0.0005, + "reward": 1.4656990766525269, + "reward_std": 0.0006911862874403596, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9656990170478821, + "step": 772 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0025906735751295, + "grad_norm": 3.6961106629596263, + "kl": 0.06597900390625, + "learning_rate": 8e-07, + "loss": -0.0001, + "reward": 2.499965786933899, + "reward_std": 1.881695811789541e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999658465385437, + "step": 773 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.005181347150259, + "grad_norm": 7.2558670423356295, + "kl": 0.0767822265625, + "learning_rate": 7.99740932642487e-07, + "loss": 0.0004, + "reward": 1.4988000392913818, + "reward_std": 6.554352876264602e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9988000392913818, + "step": 774 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.0077720207253886, + "grad_norm": 3.848726310667743, + "kl": 0.1484375, + "learning_rate": 7.994818652849741e-07, + "loss": 0.0002, + "reward": 2.499962329864502, + "reward_std": 2.1448652205435792e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999623894691467, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.010362694300518, + "grad_norm": 1.7495460596018095, + "kl": 0.2198486328125, + "learning_rate": 7.992227979274611e-07, + "loss": 0.0016, + "reward": 2.499992847442627, + "reward_std": 5.8723562119666894e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 776 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.0129533678756477, + "grad_norm": 0.07181438494072484, + "kl": 0.100341796875, + "learning_rate": 7.989637305699482e-07, + "loss": 0.0011, + "reward": 2.4999982118606567, + "reward_std": 1.4673998407488398e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 777 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.0155440414507773, + "grad_norm": 99.34717539043967, + "kl": 0.861083984375, + "learning_rate": 7.987046632124353e-07, + "loss": 0.0034, + "reward": 1.9193508625030518, + "reward_std": 0.22464617586228997, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4193509817123413, + "step": 778 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.018134715025907, + "grad_norm": 0.2736149256989262, + "kl": 0.1043701171875, + "learning_rate": 7.984455958549222e-07, + "loss": 0.0019, + "reward": 2.4999940395355225, + "reward_std": 3.2497503070771927e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 779 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.0207253886010363, + "grad_norm": 27.774634743895458, + "kl": 1.470703125, + "learning_rate": 7.981865284974093e-07, + "loss": 0.0059, + "reward": 2.3124088048934937, + "reward_std": 0.4082988053560257, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.812408983707428, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.023316062176166, + "grad_norm": 0.1494878068399389, + "kl": 0.091552734375, + "learning_rate": 7.979274611398963e-07, + "loss": -0.0005, + "reward": 2.499991774559021, + "reward_std": 2.220222768301028e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918937683105, + "step": 781 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.0259067357512954, + "grad_norm": 21.653173742938588, + "kl": 0.05499267578125, + "learning_rate": 7.976683937823834e-07, + "loss": 0.0002, + "reward": 2.4374780654907227, + "reward_std": 0.17682574421314712, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374780654907227, + "step": 782 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.028497409326425, + "grad_norm": 0.44320321457665696, + "kl": 0.113525390625, + "learning_rate": 7.974093264248705e-07, + "loss": 0.001, + "reward": 2.4999852180480957, + "reward_std": 4.378118092063232e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999849796295166, + "step": 783 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.0310880829015545, + "grad_norm": 2.4012456155232487, + "kl": 0.0958251953125, + "learning_rate": 7.971502590673575e-07, + "loss": 0.0008, + "reward": 2.499974846839905, + "reward_std": 1.4902695284035872e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99997478723526, + "step": 784 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.033678756476684, + "grad_norm": 78.8103895753835, + "kl": 0.19970703125, + "learning_rate": 7.968911917098445e-07, + "loss": 0.0009, + "reward": 1.9711123704910278, + "reward_std": 0.010406928623552858, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4711123704910278, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0362694300518136, + "grad_norm": 7.272051221335001, + "kl": 0.4921875, + "learning_rate": 7.966321243523316e-07, + "loss": 0.002, + "reward": 2.4996891021728516, + "reward_std": 2.328877332047341e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999688982963562, + "step": 786 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.038860103626943, + "grad_norm": 0.6240978384693943, + "kl": 0.042724609375, + "learning_rate": 7.963730569948186e-07, + "loss": 0.0007, + "reward": 2.4999805688858032, + "reward_std": 7.714426828897558e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999805092811584, + "step": 787 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.0414507772020727, + "grad_norm": 1.7174991918507247, + "kl": 0.096923828125, + "learning_rate": 7.961139896373057e-07, + "loss": 0.0004, + "reward": 2.4372533559799194, + "reward_std": 0.17679266391360215, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9372533559799194, + "step": 788 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0440414507772022, + "grad_norm": 8.281741727533326, + "kl": 0.08154296875, + "learning_rate": 7.958549222797927e-07, + "loss": 0.0006, + "reward": 1.9990002512931824, + "reward_std": 6.093075671742554e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49900022149086, + "step": 789 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.0466321243523318, + "grad_norm": 3.699676109780699, + "kl": 0.0682373046875, + "learning_rate": 7.955958549222798e-07, + "loss": 0.0005, + "reward": 2.499968409538269, + "reward_std": 1.9681296180351637e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999682903289795, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.0492227979274613, + "grad_norm": 2.414906691658386, + "kl": 0.07421875, + "learning_rate": 7.953367875647668e-07, + "loss": 0.001, + "reward": 2.4996109008789062, + "reward_std": 2.106163833559549e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996107816696167, + "step": 791 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.051813471502591, + "grad_norm": 0.5725793795529477, + "kl": 0.1915283203125, + "learning_rate": 7.950777202072538e-07, + "loss": 0.0016, + "reward": 2.4999940395355225, + "reward_std": 3.5649984511110233e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 792 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0544041450777204, + "grad_norm": 0.03604955838150388, + "kl": 0.0654296875, + "learning_rate": 7.948186528497409e-07, + "loss": -0.0005, + "reward": 2.499998927116394, + "reward_std": 1.0972140671583475e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 793 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.05699481865285, + "grad_norm": 31.868527530811924, + "kl": 0.11083984375, + "learning_rate": 7.945595854922279e-07, + "loss": 0.0007, + "reward": 1.9483452439308167, + "reward_std": 0.009741406350713078, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4483452439308167, + "step": 794 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.0595854922279795, + "grad_norm": 2.309197763477085, + "kl": 0.08837890625, + "learning_rate": 7.94300518134715e-07, + "loss": -0.0001, + "reward": 2.4999725818634033, + "reward_std": 1.086189519128311e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999727010726929, + "step": 795 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.062176165803109, + "grad_norm": 3.6380421520174897, + "kl": 0.57373046875, + "learning_rate": 7.940414507772021e-07, + "loss": 0.0024, + "reward": 2.4999133348464966, + "reward_std": 0.00023796794027930446, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999132752418518, + "step": 796 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.064766839378238, + "grad_norm": 37.97424849534484, + "kl": 0.16455078125, + "learning_rate": 7.93782383419689e-07, + "loss": 0.0001, + "reward": 1.7137579321861267, + "reward_std": 0.0008719326960999751, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2137579023838043, + "step": 797 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.0673575129533677, + "grad_norm": 35.17132503857111, + "kl": 0.06298828125, + "learning_rate": 7.935233160621761e-07, + "loss": 0.0003, + "reward": 1.9983269572257996, + "reward_std": 0.0009882468584692106, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983269572257996, + "step": 798 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 2.069948186528497, + "grad_norm": 54.62937239848824, + "kl": 0.088623046875, + "learning_rate": 7.932642487046631e-07, + "loss": 0.0005, + "reward": 1.9671945571899414, + "reward_std": 0.002545786983318976, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4671944081783295, + "step": 799 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.0725388601036268, + "grad_norm": 4.0722667510399875, + "kl": 0.12969970703125, + "learning_rate": 7.930051813471502e-07, + "loss": 0.0006, + "reward": 1.9984092116355896, + "reward_std": 7.788836177269332e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984091222286224, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0751295336787563, + "grad_norm": 18.29493648267371, + "kl": 0.0872802734375, + "learning_rate": 7.927461139896373e-07, + "loss": 0.0009, + "reward": 1.9955613017082214, + "reward_std": 0.0005434078050257085, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4955612421035767, + "step": 801 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.077720207253886, + "grad_norm": 0.709697541298915, + "kl": 0.25390625, + "learning_rate": 7.924870466321243e-07, + "loss": 0.0013, + "reward": 2.499990463256836, + "reward_std": 7.249970394695993e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 802 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.0803108808290154, + "grad_norm": 2.598427823478953, + "kl": 0.121337890625, + "learning_rate": 7.922279792746113e-07, + "loss": 0.0003, + "reward": 1.9991042017936707, + "reward_std": 3.372466289874865e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991041719913483, + "step": 803 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.082901554404145, + "grad_norm": 2.7433718160379645, + "kl": 0.078369140625, + "learning_rate": 7.919689119170983e-07, + "loss": 0.0009, + "reward": 2.4999966621398926, + "reward_std": 4.335711537351017e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 804 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.0854922279792745, + "grad_norm": 0.25199641692462155, + "kl": 0.054931640625, + "learning_rate": 7.917098445595854e-07, + "loss": -0.0001, + "reward": 2.499996781349182, + "reward_std": 2.647911770736755e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.088082901554404, + "grad_norm": 0.056184545774577904, + "kl": 0.0426025390625, + "learning_rate": 7.914507772020725e-07, + "loss": -0.0006, + "reward": 2.499998927116394, + "reward_std": 8.558417619042302e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 806 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 2.0906735751295336, + "grad_norm": 82.12138733630296, + "kl": 0.0738525390625, + "learning_rate": 7.911917098445595e-07, + "loss": 0.0006, + "reward": 2.306434154510498, + "reward_std": 0.2671876762778993, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8064342141151428, + "step": 807 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.093264248704663, + "grad_norm": 39.75224721499368, + "kl": 0.131591796875, + "learning_rate": 7.909326424870466e-07, + "loss": 0.0002, + "reward": 2.374869227409363, + "reward_std": 0.23160941991955042, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748692870140076, + "step": 808 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 2.0958549222797926, + "grad_norm": 12.512084781215352, + "kl": 0.0728759765625, + "learning_rate": 7.906735751295335e-07, + "loss": -0.0001, + "reward": 2.402372360229492, + "reward_std": 0.2760960097671159, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9023725390434265, + "step": 809 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.098445595854922, + "grad_norm": 15.58087078184914, + "kl": 0.0845947265625, + "learning_rate": 7.904145077720206e-07, + "loss": 0.0003, + "reward": 1.7498490810394287, + "reward_std": 0.26727011656885225, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2498490810394287, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.1010362694300517, + "grad_norm": 10.028903177969555, + "kl": 0.089111328125, + "learning_rate": 7.901554404145078e-07, + "loss": 0.0009, + "reward": 2.4999066591262817, + "reward_std": 9.397085034379415e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999064803123474, + "step": 811 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.1036269430051813, + "grad_norm": 42.53789859794629, + "kl": 0.14892578125, + "learning_rate": 7.898963730569948e-07, + "loss": 0.0015, + "reward": 1.9960054159164429, + "reward_std": 0.00035896282710723426, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4960054159164429, + "step": 812 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.106217616580311, + "grad_norm": 69.72910435323216, + "kl": 0.12451171875, + "learning_rate": 7.896373056994819e-07, + "loss": 0.0005, + "reward": 2.298454165458679, + "reward_std": 0.2781435576451372, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7984542846679688, + "step": 813 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.1088082901554404, + "grad_norm": 0.31617685399524487, + "kl": 0.03387451171875, + "learning_rate": 7.89378238341969e-07, + "loss": 0.001, + "reward": 2.499996542930603, + "reward_std": 2.301369363522099e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 814 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.11139896373057, + "grad_norm": 18.405205405846875, + "kl": 0.072021484375, + "learning_rate": 7.891191709844559e-07, + "loss": 0.0004, + "reward": 2.4995542764663696, + "reward_std": 0.00011681799310281349, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995543956756592, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.1139896373056994, + "grad_norm": 1.9553494809488006, + "kl": 0.12890625, + "learning_rate": 7.88860103626943e-07, + "loss": 0.0006, + "reward": 2.4999401569366455, + "reward_std": 2.828069759175378e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999402165412903, + "step": 816 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.116580310880829, + "grad_norm": 3.0640260211442296, + "kl": 0.1630859375, + "learning_rate": 7.8860103626943e-07, + "loss": 0.0004, + "reward": 2.499991774559021, + "reward_std": 9.657326614842532e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 817 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.1191709844559585, + "grad_norm": 160.147805385429, + "kl": 0.0606689453125, + "learning_rate": 7.883419689119171e-07, + "loss": -0.0006, + "reward": 1.999306559562683, + "reward_std": 0.00039623541294986353, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499306559562683, + "step": 818 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.121761658031088, + "grad_norm": 3.886634685348415, + "kl": 0.5113525390625, + "learning_rate": 7.880829015544042e-07, + "loss": 0.0019, + "reward": 2.4106621742248535, + "reward_std": 0.25268226398407023, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.910662293434143, + "step": 819 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.1243523316062176, + "grad_norm": 5.463729048763219, + "kl": 0.0826416015625, + "learning_rate": 7.878238341968912e-07, + "loss": -0.0006, + "reward": 2.499975085258484, + "reward_std": 1.983281140383042e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999975323677063, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.126943005181347, + "grad_norm": 1.6968617898008451, + "kl": 0.16357421875, + "learning_rate": 7.875647668393782e-07, + "loss": -0.0005, + "reward": 1.9996801614761353, + "reward_std": 2.749540878710377e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996803402900696, + "step": 821 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1295336787564767, + "grad_norm": 0.3584589275948276, + "kl": 0.07513427734375, + "learning_rate": 7.873056994818652e-07, + "loss": -0.0004, + "reward": 1.9999231696128845, + "reward_std": 5.105447201003699e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999233186244965, + "step": 822 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1321243523316062, + "grad_norm": 1.1964297839352696, + "kl": 0.1029052734375, + "learning_rate": 7.870466321243523e-07, + "loss": 0.0008, + "reward": 1.9985453486442566, + "reward_std": 1.607097510714084e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498545378446579, + "step": 823 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.134715025906736, + "grad_norm": 0.2555782625831436, + "kl": 0.1787109375, + "learning_rate": 7.867875647668394e-07, + "loss": -0.0, + "reward": 2.499994993209839, + "reward_std": 2.2009395479472005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 824 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1373056994818653, + "grad_norm": 1.185431365056362, + "kl": 0.1474609375, + "learning_rate": 7.865284974093264e-07, + "loss": 0.0, + "reward": 1.9999152421951294, + "reward_std": 7.194373210950289e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999153912067413, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.139896373056995, + "grad_norm": 81.46332385218038, + "kl": 0.0782470703125, + "learning_rate": 7.862694300518135e-07, + "loss": 0.0006, + "reward": 2.187281310558319, + "reward_std": 0.25892766599395145, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6872811913490295, + "step": 826 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1424870466321244, + "grad_norm": 2.4871128232772035, + "kl": 0.8310546875, + "learning_rate": 7.860103626943004e-07, + "loss": 0.0033, + "reward": 2.499991297721863, + "reward_std": 8.376256118935999e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 827 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.145077720207254, + "grad_norm": 0.2768657147885546, + "kl": 0.0615234375, + "learning_rate": 7.857512953367875e-07, + "loss": -0.0, + "reward": 2.4999730587005615, + "reward_std": 3.175752226525219e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973177909851, + "step": 828 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.1476683937823835, + "grad_norm": 32.21696010075703, + "kl": 0.1448974609375, + "learning_rate": 7.854922279792746e-07, + "loss": 0.0006, + "reward": 1.8902733325958252, + "reward_std": 0.17703182611148804, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3902733623981476, + "step": 829 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.150259067357513, + "grad_norm": 14.708848952487546, + "kl": 0.132080078125, + "learning_rate": 7.852331606217616e-07, + "loss": 0.0003, + "reward": 1.4905517101287842, + "reward_std": 0.0002584439571364783, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9905517995357513, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.1528497409326426, + "grad_norm": 0.8821324462348575, + "kl": 0.02239990234375, + "learning_rate": 7.849740932642487e-07, + "loss": -0.0, + "reward": 2.4999929666519165, + "reward_std": 5.975475801278662e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 831 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.155440414507772, + "grad_norm": 0.45672177704669265, + "kl": 0.052490234375, + "learning_rate": 7.847150259067357e-07, + "loss": 0.0003, + "reward": 1.9999288320541382, + "reward_std": 9.878930086415494e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499928891658783, + "step": 832 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.1580310880829017, + "grad_norm": 9.245308572334315, + "kl": 0.1533203125, + "learning_rate": 7.844559585492227e-07, + "loss": 0.0013, + "reward": 2.499931812286377, + "reward_std": 5.431316446902201e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999931812286377, + "step": 833 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.160621761658031, + "grad_norm": 9.764383689176286, + "kl": 0.15234375, + "learning_rate": 7.841968911917098e-07, + "loss": 0.0003, + "reward": 2.499988555908203, + "reward_std": 9.286042370604264e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 834 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1632124352331608, + "grad_norm": 4.655281682587037, + "kl": 0.129150390625, + "learning_rate": 7.839378238341968e-07, + "loss": 0.0013, + "reward": 1.986701488494873, + "reward_std": 0.0011662132325369612, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4867012798786163, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.1658031088082903, + "grad_norm": 17.56675902588832, + "kl": 0.08154296875, + "learning_rate": 7.836787564766839e-07, + "loss": -0.0003, + "reward": 1.8743489384651184, + "reward_std": 0.001236509110640327, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3743488788604736, + "step": 836 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.16839378238342, + "grad_norm": 2.747082448275533, + "kl": 0.08544921875, + "learning_rate": 7.83419689119171e-07, + "loss": 0.0009, + "reward": 1.998904287815094, + "reward_std": 3.858891295749345e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989041984081268, + "step": 837 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1709844559585494, + "grad_norm": 3.9127029335257935, + "kl": 0.7099609375, + "learning_rate": 7.83160621761658e-07, + "loss": 0.0032, + "reward": 2.499981641769409, + "reward_std": 1.0990483133355156e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999815821647644, + "step": 838 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 2.173575129533679, + "grad_norm": 12.55711287538366, + "kl": 0.088134765625, + "learning_rate": 7.829015544041451e-07, + "loss": 0.0002, + "reward": 1.9975048303604126, + "reward_std": 0.002005275209626234, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975048005580902, + "step": 839 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1761658031088085, + "grad_norm": 0.6343800195194895, + "kl": 0.1490478515625, + "learning_rate": 7.82642487046632e-07, + "loss": 0.0008, + "reward": 2.499981164932251, + "reward_std": 1.1993369071205962e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999981164932251, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.25, + "epoch": 2.178756476683938, + "grad_norm": 32.69166364468496, + "kl": 0.064453125, + "learning_rate": 7.823834196891191e-07, + "loss": 0.0008, + "reward": 2.312381625175476, + "reward_std": 0.2589312991796078, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8123815059661865, + "step": 841 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1813471502590676, + "grad_norm": 0.24945600525058792, + "kl": 0.02716064453125, + "learning_rate": 7.821243523316062e-07, + "loss": 0.0004, + "reward": 2.499980092048645, + "reward_std": 4.777468404881802e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979853630066, + "step": 842 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.1839378238341967, + "grad_norm": 0.7059778030870555, + "kl": 0.109619140625, + "learning_rate": 7.818652849740932e-07, + "loss": 0.0009, + "reward": 1.9999275207519531, + "reward_std": 8.797935038273863e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999276101589203, + "step": 843 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.186528497409326, + "grad_norm": 0.4589818445768626, + "kl": 0.057861328125, + "learning_rate": 7.816062176165803e-07, + "loss": -0.0002, + "reward": 2.4999959468841553, + "reward_std": 2.5094516331591876e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 844 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 2.1891191709844557, + "grad_norm": 9.165496137938725, + "kl": 0.131103515625, + "learning_rate": 7.813471502590672e-07, + "loss": 0.0009, + "reward": 1.9891877174377441, + "reward_std": 0.00015411775825668883, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4891875684261322, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.1917098445595853, + "grad_norm": 215.1622206840282, + "kl": 0.1060791015625, + "learning_rate": 7.810880829015543e-07, + "loss": 0.0003, + "reward": 1.9992945194244385, + "reward_std": 0.0001992236175283324, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992945790290833, + "step": 846 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.194300518134715, + "grad_norm": 3.624025423979156, + "kl": 0.0777587890625, + "learning_rate": 7.808290155440414e-07, + "loss": 0.0001, + "reward": 2.499942898750305, + "reward_std": 2.195755723732873e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999431371688843, + "step": 847 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.1968911917098444, + "grad_norm": 0.4654506824478735, + "kl": 0.1259765625, + "learning_rate": 7.805699481865284e-07, + "loss": -0.0001, + "reward": 2.499990701675415, + "reward_std": 4.144567014918721e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 848 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.199481865284974, + "grad_norm": 12.45759018396317, + "kl": 0.080322265625, + "learning_rate": 7.803108808290155e-07, + "loss": -0.0007, + "reward": 2.4999905824661255, + "reward_std": 7.400932872769772e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 849 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.2020725388601035, + "grad_norm": 1.0970616204339865, + "kl": 0.1279296875, + "learning_rate": 7.800518134715025e-07, + "loss": 0.0016, + "reward": 1.99956476688385, + "reward_std": 2.758353741683095e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499564677476883, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.204663212435233, + "grad_norm": 33.79485917037079, + "kl": 0.0955810546875, + "learning_rate": 7.797927461139896e-07, + "loss": 0.0005, + "reward": 2.1248949766159058, + "reward_std": 0.23151709060863368, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6248949766159058, + "step": 851 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2072538860103625, + "grad_norm": 6.609977037423613, + "kl": 0.120849609375, + "learning_rate": 7.795336787564766e-07, + "loss": -0.0002, + "reward": 1.751718282699585, + "reward_std": 0.0004842634275519231, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2517182528972626, + "step": 852 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.209844559585492, + "grad_norm": 0.08309396856625578, + "kl": 0.072509765625, + "learning_rate": 7.792746113989636e-07, + "loss": -0.0, + "reward": 2.499998092651367, + "reward_std": 1.7144134574209602e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 853 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2124352331606216, + "grad_norm": 0.752714079456917, + "kl": 0.1435546875, + "learning_rate": 7.790155440414508e-07, + "loss": -0.0001, + "reward": 1.9999275207519531, + "reward_std": 8.801098829280818e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999276995658875, + "step": 854 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.215025906735751, + "grad_norm": 5.114989287280495, + "kl": 0.072509765625, + "learning_rate": 7.787564766839378e-07, + "loss": 0.0004, + "reward": 1.8816779851913452, + "reward_std": 0.0003030264506378444, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3816779851913452, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.375, + "epoch": 2.2176165803108807, + "grad_norm": 44.150714063554716, + "kl": 0.1502685546875, + "learning_rate": 7.784974093264249e-07, + "loss": 0.0003, + "reward": 1.956153929233551, + "reward_std": 0.06839726611087826, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4561539888381958, + "step": 856 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2202072538860103, + "grad_norm": 7.571076160448421, + "kl": 0.18896484375, + "learning_rate": 7.78238341968912e-07, + "loss": 0.0008, + "reward": 2.3748414516448975, + "reward_std": 0.23147132278410254, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748415112495422, + "step": 857 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.22279792746114, + "grad_norm": 0.16070589453556666, + "kl": 0.0986328125, + "learning_rate": 7.779792746113989e-07, + "loss": -0.0002, + "reward": 2.499997138977051, + "reward_std": 1.8427381291985512e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 858 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.2253886010362693, + "grad_norm": 40.6289215658432, + "kl": 0.136474609375, + "learning_rate": 7.77720207253886e-07, + "loss": 0.0005, + "reward": 2.1645208597183228, + "reward_std": 0.2799772632490658, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.664520800113678, + "step": 859 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.227979274611399, + "grad_norm": 15.523657132558238, + "kl": 0.0343017578125, + "learning_rate": 7.774611398963731e-07, + "loss": 0.0005, + "reward": 2.4373964071273804, + "reward_std": 0.17682450337451883, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373964071273804, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.2305699481865284, + "grad_norm": 25.211609753750515, + "kl": 0.05999755859375, + "learning_rate": 7.772020725388601e-07, + "loss": -0.0001, + "reward": 2.2499719858169556, + "reward_std": 0.2672875080694723, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499721050262451, + "step": 861 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.233160621761658, + "grad_norm": 5.458538411563557, + "kl": 0.0927734375, + "learning_rate": 7.769430051813472e-07, + "loss": 0.0009, + "reward": 1.9925637245178223, + "reward_std": 0.00011932382176382816, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4925637543201447, + "step": 862 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2357512953367875, + "grad_norm": 0.22704813565423648, + "kl": 0.12744140625, + "learning_rate": 7.766839378238342e-07, + "loss": 0.0008, + "reward": 2.499995231628418, + "reward_std": 1.782142135198228e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 863 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.238341968911917, + "grad_norm": 21.577454476070237, + "kl": 0.1119384765625, + "learning_rate": 7.764248704663212e-07, + "loss": 0.0001, + "reward": 2.1076736450195312, + "reward_std": 0.24214620607835968, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6076736450195312, + "step": 864 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2409326424870466, + "grad_norm": 6.782336823292774, + "kl": 0.0589599609375, + "learning_rate": 7.761658031088083e-07, + "loss": 0.0002, + "reward": 2.4999842643737793, + "reward_std": 1.497648656823003e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999842643737793, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.243523316062176, + "grad_norm": 0.5581986549913339, + "kl": 0.0618896484375, + "learning_rate": 7.759067357512953e-07, + "loss": 0.0008, + "reward": 2.499993324279785, + "reward_std": 4.510141138780455e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 866 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.2461139896373057, + "grad_norm": 49.75474340508649, + "kl": 0.12847900390625, + "learning_rate": 7.756476683937824e-07, + "loss": 0.0003, + "reward": 1.8833805322647095, + "reward_std": 0.0005223634916546871, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3833806216716766, + "step": 867 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.2487046632124352, + "grad_norm": 2.0896393892920107, + "kl": 0.3172607421875, + "learning_rate": 7.753886010362694e-07, + "loss": 0.0008, + "reward": 2.499990940093994, + "reward_std": 2.5628186222093063e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 868 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.2512953367875648, + "grad_norm": 0.24861797004995637, + "kl": 0.0809326171875, + "learning_rate": 7.751295336787565e-07, + "loss": -0.0007, + "reward": 2.4999920129776, + "reward_std": 3.441311719143414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 869 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2538860103626943, + "grad_norm": 35.29801492876463, + "kl": 0.112548828125, + "learning_rate": 7.748704663212435e-07, + "loss": -0.0009, + "reward": 2.437470316886902, + "reward_std": 0.17684745959616066, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937470555305481, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.256476683937824, + "grad_norm": 2374.8087135124956, + "kl": 0.1162109375, + "learning_rate": 7.746113989637305e-07, + "loss": 0.0006, + "reward": 1.341384470462799, + "reward_std": 0.00830282815877581, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8413844406604767, + "step": 871 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.2590673575129534, + "grad_norm": 1.612652360564506, + "kl": 0.0404052734375, + "learning_rate": 7.743523316062176e-07, + "loss": 0.0003, + "reward": 2.499964475631714, + "reward_std": 1.1771840490837349e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999642968177795, + "step": 872 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 2.261658031088083, + "grad_norm": 281.0796457824024, + "kl": 0.125, + "learning_rate": 7.740932642487046e-07, + "loss": 0.0006, + "reward": 1.9147862792015076, + "reward_std": 0.2406559771970933, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4147863686084747, + "step": 873 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.2642487046632125, + "grad_norm": 0.35357104327355054, + "kl": 0.0709228515625, + "learning_rate": 7.738341968911917e-07, + "loss": -0.0002, + "reward": 2.4999741315841675, + "reward_std": 5.235112041646062e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999741315841675, + "step": 874 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.266839378238342, + "grad_norm": 21.598338438509938, + "kl": 0.10302734375, + "learning_rate": 7.735751295336788e-07, + "loss": -0.0003, + "reward": 1.9904950261116028, + "reward_std": 0.00015373223186543328, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4904950857162476, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2694300518134716, + "grad_norm": 1899.472018168331, + "kl": 352.0703125, + "learning_rate": 7.733160621761657e-07, + "loss": 1.4073, + "reward": 2.4369258880615234, + "reward_std": 0.17730323061186937, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.936926007270813, + "step": 876 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.272020725388601, + "grad_norm": 7.177511414549619, + "kl": 0.0517578125, + "learning_rate": 7.730569948186528e-07, + "loss": -0.0006, + "reward": 2.4999337196350098, + "reward_std": 2.6807001859197044e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999337792396545, + "step": 877 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 2.2746113989637307, + "grad_norm": 29.00678800812169, + "kl": 0.163330078125, + "learning_rate": 7.727979274611398e-07, + "loss": 0.0001, + "reward": 1.9999151825904846, + "reward_std": 0.534536676856078, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499915361404419, + "step": 878 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.27720207253886, + "grad_norm": 2.734778269916389, + "kl": 0.070068359375, + "learning_rate": 7.725388601036269e-07, + "loss": 0.0002, + "reward": 2.4999901056289673, + "reward_std": 2.320093244634336e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999901056289673, + "step": 879 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2797927461139897, + "grad_norm": 189.42008065905318, + "kl": 0.0594482421875, + "learning_rate": 7.72279792746114e-07, + "loss": 0.0005, + "reward": 2.3124356269836426, + "reward_std": 0.2588570897974023, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.812435507774353, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 2.2823834196891193, + "grad_norm": 2.3949283406082436, + "kl": 0.0804443359375, + "learning_rate": 7.72020725388601e-07, + "loss": 0.0, + "reward": 2.4999879598617554, + "reward_std": 1.1736271403606224e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988079071045, + "step": 881 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 2.284974093264249, + "grad_norm": 17.854987868314215, + "kl": 0.060791015625, + "learning_rate": 7.71761658031088e-07, + "loss": 0.0011, + "reward": 2.4342384338378906, + "reward_std": 0.18567730413087702, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9342381954193115, + "step": 882 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 2.2875647668393784, + "grad_norm": 24.498335405911746, + "kl": 0.03338623046875, + "learning_rate": 7.715025906735751e-07, + "loss": -0.0002, + "reward": 2.4994282722473145, + "reward_std": 0.0010543846919972566, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999428391456604, + "step": 883 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.290155440414508, + "grad_norm": 9.373854962364303, + "kl": 2.26220703125, + "learning_rate": 7.712435233160621e-07, + "loss": 0.0099, + "reward": 2.4999752044677734, + "reward_std": 1.0168109156438732e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999750852584839, + "step": 884 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.2927461139896375, + "grad_norm": 3.952252704619346, + "kl": 0.2291259765625, + "learning_rate": 7.709844559585492e-07, + "loss": 0.0012, + "reward": 1.718647539615631, + "reward_std": 0.0004527728497123462, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.21864752471447, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.295336787564767, + "grad_norm": 175.38611571985803, + "kl": 0.08642578125, + "learning_rate": 7.707253886010362e-07, + "loss": 0.0005, + "reward": 2.437479615211487, + "reward_std": 0.17680806750206557, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937479555606842, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.2979274611398965, + "grad_norm": 0.24521506533891216, + "kl": 0.102783203125, + "learning_rate": 7.704663212435233e-07, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 2.487221991032129e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 887 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.300518134715026, + "grad_norm": 0.45039286672375983, + "kl": 0.074462890625, + "learning_rate": 7.702072538860103e-07, + "loss": 0.0007, + "reward": 2.4999959468841553, + "reward_std": 2.526859134377446e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 888 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.3031088082901556, + "grad_norm": 2.412222213302715, + "kl": 0.0927734375, + "learning_rate": 7.699481865284973e-07, + "loss": 0.0001, + "reward": 2.4999910593032837, + "reward_std": 7.825213742762571e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 889 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 2.305699481865285, + "grad_norm": 0.42694143596131684, + "kl": 0.0982666015625, + "learning_rate": 7.696891191709844e-07, + "loss": 0.0008, + "reward": 2.4999938011169434, + "reward_std": 2.936749979198794e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.3082901554404147, + "grad_norm": 2.295148355288371, + "kl": 0.0457763671875, + "learning_rate": 7.694300518134714e-07, + "loss": 0.0011, + "reward": 2.4999594688415527, + "reward_std": 2.161479324058746e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999959409236908, + "step": 891 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3108808290155443, + "grad_norm": 1.7524269172846563, + "kl": 0.0533447265625, + "learning_rate": 7.691709844559585e-07, + "loss": 0.0006, + "reward": 2.4999853372573853, + "reward_std": 8.951603717832768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852776527405, + "step": 892 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.313471502590674, + "grad_norm": 39.53500860615621, + "kl": 0.3026123046875, + "learning_rate": 7.689119170984456e-07, + "loss": 0.0012, + "reward": 1.9712463021278381, + "reward_std": 0.0004931320399919059, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4712463021278381, + "step": 893 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3160621761658033, + "grad_norm": 92.22362739534795, + "kl": 0.09173583984375, + "learning_rate": 7.686528497409325e-07, + "loss": 0.0004, + "reward": 2.3743382692337036, + "reward_std": 0.23187917500399635, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8743382096290588, + "step": 894 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.318652849740933, + "grad_norm": 1.3825839477738493, + "kl": 0.07373046875, + "learning_rate": 7.683937823834196e-07, + "loss": 0.0006, + "reward": 2.499995231628418, + "reward_std": 2.7748931188398274e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 895 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.321243523316062, + "grad_norm": 3.516374480380441, + "kl": 0.120849609375, + "learning_rate": 7.681347150259066e-07, + "loss": 0.0003, + "reward": 2.499992251396179, + "reward_std": 6.5902706865017535e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999922513961792, + "step": 896 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 2.3238341968911915, + "grad_norm": 0.1115842924918186, + "kl": 0.0286865234375, + "learning_rate": 7.678756476683938e-07, + "loss": 0.0006, + "reward": 2.4999979734420776, + "reward_std": 1.8105575918525574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 897 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.326424870466321, + "grad_norm": 15.468787981921725, + "kl": 0.099853515625, + "learning_rate": 7.676165803108809e-07, + "loss": -0.0001, + "reward": 2.062446713447571, + "reward_std": 0.17679192748755668, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624468922615051, + "step": 898 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.3290155440414506, + "grad_norm": 10.327332514225484, + "kl": 0.096435546875, + "learning_rate": 7.673575129533679e-07, + "loss": -0.0004, + "reward": 2.499949812889099, + "reward_std": 2.4012788344407454e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999500513076782, + "step": 899 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.33160621761658, + "grad_norm": 1.4606335096405556, + "kl": 0.1181640625, + "learning_rate": 7.670984455958549e-07, + "loss": -0.0005, + "reward": 2.499962568283081, + "reward_std": 1.0645215297699906e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999962568283081, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.3341968911917097, + "grad_norm": 19.277566538444752, + "kl": 0.093017578125, + "learning_rate": 7.668393782383419e-07, + "loss": 0.0003, + "reward": 1.453494369983673, + "reward_std": 0.00031578161724610254, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9534944593906403, + "step": 901 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.3367875647668392, + "grad_norm": 8.665589621672831, + "kl": 0.06396484375, + "learning_rate": 7.66580310880829e-07, + "loss": 0.0008, + "reward": 2.499971628189087, + "reward_std": 2.0828777905990137e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999715089797974, + "step": 902 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.339378238341969, + "grad_norm": 0.7536608764797862, + "kl": 0.06280517578125, + "learning_rate": 7.663212435233161e-07, + "loss": 0.0015, + "reward": 2.499990940093994, + "reward_std": 6.6174688981845975e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 903 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.3419689119170983, + "grad_norm": 0.21060584059268717, + "kl": 0.0931396484375, + "learning_rate": 7.660621761658031e-07, + "loss": 0.0014, + "reward": 2.4999879598617554, + "reward_std": 3.3825172067736275e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 904 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.344559585492228, + "grad_norm": 1.5004328978075863, + "kl": 0.0482177734375, + "learning_rate": 7.658031088082902e-07, + "loss": 0.0008, + "reward": 2.499987244606018, + "reward_std": 1.2173651271041308e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999871850013733, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3471502590673574, + "grad_norm": 33.07205757797885, + "kl": 0.054931640625, + "learning_rate": 7.655440414507772e-07, + "loss": 0.0006, + "reward": 2.2499086260795593, + "reward_std": 0.26734173376803483, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499086260795593, + "step": 906 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 2.349740932642487, + "grad_norm": 2.18528303977387, + "kl": 0.04974365234375, + "learning_rate": 7.652849740932642e-07, + "loss": 0.0005, + "reward": 2.4999897480010986, + "reward_std": 7.875904884713236e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989628791809, + "step": 907 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.3523316062176165, + "grad_norm": 5.355271983729303, + "kl": 0.0596923828125, + "learning_rate": 7.650259067357513e-07, + "loss": 0.0009, + "reward": 1.9944500923156738, + "reward_std": 5.3642829357158917e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4944500923156738, + "step": 908 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.354922279792746, + "grad_norm": 0.05280843014210008, + "kl": 0.03253173828125, + "learning_rate": 7.647668393782383e-07, + "loss": -0.0009, + "reward": 2.4999947547912598, + "reward_std": 1.510143363248062e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 909 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3575129533678756, + "grad_norm": 2.7276928409306374, + "kl": 0.05859375, + "learning_rate": 7.645077720207254e-07, + "loss": 0.0001, + "reward": 2.499974250793457, + "reward_std": 1.9628562483831047e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999974250793457, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.360103626943005, + "grad_norm": 154.93996035639665, + "kl": 0.147705078125, + "learning_rate": 7.642487046632125e-07, + "loss": 0.0006, + "reward": 1.9995468854904175, + "reward_std": 0.5180795788764954, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995468854904175, + "step": 911 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.3626943005181347, + "grad_norm": 0.6368972116481264, + "kl": 0.06591796875, + "learning_rate": 7.639896373056994e-07, + "loss": 0.0003, + "reward": 2.499996542930603, + "reward_std": 5.453087851492455e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 912 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.365284974093264, + "grad_norm": 0.16255539230943153, + "kl": 0.03662109375, + "learning_rate": 7.637305699481865e-07, + "loss": 0.0013, + "reward": 2.4999961853027344, + "reward_std": 1.5942108859690052e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 913 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 2.3678756476683938, + "grad_norm": 108.95478457970114, + "kl": 0.1787109375, + "learning_rate": 7.634715025906735e-07, + "loss": 0.0003, + "reward": 1.9936450719833374, + "reward_std": 0.00015028398547656252, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4936451017856598, + "step": 914 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3704663212435233, + "grad_norm": 24.053330752179644, + "kl": 0.16162109375, + "learning_rate": 7.632124352331606e-07, + "loss": 0.0012, + "reward": 2.437483787536621, + "reward_std": 0.17678690779575845, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937483787536621, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.373056994818653, + "grad_norm": 0.20616139926389984, + "kl": 0.047607421875, + "learning_rate": 7.629533678756477e-07, + "loss": 0.0002, + "reward": 2.49999737739563, + "reward_std": 1.8199588680545276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 916 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.3756476683937824, + "grad_norm": 0.34134283785246666, + "kl": 0.24176025390625, + "learning_rate": 7.626943005181347e-07, + "loss": 0.0016, + "reward": 2.499991536140442, + "reward_std": 8.67958056005591e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 917 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.378238341968912, + "grad_norm": 7.2871362681377425, + "kl": 0.0416259765625, + "learning_rate": 7.624352331606217e-07, + "loss": 0.0003, + "reward": 2.4999892711639404, + "reward_std": 1.1855105071845173e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999892711639404, + "step": 918 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.3808290155440415, + "grad_norm": 5.71933266264719, + "kl": 0.09912109375, + "learning_rate": 7.621761658031087e-07, + "loss": 0.0002, + "reward": 2.4998656511306763, + "reward_std": 7.985772032270688e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998657703399658, + "step": 919 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.383419689119171, + "grad_norm": 4.245496843338156, + "kl": 0.1533203125, + "learning_rate": 7.619170984455958e-07, + "loss": 0.0012, + "reward": 2.499936819076538, + "reward_std": 1.3957213013782166e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999936580657959, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3860103626943006, + "grad_norm": 17.353312604194592, + "kl": 0.037841796875, + "learning_rate": 7.616580310880829e-07, + "loss": 0.0004, + "reward": 2.4999512434005737, + "reward_std": 1.4205514162313193e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999951183795929, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.38860103626943, + "grad_norm": 15.241568110794935, + "kl": 0.0609130859375, + "learning_rate": 7.613989637305699e-07, + "loss": 0.001, + "reward": 1.997856318950653, + "reward_std": 0.000340745203914139, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978562593460083, + "step": 922 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.3911917098445596, + "grad_norm": 0.3888835281284561, + "kl": 0.099853515625, + "learning_rate": 7.61139896373057e-07, + "loss": 0.0007, + "reward": 2.499985098838806, + "reward_std": 4.012306021650147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999849200248718, + "step": 923 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.393782383419689, + "grad_norm": 2.001822140388647, + "kl": 0.14794921875, + "learning_rate": 7.608808290155439e-07, + "loss": -0.0002, + "reward": 2.4998509883880615, + "reward_std": 3.331619564050925e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998509287834167, + "step": 924 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3963730569948187, + "grad_norm": 6.80795492525771, + "kl": 0.05853271484375, + "learning_rate": 7.60621761658031e-07, + "loss": 0.0005, + "reward": 2.4999635219573975, + "reward_std": 3.1137228916122694e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999634623527527, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.3989637305699483, + "grad_norm": 10.257577291307154, + "kl": 0.101806640625, + "learning_rate": 7.603626943005181e-07, + "loss": 0.0007, + "reward": 2.499861001968384, + "reward_std": 4.529693251242861e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998611211776733, + "step": 926 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.401554404145078, + "grad_norm": 891.1018910361465, + "kl": 0.135986328125, + "learning_rate": 7.601036269430051e-07, + "loss": 0.0001, + "reward": 1.9697346687316895, + "reward_std": 0.004559659611004463, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.469734638929367, + "step": 927 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.4041450777202074, + "grad_norm": 24.367917946252145, + "kl": 0.0771484375, + "learning_rate": 7.598445595854922e-07, + "loss": 0.0004, + "reward": 1.9998607635498047, + "reward_std": 6.505734745587688e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998607635498047, + "step": 928 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.406735751295337, + "grad_norm": 0.8639193938537891, + "kl": 0.215576171875, + "learning_rate": 7.595854922279792e-07, + "loss": 0.002, + "reward": 1.499997854232788, + "reward_std": 1.9369217625353485e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999979138374329, + "step": 929 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.4093264248704664, + "grad_norm": 44.83049413648358, + "kl": 0.14306640625, + "learning_rate": 7.593264248704662e-07, + "loss": 0.0005, + "reward": 1.4576206803321838, + "reward_std": 0.07640038783756609, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9576206803321838, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.411917098445596, + "grad_norm": 9.925248483551462, + "kl": 0.0693359375, + "learning_rate": 7.590673575129533e-07, + "loss": 0.001, + "reward": 2.312297821044922, + "reward_std": 0.2588079248100712, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8122978210449219, + "step": 931 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.4145077720207255, + "grad_norm": 10.712344689927866, + "kl": 0.111572265625, + "learning_rate": 7.588082901554403e-07, + "loss": 0.0005, + "reward": 1.910188615322113, + "reward_std": 0.00021999774787673232, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4101886749267578, + "step": 932 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 2.417098445595855, + "grad_norm": 127.68799003037205, + "kl": 0.083984375, + "learning_rate": 7.585492227979274e-07, + "loss": 0.0, + "reward": 2.1846303939819336, + "reward_std": 0.2610150386326211, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6846305131912231, + "step": 933 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.4196891191709846, + "grad_norm": 135.10797732635731, + "kl": 0.111328125, + "learning_rate": 7.582901554404145e-07, + "loss": 0.0012, + "reward": 2.4999747276306152, + "reward_std": 2.5614713194954675e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999747276306152, + "step": 934 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.422279792746114, + "grad_norm": 6.729782260653362, + "kl": 0.086669921875, + "learning_rate": 7.580310880829015e-07, + "loss": 0.0009, + "reward": 2.4998395442962646, + "reward_std": 7.537870351370657e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998394846916199, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.4248704663212437, + "grad_norm": 0.8001945007482641, + "kl": 0.111572265625, + "learning_rate": 7.577720207253885e-07, + "loss": 0.0007, + "reward": 1.999828815460205, + "reward_std": 1.1383938954168116e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998287558555603, + "step": 936 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.4274611398963732, + "grad_norm": 1.087960734243812, + "kl": 0.06793212890625, + "learning_rate": 7.575129533678755e-07, + "loss": -0.0005, + "reward": 2.4999839067459106, + "reward_std": 4.708190999735962e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999840259552002, + "step": 937 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.4300518134715023, + "grad_norm": 46.59050477781762, + "kl": 0.0738525390625, + "learning_rate": 7.572538860103626e-07, + "loss": 0.001, + "reward": 2.4999462366104126, + "reward_std": 3.068698788410984e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999946117401123, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.432642487046632, + "grad_norm": 20.514791087029103, + "kl": 0.16015625, + "learning_rate": 7.569948186528498e-07, + "loss": 0.001, + "reward": 1.9996665716171265, + "reward_std": 4.577649838211073e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996665120124817, + "step": 939 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.4352331606217614, + "grad_norm": 0.7944817643438287, + "kl": 0.021881103515625, + "learning_rate": 7.567357512953368e-07, + "loss": -0.0002, + "reward": 1.9991827011108398, + "reward_std": 3.211699061012041e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991827309131622, + "step": 940 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.437823834196891, + "grad_norm": 1.3239401855227448, + "kl": 0.1572265625, + "learning_rate": 7.564766839378239e-07, + "loss": 0.001, + "reward": 2.4999938011169434, + "reward_std": 6.030892564012902e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993622303009, + "step": 941 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.4404145077720205, + "grad_norm": 0.713359777445594, + "kl": 0.103515625, + "learning_rate": 7.562176165803108e-07, + "loss": 0.0004, + "reward": 2.4999849796295166, + "reward_std": 7.23069092600781e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 942 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.44300518134715, + "grad_norm": 1.1428786990802968, + "kl": 0.04083251953125, + "learning_rate": 7.559585492227979e-07, + "loss": 0.0005, + "reward": 2.4999879598617554, + "reward_std": 1.0063386071124114e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987781047821, + "step": 943 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 2.4455958549222796, + "grad_norm": 48.60938521682109, + "kl": 0.103759765625, + "learning_rate": 7.55699481865285e-07, + "loss": -0.0001, + "reward": 1.9995916485786438, + "reward_std": 0.00011154991989315022, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995917081832886, + "step": 944 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.448186528497409, + "grad_norm": 16.40060561521158, + "kl": 0.058837890625, + "learning_rate": 7.55440414507772e-07, + "loss": -0.0008, + "reward": 1.8128111362457275, + "reward_std": 0.00039212397177834646, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3128113150596619, + "step": 945 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.1875, + "epoch": 2.4507772020725387, + "grad_norm": 1.129085775623217, + "kl": 0.0521240234375, + "learning_rate": 7.551813471502591e-07, + "loss": -0.0009, + "reward": 2.4999914169311523, + "reward_std": 8.720686196284078e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 946 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.4533678756476682, + "grad_norm": 11.128557764822244, + "kl": 0.115234375, + "learning_rate": 7.549222797927461e-07, + "loss": -0.0006, + "reward": 2.4998854398727417, + "reward_std": 0.0002141268014383968, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998857975006104, + "step": 947 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.4559585492227978, + "grad_norm": 7.853008323011336, + "kl": 0.2027587890625, + "learning_rate": 7.546632124352331e-07, + "loss": 0.0008, + "reward": 1.9998813271522522, + "reward_std": 3.5622451719063974e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499881386756897, + "step": 948 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.4585492227979273, + "grad_norm": 136.70544011738588, + "kl": 1.00128173828125, + "learning_rate": 7.544041450777202e-07, + "loss": 0.0049, + "reward": 2.4353466033935547, + "reward_std": 0.18270384752031532, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9353466033935547, + "step": 949 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.461139896373057, + "grad_norm": 0.6316161613629311, + "kl": 0.154296875, + "learning_rate": 7.541450777202072e-07, + "loss": 0.0002, + "reward": 2.4999895095825195, + "reward_std": 7.198604862423963e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989628791809, + "step": 950 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.4637305699481864, + "grad_norm": 1.9242499639220416, + "kl": 0.1513671875, + "learning_rate": 7.538860103626943e-07, + "loss": 0.0013, + "reward": 2.4999783039093018, + "reward_std": 7.100676043592102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781847000122, + "step": 951 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.466321243523316, + "grad_norm": 2.061044326704196, + "kl": 0.1064453125, + "learning_rate": 7.536269430051813e-07, + "loss": 0.0012, + "reward": 2.49991512298584, + "reward_std": 1.4621544096371508e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999150037765503, + "step": 952 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.4689119170984455, + "grad_norm": 0.15424854191835632, + "kl": 0.1015625, + "learning_rate": 7.533678756476684e-07, + "loss": 0.0006, + "reward": 2.499998092651367, + "reward_std": 2.2490693254439975e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 953 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.25, + "epoch": 2.471502590673575, + "grad_norm": 21.108187479284748, + "kl": 0.0908203125, + "learning_rate": 7.531088082901554e-07, + "loss": 0.0002, + "reward": 2.3197872638702393, + "reward_std": 0.33367914653513253, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.819787323474884, + "step": 954 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.4740932642487046, + "grad_norm": 9.150908923727552, + "kl": 0.126708984375, + "learning_rate": 7.528497409326424e-07, + "loss": 0.0005, + "reward": 2.4998146295547485, + "reward_std": 4.8566987743470236e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998146295547485, + "step": 955 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.476683937823834, + "grad_norm": 6.865160570357046, + "kl": 0.11669921875, + "learning_rate": 7.525906735751295e-07, + "loss": 0.0004, + "reward": 1.6648834943771362, + "reward_std": 0.2318227205250878, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1648836135864258, + "step": 956 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.4792746113989637, + "grad_norm": 1.5753853531041222, + "kl": 0.20654296875, + "learning_rate": 7.523316062176166e-07, + "loss": 0.0006, + "reward": 2.499989867210388, + "reward_std": 9.547110607854847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999898672103882, + "step": 957 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.481865284974093, + "grad_norm": 1.7271326723198381, + "kl": 0.0703125, + "learning_rate": 7.520725388601036e-07, + "loss": -0.0005, + "reward": 2.499770760536194, + "reward_std": 1.9740587504202267e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9997708797454834, + "step": 958 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.4844559585492227, + "grad_norm": 23.183072704143797, + "kl": 0.127197265625, + "learning_rate": 7.518134715025907e-07, + "loss": 0.0005, + "reward": 2.4374130964279175, + "reward_std": 0.17691215011655004, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374129176139832, + "step": 959 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 2.4870466321243523, + "grad_norm": 13.736891689688672, + "kl": 0.139404296875, + "learning_rate": 7.515544041450776e-07, + "loss": 0.0007, + "reward": 2.3749916553497314, + "reward_std": 0.3535528115900206, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749916553497314, + "step": 960 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 2.489637305699482, + "grad_norm": 0.4082768571347582, + "kl": 0.0653076171875, + "learning_rate": 7.512953367875647e-07, + "loss": 0.0011, + "reward": 2.499945878982544, + "reward_std": 7.201321636784996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999457597732544, + "step": 961 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.4922279792746114, + "grad_norm": 1.5750787573387628, + "kl": 0.070343017578125, + "learning_rate": 7.510362694300518e-07, + "loss": 0.0008, + "reward": 2.499985098838806, + "reward_std": 9.421758363714616e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985158443451, + "step": 962 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.494818652849741, + "grad_norm": 7.2659527944081415, + "kl": 0.0777587890625, + "learning_rate": 7.507772020725388e-07, + "loss": -0.0006, + "reward": 1.9931894540786743, + "reward_std": 0.00021563546283687174, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4931894838809967, + "step": 963 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.4974093264248705, + "grad_norm": 29.696451106721135, + "kl": 0.1875, + "learning_rate": 7.505181347150259e-07, + "loss": 0.0014, + "reward": 2.061533212661743, + "reward_std": 0.17716989700329577, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5615330934524536, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.5, + "grad_norm": 2.962726576057743, + "kl": 0.04034423828125, + "learning_rate": 7.502590673575129e-07, + "loss": -0.0003, + "reward": 2.499897003173828, + "reward_std": 2.4297271693285438e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999897062778473, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.5025906735751295, + "grad_norm": 1.4385157729398097, + "kl": 0.12890625, + "learning_rate": 7.5e-07, + "loss": 0.0022, + "reward": 2.499987840652466, + "reward_std": 7.570292268610501e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.505181347150259, + "grad_norm": 4.265232340889757, + "kl": 0.069091796875, + "learning_rate": 7.49740932642487e-07, + "loss": 0.0001, + "reward": 2.49997341632843, + "reward_std": 1.600029895598709e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973475933075, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.5077720207253886, + "grad_norm": 35.721329691325536, + "kl": 0.08642578125, + "learning_rate": 7.49481865284974e-07, + "loss": 0.0003, + "reward": 2.1772468090057373, + "reward_std": 0.5042813867330551, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6772468090057373, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.510362694300518, + "grad_norm": 0.40477350161334574, + "kl": 0.094970703125, + "learning_rate": 7.492227979274611e-07, + "loss": 0.0011, + "reward": 2.4999948740005493, + "reward_std": 3.273613515375473e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.5129533678756477, + "grad_norm": 9.855248488100647, + "kl": 0.04345703125, + "learning_rate": 7.489637305699481e-07, + "loss": 0.0006, + "reward": 1.999349296092987, + "reward_std": 4.114194223348022e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499349057674408, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.5155440414507773, + "grad_norm": 1.087687288678207, + "kl": 0.109375, + "learning_rate": 7.487046632124352e-07, + "loss": -0.0003, + "reward": 2.499978542327881, + "reward_std": 6.4177226022366085e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999786019325256, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.518134715025907, + "grad_norm": 7.600420496587052, + "kl": 0.1429443359375, + "learning_rate": 7.484455958549223e-07, + "loss": 0.0004, + "reward": 1.8474342823028564, + "reward_std": 0.0005369920093016844, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3474342823028564, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.5207253886010363, + "grad_norm": 0.5178823709045375, + "kl": 0.09814453125, + "learning_rate": 7.481865284974092e-07, + "loss": 0.001, + "reward": 2.499987244606018, + "reward_std": 6.464601426614536e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.523316062176166, + "grad_norm": 15.363694591604116, + "kl": 0.1513671875, + "learning_rate": 7.479274611398963e-07, + "loss": 0.0007, + "reward": 2.041240632534027, + "reward_std": 0.1853715334766548, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.541240632534027, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.5259067357512954, + "grad_norm": 9.394894754562163, + "kl": 0.1845703125, + "learning_rate": 7.476683937823833e-07, + "loss": 0.001, + "reward": 0.9998171329498291, + "reward_std": 3.0216364393709227e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.49981701374053955, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 2.528497409326425, + "grad_norm": 2.413181954778754, + "kl": 0.06842041015625, + "learning_rate": 7.474093264248704e-07, + "loss": 0.0015, + "reward": 2.4999775886535645, + "reward_std": 8.515007039022748e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999776482582092, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.375, + "epoch": 2.5310880829015545, + "grad_norm": 4.414030042445875, + "kl": 0.77783203125, + "learning_rate": 7.471502590673575e-07, + "loss": 0.0034, + "reward": 2.499969482421875, + "reward_std": 1.346209126040776e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999695420265198, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.533678756476684, + "grad_norm": 25.606485647980744, + "kl": 0.115234375, + "learning_rate": 7.468911917098445e-07, + "loss": 0.0008, + "reward": 1.9646154046058655, + "reward_std": 0.005181904838082119, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4646154046058655, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.5362694300518136, + "grad_norm": 12.262696342722542, + "kl": 0.240234375, + "learning_rate": 7.466321243523315e-07, + "loss": 0.0011, + "reward": 1.9309902787208557, + "reward_std": 0.02750369685691112, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4309902787208557, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.538860103626943, + "grad_norm": 1.9008284492847578, + "kl": 0.089111328125, + "learning_rate": 7.463730569948187e-07, + "loss": 0.0, + "reward": 2.499983072280884, + "reward_std": 1.5114326970433467e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999829530715942, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.5414507772020727, + "grad_norm": 33.828322402233084, + "kl": 0.12628173828125, + "learning_rate": 7.461139896373057e-07, + "loss": 0.001, + "reward": 1.854383409023285, + "reward_std": 0.00570840007321749, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.354383409023285, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.5440414507772022, + "grad_norm": 13.444261598852792, + "kl": 0.085205078125, + "learning_rate": 7.458549222797928e-07, + "loss": 0.0011, + "reward": 1.9873749017715454, + "reward_std": 0.0007185070289779105, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4873749017715454, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.375, + "epoch": 2.5466321243523318, + "grad_norm": 1.0906646810171143, + "kl": 0.0799560546875, + "learning_rate": 7.455958549222798e-07, + "loss": 0.001, + "reward": 2.4999756813049316, + "reward_std": 8.79958633959177e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999756217002869, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 2.5492227979274613, + "grad_norm": 3.946590977551778, + "kl": 0.150634765625, + "learning_rate": 7.453367875647669e-07, + "loss": 0.0002, + "reward": 2.4999853372573853, + "reward_std": 1.792325082305979e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998539686203, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.25, + "epoch": 2.551813471502591, + "grad_norm": 2.6119679904943442, + "kl": 0.07208251953125, + "learning_rate": 7.450777202072539e-07, + "loss": 0.001, + "reward": 2.499967098236084, + "reward_std": 1.539709683129331e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999671578407288, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.0625, + "epoch": 2.5544041450777204, + "grad_norm": 0.24661692718260644, + "kl": 0.03045654296875, + "learning_rate": 7.448186528497409e-07, + "loss": 0.0004, + "reward": 2.499992251396179, + "reward_std": 3.850713937936234e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.9375, + "epoch": 2.55699481865285, + "grad_norm": 0.3155136756021828, + "kl": 0.122802734375, + "learning_rate": 7.44559585492228e-07, + "loss": 0.0009, + "reward": 2.499970316886902, + "reward_std": 3.5425043733994244e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999701976776123, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8125, + "epoch": 2.5595854922279795, + "grad_norm": 5.431564811546383, + "kl": 0.185302734375, + "learning_rate": 7.44300518134715e-07, + "loss": 0.0003, + "reward": 1.9998611211776733, + "reward_std": 7.525141654696199e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998612105846405, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 60.6875, + "epoch": 2.562176165803109, + "grad_norm": 1.4166366231620315, + "kl": 0.083251953125, + "learning_rate": 7.440414507772021e-07, + "loss": 0.0011, + "reward": 2.499987244606018, + "reward_std": 1.039934113578056e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.5625, + "epoch": 2.5647668393782386, + "grad_norm": 6.87959332455512, + "kl": 0.123779296875, + "learning_rate": 7.437823834196892e-07, + "loss": 0.0002, + "reward": 1.9982973337173462, + "reward_std": 5.501119630935136e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4982974231243134, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.1875, + "epoch": 2.567357512953368, + "grad_norm": 2.739631087777935, + "kl": 0.203125, + "learning_rate": 7.435233160621761e-07, + "loss": 0.0012, + "reward": 2.499994993209839, + "reward_std": 3.6262600815462065e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.3125, + "epoch": 2.5699481865284977, + "grad_norm": 4.967001902255783, + "kl": 0.2373046875, + "learning_rate": 7.432642487046632e-07, + "loss": 0.0015, + "reward": 1.9893280267715454, + "reward_std": 0.00013621033531308058, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4893280565738678, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5625, + "epoch": 2.572538860103627, + "grad_norm": 17.900996091287624, + "kl": 0.3427734375, + "learning_rate": 7.430051813471502e-07, + "loss": 0.0013, + "reward": 1.2730909585952759, + "reward_std": 0.0006147078383946791, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7730909287929535, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8125, + "epoch": 2.5751295336787567, + "grad_norm": 1.033754492387028, + "kl": 0.685546875, + "learning_rate": 7.427461139896373e-07, + "loss": 0.0024, + "reward": 1.9999675154685974, + "reward_std": 5.680035656041582e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999675452709198, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0625, + "epoch": 2.5777202072538863, + "grad_norm": 0.20144272289860132, + "kl": 0.36328125, + "learning_rate": 7.424870466321244e-07, + "loss": 0.0013, + "reward": 2.4999979734420776, + "reward_std": 1.462866350721015e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.75, + "epoch": 2.5803108808290154, + "grad_norm": 0.45093562742884274, + "kl": 0.48046875, + "learning_rate": 7.422279792746114e-07, + "loss": 0.0018, + "reward": 2.499994993209839, + "reward_std": 3.7291632679625764e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.3125, + "epoch": 2.582901554404145, + "grad_norm": 6.023124875685409, + "kl": 0.5810546875, + "learning_rate": 7.419689119170984e-07, + "loss": 0.0028, + "reward": 1.8036752939224243, + "reward_std": 0.000546291637874674, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3036752939224243, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5, + "epoch": 2.5854922279792745, + "grad_norm": 4.735721531319997, + "kl": 0.521484375, + "learning_rate": 7.417098445595854e-07, + "loss": 0.0023, + "reward": 2.499949336051941, + "reward_std": 3.642402725745342e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999492764472961, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.25, + "epoch": 2.588082901554404, + "grad_norm": 0.3355792142965779, + "kl": 0.46484375, + "learning_rate": 7.414507772020725e-07, + "loss": 0.0017, + "reward": 2.499997615814209, + "reward_std": 1.1579040801734664e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.125, + "epoch": 2.5906735751295336, + "grad_norm": 0.7364718498592124, + "kl": 0.4345703125, + "learning_rate": 7.411917098445596e-07, + "loss": 0.0029, + "reward": 2.4999947547912598, + "reward_std": 3.090572135988623e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.875, + "epoch": 2.593264248704663, + "grad_norm": 1.4080007315033172, + "kl": 0.541015625, + "learning_rate": 7.409326424870466e-07, + "loss": 0.0003, + "reward": 2.499993324279785, + "reward_std": 5.6037465583358426e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935030937195, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.5625, + "epoch": 2.5958549222797926, + "grad_norm": 0.9808436745919166, + "kl": 0.55859375, + "learning_rate": 7.406735751295337e-07, + "loss": 0.0028, + "reward": 2.4999938011169434, + "reward_std": 3.050547888960864e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.125, + "epoch": 2.598445595854922, + "grad_norm": 0.3646312804268175, + "kl": 0.74609375, + "learning_rate": 7.404145077720207e-07, + "loss": 0.0037, + "reward": 2.4999972581863403, + "reward_std": 1.649128250846843e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.25, + "epoch": 2.6010362694300517, + "grad_norm": 12.285086371569555, + "kl": 0.53515625, + "learning_rate": 7.401554404145077e-07, + "loss": 0.002, + "reward": 2.499593496322632, + "reward_std": 0.0001344580450677313, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995936155319214, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.9375, + "epoch": 2.6036269430051813, + "grad_norm": 1.9874453432442083, + "kl": 0.349609375, + "learning_rate": 7.398963730569948e-07, + "loss": 0.0019, + "reward": 2.4999794960021973, + "reward_std": 9.808404001887538e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999794960021973, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0, + "epoch": 2.606217616580311, + "grad_norm": 130.16210823613784, + "kl": 0.4453125, + "learning_rate": 7.396373056994818e-07, + "loss": 0.0019, + "reward": 1.8914172649383545, + "reward_std": 0.24591282738583686, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3914174139499664, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.375, + "epoch": 2.6088082901554404, + "grad_norm": 5.034697211886495, + "kl": 0.615234375, + "learning_rate": 7.393782383419689e-07, + "loss": 0.0026, + "reward": 2.4999823570251465, + "reward_std": 1.1671747188302106e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.25, + "epoch": 2.61139896373057, + "grad_norm": 1.3056685843873292, + "kl": 0.529296875, + "learning_rate": 7.39119170984456e-07, + "loss": 0.0024, + "reward": 2.499987840652466, + "reward_std": 1.034896058627055e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999878406524658, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 58.3125, + "epoch": 2.6139896373056994, + "grad_norm": 15.471134858569846, + "kl": 0.306640625, + "learning_rate": 7.388601036269429e-07, + "loss": 0.0015, + "reward": 2.4999314546585083, + "reward_std": 0.0001298900234019129, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999313950538635, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.5, + "epoch": 2.616580310880829, + "grad_norm": 1.327780629829245, + "kl": 0.5419921875, + "learning_rate": 7.3860103626943e-07, + "loss": 0.0023, + "reward": 2.4999128580093384, + "reward_std": 1.5311324091271672e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999127388000488, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.875, + "epoch": 2.6191709844559585, + "grad_norm": 44.82114208935299, + "kl": 0.248046875, + "learning_rate": 7.38341968911917e-07, + "loss": 0.0006, + "reward": 2.408591866493225, + "reward_std": 0.2584836309633829, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9085916876792908, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.75, + "epoch": 2.621761658031088, + "grad_norm": 71.43867037182878, + "kl": 0.3359375, + "learning_rate": 7.380829015544041e-07, + "loss": 0.0014, + "reward": 1.8336089849472046, + "reward_std": 0.1902480730204843, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3336089849472046, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.125, + "epoch": 2.6243523316062176, + "grad_norm": 1.5182677244338434, + "kl": 0.110107421875, + "learning_rate": 7.378238341968912e-07, + "loss": -0.0003, + "reward": 2.4999879598617554, + "reward_std": 7.340728188864887e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881386756897, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.5625, + "epoch": 2.626943005181347, + "grad_norm": 7.23031118387489, + "kl": 0.2607421875, + "learning_rate": 7.375647668393782e-07, + "loss": 0.0012, + "reward": 2.374991297721863, + "reward_std": 0.3535586022121606, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749911785125732, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.25, + "epoch": 2.6295336787564767, + "grad_norm": 23.21657081552542, + "kl": 0.14208984375, + "learning_rate": 7.373056994818652e-07, + "loss": 0.0003, + "reward": 2.419443368911743, + "reward_std": 0.227821338423837, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9194433093070984, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.125, + "epoch": 2.6321243523316062, + "grad_norm": 1.5175973804165943, + "kl": 0.22998046875, + "learning_rate": 7.370466321243522e-07, + "loss": 0.0005, + "reward": 2.4999619722366333, + "reward_std": 1.5150643491779192e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999618530273438, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.634715025906736, + "grad_norm": 19.38826630890325, + "kl": 0.03643798828125, + "learning_rate": 7.367875647668393e-07, + "loss": 0.0, + "reward": 2.430171489715576, + "reward_std": 0.19747968364572444, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9301713705062866, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 2.6373056994818653, + "grad_norm": 34.33573971874138, + "kl": 0.61767578125, + "learning_rate": 7.365284974093264e-07, + "loss": 0.0024, + "reward": 2.426503539085388, + "reward_std": 0.20768518514887546, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.926503598690033, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.639896373056995, + "grad_norm": 0.14159952598560224, + "kl": 0.0701904296875, + "learning_rate": 7.362694300518134e-07, + "loss": 0.0001, + "reward": 2.4999964237213135, + "reward_std": 2.538123453632579e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.3125, + "epoch": 2.6424870466321244, + "grad_norm": 1.3117707935190224, + "kl": 0.15234375, + "learning_rate": 7.360103626943005e-07, + "loss": 0.0007, + "reward": 2.4999629259109497, + "reward_std": 1.1376158965958894e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999962866306305, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.645077720207254, + "grad_norm": 4.321192344468887, + "kl": 0.054931640625, + "learning_rate": 7.357512953367874e-07, + "loss": 0.0006, + "reward": 2.4999568462371826, + "reward_std": 2.5446793870287365e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999569654464722, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 2.6476683937823835, + "grad_norm": 0.2925598934588983, + "kl": 0.11279296875, + "learning_rate": 7.354922279792745e-07, + "loss": 0.0006, + "reward": 2.4999924898147583, + "reward_std": 4.172429157733859e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.650259067357513, + "grad_norm": 0.5271870926498459, + "kl": 0.1005859375, + "learning_rate": 7.352331606217617e-07, + "loss": 0.0008, + "reward": 2.4999935626983643, + "reward_std": 3.88533538853153e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.6528497409326426, + "grad_norm": 68.07486583797518, + "kl": 0.0644073486328125, + "learning_rate": 7.349740932642487e-07, + "loss": -0.0, + "reward": 1.9981709122657776, + "reward_std": 0.00027494916307091444, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981709420681, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.655440414507772, + "grad_norm": 6.229073155096826, + "kl": 0.084228515625, + "learning_rate": 7.347150259067358e-07, + "loss": 0.0003, + "reward": 2.4998949766159058, + "reward_std": 3.0161281188156863e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998949766159058, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.3125, + "epoch": 2.6580310880829017, + "grad_norm": 0.2191640383136181, + "kl": 0.1220703125, + "learning_rate": 7.344559585492228e-07, + "loss": 0.0006, + "reward": 2.499994397163391, + "reward_std": 3.1330532692663837e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.660621761658031, + "grad_norm": 3.0848741798994324, + "kl": 0.177978515625, + "learning_rate": 7.341968911917098e-07, + "loss": 0.0011, + "reward": 2.4999797344207764, + "reward_std": 2.4030719714573934e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999796152114868, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.6632124352331608, + "grad_norm": 2.4537002197967426, + "kl": 0.12548828125, + "learning_rate": 7.339378238341969e-07, + "loss": 0.0007, + "reward": 1.9999439716339111, + "reward_std": 8.941842679632828e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499943882226944, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.6658031088082903, + "grad_norm": 10.457539193706586, + "kl": 0.072509765625, + "learning_rate": 7.336787564766839e-07, + "loss": 0.0001, + "reward": 2.499864101409912, + "reward_std": 5.268064876418066e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998640418052673, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.66839378238342, + "grad_norm": 1.969276188527305, + "kl": 0.156494140625, + "learning_rate": 7.33419689119171e-07, + "loss": 0.0005, + "reward": 2.49992573261261, + "reward_std": 1.4278006801760057e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999257326126099, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.6709844559585494, + "grad_norm": 0.6614771318823214, + "kl": 0.035797119140625, + "learning_rate": 7.331606217616581e-07, + "loss": 0.0003, + "reward": 2.4999916553497314, + "reward_std": 5.722366040572524e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918937683105, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 2.6735751295336785, + "grad_norm": 9.250982285255036, + "kl": 0.083740234375, + "learning_rate": 7.329015544041451e-07, + "loss": -0.0007, + "reward": 1.9418030977249146, + "reward_std": 0.026353379398642574, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4418033063411713, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.676165803108808, + "grad_norm": 7.13614460442719, + "kl": 0.1533203125, + "learning_rate": 7.326424870466321e-07, + "loss": 0.0013, + "reward": 1.9997770190238953, + "reward_std": 2.4907179749789066e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997769594192505, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.6787564766839376, + "grad_norm": 0.2109501221571532, + "kl": 0.138916015625, + "learning_rate": 7.323834196891191e-07, + "loss": 0.0007, + "reward": 2.4999948740005493, + "reward_std": 4.7437108605663525e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.125, + "epoch": 2.681347150259067, + "grad_norm": 18.849485083660635, + "kl": 0.0860595703125, + "learning_rate": 7.321243523316062e-07, + "loss": 0.0003, + "reward": 1.9851142168045044, + "reward_std": 0.00023543618112853437, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4851142168045044, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.6839378238341967, + "grad_norm": 2.055886137768087, + "kl": 0.0693359375, + "learning_rate": 7.318652849740933e-07, + "loss": 0.0007, + "reward": 1.9888790845870972, + "reward_std": 4.5920855654912884e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4888788759708405, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.686528497409326, + "grad_norm": 1.5836413190343166, + "kl": 0.080322265625, + "learning_rate": 7.316062176165803e-07, + "loss": 0.0003, + "reward": 1.9970427751541138, + "reward_std": 4.156149918799201e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4970427453517914, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.6891191709844557, + "grad_norm": 0.19613889601357332, + "kl": 0.10400390625, + "learning_rate": 7.313471502590674e-07, + "loss": 0.0008, + "reward": 2.49999737739563, + "reward_std": 2.947950179077452e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.6917098445595853, + "grad_norm": 16.722013357393276, + "kl": 0.0987548828125, + "learning_rate": 7.310880829015543e-07, + "loss": 0.0002, + "reward": 1.999333918094635, + "reward_std": 0.0007027474280221213, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993338882923126, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 2.694300518134715, + "grad_norm": 45.54708241850775, + "kl": 0.11181640625, + "learning_rate": 7.308290155440414e-07, + "loss": 0.0013, + "reward": 1.9994693994522095, + "reward_std": 5.113296063541384e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994693994522095, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.6968911917098444, + "grad_norm": 16.11110576877349, + "kl": 0.0758056640625, + "learning_rate": 7.305699481865285e-07, + "loss": -0.0001, + "reward": 1.8848603963851929, + "reward_std": 0.0013421574876701925, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3848604559898376, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.699481865284974, + "grad_norm": 10.21808335888086, + "kl": 0.114501953125, + "learning_rate": 7.303108808290155e-07, + "loss": 0.0005, + "reward": 2.499887228012085, + "reward_std": 2.3803705516911577e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998871088027954, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.7020725388601035, + "grad_norm": 1.219825400985413, + "kl": 0.128173828125, + "learning_rate": 7.300518134715026e-07, + "loss": 0.0004, + "reward": 2.499994158744812, + "reward_std": 3.3542859227964072e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.704663212435233, + "grad_norm": 0.20517865696171975, + "kl": 0.118896484375, + "learning_rate": 7.297927461139896e-07, + "loss": 0.0014, + "reward": 2.4999985694885254, + "reward_std": 1.8641724182089092e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7072538860103625, + "grad_norm": 0.7543048876017406, + "kl": 0.13623046875, + "learning_rate": 7.295336787564766e-07, + "loss": -0.0003, + "reward": 1.9998985528945923, + "reward_std": 9.980089657801727e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998984932899475, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 2.709844559585492, + "grad_norm": 4.39416196220655, + "kl": 0.100341796875, + "learning_rate": 7.292746113989637e-07, + "loss": 0.0013, + "reward": 1.7546041011810303, + "reward_std": 0.0002688942377062631, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2546040415763855, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.7124352331606216, + "grad_norm": 0.07013489109114594, + "kl": 0.11376953125, + "learning_rate": 7.290155440414507e-07, + "loss": 0.0003, + "reward": 2.4999940395355225, + "reward_std": 1.7276465769100469e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.715025906735751, + "grad_norm": 0.6013728295247106, + "kl": 0.08740234375, + "learning_rate": 7.287564766839378e-07, + "loss": 0.0003, + "reward": 2.4999935626983643, + "reward_std": 4.070469401540322e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7176165803108807, + "grad_norm": 2.9610718772419036, + "kl": 0.10687255859375, + "learning_rate": 7.284974093264248e-07, + "loss": 0.0002, + "reward": 1.9992656111717224, + "reward_std": 3.550504692384493e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992656707763672, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.75, + "epoch": 2.7202072538860103, + "grad_norm": 14.283714305643459, + "kl": 0.12890625, + "learning_rate": 7.282383419689119e-07, + "loss": 0.0006, + "reward": 1.999831199645996, + "reward_std": 2.8587964152393397e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499831110239029, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.72279792746114, + "grad_norm": 7.0945124735362075, + "kl": 0.0357666015625, + "learning_rate": 7.279792746113989e-07, + "loss": -0.0004, + "reward": 2.49994957447052, + "reward_std": 2.700474237826711e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999496936798096, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7253886010362693, + "grad_norm": 2.268270623849028, + "kl": 0.085205078125, + "learning_rate": 7.277202072538859e-07, + "loss": 0.0008, + "reward": 2.499932646751404, + "reward_std": 1.8547011222835863e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999324679374695, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.727979274611399, + "grad_norm": 0.2646776621202804, + "kl": 0.017669677734375, + "learning_rate": 7.27461139896373e-07, + "loss": -0.0001, + "reward": 2.4999966621398926, + "reward_std": 2.1824906184519932e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.8125, + "epoch": 2.7305699481865284, + "grad_norm": 12.252002722776979, + "kl": 0.10595703125, + "learning_rate": 7.272020725388601e-07, + "loss": 0.0004, + "reward": 1.9968502521514893, + "reward_std": 0.00019809217519650701, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4968502521514893, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.733160621761658, + "grad_norm": 0.27093322828032407, + "kl": 0.069580078125, + "learning_rate": 7.269430051813471e-07, + "loss": 0.0003, + "reward": 2.499993085861206, + "reward_std": 3.6280623589846073e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929070472717, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7357512953367875, + "grad_norm": 0.5667673280559835, + "kl": 0.120849609375, + "learning_rate": 7.266839378238342e-07, + "loss": -0.0008, + "reward": 2.499995470046997, + "reward_std": 5.511264404844951e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.8125, + "epoch": 2.738341968911917, + "grad_norm": 14.613248784543186, + "kl": 0.25537109375, + "learning_rate": 7.264248704663211e-07, + "loss": 0.0012, + "reward": 1.8873506784439087, + "reward_std": 0.07073427953764622, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3873507976531982, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.7409326424870466, + "grad_norm": 98.28282262718358, + "kl": 0.1068115234375, + "learning_rate": 7.261658031088082e-07, + "loss": 0.0008, + "reward": 1.9999032616615295, + "reward_std": 6.506397880912118e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49990314245224, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.743523316062176, + "grad_norm": 7.366011598428563, + "kl": 0.171875, + "learning_rate": 7.259067357512953e-07, + "loss": 0.0008, + "reward": 2.499987244606018, + "reward_std": 1.724209550957312e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999871253967285, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 2.7461139896373057, + "grad_norm": 1.2074248780971368, + "kl": 0.0709228515625, + "learning_rate": 7.256476683937823e-07, + "loss": 0.0007, + "reward": 1.9978476762771606, + "reward_std": 5.851211813023838e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978476762771606, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7487046632124352, + "grad_norm": 0.5930684530107416, + "kl": 0.0626220703125, + "learning_rate": 7.253886010362694e-07, + "loss": 0.0012, + "reward": 2.4999903440475464, + "reward_std": 6.0903076928298105e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902844429016, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.7512953367875648, + "grad_norm": 0.2244633371890959, + "kl": 0.0506591796875, + "learning_rate": 7.251295336787564e-07, + "loss": -0.0008, + "reward": 2.499998092651367, + "reward_std": 1.7892605228553293e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7538860103626943, + "grad_norm": 0.6552286577056265, + "kl": 0.0318603515625, + "learning_rate": 7.248704663212434e-07, + "loss": 0.0007, + "reward": 2.4999741315841675, + "reward_std": 6.703279552766617e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999974012374878, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.756476683937824, + "grad_norm": 0.7872457262096866, + "kl": 0.2138671875, + "learning_rate": 7.246113989637305e-07, + "loss": 0.0003, + "reward": 2.4995816946029663, + "reward_std": 1.2888197943539126e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995818138122559, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.25, + "epoch": 2.7590673575129534, + "grad_norm": 40.086736725850606, + "kl": 0.14013671875, + "learning_rate": 7.243523316062175e-07, + "loss": 0.0003, + "reward": 2.4374738931655884, + "reward_std": 0.17680212369123183, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374739527702332, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.761658031088083, + "grad_norm": 31.224919008663623, + "kl": 0.1534423828125, + "learning_rate": 7.240932642487047e-07, + "loss": 0.0005, + "reward": 2.3749732971191406, + "reward_std": 0.23149630275702293, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749732375144958, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7642487046632125, + "grad_norm": 80.219253679017, + "kl": 0.375, + "learning_rate": 7.238341968911917e-07, + "loss": 0.0014, + "reward": 2.0020114183425903, + "reward_std": 0.20123041486453985, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5020114183425903, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.766839378238342, + "grad_norm": 1.9210130664505254, + "kl": 0.051025390625, + "learning_rate": 7.235751295336788e-07, + "loss": -0.0002, + "reward": 2.499989151954651, + "reward_std": 6.962360998841177e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999892711639404, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 2.7694300518134716, + "grad_norm": 12.078434358176192, + "kl": 0.1455078125, + "learning_rate": 7.233160621761658e-07, + "loss": 0.0006, + "reward": 2.218725085258484, + "reward_std": 0.4519480440785628, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.7499750852584839, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.772020725388601, + "grad_norm": 0.4949794646161929, + "kl": 0.197265625, + "learning_rate": 7.230569948186528e-07, + "loss": -0.0001, + "reward": 2.4999905824661255, + "reward_std": 3.851149870115478e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 2.7746113989637307, + "grad_norm": 1.0620997698338368, + "kl": 0.0914306640625, + "learning_rate": 7.227979274611399e-07, + "loss": 0.0015, + "reward": 2.499996781349182, + "reward_std": 1.297923517995514e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.77720207253886, + "grad_norm": 5.5877891106408635, + "kl": 0.056396484375, + "learning_rate": 7.225388601036269e-07, + "loss": -0.0002, + "reward": 2.499979257583618, + "reward_std": 4.0352184669245617e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979317188263, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 2.7797927461139897, + "grad_norm": 19.796692429371053, + "kl": 0.076416015625, + "learning_rate": 7.22279792746114e-07, + "loss": 0.0003, + "reward": 2.246677339076996, + "reward_std": 0.2706778674717043, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7466772198677063, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7823834196891193, + "grad_norm": 2.6063088580666034, + "kl": 0.0859375, + "learning_rate": 7.220207253886011e-07, + "loss": -0.0002, + "reward": 2.4999797344207764, + "reward_std": 1.2273186030142824e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979853630066, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.784974093264249, + "grad_norm": 13.069108566992332, + "kl": 0.0869140625, + "learning_rate": 7.21761658031088e-07, + "loss": 0.0006, + "reward": 2.437316417694092, + "reward_std": 0.17728359372506475, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373162984848022, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.7875647668393784, + "grad_norm": 4.755314074861053, + "kl": 0.13037109375, + "learning_rate": 7.215025906735751e-07, + "loss": 0.0002, + "reward": 1.9925823211669922, + "reward_std": 5.7975628806161694e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4925822615623474, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.790155440414508, + "grad_norm": 0.4946854997981964, + "kl": 0.149169921875, + "learning_rate": 7.212435233160622e-07, + "loss": 0.0005, + "reward": 2.4999823570251465, + "reward_std": 3.7929671634628903e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999824166297913, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.7927461139896375, + "grad_norm": 8.981555591133127, + "kl": 0.0394439697265625, + "learning_rate": 7.209844559585492e-07, + "loss": 0.001, + "reward": 2.4999935626983643, + "reward_std": 8.956594228948234e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.795336787564767, + "grad_norm": 7.433920545563419, + "kl": 0.09521484375, + "learning_rate": 7.207253886010363e-07, + "loss": 0.0007, + "reward": 2.499902367591858, + "reward_std": 5.014531780034304e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999022483825684, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.7979274611398965, + "grad_norm": 3.956844248290702, + "kl": 0.088623046875, + "learning_rate": 7.204663212435233e-07, + "loss": 0.0003, + "reward": 1.999257206916809, + "reward_std": 4.888301108962878e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992571771144867, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.800518134715026, + "grad_norm": 2.428315916219824, + "kl": 0.02392578125, + "learning_rate": 7.202072538860103e-07, + "loss": -0.0, + "reward": 2.499980330467224, + "reward_std": 1.1904242001037346e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980390071869, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.8031088082901556, + "grad_norm": 0.14568564415111906, + "kl": 0.073974609375, + "learning_rate": 7.199481865284974e-07, + "loss": 0.0016, + "reward": 2.4999961853027344, + "reward_std": 1.7538935139782552e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.805699481865285, + "grad_norm": 4.923154202614084, + "kl": 0.059814453125, + "learning_rate": 7.196891191709844e-07, + "loss": 0.0011, + "reward": 2.4999935626983643, + "reward_std": 4.7752869249961805e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.8082901554404147, + "grad_norm": 0.24342664138678616, + "kl": 0.09228515625, + "learning_rate": 7.194300518134715e-07, + "loss": 0.0007, + "reward": 2.499969482421875, + "reward_std": 5.161863441571768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999696612358093, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.8108808290155443, + "grad_norm": 0.23525517553244468, + "kl": 0.109130859375, + "learning_rate": 7.191709844559585e-07, + "loss": 0.0014, + "reward": 2.4999958276748657, + "reward_std": 2.9200672315710108e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 2.813471502590674, + "grad_norm": 0.10910958543718147, + "kl": 0.08837890625, + "learning_rate": 7.189119170984456e-07, + "loss": 0.0001, + "reward": 2.49999737739563, + "reward_std": 1.6142615208991629e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8160621761658033, + "grad_norm": 8.707789481764191, + "kl": 0.080322265625, + "learning_rate": 7.186528497409327e-07, + "loss": 0.0001, + "reward": 2.498985767364502, + "reward_std": 7.098440448771726e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9989857077598572, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.818652849740933, + "grad_norm": 3.2743786748842085, + "kl": 0.0496826171875, + "learning_rate": 7.183937823834196e-07, + "loss": 0.0005, + "reward": 2.4999217987060547, + "reward_std": 1.652415630815085e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999217987060547, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.8212435233160624, + "grad_norm": 1.2110577185998637, + "kl": 0.11962890625, + "learning_rate": 7.181347150259067e-07, + "loss": -0.0004, + "reward": 2.4999951124191284, + "reward_std": 3.3248207245151207e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.823834196891192, + "grad_norm": 10.873866547427554, + "kl": 0.127685546875, + "learning_rate": 7.178756476683937e-07, + "loss": 0.0006, + "reward": 1.9986047744750977, + "reward_std": 6.084624533286842e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986046850681305, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8264248704663215, + "grad_norm": 3.9224318973043206, + "kl": 0.047119140625, + "learning_rate": 7.176165803108808e-07, + "loss": 0.0011, + "reward": 2.4999605417251587, + "reward_std": 2.0427865820238367e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999603629112244, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8290155440414506, + "grad_norm": 14.35356201162555, + "kl": 0.127685546875, + "learning_rate": 7.173575129533679e-07, + "loss": 0.0011, + "reward": 2.062433958053589, + "reward_std": 0.1767889433590426, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624338388442993, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.83160621761658, + "grad_norm": 23.881574781712416, + "kl": 0.097900390625, + "learning_rate": 7.170984455958548e-07, + "loss": -0.0003, + "reward": 2.4998987913131714, + "reward_std": 5.793695368083718e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998989701271057, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 2.8341968911917097, + "grad_norm": 3.285440237402109, + "kl": 0.1322021484375, + "learning_rate": 7.168393782383419e-07, + "loss": 0.001, + "reward": 2.4999254941940308, + "reward_std": 1.3155834608369332e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999925434589386, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8367875647668392, + "grad_norm": 37.47873630443259, + "kl": 0.0428466796875, + "learning_rate": 7.165803108808289e-07, + "loss": 0.0005, + "reward": 2.3749401569366455, + "reward_std": 0.23155948038015595, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749401569366455, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.839378238341969, + "grad_norm": 1.8826301432638974, + "kl": 0.087646484375, + "learning_rate": 7.16321243523316e-07, + "loss": 0.0008, + "reward": 2.4999784231185913, + "reward_std": 8.522820678535936e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999783635139465, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.8125, + "epoch": 2.8419689119170983, + "grad_norm": 3.886173789357406, + "kl": 0.0693359375, + "learning_rate": 7.160621761658031e-07, + "loss": -0.0003, + "reward": 2.4999921321868896, + "reward_std": 1.2925234614158398e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.844559585492228, + "grad_norm": 17.612798734871337, + "kl": 0.5328369140625, + "learning_rate": 7.158031088082901e-07, + "loss": 0.0016, + "reward": 1.8112998008728027, + "reward_std": 0.0010947110818051442, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3112997114658356, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8471502590673574, + "grad_norm": 2.0701117441853785, + "kl": 0.0888671875, + "learning_rate": 7.155440414507772e-07, + "loss": -0.0001, + "reward": 2.4999868869781494, + "reward_std": 7.361585630860645e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999868869781494, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 2.849740932642487, + "grad_norm": 0.8399703029727895, + "kl": 0.15625, + "learning_rate": 7.152849740932642e-07, + "loss": 0.0008, + "reward": 1.4999985694885254, + "reward_std": 1.5715099834778812e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999985694885254, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.8523316062176165, + "grad_norm": 393.0112823584629, + "kl": 0.097412109375, + "learning_rate": 7.150259067357512e-07, + "loss": 0.0004, + "reward": 1.8534001111984253, + "reward_std": 0.17722581850830466, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3533999919891357, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 2.854922279792746, + "grad_norm": 0.22681909035896552, + "kl": 0.1025390625, + "learning_rate": 7.147668393782383e-07, + "loss": 0.0007, + "reward": 2.499991774559021, + "reward_std": 3.7977329157001805e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999915957450867, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.8575129533678756, + "grad_norm": 1.8079716761954987, + "kl": 0.0980224609375, + "learning_rate": 7.145077720207253e-07, + "loss": 0.0001, + "reward": 2.4999680519104004, + "reward_std": 9.890277510749002e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999680519104004, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.75, + "epoch": 2.860103626943005, + "grad_norm": 61.80590614410551, + "kl": 0.12744140625, + "learning_rate": 7.142487046632124e-07, + "loss": 0.0008, + "reward": 2.436125159263611, + "reward_std": 0.18065080092264907, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.936125099658966, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8626943005181347, + "grad_norm": 4.947483923829126, + "kl": 0.0557861328125, + "learning_rate": 7.139896373056995e-07, + "loss": 0.0005, + "reward": 2.499988555908203, + "reward_std": 9.842450737096442e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999884366989136, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.625, + "epoch": 2.865284974093264, + "grad_norm": 0.10531010835480184, + "kl": 0.10595703125, + "learning_rate": 7.137305699481864e-07, + "loss": -0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.3254145017072005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8678756476683938, + "grad_norm": 7.65941754821093, + "kl": 0.1083984375, + "learning_rate": 7.134715025906735e-07, + "loss": 0.0003, + "reward": 2.499955654144287, + "reward_std": 4.449913922144333e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999558329582214, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8704663212435233, + "grad_norm": 4.537239674472093, + "kl": 0.1224365234375, + "learning_rate": 7.132124352331605e-07, + "loss": 0.0004, + "reward": 2.4999425411224365, + "reward_std": 1.7167034684462124e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999942421913147, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.873056994818653, + "grad_norm": 7.395477812843716, + "kl": 0.06640625, + "learning_rate": 7.129533678756477e-07, + "loss": -0.0004, + "reward": 1.9899404644966125, + "reward_std": 0.0002875989903259324, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.489940494298935, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 2.8756476683937824, + "grad_norm": 1.9947429179408407, + "kl": 0.099609375, + "learning_rate": 7.126943005181348e-07, + "loss": 0.0003, + "reward": 2.499986410140991, + "reward_std": 1.3293567462824285e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999862313270569, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.878238341968912, + "grad_norm": 2.0896604629479065, + "kl": 0.0360107421875, + "learning_rate": 7.124352331606218e-07, + "loss": 0.0004, + "reward": 2.499975800514221, + "reward_std": 1.1701359653670806e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999758005142212, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.75, + "epoch": 2.8808290155440415, + "grad_norm": 2.7023212850670397, + "kl": 0.103515625, + "learning_rate": 7.121761658031088e-07, + "loss": 0.001, + "reward": 2.499971866607666, + "reward_std": 1.0769423965939495e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999716877937317, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.883419689119171, + "grad_norm": 1.015715283594274, + "kl": 0.080810546875, + "learning_rate": 7.119170984455958e-07, + "loss": 0.0, + "reward": 2.499992847442627, + "reward_std": 4.163142534707731e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.8860103626943006, + "grad_norm": 1.6015076756637872, + "kl": 0.084228515625, + "learning_rate": 7.116580310880829e-07, + "loss": 0.0011, + "reward": 1.9935968518257141, + "reward_std": 4.3514764911378734e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4935966432094574, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.88860103626943, + "grad_norm": 5.827115353340306, + "kl": 0.080322265625, + "learning_rate": 7.1139896373057e-07, + "loss": 0.0006, + "reward": 2.4999715089797974, + "reward_std": 1.682233687461121e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999715089797974, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 2.8911917098445596, + "grad_norm": 0.818833821270414, + "kl": 0.055908203125, + "learning_rate": 7.11139896373057e-07, + "loss": -0.0001, + "reward": 2.49999463558197, + "reward_std": 4.764110258292931e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945759773254, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.125, + "epoch": 2.893782383419689, + "grad_norm": 0.6186789778257421, + "kl": 0.0771484375, + "learning_rate": 7.108808290155441e-07, + "loss": 0.0004, + "reward": 2.4999711513519287, + "reward_std": 5.8186195133203e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999711513519287, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.8963730569948187, + "grad_norm": 0.10584215002423136, + "kl": 0.0518798828125, + "learning_rate": 7.10621761658031e-07, + "loss": -0.001, + "reward": 2.499988555908203, + "reward_std": 1.755187923890844e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887943267822, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.8989637305699483, + "grad_norm": 0.5522414956955325, + "kl": 0.10400390625, + "learning_rate": 7.103626943005181e-07, + "loss": 0.0003, + "reward": 2.4999914169311523, + "reward_std": 5.139297570622148e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914765357971, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.901554404145078, + "grad_norm": 2.622176048442459, + "kl": 0.0711669921875, + "learning_rate": 7.101036269430052e-07, + "loss": 0.0009, + "reward": 2.4999932050704956, + "reward_std": 6.585877372344839e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 2.9041450777202074, + "grad_norm": 0.36202591021173663, + "kl": 0.0657958984375, + "learning_rate": 7.098445595854922e-07, + "loss": -0.0003, + "reward": 2.499996066093445, + "reward_std": 1.8297714063919557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 2.906735751295337, + "grad_norm": 0.3989314323177718, + "kl": 0.15869140625, + "learning_rate": 7.095854922279793e-07, + "loss": 0.0005, + "reward": 2.4999897480010986, + "reward_std": 9.90946682577487e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989628791809, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 2.9093264248704664, + "grad_norm": 3.98828984217289, + "kl": 0.107177734375, + "learning_rate": 7.093264248704664e-07, + "loss": 0.0001, + "reward": 1.8213641047477722, + "reward_std": 0.00019163135178246193, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3213641047477722, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.911917098445596, + "grad_norm": 0.38372592988781673, + "kl": 0.0947265625, + "learning_rate": 7.090673575129533e-07, + "loss": 0.0002, + "reward": 2.4999934434890747, + "reward_std": 6.14368312312763e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993622303009, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.625, + "epoch": 2.9145077720207255, + "grad_norm": 2.185680743780335, + "kl": 0.052490234375, + "learning_rate": 7.088082901554404e-07, + "loss": 0.0003, + "reward": 1.9998529553413391, + "reward_std": 2.4648191583764856e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998528957366943, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.917098445595855, + "grad_norm": 0.4143511714457312, + "kl": 0.07373046875, + "learning_rate": 7.085492227979274e-07, + "loss": -0.0007, + "reward": 2.4999865293502808, + "reward_std": 4.4882543761559646e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999865889549255, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 2.9196891191709846, + "grad_norm": 2.119040985586719, + "kl": 0.067138671875, + "learning_rate": 7.082901554404145e-07, + "loss": -0.0007, + "reward": 1.9980251789093018, + "reward_std": 4.27513819545311e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4980253875255585, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9222797927461137, + "grad_norm": 3.2498574020963535, + "kl": 0.098388671875, + "learning_rate": 7.080310880829016e-07, + "loss": 0.0001, + "reward": 1.999819815158844, + "reward_std": 1.0583228686300572e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499819815158844, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.9248704663212433, + "grad_norm": 7.285872759741117, + "kl": 0.07958984375, + "learning_rate": 7.077720207253886e-07, + "loss": 0.0003, + "reward": 1.9943678379058838, + "reward_std": 9.15516066015698e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4943679571151733, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 2.927461139896373, + "grad_norm": 1.0516048493387564, + "kl": 0.175048828125, + "learning_rate": 7.075129533678756e-07, + "loss": 0.0011, + "reward": 2.4999934434890747, + "reward_std": 5.073481815998093e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9300518134715023, + "grad_norm": 3.314155223630398, + "kl": 0.076904296875, + "learning_rate": 7.072538860103626e-07, + "loss": 0.001, + "reward": 2.4999561309814453, + "reward_std": 1.9229667486797553e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999558329582214, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.932642487046632, + "grad_norm": 7.451044311709573, + "kl": 0.108642578125, + "learning_rate": 7.069948186528497e-07, + "loss": -0.0002, + "reward": 1.9986302852630615, + "reward_std": 5.2891948143951595e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986303448677063, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9352331606217614, + "grad_norm": 1.355923951411975, + "kl": 0.128173828125, + "learning_rate": 7.067357512953368e-07, + "loss": -0.0001, + "reward": 2.4999951124191284, + "reward_std": 3.3475552640993556e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999950528144836, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.937823834196891, + "grad_norm": 1.4880376971698004, + "kl": 0.093994140625, + "learning_rate": 7.064766839378238e-07, + "loss": -0.0001, + "reward": 2.4999552965164185, + "reward_std": 1.990783744076907e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999955415725708, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 2.9404145077720205, + "grad_norm": 10.711080101650754, + "kl": 0.144287109375, + "learning_rate": 7.062176165803109e-07, + "loss": 0.0011, + "reward": 1.9870364665985107, + "reward_std": 0.00018690419165068306, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4870363473892212, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.94300518134715, + "grad_norm": 7.999550640622812, + "kl": 0.1640625, + "learning_rate": 7.059585492227978e-07, + "loss": 0.0008, + "reward": 1.998693585395813, + "reward_std": 0.00031646367085613747, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498693585395813, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.9455958549222796, + "grad_norm": 30.720288921915085, + "kl": 0.115234375, + "learning_rate": 7.056994818652849e-07, + "loss": 0.001, + "reward": 2.4999574422836304, + "reward_std": 1.8955228370032273e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999575018882751, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.948186528497409, + "grad_norm": 13.278939540873479, + "kl": 0.094482421875, + "learning_rate": 7.05440414507772e-07, + "loss": -0.0001, + "reward": 2.499975562095642, + "reward_std": 8.365519761355245e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999975562095642, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 2.9507772020725387, + "grad_norm": 16.637544014751317, + "kl": 0.154052734375, + "learning_rate": 7.05181347150259e-07, + "loss": 0.0012, + "reward": 2.187487483024597, + "reward_std": 0.5786307236259063, + "rewards/format_reward_rec": 0.875, + "rewards/point_reward": 1.7499874234199524, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9533678756476682, + "grad_norm": 0.16456697144892732, + "kl": 0.0428466796875, + "learning_rate": 7.049222797927461e-07, + "loss": -0.0003, + "reward": 2.499994993209839, + "reward_std": 3.0310380907394574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9559585492227978, + "grad_norm": 0.11590364317646457, + "kl": 0.0537109375, + "learning_rate": 7.046632124352331e-07, + "loss": -0.0005, + "reward": 2.499987244606018, + "reward_std": 3.5620584526441235e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999873638153076, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.9585492227979273, + "grad_norm": 1.5830693480236082, + "kl": 0.2109375, + "learning_rate": 7.044041450777201e-07, + "loss": 0.0008, + "reward": 1.9981689453125, + "reward_std": 3.835231962057151e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981690645217896, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.6875, + "epoch": 2.961139896373057, + "grad_norm": 3.557067469499059, + "kl": 0.134033203125, + "learning_rate": 7.041450777202072e-07, + "loss": 0.0008, + "reward": 2.4373693466186523, + "reward_std": 0.17683035418724558, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373692870140076, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9637305699481864, + "grad_norm": 0.2430091628343801, + "kl": 0.09521484375, + "learning_rate": 7.038860103626942e-07, + "loss": 0.0001, + "reward": 2.4999964237213135, + "reward_std": 3.0747958135179942e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.966321243523316, + "grad_norm": 2.66400394096119, + "kl": 0.09228515625, + "learning_rate": 7.036269430051813e-07, + "loss": 0.0002, + "reward": 1.999937891960144, + "reward_std": 1.1545933375600725e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999380707740784, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.5625, + "epoch": 2.9689119170984455, + "grad_norm": 0.2939389772846676, + "kl": 0.109130859375, + "learning_rate": 7.033678756476683e-07, + "loss": -0.0, + "reward": 2.499996066093445, + "reward_std": 2.869964930596325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.971502590673575, + "grad_norm": 0.22551784824130083, + "kl": 0.104736328125, + "learning_rate": 7.031088082901554e-07, + "loss": 0.0001, + "reward": 1.999951183795929, + "reward_std": 6.387927214746014e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499951183795929, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9740932642487046, + "grad_norm": 5.683038747313902, + "kl": 0.111328125, + "learning_rate": 7.028497409326424e-07, + "loss": 0.0007, + "reward": 2.4999536275863647, + "reward_std": 2.307615199015345e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999537467956543, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.976683937823834, + "grad_norm": 45.20046738492338, + "kl": 0.0643310546875, + "learning_rate": 7.025906735751294e-07, + "loss": -0.0, + "reward": 2.3124247789382935, + "reward_std": 0.25882632569573616, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124247193336487, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.875, + "epoch": 2.9792746113989637, + "grad_norm": 0.3992135079450244, + "kl": 0.16357421875, + "learning_rate": 7.023316062176165e-07, + "loss": 0.0014, + "reward": 2.4999853372573853, + "reward_std": 3.1528557542515045e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852180480957, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.981865284974093, + "grad_norm": 1.7259120769206473, + "kl": 0.012054443359375, + "learning_rate": 7.020725388601037e-07, + "loss": 0.0002, + "reward": 2.0624573826789856, + "reward_std": 0.17678039967643144, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624573826789856, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 2.9844559585492227, + "grad_norm": 8.447848092326048, + "kl": 0.04034423828125, + "learning_rate": 7.018134715025907e-07, + "loss": 0.0008, + "reward": 1.9986690878868103, + "reward_std": 8.416088712692726e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986690282821655, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.9870466321243523, + "grad_norm": 1.2972822025214075, + "kl": 0.09423828125, + "learning_rate": 7.015544041450778e-07, + "loss": 0.0008, + "reward": 2.499926447868347, + "reward_std": 1.1386057508389058e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999262690544128, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.989637305699482, + "grad_norm": 2.29611919032345, + "kl": 0.109130859375, + "learning_rate": 7.012953367875647e-07, + "loss": 0.0014, + "reward": 1.9995307922363281, + "reward_std": 1.694477805358474e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499530702829361, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 2.9922279792746114, + "grad_norm": 0.11824373433337411, + "kl": 0.0452880859375, + "learning_rate": 7.010362694300518e-07, + "loss": 0.0012, + "reward": 2.4999964237213135, + "reward_std": 2.057241715647251e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 2.994818652849741, + "grad_norm": 42.33963609536471, + "kl": 0.1304931640625, + "learning_rate": 7.007772020725389e-07, + "loss": 0.0005, + "reward": 1.7886215448379517, + "reward_std": 0.003570433329059597, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2886216938495636, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 2.9974093264248705, + "grad_norm": 1.7166453701474669, + "kl": 0.07421875, + "learning_rate": 7.005181347150259e-07, + "loss": 0.0007, + "reward": 2.4999858140945435, + "reward_std": 1.2763360246026423e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999858736991882, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0, + "grad_norm": 2.719556150027397, + "kl": 0.05584716796875, + "learning_rate": 7.00259067357513e-07, + "loss": -0.0004, + "reward": 2.4999583959579468, + "reward_std": 1.1173382517881691e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999584555625916, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0025906735751295, + "grad_norm": 2.461447509058338, + "kl": 0.105712890625, + "learning_rate": 7e-07, + "loss": 0.0006, + "reward": 2.4999126195907593, + "reward_std": 3.372174069227185e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999912679195404, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.005181347150259, + "grad_norm": 0.7294775043861922, + "kl": 0.127197265625, + "learning_rate": 6.99740932642487e-07, + "loss": 0.0018, + "reward": 1.9999293088912964, + "reward_std": 1.0251685353068751e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999292492866516, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.0077720207253886, + "grad_norm": 6.939097905563522, + "kl": 1.5831298828125, + "learning_rate": 6.994818652849741e-07, + "loss": 0.0073, + "reward": 2.499998450279236, + "reward_std": 2.793906077158681e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.6875, + "epoch": 3.010362694300518, + "grad_norm": 3.3560175791018163, + "kl": 0.109619140625, + "learning_rate": 6.992227979274611e-07, + "loss": -0.0009, + "reward": 2.499979019165039, + "reward_std": 1.4151022583064332e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791979789734, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.0129533678756477, + "grad_norm": 0.2946434730238508, + "kl": 0.0775146484375, + "learning_rate": 6.989637305699482e-07, + "loss": 0.0005, + "reward": 2.4999979734420776, + "reward_std": 2.1617928496198147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.0155440414507773, + "grad_norm": 0.10396015666718733, + "kl": 0.121337890625, + "learning_rate": 6.987046632124352e-07, + "loss": 0.0009, + "reward": 2.49999737739563, + "reward_std": 1.02774143329043e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.018134715025907, + "grad_norm": 8.466246391543336, + "kl": 0.10302734375, + "learning_rate": 6.984455958549223e-07, + "loss": 0.0002, + "reward": 2.4999542236328125, + "reward_std": 3.13268728859839e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999954342842102, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0207253886010363, + "grad_norm": 2.0671689748537996, + "kl": 0.0640869140625, + "learning_rate": 6.981865284974093e-07, + "loss": 0.0005, + "reward": 2.499988555908203, + "reward_std": 1.2248337952769361e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999884963035583, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.023316062176166, + "grad_norm": 0.1466281160981326, + "kl": 0.12939453125, + "learning_rate": 6.979274611398963e-07, + "loss": 0.0007, + "reward": 2.4999983310699463, + "reward_std": 1.1799395736034057e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.0259067357512954, + "grad_norm": 0.48679038895125865, + "kl": 0.10693359375, + "learning_rate": 6.976683937823834e-07, + "loss": -0.0004, + "reward": 2.499994397163391, + "reward_std": 3.5975109540231642e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.028497409326425, + "grad_norm": 13.28376293936007, + "kl": 0.0621337890625, + "learning_rate": 6.974093264248704e-07, + "loss": 0.0013, + "reward": 2.499867081642151, + "reward_std": 5.790720570075791e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999867022037506, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0310880829015545, + "grad_norm": 0.9175071597383333, + "kl": 0.1328125, + "learning_rate": 6.971502590673575e-07, + "loss": -0.0, + "reward": 2.4999834299087524, + "reward_std": 9.043923000717768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999833703041077, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.5625, + "epoch": 3.033678756476684, + "grad_norm": 0.142921173334334, + "kl": 0.110107421875, + "learning_rate": 6.968911917098446e-07, + "loss": 0.0004, + "reward": 2.4999974966049194, + "reward_std": 1.8433906348036544e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.0, + "epoch": 3.0362694300518136, + "grad_norm": 6.578932104147027, + "kl": 0.05902099609375, + "learning_rate": 6.966321243523315e-07, + "loss": -0.0003, + "reward": 2.499991536140442, + "reward_std": 1.2175861229479779e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.038860103626943, + "grad_norm": 12.812888662519878, + "kl": 0.191650390625, + "learning_rate": 6.963730569948186e-07, + "loss": 0.0007, + "reward": 1.937165081501007, + "reward_std": 0.17684286828080076, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4371649622917175, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.0414507772020727, + "grad_norm": 0.36017149309125857, + "kl": 0.09619140625, + "learning_rate": 6.961139896373057e-07, + "loss": 0.0015, + "reward": 2.499983310699463, + "reward_std": 6.094512514209782e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999832510948181, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.0440414507772022, + "grad_norm": 50.22883560752347, + "kl": 0.061767578125, + "learning_rate": 6.958549222797927e-07, + "loss": 0.0001, + "reward": 1.9565237760543823, + "reward_std": 0.00029459824872901663, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4565239548683167, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0466321243523318, + "grad_norm": 0.5827171756534197, + "kl": 0.07305908203125, + "learning_rate": 6.955958549222798e-07, + "loss": -0.0003, + "reward": 2.4999759197235107, + "reward_std": 5.402351973771147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999760389328003, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.875, + "epoch": 3.0492227979274613, + "grad_norm": 0.060686410556682266, + "kl": 0.12158203125, + "learning_rate": 6.953367875647668e-07, + "loss": 0.0005, + "reward": 2.4999982118606567, + "reward_std": 9.764374624410266e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.051813471502591, + "grad_norm": 2.0030176106359803, + "kl": 0.060791015625, + "learning_rate": 6.950777202072538e-07, + "loss": 0.0008, + "reward": 2.4999942779541016, + "reward_std": 4.463275558919122e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.0544041450777204, + "grad_norm": 0.07562834244955724, + "kl": 0.120361328125, + "learning_rate": 6.948186528497409e-07, + "loss": 0.0008, + "reward": 2.499996781349182, + "reward_std": 1.477285877626855e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.05699481865285, + "grad_norm": 0.26056404492182367, + "kl": 0.060546875, + "learning_rate": 6.945595854922279e-07, + "loss": -0.0, + "reward": 2.499991774559021, + "reward_std": 2.9540235573222162e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.0595854922279795, + "grad_norm": 5.351097814490174, + "kl": 0.08642578125, + "learning_rate": 6.94300518134715e-07, + "loss": 0.0008, + "reward": 1.9945697784423828, + "reward_std": 9.820161403695238e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4945697784423828, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.062176165803109, + "grad_norm": 19.61084523204792, + "kl": 0.0947265625, + "learning_rate": 6.94041450777202e-07, + "loss": -0.0002, + "reward": 1.9998689889907837, + "reward_std": 6.372265306708869e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998691082000732, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.064766839378238, + "grad_norm": 0.09839036954550555, + "kl": 0.12255859375, + "learning_rate": 6.937823834196891e-07, + "loss": 0.0003, + "reward": 2.499997854232788, + "reward_std": 2.0497682839959452e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.0673575129533677, + "grad_norm": 0.2827675125819973, + "kl": 0.0667724609375, + "learning_rate": 6.935233160621761e-07, + "loss": 0.0001, + "reward": 2.4999977350234985, + "reward_std": 2.220439370148597e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.069948186528497, + "grad_norm": 1.6399465495742187, + "kl": 0.12841796875, + "learning_rate": 6.932642487046631e-07, + "loss": 0.0018, + "reward": 2.499987840652466, + "reward_std": 9.988759757106891e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 3.0725388601036268, + "grad_norm": 48.967278697379086, + "kl": 0.15576171875, + "learning_rate": 6.930051813471502e-07, + "loss": 0.0006, + "reward": 2.373937487602234, + "reward_std": 0.35653577744960785, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8739373683929443, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.0751295336787563, + "grad_norm": 1.435609881582776, + "kl": 0.0677490234375, + "learning_rate": 6.927461139896372e-07, + "loss": 0.0004, + "reward": 2.499990701675415, + "reward_std": 7.0516755386051955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 3.077720207253886, + "grad_norm": 3.5101981361178316, + "kl": 0.05426025390625, + "learning_rate": 6.924870466321243e-07, + "loss": 0.0013, + "reward": 2.4999924898147583, + "reward_std": 9.763617526914459e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924302101135, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.0803108808290154, + "grad_norm": 1.6765971214628324, + "kl": 0.096435546875, + "learning_rate": 6.922279792746114e-07, + "loss": 0.0005, + "reward": 2.499983787536621, + "reward_std": 1.0850691978703253e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999836087226868, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 3.082901554404145, + "grad_norm": 5.590736851210506, + "kl": 0.15478515625, + "learning_rate": 6.919689119170983e-07, + "loss": 0.0003, + "reward": 2.499976396560669, + "reward_std": 6.402083045031759e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999765157699585, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 3.0854922279792745, + "grad_norm": 33.17775694408754, + "kl": 0.226806640625, + "learning_rate": 6.917098445595854e-07, + "loss": 0.0013, + "reward": 1.9959203600883484, + "reward_std": 0.00019277128740213811, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4959202110767365, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.088082901554404, + "grad_norm": 23.896802680616943, + "kl": 0.1123046875, + "learning_rate": 6.914507772020724e-07, + "loss": 0.0003, + "reward": 2.2496062517166138, + "reward_std": 0.26767816981418946, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7496063113212585, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0906735751295336, + "grad_norm": 9.660661317212083, + "kl": 0.07305908203125, + "learning_rate": 6.911917098445595e-07, + "loss": -0.0001, + "reward": 1.9984159469604492, + "reward_std": 0.00019665755337427981, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984160661697388, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.093264248704663, + "grad_norm": 20.022268497160617, + "kl": 0.1494140625, + "learning_rate": 6.909326424870467e-07, + "loss": 0.0, + "reward": 2.4374847412109375, + "reward_std": 0.1768051714203409, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374848008155823, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.0958549222797926, + "grad_norm": 36.23354749725851, + "kl": 0.16064453125, + "learning_rate": 6.906735751295337e-07, + "loss": 0.0004, + "reward": 2.124864637851715, + "reward_std": 0.23152516983157057, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6248646974563599, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.098445595854922, + "grad_norm": 1.3195107073614705, + "kl": 0.128173828125, + "learning_rate": 6.904145077720207e-07, + "loss": -0.0002, + "reward": 2.4999265670776367, + "reward_std": 1.4375779528563726e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999266862869263, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1010362694300517, + "grad_norm": 0.21297203299170864, + "kl": 0.066162109375, + "learning_rate": 6.901554404145078e-07, + "loss": -0.001, + "reward": 2.499997854232788, + "reward_std": 2.3950478293954802e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1036269430051813, + "grad_norm": 1.1951467709965204, + "kl": 0.0443115234375, + "learning_rate": 6.898963730569948e-07, + "loss": 0.0, + "reward": 2.499993085861206, + "reward_std": 2.254762023312651e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.0625, + "epoch": 3.106217616580311, + "grad_norm": 89.7117428287973, + "kl": 0.17041015625, + "learning_rate": 6.896373056994819e-07, + "loss": -0.0, + "reward": 2.200614869594574, + "reward_std": 0.41318650986545435, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7006149291992188, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1088082901554404, + "grad_norm": 2.9577556315338525, + "kl": 0.0584716796875, + "learning_rate": 6.893782383419689e-07, + "loss": 0.0004, + "reward": 1.8219398856163025, + "reward_std": 0.00021504092509871953, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3219397366046906, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.11139896373057, + "grad_norm": 1.8660163641473746, + "kl": 0.08935546875, + "learning_rate": 6.89119170984456e-07, + "loss": 0.0005, + "reward": 2.499987483024597, + "reward_std": 9.607229856101185e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987542629242, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1139896373056994, + "grad_norm": 0.23309003298542247, + "kl": 0.2867431640625, + "learning_rate": 6.888601036269431e-07, + "loss": 0.0008, + "reward": 2.4999899864196777, + "reward_std": 6.076309091440635e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990165233612, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.116580310880829, + "grad_norm": 20.242096699779367, + "kl": 0.1484375, + "learning_rate": 6.8860103626943e-07, + "loss": 0.0, + "reward": 2.4366928339004517, + "reward_std": 0.17895166123366835, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9366929531097412, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1191709844559585, + "grad_norm": 0.41863435926551673, + "kl": 0.076416015625, + "learning_rate": 6.883419689119171e-07, + "loss": 0.0007, + "reward": 2.499996542930603, + "reward_std": 3.980623432653374e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.121761658031088, + "grad_norm": 0.057189913104332614, + "kl": 0.19970703125, + "learning_rate": 6.880829015544041e-07, + "loss": 0.0006, + "reward": 2.499997615814209, + "reward_std": 1.7178826396957447e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 3.1243523316062176, + "grad_norm": 2.7685707075740504, + "kl": 0.08984375, + "learning_rate": 6.878238341968912e-07, + "loss": 0.0001, + "reward": 2.499969720840454, + "reward_std": 1.1449779435679375e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999697804450989, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.126943005181347, + "grad_norm": 2.7955752348066656, + "kl": 0.0433349609375, + "learning_rate": 6.875647668393783e-07, + "loss": 0.0006, + "reward": 2.499983787536621, + "reward_std": 2.1352165845200943e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999837279319763, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1295336787564767, + "grad_norm": 3.904897597004643, + "kl": 0.099853515625, + "learning_rate": 6.873056994818652e-07, + "loss": 0.0005, + "reward": 1.9944968819618225, + "reward_std": 9.394819312547043e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4944968819618225, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.1321243523316062, + "grad_norm": 0.993464348756233, + "kl": 0.165283203125, + "learning_rate": 6.870466321243523e-07, + "loss": 0.0013, + "reward": 2.499992251396179, + "reward_std": 7.040971325977807e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.134715025906736, + "grad_norm": 8.493647101872822, + "kl": 0.059326171875, + "learning_rate": 6.867875647668393e-07, + "loss": 0.0001, + "reward": 2.4999669790267944, + "reward_std": 4.88402728251458e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999668598175049, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1373056994818653, + "grad_norm": 0.6252435900174612, + "kl": 0.0283050537109375, + "learning_rate": 6.865284974093264e-07, + "loss": 0.0003, + "reward": 2.4999947547912598, + "reward_std": 5.760134399679373e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.9375, + "epoch": 3.139896373056995, + "grad_norm": 317.3920017848008, + "kl": 0.1669921875, + "learning_rate": 6.862694300518135e-07, + "loss": 0.0012, + "reward": 1.8101842999458313, + "reward_std": 0.1777368365546863, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3101842999458313, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1424870466321244, + "grad_norm": 0.38957971342570114, + "kl": 0.0938720703125, + "learning_rate": 6.860103626943005e-07, + "loss": 0.0005, + "reward": 2.499996304512024, + "reward_std": 3.5537441931410285e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.145077720207254, + "grad_norm": 0.12349266871983197, + "kl": 0.153076171875, + "learning_rate": 6.857512953367876e-07, + "loss": -0.0007, + "reward": 2.499991774559021, + "reward_std": 2.604897190394695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918937683105, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1476683937823835, + "grad_norm": 32.33004040627252, + "kl": 0.165283203125, + "learning_rate": 6.854922279792745e-07, + "loss": 0.0005, + "reward": 1.9112027287483215, + "reward_std": 0.000350074620655505, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4112027287483215, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.9375, + "epoch": 3.150259067357513, + "grad_norm": 113.55662888114172, + "kl": 2.042724609375, + "learning_rate": 6.852331606217616e-07, + "loss": 0.0079, + "reward": 2.0620912313461304, + "reward_std": 0.17689373933910701, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.56209135055542, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1528497409326426, + "grad_norm": 8.61897752246694, + "kl": 0.17431640625, + "learning_rate": 6.849740932642487e-07, + "loss": 0.0008, + "reward": 1.9794423580169678, + "reward_std": 6.9976295435481e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4794422388076782, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.155440414507772, + "grad_norm": 27.69755342708996, + "kl": 0.0528564453125, + "learning_rate": 6.847150259067357e-07, + "loss": -0.0002, + "reward": 2.499966859817505, + "reward_std": 1.4045354475911154e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999668598175049, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.1580310880829017, + "grad_norm": 6.307206061699156, + "kl": 0.0758056640625, + "learning_rate": 6.844559585492228e-07, + "loss": 0.0005, + "reward": 1.8879817724227905, + "reward_std": 0.0005463095068307666, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3879818320274353, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.375, + "epoch": 3.160621761658031, + "grad_norm": 10.145145819493534, + "kl": 0.0535888671875, + "learning_rate": 6.841968911917099e-07, + "loss": -0.0004, + "reward": 2.437488317489624, + "reward_std": 0.17678574352271426, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937488317489624, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1632124352331608, + "grad_norm": 0.06347202174406436, + "kl": 0.153076171875, + "learning_rate": 6.839378238341968e-07, + "loss": 0.0, + "reward": 2.4999977350234985, + "reward_std": 1.2728193041766644e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1658031088082903, + "grad_norm": 25.926481283907055, + "kl": 0.1492919921875, + "learning_rate": 6.836787564766839e-07, + "loss": 0.0006, + "reward": 1.997806191444397, + "reward_std": 0.00022658649299955869, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978063106536865, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.16839378238342, + "grad_norm": 4.337730682658184, + "kl": 0.16357421875, + "learning_rate": 6.834196891191709e-07, + "loss": 0.0005, + "reward": 1.884786069393158, + "reward_std": 0.0002273179107987744, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3847861886024475, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1709844559585494, + "grad_norm": 1.2906471182596082, + "kl": 0.099853515625, + "learning_rate": 6.83160621761658e-07, + "loss": 0.001, + "reward": 2.4999938011169434, + "reward_std": 2.929047468569479e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.173575129533679, + "grad_norm": 12.228908208545143, + "kl": 0.0640869140625, + "learning_rate": 6.829015544041451e-07, + "loss": 0.0003, + "reward": 2.4374582767486572, + "reward_std": 0.17681060468567011, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374581575393677, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1761658031088085, + "grad_norm": 0.24394420433355046, + "kl": 0.0789794921875, + "learning_rate": 6.826424870466321e-07, + "loss": 0.0003, + "reward": 2.499994397163391, + "reward_std": 2.9986239269419457e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.8125, + "epoch": 3.178756476683938, + "grad_norm": 34.13531818528824, + "kl": 0.1904296875, + "learning_rate": 6.823834196891191e-07, + "loss": 0.0004, + "reward": 1.9996867179870605, + "reward_std": 0.00010410162849439075, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49968683719635, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1813471502590676, + "grad_norm": 2.9811588172750163, + "kl": 0.0496826171875, + "learning_rate": 6.821243523316061e-07, + "loss": -0.0006, + "reward": 2.499964952468872, + "reward_std": 2.087541042783414e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999651908874512, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.1839378238341967, + "grad_norm": 2.2456746614650687, + "kl": 0.1041259765625, + "learning_rate": 6.818652849740932e-07, + "loss": 0.0006, + "reward": 1.676464319229126, + "reward_std": 0.0003018202780822321, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.176464319229126, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.186528497409326, + "grad_norm": 2.3095544817287856, + "kl": 0.14892578125, + "learning_rate": 6.816062176165803e-07, + "loss": 0.0006, + "reward": 2.4999759197235107, + "reward_std": 1.3347123967832886e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999975860118866, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 3.1891191709844557, + "grad_norm": 162.07904755221753, + "kl": 0.13055419921875, + "learning_rate": 6.813471502590673e-07, + "loss": -0.0004, + "reward": 1.5624719858169556, + "reward_std": 0.17670097788311523, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0624721124768257, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1917098445595853, + "grad_norm": 9.30954659856793, + "kl": 0.123779296875, + "learning_rate": 6.810880829015544e-07, + "loss": 0.0007, + "reward": 1.4995166063308716, + "reward_std": 0.00010432247881908552, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9995165765285492, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.625, + "epoch": 3.194300518134715, + "grad_norm": 1.7660116853260925, + "kl": 0.06201171875, + "learning_rate": 6.808290155440413e-07, + "loss": 0.0004, + "reward": 2.499979853630066, + "reward_std": 8.384925592963555e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999799132347107, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.1968911917098444, + "grad_norm": 2.372396535595751, + "kl": 0.081298828125, + "learning_rate": 6.805699481865284e-07, + "loss": -0.0002, + "reward": 2.4999783039093018, + "reward_std": 9.258635657261038e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999783635139465, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.199481865284974, + "grad_norm": 1.3915662872923094, + "kl": 0.1708984375, + "learning_rate": 6.803108808290155e-07, + "loss": 0.0012, + "reward": 2.499990463256836, + "reward_std": 8.627333045296837e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.2020725388601035, + "grad_norm": 39.91250914085111, + "kl": 0.17626953125, + "learning_rate": 6.800518134715025e-07, + "loss": 0.0014, + "reward": 1.9900028705596924, + "reward_std": 0.001552803657716595, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49000284075737, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.1875, + "epoch": 3.204663212435233, + "grad_norm": 8.5603241823098, + "kl": 0.0899658203125, + "learning_rate": 6.797927461139897e-07, + "loss": -0.0003, + "reward": 1.999298870563507, + "reward_std": 2.433328745610197e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992989599704742, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.375, + "epoch": 3.2072538860103625, + "grad_norm": 9.999270429566982, + "kl": 0.088623046875, + "learning_rate": 6.795336787564767e-07, + "loss": 0.0002, + "reward": 2.4999170303344727, + "reward_std": 3.644930689006287e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999170899391174, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.209844559585492, + "grad_norm": 27.121171658755856, + "kl": 0.173828125, + "learning_rate": 6.792746113989637e-07, + "loss": -0.0001, + "reward": 1.989274024963379, + "reward_std": 0.0007453930279552878, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4892742335796356, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2124352331606216, + "grad_norm": 35.98969466684873, + "kl": 0.096923828125, + "learning_rate": 6.790155440414508e-07, + "loss": 0.0004, + "reward": 2.1873241662979126, + "reward_std": 0.4441937953233719, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6873242259025574, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 3.215025906735751, + "grad_norm": 122.08437159855893, + "kl": 0.13720703125, + "learning_rate": 6.787564766839378e-07, + "loss": 0.0005, + "reward": 1.4928288459777832, + "reward_std": 0.40695127844810486, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.992828905582428, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 3.2176165803108807, + "grad_norm": 1.3679342838333743, + "kl": 0.0799560546875, + "learning_rate": 6.784974093264249e-07, + "loss": 0.0003, + "reward": 2.4999719858169556, + "reward_std": 1.0691671377571765e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999721050262451, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2202072538860103, + "grad_norm": 0.4488314279506791, + "kl": 0.08984375, + "learning_rate": 6.782383419689119e-07, + "loss": -0.0001, + "reward": 2.4999940395355225, + "reward_std": 2.5145229187728546e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.22279792746114, + "grad_norm": 2.5291469167689846, + "kl": 0.084716796875, + "learning_rate": 6.77979274611399e-07, + "loss": 0.0011, + "reward": 2.499972105026245, + "reward_std": 1.806678596949496e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999719858169556, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2253886010362693, + "grad_norm": 17.233082999226742, + "kl": 0.06494140625, + "learning_rate": 6.77720207253886e-07, + "loss": 0.0006, + "reward": 2.062438428401947, + "reward_std": 0.17678446534057457, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562438428401947, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 3.227979274611399, + "grad_norm": 1.4884915339425842, + "kl": 0.30322265625, + "learning_rate": 6.77461139896373e-07, + "loss": 0.0027, + "reward": 2.499984622001648, + "reward_std": 6.9239305275914376e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999845027923584, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2305699481865284, + "grad_norm": 12.581191310878998, + "kl": 0.02294921875, + "learning_rate": 6.772020725388601e-07, + "loss": 0.0003, + "reward": 2.4998375177383423, + "reward_std": 5.313391034178494e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998376965522766, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.233160621761658, + "grad_norm": 0.3068037875764142, + "kl": 0.086181640625, + "learning_rate": 6.769430051813472e-07, + "loss": 0.0009, + "reward": 2.4999945163726807, + "reward_std": 2.68907388090156e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999944567680359, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.25, + "epoch": 3.2357512953367875, + "grad_norm": 3.1821558286216294, + "kl": 0.1376953125, + "learning_rate": 6.766839378238342e-07, + "loss": -0.0, + "reward": 2.49997615814209, + "reward_std": 1.5303936379496008e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999762773513794, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.238341968911917, + "grad_norm": 3.397811895284669, + "kl": 0.060791015625, + "learning_rate": 6.764248704663213e-07, + "loss": 0.0007, + "reward": 2.499962568283081, + "reward_std": 7.783529554217239e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999623894691467, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.25, + "epoch": 3.2409326424870466, + "grad_norm": 11.173658309242521, + "kl": 0.1494140625, + "learning_rate": 6.761658031088082e-07, + "loss": 0.0006, + "reward": 1.9173847436904907, + "reward_std": 0.0005844671488262065, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4173848628997803, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.243523316062176, + "grad_norm": 5.7610519775208715, + "kl": 0.06182861328125, + "learning_rate": 6.759067357512953e-07, + "loss": 0.0003, + "reward": 1.9917590618133545, + "reward_std": 0.00029140177321096417, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4917591214179993, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.2461139896373057, + "grad_norm": 0.4724315873394368, + "kl": 0.0811767578125, + "learning_rate": 6.756476683937824e-07, + "loss": -0.0004, + "reward": 2.499985456466675, + "reward_std": 7.1935268124434515e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 3.2487046632124352, + "grad_norm": 61.21206580796467, + "kl": 0.0482177734375, + "learning_rate": 6.753886010362694e-07, + "loss": 0.0001, + "reward": 2.499669313430786, + "reward_std": 6.080886032577837e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999669075012207, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.2512953367875648, + "grad_norm": 2.008133429151827, + "kl": 0.03057861328125, + "learning_rate": 6.751295336787565e-07, + "loss": 0.0009, + "reward": 2.49998140335083, + "reward_std": 1.1428465313656488e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999812841415405, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2538860103626943, + "grad_norm": 4.028875682841846, + "kl": 0.103759765625, + "learning_rate": 6.748704663212435e-07, + "loss": 0.0016, + "reward": 2.4999269247055054, + "reward_std": 2.1010632735851686e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999268054962158, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.256476683937824, + "grad_norm": 0.04712398895384099, + "kl": 0.07666015625, + "learning_rate": 6.746113989637305e-07, + "loss": -0.0003, + "reward": 2.4999988079071045, + "reward_std": 1.1210138382011792e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.2590673575129534, + "grad_norm": 1.7228698291486717, + "kl": 0.12451171875, + "learning_rate": 6.743523316062176e-07, + "loss": 0.0016, + "reward": 2.499993324279785, + "reward_std": 6.99621966759878e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.261658031088083, + "grad_norm": 22.074941276410815, + "kl": 0.19775390625, + "learning_rate": 6.740932642487046e-07, + "loss": 0.0002, + "reward": 1.814564824104309, + "reward_std": 0.0005942479255054423, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.314564824104309, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.3125, + "epoch": 3.2642487046632125, + "grad_norm": 7.331578703334851, + "kl": 0.08349609375, + "learning_rate": 6.738341968911917e-07, + "loss": -0.0003, + "reward": 2.4999489784240723, + "reward_std": 2.8987172299821395e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999489784240723, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.266839378238342, + "grad_norm": 20.423192947111342, + "kl": 0.1845703125, + "learning_rate": 6.735751295336787e-07, + "loss": 0.0007, + "reward": 2.0141189098358154, + "reward_std": 0.19632750791606668, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5141189098358154, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.2694300518134716, + "grad_norm": 54.64322304846053, + "kl": 0.04833984375, + "learning_rate": 6.733160621761658e-07, + "loss": 0.0003, + "reward": 2.374844551086426, + "reward_std": 0.23173565308263733, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748445510864258, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 3.272020725388601, + "grad_norm": 1.4942915810445108, + "kl": 0.1011962890625, + "learning_rate": 6.730569948186528e-07, + "loss": 0.0007, + "reward": 2.4999072551727295, + "reward_std": 1.1966993270107196e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999071955680847, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.875, + "epoch": 3.2746113989637307, + "grad_norm": 2.318037748042974, + "kl": 0.11376953125, + "learning_rate": 6.727979274611398e-07, + "loss": 0.0008, + "reward": 2.4999760389328003, + "reward_std": 1.2307446013437584e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999759197235107, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.27720207253886, + "grad_norm": 6.0714539408583565, + "kl": 0.087646484375, + "learning_rate": 6.725388601036269e-07, + "loss": 0.0003, + "reward": 1.2900685667991638, + "reward_std": 0.00036068645931663923, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7900685369968414, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.2797927461139897, + "grad_norm": 0.7398916415551148, + "kl": 0.06201171875, + "learning_rate": 6.722797927461139e-07, + "loss": 0.0005, + "reward": 2.499778151512146, + "reward_std": 1.051360561632464e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999778151512146, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.2823834196891193, + "grad_norm": 0.20887457172346072, + "kl": 0.090087890625, + "learning_rate": 6.72020725388601e-07, + "loss": 0.0004, + "reward": 2.4999964237213135, + "reward_std": 1.9571496636672236e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.284974093264249, + "grad_norm": 0.7956652247876509, + "kl": 0.1220703125, + "learning_rate": 6.717616580310881e-07, + "loss": 0.0007, + "reward": 2.499996066093445, + "reward_std": 3.9155648323685455e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.2875647668393784, + "grad_norm": 0.2600159459812739, + "kl": 0.090087890625, + "learning_rate": 6.71502590673575e-07, + "loss": -0.0007, + "reward": 2.499996066093445, + "reward_std": 2.275126689710305e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.290155440414508, + "grad_norm": 1.3801637562359388, + "kl": 0.083984375, + "learning_rate": 6.712435233160621e-07, + "loss": 0.0003, + "reward": 2.4999756813049316, + "reward_std": 1.0136415312445024e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999756217002869, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.2927461139896375, + "grad_norm": 0.21350459680199665, + "kl": 0.119140625, + "learning_rate": 6.709844559585492e-07, + "loss": 0.0008, + "reward": 2.4999964237213135, + "reward_std": 1.875797465800133e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.5625, + "epoch": 3.295336787564767, + "grad_norm": 15.940153283180809, + "kl": 0.3828125, + "learning_rate": 6.707253886010362e-07, + "loss": 0.001, + "reward": 2.3122068643569946, + "reward_std": 0.34770820060020924, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8122069835662842, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.2979274611398965, + "grad_norm": 3.12265472909459, + "kl": 0.2493896484375, + "learning_rate": 6.704663212435233e-07, + "loss": 0.0008, + "reward": 2.4999197721481323, + "reward_std": 2.9060335918984492e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999198913574219, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.300518134715026, + "grad_norm": 10.684700343980614, + "kl": 0.0550537109375, + "learning_rate": 6.702072538860103e-07, + "loss": 0.0005, + "reward": 1.9941173791885376, + "reward_std": 0.0001505392161789132, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4941173195838928, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.125, + "epoch": 3.3031088082901556, + "grad_norm": 1.6637752649456732, + "kl": 0.055419921875, + "learning_rate": 6.699481865284973e-07, + "loss": 0.0017, + "reward": 2.499996542930603, + "reward_std": 3.372988203409477e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.305699481865285, + "grad_norm": 4.31007749526667, + "kl": 0.0958251953125, + "learning_rate": 6.696891191709844e-07, + "loss": 0.0011, + "reward": 2.4999359846115112, + "reward_std": 2.1850171492587833e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999359846115112, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.625, + "epoch": 3.3082901554404147, + "grad_norm": 0.1893903207167545, + "kl": 0.128662109375, + "learning_rate": 6.694300518134714e-07, + "loss": 0.0011, + "reward": 2.499996066093445, + "reward_std": 1.9226832250751613e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.3108808290155443, + "grad_norm": 1.5598454758263005, + "kl": 0.052734375, + "learning_rate": 6.691709844559585e-07, + "loss": 0.0013, + "reward": 2.4999793767929077, + "reward_std": 8.379100563615793e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791383743286, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 3.313471502590674, + "grad_norm": 23.902880778175927, + "kl": 0.100830078125, + "learning_rate": 6.689119170984455e-07, + "loss": 0.0003, + "reward": 1.9973394870758057, + "reward_std": 0.0018259540975122945, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4973394274711609, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.3160621761658033, + "grad_norm": 3.172447446657001, + "kl": 0.0584716796875, + "learning_rate": 6.686528497409327e-07, + "loss": 0.0011, + "reward": 2.4999806880950928, + "reward_std": 6.82442430388619e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980628490448, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 3.318652849740933, + "grad_norm": 65.7431643413431, + "kl": 0.06884765625, + "learning_rate": 6.683937823834197e-07, + "loss": -0.0004, + "reward": 1.9731935858726501, + "reward_std": 0.004861528554783945, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4731935858726501, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.321243523316062, + "grad_norm": 17.836462416607315, + "kl": 0.05291748046875, + "learning_rate": 6.681347150259067e-07, + "loss": 0.0008, + "reward": 2.4373984336853027, + "reward_std": 0.17694964087786502, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9373984336853027, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.3238341968911915, + "grad_norm": 4.904208401300367, + "kl": 0.0616455078125, + "learning_rate": 6.678756476683938e-07, + "loss": -0.0004, + "reward": 1.9981617331504822, + "reward_std": 4.671673514167196e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498161792755127, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.326424870466321, + "grad_norm": 43.73236976346806, + "kl": 0.096923828125, + "learning_rate": 6.676165803108808e-07, + "loss": 0.0006, + "reward": 1.9986584186553955, + "reward_std": 0.000110683293769398, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986584782600403, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.3125, + "epoch": 3.3290155440414506, + "grad_norm": 16.989284017364056, + "kl": 0.05938720703125, + "learning_rate": 6.673575129533679e-07, + "loss": 0.0005, + "reward": 1.9519646167755127, + "reward_std": 0.015825567749971015, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4519644975662231, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.33160621761658, + "grad_norm": 16.30015088937492, + "kl": 0.1339111328125, + "learning_rate": 6.67098445595855e-07, + "loss": 0.0005, + "reward": 1.4858134388923645, + "reward_std": 0.00023081257631929475, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9858134984970093, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 3.3341968911917097, + "grad_norm": 21.128982681534165, + "kl": 0.03961181640625, + "learning_rate": 6.668393782383419e-07, + "loss": 0.0, + "reward": 1.8781986832618713, + "reward_std": 0.0004783739980211976, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3781987130641937, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.3367875647668392, + "grad_norm": 1.4503175328360276, + "kl": 0.07177734375, + "learning_rate": 6.66580310880829e-07, + "loss": -0.0004, + "reward": 2.4999178647994995, + "reward_std": 1.4236801575862046e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999917984008789, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.339378238341969, + "grad_norm": 78.80329749028347, + "kl": 0.121826171875, + "learning_rate": 6.66321243523316e-07, + "loss": 0.0002, + "reward": 2.499940276145935, + "reward_std": 1.946031170518836e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999403953552246, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 3.3419689119170983, + "grad_norm": 157.3271683150045, + "kl": 0.15478515625, + "learning_rate": 6.660621761658031e-07, + "loss": 0.0009, + "reward": 2.0805219411849976, + "reward_std": 0.34735794636026185, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5805218815803528, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.344559585492228, + "grad_norm": 7.586122553563838, + "kl": 0.080078125, + "learning_rate": 6.658031088082902e-07, + "loss": 0.0006, + "reward": 2.0623974800109863, + "reward_std": 0.17679739690629503, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562397539615631, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.3471502590673574, + "grad_norm": 0.31199710615956217, + "kl": 0.109375, + "learning_rate": 6.655440414507772e-07, + "loss": -0.0005, + "reward": 2.4999911785125732, + "reward_std": 2.3840762821691897e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.349740932642487, + "grad_norm": 0.050336091549529546, + "kl": 0.13232421875, + "learning_rate": 6.652849740932642e-07, + "loss": 0.0004, + "reward": 1.499998927116394, + "reward_std": 6.4432907720402e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999988675117493, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.3523316062176165, + "grad_norm": 28.62123541900629, + "kl": 0.10986328125, + "learning_rate": 6.650259067357513e-07, + "loss": 0.0015, + "reward": 1.9998066425323486, + "reward_std": 6.801219285534899e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499806433916092, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.354922279792746, + "grad_norm": 1.9701712109347833, + "kl": 0.09814453125, + "learning_rate": 6.647668393782383e-07, + "loss": -0.0004, + "reward": 2.4999836683273315, + "reward_std": 8.612069905211683e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999983787536621, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.3575129533678756, + "grad_norm": 1.2098339924315067, + "kl": 0.09375, + "learning_rate": 6.645077720207254e-07, + "loss": 0.0005, + "reward": 2.4999914169311523, + "reward_std": 7.053330136841396e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991238117218, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 3.360103626943005, + "grad_norm": 0.621152549433018, + "kl": 0.2158203125, + "learning_rate": 6.642487046632124e-07, + "loss": 0.0004, + "reward": 2.4999806880950928, + "reward_std": 5.871805569768185e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980628490448, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.3626943005181347, + "grad_norm": 0.12449511016870848, + "kl": 0.012115478515625, + "learning_rate": 6.639896373056995e-07, + "loss": 0.0001, + "reward": 2.4999982118606567, + "reward_std": 1.1489047153645515e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.3125, + "epoch": 3.365284974093264, + "grad_norm": 14.959340054097833, + "kl": 0.204833984375, + "learning_rate": 6.637305699481865e-07, + "loss": 0.0008, + "reward": 1.5599233508110046, + "reward_std": 0.4072205275297165, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0599234104156494, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.3678756476683938, + "grad_norm": 16.102922126163854, + "kl": 0.08740234375, + "learning_rate": 6.634715025906735e-07, + "loss": 0.0002, + "reward": 2.374980926513672, + "reward_std": 0.23148168627972154, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749809861183167, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.3704663212435233, + "grad_norm": 3.8439239213360556, + "kl": 0.120849609375, + "learning_rate": 6.632124352331606e-07, + "loss": 0.0007, + "reward": 2.4999793767929077, + "reward_std": 1.647567387408344e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999793767929077, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.373056994818653, + "grad_norm": 0.8088670641819842, + "kl": 0.18115234375, + "learning_rate": 6.629533678756476e-07, + "loss": 0.0007, + "reward": 2.4999886751174927, + "reward_std": 6.531869985337835e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.3756476683937824, + "grad_norm": 12.16995096807882, + "kl": 0.1015625, + "learning_rate": 6.626943005181347e-07, + "loss": 0.0006, + "reward": 1.9998265504837036, + "reward_std": 1.771404242845165e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998266100883484, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.378238341968912, + "grad_norm": 0.4266573255322855, + "kl": 0.0628662109375, + "learning_rate": 6.624352331606218e-07, + "loss": 0.0004, + "reward": 2.499997615814209, + "reward_std": 2.316846519079263e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.3808290155440415, + "grad_norm": 5.712500809056355, + "kl": 0.060302734375, + "learning_rate": 6.621761658031087e-07, + "loss": -0.0001, + "reward": 2.4997068643569946, + "reward_std": 4.266273646180707e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999707043170929, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.383419689119171, + "grad_norm": 10.868164851703684, + "kl": 0.077392578125, + "learning_rate": 6.619170984455958e-07, + "loss": 0.0006, + "reward": 2.4374756813049316, + "reward_std": 0.1768043436723019, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374757409095764, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.3860103626943006, + "grad_norm": 9.432246491159606, + "kl": 0.076904296875, + "learning_rate": 6.616580310880828e-07, + "loss": -0.0008, + "reward": 2.4999754428863525, + "reward_std": 1.1850605233121314e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999754428863525, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.38860103626943, + "grad_norm": 0.21133365073487292, + "kl": 0.151611328125, + "learning_rate": 6.613989637305699e-07, + "loss": 0.0003, + "reward": 2.4999970197677612, + "reward_std": 3.3243616144318366e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.3911917098445596, + "grad_norm": 24.054526082222516, + "kl": 0.14501953125, + "learning_rate": 6.61139896373057e-07, + "loss": 0.0006, + "reward": 1.931704044342041, + "reward_std": 0.1871442198753357, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4317041039466858, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.625, + "epoch": 3.393782383419689, + "grad_norm": 3.3759777040099497, + "kl": 0.0775146484375, + "learning_rate": 6.60880829015544e-07, + "loss": 0.0012, + "reward": 2.499921441078186, + "reward_std": 2.3684171793547648e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999216198921204, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.3963730569948187, + "grad_norm": 0.17472606320221273, + "kl": 0.0771484375, + "learning_rate": 6.60621761658031e-07, + "loss": 0.0003, + "reward": 2.4999951124191284, + "reward_std": 3.3327240771541256e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.3989637305699483, + "grad_norm": 9.32054821062495, + "kl": 0.0533447265625, + "learning_rate": 6.60362694300518e-07, + "loss": 0.0018, + "reward": 2.4999849796295166, + "reward_std": 1.1492917337818653e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.401554404145078, + "grad_norm": 0.628344296697333, + "kl": 0.16064453125, + "learning_rate": 6.601036269430051e-07, + "loss": 0.0015, + "reward": 2.499968647956848, + "reward_std": 3.792652677248043e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999686479568481, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.4041450777202074, + "grad_norm": 0.14255676864836625, + "kl": 0.0494384765625, + "learning_rate": 6.598445595854922e-07, + "loss": -0.0002, + "reward": 2.4999953508377075, + "reward_std": 2.5643305150424567e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 3.406735751295337, + "grad_norm": 11.295736174446297, + "kl": 0.097412109375, + "learning_rate": 6.595854922279792e-07, + "loss": -0.0, + "reward": 1.9924728870391846, + "reward_std": 5.47329968867416e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4924729466438293, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.4093264248704664, + "grad_norm": 0.2899929915176572, + "kl": 0.060791015625, + "learning_rate": 6.593264248704663e-07, + "loss": 0.0017, + "reward": 2.499995231628418, + "reward_std": 2.67563785882885e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.411917098445596, + "grad_norm": 0.15722221776304574, + "kl": 0.1136474609375, + "learning_rate": 6.590673575129534e-07, + "loss": 0.0009, + "reward": 2.499998092651367, + "reward_std": 1.9036045273423952e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.4145077720207255, + "grad_norm": 27.013484280707754, + "kl": 0.06396484375, + "learning_rate": 6.588082901554403e-07, + "loss": -0.0009, + "reward": 2.499856948852539, + "reward_std": 5.1802656059862784e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998570680618286, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.417098445595855, + "grad_norm": 0.8854631233530994, + "kl": 0.056640625, + "learning_rate": 6.585492227979274e-07, + "loss": 0.0004, + "reward": 2.4999747276306152, + "reward_std": 1.080063793779118e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99997478723526, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 3.4196891191709846, + "grad_norm": 11.842182184099409, + "kl": 0.1932373046875, + "learning_rate": 6.582901554404144e-07, + "loss": 0.001, + "reward": 1.968591570854187, + "reward_std": 0.006671816722700896, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4685916006565094, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.422279792746114, + "grad_norm": 0.5684898572481198, + "kl": 0.1484375, + "learning_rate": 6.580310880829015e-07, + "loss": 0.0013, + "reward": 2.4999858140945435, + "reward_std": 5.055517249274999e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999856352806091, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.4248704663212437, + "grad_norm": 0.8741033671541631, + "kl": 0.035400390625, + "learning_rate": 6.577720207253887e-07, + "loss": -0.0004, + "reward": 2.4999938011169434, + "reward_std": 5.360691602618317e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.4274611398963732, + "grad_norm": 0.09088051622100037, + "kl": 0.0679931640625, + "learning_rate": 6.575129533678755e-07, + "loss": 0.0004, + "reward": 2.49999463558197, + "reward_std": 1.4212792223133874e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945759773254, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4300518134715023, + "grad_norm": 88.18927522118243, + "kl": 0.12255859375, + "learning_rate": 6.572538860103627e-07, + "loss": 0.0001, + "reward": 2.499954581260681, + "reward_std": 1.4616265616496094e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999547004699707, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.432642487046632, + "grad_norm": 20.95091438872185, + "kl": 0.0755615234375, + "learning_rate": 6.569948186528497e-07, + "loss": -0.0004, + "reward": 2.43748140335083, + "reward_std": 0.17679336109040378, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.93748140335083, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.4352331606217614, + "grad_norm": 0.08554641384151447, + "kl": 0.061279296875, + "learning_rate": 6.567357512953368e-07, + "loss": 0.0002, + "reward": 2.499996542930603, + "reward_std": 1.5779278044192324e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.437823834196891, + "grad_norm": 15.724386816815187, + "kl": 0.075927734375, + "learning_rate": 6.564766839378239e-07, + "loss": -0.0005, + "reward": 1.9812134504318237, + "reward_std": 0.0001690258214921414, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4812135100364685, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.4404145077720205, + "grad_norm": 0.2412479102981515, + "kl": 0.057373046875, + "learning_rate": 6.562176165803109e-07, + "loss": -0.0012, + "reward": 2.499994993209839, + "reward_std": 2.5916364734257513e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.44300518134715, + "grad_norm": 0.1372288832654031, + "kl": 0.158203125, + "learning_rate": 6.55958549222798e-07, + "loss": 0.0, + "reward": 2.4999895095825195, + "reward_std": 1.8267392078996636e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895691871643, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.4455958549222796, + "grad_norm": 6.479777720510055, + "kl": 0.198974609375, + "learning_rate": 6.556994818652849e-07, + "loss": 0.0006, + "reward": 1.804716944694519, + "reward_std": 0.0005800696062578936, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3047169148921967, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.448186528497409, + "grad_norm": 1.9678867874828987, + "kl": 0.026458740234375, + "learning_rate": 6.55440414507772e-07, + "loss": 0.0001, + "reward": 2.499990224838257, + "reward_std": 1.0386611847934546e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.4507772020725387, + "grad_norm": 5.201974811130854, + "kl": 0.120849609375, + "learning_rate": 6.551813471502591e-07, + "loss": 0.0003, + "reward": 1.999235451221466, + "reward_std": 8.643494857096812e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992355108261108, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 3.4533678756476682, + "grad_norm": 19.621498723289225, + "kl": 0.13623046875, + "learning_rate": 6.549222797927461e-07, + "loss": 0.0005, + "reward": 1.9854612350463867, + "reward_std": 0.20790574957572971, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4854612350463867, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4559585492227978, + "grad_norm": 2.0470128638190235, + "kl": 0.125, + "learning_rate": 6.546632124352332e-07, + "loss": 0.0007, + "reward": 2.499990940093994, + "reward_std": 7.779780503369693e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4585492227979273, + "grad_norm": 1.6392997775910472, + "kl": 0.07958984375, + "learning_rate": 6.544041450777201e-07, + "loss": 0.001, + "reward": 2.499949097633362, + "reward_std": 1.1032195914140175e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999488592147827, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.461139896373057, + "grad_norm": 1.8777779445871616, + "kl": 0.0667724609375, + "learning_rate": 6.541450777202072e-07, + "loss": -0.0006, + "reward": 2.4999806880950928, + "reward_std": 9.163160939351656e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999808073043823, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.4637305699481864, + "grad_norm": 0.13381814044751406, + "kl": 0.110595703125, + "learning_rate": 6.538860103626943e-07, + "loss": -0.0003, + "reward": 2.4999895095825195, + "reward_std": 3.170826403220417e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895691871643, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.466321243523316, + "grad_norm": 936.7902888793681, + "kl": 0.0767822265625, + "learning_rate": 6.536269430051813e-07, + "loss": 0.0002, + "reward": 1.9934600591659546, + "reward_std": 0.003993490203356487, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4934599995613098, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4689119170984455, + "grad_norm": 20.647253045583795, + "kl": 0.108642578125, + "learning_rate": 6.533678756476684e-07, + "loss": 0.0006, + "reward": 2.4988105297088623, + "reward_std": 0.0001235436582192051, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9988104104995728, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.471502590673575, + "grad_norm": 35.22651520092051, + "kl": 0.0614013671875, + "learning_rate": 6.531088082901555e-07, + "loss": 0.0005, + "reward": 1.999617338180542, + "reward_std": 3.5019628285226645e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996172785758972, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4740932642487046, + "grad_norm": 0.4369136949688955, + "kl": 0.0556640625, + "learning_rate": 6.528497409326425e-07, + "loss": -0.0002, + "reward": 2.4999958276748657, + "reward_std": 3.6921732089467696e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.476683937823834, + "grad_norm": 8.776868774273215, + "kl": 0.143798828125, + "learning_rate": 6.525906735751295e-07, + "loss": 0.0009, + "reward": 2.4368677139282227, + "reward_std": 0.17856070416701186, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9368676543235779, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.1875, + "epoch": 3.4792746113989637, + "grad_norm": 3.833796370548108, + "kl": 0.20751953125, + "learning_rate": 6.523316062176165e-07, + "loss": 0.0006, + "reward": 2.499929904937744, + "reward_std": 2.5398303478141315e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999929964542389, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.481865284974093, + "grad_norm": 5.392781955184533, + "kl": 0.1142578125, + "learning_rate": 6.520725388601036e-07, + "loss": 0.0003, + "reward": 2.4998639822006226, + "reward_std": 6.730443169544742e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998641610145569, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.4844559585492227, + "grad_norm": 0.7610112195811284, + "kl": 0.122802734375, + "learning_rate": 6.518134715025907e-07, + "loss": 0.0012, + "reward": 2.4999645948410034, + "reward_std": 6.161727014841745e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999645948410034, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.4870466321243523, + "grad_norm": 0.0842702726023439, + "kl": 0.0823974609375, + "learning_rate": 6.515544041450777e-07, + "loss": 0.0002, + "reward": 2.499996304512024, + "reward_std": 1.5945765312608273e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.489637305699482, + "grad_norm": 2.2505849321534086, + "kl": 0.127197265625, + "learning_rate": 6.512953367875648e-07, + "loss": -0.0004, + "reward": 2.499969959259033, + "reward_std": 1.367196182400221e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999698400497437, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4922279792746114, + "grad_norm": 1.6636532598923264, + "kl": 0.07275390625, + "learning_rate": 6.510362694300517e-07, + "loss": 0.0004, + "reward": 2.4999873638153076, + "reward_std": 7.710978479735786e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999873638153076, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.494818652849741, + "grad_norm": 0.3571506390553103, + "kl": 0.0712890625, + "learning_rate": 6.507772020725388e-07, + "loss": -0.0007, + "reward": 2.499982237815857, + "reward_std": 4.9869615850184346e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999982237815857, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.4974093264248705, + "grad_norm": 0.24088259429386338, + "kl": 0.13720703125, + "learning_rate": 6.505181347150259e-07, + "loss": 0.0003, + "reward": 2.4999942779541016, + "reward_std": 2.79588209650683e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.5, + "grad_norm": 1.537052628856158, + "kl": 0.0604248046875, + "learning_rate": 6.502590673575129e-07, + "loss": 0.0007, + "reward": 2.499988079071045, + "reward_std": 4.063590722580557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5025906735751295, + "grad_norm": 0.16246596931791904, + "kl": 0.01953125, + "learning_rate": 6.5e-07, + "loss": -0.0001, + "reward": 2.499993324279785, + "reward_std": 3.6890651244902983e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.505181347150259, + "grad_norm": 4.884301071816906, + "kl": 0.12060546875, + "learning_rate": 6.49740932642487e-07, + "loss": 0.0002, + "reward": 1.9008011221885681, + "reward_std": 0.00030973899993114173, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4008011519908905, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5077720207253886, + "grad_norm": 2.3989105629342515, + "kl": 0.0772705078125, + "learning_rate": 6.49481865284974e-07, + "loss": 0.0007, + "reward": 1.9999024868011475, + "reward_std": 1.3443967873172369e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999024868011475, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.510362694300518, + "grad_norm": 0.08716403584693133, + "kl": 0.1171875, + "learning_rate": 6.492227979274611e-07, + "loss": 0.0008, + "reward": 2.499993324279785, + "reward_std": 2.5999656827480067e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.5129533678756477, + "grad_norm": 1.3719280672515863, + "kl": 0.073486328125, + "learning_rate": 6.489637305699481e-07, + "loss": 0.0003, + "reward": 2.499994158744812, + "reward_std": 4.417373361320642e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943375587463, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5155440414507773, + "grad_norm": 5.486186413154993, + "kl": 0.0814208984375, + "learning_rate": 6.487046632124352e-07, + "loss": -0.0, + "reward": 2.499882221221924, + "reward_std": 4.369443740870338e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998822808265686, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.625, + "epoch": 3.518134715025907, + "grad_norm": 0.04886309299963757, + "kl": 0.129638671875, + "learning_rate": 6.484455958549222e-07, + "loss": 0.0007, + "reward": 2.499998092651367, + "reward_std": 1.1115929225979926e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.125, + "epoch": 3.5207253886010363, + "grad_norm": 33.036188365258056, + "kl": 0.1331787109375, + "learning_rate": 6.481865284974093e-07, + "loss": 0.0014, + "reward": 2.4194105863571167, + "reward_std": 0.22792745969275074, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9194104671478271, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.523316062176166, + "grad_norm": 13.508458333375703, + "kl": 0.076904296875, + "learning_rate": 6.479274611398963e-07, + "loss": 0.0006, + "reward": 2.1874241828918457, + "reward_std": 0.25881362627660565, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6874240636825562, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.75, + "epoch": 3.5259067357512954, + "grad_norm": 2.384423771437139, + "kl": 0.1627197265625, + "learning_rate": 6.476683937823833e-07, + "loss": 0.001, + "reward": 2.4999488592147827, + "reward_std": 1.8778174535327707e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999948799610138, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.528497409326425, + "grad_norm": 0.29165096178079064, + "kl": 0.0421142578125, + "learning_rate": 6.474093264248704e-07, + "loss": -0.0001, + "reward": 2.4999932050704956, + "reward_std": 2.4304994212798192e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.5310880829015545, + "grad_norm": 0.687794608290158, + "kl": 0.1162109375, + "learning_rate": 6.471502590673574e-07, + "loss": 0.0018, + "reward": 2.499951481819153, + "reward_std": 1.0045186854767962e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999513626098633, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.533678756476684, + "grad_norm": 11.83038878869347, + "kl": 0.064208984375, + "learning_rate": 6.468911917098445e-07, + "loss": 0.0003, + "reward": 2.499903678894043, + "reward_std": 3.691922302095918e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999035596847534, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.5362694300518136, + "grad_norm": 9.641466328253756, + "kl": 0.110595703125, + "learning_rate": 6.466321243523317e-07, + "loss": 0.0019, + "reward": 2.499993085861206, + "reward_std": 1.0167843470298976e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.538860103626943, + "grad_norm": 31.796727797747458, + "kl": 0.0430908203125, + "learning_rate": 6.463730569948185e-07, + "loss": 0.0005, + "reward": 2.187224805355072, + "reward_std": 0.2589843902571829, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687224805355072, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.5414507772020727, + "grad_norm": 6.770195985628733, + "kl": 0.16998291015625, + "learning_rate": 6.461139896373057e-07, + "loss": 0.0001, + "reward": 1.9995607137680054, + "reward_std": 1.90273037787847e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995607733726501, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5440414507772022, + "grad_norm": 0.09570513663860637, + "kl": 0.1083984375, + "learning_rate": 6.458549222797928e-07, + "loss": 0.0003, + "reward": 2.499997615814209, + "reward_std": 1.8369141230323294e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5466321243523318, + "grad_norm": 2.0688750931797997, + "kl": 0.07958984375, + "learning_rate": 6.455958549222798e-07, + "loss": 0.0001, + "reward": 1.9999399185180664, + "reward_std": 1.4084825579629978e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999399185180664, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.5492227979274613, + "grad_norm": 0.1956092724269007, + "kl": 0.074462890625, + "learning_rate": 6.453367875647669e-07, + "loss": 0.0003, + "reward": 2.499996542930603, + "reward_std": 2.3148721766119706e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.551813471502591, + "grad_norm": 1.2204397791933161, + "kl": 0.0482177734375, + "learning_rate": 6.450777202072539e-07, + "loss": -0.0004, + "reward": 2.4999887943267822, + "reward_std": 8.773908575676614e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887943267822, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.5544041450777204, + "grad_norm": 0.042786623974987385, + "kl": 0.04119873046875, + "learning_rate": 6.448186528497409e-07, + "loss": 0.0, + "reward": 2.4999970197677612, + "reward_std": 1.2239580655659665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.55699481865285, + "grad_norm": 9.801799145982999, + "kl": 0.111083984375, + "learning_rate": 6.44559585492228e-07, + "loss": 0.0009, + "reward": 1.9999391436576843, + "reward_std": 1.510055244580144e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999392330646515, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5595854922279795, + "grad_norm": 0.9564844454749356, + "kl": 0.0556640625, + "learning_rate": 6.44300518134715e-07, + "loss": 0.0012, + "reward": 2.4999935626983643, + "reward_std": 6.840491778348223e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.562176165803109, + "grad_norm": 0.08207911425817606, + "kl": 0.066650390625, + "learning_rate": 6.440414507772021e-07, + "loss": -0.0003, + "reward": 2.4999847412109375, + "reward_std": 1.7326361785308109e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.5647668393782386, + "grad_norm": 0.634151425587639, + "kl": 0.11083984375, + "learning_rate": 6.437823834196891e-07, + "loss": 0.0001, + "reward": 2.4999961853027344, + "reward_std": 2.462986032014669e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.567357512953368, + "grad_norm": 3.2257225761877146, + "kl": 0.167724609375, + "learning_rate": 6.435233160621762e-07, + "loss": 0.0007, + "reward": 1.492495596408844, + "reward_std": 0.00014754161384189501, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9924955666065216, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 3.5699481865284977, + "grad_norm": 2.002782565817255, + "kl": 0.14013671875, + "learning_rate": 6.432642487046632e-07, + "loss": 0.0011, + "reward": 2.499993085861206, + "reward_std": 7.58046803639445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 3.572538860103627, + "grad_norm": 0.6630837855530316, + "kl": 0.03839111328125, + "learning_rate": 6.430051813471502e-07, + "loss": -0.0006, + "reward": 2.499860167503357, + "reward_std": 8.493611289850378e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999860405921936, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.5751295336787567, + "grad_norm": 2.3177791716460807, + "kl": 0.11590576171875, + "learning_rate": 6.427461139896373e-07, + "loss": 0.0002, + "reward": 1.9996198415756226, + "reward_std": 3.448325196586666e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996198415756226, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5777202072538863, + "grad_norm": 6.142709358445661, + "kl": 0.17138671875, + "learning_rate": 6.424870466321243e-07, + "loss": 0.0009, + "reward": 1.9998284578323364, + "reward_std": 2.3680084268562496e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998283088207245, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 3.5803108808290154, + "grad_norm": 102.73437638597295, + "kl": 0.0321044921875, + "learning_rate": 6.422279792746114e-07, + "loss": -0.0002, + "reward": 2.3749892711639404, + "reward_std": 0.353569598915783, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749892115592957, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.582901554404145, + "grad_norm": 5.900914590186641, + "kl": 0.328125, + "learning_rate": 6.419689119170985e-07, + "loss": 0.0005, + "reward": 1.8437442779541016, + "reward_std": 0.5499613261595186, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.3749943673610687, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5854922279792745, + "grad_norm": 4.152174499610402, + "kl": 0.107177734375, + "learning_rate": 6.417098445595854e-07, + "loss": 0.0013, + "reward": 2.4998902082443237, + "reward_std": 3.497060606605373e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999890148639679, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.588082901554404, + "grad_norm": 0.0915231309630497, + "kl": 0.04150390625, + "learning_rate": 6.414507772020725e-07, + "loss": -0.0004, + "reward": 2.4999977350234985, + "reward_std": 1.0129642191714083e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.5906735751295336, + "grad_norm": 0.1538020752548344, + "kl": 0.094970703125, + "learning_rate": 6.411917098445595e-07, + "loss": 0.0007, + "reward": 2.499993324279785, + "reward_std": 4.426532541401684e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.593264248704663, + "grad_norm": 1.731495223151456, + "kl": 0.07666015625, + "learning_rate": 6.409326424870466e-07, + "loss": 0.0007, + "reward": 2.499967575073242, + "reward_std": 1.0554545269769733e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999675750732422, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.5958549222797926, + "grad_norm": 1.0285586211360336, + "kl": 0.104248046875, + "learning_rate": 6.406735751295337e-07, + "loss": -0.0004, + "reward": 1.9998092651367188, + "reward_std": 1.2343170510575874e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499809443950653, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.598445595854922, + "grad_norm": 29.928070988932145, + "kl": 0.09912109375, + "learning_rate": 6.404145077720207e-07, + "loss": 0.0004, + "reward": 1.9146441221237183, + "reward_std": 0.07638257455630537, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.414644181728363, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.6010362694300517, + "grad_norm": 0.7744931016392296, + "kl": 0.077392578125, + "learning_rate": 6.401554404145077e-07, + "loss": 0.0005, + "reward": 2.4999934434890747, + "reward_std": 5.0651053697947646e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6036269430051813, + "grad_norm": 15.867556184862849, + "kl": 0.1455078125, + "learning_rate": 6.398963730569948e-07, + "loss": 0.0007, + "reward": 1.9977235198020935, + "reward_std": 6.542609679627276e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497723639011383, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.606217616580311, + "grad_norm": 0.592858370697047, + "kl": 0.0648193359375, + "learning_rate": 6.396373056994818e-07, + "loss": 0.0011, + "reward": 2.4999911785125732, + "reward_std": 7.817323876224691e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999910593032837, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 3.6088082901554404, + "grad_norm": 48.216732190586505, + "kl": 0.15283203125, + "learning_rate": 6.393782383419689e-07, + "loss": 0.0006, + "reward": 1.6580806970596313, + "reward_std": 0.42604246735572815, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1580806970596313, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 3.61139896373057, + "grad_norm": 0.922680156598505, + "kl": 0.072021484375, + "learning_rate": 6.391191709844559e-07, + "loss": 0.0007, + "reward": 1.9995518326759338, + "reward_std": 2.1345707921227586e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995518326759338, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 3.6139896373056994, + "grad_norm": 2.597384218268352, + "kl": 0.154296875, + "learning_rate": 6.38860103626943e-07, + "loss": 0.0008, + "reward": 1.9921189546585083, + "reward_std": 0.00010953972281413371, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.492118924856186, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.616580310880829, + "grad_norm": 4.002011519964906, + "kl": 0.0521240234375, + "learning_rate": 6.3860103626943e-07, + "loss": 0.0001, + "reward": 1.9984354972839355, + "reward_std": 5.356251585908467e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498435527086258, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.6191709844559585, + "grad_norm": 22.237915238704165, + "kl": 0.025115966796875, + "learning_rate": 6.38341968911917e-07, + "loss": -0.0005, + "reward": 2.3124756813049316, + "reward_std": 0.2588063213181897, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124756217002869, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 3.621761658031088, + "grad_norm": 9.719728532205218, + "kl": 0.4072265625, + "learning_rate": 6.380829015544041e-07, + "loss": 0.0019, + "reward": 2.343744993209839, + "reward_std": 0.44194230279202884, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.8749948143959045, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6243523316062176, + "grad_norm": 1.4982565492046702, + "kl": 0.1103515625, + "learning_rate": 6.378238341968911e-07, + "loss": 0.0009, + "reward": 2.499995470046997, + "reward_std": 4.156313934799982e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 3.626943005181347, + "grad_norm": 1.5379201064532138, + "kl": 0.0740966796875, + "learning_rate": 6.375647668393782e-07, + "loss": 0.0008, + "reward": 2.499988555908203, + "reward_std": 7.125452953005151e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999882578849792, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.125, + "epoch": 3.6295336787564767, + "grad_norm": 0.20639443602653573, + "kl": 0.18115234375, + "learning_rate": 6.373056994818653e-07, + "loss": 0.0006, + "reward": 2.499995708465576, + "reward_std": 2.159172140636656e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6321243523316062, + "grad_norm": 0.09194698487599, + "kl": 0.03240966796875, + "learning_rate": 6.370466321243522e-07, + "loss": -0.0002, + "reward": 2.49999737739563, + "reward_std": 1.4971998325563618e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.634715025906736, + "grad_norm": 2.9595577725823152, + "kl": 0.10400390625, + "learning_rate": 6.367875647668393e-07, + "loss": 0.0013, + "reward": 1.9972585439682007, + "reward_std": 7.509514551884422e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497258484363556, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.6373056994818653, + "grad_norm": 27.729012271010642, + "kl": 0.02850341796875, + "learning_rate": 6.365284974093263e-07, + "loss": -0.0002, + "reward": 2.4999806880950928, + "reward_std": 1.1388373422960285e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999807476997375, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.639896373056995, + "grad_norm": 4.580947992444961, + "kl": 0.14990234375, + "learning_rate": 6.362694300518134e-07, + "loss": 0.0012, + "reward": 1.9982608556747437, + "reward_std": 6.758928202543757e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498260736465454, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 3.6424870466321244, + "grad_norm": 1.7019039901118083, + "kl": 0.37109375, + "learning_rate": 6.360103626943006e-07, + "loss": 0.0015, + "reward": 2.4999918937683105, + "reward_std": 2.6950671667691495e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999919533729553, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.645077720207254, + "grad_norm": 26.25476992407301, + "kl": 0.075927734375, + "learning_rate": 6.357512953367876e-07, + "loss": -0.0002, + "reward": 2.4999547004699707, + "reward_std": 2.206633030255034e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999548196792603, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.6476683937823835, + "grad_norm": 0.2285771512332544, + "kl": 0.086669921875, + "learning_rate": 6.354922279792746e-07, + "loss": 0.0006, + "reward": 2.4999818801879883, + "reward_std": 2.496620936653926e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818801879883, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.375, + "epoch": 3.650259067357513, + "grad_norm": 10.858232327595445, + "kl": 0.12353515625, + "learning_rate": 6.352331606217615e-07, + "loss": 0.0009, + "reward": 1.802313208580017, + "reward_std": 0.0008293414975923952, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.302313208580017, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6528497409326426, + "grad_norm": 0.20869201786593009, + "kl": 0.0557861328125, + "learning_rate": 6.349740932642487e-07, + "loss": -0.0005, + "reward": 2.499996304512024, + "reward_std": 2.4590092380094575e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.655440414507772, + "grad_norm": 1.1609635142045087, + "kl": 0.114013671875, + "learning_rate": 6.347150259067358e-07, + "loss": 0.0006, + "reward": 2.499985098838806, + "reward_std": 5.292229616316035e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852776527405, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.6580310880829017, + "grad_norm": 57.20844343720344, + "kl": 0.28369140625, + "learning_rate": 6.344559585492228e-07, + "loss": 0.0016, + "reward": 2.249961793422699, + "reward_std": 0.26729934694503754, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499616742134094, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 3.660621761658031, + "grad_norm": 0.12267601600877713, + "kl": 0.12841796875, + "learning_rate": 6.341968911917099e-07, + "loss": 0.0008, + "reward": 2.499998450279236, + "reward_std": 1.4982163065724308e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6632124352331608, + "grad_norm": 0.9094826749190009, + "kl": 0.096435546875, + "learning_rate": 6.339378238341969e-07, + "loss": -0.0003, + "reward": 2.4999935626983643, + "reward_std": 7.375853328994708e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.6658031088082903, + "grad_norm": 0.39852115044805114, + "kl": 0.065185546875, + "learning_rate": 6.336787564766839e-07, + "loss": -0.0, + "reward": 2.4999945163726807, + "reward_std": 3.151433929815539e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.66839378238342, + "grad_norm": 0.18089621270722947, + "kl": 0.087158203125, + "learning_rate": 6.33419689119171e-07, + "loss": 0.0006, + "reward": 2.499990940093994, + "reward_std": 5.112204917168128e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.6709844559585494, + "grad_norm": 9.963593632923372, + "kl": 0.0513916015625, + "learning_rate": 6.33160621761658e-07, + "loss": -0.0003, + "reward": 1.9926939010620117, + "reward_std": 5.718518514186144e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4926939606666565, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 3.6735751295336785, + "grad_norm": 4.7307522210237, + "kl": 0.23291015625, + "learning_rate": 6.329015544041451e-07, + "loss": 0.0015, + "reward": 2.499953866004944, + "reward_std": 1.4270152519202384e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999537467956543, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.0, + "epoch": 3.676165803108808, + "grad_norm": 2.970753992091834, + "kl": 0.27880859375, + "learning_rate": 6.326424870466322e-07, + "loss": 0.0018, + "reward": 1.9996845722198486, + "reward_std": 2.2852327788314142e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996845126152039, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 3.6787564766839376, + "grad_norm": 24.619561430077106, + "kl": 0.1337890625, + "learning_rate": 6.323834196891191e-07, + "loss": -0.0004, + "reward": 1.773663878440857, + "reward_std": 0.0006489680408776621, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2736640572547913, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 3.681347150259067, + "grad_norm": 23.072096035863627, + "kl": 0.177734375, + "learning_rate": 6.321243523316062e-07, + "loss": 0.0006, + "reward": 2.4999451637268066, + "reward_std": 4.119383868328441e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999451637268066, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.6839378238341967, + "grad_norm": 2.0821958130165252, + "kl": 0.139892578125, + "learning_rate": 6.318652849740932e-07, + "loss": 0.0015, + "reward": 2.49995756149292, + "reward_std": 1.542653672004235e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999573826789856, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 3.686528497409326, + "grad_norm": 0.1564653787225123, + "kl": 0.24072265625, + "learning_rate": 6.316062176165803e-07, + "loss": 0.001, + "reward": 2.4999951124191284, + "reward_std": 2.60988224454195e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.6891191709844557, + "grad_norm": 16.6406722817717, + "kl": 0.1024169921875, + "learning_rate": 6.313471502590674e-07, + "loss": 0.0007, + "reward": 2.3745580911636353, + "reward_std": 0.23150357842678204, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.87455815076828, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.6917098445595853, + "grad_norm": 2.004090882861043, + "kl": 0.121337890625, + "learning_rate": 6.310880829015544e-07, + "loss": 0.0002, + "reward": 2.4999486207962036, + "reward_std": 1.4200291843735613e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999486207962036, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.1875, + "epoch": 3.694300518134715, + "grad_norm": 0.6403312950485796, + "kl": 0.02777099609375, + "learning_rate": 6.308290155440414e-07, + "loss": -0.0005, + "reward": 2.4999945163726807, + "reward_std": 3.3366154639224987e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999944567680359, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 3.6968911917098444, + "grad_norm": 2.9205639466666216, + "kl": 0.26416015625, + "learning_rate": 6.305699481865284e-07, + "loss": 0.0007, + "reward": 2.499971032142639, + "reward_std": 1.3693184882868081e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999709129333496, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 3.699481865284974, + "grad_norm": 15.446863286807062, + "kl": 0.236328125, + "learning_rate": 6.303108808290155e-07, + "loss": 0.0015, + "reward": 2.4999682903289795, + "reward_std": 1.2488001516430813e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999682307243347, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.25, + "epoch": 3.7020725388601035, + "grad_norm": 463.5197166974264, + "kl": 0.32568359375, + "learning_rate": 6.300518134715026e-07, + "loss": 0.0017, + "reward": 1.886283040046692, + "reward_std": 0.0015620035305801139, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3862829506397247, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 3.704663212435233, + "grad_norm": 9.058402794711746, + "kl": 0.2177734375, + "learning_rate": 6.297927461139896e-07, + "loss": 0.001, + "reward": 1.9995219707489014, + "reward_std": 0.00014197003883964499, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995220303535461, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 3.7072538860103625, + "grad_norm": 6.903927491865712, + "kl": 0.3115234375, + "learning_rate": 6.295336787564767e-07, + "loss": 0.0002, + "reward": 1.9875686764717102, + "reward_std": 0.00022618269491658793, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4875686764717102, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.709844559585492, + "grad_norm": 12.484884334585084, + "kl": 0.085693359375, + "learning_rate": 6.292746113989636e-07, + "loss": 0.0002, + "reward": 1.951571524143219, + "reward_std": 0.00031776617959167197, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4515715837478638, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.7124352331606216, + "grad_norm": 0.3476356277318895, + "kl": 0.1123046875, + "learning_rate": 6.290155440414507e-07, + "loss": -0.0001, + "reward": 2.4999953508377075, + "reward_std": 4.329747127940209e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.9375, + "epoch": 3.715025906735751, + "grad_norm": 4.4383373794875425, + "kl": 0.1513671875, + "learning_rate": 6.287564766839378e-07, + "loss": 0.0002, + "reward": 2.4999500513076782, + "reward_std": 1.634702380215458e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999950110912323, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 3.7176165803108807, + "grad_norm": 132.41384826968695, + "kl": 0.333984375, + "learning_rate": 6.284974093264248e-07, + "loss": 0.0013, + "reward": 1.6860251426696777, + "reward_std": 0.2595846206677379, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1860252022743225, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 3.7202072538860103, + "grad_norm": 7.152894042537265, + "kl": 0.17431640625, + "learning_rate": 6.282383419689119e-07, + "loss": 0.0007, + "reward": 1.3684092164039612, + "reward_std": 0.0008181643206626177, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8684092164039612, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.625, + "epoch": 3.72279792746114, + "grad_norm": 1.9817040373736259, + "kl": 0.25732421875, + "learning_rate": 6.27979274611399e-07, + "loss": 0.0007, + "reward": 2.4999974966049194, + "reward_std": 2.305078282915929e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 3.7253886010362693, + "grad_norm": 1.766734935263979, + "kl": 0.22509765625, + "learning_rate": 6.277202072538859e-07, + "loss": 0.0005, + "reward": 1.9937055706977844, + "reward_std": 4.73358773263044e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4937056005001068, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.5625, + "epoch": 3.727979274611399, + "grad_norm": 0.1141355469537074, + "kl": 0.5810546875, + "learning_rate": 6.27461139896373e-07, + "loss": 0.0029, + "reward": 2.4999990463256836, + "reward_std": 8.357976355455321e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999989867210388, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.8125, + "epoch": 3.7305699481865284, + "grad_norm": 0.10434125774158462, + "kl": 0.2255859375, + "learning_rate": 6.2720207253886e-07, + "loss": 0.0019, + "reward": 2.4999983310699463, + "reward_std": 1.1934515669054235e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.9375, + "epoch": 3.733160621761658, + "grad_norm": 0.9799096359733886, + "kl": 0.34765625, + "learning_rate": 6.269430051813471e-07, + "loss": 0.0014, + "reward": 2.499993324279785, + "reward_std": 4.789996694398724e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.5, + "epoch": 3.7357512953367875, + "grad_norm": 5.954220937402469, + "kl": 0.708984375, + "learning_rate": 6.266839378238342e-07, + "loss": 0.0027, + "reward": 1.992884337902069, + "reward_std": 0.0001379923523927573, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4928842782974243, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.3125, + "epoch": 3.738341968911917, + "grad_norm": 1.1318460729934623, + "kl": 0.666015625, + "learning_rate": 6.264248704663212e-07, + "loss": 0.0039, + "reward": 2.4999923706054688, + "reward_std": 5.94410153098579e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999922513961792, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.8125, + "epoch": 3.7409326424870466, + "grad_norm": 18.12893911097727, + "kl": 0.9013671875, + "learning_rate": 6.261658031088083e-07, + "loss": 0.0037, + "reward": 1.7200507819652557, + "reward_std": 0.0006644495169894071, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2200506627559662, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.375, + "epoch": 3.743523316062176, + "grad_norm": 0.18701260774233333, + "kl": 1.04296875, + "learning_rate": 6.259067357512952e-07, + "loss": 0.0034, + "reward": 2.499998092651367, + "reward_std": 1.8562637649210956e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.9375, + "epoch": 3.7461139896373057, + "grad_norm": 1.0751335434105949, + "kl": 1.451171875, + "learning_rate": 6.256476683937823e-07, + "loss": 0.0068, + "reward": 2.499949812889099, + "reward_std": 7.904093649813149e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999949872493744, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.0, + "epoch": 3.7487046632124352, + "grad_norm": 1.716833788318873, + "kl": 0.8203125, + "learning_rate": 6.253886010362694e-07, + "loss": 0.0031, + "reward": 2.4999661445617676, + "reward_std": 8.809275755083945e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999966323375702, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.0, + "epoch": 3.7512953367875648, + "grad_norm": 10.07558271715465, + "kl": 0.62109375, + "learning_rate": 6.251295336787564e-07, + "loss": 0.0028, + "reward": 1.9993083477020264, + "reward_std": 0.0002407157758170797, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993082880973816, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 59.5625, + "epoch": 3.7538860103626943, + "grad_norm": 0.8677486357109474, + "kl": 0.984375, + "learning_rate": 6.248704663212436e-07, + "loss": 0.0047, + "reward": 2.499996066093445, + "reward_std": 4.674857109421282e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.3125, + "epoch": 3.756476683937824, + "grad_norm": 0.2824385889796441, + "kl": 0.654296875, + "learning_rate": 6.246113989637304e-07, + "loss": 0.0026, + "reward": 2.499992251396179, + "reward_std": 2.972786660393467e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.125, + "epoch": 3.7590673575129534, + "grad_norm": 0.4941509984846888, + "kl": 0.68359375, + "learning_rate": 6.243523316062176e-07, + "loss": 0.0027, + "reward": 2.4999935626983643, + "reward_std": 2.8710551305266563e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935030937195, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.4375, + "epoch": 3.761658031088083, + "grad_norm": 13.854696272013184, + "kl": 0.48046875, + "learning_rate": 6.240932642487047e-07, + "loss": 0.0025, + "reward": 1.9981037378311157, + "reward_std": 6.392458408299717e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981036186218262, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.125, + "epoch": 3.7642487046632125, + "grad_norm": 0.3009483321137214, + "kl": 0.6611328125, + "learning_rate": 6.238341968911917e-07, + "loss": 0.0024, + "reward": 2.4999921321868896, + "reward_std": 4.073889613209758e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920725822449, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 64.9375, + "epoch": 3.766839378238342, + "grad_norm": 14.157154907881363, + "kl": 0.75390625, + "learning_rate": 6.235751295336788e-07, + "loss": 0.0025, + "reward": 1.9996938705444336, + "reward_std": 6.909230296514579e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996941089630127, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.8125, + "epoch": 3.7694300518134716, + "grad_norm": 33.689926696100336, + "kl": 0.376953125, + "learning_rate": 6.233160621761658e-07, + "loss": 0.0015, + "reward": 1.9743483066558838, + "reward_std": 0.01166716232910403, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4743481874465942, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.125, + "epoch": 3.772020725388601, + "grad_norm": 3.229081384481035, + "kl": 1.125, + "learning_rate": 6.230569948186529e-07, + "loss": 0.0045, + "reward": 2.4999780654907227, + "reward_std": 3.5143618788424646e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781250953674, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 3.7746113989637307, + "grad_norm": 0.47853361161886987, + "kl": 0.4794921875, + "learning_rate": 6.227979274611399e-07, + "loss": 0.0017, + "reward": 1.9998518228530884, + "reward_std": 8.310325142701913e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998517334461212, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 61.625, + "epoch": 3.77720207253886, + "grad_norm": 2.285716281999186, + "kl": 0.890625, + "learning_rate": 6.225388601036269e-07, + "loss": 0.0034, + "reward": 2.4999836683273315, + "reward_std": 9.813125188884442e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999836683273315, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.125, + "epoch": 3.7797927461139897, + "grad_norm": 9.024346679469728, + "kl": 1.03515625, + "learning_rate": 6.22279792746114e-07, + "loss": 0.0047, + "reward": 2.4999150037765503, + "reward_std": 4.231919552921681e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999147057533264, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.5625, + "epoch": 3.7823834196891193, + "grad_norm": 0.5899310847730816, + "kl": 0.724609375, + "learning_rate": 6.22020725388601e-07, + "loss": 0.0017, + "reward": 2.4999972581863403, + "reward_std": 1.3974001831229543e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.1875, + "epoch": 3.784974093264249, + "grad_norm": 32.00919419799135, + "kl": 0.814453125, + "learning_rate": 6.217616580310881e-07, + "loss": 0.0034, + "reward": 1.9992035627365112, + "reward_std": 8.725093562134134e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992035031318665, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.25, + "epoch": 3.7875647668393784, + "grad_norm": 0.18407889287867105, + "kl": 0.806640625, + "learning_rate": 6.215025906735752e-07, + "loss": 0.0041, + "reward": 2.499992251396179, + "reward_std": 4.0907706306825276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999922513961792, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.375, + "epoch": 3.790155440414508, + "grad_norm": 0.6562043621246557, + "kl": 0.66796875, + "learning_rate": 6.212435233160621e-07, + "loss": 0.0016, + "reward": 2.499985694885254, + "reward_std": 3.916427260719502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999858736991882, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.0625, + "epoch": 3.7927461139896375, + "grad_norm": 0.17764864475122358, + "kl": 0.7236328125, + "learning_rate": 6.209844559585492e-07, + "loss": 0.0029, + "reward": 2.4999979734420776, + "reward_std": 2.6351077053732297e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.625, + "epoch": 3.795336787564767, + "grad_norm": 2.8898494809438984, + "kl": 0.626953125, + "learning_rate": 6.207253886010363e-07, + "loss": 0.0029, + "reward": 1.9983633160591125, + "reward_std": 5.549153206629853e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983634054660797, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.875, + "epoch": 3.7979274611398965, + "grad_norm": 4.633969522990657, + "kl": 0.693359375, + "learning_rate": 6.204663212435233e-07, + "loss": 0.0036, + "reward": 2.49995756149292, + "reward_std": 4.5567202505480964e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999574422836304, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.125, + "epoch": 3.800518134715026, + "grad_norm": 0.19547863285249395, + "kl": 0.33984375, + "learning_rate": 6.202072538860104e-07, + "loss": 0.0014, + "reward": 0.9998528361320496, + "reward_std": 3.5379691780690337e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.4998527765274048, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 51.75, + "epoch": 3.8031088082901556, + "grad_norm": 0.9795643348661174, + "kl": 0.373046875, + "learning_rate": 6.199481865284974e-07, + "loss": 0.0011, + "reward": 2.4999929666519165, + "reward_std": 5.263982984615723e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.375, + "epoch": 3.805699481865285, + "grad_norm": 0.4754978042313422, + "kl": 0.3701171875, + "learning_rate": 6.196891191709844e-07, + "loss": 0.0022, + "reward": 2.4999914169311523, + "reward_std": 6.005264140185318e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.375, + "epoch": 3.8082901554404147, + "grad_norm": 0.5144800245672949, + "kl": 0.1090087890625, + "learning_rate": 6.194300518134715e-07, + "loss": 0.0006, + "reward": 2.4999231100082397, + "reward_std": 9.189096090267412e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999231100082397, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.125, + "epoch": 3.8108808290155443, + "grad_norm": 0.4560077735322792, + "kl": 0.142822265625, + "learning_rate": 6.191709844559585e-07, + "loss": 0.0015, + "reward": 2.499967336654663, + "reward_std": 5.862255136435124e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999670386314392, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.813471502590674, + "grad_norm": 49.41466699436052, + "kl": 0.049072265625, + "learning_rate": 6.189119170984456e-07, + "loss": -0.0001, + "reward": 2.499945878982544, + "reward_std": 4.246328614954109e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999459981918335, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.6875, + "epoch": 3.8160621761658033, + "grad_norm": 1.3809840963766469, + "kl": 0.139892578125, + "learning_rate": 6.186528497409326e-07, + "loss": 0.0006, + "reward": 2.499993920326233, + "reward_std": 4.2472178733987676e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 3.818652849740933, + "grad_norm": 0.37197075865265644, + "kl": 0.09930419921875, + "learning_rate": 6.183937823834197e-07, + "loss": -0.0002, + "reward": 1.9998546838760376, + "reward_std": 9.605147170077544e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998548924922943, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.875, + "epoch": 3.8212435233160624, + "grad_norm": 17.46554141511608, + "kl": 0.16748046875, + "learning_rate": 6.181347150259067e-07, + "loss": 0.0007, + "reward": 2.3118717670440674, + "reward_std": 0.4098711311817169, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8118716478347778, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 3.823834196891192, + "grad_norm": 13.832986110571346, + "kl": 0.367431640625, + "learning_rate": 6.178756476683937e-07, + "loss": 0.0011, + "reward": 2.499951958656311, + "reward_std": 0.00011580804778077436, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999520182609558, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 3.8264248704663215, + "grad_norm": 0.15626784662529, + "kl": 0.1424560546875, + "learning_rate": 6.176165803108808e-07, + "loss": 0.0014, + "reward": 2.4999951124191284, + "reward_std": 2.029710998385781e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999950528144836, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.125, + "epoch": 3.8290155440414506, + "grad_norm": 0.6904260654016462, + "kl": 0.1005859375, + "learning_rate": 6.173575129533678e-07, + "loss": -0.0003, + "reward": 2.4999685287475586, + "reward_std": 6.046272119419882e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999684691429138, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 3.83160621761658, + "grad_norm": 1.1816070078883867, + "kl": 0.13916015625, + "learning_rate": 6.170984455958549e-07, + "loss": 0.0015, + "reward": 1.9991791248321533, + "reward_std": 3.0139472301016212e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499178946018219, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 3.8341968911917097, + "grad_norm": 0.7107998092785732, + "kl": 0.114013671875, + "learning_rate": 6.16839378238342e-07, + "loss": 0.0, + "reward": 2.4999890327453613, + "reward_std": 4.616178784999647e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989092350006, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8367875647668392, + "grad_norm": 0.047679444385930934, + "kl": 0.1220703125, + "learning_rate": 6.165803108808289e-07, + "loss": -0.0005, + "reward": 2.499991536140442, + "reward_std": 1.5257598420248542e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.839378238341969, + "grad_norm": 0.15190050677486275, + "kl": 0.0980072021484375, + "learning_rate": 6.16321243523316e-07, + "loss": -0.0008, + "reward": 2.4999955892562866, + "reward_std": 2.4593052785348846e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8419689119170983, + "grad_norm": 1.8095366787743785, + "kl": 0.04498291015625, + "learning_rate": 6.16062176165803e-07, + "loss": -0.0006, + "reward": 2.499949097633362, + "reward_std": 1.2512091188909835e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999490976333618, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.844559585492228, + "grad_norm": 3.050845107592284, + "kl": 0.135498046875, + "learning_rate": 6.158031088082901e-07, + "loss": 0.0013, + "reward": 2.4999804496765137, + "reward_std": 1.1739518924969161e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980390071869, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.8471502590673574, + "grad_norm": 3.9106821283709814, + "kl": 0.21875, + "learning_rate": 6.155440414507772e-07, + "loss": 0.0014, + "reward": 2.4999276399612427, + "reward_std": 1.6660619166941615e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999274611473083, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.849740932642487, + "grad_norm": 1.478544684566244, + "kl": 0.0648193359375, + "learning_rate": 6.152849740932642e-07, + "loss": 0.0008, + "reward": 2.499980330467224, + "reward_std": 8.342951218764938e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999804496765137, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.8523316062176165, + "grad_norm": 0.14218270222521923, + "kl": 0.094970703125, + "learning_rate": 6.150259067357512e-07, + "loss": 0.0001, + "reward": 2.4999966621398926, + "reward_std": 1.1975077143233648e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967217445374, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.854922279792746, + "grad_norm": 0.9914738916691916, + "kl": 0.05126953125, + "learning_rate": 6.147668393782383e-07, + "loss": 0.0002, + "reward": 2.499989628791809, + "reward_std": 7.236959504552942e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895095825195, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.8575129533678756, + "grad_norm": 6.025132410877975, + "kl": 0.106689453125, + "learning_rate": 6.145077720207253e-07, + "loss": 0.0, + "reward": 2.4999608993530273, + "reward_std": 1.592787248227978e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999610781669617, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.860103626943005, + "grad_norm": 0.9531103089990565, + "kl": 0.179443359375, + "learning_rate": 6.142487046632124e-07, + "loss": 0.0004, + "reward": 2.4999879598617554, + "reward_std": 5.469433801863488e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.1875, + "epoch": 3.8626943005181347, + "grad_norm": 74.41752048232165, + "kl": 0.111328125, + "learning_rate": 6.139896373056994e-07, + "loss": 0.0008, + "reward": 1.9575918316841125, + "reward_std": 0.0731734535893338, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4575918018817902, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.865284974093264, + "grad_norm": 0.24236503129513232, + "kl": 0.13525390625, + "learning_rate": 6.137305699481866e-07, + "loss": -0.0001, + "reward": 2.4999717473983765, + "reward_std": 3.366131920756743e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999718070030212, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8678756476683938, + "grad_norm": 0.8550638933239846, + "kl": 0.01934814453125, + "learning_rate": 6.134715025906736e-07, + "loss": 0.0014, + "reward": 2.4999886751174927, + "reward_std": 4.1406764808016305e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999885559082031, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8704663212435233, + "grad_norm": 0.7776768517865141, + "kl": 0.073974609375, + "learning_rate": 6.132124352331606e-07, + "loss": 0.0008, + "reward": 2.4999849796295166, + "reward_std": 7.899149238710379e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985158443451, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.873056994818653, + "grad_norm": 0.06771922441915233, + "kl": 0.0428466796875, + "learning_rate": 6.129533678756477e-07, + "loss": 0.0, + "reward": 2.499994993209839, + "reward_std": 1.6687939705661847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999950528144836, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8756476683937824, + "grad_norm": 7.851391387126275, + "kl": 0.072265625, + "learning_rate": 6.126943005181347e-07, + "loss": -0.0001, + "reward": 2.249977469444275, + "reward_std": 0.26727293102896965, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749977469444275, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.878238341968912, + "grad_norm": 87.89655568024126, + "kl": 0.21484375, + "learning_rate": 6.124352331606218e-07, + "loss": 0.0007, + "reward": 1.9994741678237915, + "reward_std": 0.00034601625452523876, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499474287033081, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.8808290155440415, + "grad_norm": 0.6663699074252637, + "kl": 0.097900390625, + "learning_rate": 6.121761658031089e-07, + "loss": 0.0015, + "reward": 2.499893069267273, + "reward_std": 9.794798188522691e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998931288719177, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.883419689119171, + "grad_norm": 0.25014862568425383, + "kl": 0.082763671875, + "learning_rate": 6.119170984455958e-07, + "loss": -0.0003, + "reward": 2.4999961853027344, + "reward_std": 1.2891987921648251e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.8860103626943006, + "grad_norm": 10.665526953684378, + "kl": 0.099365234375, + "learning_rate": 6.116580310880829e-07, + "loss": 0.0009, + "reward": 1.9997857809066772, + "reward_std": 7.206776979273855e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997857809066772, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.88860103626943, + "grad_norm": 0.04980586190898546, + "kl": 0.12060546875, + "learning_rate": 6.113989637305699e-07, + "loss": -0.0001, + "reward": 2.4999990463256836, + "reward_std": 1.079484377441986e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999225139618, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.8911917098445596, + "grad_norm": 18.820186876883366, + "kl": 0.066162109375, + "learning_rate": 6.11139896373057e-07, + "loss": 0.0003, + "reward": 2.4374711513519287, + "reward_std": 0.17684629114512518, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937471091747284, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.893782383419689, + "grad_norm": 3.44404116041616, + "kl": 0.03900146484375, + "learning_rate": 6.108808290155441e-07, + "loss": -0.0008, + "reward": 2.499988555908203, + "reward_std": 1.3486970829035272e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.8963730569948187, + "grad_norm": 1.62107763982155, + "kl": 0.0655517578125, + "learning_rate": 6.106217616580311e-07, + "loss": 0.0001, + "reward": 1.9987400770187378, + "reward_std": 2.216845021507652e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987401366233826, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.8989637305699483, + "grad_norm": 124.29165863467259, + "kl": 0.137939453125, + "learning_rate": 6.103626943005181e-07, + "loss": 0.0003, + "reward": 1.972155511379242, + "reward_std": 0.011117634104266472, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4721554517745972, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.901554404145078, + "grad_norm": 0.08261802332516702, + "kl": 0.1021728515625, + "learning_rate": 6.101036269430051e-07, + "loss": -0.0006, + "reward": 2.4999983310699463, + "reward_std": 1.373515516434054e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.9041450777202074, + "grad_norm": 12.020803892650783, + "kl": 0.18310546875, + "learning_rate": 6.098445595854922e-07, + "loss": -0.0, + "reward": 1.892823576927185, + "reward_std": 0.0007788562370478758, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3928236365318298, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.906735751295337, + "grad_norm": 2.675325021055988, + "kl": 0.111083984375, + "learning_rate": 6.095854922279793e-07, + "loss": 0.0007, + "reward": 2.4999688863754272, + "reward_std": 1.3228021089162212e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999687671661377, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.9093264248704664, + "grad_norm": 0.24045941419279215, + "kl": 0.0582275390625, + "learning_rate": 6.093264248704663e-07, + "loss": 0.0012, + "reward": 2.4999953508377075, + "reward_std": 2.206568026963396e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.911917098445596, + "grad_norm": 2.138825695550966, + "kl": 0.23095703125, + "learning_rate": 6.090673575129534e-07, + "loss": 0.0013, + "reward": 1.9985513091087341, + "reward_std": 2.5303734673798317e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498551368713379, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9145077720207255, + "grad_norm": 0.27551762876822766, + "kl": 0.05364990234375, + "learning_rate": 6.088082901554404e-07, + "loss": -0.0002, + "reward": 2.499990224838257, + "reward_std": 3.2414905035693664e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999903440475464, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.917098445595855, + "grad_norm": 0.19544538688642218, + "kl": 0.07275390625, + "learning_rate": 6.085492227979274e-07, + "loss": -0.0, + "reward": 2.499996781349182, + "reward_std": 3.208890007044829e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.9196891191709846, + "grad_norm": 12.414224781692207, + "kl": 0.08935546875, + "learning_rate": 6.082901554404145e-07, + "loss": 0.0005, + "reward": 2.4999618530273438, + "reward_std": 2.7265602511761244e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999618530273438, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.9222797927461137, + "grad_norm": 3.4897342789029775, + "kl": 0.138427734375, + "learning_rate": 6.080310880829015e-07, + "loss": 0.0022, + "reward": 2.499973773956299, + "reward_std": 1.4360656422240936e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999737739562988, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9248704663212433, + "grad_norm": 0.08180259092393129, + "kl": 0.0465087890625, + "learning_rate": 6.077720207253886e-07, + "loss": -0.0005, + "reward": 2.499997138977051, + "reward_std": 1.5240450750297896e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.927461139896373, + "grad_norm": 0.13154426265580085, + "kl": 0.0740966796875, + "learning_rate": 6.075129533678757e-07, + "loss": 0.0, + "reward": 2.4999908208847046, + "reward_std": 2.662878785031353e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9300518134715023, + "grad_norm": 0.1315121423199463, + "kl": 0.0531005859375, + "learning_rate": 6.072538860103626e-07, + "loss": -0.0009, + "reward": 2.4999947547912598, + "reward_std": 2.8946423071829486e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.932642487046632, + "grad_norm": 58.0455571939986, + "kl": 0.08984375, + "learning_rate": 6.069948186528497e-07, + "loss": 0.0007, + "reward": 1.997781753540039, + "reward_std": 5.9722893638536334e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4977816343307495, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.9352331606217614, + "grad_norm": 1.461864711452078, + "kl": 0.123291015625, + "learning_rate": 6.067357512953367e-07, + "loss": -0.0006, + "reward": 2.499990940093994, + "reward_std": 7.963669219179792e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911189079285, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.937823834196891, + "grad_norm": 0.6282782327030264, + "kl": 0.0810546875, + "learning_rate": 6.064766839378238e-07, + "loss": 0.0, + "reward": 2.4999932050704956, + "reward_std": 3.544434548530262e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.9404145077720205, + "grad_norm": 2.5276860922958995, + "kl": 0.04736328125, + "learning_rate": 6.062176165803109e-07, + "loss": 0.0007, + "reward": 2.4999648332595825, + "reward_std": 1.1204292661659565e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999647736549377, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.94300518134715, + "grad_norm": 2.275415320612054, + "kl": 0.092529296875, + "learning_rate": 6.059585492227979e-07, + "loss": -0.0004, + "reward": 2.4999932050704956, + "reward_std": 3.785589910876297e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 3.9455958549222796, + "grad_norm": 13.146166797259028, + "kl": 0.114013671875, + "learning_rate": 6.056994818652849e-07, + "loss": 0.0003, + "reward": 1.9897688031196594, + "reward_std": 8.064197891144431e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4897687435150146, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.948186528497409, + "grad_norm": 24.718529045802832, + "kl": 0.12109375, + "learning_rate": 6.054404145077719e-07, + "loss": 0.0005, + "reward": 2.4374316930770874, + "reward_std": 0.1769547753712004, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937431514263153, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.9507772020725387, + "grad_norm": 21.061746102109815, + "kl": 0.31640625, + "learning_rate": 6.05181347150259e-07, + "loss": 0.0012, + "reward": 1.7074499130249023, + "reward_std": 0.17726099950959906, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2074499130249023, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9533678756476682, + "grad_norm": 5.512914461709353, + "kl": 0.07568359375, + "learning_rate": 6.049222797927461e-07, + "loss": 0.0003, + "reward": 2.499977469444275, + "reward_std": 8.490087566315196e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999775290489197, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.9559585492227978, + "grad_norm": 0.19557940310647692, + "kl": 0.10791015625, + "learning_rate": 6.046632124352331e-07, + "loss": -0.0, + "reward": 2.4999953508377075, + "reward_std": 1.4924983133823844e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9585492227979273, + "grad_norm": 0.9418662495466913, + "kl": 0.0841064453125, + "learning_rate": 6.044041450777202e-07, + "loss": 0.0003, + "reward": 2.4999905824661255, + "reward_std": 8.288041271953261e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.961139896373057, + "grad_norm": 0.15702479261526286, + "kl": 0.078125, + "learning_rate": 6.041450777202071e-07, + "loss": 0.0002, + "reward": 2.4999929666519165, + "reward_std": 2.2389545506484865e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929070472717, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 3.9637305699481864, + "grad_norm": 0.6240138107668911, + "kl": 0.142333984375, + "learning_rate": 6.038860103626942e-07, + "loss": 0.0004, + "reward": 2.4999821186065674, + "reward_std": 5.765621722275682e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999821186065674, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.966321243523316, + "grad_norm": 41.97970705755977, + "kl": 0.14404296875, + "learning_rate": 6.036269430051813e-07, + "loss": -0.0, + "reward": 1.9241811037063599, + "reward_std": 0.011888031393937126, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4241811633110046, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9689119170984455, + "grad_norm": 1.3772361336422627, + "kl": 0.040283203125, + "learning_rate": 6.033678756476683e-07, + "loss": 0.0002, + "reward": 2.4999881982803345, + "reward_std": 7.161868381899694e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881982803345, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.971502590673575, + "grad_norm": 12.373095287259442, + "kl": 0.054443359375, + "learning_rate": 6.031088082901554e-07, + "loss": -0.0005, + "reward": 1.9985239505767822, + "reward_std": 9.379618177263183e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985239803791046, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 3.9740932642487046, + "grad_norm": 0.3609381581539641, + "kl": 0.04931640625, + "learning_rate": 6.028497409326426e-07, + "loss": 0.0003, + "reward": 2.49999463558197, + "reward_std": 4.70987015432911e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.976683937823834, + "grad_norm": 1.6352814710464176, + "kl": 0.13818359375, + "learning_rate": 6.025906735751294e-07, + "loss": 0.0006, + "reward": 2.4999889135360718, + "reward_std": 7.197763125077472e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999890327453613, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.9792746113989637, + "grad_norm": 10.726885310772955, + "kl": 0.1240234375, + "learning_rate": 6.023316062176166e-07, + "loss": 0.0007, + "reward": 1.4987960457801819, + "reward_std": 9.59022254392039e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9987959861755371, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.981865284974093, + "grad_norm": 1.8859934173984643, + "kl": 0.11767578125, + "learning_rate": 6.020725388601036e-07, + "loss": -0.0008, + "reward": 2.499971866607666, + "reward_std": 6.885548714308243e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999718070030212, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.9844559585492227, + "grad_norm": 13.781964335609478, + "kl": 0.148681640625, + "learning_rate": 6.018134715025907e-07, + "loss": 0.0005, + "reward": 1.4967296719551086, + "reward_std": 0.0005246834916761145, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9967296719551086, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 3.9870466321243523, + "grad_norm": 2.158598587715824, + "kl": 0.114013671875, + "learning_rate": 6.015544041450778e-07, + "loss": 0.0014, + "reward": 1.9998290538787842, + "reward_std": 2.493898955435725e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998289942741394, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 3.989637305699482, + "grad_norm": 10.058535586911876, + "kl": 0.0584716796875, + "learning_rate": 6.012953367875648e-07, + "loss": 0.0011, + "reward": 2.499976873397827, + "reward_std": 6.330054816316988e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976634979248, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.9922279792746114, + "grad_norm": 2.392574936951284, + "kl": 0.105712890625, + "learning_rate": 6.010362694300518e-07, + "loss": 0.0008, + "reward": 2.4999676942825317, + "reward_std": 1.3341488056539674e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999967634677887, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 3.994818652849741, + "grad_norm": 2.1139371378907774, + "kl": 0.08349609375, + "learning_rate": 6.007772020725388e-07, + "loss": 0.0005, + "reward": 1.9998367428779602, + "reward_std": 1.0321002037017024e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998367428779602, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 3.9974093264248705, + "grad_norm": 0.9856443104513883, + "kl": 0.061279296875, + "learning_rate": 6.005181347150259e-07, + "loss": 0.0006, + "reward": 2.4999947547912598, + "reward_std": 6.6028815126628615e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.0, + "grad_norm": 0.31081216513036164, + "kl": 0.0810546875, + "learning_rate": 6.00259067357513e-07, + "loss": -0.0001, + "reward": 2.4999886751174927, + "reward_std": 3.388945344795502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.0025906735751295, + "grad_norm": 4.416994857899252, + "kl": 0.0670166015625, + "learning_rate": 6e-07, + "loss": 0.0017, + "reward": 2.499978542327881, + "reward_std": 1.6664245094943908e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999784231185913, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.005181347150259, + "grad_norm": 27.809646398420966, + "kl": 0.1326904296875, + "learning_rate": 5.997409326424871e-07, + "loss": 0.0003, + "reward": 1.4839889407157898, + "reward_std": 0.2056162540538935, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9839890897274017, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.007772020725389, + "grad_norm": 7.818237071912511, + "kl": 0.146484375, + "learning_rate": 5.99481865284974e-07, + "loss": 0.0007, + "reward": 1.9899332523345947, + "reward_std": 0.00018400712815491715, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4899333119392395, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.010362694300518, + "grad_norm": 5.585671080027885, + "kl": 0.10693359375, + "learning_rate": 5.992227979274611e-07, + "loss": -0.0004, + "reward": 2.0624399185180664, + "reward_std": 0.17678657060935166, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562440037727356, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.012953367875648, + "grad_norm": 6.586635257763988, + "kl": 0.09375, + "learning_rate": 5.989637305699482e-07, + "loss": 0.0009, + "reward": 2.4999741315841675, + "reward_std": 1.631400783708159e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999741911888123, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.015544041450777, + "grad_norm": 4.147550109509986, + "kl": 0.097412109375, + "learning_rate": 5.987046632124352e-07, + "loss": 0.0014, + "reward": 2.4998841285705566, + "reward_std": 1.604829245138717e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999884009361267, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.018134715025907, + "grad_norm": 0.26039902860809827, + "kl": 0.087646484375, + "learning_rate": 5.984455958549223e-07, + "loss": -0.0004, + "reward": 2.4999969005584717, + "reward_std": 1.6589945062150946e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.020725388601036, + "grad_norm": 0.8508742168846786, + "kl": 0.1435546875, + "learning_rate": 5.981865284974093e-07, + "loss": 0.0013, + "reward": 2.499996304512024, + "reward_std": 4.854603218973352e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 4.023316062176166, + "grad_norm": 92.86900396070175, + "kl": 0.110107421875, + "learning_rate": 5.979274611398963e-07, + "loss": 0.0013, + "reward": 1.9981833100318909, + "reward_std": 0.0023745253711240366, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981831908226013, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.025906735751295, + "grad_norm": 8.214861237744474, + "kl": 0.326171875, + "learning_rate": 5.976683937823834e-07, + "loss": 0.0014, + "reward": 1.9978615641593933, + "reward_std": 2.7920706997974776e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978615045547485, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.028497409326425, + "grad_norm": 0.03544922385781608, + "kl": 0.0819091796875, + "learning_rate": 5.974093264248704e-07, + "loss": 0.0, + "reward": 2.499995470046997, + "reward_std": 7.366109571194102e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.0310880829015545, + "grad_norm": 1.176865512507275, + "kl": 0.107421875, + "learning_rate": 5.971502590673575e-07, + "loss": -0.0005, + "reward": 2.4999771118164062, + "reward_std": 1.0998891184499371e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999773502349854, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.033678756476684, + "grad_norm": 22.13563981067276, + "kl": 0.11962890625, + "learning_rate": 5.968911917098445e-07, + "loss": 0.0001, + "reward": 1.9987602233886719, + "reward_std": 0.0002699352146464662, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987602233886719, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.036269430051814, + "grad_norm": 0.8636131421491717, + "kl": 0.04815673828125, + "learning_rate": 5.966321243523316e-07, + "loss": 0.0008, + "reward": 2.499994993209839, + "reward_std": 5.407045648553321e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 4.038860103626943, + "grad_norm": 0.18692970436475073, + "kl": 0.1220703125, + "learning_rate": 5.963730569948186e-07, + "loss": 0.0001, + "reward": 2.4999942779541016, + "reward_std": 3.4796528325387044e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.041450777202073, + "grad_norm": 7.547067500247267, + "kl": 0.15185546875, + "learning_rate": 5.961139896373056e-07, + "loss": 0.0017, + "reward": 1.9933630228042603, + "reward_std": 0.00011093804221218306, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4933629035949707, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.044041450777202, + "grad_norm": 0.13983521107809369, + "kl": 0.068359375, + "learning_rate": 5.958549222797927e-07, + "loss": 0.0001, + "reward": 2.499997615814209, + "reward_std": 1.551813397782098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.046632124352332, + "grad_norm": 0.5365448327665252, + "kl": 0.13134765625, + "learning_rate": 5.955958549222798e-07, + "loss": -0.0005, + "reward": 2.4999958276748657, + "reward_std": 2.9013790481258184e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.049222797927461, + "grad_norm": 0.8592941042650075, + "kl": 0.20751953125, + "learning_rate": 5.953367875647668e-07, + "loss": 0.0002, + "reward": 1.999908447265625, + "reward_std": 1.400190649292199e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999085068702698, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.051813471502591, + "grad_norm": 25.5094200872479, + "kl": 0.108642578125, + "learning_rate": 5.950777202072539e-07, + "loss": 0.0004, + "reward": 2.3749287128448486, + "reward_std": 0.23156403409984705, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749286532402039, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.05440414507772, + "grad_norm": 0.2480863977760045, + "kl": 0.08544921875, + "learning_rate": 5.948186528497408e-07, + "loss": 0.0, + "reward": 2.499994993209839, + "reward_std": 3.331746995627327e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.05699481865285, + "grad_norm": 38.638483764038256, + "kl": 0.14111328125, + "learning_rate": 5.945595854922279e-07, + "loss": 0.0013, + "reward": 1.9990499019622803, + "reward_std": 5.9976065131195355e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4990499019622803, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.0595854922279795, + "grad_norm": 0.6763009839069763, + "kl": 0.06640625, + "learning_rate": 5.94300518134715e-07, + "loss": 0.001, + "reward": 2.4999979734420776, + "reward_std": 1.7670318470663915e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.062176165803109, + "grad_norm": 0.04540838025088783, + "kl": 0.0709228515625, + "learning_rate": 5.94041450777202e-07, + "loss": -0.0003, + "reward": 2.499998688697815, + "reward_std": 9.241435634521622e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.064766839378239, + "grad_norm": 0.09630885029282686, + "kl": 0.1148681640625, + "learning_rate": 5.937823834196891e-07, + "loss": 0.0, + "reward": 2.499995470046997, + "reward_std": 1.7548972408576446e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.067357512953368, + "grad_norm": 3.8997047573922203, + "kl": 0.168701171875, + "learning_rate": 5.935233160621761e-07, + "loss": 0.0001, + "reward": 2.4998762607574463, + "reward_std": 1.6851620330271544e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998763799667358, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.069948186528498, + "grad_norm": 1.160117974467065, + "kl": 0.0579833984375, + "learning_rate": 5.932642487046632e-07, + "loss": -0.0003, + "reward": 2.499985456466675, + "reward_std": 7.409892504028903e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999853372573853, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.072538860103627, + "grad_norm": 25.26173320582219, + "kl": 0.10009765625, + "learning_rate": 5.930051813471502e-07, + "loss": -0.0006, + "reward": 2.499919891357422, + "reward_std": 2.8892596560581296e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999201893806458, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.075129533678757, + "grad_norm": 0.4202578113393349, + "kl": 0.04266357421875, + "learning_rate": 5.927461139896372e-07, + "loss": 0.0006, + "reward": 2.4999955892562866, + "reward_std": 2.6960880177284707e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.077720207253886, + "grad_norm": 2.7566474096963645, + "kl": 0.1005859375, + "learning_rate": 5.924870466321243e-07, + "loss": -0.0, + "reward": 2.4999741315841675, + "reward_std": 8.443552246717445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999741315841675, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.080310880829016, + "grad_norm": 0.9526217628260819, + "kl": 0.0552978515625, + "learning_rate": 5.922279792746113e-07, + "loss": 0.0005, + "reward": 1.9999492168426514, + "reward_std": 7.816958259354578e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999490976333618, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.082901554404145, + "grad_norm": 4.60583281730857, + "kl": 0.166015625, + "learning_rate": 5.919689119170984e-07, + "loss": 0.0007, + "reward": 1.9523777961730957, + "reward_std": 0.00019758677677828018, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4523777961730957, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.085492227979275, + "grad_norm": 9.037126448420663, + "kl": 0.074951171875, + "learning_rate": 5.917098445595856e-07, + "loss": 0.0, + "reward": 2.312481999397278, + "reward_std": 0.2587793828600411, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124821186065674, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.0880829015544045, + "grad_norm": 6.917998303646123, + "kl": 0.116943359375, + "learning_rate": 5.914507772020724e-07, + "loss": 0.0006, + "reward": 2.0624269247055054, + "reward_std": 0.17679864926412847, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624268651008606, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.090673575129534, + "grad_norm": 0.46547614924629965, + "kl": 0.03802490234375, + "learning_rate": 5.911917098445596e-07, + "loss": 0.0, + "reward": 1.999932050704956, + "reward_std": 4.348954121269344e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999321103096008, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.0932642487046635, + "grad_norm": 9.223621795378826, + "kl": 0.32275390625, + "learning_rate": 5.909326424870466e-07, + "loss": 0.0013, + "reward": 2.3584206104278564, + "reward_std": 0.39992353320121765, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8584206104278564, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.095854922279793, + "grad_norm": 0.9026910738428424, + "kl": 0.0321044921875, + "learning_rate": 5.906735751295337e-07, + "loss": -0.0011, + "reward": 2.4999924898147583, + "reward_std": 4.905882633465808e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.098445595854923, + "grad_norm": 0.17413724835275787, + "kl": 0.060211181640625, + "learning_rate": 5.904145077720208e-07, + "loss": 0.0004, + "reward": 2.4999938011169434, + "reward_std": 1.9574302427827206e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.101036269430052, + "grad_norm": 0.07000372007439741, + "kl": 0.0592041015625, + "learning_rate": 5.901554404145078e-07, + "loss": 0.0017, + "reward": 2.4999983310699463, + "reward_std": 1.3967380709800636e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.103626943005182, + "grad_norm": 8.76569314070839, + "kl": 0.1220703125, + "learning_rate": 5.898963730569948e-07, + "loss": 0.0006, + "reward": 1.978507161140442, + "reward_std": 0.00020144380607689527, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4785070717334747, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.106217616580311, + "grad_norm": 3.4526155202041475, + "kl": 0.0562744140625, + "learning_rate": 5.896373056994819e-07, + "loss": -0.0002, + "reward": 1.9999414682388306, + "reward_std": 1.137816707341699e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999414682388306, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.108808290155441, + "grad_norm": 1.2058907545604165, + "kl": 0.0775146484375, + "learning_rate": 5.893782383419689e-07, + "loss": -0.0007, + "reward": 2.499985098838806, + "reward_std": 8.535613687854493e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985158443451, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.11139896373057, + "grad_norm": 1.0745653224141987, + "kl": 0.056640625, + "learning_rate": 5.89119170984456e-07, + "loss": -0.0003, + "reward": 2.4999966621398926, + "reward_std": 3.5814974808090483e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.1139896373057, + "grad_norm": 3.6557707207549868, + "kl": 0.09423828125, + "learning_rate": 5.88860103626943e-07, + "loss": 0.0013, + "reward": 1.9947112798690796, + "reward_std": 8.674551190779312e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4947111904621124, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 4.116580310880829, + "grad_norm": 0.5735154793907004, + "kl": 0.068603515625, + "learning_rate": 5.886010362694301e-07, + "loss": -0.0004, + "reward": 2.4999921321868896, + "reward_std": 3.5775509275026707e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.119170984455959, + "grad_norm": 0.448701984904896, + "kl": 0.0687713623046875, + "learning_rate": 5.883419689119171e-07, + "loss": 0.0007, + "reward": 2.4999974966049194, + "reward_std": 3.076568759752263e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.1217616580310885, + "grad_norm": 0.5454345824131598, + "kl": 0.14794921875, + "learning_rate": 5.880829015544041e-07, + "loss": 0.0013, + "reward": 2.4999910593032837, + "reward_std": 4.442941815341328e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.124352331606218, + "grad_norm": 0.5327503865261822, + "kl": 0.066162109375, + "learning_rate": 5.878238341968912e-07, + "loss": -0.0006, + "reward": 2.499996304512024, + "reward_std": 3.081041313635069e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.126943005181348, + "grad_norm": 3.4598297305021304, + "kl": 0.123779296875, + "learning_rate": 5.875647668393782e-07, + "loss": 0.0005, + "reward": 1.9593265652656555, + "reward_std": 0.00029503035966627067, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.459326446056366, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.129533678756476, + "grad_norm": 1.342845432818344, + "kl": 0.122802734375, + "learning_rate": 5.873056994818653e-07, + "loss": 0.0003, + "reward": 2.4999804496765137, + "reward_std": 8.85338187117668e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999804496765137, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 4.132124352331606, + "grad_norm": 0.3342285856716859, + "kl": 0.03778076171875, + "learning_rate": 5.870466321243524e-07, + "loss": 0.0013, + "reward": 2.4999923706054688, + "reward_std": 3.5883319924323587e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.134715025906735, + "grad_norm": 0.32819218678633233, + "kl": 0.08050537109375, + "learning_rate": 5.867875647668393e-07, + "loss": -0.0005, + "reward": 1.9998695850372314, + "reward_std": 6.5313302002323326e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998696744441986, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.137305699481865, + "grad_norm": 0.975674366386174, + "kl": 0.03753662109375, + "learning_rate": 5.865284974093264e-07, + "loss": -0.0003, + "reward": 2.499990463256836, + "reward_std": 6.31794296168664e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905228614807, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.139896373056994, + "grad_norm": 2.206039368778935, + "kl": 0.069580078125, + "learning_rate": 5.862694300518134e-07, + "loss": -0.0004, + "reward": 2.4999513626098633, + "reward_std": 2.4791568193904823e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999513626098633, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.142487046632124, + "grad_norm": 0.06605473478520514, + "kl": 0.056396484375, + "learning_rate": 5.860103626943005e-07, + "loss": -0.0001, + "reward": 2.499997854232788, + "reward_std": 1.7147524147276272e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.1450777202072535, + "grad_norm": 10.284093898136161, + "kl": 0.0762939453125, + "learning_rate": 5.857512953367876e-07, + "loss": 0.0011, + "reward": 2.499969244003296, + "reward_std": 1.5804248960193945e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999969244003296, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.147668393782383, + "grad_norm": 1.2757577869505718, + "kl": 0.077392578125, + "learning_rate": 5.854922279792746e-07, + "loss": -0.0003, + "reward": 2.499992609024048, + "reward_std": 5.06816235201768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.150259067357513, + "grad_norm": 2.3679157856969923, + "kl": 0.05615234375, + "learning_rate": 5.852331606217616e-07, + "loss": 0.0006, + "reward": 2.4999738931655884, + "reward_std": 9.621571734896861e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973714351654, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.152849740932642, + "grad_norm": 2.4563619146675624, + "kl": 0.20751953125, + "learning_rate": 5.849740932642486e-07, + "loss": 0.0001, + "reward": 1.9793579578399658, + "reward_std": 5.1970230515507865e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4793579876422882, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.155440414507772, + "grad_norm": 6.967935707116754, + "kl": 0.099853515625, + "learning_rate": 5.847150259067357e-07, + "loss": 0.0011, + "reward": 2.4999468326568604, + "reward_std": 2.864020916604204e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999467134475708, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.158031088082901, + "grad_norm": 0.6872351586519199, + "kl": 0.0625, + "learning_rate": 5.844559585492228e-07, + "loss": 0.0011, + "reward": 2.4999942779541016, + "reward_std": 4.553016879071947e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.160621761658031, + "grad_norm": 0.34463471614119895, + "kl": 0.1339111328125, + "learning_rate": 5.841968911917098e-07, + "loss": 0.001, + "reward": 2.4999847412109375, + "reward_std": 4.717058914138761e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984622001648, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.16321243523316, + "grad_norm": 91.61056384872836, + "kl": 0.1658935546875, + "learning_rate": 5.839378238341969e-07, + "loss": 0.0015, + "reward": 1.8187761306762695, + "reward_std": 0.003314054640895847, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.318775862455368, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.16580310880829, + "grad_norm": 2.6667578967866614, + "kl": 0.150390625, + "learning_rate": 5.836787564766839e-07, + "loss": 0.0001, + "reward": 1.6774897575378418, + "reward_std": 0.00026905265212917584, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1774897575378418, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.168393782383419, + "grad_norm": 0.26074399657417957, + "kl": 0.1455078125, + "learning_rate": 5.834196891191709e-07, + "loss": -0.0004, + "reward": 2.499993324279785, + "reward_std": 4.172011131231557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.170984455958549, + "grad_norm": 1.3050137036227154, + "kl": 0.080322265625, + "learning_rate": 5.83160621761658e-07, + "loss": -0.0004, + "reward": 2.4999701976776123, + "reward_std": 9.359884984405653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999701380729675, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.1735751295336785, + "grad_norm": 9.592629986383619, + "kl": 0.19873046875, + "learning_rate": 5.82901554404145e-07, + "loss": 0.0006, + "reward": 1.8040056228637695, + "reward_std": 0.0007750826889605378, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3040056824684143, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.176165803108808, + "grad_norm": 1.8077326956934356, + "kl": 0.032562255859375, + "learning_rate": 5.826424870466321e-07, + "loss": 0.0004, + "reward": 2.4999879598617554, + "reward_std": 6.898350022765953e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879598617554, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.178756476683938, + "grad_norm": 0.4217824998854284, + "kl": 0.11962890625, + "learning_rate": 5.823834196891192e-07, + "loss": -0.0007, + "reward": 2.4999979734420776, + "reward_std": 1.6396128614815098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.181347150259067, + "grad_norm": 5.843529025065718, + "kl": 0.18505859375, + "learning_rate": 5.821243523316061e-07, + "loss": 0.0006, + "reward": 2.1872295141220093, + "reward_std": 0.25879292379286767, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6872295141220093, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.183937823834197, + "grad_norm": 0.11885772684141718, + "kl": 0.14208984375, + "learning_rate": 5.818652849740932e-07, + "loss": -0.0, + "reward": 2.499978542327881, + "reward_std": 1.7583438989277056e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999785423278809, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.186528497409326, + "grad_norm": 31.155695422064454, + "kl": 0.0616455078125, + "learning_rate": 5.816062176165802e-07, + "loss": 0.0004, + "reward": 1.8776912689208984, + "reward_std": 0.0009670431447830197, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3776913285255432, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.189119170984456, + "grad_norm": 7.954052111218322, + "kl": 0.1370849609375, + "learning_rate": 5.813471502590673e-07, + "loss": 0.0004, + "reward": 1.9713550209999084, + "reward_std": 0.00029388689586085093, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4713551104068756, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.191709844559585, + "grad_norm": 2.341547440190843, + "kl": 0.07275390625, + "learning_rate": 5.810880829015544e-07, + "loss": -0.0003, + "reward": 2.499981164932251, + "reward_std": 7.422139901791525e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999810457229614, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.194300518134715, + "grad_norm": 0.7659936902044249, + "kl": 0.14453125, + "learning_rate": 5.808290155440414e-07, + "loss": 0.0009, + "reward": 2.499971389770508, + "reward_std": 9.378344088872836e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999713897705078, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.196891191709844, + "grad_norm": 20.51146761135554, + "kl": 0.2236328125, + "learning_rate": 5.805699481865284e-07, + "loss": 0.0012, + "reward": 1.995547890663147, + "reward_std": 0.00017465949485995225, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4955478310585022, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.199481865284974, + "grad_norm": 10.469760481460995, + "kl": 0.123779296875, + "learning_rate": 5.803108808290154e-07, + "loss": 0.001, + "reward": 2.499975800514221, + "reward_std": 6.026807170655957e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999756813049316, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.2020725388601035, + "grad_norm": 3.5205897577474934, + "kl": 0.14697265625, + "learning_rate": 5.800518134715026e-07, + "loss": 0.0005, + "reward": 1.8850517272949219, + "reward_std": 9.285745943543589e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3850517272949219, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.204663212435233, + "grad_norm": 9.309249096427962, + "kl": 0.076171875, + "learning_rate": 5.797927461139897e-07, + "loss": 0.0006, + "reward": 2.4374871253967285, + "reward_std": 0.17679068280077104, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374873042106628, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.2072538860103625, + "grad_norm": 43.70409482839093, + "kl": 0.137451171875, + "learning_rate": 5.795336787564767e-07, + "loss": 0.0006, + "reward": 1.990125060081482, + "reward_std": 0.0005508052640834649, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.490125060081482, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.209844559585492, + "grad_norm": 17.308240886568655, + "kl": 0.15087890625, + "learning_rate": 5.792746113989638e-07, + "loss": 0.0007, + "reward": 2.249952495098114, + "reward_std": 0.2672914825616317, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499526143074036, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.212435233160622, + "grad_norm": 2.479283648758274, + "kl": 0.098388671875, + "learning_rate": 5.790155440414507e-07, + "loss": 0.0005, + "reward": 2.4999747276306152, + "reward_std": 1.5346635336754844e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999746680259705, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.215025906735751, + "grad_norm": 0.7766965270088614, + "kl": 0.138427734375, + "learning_rate": 5.787564766839378e-07, + "loss": -0.0005, + "reward": 2.4999606609344482, + "reward_std": 1.2539077715700842e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999608397483826, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.217616580310881, + "grad_norm": 2.660088851994437, + "kl": 0.0633544921875, + "learning_rate": 5.784974093264249e-07, + "loss": 0.0003, + "reward": 1.9996073246002197, + "reward_std": 2.0539871457003755e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996073842048645, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.22020725388601, + "grad_norm": 4.827196641300278, + "kl": 0.1435546875, + "learning_rate": 5.782383419689119e-07, + "loss": 0.0007, + "reward": 2.1874724626541138, + "reward_std": 0.2587815834192497, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6874723434448242, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.22279792746114, + "grad_norm": 2.5232709500050703, + "kl": 0.16552734375, + "learning_rate": 5.77979274611399e-07, + "loss": 0.0005, + "reward": 1.9996851682662964, + "reward_std": 2.2433102458307985e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996852278709412, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.225388601036269, + "grad_norm": 0.35144749238925405, + "kl": 0.1083984375, + "learning_rate": 5.777202072538861e-07, + "loss": 0.0014, + "reward": 2.499971389770508, + "reward_std": 5.214265343056468e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999971330165863, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.227979274611399, + "grad_norm": 3.317477626526753, + "kl": 0.11572265625, + "learning_rate": 5.77461139896373e-07, + "loss": 0.0002, + "reward": 2.499928116798401, + "reward_std": 1.4514650899855042e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999281764030457, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.230569948186528, + "grad_norm": 5.4803308991111015, + "kl": 0.107666015625, + "learning_rate": 5.772020725388601e-07, + "loss": 0.0005, + "reward": 2.499990463256836, + "reward_std": 1.717954683044809e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.233160621761658, + "grad_norm": 11.144472998625615, + "kl": 0.14404296875, + "learning_rate": 5.769430051813471e-07, + "loss": 0.0009, + "reward": 2.4999709129333496, + "reward_std": 1.3828344890498556e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999709129333496, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.2357512953367875, + "grad_norm": 1.3764847322182971, + "kl": 0.1708984375, + "learning_rate": 5.766839378238342e-07, + "loss": -0.0001, + "reward": 2.4999645948410034, + "reward_std": 8.748760137677891e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999646544456482, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.238341968911917, + "grad_norm": 2.0545668901315954, + "kl": 0.07177734375, + "learning_rate": 5.764248704663213e-07, + "loss": 0.0011, + "reward": 2.4999762773513794, + "reward_std": 1.1046366580558242e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976098537445, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.240932642487047, + "grad_norm": 0.33295871541623595, + "kl": 0.07666015625, + "learning_rate": 5.761658031088083e-07, + "loss": -0.0001, + "reward": 2.499983310699463, + "reward_std": 2.8792962893930962e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999833703041077, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.243523316062176, + "grad_norm": 0.1594739117810296, + "kl": 0.074462890625, + "learning_rate": 5.759067357512953e-07, + "loss": 0.002, + "reward": 2.49999737739563, + "reward_std": 2.8450117497413885e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.246113989637306, + "grad_norm": 0.7947963589123364, + "kl": 0.137939453125, + "learning_rate": 5.756476683937823e-07, + "loss": 0.0014, + "reward": 2.4999921321868896, + "reward_std": 6.424090543077909e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.248704663212435, + "grad_norm": 0.05333305068135456, + "kl": 0.0377197265625, + "learning_rate": 5.753886010362694e-07, + "loss": 0.0007, + "reward": 2.4999979734420776, + "reward_std": 1.1159213784139865e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 4.251295336787565, + "grad_norm": 0.22430505547001248, + "kl": 0.089599609375, + "learning_rate": 5.751295336787565e-07, + "loss": 0.0007, + "reward": 2.499995231628418, + "reward_std": 3.761320954254188e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.253886010362694, + "grad_norm": 0.10671235522834856, + "kl": 0.112060546875, + "learning_rate": 5.748704663212435e-07, + "loss": 0.0012, + "reward": 2.49999737739563, + "reward_std": 2.6441645104569034e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.256476683937824, + "grad_norm": 0.07172136716811715, + "kl": 0.072265625, + "learning_rate": 5.746113989637306e-07, + "loss": -0.001, + "reward": 2.499998688697815, + "reward_std": 1.0090398916418053e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.259067357512953, + "grad_norm": 39.51998085908117, + "kl": 0.07958984375, + "learning_rate": 5.743523316062175e-07, + "loss": 0.0013, + "reward": 1.965232253074646, + "reward_std": 0.014019200414736588, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4652320742607117, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 4.261658031088083, + "grad_norm": 0.35216612170084327, + "kl": 0.0677490234375, + "learning_rate": 5.740932642487046e-07, + "loss": -0.0012, + "reward": 2.499987483024597, + "reward_std": 5.831992211824399e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999876022338867, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.2642487046632125, + "grad_norm": 0.35681831415677384, + "kl": 0.1038818359375, + "learning_rate": 5.738341968911917e-07, + "loss": 0.0014, + "reward": 1.9999709129333496, + "reward_std": 5.617907589794413e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999707639217377, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.266839378238342, + "grad_norm": 4.694867300286028, + "kl": 0.109130859375, + "learning_rate": 5.735751295336787e-07, + "loss": -0.0002, + "reward": 2.4997060298919678, + "reward_std": 4.285841896489728e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9997060894966125, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.269430051813472, + "grad_norm": 6.744128030216046, + "kl": 0.0557861328125, + "learning_rate": 5.733160621761658e-07, + "loss": 0.0008, + "reward": 2.4999911785125732, + "reward_std": 8.113884405247518e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.272020725388601, + "grad_norm": 1.9999573719327954, + "kl": 0.09765625, + "learning_rate": 5.730569948186528e-07, + "loss": 0.0014, + "reward": 2.4999910593032837, + "reward_std": 6.217932138952165e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.274611398963731, + "grad_norm": 0.4288660620903904, + "kl": 0.1357421875, + "learning_rate": 5.727979274611398e-07, + "loss": 0.0005, + "reward": 2.4999947547912598, + "reward_std": 2.7621567824098747e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.27720207253886, + "grad_norm": 0.1853425641084272, + "kl": 0.15185546875, + "learning_rate": 5.725388601036269e-07, + "loss": 0.0008, + "reward": 2.499994993209839, + "reward_std": 4.175905246484035e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999950528144836, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.27979274611399, + "grad_norm": 6.105633266796804, + "kl": 0.13330078125, + "learning_rate": 5.722797927461139e-07, + "loss": 0.0007, + "reward": 2.4998953342437744, + "reward_std": 4.475373282275541e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999895453453064, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.282383419689119, + "grad_norm": 6.399311567115233, + "kl": 0.106201171875, + "learning_rate": 5.72020725388601e-07, + "loss": -0.0, + "reward": 1.720837116241455, + "reward_std": 0.000405950486083384, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2208371758460999, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.284974093264249, + "grad_norm": 0.7050338628178441, + "kl": 0.0782470703125, + "learning_rate": 5.717616580310881e-07, + "loss": 0.0008, + "reward": 2.499989867210388, + "reward_std": 5.765255764345056e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999897480010986, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.287564766839378, + "grad_norm": 1.6388946383431446, + "kl": 0.1217041015625, + "learning_rate": 5.715025906735751e-07, + "loss": -0.0005, + "reward": 2.499995470046997, + "reward_std": 4.27052509621717e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5625, + "epoch": 4.290155440414508, + "grad_norm": 1350.3492618723762, + "kl": 0.13232421875, + "learning_rate": 5.712435233160621e-07, + "loss": -0.0004, + "reward": 2.4347686767578125, + "reward_std": 0.18449809011832485, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9347687363624573, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.2927461139896375, + "grad_norm": 1.8772716626489356, + "kl": 0.121337890625, + "learning_rate": 5.709844559585491e-07, + "loss": 0.0003, + "reward": 2.4999595880508423, + "reward_std": 1.4377440834323352e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999597668647766, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.295336787564767, + "grad_norm": 0.7265465720578551, + "kl": 0.0733642578125, + "learning_rate": 5.707253886010362e-07, + "loss": 0.0009, + "reward": 2.4999961853027344, + "reward_std": 3.3018006888596574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.4375, + "epoch": 4.2979274611398965, + "grad_norm": 0.13537141286032547, + "kl": 0.06365966796875, + "learning_rate": 5.704663212435233e-07, + "loss": 0.0016, + "reward": 2.499996304512024, + "reward_std": 2.452327521496045e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.300518134715026, + "grad_norm": 0.16293186208052443, + "kl": 0.16015625, + "learning_rate": 5.702072538860103e-07, + "loss": 0.0014, + "reward": 2.499996781349182, + "reward_std": 3.5358443426503072e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967217445374, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.303108808290156, + "grad_norm": 3.2712378822545487, + "kl": 0.0635986328125, + "learning_rate": 5.699481865284974e-07, + "loss": 0.001, + "reward": 2.4999812841415405, + "reward_std": 1.4047294826013967e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999812245368958, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.305699481865285, + "grad_norm": 1.4783695522043987, + "kl": 0.0579833984375, + "learning_rate": 5.696891191709843e-07, + "loss": 0.0015, + "reward": 2.499969244003296, + "reward_std": 7.840116268198472e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999969244003296, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.308290155440415, + "grad_norm": 8.911939478469888, + "kl": 0.091796875, + "learning_rate": 5.694300518134714e-07, + "loss": -0.0001, + "reward": 1.9984776377677917, + "reward_std": 0.00014535136051563313, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984778463840485, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.310880829015544, + "grad_norm": 66.86633687724522, + "kl": 0.121826171875, + "learning_rate": 5.691709844559586e-07, + "loss": 0.0005, + "reward": 1.328099548816681, + "reward_std": 0.002699408984881302, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8280995786190033, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.313471502590674, + "grad_norm": 0.3071995792457166, + "kl": 0.074462890625, + "learning_rate": 5.689119170984456e-07, + "loss": 0.0006, + "reward": 2.4999983310699463, + "reward_std": 1.2217776657053037e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.316062176165803, + "grad_norm": 0.26425745698842634, + "kl": 0.058349609375, + "learning_rate": 5.686528497409327e-07, + "loss": 0.0017, + "reward": 2.4999985694885254, + "reward_std": 1.6321747580150259e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 4.318652849740933, + "grad_norm": 55.27313495909315, + "kl": 0.104248046875, + "learning_rate": 5.683937823834197e-07, + "loss": 0.0004, + "reward": 2.186655044555664, + "reward_std": 0.259467652848798, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6866552233695984, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.321243523316062, + "grad_norm": 2.1075119889377203, + "kl": 0.09857177734375, + "learning_rate": 5.681347150259067e-07, + "loss": 0.0007, + "reward": 2.49995219707489, + "reward_std": 1.2156680668340414e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999952256679535, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.323834196891192, + "grad_norm": 17.017277666589393, + "kl": 0.07568359375, + "learning_rate": 5.678756476683938e-07, + "loss": 0.0001, + "reward": 2.4374330043792725, + "reward_std": 0.17682045467972785, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374327659606934, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 4.3264248704663215, + "grad_norm": 4.34776966293989, + "kl": 0.194091796875, + "learning_rate": 5.676165803108808e-07, + "loss": 0.0008, + "reward": 1.9974610805511475, + "reward_std": 7.275603820744436e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4974610209465027, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.329015544041451, + "grad_norm": 1.7935625393089787, + "kl": 0.08203125, + "learning_rate": 5.673575129533679e-07, + "loss": 0.0003, + "reward": 2.499973773956299, + "reward_std": 1.5526404695265228e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999738335609436, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.331606217616581, + "grad_norm": 0.1589387687491885, + "kl": 0.1483154296875, + "learning_rate": 5.670984455958549e-07, + "loss": 0.0019, + "reward": 2.49999737739563, + "reward_std": 1.8638985466168378e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.33419689119171, + "grad_norm": 6.0483060424583, + "kl": 0.1376953125, + "learning_rate": 5.66839378238342e-07, + "loss": 0.0001, + "reward": 2.4999327659606934, + "reward_std": 2.656341735018941e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999327063560486, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.33678756476684, + "grad_norm": 104.87761967629571, + "kl": 0.05828857421875, + "learning_rate": 5.66580310880829e-07, + "loss": -0.0004, + "reward": 1.9709819555282593, + "reward_std": 0.010413831605660562, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4709819853305817, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.339378238341969, + "grad_norm": 4.0079432532628, + "kl": 0.071929931640625, + "learning_rate": 5.66321243523316e-07, + "loss": 0.0003, + "reward": 2.4999886751174927, + "reward_std": 6.66723258291313e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.341968911917099, + "grad_norm": 3.624018531195769, + "kl": 0.122802734375, + "learning_rate": 5.660621761658031e-07, + "loss": 0.0006, + "reward": 1.9980930089950562, + "reward_std": 4.907955940325337e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498093068599701, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.344559585492228, + "grad_norm": 5.078483454740987, + "kl": 0.0625, + "learning_rate": 5.658031088082901e-07, + "loss": 0.0003, + "reward": 2.499971628189087, + "reward_std": 2.3246760861184157e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999971628189087, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.347150259067358, + "grad_norm": 14.030459045656572, + "kl": 0.1376953125, + "learning_rate": 5.655440414507772e-07, + "loss": 0.0015, + "reward": 2.499985456466675, + "reward_std": 1.0048452622868354e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999853372573853, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.349740932642487, + "grad_norm": 2.3457785822766435, + "kl": 0.08740234375, + "learning_rate": 5.652849740932643e-07, + "loss": 0.0012, + "reward": 2.499953508377075, + "reward_std": 2.1980561541568022e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999533295631409, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.352331606217617, + "grad_norm": 0.7997081006452124, + "kl": 0.1173095703125, + "learning_rate": 5.650259067357512e-07, + "loss": 0.0013, + "reward": 2.4999847412109375, + "reward_std": 8.641696695121937e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999848008155823, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.3549222797927465, + "grad_norm": 4.03442756705084, + "kl": 0.30810546875, + "learning_rate": 5.647668393782383e-07, + "loss": 0.0015, + "reward": 1.9937379360198975, + "reward_std": 8.287364062198321e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4937377572059631, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.357512953367876, + "grad_norm": 0.6380754845600469, + "kl": 0.0714111328125, + "learning_rate": 5.645077720207254e-07, + "loss": 0.0004, + "reward": 2.499994158744812, + "reward_std": 2.530499386921292e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.360103626943006, + "grad_norm": 0.8457500822384119, + "kl": 0.079833984375, + "learning_rate": 5.642487046632124e-07, + "loss": -0.0004, + "reward": 2.4999916553497314, + "reward_std": 5.998976689625124e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.362694300518135, + "grad_norm": 0.3529816341740026, + "kl": 0.0849609375, + "learning_rate": 5.639896373056995e-07, + "loss": 0.0005, + "reward": 2.499996542930603, + "reward_std": 3.0697000283907983e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.0, + "epoch": 4.365284974093265, + "grad_norm": 1.158518374683692, + "kl": 0.0728759765625, + "learning_rate": 5.637305699481865e-07, + "loss": 0.0003, + "reward": 2.499997854232788, + "reward_std": 1.4521563684866123e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.367875647668393, + "grad_norm": 0.3591072319522074, + "kl": 0.13916015625, + "learning_rate": 5.634715025906735e-07, + "loss": -0.0003, + "reward": 2.499972701072693, + "reward_std": 2.5953077056328766e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999728202819824, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.370466321243523, + "grad_norm": 3.7760805161238484, + "kl": 0.13623046875, + "learning_rate": 5.632124352331606e-07, + "loss": 0.0005, + "reward": 1.9998499155044556, + "reward_std": 3.568363376871275e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998499751091003, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.373056994818652, + "grad_norm": 1.1462186690346785, + "kl": 0.185791015625, + "learning_rate": 5.629533678756476e-07, + "loss": 0.0003, + "reward": 2.4999940395355225, + "reward_std": 4.879303219240683e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.375647668393782, + "grad_norm": 0.5031808372050066, + "kl": 0.10498046875, + "learning_rate": 5.626943005181347e-07, + "loss": 0.001, + "reward": 2.499981164932251, + "reward_std": 7.235457815113477e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999812245368958, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.3782383419689115, + "grad_norm": 2.6709955823662885, + "kl": 0.09228515625, + "learning_rate": 5.624352331606217e-07, + "loss": 0.0005, + "reward": 1.9993645548820496, + "reward_std": 3.636389965322451e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993645548820496, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.380829015544041, + "grad_norm": 5.223258851660594, + "kl": 0.070556640625, + "learning_rate": 5.621761658031088e-07, + "loss": 0.0008, + "reward": 1.7910540103912354, + "reward_std": 0.00038991638120933203, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2910539507865906, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.383419689119171, + "grad_norm": 11.48828302017316, + "kl": 0.1318359375, + "learning_rate": 5.619170984455959e-07, + "loss": 0.0008, + "reward": 2.059893310070038, + "reward_std": 0.17790578648964583, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5598931908607483, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.3860103626943, + "grad_norm": 4.044784153342431, + "kl": 0.097900390625, + "learning_rate": 5.616580310880828e-07, + "loss": 0.0004, + "reward": 2.4999879598617554, + "reward_std": 1.4329050429751078e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879002571106, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.38860103626943, + "grad_norm": 59.147150064031706, + "kl": 0.14794921875, + "learning_rate": 5.613989637305699e-07, + "loss": 0.0007, + "reward": 2.0000797510147095, + "reward_std": 0.41397102706878286, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.50007963180542, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.391191709844559, + "grad_norm": 0.13230641410232852, + "kl": 0.0547637939453125, + "learning_rate": 5.611398963730569e-07, + "loss": -0.0001, + "reward": 2.499997854232788, + "reward_std": 1.8281344296156021e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.393782383419689, + "grad_norm": 0.7078995609936762, + "kl": 0.058197021484375, + "learning_rate": 5.60880829015544e-07, + "loss": 0.0009, + "reward": 2.499995708465576, + "reward_std": 4.362423453585507e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.396373056994818, + "grad_norm": 0.04911042800504467, + "kl": 0.0479736328125, + "learning_rate": 5.606217616580311e-07, + "loss": 0.0008, + "reward": 2.499998927116394, + "reward_std": 1.2129205799737974e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.398963730569948, + "grad_norm": 4.154814346568021, + "kl": 0.221435546875, + "learning_rate": 5.60362694300518e-07, + "loss": 0.0006, + "reward": 1.9961839318275452, + "reward_std": 7.138645219129103e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4961840510368347, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.401554404145077, + "grad_norm": 0.9624370825251787, + "kl": 0.1119384765625, + "learning_rate": 5.601036269430051e-07, + "loss": 0.0007, + "reward": 2.4999773502349854, + "reward_std": 6.098519520492118e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999773502349854, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.404145077720207, + "grad_norm": 0.25643874556879054, + "kl": 0.1737060546875, + "learning_rate": 5.598445595854921e-07, + "loss": 0.0014, + "reward": 2.4999911785125732, + "reward_std": 4.062527125370252e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.4067357512953365, + "grad_norm": 0.6283293540103942, + "kl": 0.06787109375, + "learning_rate": 5.595854922279792e-07, + "loss": 0.0006, + "reward": 2.4999942779541016, + "reward_std": 3.5235444784120773e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.409326424870466, + "grad_norm": 3.840606502892612, + "kl": 0.065185546875, + "learning_rate": 5.593264248704663e-07, + "loss": 0.0008, + "reward": 1.9998810291290283, + "reward_std": 3.656877743196674e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998809695243835, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.4119170984455955, + "grad_norm": 0.46864875560041785, + "kl": 0.10498046875, + "learning_rate": 5.590673575129533e-07, + "loss": 0.0004, + "reward": 2.4999979734420776, + "reward_std": 1.638020250993577e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.414507772020725, + "grad_norm": 4.529201394053517, + "kl": 0.086669921875, + "learning_rate": 5.588082901554404e-07, + "loss": 0.0011, + "reward": 2.499974489212036, + "reward_std": 1.2205046800772834e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999744892120361, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.417098445595855, + "grad_norm": 24.52666576882307, + "kl": 0.09857177734375, + "learning_rate": 5.585492227979274e-07, + "loss": 0.0004, + "reward": 2.4998916387557983, + "reward_std": 0.00010237504315568913, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998915791511536, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.419689119170984, + "grad_norm": 5.3011261825787885, + "kl": 0.1708984375, + "learning_rate": 5.582901554404144e-07, + "loss": 0.0007, + "reward": 2.4999430179595947, + "reward_std": 2.5337361478250386e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999431371688843, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.422279792746114, + "grad_norm": 0.47976953579162007, + "kl": 0.086669921875, + "learning_rate": 5.580310880829016e-07, + "loss": 0.0008, + "reward": 2.4999910593032837, + "reward_std": 3.6727846008943743e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999910593032837, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.424870466321243, + "grad_norm": 4.4050511109149015, + "kl": 0.0736083984375, + "learning_rate": 5.577720207253886e-07, + "loss": -0.0, + "reward": 2.4999529123306274, + "reward_std": 2.021394442408564e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999529123306274, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.427461139896373, + "grad_norm": 0.7492906569364401, + "kl": 0.050018310546875, + "learning_rate": 5.575129533678757e-07, + "loss": 0.0005, + "reward": 2.4999948740005493, + "reward_std": 5.412219252320938e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.430051813471502, + "grad_norm": 6.600957887693868, + "kl": 0.0947265625, + "learning_rate": 5.572538860103628e-07, + "loss": 0.0002, + "reward": 2.4999550580978394, + "reward_std": 2.978387965413276e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999549984931946, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.432642487046632, + "grad_norm": 1.4863056551898715, + "kl": 0.032684326171875, + "learning_rate": 5.569948186528497e-07, + "loss": -0.0, + "reward": 2.4999955892562866, + "reward_std": 3.5129194202454528e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.435233160621761, + "grad_norm": 0.3018152184523658, + "kl": 0.07470703125, + "learning_rate": 5.567357512953368e-07, + "loss": 0.0003, + "reward": 2.499995231628418, + "reward_std": 4.410703013491002e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.437823834196891, + "grad_norm": 14.125489467811978, + "kl": 0.109619140625, + "learning_rate": 5.564766839378238e-07, + "loss": 0.0004, + "reward": 2.4374903440475464, + "reward_std": 0.1767951499477931, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374903440475464, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.4404145077720205, + "grad_norm": 4.314532829435389, + "kl": 0.186279296875, + "learning_rate": 5.562176165803109e-07, + "loss": 0.0014, + "reward": 2.4999916553497314, + "reward_std": 1.18362083867396e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.44300518134715, + "grad_norm": 67.13593042781774, + "kl": 0.12451171875, + "learning_rate": 5.55958549222798e-07, + "loss": -0.0002, + "reward": 2.062413215637207, + "reward_std": 0.17680161040880193, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624132752418518, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.44559585492228, + "grad_norm": 6.316466242667441, + "kl": 0.0654296875, + "learning_rate": 5.55699481865285e-07, + "loss": 0.0005, + "reward": 1.9998271465301514, + "reward_std": 2.2908495338924695e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998270273208618, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.448186528497409, + "grad_norm": 73.18929649772944, + "kl": 0.0538330078125, + "learning_rate": 5.55440414507772e-07, + "loss": 0.001, + "reward": 2.124595046043396, + "reward_std": 0.23170430760046656, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6245948672294617, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.450777202072539, + "grad_norm": 9.060460988652666, + "kl": 2.470458984375, + "learning_rate": 5.55181347150259e-07, + "loss": 0.0107, + "reward": 1.9999070167541504, + "reward_std": 1.519389070381294e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999069571495056, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.453367875647668, + "grad_norm": 0.40007900210739733, + "kl": 0.046875, + "learning_rate": 5.549222797927461e-07, + "loss": 0.0009, + "reward": 2.4999955892562866, + "reward_std": 3.0979892926552566e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.455958549222798, + "grad_norm": 4.9909218232330534, + "kl": 0.092529296875, + "learning_rate": 5.546632124352332e-07, + "loss": -0.0001, + "reward": 2.4999775886535645, + "reward_std": 9.459797297495243e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999775886535645, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 4.458549222797927, + "grad_norm": 4.247828911291178, + "kl": 0.150634765625, + "learning_rate": 5.544041450777202e-07, + "loss": 0.001, + "reward": 1.9988590478897095, + "reward_std": 5.002601136538942e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49885892868042, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.461139896373057, + "grad_norm": 0.4149888892291473, + "kl": 0.063232421875, + "learning_rate": 5.541450777202073e-07, + "loss": -0.0, + "reward": 2.499990463256836, + "reward_std": 4.090190032002283e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990701675415, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.463730569948186, + "grad_norm": 0.6074406266025194, + "kl": 0.134765625, + "learning_rate": 5.538860103626942e-07, + "loss": -0.0004, + "reward": 2.4999852180480957, + "reward_std": 5.665189860337705e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852776527405, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.466321243523316, + "grad_norm": 2.9292397019683807, + "kl": 0.0535888671875, + "learning_rate": 5.536269430051813e-07, + "loss": -0.0002, + "reward": 2.4999821186065674, + "reward_std": 1.273383224997815e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999821186065674, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.4689119170984455, + "grad_norm": 0.21694732216567852, + "kl": 0.096435546875, + "learning_rate": 5.533678756476684e-07, + "loss": -0.0002, + "reward": 2.499993085861206, + "reward_std": 4.882911525783129e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.471502590673575, + "grad_norm": 0.6946692384181016, + "kl": 0.065673828125, + "learning_rate": 5.531088082901554e-07, + "loss": -0.0002, + "reward": 1.9991164207458496, + "reward_std": 1.3185736406740034e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991165101528168, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.474093264248705, + "grad_norm": 0.30270239547969036, + "kl": 0.115234375, + "learning_rate": 5.528497409326425e-07, + "loss": -0.0007, + "reward": 2.4999929666519165, + "reward_std": 2.55716349784052e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.476683937823834, + "grad_norm": 1.4196766797275246, + "kl": 0.114013671875, + "learning_rate": 5.525906735751296e-07, + "loss": 0.0003, + "reward": 2.4999927282333374, + "reward_std": 5.371910845042294e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.479274611398964, + "grad_norm": 0.4853911491627105, + "kl": 0.107177734375, + "learning_rate": 5.523316062176165e-07, + "loss": 0.0002, + "reward": 2.4999895095825195, + "reward_std": 6.176342139951885e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999894499778748, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.481865284974093, + "grad_norm": 0.36009955780918074, + "kl": 0.1220703125, + "learning_rate": 5.520725388601036e-07, + "loss": 0.0018, + "reward": 2.4999899864196777, + "reward_std": 5.2746761411981424e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999898672103882, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.484455958549223, + "grad_norm": 8.971639198802148, + "kl": 0.1143798828125, + "learning_rate": 5.518134715025906e-07, + "loss": 0.0003, + "reward": 1.865371286869049, + "reward_std": 0.0008516380319747441, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.365371197462082, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.487046632124352, + "grad_norm": 1.8322105910803868, + "kl": 0.082763671875, + "learning_rate": 5.515544041450777e-07, + "loss": -0.0001, + "reward": 1.9987614750862122, + "reward_std": 3.63909566658549e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498761534690857, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.489637305699482, + "grad_norm": 0.16708755295284186, + "kl": 0.05419921875, + "learning_rate": 5.512953367875648e-07, + "loss": 0.0011, + "reward": 2.4999920129776, + "reward_std": 3.906192205249681e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999919533729553, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.492227979274611, + "grad_norm": 0.5255056644256783, + "kl": 0.0484619140625, + "learning_rate": 5.510362694300518e-07, + "loss": 0.0011, + "reward": 2.4999929666519165, + "reward_std": 5.205781008044141e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.494818652849741, + "grad_norm": 22.47454299777541, + "kl": 0.059326171875, + "learning_rate": 5.507772020725388e-07, + "loss": -0.0002, + "reward": 1.9862151145935059, + "reward_std": 0.0003306973626422405, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4862149953842163, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 4.4974093264248705, + "grad_norm": 7.5925048290151045, + "kl": 0.0758056640625, + "learning_rate": 5.505181347150258e-07, + "loss": 0.0008, + "reward": 2.499931812286377, + "reward_std": 1.8098790064868808e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999316930770874, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.5, + "grad_norm": 13.840509146825339, + "kl": 0.1650390625, + "learning_rate": 5.502590673575129e-07, + "loss": 0.0004, + "reward": 2.3123154640197754, + "reward_std": 0.2588353480641672, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8123154640197754, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.5025906735751295, + "grad_norm": 1.4649583984022538, + "kl": 0.0894775390625, + "learning_rate": 5.5e-07, + "loss": -0.0006, + "reward": 1.9937479496002197, + "reward_std": 5.249552850727923e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.493748128414154, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.505181347150259, + "grad_norm": 0.16107007715496194, + "kl": 0.099365234375, + "learning_rate": 5.49740932642487e-07, + "loss": -0.0008, + "reward": 2.499993681907654, + "reward_std": 2.730508754211769e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.507772020725389, + "grad_norm": 0.36454592636768024, + "kl": 0.0196533203125, + "learning_rate": 5.494818652849741e-07, + "loss": 0.0007, + "reward": 2.499996542930603, + "reward_std": 2.138415709396213e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.510362694300518, + "grad_norm": 1.2598692413619204, + "kl": 0.17138671875, + "learning_rate": 5.49222797927461e-07, + "loss": 0.002, + "reward": 2.4999940395355225, + "reward_std": 3.844752654913464e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.512953367875648, + "grad_norm": 14.141894308075937, + "kl": 0.0623779296875, + "learning_rate": 5.489637305699481e-07, + "loss": 0.0005, + "reward": 1.9988062381744385, + "reward_std": 0.0001535951200821728, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988062381744385, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.515544041450777, + "grad_norm": 1.8988375217965705, + "kl": 0.07421875, + "learning_rate": 5.487046632124352e-07, + "loss": -0.0004, + "reward": 2.4999899864196777, + "reward_std": 6.828418918303214e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999900460243225, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.518134715025907, + "grad_norm": 0.4834270230295165, + "kl": 0.076904296875, + "learning_rate": 5.484455958549222e-07, + "loss": -0.0001, + "reward": 2.4999924898147583, + "reward_std": 3.5415099546298734e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 4.520725388601036, + "grad_norm": 5.24318124876931, + "kl": 0.08544921875, + "learning_rate": 5.481865284974093e-07, + "loss": 0.0004, + "reward": 2.499962091445923, + "reward_std": 2.1505103632080136e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999619126319885, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.523316062176166, + "grad_norm": 2.2138667424204512, + "kl": 0.06494140625, + "learning_rate": 5.479274611398963e-07, + "loss": -0.0005, + "reward": 2.4999648332595825, + "reward_std": 1.820845932343218e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999648332595825, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.525906735751295, + "grad_norm": 0.12014376424380266, + "kl": 0.048828125, + "learning_rate": 5.476683937823833e-07, + "loss": 0.0003, + "reward": 2.4999959468841553, + "reward_std": 2.5282397473347373e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.528497409326425, + "grad_norm": 6.810906331642244, + "kl": 0.1552734375, + "learning_rate": 5.474093264248704e-07, + "loss": 0.001, + "reward": 1.3919540643692017, + "reward_std": 0.0007581018317068811, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8919539451599121, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.5310880829015545, + "grad_norm": 0.29126585076495237, + "kl": 0.0574951171875, + "learning_rate": 5.471502590673574e-07, + "loss": 0.001, + "reward": 2.499996781349182, + "reward_std": 2.286420908603759e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.533678756476684, + "grad_norm": 4.537578229563026, + "kl": 0.05755615234375, + "learning_rate": 5.468911917098446e-07, + "loss": 0.0003, + "reward": 1.9018195867538452, + "reward_std": 0.0004079671218732983, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4018195569515228, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.536269430051814, + "grad_norm": 5.1500290494193735, + "kl": 0.0601806640625, + "learning_rate": 5.466321243523317e-07, + "loss": -0.0004, + "reward": 2.499959111213684, + "reward_std": 1.8426877204547054e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999591708183289, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.538860103626943, + "grad_norm": 0.08562253712896178, + "kl": 0.030517578125, + "learning_rate": 5.463730569948187e-07, + "loss": 0.0003, + "reward": 2.499999165534973, + "reward_std": 8.481734141696506e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999992847442627, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.541450777202073, + "grad_norm": 0.02890220585255892, + "kl": 0.03179931640625, + "learning_rate": 5.461139896373057e-07, + "loss": 0.0004, + "reward": 2.499998927116394, + "reward_std": 8.564228721752443e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 4.544041450777202, + "grad_norm": 167.28998596440692, + "kl": 0.123291015625, + "learning_rate": 5.458549222797927e-07, + "loss": 0.0011, + "reward": 2.418424606323242, + "reward_std": 0.23071279702719494, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9184245467185974, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.546632124352332, + "grad_norm": 5.125905915002972, + "kl": 0.066162109375, + "learning_rate": 5.455958549222798e-07, + "loss": -0.0005, + "reward": 1.998132586479187, + "reward_std": 8.477292431052774e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498132586479187, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.549222797927461, + "grad_norm": 10.56630539633022, + "kl": 0.0919189453125, + "learning_rate": 5.453367875647669e-07, + "loss": 0.0007, + "reward": 1.434851348400116, + "reward_std": 0.002121197898304672, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9348513185977936, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.551813471502591, + "grad_norm": 3.658157885612403, + "kl": 0.081298828125, + "learning_rate": 5.450777202072539e-07, + "loss": 0.0005, + "reward": 1.999853253364563, + "reward_std": 3.052235570066841e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998532831668854, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.55440414507772, + "grad_norm": 7.859945952096368, + "kl": 0.06103515625, + "learning_rate": 5.44818652849741e-07, + "loss": -0.0004, + "reward": 2.49995219707489, + "reward_std": 2.3798113033990376e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999523162841797, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.55699481865285, + "grad_norm": 0.11183179421683982, + "kl": 0.0673828125, + "learning_rate": 5.445595854922279e-07, + "loss": -0.0009, + "reward": 2.499996781349182, + "reward_std": 1.2882640021416591e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.5595854922279795, + "grad_norm": 20.541195540764917, + "kl": 0.0535888671875, + "learning_rate": 5.44300518134715e-07, + "loss": 0.0004, + "reward": 2.312475085258484, + "reward_std": 0.25880386325729887, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124750852584839, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.562176165803109, + "grad_norm": 0.12243874222594237, + "kl": 0.0281982421875, + "learning_rate": 5.440414507772021e-07, + "loss": 0.0004, + "reward": 2.4999738931655884, + "reward_std": 3.870881073453347e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999737739562988, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.564766839378239, + "grad_norm": 34.42849905423427, + "kl": 0.11865234375, + "learning_rate": 5.437823834196891e-07, + "loss": 0.0005, + "reward": 1.7491916418075562, + "reward_std": 0.26747630536556244, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2491916418075562, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.567357512953368, + "grad_norm": 8.108120910606115, + "kl": 0.2490234375, + "learning_rate": 5.435233160621762e-07, + "loss": 0.0013, + "reward": 2.4999791383743286, + "reward_std": 1.2070741831848864e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999789595603943, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.569948186528498, + "grad_norm": 0.14111984329156935, + "kl": 0.138671875, + "learning_rate": 5.432642487046632e-07, + "loss": 0.0011, + "reward": 2.4999953508377075, + "reward_std": 2.533350709654769e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.572538860103627, + "grad_norm": 7.5066810759913905, + "kl": 0.0966796875, + "learning_rate": 5.430051813471502e-07, + "loss": -0.0005, + "reward": 2.4999719858169556, + "reward_std": 1.4020912033174682e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999722838401794, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.575129533678757, + "grad_norm": 4.832016422327479, + "kl": 0.100830078125, + "learning_rate": 5.427461139896373e-07, + "loss": 0.0002, + "reward": 1.9979740381240845, + "reward_std": 9.22031576919835e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4979739785194397, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.577720207253886, + "grad_norm": 0.13894154070130946, + "kl": 0.0484619140625, + "learning_rate": 5.424870466321243e-07, + "loss": 0.0005, + "reward": 2.4999951124191284, + "reward_std": 2.3747810473651043e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.580310880829016, + "grad_norm": 0.1289924359195169, + "kl": 0.0517425537109375, + "learning_rate": 5.422279792746114e-07, + "loss": 0.0018, + "reward": 2.4999966621398926, + "reward_std": 1.781306991688325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.582901554404145, + "grad_norm": 0.19024624761305317, + "kl": 0.103271484375, + "learning_rate": 5.419689119170984e-07, + "loss": -0.0006, + "reward": 2.499997615814209, + "reward_std": 1.2299852301111969e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 4.585492227979275, + "grad_norm": 18.33407250057705, + "kl": 0.092041015625, + "learning_rate": 5.417098445595855e-07, + "loss": 0.0004, + "reward": 1.2603832483291626, + "reward_std": 0.07101157457509544, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7603832483291626, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.5880829015544045, + "grad_norm": 0.5425474179411571, + "kl": 0.07952880859375, + "learning_rate": 5.414507772020725e-07, + "loss": 0.0008, + "reward": 2.4999901056289673, + "reward_std": 3.722099677361257e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999899864196777, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.590673575129534, + "grad_norm": 36.48962831061876, + "kl": 0.121826171875, + "learning_rate": 5.411917098445595e-07, + "loss": 0.0004, + "reward": 1.9373120069503784, + "reward_std": 0.17688471153087448, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4373118579387665, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.5932642487046635, + "grad_norm": 11.063815349056819, + "kl": 0.100830078125, + "learning_rate": 5.409326424870466e-07, + "loss": -0.0003, + "reward": 1.9933598041534424, + "reward_std": 0.00027473537465994013, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.493359923362732, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.595854922279793, + "grad_norm": 7.094578066447553, + "kl": 0.08154296875, + "learning_rate": 5.406735751295336e-07, + "loss": 0.001, + "reward": 2.499984860420227, + "reward_std": 7.220182624223526e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.598445595854923, + "grad_norm": 3.9654955076468124, + "kl": 0.0843505859375, + "learning_rate": 5.404145077720207e-07, + "loss": 0.0003, + "reward": 2.4998425245285034, + "reward_std": 4.4948640720576805e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999842643737793, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.601036269430052, + "grad_norm": 1.9395867045463069, + "kl": 0.032196044921875, + "learning_rate": 5.401554404145078e-07, + "loss": -0.0, + "reward": 2.4999918937683105, + "reward_std": 4.97321639159054e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.603626943005182, + "grad_norm": 1.2584354315446733, + "kl": 0.1568603515625, + "learning_rate": 5.398963730569947e-07, + "loss": 0.0011, + "reward": 2.499984622001648, + "reward_std": 8.536511131751467e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999845623970032, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.606217616580311, + "grad_norm": 1.8763950392703619, + "kl": 0.147705078125, + "learning_rate": 5.396373056994818e-07, + "loss": 0.0016, + "reward": 1.9881176352500916, + "reward_std": 8.624646557109372e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.488117516040802, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.608808290155441, + "grad_norm": 0.36894895891787144, + "kl": 0.076904296875, + "learning_rate": 5.393782383419689e-07, + "loss": 0.0013, + "reward": 2.4999769926071167, + "reward_std": 2.9433512054310995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999770522117615, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.61139896373057, + "grad_norm": 0.20530584982003117, + "kl": 0.070068359375, + "learning_rate": 5.391191709844559e-07, + "loss": 0.0009, + "reward": 2.499998092651367, + "reward_std": 2.516909376026888e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.6139896373057, + "grad_norm": 0.06945023570074033, + "kl": 0.134765625, + "learning_rate": 5.38860103626943e-07, + "loss": 0.0008, + "reward": 2.4999932050704956, + "reward_std": 1.9671365976137167e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 4.616580310880829, + "grad_norm": 11.372654795580685, + "kl": 0.07958984375, + "learning_rate": 5.3860103626943e-07, + "loss": 0.0004, + "reward": 1.9558073282241821, + "reward_std": 0.015141666277486365, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.455807387828827, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.619170984455959, + "grad_norm": 0.10606060028254785, + "kl": 0.0546875, + "learning_rate": 5.38341968911917e-07, + "loss": -0.0004, + "reward": 2.4999961853027344, + "reward_std": 2.217536007265153e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.6217616580310885, + "grad_norm": 14.701085393080664, + "kl": 0.09814453125, + "learning_rate": 5.380829015544041e-07, + "loss": 0.0004, + "reward": 1.9998481273651123, + "reward_std": 7.417497454298427e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998482167720795, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.624352331606218, + "grad_norm": 22.170606401595872, + "kl": 0.07421875, + "learning_rate": 5.378238341968911e-07, + "loss": 0.0007, + "reward": 2.4999823570251465, + "reward_std": 1.3415211014944362e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999982237815857, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 4.626943005181348, + "grad_norm": 24.268573075221003, + "kl": 0.122314453125, + "learning_rate": 5.375647668393782e-07, + "loss": 0.0003, + "reward": 1.9668034315109253, + "reward_std": 0.0006330931146294461, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4668034315109253, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.629533678756477, + "grad_norm": 0.8726368002420039, + "kl": 0.21533203125, + "learning_rate": 5.373056994818652e-07, + "loss": 0.0017, + "reward": 1.9987282156944275, + "reward_std": 1.0433882380311843e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987280368804932, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.632124352331607, + "grad_norm": 0.7629187252374979, + "kl": 0.0712890625, + "learning_rate": 5.370466321243523e-07, + "loss": 0.0004, + "reward": 2.4999932050704956, + "reward_std": 5.462136982714583e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.634715025906736, + "grad_norm": 68.52194084942666, + "kl": 0.07470703125, + "learning_rate": 5.367875647668393e-07, + "loss": 0.0002, + "reward": 2.437477707862854, + "reward_std": 0.17678443504109964, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374777674674988, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.637305699481866, + "grad_norm": 1.7461200527244418, + "kl": 0.15087890625, + "learning_rate": 5.365284974093263e-07, + "loss": -0.0006, + "reward": 2.49998140335083, + "reward_std": 1.2022102737319074e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999816417694092, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.639896373056995, + "grad_norm": 4.36089702912699, + "kl": 0.1373291015625, + "learning_rate": 5.362694300518134e-07, + "loss": 0.0008, + "reward": 1.7711026072502136, + "reward_std": 0.00040005836910950165, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2711024582386017, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.642487046632124, + "grad_norm": 26.98053141937611, + "kl": 0.07763671875, + "learning_rate": 5.360103626943004e-07, + "loss": 0.0004, + "reward": 1.9360750913619995, + "reward_std": 0.17681539360455645, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4360751509666443, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.6450777202072535, + "grad_norm": 20.422691609126474, + "kl": 0.0587158203125, + "learning_rate": 5.357512953367876e-07, + "loss": 0.0001, + "reward": 2.4999927282333374, + "reward_std": 8.652078577142674e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.647668393782383, + "grad_norm": 7.415234207725857, + "kl": 0.099365234375, + "learning_rate": 5.354922279792747e-07, + "loss": 0.0, + "reward": 1.9930787086486816, + "reward_std": 0.00013342655711312545, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.493078738451004, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.650259067357513, + "grad_norm": 8.303612443562999, + "kl": 0.127685546875, + "learning_rate": 5.352331606217616e-07, + "loss": 0.0013, + "reward": 1.8097798824310303, + "reward_std": 0.0004643035781555227, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.309779793024063, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.652849740932642, + "grad_norm": 1.3467392605388342, + "kl": 0.0413818359375, + "learning_rate": 5.349740932642487e-07, + "loss": -0.0006, + "reward": 2.499995231628418, + "reward_std": 6.925025836324039e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.9375, + "epoch": 4.655440414507772, + "grad_norm": 24.029626617899478, + "kl": 0.1014404296875, + "learning_rate": 5.347150259067357e-07, + "loss": 0.0005, + "reward": 1.999317705631256, + "reward_std": 2.9128019832569407e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993175864219666, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.658031088082901, + "grad_norm": 0.20055234101517333, + "kl": 0.14990234375, + "learning_rate": 5.344559585492228e-07, + "loss": -0.0009, + "reward": 2.499994158744812, + "reward_std": 1.4809868531528991e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.660621761658031, + "grad_norm": 1.0674348163457286, + "kl": 0.0577392578125, + "learning_rate": 5.341968911917099e-07, + "loss": -0.0005, + "reward": 2.499982237815857, + "reward_std": 8.514090040989686e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.66321243523316, + "grad_norm": 1.986250448784548, + "kl": 0.150634765625, + "learning_rate": 5.339378238341969e-07, + "loss": 0.0011, + "reward": 1.9992393255233765, + "reward_std": 1.7084558294300223e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992395043373108, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.66580310880829, + "grad_norm": 33.69789533218986, + "kl": 0.12890625, + "learning_rate": 5.336787564766839e-07, + "loss": 0.0008, + "reward": 2.0622769594192505, + "reward_std": 0.17685361541799693, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5622769594192505, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.668393782383419, + "grad_norm": 0.36644256535473047, + "kl": 0.074951171875, + "learning_rate": 5.33419689119171e-07, + "loss": 0.0002, + "reward": 2.4999964237213135, + "reward_std": 4.972720716978074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.670984455958549, + "grad_norm": 3.8110981551169343, + "kl": 0.082275390625, + "learning_rate": 5.33160621761658e-07, + "loss": 0.0007, + "reward": 2.499957799911499, + "reward_std": 1.5159917438722914e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999577403068542, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.6735751295336785, + "grad_norm": 1.712219155872681, + "kl": 0.0694580078125, + "learning_rate": 5.329015544041451e-07, + "loss": -0.0008, + "reward": 2.4999804496765137, + "reward_std": 8.377825679417583e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999980628490448, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.676165803108808, + "grad_norm": 1.066708334341106, + "kl": 0.059814453125, + "learning_rate": 5.326424870466321e-07, + "loss": -0.0004, + "reward": 2.49998140335083, + "reward_std": 7.267447017511586e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999815225601196, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.678756476683938, + "grad_norm": 3.5160800299355524, + "kl": 0.053955078125, + "learning_rate": 5.323834196891192e-07, + "loss": -0.0, + "reward": 2.4999616146087646, + "reward_std": 2.382047114224406e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999616742134094, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.681347150259067, + "grad_norm": 1.1050010737781242, + "kl": 0.09619140625, + "learning_rate": 5.321243523316063e-07, + "loss": 0.0003, + "reward": 2.4999842643737793, + "reward_std": 5.307365086082427e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999842643737793, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.683937823834197, + "grad_norm": 1.5522023553498705, + "kl": 0.099853515625, + "learning_rate": 5.318652849740932e-07, + "loss": 0.0003, + "reward": 2.499974846839905, + "reward_std": 9.484943234383536e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999748468399048, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.686528497409326, + "grad_norm": 29.36205931954457, + "kl": 0.13232421875, + "learning_rate": 5.316062176165803e-07, + "loss": -0.0001, + "reward": 2.2498242259025574, + "reward_std": 0.26744518303001996, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749824583530426, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.689119170984456, + "grad_norm": 0.691534199317008, + "kl": 0.13037109375, + "learning_rate": 5.313471502590673e-07, + "loss": -0.0005, + "reward": 2.499990701675415, + "reward_std": 3.964879851992009e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999907612800598, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.691709844559585, + "grad_norm": 5.8385624087917645, + "kl": 0.0548095703125, + "learning_rate": 5.310880829015544e-07, + "loss": -0.0, + "reward": 1.9983850717544556, + "reward_std": 3.324689132000458e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983851313591003, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.694300518134715, + "grad_norm": 0.9466333117457287, + "kl": 0.100830078125, + "learning_rate": 5.308290155440415e-07, + "loss": 0.0007, + "reward": 2.4999927282333374, + "reward_std": 4.231103531537883e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.696891191709844, + "grad_norm": 0.6149548943526877, + "kl": 0.0616455078125, + "learning_rate": 5.305699481865284e-07, + "loss": -0.0006, + "reward": 2.4999901056289673, + "reward_std": 4.871121859650884e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902844429016, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.699481865284974, + "grad_norm": 0.12083629559390983, + "kl": 0.0457763671875, + "learning_rate": 5.303108808290155e-07, + "loss": 0.0008, + "reward": 2.4999938011169434, + "reward_std": 2.4925450361479307e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.7020725388601035, + "grad_norm": 0.5859365264217756, + "kl": 0.094970703125, + "learning_rate": 5.300518134715025e-07, + "loss": 0.0001, + "reward": 2.499985456466675, + "reward_std": 8.133426035783486e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.704663212435233, + "grad_norm": 0.253599097742006, + "kl": 0.0882568359375, + "learning_rate": 5.297927461139896e-07, + "loss": 0.0002, + "reward": 2.4999914169311523, + "reward_std": 2.3123197934182826e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.7072538860103625, + "grad_norm": 3.401992148236074, + "kl": 0.3310546875, + "learning_rate": 5.295336787564767e-07, + "loss": 0.0015, + "reward": 2.4999812841415405, + "reward_std": 1.4841831671219552e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999813437461853, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.709844559585492, + "grad_norm": 3.9139466962302074, + "kl": 0.0572509765625, + "learning_rate": 5.292746113989637e-07, + "loss": 0.0003, + "reward": 2.499984383583069, + "reward_std": 1.4513848668684659e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999845623970032, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.712435233160622, + "grad_norm": 0.24020930568660093, + "kl": 0.0712890625, + "learning_rate": 5.290155440414508e-07, + "loss": 0.0007, + "reward": 2.499996304512024, + "reward_std": 2.3560410511436203e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.715025906735751, + "grad_norm": 0.17373819824374603, + "kl": 0.040771484375, + "learning_rate": 5.287564766839377e-07, + "loss": 0.001, + "reward": 2.49999737739563, + "reward_std": 1.7050833207576943e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.717616580310881, + "grad_norm": 58.57864462348363, + "kl": 0.0560302734375, + "learning_rate": 5.284974093264248e-07, + "loss": 0.0014, + "reward": 2.4374406337738037, + "reward_std": 0.1769376028134957, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374406933784485, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.72020725388601, + "grad_norm": 51.79221132426645, + "kl": 0.0792236328125, + "learning_rate": 5.282383419689119e-07, + "loss": -0.0005, + "reward": 2.0620144605636597, + "reward_std": 0.17697328804911194, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562014639377594, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.72279792746114, + "grad_norm": 0.1836937612426288, + "kl": 0.085205078125, + "learning_rate": 5.279792746113989e-07, + "loss": 0.0008, + "reward": 2.49999737739563, + "reward_std": 2.5852199883047433e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.725388601036269, + "grad_norm": 0.03149477892204813, + "kl": 0.111328125, + "learning_rate": 5.27720207253886e-07, + "loss": 0.0, + "reward": 2.499997138977051, + "reward_std": 1.2922189398523187e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.727979274611399, + "grad_norm": 0.4050448808059232, + "kl": 0.159912109375, + "learning_rate": 5.274611398963731e-07, + "loss": 0.0006, + "reward": 2.499990701675415, + "reward_std": 3.7766282048323774e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.730569948186528, + "grad_norm": 3.2141309837044956, + "kl": 0.1337890625, + "learning_rate": 5.2720207253886e-07, + "loss": 0.0003, + "reward": 1.9567396640777588, + "reward_std": 0.00012566078618192478, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4567397236824036, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.733160621761658, + "grad_norm": 5.931264905795551, + "kl": 0.12451171875, + "learning_rate": 5.269430051813471e-07, + "loss": 0.0005, + "reward": 2.4999704360961914, + "reward_std": 1.8420762216919684e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999704360961914, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.7357512953367875, + "grad_norm": 0.367435595426355, + "kl": 0.0325927734375, + "learning_rate": 5.266839378238341e-07, + "loss": 0.0009, + "reward": 2.4999847412109375, + "reward_std": 3.6717295870403177e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.738341968911917, + "grad_norm": 2.905226315117502, + "kl": 0.04443359375, + "learning_rate": 5.264248704663212e-07, + "loss": 0.0013, + "reward": 2.4999871253967285, + "reward_std": 9.591080356585735e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999870657920837, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.740932642487047, + "grad_norm": 1.8725370061030004, + "kl": 0.096435546875, + "learning_rate": 5.261658031088083e-07, + "loss": 0.0018, + "reward": 2.4999914169311523, + "reward_std": 7.273683195307967e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.743523316062176, + "grad_norm": 4.74556642234223, + "kl": 0.0640869140625, + "learning_rate": 5.259067357512953e-07, + "loss": -0.0002, + "reward": 1.9997684359550476, + "reward_std": 3.6370711768540787e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997685849666595, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.746113989637306, + "grad_norm": 2.624071202412009, + "kl": 0.08935546875, + "learning_rate": 5.256476683937823e-07, + "loss": 0.0005, + "reward": 1.9999428987503052, + "reward_std": 1.1232971928620827e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999428689479828, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.748704663212435, + "grad_norm": 0.10067785357560483, + "kl": 0.11669921875, + "learning_rate": 5.253886010362693e-07, + "loss": -0.0006, + "reward": 2.499997854232788, + "reward_std": 1.7851560869530658e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.751295336787565, + "grad_norm": 0.5685156572450072, + "kl": 0.0787353515625, + "learning_rate": 5.251295336787564e-07, + "loss": 0.0, + "reward": 2.499995708465576, + "reward_std": 3.383515092991729e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.753886010362694, + "grad_norm": 22.055601802081455, + "kl": 0.06103515625, + "learning_rate": 5.248704663212436e-07, + "loss": 0.0006, + "reward": 2.498571515083313, + "reward_std": 0.00041903622934569285, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9985713958740234, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.756476683937824, + "grad_norm": 0.11304170160299004, + "kl": 0.05389404296875, + "learning_rate": 5.246113989637306e-07, + "loss": 0.0007, + "reward": 2.499995708465576, + "reward_std": 2.2937875883144443e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.759067357512953, + "grad_norm": 17.162041380903503, + "kl": 0.0535888671875, + "learning_rate": 5.243523316062177e-07, + "loss": 0.0001, + "reward": 1.999945044517517, + "reward_std": 1.7562929429004726e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999449849128723, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.761658031088083, + "grad_norm": 0.6496636428290078, + "kl": 0.0985107421875, + "learning_rate": 5.240932642487046e-07, + "loss": 0.0013, + "reward": 2.49997079372406, + "reward_std": 6.4162059061345644e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999706745147705, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.7642487046632125, + "grad_norm": 1.2643806912023512, + "kl": 0.0301513671875, + "learning_rate": 5.238341968911917e-07, + "loss": -0.0003, + "reward": 2.49998676776886, + "reward_std": 6.764195290998032e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986708164215, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.766839378238342, + "grad_norm": 3.778437168131262, + "kl": 0.1171875, + "learning_rate": 5.235751295336788e-07, + "loss": 0.0008, + "reward": 2.499949097633362, + "reward_std": 1.8067170913127484e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999489784240723, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.769430051813472, + "grad_norm": 0.09986816331982766, + "kl": 0.081787109375, + "learning_rate": 5.233160621761658e-07, + "loss": 0.0, + "reward": 2.4999982118606567, + "reward_std": 1.2238861017976888e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.772020725388601, + "grad_norm": 0.34276201039635323, + "kl": 0.1181640625, + "learning_rate": 5.230569948186529e-07, + "loss": 0.0002, + "reward": 2.4999916553497314, + "reward_std": 6.410554789226808e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.774611398963731, + "grad_norm": 0.5513242145975972, + "kl": 0.157958984375, + "learning_rate": 5.227979274611399e-07, + "loss": -0.0005, + "reward": 2.4999717473983765, + "reward_std": 8.922115739551373e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999717473983765, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.77720207253886, + "grad_norm": 0.03448824198857638, + "kl": 0.12939453125, + "learning_rate": 5.225388601036269e-07, + "loss": 0.0008, + "reward": 2.4999990463256836, + "reward_std": 9.123150874756902e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.77979274611399, + "grad_norm": 2.769963639384208, + "kl": 0.174072265625, + "learning_rate": 5.22279792746114e-07, + "loss": -0.0005, + "reward": 2.499972701072693, + "reward_std": 1.4247115586840664e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999725818634033, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.782383419689119, + "grad_norm": 0.05199545443846805, + "kl": 0.23876953125, + "learning_rate": 5.22020725388601e-07, + "loss": 0.0015, + "reward": 2.4999969005584717, + "reward_std": 1.973777500552387e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.784974093264249, + "grad_norm": 2.4698048411331173, + "kl": 0.1064453125, + "learning_rate": 5.217616580310881e-07, + "loss": 0.0007, + "reward": 2.4999899864196777, + "reward_std": 3.111517912657291e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999897480010986, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.787564766839378, + "grad_norm": 0.34886363317653557, + "kl": 0.09228515625, + "learning_rate": 5.215025906735752e-07, + "loss": 0.0008, + "reward": 2.499997138977051, + "reward_std": 2.007587653451992e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.790155440414508, + "grad_norm": 0.39412284484886145, + "kl": 0.122314453125, + "learning_rate": 5.212435233160622e-07, + "loss": 0.0002, + "reward": 2.499995708465576, + "reward_std": 3.830643777291698e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.7927461139896375, + "grad_norm": 7.650442625564227, + "kl": 0.091796875, + "learning_rate": 5.209844559585492e-07, + "loss": 0.0004, + "reward": 2.4998810291290283, + "reward_std": 4.5035868879494956e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999880850315094, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.795336787564767, + "grad_norm": 1.8050325659468176, + "kl": 0.1019287109375, + "learning_rate": 5.207253886010362e-07, + "loss": 0.0009, + "reward": 1.766606092453003, + "reward_std": 8.435635584191914e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2666060030460358, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.7979274611398965, + "grad_norm": 0.4503805417563229, + "kl": 0.05999755859375, + "learning_rate": 5.204663212435233e-07, + "loss": 0.0005, + "reward": 2.4999972581863403, + "reward_std": 1.6079814599834208e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 4.800518134715026, + "grad_norm": 56.72936279770925, + "kl": 0.083984375, + "learning_rate": 5.202072538860104e-07, + "loss": -0.0002, + "reward": 2.2687954902648926, + "reward_std": 0.3193020560623836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.768795669078827, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.803108808290156, + "grad_norm": 47.102965658594776, + "kl": 0.1884765625, + "learning_rate": 5.199481865284974e-07, + "loss": 0.0008, + "reward": 1.3129711747169495, + "reward_std": 0.0007770535521558486, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8129712343215942, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.805699481865285, + "grad_norm": 0.6877169201935954, + "kl": 0.100341796875, + "learning_rate": 5.196891191709845e-07, + "loss": -0.0009, + "reward": 1.9996490478515625, + "reward_std": 1.1218860549888632e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996492266654968, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 4.808290155440415, + "grad_norm": 4.678440059659677, + "kl": 0.2841796875, + "learning_rate": 5.194300518134714e-07, + "loss": 0.0012, + "reward": 1.9203997254371643, + "reward_std": 0.00028477661862780224, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.420399785041809, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.810880829015544, + "grad_norm": 0.22482692099684018, + "kl": 0.1630859375, + "learning_rate": 5.191709844559585e-07, + "loss": 0.0004, + "reward": 2.4999958276748657, + "reward_std": 2.6246650008943107e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.813471502590674, + "grad_norm": 1.0978514727987898, + "kl": 0.0931396484375, + "learning_rate": 5.189119170984456e-07, + "loss": -0.0005, + "reward": 2.4999632835388184, + "reward_std": 9.000008958537364e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999963402748108, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.816062176165803, + "grad_norm": 11.797871666877167, + "kl": 0.12451171875, + "learning_rate": 5.186528497409326e-07, + "loss": 0.0002, + "reward": 2.499993920326233, + "reward_std": 5.024935035180533e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.818652849740933, + "grad_norm": 0.230666330174166, + "kl": 0.091064453125, + "learning_rate": 5.183937823834197e-07, + "loss": 0.0005, + "reward": 2.499972701072693, + "reward_std": 4.069174337928416e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999725818634033, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.821243523316062, + "grad_norm": 0.38430755100609587, + "kl": 0.0584716796875, + "learning_rate": 5.181347150259067e-07, + "loss": 0.0001, + "reward": 2.4998698234558105, + "reward_std": 5.049533456258359e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998698830604553, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.823834196891192, + "grad_norm": 0.5541326335897286, + "kl": 0.0986328125, + "learning_rate": 5.178756476683937e-07, + "loss": 0.0007, + "reward": 2.499991774559021, + "reward_std": 4.678573873206915e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.8264248704663215, + "grad_norm": 6.9086749576832505, + "kl": 0.3203125, + "learning_rate": 5.176165803108808e-07, + "loss": 0.0018, + "reward": 2.499990701675415, + "reward_std": 4.889002411800902e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.829015544041451, + "grad_norm": 7.184699324738151, + "kl": 0.17431640625, + "learning_rate": 5.173575129533678e-07, + "loss": 0.0013, + "reward": 2.4999845027923584, + "reward_std": 1.556348786380113e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999846816062927, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.831606217616581, + "grad_norm": 0.9799698518692863, + "kl": 0.091552734375, + "learning_rate": 5.170984455958549e-07, + "loss": 0.0001, + "reward": 2.499952554702759, + "reward_std": 8.286165439130855e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999526739120483, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.83419689119171, + "grad_norm": 2.3505405683011777, + "kl": 0.116455078125, + "learning_rate": 5.168393782383419e-07, + "loss": -0.0007, + "reward": 2.4999868869781494, + "reward_std": 6.806133001191483e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.83678756476684, + "grad_norm": 0.5260308124218211, + "kl": 0.072509765625, + "learning_rate": 5.16580310880829e-07, + "loss": -0.0001, + "reward": 2.4999754428863525, + "reward_std": 4.411289864947321e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999754428863525, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.839378238341969, + "grad_norm": 0.1044197079833509, + "kl": 0.1180419921875, + "learning_rate": 5.16321243523316e-07, + "loss": -0.0007, + "reward": 2.4999961853027344, + "reward_std": 1.3546987815971079e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.841968911917099, + "grad_norm": 3.111927738784777, + "kl": 0.2012939453125, + "learning_rate": 5.16062176165803e-07, + "loss": -0.0003, + "reward": 2.4999771118164062, + "reward_std": 1.9735252863029018e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999772310256958, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.844559585492228, + "grad_norm": 20.987801431165522, + "kl": 0.14013671875, + "learning_rate": 5.158031088082901e-07, + "loss": 0.0008, + "reward": 1.4888710379600525, + "reward_std": 0.00011967114187427796, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9888710379600525, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.847150259067358, + "grad_norm": 0.06463117590614705, + "kl": 0.0570068359375, + "learning_rate": 5.155440414507772e-07, + "loss": -0.0007, + "reward": 2.4999985694885254, + "reward_std": 9.881680398393655e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.849740932642487, + "grad_norm": 18.163589971569913, + "kl": 0.142578125, + "learning_rate": 5.152849740932642e-07, + "loss": 0.0007, + "reward": 1.9954760074615479, + "reward_std": 9.445267687624437e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.495475947856903, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.852331606217617, + "grad_norm": 28.115312912705644, + "kl": 0.25634765625, + "learning_rate": 5.150259067357513e-07, + "loss": 0.001, + "reward": 1.8121753334999084, + "reward_std": 0.004402739882380047, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3121753931045532, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 4.8549222797927465, + "grad_norm": 0.3101319924114648, + "kl": 0.0640869140625, + "learning_rate": 5.147668393782382e-07, + "loss": -0.0, + "reward": 2.4999916553497314, + "reward_std": 3.4617439723660937e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991536140442, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.857512953367876, + "grad_norm": 1.4734351130331582, + "kl": 0.129638671875, + "learning_rate": 5.145077720207253e-07, + "loss": 0.0007, + "reward": 2.4999780654907227, + "reward_std": 6.190106091708003e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781847000122, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.860103626943005, + "grad_norm": 0.14410996023761452, + "kl": 0.123046875, + "learning_rate": 5.142487046632125e-07, + "loss": 0.0006, + "reward": 1.4999991655349731, + "reward_std": 6.103502414589457e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999991059303284, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.862694300518134, + "grad_norm": 1.7626653707073157, + "kl": 0.062744140625, + "learning_rate": 5.139896373056995e-07, + "loss": 0.0004, + "reward": 2.499977946281433, + "reward_std": 1.350019238088862e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999778270721436, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.865284974093264, + "grad_norm": 2.371317823653764, + "kl": 0.0692138671875, + "learning_rate": 5.137305699481866e-07, + "loss": -0.0008, + "reward": 2.4999873638153076, + "reward_std": 1.2011415492452215e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987542629242, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 4.867875647668393, + "grad_norm": 11.728809755339237, + "kl": 2.760009765625, + "learning_rate": 5.134715025906736e-07, + "loss": 0.0103, + "reward": 2.4999825954437256, + "reward_std": 1.4914779967512004e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999982476234436, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.870466321243523, + "grad_norm": 12.600405365570017, + "kl": 0.3310546875, + "learning_rate": 5.132124352331606e-07, + "loss": 0.0008, + "reward": 2.499926447868347, + "reward_std": 2.373492202423222e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999265670776367, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.873056994818652, + "grad_norm": 0.12480825861074117, + "kl": 0.0606689453125, + "learning_rate": 5.129533678756477e-07, + "loss": -0.0004, + "reward": 2.49999737739563, + "reward_std": 1.476501580555123e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.875647668393782, + "grad_norm": 1.2335308195361294, + "kl": 0.0772705078125, + "learning_rate": 5.126943005181347e-07, + "loss": 0.0009, + "reward": 2.4999918937683105, + "reward_std": 5.133752893016208e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.8782383419689115, + "grad_norm": 4.470837879112766, + "kl": 0.12109375, + "learning_rate": 5.124352331606218e-07, + "loss": -0.0001, + "reward": 1.952039897441864, + "reward_std": 0.0001541801144639976, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.452039897441864, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.880829015544041, + "grad_norm": 0.2192460917455969, + "kl": 0.111328125, + "learning_rate": 5.121761658031088e-07, + "loss": 0.0002, + "reward": 2.4999964237213135, + "reward_std": 2.9395907858997816e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.883419689119171, + "grad_norm": 0.2060648916047213, + "kl": 0.132568359375, + "learning_rate": 5.119170984455959e-07, + "loss": 0.0011, + "reward": 2.4999972581863403, + "reward_std": 2.488722628868345e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.8860103626943, + "grad_norm": 13.417260221427565, + "kl": 0.111328125, + "learning_rate": 5.116580310880829e-07, + "loss": 0.0006, + "reward": 2.4327460527420044, + "reward_std": 0.19003782174820572, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9327460527420044, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.88860103626943, + "grad_norm": 0.2758144858920631, + "kl": 0.068115234375, + "learning_rate": 5.113989637305699e-07, + "loss": 0.0006, + "reward": 2.499990940093994, + "reward_std": 3.942555622415966e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.891191709844559, + "grad_norm": 1.0902854750363893, + "kl": 0.16162109375, + "learning_rate": 5.11139896373057e-07, + "loss": -0.0001, + "reward": 2.4999799728393555, + "reward_std": 9.5089649221336e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999802112579346, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.893782383419689, + "grad_norm": 7.081301732949311, + "kl": 0.098876953125, + "learning_rate": 5.10880829015544e-07, + "loss": -0.0002, + "reward": 2.499961733818054, + "reward_std": 1.060829458765511e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999617338180542, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.896373056994818, + "grad_norm": 0.8058802828493359, + "kl": 0.099609375, + "learning_rate": 5.106217616580311e-07, + "loss": 0.0004, + "reward": 2.4999855756759644, + "reward_std": 9.334485412182403e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985694885254, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.375, + "epoch": 4.898963730569948, + "grad_norm": 2.9193035463298367, + "kl": 0.19000244140625, + "learning_rate": 5.103626943005182e-07, + "loss": 0.0003, + "reward": 2.4999828338623047, + "reward_std": 1.1689225630107103e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999983012676239, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.901554404145077, + "grad_norm": 0.4450776872512842, + "kl": 0.090576171875, + "learning_rate": 5.101036269430051e-07, + "loss": -0.0001, + "reward": 2.4999948740005493, + "reward_std": 4.056156342358008e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.904145077720207, + "grad_norm": 0.7283449454955582, + "kl": 0.0753173828125, + "learning_rate": 5.098445595854922e-07, + "loss": 0.0, + "reward": 2.499991536140442, + "reward_std": 5.588245130638825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999915957450867, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.9067357512953365, + "grad_norm": 1.6922256611313642, + "kl": 0.081787109375, + "learning_rate": 5.095854922279792e-07, + "loss": 0.0019, + "reward": 1.9999473094940186, + "reward_std": 8.983988664112985e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999471008777618, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.909326424870466, + "grad_norm": 2.365943348241899, + "kl": 0.1650390625, + "learning_rate": 5.093264248704663e-07, + "loss": -0.0004, + "reward": 1.9998682737350464, + "reward_std": 1.407740614922659e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499868392944336, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.9119170984455955, + "grad_norm": 1.6964616877330223, + "kl": 0.14599609375, + "learning_rate": 5.090673575129534e-07, + "loss": 0.0006, + "reward": 1.9998960494995117, + "reward_std": 1.790315883454241e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998961091041565, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 4.914507772020725, + "grad_norm": 1.876102256484594, + "kl": 0.16259765625, + "learning_rate": 5.088082901554404e-07, + "loss": 0.001, + "reward": 2.499979257583618, + "reward_std": 1.3367809742703685e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791383743286, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.917098445595855, + "grad_norm": 0.8532308454817059, + "kl": 0.0631103515625, + "learning_rate": 5.085492227979274e-07, + "loss": 0.0005, + "reward": 2.4999985694885254, + "reward_std": 1.962784381248639e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.919689119170984, + "grad_norm": 0.41841053315785764, + "kl": 0.0869140625, + "learning_rate": 5.082901554404145e-07, + "loss": 0.0012, + "reward": 2.4999877214431763, + "reward_std": 4.248923801242199e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999874830245972, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.922279792746114, + "grad_norm": 0.1571909754973784, + "kl": 0.0350341796875, + "learning_rate": 5.080310880829015e-07, + "loss": 0.0004, + "reward": 2.4999964237213135, + "reward_std": 1.649446545570754e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.924870466321243, + "grad_norm": 4.891992603760889, + "kl": 0.205078125, + "learning_rate": 5.077720207253886e-07, + "loss": 0.0013, + "reward": 1.9450291395187378, + "reward_std": 0.0001299916957577807, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4450291991233826, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.927461139896373, + "grad_norm": 4.292213593668363, + "kl": 0.1796875, + "learning_rate": 5.075129533678756e-07, + "loss": 0.0011, + "reward": 1.498780369758606, + "reward_std": 5.568056076299399e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9987803399562836, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.930051813471502, + "grad_norm": 74.52292657169967, + "kl": 0.08203125, + "learning_rate": 5.072538860103627e-07, + "loss": 0.0003, + "reward": 1.9999048709869385, + "reward_std": 0.35365331172943115, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999048709869385, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.932642487046632, + "grad_norm": 0.22755161973627527, + "kl": 0.03369140625, + "learning_rate": 5.069948186528497e-07, + "loss": -0.0002, + "reward": 2.499995708465576, + "reward_std": 2.83990709704085e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.935233160621761, + "grad_norm": 0.4152910432227915, + "kl": 0.12109375, + "learning_rate": 5.067357512953367e-07, + "loss": 0.0003, + "reward": 1.4999972581863403, + "reward_std": 1.0552354297033162e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999972581863403, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.937823834196891, + "grad_norm": 23.668966026949754, + "kl": 0.068359375, + "learning_rate": 5.064766839378238e-07, + "loss": 0.0006, + "reward": 1.9828269481658936, + "reward_std": 0.00024920167282971306, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4828268885612488, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.9404145077720205, + "grad_norm": 1.7903009861545844, + "kl": 0.0582275390625, + "learning_rate": 5.062176165803108e-07, + "loss": -0.0002, + "reward": 2.499991297721863, + "reward_std": 7.357782124017831e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.94300518134715, + "grad_norm": 0.6270608662338873, + "kl": 0.101806640625, + "learning_rate": 5.059585492227979e-07, + "loss": -0.0, + "reward": 2.499978184700012, + "reward_std": 6.267599019338377e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999978482723236, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.94559585492228, + "grad_norm": 0.4218790272789594, + "kl": 0.090087890625, + "learning_rate": 5.05699481865285e-07, + "loss": 0.0007, + "reward": 2.499988079071045, + "reward_std": 5.398533403422334e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999880194664001, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.948186528497409, + "grad_norm": 0.6821500540734067, + "kl": 0.14794921875, + "learning_rate": 5.054404145077719e-07, + "loss": 0.0013, + "reward": 2.499984860420227, + "reward_std": 8.115260925478651e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.950777202072539, + "grad_norm": 10.23500927637154, + "kl": 0.125, + "learning_rate": 5.05181347150259e-07, + "loss": 0.0004, + "reward": 1.9769858121871948, + "reward_std": 0.000344835293617507, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4769859313964844, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.953367875647668, + "grad_norm": 4.247667775016941, + "kl": 0.112548828125, + "learning_rate": 5.04922279792746e-07, + "loss": -0.0002, + "reward": 1.9998317956924438, + "reward_std": 1.7797691725718323e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998319149017334, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.955958549222798, + "grad_norm": 9.79988218794273, + "kl": 0.0618896484375, + "learning_rate": 5.046632124352331e-07, + "loss": 0.0, + "reward": 2.49992036819458, + "reward_std": 1.0493651757315092e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999204874038696, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.958549222797927, + "grad_norm": 0.047799849922210126, + "kl": 0.068603515625, + "learning_rate": 5.044041450777202e-07, + "loss": -0.0, + "reward": 2.4999985694885254, + "reward_std": 8.212363411530532e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.961139896373057, + "grad_norm": 33.88340165507215, + "kl": 0.062744140625, + "learning_rate": 5.041450777202072e-07, + "loss": 0.0002, + "reward": 2.499943256378174, + "reward_std": 3.3268408060393995e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999943196773529, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 4.963730569948186, + "grad_norm": 0.4766602441522811, + "kl": 0.11474609375, + "learning_rate": 5.038860103626942e-07, + "loss": 0.0005, + "reward": 2.499996304512024, + "reward_std": 3.1861559932622185e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.966321243523316, + "grad_norm": 1.970085671487226, + "kl": 0.099365234375, + "learning_rate": 5.036269430051812e-07, + "loss": 0.0006, + "reward": 2.4999852180480957, + "reward_std": 7.76936246893456e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850988388062, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.9689119170984455, + "grad_norm": 0.614317125994361, + "kl": 0.097900390625, + "learning_rate": 5.033678756476683e-07, + "loss": 0.0003, + "reward": 2.499976634979248, + "reward_std": 6.318117357295705e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976634979248, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 4.971502590673575, + "grad_norm": 2.5982732967414153, + "kl": 0.1248779296875, + "learning_rate": 5.031088082901555e-07, + "loss": -0.0, + "reward": 1.9950045347213745, + "reward_std": 4.371173338313383e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4950045347213745, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.974093264248705, + "grad_norm": 0.556943115477112, + "kl": 0.06201171875, + "learning_rate": 5.028497409326425e-07, + "loss": -0.0002, + "reward": 2.4999951124191284, + "reward_std": 3.4579869634399074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.976683937823834, + "grad_norm": 14.170939958264592, + "kl": 0.126708984375, + "learning_rate": 5.025906735751296e-07, + "loss": 0.0006, + "reward": 1.730444073677063, + "reward_std": 0.25908429973060265, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.230444073677063, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 4.979274611398964, + "grad_norm": 112.50593354207143, + "kl": 0.09674072265625, + "learning_rate": 5.023316062176167e-07, + "loss": 0.0001, + "reward": 1.999139666557312, + "reward_std": 0.0007437472534093104, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991395473480225, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.981865284974093, + "grad_norm": 0.12978793612667916, + "kl": 0.1240234375, + "learning_rate": 5.020725388601036e-07, + "loss": -0.0001, + "reward": 2.4999988079071045, + "reward_std": 1.4282953202382487e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 4.984455958549223, + "grad_norm": 1.661070434926724, + "kl": 0.0877685546875, + "learning_rate": 5.018134715025907e-07, + "loss": 0.0002, + "reward": 2.499964714050293, + "reward_std": 1.1544149856490549e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999645948410034, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 4.987046632124352, + "grad_norm": 0.24215066159783818, + "kl": 0.0906982421875, + "learning_rate": 5.015544041450777e-07, + "loss": 0.0007, + "reward": 2.499996781349182, + "reward_std": 2.105047371969704e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.989637305699482, + "grad_norm": 0.23202070007577813, + "kl": 0.112060546875, + "learning_rate": 5.012953367875648e-07, + "loss": 0.0012, + "reward": 2.499994993209839, + "reward_std": 3.002770938564936e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.992227979274611, + "grad_norm": 1.8770420819716949, + "kl": 0.072265625, + "learning_rate": 5.010362694300519e-07, + "loss": 0.0002, + "reward": 2.499958872795105, + "reward_std": 1.1185346778574967e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999589920043945, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 4.994818652849741, + "grad_norm": 0.3579381392829675, + "kl": 0.068603515625, + "learning_rate": 5.007772020725388e-07, + "loss": 0.0001, + "reward": 2.499995231628418, + "reward_std": 3.055555509945407e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 4.9974093264248705, + "grad_norm": 2.3002497398650443, + "kl": 0.071044921875, + "learning_rate": 5.005181347150259e-07, + "loss": -0.0002, + "reward": 2.4999839067459106, + "reward_std": 6.179596425681666e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839067459106, + "step": 1929 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.0, + "grad_norm": 0.05155568553213749, + "kl": 0.140869140625, + "learning_rate": 5.002590673575129e-07, + "loss": 0.0003, + "reward": 2.499998927116394, + "reward_std": 1.260425307236801e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.0025906735751295, + "grad_norm": 0.44041786757354445, + "kl": 0.10205078125, + "learning_rate": 5e-07, + "loss": 0.0002, + "reward": 1.999955415725708, + "reward_std": 5.4261488457996165e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999555349349976, + "step": 1931 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.005181347150259, + "grad_norm": 0.1486081708723704, + "kl": 0.0985107421875, + "learning_rate": 4.99740932642487e-07, + "loss": 0.0008, + "reward": 2.4999964237213135, + "reward_std": 2.7909155733141233e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.007772020725389, + "grad_norm": 4.376789480644765, + "kl": 0.0809326171875, + "learning_rate": 4.994818652849741e-07, + "loss": 0.0004, + "reward": 1.9833866357803345, + "reward_std": 0.00012132878259762947, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4833866655826569, + "step": 1933 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 5.010362694300518, + "grad_norm": 37.461291605979255, + "kl": 0.13037109375, + "learning_rate": 4.992227979274612e-07, + "loss": 0.0001, + "reward": 2.042196273803711, + "reward_std": 0.18491899121443112, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.542196273803711, + "step": 1934 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.012953367875648, + "grad_norm": 20.96403180296301, + "kl": 0.228515625, + "learning_rate": 4.989637305699482e-07, + "loss": 0.0006, + "reward": 1.9577412605285645, + "reward_std": 0.3382488763127185, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.4889912605285645, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.015544041450777, + "grad_norm": 1.312300023220779, + "kl": 0.41943359375, + "learning_rate": 4.987046632124352e-07, + "loss": 0.0011, + "reward": 2.49999737739563, + "reward_std": 2.6252888858380174e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.018134715025907, + "grad_norm": 0.6403185857183649, + "kl": 0.0987548828125, + "learning_rate": 4.984455958549223e-07, + "loss": -0.0004, + "reward": 2.4999475479125977, + "reward_std": 1.0124155778612476e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999476671218872, + "step": 1937 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.020725388601036, + "grad_norm": 5.99578723498661, + "kl": 0.208740234375, + "learning_rate": 4.981865284974093e-07, + "loss": 0.0011, + "reward": 1.4945184588432312, + "reward_std": 8.185960177797824e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9945183992385864, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.023316062176166, + "grad_norm": 0.22837904988975566, + "kl": 0.1942138671875, + "learning_rate": 4.979274611398964e-07, + "loss": 0.0006, + "reward": 2.499977946281433, + "reward_std": 3.2434911645395914e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999780058860779, + "step": 1939 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.025906735751295, + "grad_norm": 0.08623790960688131, + "kl": 0.070068359375, + "learning_rate": 4.976683937823834e-07, + "loss": 0.0007, + "reward": 2.4999974966049194, + "reward_std": 1.2107051929888257e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.028497409326425, + "grad_norm": 26.47870798522681, + "kl": 0.052978515625, + "learning_rate": 4.974093264248704e-07, + "loss": 0.0009, + "reward": 2.4374852180480957, + "reward_std": 0.17678938515939535, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374852776527405, + "step": 1941 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.0310880829015545, + "grad_norm": 0.13478059522112307, + "kl": 0.0628662109375, + "learning_rate": 4.971502590673575e-07, + "loss": -0.0003, + "reward": 2.4999985694885254, + "reward_std": 1.69297129559709e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.033678756476684, + "grad_norm": 0.7684616959797207, + "kl": 0.095458984375, + "learning_rate": 4.968911917098446e-07, + "loss": 0.0014, + "reward": 2.4999715089797974, + "reward_std": 8.310182920467923e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999712109565735, + "step": 1943 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.036269430051814, + "grad_norm": 0.10321480340389022, + "kl": 0.1136474609375, + "learning_rate": 4.966321243523316e-07, + "loss": 0.0021, + "reward": 2.4999982118606567, + "reward_std": 2.2039363329895423e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 1944 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.038860103626943, + "grad_norm": 0.6898264128668515, + "kl": 0.067138671875, + "learning_rate": 4.963730569948186e-07, + "loss": -0.0004, + "reward": 2.4999780654907227, + "reward_std": 5.790218324364105e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781847000122, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.041450777202073, + "grad_norm": 1.7317733427190027, + "kl": 0.09033203125, + "learning_rate": 4.961139896373057e-07, + "loss": 0.0007, + "reward": 1.9998986721038818, + "reward_std": 1.638752860344539e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998986721038818, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.044041450777202, + "grad_norm": 0.4015096749394421, + "kl": 0.123779296875, + "learning_rate": 4.958549222797927e-07, + "loss": 0.0005, + "reward": 2.4999942779541016, + "reward_std": 3.050363659440336e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943375587463, + "step": 1947 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.046632124352332, + "grad_norm": 2.7323083176474774, + "kl": 0.09716796875, + "learning_rate": 4.955958549222798e-07, + "loss": 0.0004, + "reward": 2.4999263286590576, + "reward_std": 7.443785307259532e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999263882637024, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 5.049222797927461, + "grad_norm": 0.3004464259468006, + "kl": 0.03460693359375, + "learning_rate": 4.953367875647668e-07, + "loss": -0.001, + "reward": 2.4999972581863403, + "reward_std": 1.21737434710667e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 1949 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.051813471502591, + "grad_norm": 1.3898342975653213, + "kl": 0.095947265625, + "learning_rate": 4.950777202072538e-07, + "loss": 0.0011, + "reward": 2.499994397163391, + "reward_std": 6.26772907708073e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 1950 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.05440414507772, + "grad_norm": 34.974158819577966, + "kl": 0.1640625, + "learning_rate": 4.948186528497409e-07, + "loss": 0.0006, + "reward": 1.7851468324661255, + "reward_std": 0.001801242060992081, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2851468622684479, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.9375, + "epoch": 5.05699481865285, + "grad_norm": 14.850254812264591, + "kl": 0.132080078125, + "learning_rate": 4.94559585492228e-07, + "loss": 0.0004, + "reward": 2.4093295335769653, + "reward_std": 0.25610035105046336, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.90932959318161, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.0595854922279795, + "grad_norm": 0.2725250116596573, + "kl": 0.072021484375, + "learning_rate": 4.94300518134715e-07, + "loss": 0.001, + "reward": 1.9984136819839478, + "reward_std": 1.957666086127574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984136521816254, + "step": 1953 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.062176165803109, + "grad_norm": 4.2366202198119005, + "kl": 1.357666015625, + "learning_rate": 4.94041450777202e-07, + "loss": 0.0055, + "reward": 2.4999951124191284, + "reward_std": 2.5406496320101724e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.064766839378239, + "grad_norm": 0.09164591729355538, + "kl": 0.0726318359375, + "learning_rate": 4.937823834196891e-07, + "loss": -0.0006, + "reward": 2.4999969005584717, + "reward_std": 1.8165039250561676e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 1955 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.067357512953368, + "grad_norm": 6.143896185530837, + "kl": 0.112060546875, + "learning_rate": 4.935233160621761e-07, + "loss": -0.0002, + "reward": 2.4999847412109375, + "reward_std": 1.4170248050504597e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 1956 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.069948186528498, + "grad_norm": 5.1633188568781865, + "kl": 0.10546875, + "learning_rate": 4.932642487046632e-07, + "loss": 0.0004, + "reward": 1.8822984099388123, + "reward_std": 0.0005674214853570447, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.382298469543457, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.072538860103627, + "grad_norm": 2.7016956793571762, + "kl": 0.56591796875, + "learning_rate": 4.930051813471502e-07, + "loss": 0.0024, + "reward": 2.49996817111969, + "reward_std": 8.273359298982541e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99996817111969, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.075129533678757, + "grad_norm": 0.18273164322617722, + "kl": 0.082275390625, + "learning_rate": 4.927461139896372e-07, + "loss": 0.0017, + "reward": 2.4999966621398926, + "reward_std": 1.6594606222497532e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 1959 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.077720207253886, + "grad_norm": 470.9819024895138, + "kl": 0.2314453125, + "learning_rate": 4.924870466321243e-07, + "loss": 0.0011, + "reward": 2.0591955184936523, + "reward_std": 0.2720373572897188, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5591954588890076, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.080310880829016, + "grad_norm": 0.2455210476874772, + "kl": 0.072265625, + "learning_rate": 4.922279792746113e-07, + "loss": 0.0006, + "reward": 2.499997854232788, + "reward_std": 2.1795440829919244e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.082901554404145, + "grad_norm": 0.9417616465071259, + "kl": 0.066650390625, + "learning_rate": 4.919689119170985e-07, + "loss": -0.0007, + "reward": 2.49999737739563, + "reward_std": 2.0860297098579395e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.085492227979275, + "grad_norm": 0.09888019132050514, + "kl": 0.097412109375, + "learning_rate": 4.917098445595855e-07, + "loss": 0.0011, + "reward": 2.49999737739563, + "reward_std": 1.4439532378673903e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1963 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.0880829015544045, + "grad_norm": 10.522833406645212, + "kl": 0.183349609375, + "learning_rate": 4.914507772020726e-07, + "loss": 0.0014, + "reward": 1.979519248008728, + "reward_std": 0.00010333130740036722, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.479519248008728, + "step": 1964 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 5.090673575129534, + "grad_norm": 51.94584164315255, + "kl": 0.0927734375, + "learning_rate": 4.911917098445596e-07, + "loss": 0.0007, + "reward": 1.974902868270874, + "reward_std": 0.004067035675689112, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4749028086662292, + "step": 1965 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.0932642487046635, + "grad_norm": 0.23037371086719313, + "kl": 0.06524658203125, + "learning_rate": 4.909326424870467e-07, + "loss": -0.0, + "reward": 2.499997615814209, + "reward_std": 1.8141939506222116e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.095854922279793, + "grad_norm": 3.817142065490866, + "kl": 0.0908203125, + "learning_rate": 4.906735751295337e-07, + "loss": 0.0001, + "reward": 2.4998281002044678, + "reward_std": 2.6074141601384326e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998281002044678, + "step": 1967 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.098445595854923, + "grad_norm": 0.5455854524411821, + "kl": 0.06341552734375, + "learning_rate": 4.904145077720207e-07, + "loss": -0.0005, + "reward": 1.999927043914795, + "reward_std": 1.087022673118554e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999271631240845, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.101036269430052, + "grad_norm": 9.168445126343135, + "kl": 0.065673828125, + "learning_rate": 4.901554404145078e-07, + "loss": -0.0001, + "reward": 2.4999436140060425, + "reward_std": 1.7112050642253962e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999436736106873, + "step": 1969 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.103626943005182, + "grad_norm": 2.4630563043569795, + "kl": 0.074462890625, + "learning_rate": 4.898963730569948e-07, + "loss": 0.0002, + "reward": 1.9799813032150269, + "reward_std": 0.00014127314898360055, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.479981243610382, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.106217616580311, + "grad_norm": 0.7258132398064928, + "kl": 0.1097412109375, + "learning_rate": 4.896373056994819e-07, + "loss": 0.0005, + "reward": 2.499991297721863, + "reward_std": 6.1494474721257575e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991238117218, + "step": 1971 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.108808290155441, + "grad_norm": 4.262925906197277, + "kl": 0.049560546875, + "learning_rate": 4.893782383419689e-07, + "loss": 0.0002, + "reward": 2.4999629259109497, + "reward_std": 2.0119208784308285e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999630451202393, + "step": 1972 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.11139896373057, + "grad_norm": 1.558017733032952, + "kl": 0.080078125, + "learning_rate": 4.89119170984456e-07, + "loss": -0.0003, + "reward": 2.4999959468841553, + "reward_std": 3.334208713567932e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.1139896373057, + "grad_norm": 0.14393618073282874, + "kl": 0.02935791015625, + "learning_rate": 4.88860103626943e-07, + "loss": -0.0, + "reward": 2.499997138977051, + "reward_std": 2.0730961978188134e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 5.116580310880829, + "grad_norm": 0.584835562493568, + "kl": 0.040771484375, + "learning_rate": 4.886010362694301e-07, + "loss": -0.0007, + "reward": 2.4999927282333374, + "reward_std": 4.800249371328391e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 1975 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.119170984455959, + "grad_norm": 0.8626024819700417, + "kl": 0.31884765625, + "learning_rate": 4.883419689119171e-07, + "loss": 0.0021, + "reward": 2.4999958276748657, + "reward_std": 5.013073405280011e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.1217616580310885, + "grad_norm": 9.332108422683755, + "kl": 0.0966796875, + "learning_rate": 4.880829015544041e-07, + "loss": 0.0011, + "reward": 1.995583713054657, + "reward_std": 8.042127274165978e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.495583564043045, + "step": 1977 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.124352331606218, + "grad_norm": 0.49280543122272225, + "kl": 0.1004638671875, + "learning_rate": 4.878238341968912e-07, + "loss": 0.001, + "reward": 2.499996542930603, + "reward_std": 3.03728120343294e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 1978 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.126943005181348, + "grad_norm": 1.9234761056989427, + "kl": 0.0667724609375, + "learning_rate": 4.875647668393782e-07, + "loss": 0.0002, + "reward": 2.499993324279785, + "reward_std": 7.244569019349001e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.129533678756476, + "grad_norm": 0.09006516242284873, + "kl": 0.071533203125, + "learning_rate": 4.873056994818653e-07, + "loss": -0.0002, + "reward": 2.4999955892562866, + "reward_std": 2.1774088452275464e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.132124352331606, + "grad_norm": 9.037745453688038, + "kl": 0.191162109375, + "learning_rate": 4.870466321243523e-07, + "loss": 0.001, + "reward": 1.8227461576461792, + "reward_std": 0.0014014614974939832, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3227460980415344, + "step": 1981 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.134715025906735, + "grad_norm": 0.9375389288558058, + "kl": 0.1241455078125, + "learning_rate": 4.867875647668394e-07, + "loss": -0.0006, + "reward": 2.4999932050704956, + "reward_std": 6.188667498463474e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.137305699481865, + "grad_norm": 0.26498455994168174, + "kl": 0.098388671875, + "learning_rate": 4.865284974093264e-07, + "loss": 0.0001, + "reward": 2.499995470046997, + "reward_std": 4.564170694720815e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 1983 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.139896373056994, + "grad_norm": 1.4312647050672191, + "kl": 0.11602783203125, + "learning_rate": 4.862694300518134e-07, + "loss": 0.0011, + "reward": 2.499983310699463, + "reward_std": 8.439705197815783e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831914901733, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 5.142487046632124, + "grad_norm": 1.1114484686755408, + "kl": 0.133056640625, + "learning_rate": 4.860103626943005e-07, + "loss": 0.0003, + "reward": 2.499979257583618, + "reward_std": 7.881527835706947e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999792575836182, + "step": 1985 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.1450777202072535, + "grad_norm": 0.13703469199686946, + "kl": 0.06201171875, + "learning_rate": 4.857512953367875e-07, + "loss": 0.0004, + "reward": 2.4999988079071045, + "reward_std": 1.3316914646566147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.147668393782383, + "grad_norm": 6.870160815810715, + "kl": 0.144287109375, + "learning_rate": 4.854922279792746e-07, + "loss": 0.0005, + "reward": 1.999910831451416, + "reward_std": 3.4607599957325874e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999107718467712, + "step": 1987 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.150259067357513, + "grad_norm": 4.704891290151144, + "kl": 0.07763671875, + "learning_rate": 4.852331606217616e-07, + "loss": 0.0013, + "reward": 2.4999663829803467, + "reward_std": 1.880599211290246e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999664425849915, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.152849740932642, + "grad_norm": 3.9830847568788244, + "kl": 0.0584716796875, + "learning_rate": 4.849740932642487e-07, + "loss": -0.0005, + "reward": 2.499983787536621, + "reward_std": 1.3517110119209974e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999840259552002, + "step": 1989 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.155440414507772, + "grad_norm": 0.6231674188728465, + "kl": 0.056640625, + "learning_rate": 4.847150259067357e-07, + "loss": 0.0001, + "reward": 2.499987840652466, + "reward_std": 6.364800810843008e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999878406524658, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.158031088082901, + "grad_norm": 28.02749306094768, + "kl": 0.09814453125, + "learning_rate": 4.844559585492228e-07, + "loss": 0.0004, + "reward": 1.8516458868980408, + "reward_std": 0.1900110165006481, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3516458570957184, + "step": 1991 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.160621761658031, + "grad_norm": 0.18978471173080563, + "kl": 0.0386962890625, + "learning_rate": 4.841968911917098e-07, + "loss": -0.0006, + "reward": 2.4999979734420776, + "reward_std": 1.7876092215374229e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 1992 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.16321243523316, + "grad_norm": 2.095479182630184, + "kl": 0.045166015625, + "learning_rate": 4.839378238341968e-07, + "loss": -0.0006, + "reward": 2.498996138572693, + "reward_std": 4.0461362061705586e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9989961981773376, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.16580310880829, + "grad_norm": 0.07316495224379722, + "kl": 0.103759765625, + "learning_rate": 4.836787564766839e-07, + "loss": 0.0004, + "reward": 2.4999977350234985, + "reward_std": 1.9874941017405945e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 1994 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.168393782383419, + "grad_norm": 1.3292786854194039, + "kl": 0.05615234375, + "learning_rate": 4.834196891191709e-07, + "loss": 0.0019, + "reward": 2.4999654293060303, + "reward_std": 6.915546236996306e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999651908874512, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.170984455958549, + "grad_norm": 0.8869836113656446, + "kl": 0.0479736328125, + "learning_rate": 4.83160621761658e-07, + "loss": 0.0011, + "reward": 2.4999903440475464, + "reward_std": 8.109043790227588e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905228614807, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.1735751295336785, + "grad_norm": 10.947481721947607, + "kl": 0.27978515625, + "learning_rate": 4.82901554404145e-07, + "loss": 0.0008, + "reward": 2.1249611377716064, + "reward_std": 0.2314744981044896, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6249611377716064, + "step": 1997 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.176165803108808, + "grad_norm": 2.669004823020978, + "kl": 0.093017578125, + "learning_rate": 4.826424870466321e-07, + "loss": 0.0005, + "reward": 2.499934196472168, + "reward_std": 1.316819543717429e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999340772628784, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.178756476683938, + "grad_norm": 0.5352163084793194, + "kl": 0.097900390625, + "learning_rate": 4.823834196891191e-07, + "loss": -0.0007, + "reward": 2.499977707862854, + "reward_std": 5.183891062188195e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999775886535645, + "step": 1999 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.181347150259067, + "grad_norm": 0.9272355918295676, + "kl": 0.15576171875, + "learning_rate": 4.821243523316062e-07, + "loss": 0.0006, + "reward": 2.4999847412109375, + "reward_std": 9.09536879589723e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999848008155823, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.183937823834197, + "grad_norm": 12.226005661886214, + "kl": 0.073486328125, + "learning_rate": 4.818652849740932e-07, + "loss": 0.0002, + "reward": 1.9373315572738647, + "reward_std": 0.1768157596416131, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4373316764831543, + "step": 2001 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 5.186528497409326, + "grad_norm": 0.24489628481099135, + "kl": 0.075927734375, + "learning_rate": 4.816062176165802e-07, + "loss": -0.0004, + "reward": 2.4999786615371704, + "reward_std": 4.631241154129384e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999786615371704, + "step": 2002 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 5.189119170984456, + "grad_norm": 38.43430854479419, + "kl": 0.0875244140625, + "learning_rate": 4.813471502590673e-07, + "loss": 0.0009, + "reward": 1.8253574967384338, + "reward_std": 0.10489688286997989, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3253573775291443, + "step": 2003 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.191709844559585, + "grad_norm": 73.1767457782988, + "kl": 0.0704345703125, + "learning_rate": 4.810880829015543e-07, + "loss": 0.0007, + "reward": 2.312434434890747, + "reward_std": 0.25882142814953113, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.812434434890747, + "step": 2004 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.194300518134715, + "grad_norm": 0.9140457657461409, + "kl": 0.123291015625, + "learning_rate": 4.808290155440415e-07, + "loss": 0.001, + "reward": 2.499956965446472, + "reward_std": 1.1952409522564267e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999569058418274, + "step": 2005 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.196891191709844, + "grad_norm": 18.606498102894474, + "kl": 0.18408203125, + "learning_rate": 4.805699481865285e-07, + "loss": 0.0008, + "reward": 1.7579326629638672, + "reward_std": 0.17730119306361303, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2579325437545776, + "step": 2006 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.199481865284974, + "grad_norm": 0.7857258017267698, + "kl": 0.078369140625, + "learning_rate": 4.803108808290155e-07, + "loss": 0.0007, + "reward": 2.4999752044677734, + "reward_std": 9.016391743443819e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999752044677734, + "step": 2007 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.2020725388601035, + "grad_norm": 0.5625062693993619, + "kl": 0.067626953125, + "learning_rate": 4.800518134715026e-07, + "loss": -0.0007, + "reward": 2.4999927282333374, + "reward_std": 3.3889252222252253e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 2008 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.204663212435233, + "grad_norm": 0.12358922776903711, + "kl": 0.07574462890625, + "learning_rate": 4.797927461139897e-07, + "loss": 0.0012, + "reward": 2.499997138977051, + "reward_std": 2.235927468063892e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2009 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.2072538860103625, + "grad_norm": 2.0375427942254056, + "kl": 0.0631103515625, + "learning_rate": 4.795336787564767e-07, + "loss": 0.0002, + "reward": 2.499986410140991, + "reward_std": 1.0443450207731075e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999864101409912, + "step": 2010 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.209844559585492, + "grad_norm": 0.16868327954000012, + "kl": 0.11669921875, + "learning_rate": 4.792746113989637e-07, + "loss": -0.001, + "reward": 2.4999938011169434, + "reward_std": 3.768546434912423e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 2011 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.212435233160622, + "grad_norm": 0.3307661741865748, + "kl": 0.159912109375, + "learning_rate": 4.790155440414508e-07, + "loss": 0.0018, + "reward": 1.999861717224121, + "reward_std": 7.297677683482107e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998615086078644, + "step": 2012 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.215025906735751, + "grad_norm": 3.0777413562930023, + "kl": 0.03985595703125, + "learning_rate": 4.787564766839378e-07, + "loss": 0.001, + "reward": 1.8221864104270935, + "reward_std": 0.00032432956504635513, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.322186291217804, + "step": 2013 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.217616580310881, + "grad_norm": 0.23269579553536118, + "kl": 0.149658203125, + "learning_rate": 4.784974093264249e-07, + "loss": 0.0006, + "reward": 2.499996066093445, + "reward_std": 3.5594301834862563e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2014 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.22020725388601, + "grad_norm": 1.3045431916202663, + "kl": 0.06170654296875, + "learning_rate": 4.782383419689119e-07, + "loss": 0.0012, + "reward": 2.4999818801879883, + "reward_std": 9.364475999973365e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999819993972778, + "step": 2015 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.22279792746114, + "grad_norm": 13.47631285618988, + "kl": 0.12646484375, + "learning_rate": 4.779792746113989e-07, + "loss": 0.0005, + "reward": 2.437469244003296, + "reward_std": 0.17679856166296304, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374691843986511, + "step": 2016 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.225388601036269, + "grad_norm": 19.257090874550265, + "kl": 0.08135986328125, + "learning_rate": 4.77720207253886e-07, + "loss": -0.0003, + "reward": 2.3749598264694214, + "reward_std": 0.23149637901906317, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749597668647766, + "step": 2017 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.227979274611399, + "grad_norm": 2.206561954833908, + "kl": 0.15771484375, + "learning_rate": 4.774611398963731e-07, + "loss": 0.0006, + "reward": 1.679569959640503, + "reward_std": 0.00018433171248943836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1795699745416641, + "step": 2018 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.230569948186528, + "grad_norm": 0.9730524641993075, + "kl": 0.083984375, + "learning_rate": 4.772020725388601e-07, + "loss": 0.0006, + "reward": 1.9998382329940796, + "reward_std": 1.108695028051443e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998381435871124, + "step": 2019 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.233160621761658, + "grad_norm": 0.7803634350721955, + "kl": 0.11572265625, + "learning_rate": 4.769430051813471e-07, + "loss": 0.0006, + "reward": 2.4999921321868896, + "reward_std": 6.346814558355618e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 2020 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.2357512953367875, + "grad_norm": 4.491683672925936, + "kl": 0.0859375, + "learning_rate": 4.7668393782383414e-07, + "loss": 0.0003, + "reward": 2.499886393547058, + "reward_std": 3.21405750582926e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998863339424133, + "step": 2021 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.238341968911917, + "grad_norm": 0.1495562754015289, + "kl": 0.0795440673828125, + "learning_rate": 4.7642487046632124e-07, + "loss": 0.0008, + "reward": 2.4999966621398926, + "reward_std": 1.8294344954483677e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 2022 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.240932642487047, + "grad_norm": 2.331964953034922, + "kl": 0.10107421875, + "learning_rate": 4.761658031088083e-07, + "loss": -0.0006, + "reward": 2.499977469444275, + "reward_std": 1.2538307601062115e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999977707862854, + "step": 2023 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 5.243523316062176, + "grad_norm": 35.18168684649078, + "kl": 0.157470703125, + "learning_rate": 4.759067357512953e-07, + "loss": 0.0007, + "reward": 1.9348769187927246, + "reward_std": 0.18204218066239264, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4348769187927246, + "step": 2024 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.246113989637306, + "grad_norm": 6.064152304671751, + "kl": 0.0927734375, + "learning_rate": 4.7564766839378235e-07, + "loss": 0.001, + "reward": 1.9447259306907654, + "reward_std": 0.0001724832382024033, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4447258710861206, + "step": 2025 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.248704663212435, + "grad_norm": 0.1557743861833267, + "kl": 0.074951171875, + "learning_rate": 4.7538860103626945e-07, + "loss": -0.0005, + "reward": 2.4999911785125732, + "reward_std": 3.6043558395704167e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999913573265076, + "step": 2026 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.251295336787565, + "grad_norm": 0.21249021004467808, + "kl": 0.118896484375, + "learning_rate": 4.7512953367875645e-07, + "loss": 0.001, + "reward": 2.4999886751174927, + "reward_std": 2.9805546546413098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999884963035583, + "step": 2027 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.253886010362694, + "grad_norm": 0.13388185961559354, + "kl": 0.091796875, + "learning_rate": 4.748704663212435e-07, + "loss": 0.0012, + "reward": 2.4999974966049194, + "reward_std": 2.211841092503164e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2028 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.256476683937824, + "grad_norm": 0.10384611493487242, + "kl": 0.048828125, + "learning_rate": 4.7461139896373056e-07, + "loss": 0.0013, + "reward": 2.499996066093445, + "reward_std": 2.168377875477745e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2029 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.259067357512953, + "grad_norm": 0.41286146074346136, + "kl": 0.047119140625, + "learning_rate": 4.7435233160621756e-07, + "loss": -0.0002, + "reward": 2.4999895095825195, + "reward_std": 4.048265054734657e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998939037323, + "step": 2030 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.261658031088083, + "grad_norm": 11.093720648281675, + "kl": 0.15673828125, + "learning_rate": 4.7409326424870466e-07, + "loss": 0.0008, + "reward": 2.4374760389328003, + "reward_std": 0.17682615059948148, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374760389328003, + "step": 2031 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.2642487046632125, + "grad_norm": 0.04485272139727189, + "kl": 0.100830078125, + "learning_rate": 4.738341968911917e-07, + "loss": 0.0003, + "reward": 2.499998450279236, + "reward_std": 1.171377107311855e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 2032 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.266839378238342, + "grad_norm": 1.305999910431039, + "kl": 0.1181640625, + "learning_rate": 4.735751295336787e-07, + "loss": 0.0008, + "reward": 2.499970316886902, + "reward_std": 1.1428786933720403e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999703168869019, + "step": 2033 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.269430051813472, + "grad_norm": 0.7582947177992131, + "kl": 0.05078125, + "learning_rate": 4.7331606217616577e-07, + "loss": 0.0005, + "reward": 1.9998403787612915, + "reward_std": 1.263207673218858e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499840408563614, + "step": 2034 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.272020725388601, + "grad_norm": 0.1503034817943373, + "kl": 0.0736083984375, + "learning_rate": 4.730569948186529e-07, + "loss": -0.0006, + "reward": 2.499990701675415, + "reward_std": 2.7719107720258762e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 2035 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.274611398963731, + "grad_norm": 0.06364041847068415, + "kl": 0.096923828125, + "learning_rate": 4.7279792746113987e-07, + "loss": 0.0002, + "reward": 2.499995470046997, + "reward_std": 1.3829541956056346e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 2036 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.27720207253886, + "grad_norm": 2.8009695872641536, + "kl": 0.18115234375, + "learning_rate": 4.725388601036269e-07, + "loss": 0.0013, + "reward": 2.4999924898147583, + "reward_std": 5.7567809790270985e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 2037 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.27979274611399, + "grad_norm": 1.1372837058413379, + "kl": 0.072021484375, + "learning_rate": 4.72279792746114e-07, + "loss": -0.0004, + "reward": 1.999854564666748, + "reward_std": 1.872285395165818e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998546838760376, + "step": 2038 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.282383419689119, + "grad_norm": 2.378991347336132, + "kl": 0.127685546875, + "learning_rate": 4.72020725388601e-07, + "loss": 0.0006, + "reward": 2.499992847442627, + "reward_std": 8.28299613431227e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 2039 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.284974093264249, + "grad_norm": 0.562924260445607, + "kl": 0.1142578125, + "learning_rate": 4.717616580310881e-07, + "loss": -0.0005, + "reward": 2.4999905824661255, + "reward_std": 4.405285949360405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 2040 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.287564766839378, + "grad_norm": 0.5181950000187854, + "kl": 0.1455078125, + "learning_rate": 4.7150259067357514e-07, + "loss": -0.0002, + "reward": 1.9998031854629517, + "reward_std": 1.1115156326013675e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998033046722412, + "step": 2041 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75, + "epoch": 5.290155440414508, + "grad_norm": 0.7796277982813166, + "kl": 0.07257080078125, + "learning_rate": 4.7124352331606214e-07, + "loss": 0.0015, + "reward": 2.499987244606018, + "reward_std": 2.3257948100763315e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 2042 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.2927461139896375, + "grad_norm": 1441.9090813129255, + "kl": 0.15576171875, + "learning_rate": 4.709844559585492e-07, + "loss": 0.0001, + "reward": 1.9705055952072144, + "reward_std": 0.001294660042731266, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4705053865909576, + "step": 2043 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.295336787564767, + "grad_norm": 27.763422929052062, + "kl": 0.112548828125, + "learning_rate": 4.7072538860103624e-07, + "loss": -0.0003, + "reward": 2.4987707138061523, + "reward_std": 2.335650242457632e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.998770833015442, + "step": 2044 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.2979274611398965, + "grad_norm": 0.03659922843868473, + "kl": 0.0985107421875, + "learning_rate": 4.704663212435233e-07, + "loss": -0.0002, + "reward": 2.4999982118606567, + "reward_std": 1.6554770354559878e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 2045 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.300518134715026, + "grad_norm": 43.25512056085125, + "kl": 0.1611328125, + "learning_rate": 4.7020725388601035e-07, + "loss": 0.0007, + "reward": 1.4522658586502075, + "reward_std": 0.0009199381747748703, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9522657990455627, + "step": 2046 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.303108808290156, + "grad_norm": 0.4428490575050021, + "kl": 0.09796142578125, + "learning_rate": 4.699481865284974e-07, + "loss": -0.0005, + "reward": 2.499997615814209, + "reward_std": 1.3435702612696332e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2047 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.305699481865285, + "grad_norm": 0.16740936216765842, + "kl": 0.0474853515625, + "learning_rate": 4.696891191709844e-07, + "loss": 0.0006, + "reward": 2.499979853630066, + "reward_std": 4.220144774080836e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999797344207764, + "step": 2048 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.308290155440415, + "grad_norm": 7.065617004590854, + "kl": 0.0830078125, + "learning_rate": 4.694300518134715e-07, + "loss": -0.0001, + "reward": 2.437489867210388, + "reward_std": 0.17679200713064347, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374898672103882, + "step": 2049 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.310880829015544, + "grad_norm": 22.297933049597205, + "kl": 0.048583984375, + "learning_rate": 4.6917098445595856e-07, + "loss": 0.0004, + "reward": 2.499940514564514, + "reward_std": 2.7524716870175325e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999406337738037, + "step": 2050 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.313471502590674, + "grad_norm": 3.250714341474138, + "kl": 0.0985107421875, + "learning_rate": 4.6891191709844556e-07, + "loss": 0.0012, + "reward": 2.0624446272850037, + "reward_std": 0.17678561293860184, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624443888664246, + "step": 2051 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.316062176165803, + "grad_norm": 18.763013860413036, + "kl": 0.068603515625, + "learning_rate": 4.686528497409326e-07, + "loss": -0.0005, + "reward": 2.4374810457229614, + "reward_std": 0.17680527231124188, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374812245368958, + "step": 2052 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.318652849740933, + "grad_norm": 0.37487403564785005, + "kl": 0.1455078125, + "learning_rate": 4.6839378238341966e-07, + "loss": 0.0006, + "reward": 2.499966621398926, + "reward_std": 6.136103820608696e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999666810035706, + "step": 2053 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.321243523316062, + "grad_norm": 0.08383875803303492, + "kl": 0.0526123046875, + "learning_rate": 4.681347150259067e-07, + "loss": 0.0018, + "reward": 2.4999969005584717, + "reward_std": 1.298973870689224e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 2054 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.323834196891192, + "grad_norm": 4.800539919076755, + "kl": 0.07568359375, + "learning_rate": 4.6787564766839377e-07, + "loss": 0.0006, + "reward": 1.9934781789779663, + "reward_std": 9.310352805869115e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4934781789779663, + "step": 2055 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.3264248704663215, + "grad_norm": 3.089813281554356, + "kl": 0.18017578125, + "learning_rate": 4.676165803108808e-07, + "loss": 0.001, + "reward": 1.9996094703674316, + "reward_std": 4.1392646153326496e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996094405651093, + "step": 2056 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.329015544041451, + "grad_norm": 3.4798608006591842, + "kl": 0.060302734375, + "learning_rate": 4.673575129533678e-07, + "loss": 0.0001, + "reward": 2.499972701072693, + "reward_std": 1.2096402770112036e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999972641468048, + "step": 2057 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.331606217616581, + "grad_norm": 1.4020570072343297, + "kl": 0.07763671875, + "learning_rate": 4.670984455958549e-07, + "loss": 0.0008, + "reward": 1.9996461868286133, + "reward_std": 7.083769673954521e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996461868286133, + "step": 2058 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.33419689119171, + "grad_norm": 18.770998896832978, + "kl": 0.10400390625, + "learning_rate": 4.66839378238342e-07, + "loss": 0.0008, + "reward": 1.986478328704834, + "reward_std": 0.00022058276499592466, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4864783883094788, + "step": 2059 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.33678756476684, + "grad_norm": 1.5290833770478713, + "kl": 0.1123046875, + "learning_rate": 4.66580310880829e-07, + "loss": 0.0011, + "reward": 2.4999831914901733, + "reward_std": 9.041373857598956e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999830722808838, + "step": 2060 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.339378238341969, + "grad_norm": 3.365025108209568, + "kl": 0.17138671875, + "learning_rate": 4.6632124352331603e-07, + "loss": 0.0008, + "reward": 1.4974809288978577, + "reward_std": 8.409101064899005e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9974808692932129, + "step": 2061 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.341968911917099, + "grad_norm": 0.516722815518361, + "kl": 0.07373046875, + "learning_rate": 4.660621761658031e-07, + "loss": 0.0013, + "reward": 2.49996817111969, + "reward_std": 6.534461249430024e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999680519104004, + "step": 2062 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.344559585492228, + "grad_norm": 42.9464911958448, + "kl": 0.11328125, + "learning_rate": 4.6580310880829014e-07, + "loss": 0.0001, + "reward": 1.8297033905982971, + "reward_std": 0.0010806132086145226, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.329703450202942, + "step": 2063 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 5.347150259067358, + "grad_norm": 3.571548149362702, + "kl": 0.088134765625, + "learning_rate": 4.655440414507772e-07, + "loss": 0.0007, + "reward": 2.499848246574402, + "reward_std": 1.895615514513338e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999848186969757, + "step": 2064 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.349740932642487, + "grad_norm": 1.5485089860855472, + "kl": 0.10986328125, + "learning_rate": 4.6528497409326424e-07, + "loss": 0.0, + "reward": 2.49999463558197, + "reward_std": 6.795088211219991e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994695186615, + "step": 2065 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.352331606217617, + "grad_norm": 0.46202290980096195, + "kl": 0.1143798828125, + "learning_rate": 4.6502590673575124e-07, + "loss": 0.0004, + "reward": 2.4999818801879883, + "reward_std": 5.970556117063097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818801879883, + "step": 2066 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.3549222797927465, + "grad_norm": 0.44432918152238815, + "kl": 0.03643798828125, + "learning_rate": 4.647668393782383e-07, + "loss": 0.0007, + "reward": 2.499997615814209, + "reward_std": 2.428942707410897e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 2067 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.357512953367876, + "grad_norm": 2.773157396815389, + "kl": 0.388916015625, + "learning_rate": 4.645077720207254e-07, + "loss": 0.001, + "reward": 2.499998688697815, + "reward_std": 1.1089787221862935e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 2068 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.360103626943006, + "grad_norm": 9.300575965577616, + "kl": 0.08203125, + "learning_rate": 4.642487046632124e-07, + "loss": 0.0005, + "reward": 2.3749853372573853, + "reward_std": 0.23146944576802753, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.87498539686203, + "step": 2069 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.362694300518135, + "grad_norm": 1.1781889635855334, + "kl": 0.106689453125, + "learning_rate": 4.6398963730569945e-07, + "loss": -0.0001, + "reward": 1.9999173879623413, + "reward_std": 1.2754080671584234e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999174177646637, + "step": 2070 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.365284974093265, + "grad_norm": 1.3031170850880578, + "kl": 0.082763671875, + "learning_rate": 4.637305699481865e-07, + "loss": -0.0, + "reward": 2.4999780654907227, + "reward_std": 1.0340319136048493e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781250953674, + "step": 2071 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.367875647668393, + "grad_norm": 1.978468819335199, + "kl": 0.150634765625, + "learning_rate": 4.6347150259067356e-07, + "loss": -0.0001, + "reward": 2.499780058860779, + "reward_std": 2.750314820332278e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9997801184654236, + "step": 2072 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.370466321243523, + "grad_norm": 32.61666273121922, + "kl": 0.1259765625, + "learning_rate": 4.632124352331606e-07, + "loss": -0.0, + "reward": 1.9938130378723145, + "reward_std": 9.415466047357768e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4938131272792816, + "step": 2073 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.373056994818652, + "grad_norm": 0.4298273314831697, + "kl": 0.1123046875, + "learning_rate": 4.6295336787564766e-07, + "loss": 0.001, + "reward": 2.499993681907654, + "reward_std": 3.947814434468455e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993622303009, + "step": 2074 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.375647668393782, + "grad_norm": 12.019764169042013, + "kl": 0.11474609375, + "learning_rate": 4.6269430051813466e-07, + "loss": 0.0003, + "reward": 1.996951937675476, + "reward_std": 8.88830535359375e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4969519674777985, + "step": 2075 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.3782383419689115, + "grad_norm": 7.571780400970682, + "kl": 0.70361328125, + "learning_rate": 4.624352331606217e-07, + "loss": 0.0023, + "reward": 2.4999905824661255, + "reward_std": 7.733488473604666e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990701675415, + "step": 2076 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.380829015544041, + "grad_norm": 2.9010578104469884, + "kl": 0.093505859375, + "learning_rate": 4.621761658031088e-07, + "loss": -0.0005, + "reward": 2.499996066093445, + "reward_std": 3.613636465615855e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 2077 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.383419689119171, + "grad_norm": 0.20628868597865285, + "kl": 0.045166015625, + "learning_rate": 4.619170984455958e-07, + "loss": 0.0008, + "reward": 2.499986171722412, + "reward_std": 3.1702651881460042e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986171722412, + "step": 2078 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.3860103626943, + "grad_norm": 0.06924294406913326, + "kl": 0.093017578125, + "learning_rate": 4.616580310880829e-07, + "loss": 0.0007, + "reward": 2.499999165534973, + "reward_std": 1.1082049127253413e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 2079 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.38860103626943, + "grad_norm": 0.3183844852046282, + "kl": 0.07275390625, + "learning_rate": 4.6139896373056993e-07, + "loss": -0.0002, + "reward": 2.4998854398727417, + "reward_std": 6.824276283623476e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998854398727417, + "step": 2080 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.391191709844559, + "grad_norm": 0.35241318307090086, + "kl": 0.083984375, + "learning_rate": 4.611398963730569e-07, + "loss": 0.0014, + "reward": 2.4999953508377075, + "reward_std": 3.296984459666419e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 2081 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.393782383419689, + "grad_norm": 1.2599272912305945, + "kl": 0.107177734375, + "learning_rate": 4.6088082901554403e-07, + "loss": 0.0005, + "reward": 1.9987984895706177, + "reward_std": 2.61850881315695e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498798429965973, + "step": 2082 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.396373056994818, + "grad_norm": 0.7021500807060977, + "kl": 0.067626953125, + "learning_rate": 4.606217616580311e-07, + "loss": 0.0006, + "reward": 2.499995470046997, + "reward_std": 3.372900181375371e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2083 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.398963730569948, + "grad_norm": 0.20631606785383624, + "kl": 0.12646484375, + "learning_rate": 4.603626943005181e-07, + "loss": -0.0002, + "reward": 2.4999841451644897, + "reward_std": 3.011648686879198e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999843835830688, + "step": 2084 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 5.401554404145077, + "grad_norm": 109.22476920514463, + "kl": 0.11474609375, + "learning_rate": 4.6010362694300514e-07, + "loss": 0.0002, + "reward": 1.935310959815979, + "reward_std": 0.08227558277485514, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4353110194206238, + "step": 2085 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.404145077720207, + "grad_norm": 0.0870789919329384, + "kl": 0.080078125, + "learning_rate": 4.5984455958549224e-07, + "loss": 0.0015, + "reward": 2.49997341632843, + "reward_std": 2.1756844148512755e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973177909851, + "step": 2086 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.4067357512953365, + "grad_norm": 3.654903948211217, + "kl": 0.14111328125, + "learning_rate": 4.5958549222797924e-07, + "loss": 0.0002, + "reward": 2.499973773956299, + "reward_std": 1.4916741918113985e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999738931655884, + "step": 2087 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.409326424870466, + "grad_norm": 0.7304761168384192, + "kl": 0.113525390625, + "learning_rate": 4.593264248704663e-07, + "loss": 0.0004, + "reward": 1.9996466040611267, + "reward_std": 1.172649513137003e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996465146541595, + "step": 2088 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.4119170984455955, + "grad_norm": 6.626392757495933, + "kl": 0.092041015625, + "learning_rate": 4.5906735751295335e-07, + "loss": 0.0006, + "reward": 2.4987927675247192, + "reward_std": 9.908435117722547e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9987927079200745, + "step": 2089 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.414507772020725, + "grad_norm": 0.12737609264142025, + "kl": 0.131103515625, + "learning_rate": 4.5880829015544035e-07, + "loss": 0.0005, + "reward": 2.499995470046997, + "reward_std": 2.684502192096261e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2090 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.417098445595855, + "grad_norm": 10.440643239896733, + "kl": 0.1435546875, + "learning_rate": 4.5854922279792745e-07, + "loss": 0.0002, + "reward": 1.9852285385131836, + "reward_std": 0.00026295995030523045, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4852285981178284, + "step": 2091 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.419689119170984, + "grad_norm": 0.14474505058066395, + "kl": 0.04144287109375, + "learning_rate": 4.582901554404145e-07, + "loss": 0.0006, + "reward": 2.4999972581863403, + "reward_std": 1.935968896304985e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2092 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.422279792746114, + "grad_norm": 0.4666911490847812, + "kl": 0.04620361328125, + "learning_rate": 4.580310880829015e-07, + "loss": 0.0002, + "reward": 2.4999961853027344, + "reward_std": 2.240518426788185e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 2093 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.424870466321243, + "grad_norm": 0.23685128998960037, + "kl": 0.115478515625, + "learning_rate": 4.5777202072538856e-07, + "loss": 0.0013, + "reward": 2.499997615814209, + "reward_std": 1.6561365896450297e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2094 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.427461139896373, + "grad_norm": 0.1170719647192125, + "kl": 0.10888671875, + "learning_rate": 4.5751295336787566e-07, + "loss": 0.0003, + "reward": 2.4999972581863403, + "reward_std": 1.4930296288184763e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 2095 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.430051813471502, + "grad_norm": 25.397776203074454, + "kl": 0.115966796875, + "learning_rate": 4.5725388601036266e-07, + "loss": -0.0001, + "reward": 2.3748977184295654, + "reward_std": 0.2316294201746132, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8748977184295654, + "step": 2096 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.432642487046632, + "grad_norm": 3.5070219488633594, + "kl": 0.1563720703125, + "learning_rate": 4.569948186528497e-07, + "loss": 0.0006, + "reward": 1.8793118000030518, + "reward_std": 0.00023071511327543703, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3793119192123413, + "step": 2097 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 5.435233160621761, + "grad_norm": 257.7513122291959, + "kl": 0.1572265625, + "learning_rate": 4.5673575129533677e-07, + "loss": 0.0007, + "reward": 1.390673577785492, + "reward_std": 0.36016987028415315, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8906736075878143, + "step": 2098 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.125, + "epoch": 5.437823834196891, + "grad_norm": 39.058394246433764, + "kl": 0.20263671875, + "learning_rate": 4.5647668393782377e-07, + "loss": 0.0005, + "reward": 1.9938093423843384, + "reward_std": 0.00020790389680769295, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4938094019889832, + "step": 2099 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.4404145077720205, + "grad_norm": 1.6093715633645567, + "kl": 0.0562744140625, + "learning_rate": 4.562176165803109e-07, + "loss": 0.0008, + "reward": 1.9986222982406616, + "reward_std": 3.4460435131222766e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986222684383392, + "step": 2100 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.44300518134715, + "grad_norm": 0.13423746272452125, + "kl": 0.15283203125, + "learning_rate": 4.5595854922279793e-07, + "loss": 0.0006, + "reward": 2.4999983310699463, + "reward_std": 1.0358177462421736e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2101 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.44559585492228, + "grad_norm": 1.0632176547632588, + "kl": 0.070068359375, + "learning_rate": 4.5569948186528493e-07, + "loss": -0.0001, + "reward": 2.4999743700027466, + "reward_std": 7.153669457693468e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999744296073914, + "step": 2102 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.448186528497409, + "grad_norm": 0.8483378456759624, + "kl": 0.07275390625, + "learning_rate": 4.55440414507772e-07, + "loss": 0.0005, + "reward": 2.499990224838257, + "reward_std": 8.1300312899657e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 2103 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.450777202072539, + "grad_norm": 17.511566349120763, + "kl": 0.09375, + "learning_rate": 4.5518134715025903e-07, + "loss": 0.0001, + "reward": 2.4999918937683105, + "reward_std": 6.739405705502577e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999919533729553, + "step": 2104 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.453367875647668, + "grad_norm": 0.7993308030609892, + "kl": 0.130126953125, + "learning_rate": 4.549222797927461e-07, + "loss": 0.0006, + "reward": 2.4999923706054688, + "reward_std": 4.377476869876773e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 2105 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.455958549222798, + "grad_norm": 1.6610785064069333, + "kl": 0.0888671875, + "learning_rate": 4.5466321243523314e-07, + "loss": 0.0004, + "reward": 2.49998140335083, + "reward_std": 9.380372830491979e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999814629554749, + "step": 2106 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.458549222797927, + "grad_norm": 0.07887880199120834, + "kl": 0.1123046875, + "learning_rate": 4.544041450777202e-07, + "loss": 0.0, + "reward": 2.499997138977051, + "reward_std": 2.263416718051303e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2107 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.461139896373057, + "grad_norm": 5.114609539002331, + "kl": 0.0760498046875, + "learning_rate": 4.541450777202072e-07, + "loss": 0.0003, + "reward": 1.9998487830162048, + "reward_std": 2.6893015046880464e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998488128185272, + "step": 2108 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.463730569948186, + "grad_norm": 3.3914473645730507, + "kl": 0.182861328125, + "learning_rate": 4.538860103626943e-07, + "loss": 0.0008, + "reward": 1.9211264848709106, + "reward_std": 0.00027640461863143173, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4211264252662659, + "step": 2109 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.466321243523316, + "grad_norm": 0.7431498933025664, + "kl": 0.19921875, + "learning_rate": 4.5362694300518135e-07, + "loss": 0.0008, + "reward": 2.4999760389328003, + "reward_std": 6.2817168782203225e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999760389328003, + "step": 2110 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.4689119170984455, + "grad_norm": 1.7548006123777358, + "kl": 0.1070556640625, + "learning_rate": 4.5336787564766835e-07, + "loss": -0.0005, + "reward": 1.9984501004219055, + "reward_std": 3.3719107818797056e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498450219631195, + "step": 2111 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.471502590673575, + "grad_norm": 0.17352945210498388, + "kl": 0.0966796875, + "learning_rate": 4.531088082901554e-07, + "loss": 0.0007, + "reward": 2.499997854232788, + "reward_std": 1.8830323256224801e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2112 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.474093264248705, + "grad_norm": 0.8317889210757883, + "kl": 0.0552978515625, + "learning_rate": 4.5284974093264245e-07, + "loss": 0.0001, + "reward": 2.4999839067459106, + "reward_std": 9.960845773093752e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839067459106, + "step": 2113 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.476683937823834, + "grad_norm": 4.660650933004875, + "kl": 0.12451171875, + "learning_rate": 4.5259067357512956e-07, + "loss": 0.0, + "reward": 2.499933958053589, + "reward_std": 1.3370431361181545e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999341368675232, + "step": 2114 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.479274611398964, + "grad_norm": 3.8197825762919737, + "kl": 0.099365234375, + "learning_rate": 4.5233160621761656e-07, + "loss": 0.0001, + "reward": 1.993431806564331, + "reward_std": 0.0001893609992293932, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.493431806564331, + "step": 2115 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.481865284974093, + "grad_norm": 0.17468407121409668, + "kl": 0.05078125, + "learning_rate": 4.520725388601036e-07, + "loss": 0.0005, + "reward": 2.4999953508377075, + "reward_std": 2.761796849881648e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 2116 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.484455958549223, + "grad_norm": 11.985009066598911, + "kl": 0.159423828125, + "learning_rate": 4.5181347150259066e-07, + "loss": -0.0, + "reward": 2.4999886751174927, + "reward_std": 1.3316227523318958e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999885559082031, + "step": 2117 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.487046632124352, + "grad_norm": 6.028526915400043, + "kl": 0.1678466796875, + "learning_rate": 4.515544041450777e-07, + "loss": 0.0004, + "reward": 1.9817107319831848, + "reward_std": 0.004229917497241331, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.48171067237854, + "step": 2118 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.489637305699482, + "grad_norm": 0.35100847570938376, + "kl": 0.072998046875, + "learning_rate": 4.5129533678756477e-07, + "loss": -0.001, + "reward": 2.499992847442627, + "reward_std": 4.280905727682693e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 2119 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.492227979274611, + "grad_norm": 4.4446664368211, + "kl": 0.4342041015625, + "learning_rate": 4.510362694300518e-07, + "loss": 0.0016, + "reward": 2.4374722242355347, + "reward_std": 0.17680404841189556, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374722838401794, + "step": 2120 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.494818652849741, + "grad_norm": 4.078536641069688, + "kl": 0.136474609375, + "learning_rate": 4.507772020725388e-07, + "loss": 0.0015, + "reward": 2.4999375343322754, + "reward_std": 4.1198954022547696e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999374151229858, + "step": 2121 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.4974093264248705, + "grad_norm": 1.3657432669730982, + "kl": 0.117431640625, + "learning_rate": 4.505181347150259e-07, + "loss": 0.0001, + "reward": 2.4999449253082275, + "reward_std": 1.496618210694578e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999448657035828, + "step": 2122 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.5, + "grad_norm": 2.373812144516341, + "kl": 0.080322265625, + "learning_rate": 4.50259067357513e-07, + "loss": 0.0001, + "reward": 2.4999889135360718, + "reward_std": 6.5248079863522435e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 2123 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.5025906735751295, + "grad_norm": 0.14415498418273912, + "kl": 0.095458984375, + "learning_rate": 4.5e-07, + "loss": 0.0014, + "reward": 2.4999982118606567, + "reward_std": 1.468996970288572e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2124 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.505181347150259, + "grad_norm": 3.393357852437887, + "kl": 0.108642578125, + "learning_rate": 4.4974093264248703e-07, + "loss": 0.0015, + "reward": 2.499962568283081, + "reward_std": 2.582822708063759e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999624490737915, + "step": 2125 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.507772020725389, + "grad_norm": 6.333846649146768, + "kl": 0.077392578125, + "learning_rate": 4.494818652849741e-07, + "loss": 0.0009, + "reward": 1.9896284937858582, + "reward_std": 0.00012527380079063732, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4896283745765686, + "step": 2126 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 5.510362694300518, + "grad_norm": 0.27204167394002254, + "kl": 0.05023193359375, + "learning_rate": 4.492227979274611e-07, + "loss": -0.0004, + "reward": 2.4999969005584717, + "reward_std": 2.763033876362897e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2127 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.512953367875648, + "grad_norm": 0.22154252629966525, + "kl": 0.09124755859375, + "learning_rate": 4.489637305699482e-07, + "loss": -0.0008, + "reward": 2.4999849796295166, + "reward_std": 3.4542750881882966e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850988388062, + "step": 2128 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.515544041450777, + "grad_norm": 0.7903013348352401, + "kl": 0.053955078125, + "learning_rate": 4.4870466321243524e-07, + "loss": 0.0009, + "reward": 2.4999918937683105, + "reward_std": 4.760370188705565e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 2129 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.518134715025907, + "grad_norm": 0.12352061277718034, + "kl": 0.10546875, + "learning_rate": 4.4844559585492224e-07, + "loss": 0.001, + "reward": 2.4999953508377075, + "reward_std": 2.1942207695246907e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 2130 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 5.520725388601036, + "grad_norm": 39.76802694769826, + "kl": 0.142822265625, + "learning_rate": 4.481865284974093e-07, + "loss": 0.0005, + "reward": 1.87482488155365, + "reward_std": 0.23157009798887884, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.37482488155365, + "step": 2131 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.523316062176166, + "grad_norm": 2.2846823115254242, + "kl": 0.171630859375, + "learning_rate": 4.479274611398964e-07, + "loss": 0.0008, + "reward": 1.9938727021217346, + "reward_std": 5.393054368596495e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4938727915287018, + "step": 2132 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.525906735751295, + "grad_norm": 16.41539461371132, + "kl": 0.0927734375, + "learning_rate": 4.476683937823834e-07, + "loss": 0.0008, + "reward": 2.4374775886535645, + "reward_std": 0.17680242723702122, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937477469444275, + "step": 2133 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.528497409326425, + "grad_norm": 4.728810302572888, + "kl": 0.13134765625, + "learning_rate": 4.4740932642487045e-07, + "loss": 0.0004, + "reward": 1.9984426498413086, + "reward_std": 7.29897587348205e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498442828655243, + "step": 2134 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.5310880829015545, + "grad_norm": 100.55426032067854, + "kl": 0.162109375, + "learning_rate": 4.471502590673575e-07, + "loss": 0.0006, + "reward": 2.1247791051864624, + "reward_std": 0.2315799526804767, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6247789859771729, + "step": 2135 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.533678756476684, + "grad_norm": 0.5371193730694697, + "kl": 0.06689453125, + "learning_rate": 4.468911917098445e-07, + "loss": -0.0002, + "reward": 2.499989151954651, + "reward_std": 3.8064266902892996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999892711639404, + "step": 2136 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.536269430051814, + "grad_norm": 2.225212087178764, + "kl": 0.0631103515625, + "learning_rate": 4.466321243523316e-07, + "loss": -0.0006, + "reward": 2.499995708465576, + "reward_std": 6.409569550669403e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 2137 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.538860103626943, + "grad_norm": 1.1301952180355956, + "kl": 0.106689453125, + "learning_rate": 4.4637305699481866e-07, + "loss": 0.0013, + "reward": 2.4999881982803345, + "reward_std": 7.659103403057088e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988317489624, + "step": 2138 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.541450777202073, + "grad_norm": 0.6437848025521882, + "kl": 0.050537109375, + "learning_rate": 4.4611398963730566e-07, + "loss": 0.0012, + "reward": 2.4999924898147583, + "reward_std": 4.330290323650843e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 2139 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.544041450777202, + "grad_norm": 1.942622423904635, + "kl": 0.119873046875, + "learning_rate": 4.458549222797927e-07, + "loss": 0.0007, + "reward": 2.4999877214431763, + "reward_std": 1.4025659197614004e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999876618385315, + "step": 2140 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.546632124352332, + "grad_norm": 9.867954327090045, + "kl": 0.157470703125, + "learning_rate": 4.4559585492227977e-07, + "loss": 0.0007, + "reward": 2.499904155731201, + "reward_std": 2.4772121832938865e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999040961265564, + "step": 2141 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.549222797927461, + "grad_norm": 3.6160959552145524, + "kl": 0.1307373046875, + "learning_rate": 4.453367875647668e-07, + "loss": 0.0013, + "reward": 1.9033666849136353, + "reward_std": 0.0002599953592152815, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4033666551113129, + "step": 2142 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.551813471502591, + "grad_norm": 0.09775863213702475, + "kl": 0.1177978515625, + "learning_rate": 4.450777202072539e-07, + "loss": 0.0008, + "reward": 2.4999953508377075, + "reward_std": 1.7423860185772355e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2143 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.55440414507772, + "grad_norm": 2.243589807058799, + "kl": 0.16162109375, + "learning_rate": 4.4481865284974093e-07, + "loss": -0.0006, + "reward": 1.9998326301574707, + "reward_std": 1.964680905075511e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998327493667603, + "step": 2144 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.55699481865285, + "grad_norm": 0.7656459826492177, + "kl": 0.044921875, + "learning_rate": 4.4455958549222793e-07, + "loss": -0.0006, + "reward": 2.4999676942825317, + "reward_std": 8.186028026102576e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999676942825317, + "step": 2145 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.5595854922279795, + "grad_norm": 30.338972631246072, + "kl": 0.080810546875, + "learning_rate": 4.4430051813471503e-07, + "loss": -0.0004, + "reward": 2.499507427215576, + "reward_std": 6.0072918131481856e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9995075464248657, + "step": 2146 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.562176165803109, + "grad_norm": 31.716876967332094, + "kl": 0.13232421875, + "learning_rate": 4.440414507772021e-07, + "loss": 0.0008, + "reward": 2.1871031522750854, + "reward_std": 0.259071770790797, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6871030926704407, + "step": 2147 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.564766839378239, + "grad_norm": 0.1507314942886478, + "kl": 0.13623046875, + "learning_rate": 4.437823834196891e-07, + "loss": -0.0, + "reward": 2.499996781349182, + "reward_std": 2.495411820291338e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 2148 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 5.567357512953368, + "grad_norm": 1.8812029066111369, + "kl": 0.2052001953125, + "learning_rate": 4.4352331606217614e-07, + "loss": 0.0015, + "reward": 2.499962091445923, + "reward_std": 1.1206122508156113e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999619722366333, + "step": 2149 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.569948186528498, + "grad_norm": 1.4599234945564576, + "kl": 0.14697265625, + "learning_rate": 4.432642487046632e-07, + "loss": 0.0013, + "reward": 2.49997341632843, + "reward_std": 1.0891386864386732e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973475933075, + "step": 2150 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.572538860103627, + "grad_norm": 0.16933054186855132, + "kl": 0.0618896484375, + "learning_rate": 4.4300518134715024e-07, + "loss": 0.0014, + "reward": 2.4999969005584717, + "reward_std": 2.5736694340139366e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 2151 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.575129533678757, + "grad_norm": 1.5100692245043172, + "kl": 0.18115234375, + "learning_rate": 4.427461139896373e-07, + "loss": -0.0004, + "reward": 2.4999566078186035, + "reward_std": 1.4240009363675199e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999567866325378, + "step": 2152 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.577720207253886, + "grad_norm": 0.27624674247293, + "kl": 0.107666015625, + "learning_rate": 4.4248704663212435e-07, + "loss": -0.0001, + "reward": 2.4999914169311523, + "reward_std": 3.9234147379829665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914765357971, + "step": 2153 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.580310880829016, + "grad_norm": 0.23845592410913444, + "kl": 0.08056640625, + "learning_rate": 4.4222797927461135e-07, + "loss": 0.0006, + "reward": 2.4999979734420776, + "reward_std": 3.0633586334261054e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2154 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.582901554404145, + "grad_norm": 2.7055092519607125, + "kl": 0.118408203125, + "learning_rate": 4.4196891191709845e-07, + "loss": 0.0002, + "reward": 2.4999910593032837, + "reward_std": 7.3566588412177225e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 2155 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.585492227979275, + "grad_norm": 3.495411818117292, + "kl": 0.0306396484375, + "learning_rate": 4.417098445595855e-07, + "loss": 0.0007, + "reward": 2.499976396560669, + "reward_std": 3.115767276540282e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976396560669, + "step": 2156 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.5880829015544045, + "grad_norm": 1.8742573811680148, + "kl": 0.151611328125, + "learning_rate": 4.414507772020725e-07, + "loss": 0.0014, + "reward": 2.499990940093994, + "reward_std": 6.770880531803414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 2157 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.590673575129534, + "grad_norm": 0.8542966163020366, + "kl": 0.077880859375, + "learning_rate": 4.4119170984455956e-07, + "loss": 0.0007, + "reward": 2.499993324279785, + "reward_std": 5.478145794768352e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 2158 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 5.5932642487046635, + "grad_norm": 24.939876868597928, + "kl": 0.173828125, + "learning_rate": 4.409326424870466e-07, + "loss": -0.0, + "reward": 1.9968894720077515, + "reward_std": 0.0008183199162203891, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.496889442205429, + "step": 2159 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.595854922279793, + "grad_norm": 0.1297434177766517, + "kl": 0.0562744140625, + "learning_rate": 4.4067357512953366e-07, + "loss": 0.0008, + "reward": 2.499990463256836, + "reward_std": 3.38557981649501e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 2160 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.598445595854923, + "grad_norm": 5.402683801644662, + "kl": 0.0650634765625, + "learning_rate": 4.404145077720207e-07, + "loss": 0.0001, + "reward": 2.499993085861206, + "reward_std": 5.6865879969336675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 2161 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.601036269430052, + "grad_norm": 1.466992224835068, + "kl": 0.071533203125, + "learning_rate": 4.4015544041450777e-07, + "loss": -0.0008, + "reward": 2.4999821186065674, + "reward_std": 6.864327815492288e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999821782112122, + "step": 2162 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.603626943005182, + "grad_norm": 11.425689537387154, + "kl": 0.14208984375, + "learning_rate": 4.3989637305699477e-07, + "loss": 0.0004, + "reward": 1.9941102862358093, + "reward_std": 0.000209305703492646, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.494110345840454, + "step": 2163 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.606217616580311, + "grad_norm": 239.6615541810546, + "kl": 0.1412353515625, + "learning_rate": 4.396373056994818e-07, + "loss": 0.0003, + "reward": 1.9959449172019958, + "reward_std": 0.0002583137900842303, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4959449172019958, + "step": 2164 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.608808290155441, + "grad_norm": 1.2062004956838557, + "kl": 0.0689697265625, + "learning_rate": 4.3937823834196893e-07, + "loss": 0.0003, + "reward": 2.499992609024048, + "reward_std": 6.114897701081645e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926090240479, + "step": 2165 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.61139896373057, + "grad_norm": 1.3166438132769336, + "kl": 0.03790283203125, + "learning_rate": 4.3911917098445593e-07, + "loss": -0.0001, + "reward": 2.499997854232788, + "reward_std": 2.2870597149449168e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2166 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.6139896373057, + "grad_norm": 1.1466087835920824, + "kl": 0.1708984375, + "learning_rate": 4.38860103626943e-07, + "loss": -0.0001, + "reward": 2.4999749660491943, + "reward_std": 6.668615469607175e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999750256538391, + "step": 2167 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.616580310880829, + "grad_norm": 7.221542041941564, + "kl": 0.101806640625, + "learning_rate": 4.3860103626943003e-07, + "loss": 0.001, + "reward": 1.9987772107124329, + "reward_std": 6.658512620560941e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987771809101105, + "step": 2168 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.619170984455959, + "grad_norm": 1.3339165040548684, + "kl": 0.10986328125, + "learning_rate": 4.383419689119171e-07, + "loss": 0.0, + "reward": 1.999162197113037, + "reward_std": 1.8814079226103786e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991623163223267, + "step": 2169 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.6217616580310885, + "grad_norm": 4.970004104080629, + "kl": 0.22265625, + "learning_rate": 4.3808290155440414e-07, + "loss": 0.0012, + "reward": 2.499962329864502, + "reward_std": 1.4454813026532065e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999962329864502, + "step": 2170 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.624352331606218, + "grad_norm": 39.739675352619244, + "kl": 0.1358642578125, + "learning_rate": 4.378238341968912e-07, + "loss": 0.0012, + "reward": 2.4346296787261963, + "reward_std": 0.18485839097229473, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9346295595169067, + "step": 2171 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 5.626943005181348, + "grad_norm": 0.08676030760441354, + "kl": 0.20556640625, + "learning_rate": 4.375647668393782e-07, + "loss": 0.0002, + "reward": 2.49999737739563, + "reward_std": 2.0557626214667835e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2172 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.629533678756477, + "grad_norm": 2.032892072792795, + "kl": 0.07177734375, + "learning_rate": 4.3730569948186524e-07, + "loss": 0.001, + "reward": 2.499976396560669, + "reward_std": 7.094038096511213e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999763369560242, + "step": 2173 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.632124352331607, + "grad_norm": 14.289511792638626, + "kl": 0.208984375, + "learning_rate": 4.3704663212435235e-07, + "loss": 0.0008, + "reward": 1.3456003665924072, + "reward_std": 0.0008903330308385193, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8456003963947296, + "step": 2174 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.634715025906736, + "grad_norm": 1.1563340439937397, + "kl": 0.092041015625, + "learning_rate": 4.3678756476683935e-07, + "loss": -0.0002, + "reward": 2.499992609024048, + "reward_std": 3.141753779800638e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 2175 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.637305699481866, + "grad_norm": 4.4233065360139365, + "kl": 0.11376953125, + "learning_rate": 4.365284974093264e-07, + "loss": 0.0017, + "reward": 2.499991297721863, + "reward_std": 5.199670795263955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 2176 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.639896373056995, + "grad_norm": 2.064123143725936, + "kl": 0.186767578125, + "learning_rate": 4.3626943005181345e-07, + "loss": 0.0008, + "reward": 2.4999680519104004, + "reward_std": 1.670425763222738e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999679327011108, + "step": 2177 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.642487046632124, + "grad_norm": 14.289675381810858, + "kl": 0.093994140625, + "learning_rate": 4.3601036269430045e-07, + "loss": 0.0005, + "reward": 1.771733582019806, + "reward_std": 0.2589497046137694, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2717334926128387, + "step": 2178 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.6450777202072535, + "grad_norm": 4.710799672773662, + "kl": 0.095703125, + "learning_rate": 4.3575129533678756e-07, + "loss": 0.0006, + "reward": 2.4999676942825317, + "reward_std": 2.3994149273676157e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999967634677887, + "step": 2179 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.647668393782383, + "grad_norm": 3.0153243824339873, + "kl": 0.099853515625, + "learning_rate": 4.354922279792746e-07, + "loss": -0.0002, + "reward": 1.99897038936615, + "reward_std": 2.9745113579338067e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989705383777618, + "step": 2180 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.650259067357513, + "grad_norm": 1.6771078238030759, + "kl": 0.19775390625, + "learning_rate": 4.352331606217616e-07, + "loss": 0.0003, + "reward": 1.9992891550064087, + "reward_std": 3.6577896935341414e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992891550064087, + "step": 2181 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.652849740932642, + "grad_norm": 7.4613682335426255, + "kl": 0.15087890625, + "learning_rate": 4.3497409326424866e-07, + "loss": 0.0, + "reward": 1.9997640252113342, + "reward_std": 2.9621826797665562e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997640252113342, + "step": 2182 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.655440414507772, + "grad_norm": 5.427593215024822, + "kl": 0.0863037109375, + "learning_rate": 4.3471502590673577e-07, + "loss": 0.0002, + "reward": 2.499963641166687, + "reward_std": 1.2640531849683612e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999963641166687, + "step": 2183 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.658031088082901, + "grad_norm": 0.16297731230208135, + "kl": 0.04840087890625, + "learning_rate": 4.3445595854922277e-07, + "loss": 0.0002, + "reward": 2.4999955892562866, + "reward_std": 2.29190663958434e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2184 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.660621761658031, + "grad_norm": 0.3463993011605354, + "kl": 0.0606689453125, + "learning_rate": 4.341968911917098e-07, + "loss": 0.0002, + "reward": 2.499994993209839, + "reward_std": 3.956152738737728e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 2185 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.66321243523316, + "grad_norm": 1.8063613958281464, + "kl": 0.0745849609375, + "learning_rate": 4.339378238341969e-07, + "loss": 0.0005, + "reward": 2.499988079071045, + "reward_std": 9.938718818602865e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988079071045, + "step": 2186 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.66580310880829, + "grad_norm": 0.6451298478322072, + "kl": 0.158447265625, + "learning_rate": 4.336787564766839e-07, + "loss": 0.0005, + "reward": 2.499992609024048, + "reward_std": 5.973681254545227e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 2187 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.668393782383419, + "grad_norm": 89.7946271326145, + "kl": 0.062255859375, + "learning_rate": 4.33419689119171e-07, + "loss": -0.0001, + "reward": 1.9983490705490112, + "reward_std": 6.310990966085228e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498349130153656, + "step": 2188 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.375, + "epoch": 5.670984455958549, + "grad_norm": 0.6024938866547718, + "kl": 0.37255859375, + "learning_rate": 4.3316062176165803e-07, + "loss": 0.0015, + "reward": 2.4999897480010986, + "reward_std": 5.091042680760438e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999898076057434, + "step": 2189 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.6735751295336785, + "grad_norm": 1.9971883633147045, + "kl": 0.039306640625, + "learning_rate": 4.3290155440414503e-07, + "loss": 0.0005, + "reward": 2.4999808073043823, + "reward_std": 1.1984910997853149e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999806880950928, + "step": 2190 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.676165803108808, + "grad_norm": 0.027292002140266926, + "kl": 0.123046875, + "learning_rate": 4.326424870466321e-07, + "loss": -0.0001, + "reward": 2.4999983310699463, + "reward_std": 1.0307929301234253e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 2191 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.678756476683938, + "grad_norm": 2.986754924318519, + "kl": 0.11767578125, + "learning_rate": 4.323834196891192e-07, + "loss": 0.0004, + "reward": 2.4999624490737915, + "reward_std": 1.8082914607475686e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999624490737915, + "step": 2192 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.681347150259067, + "grad_norm": 1.2617264996412625, + "kl": 0.102783203125, + "learning_rate": 4.321243523316062e-07, + "loss": 0.0003, + "reward": 1.9998027086257935, + "reward_std": 1.0114108818015666e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998028874397278, + "step": 2193 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.683937823834197, + "grad_norm": 16.842615074525348, + "kl": 0.135986328125, + "learning_rate": 4.3186528497409324e-07, + "loss": 0.0012, + "reward": 2.4999572038650513, + "reward_std": 3.409323107916862e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999571442604065, + "step": 2194 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.686528497409326, + "grad_norm": 0.10547165433096149, + "kl": 0.0308837890625, + "learning_rate": 4.316062176165803e-07, + "loss": 0.0003, + "reward": 2.499996781349182, + "reward_std": 2.33273021876812e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2195 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.689119170984456, + "grad_norm": 0.11876973621279549, + "kl": 0.04388427734375, + "learning_rate": 4.313471502590673e-07, + "loss": -0.0001, + "reward": 2.49999737739563, + "reward_std": 2.318077463314694e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 2196 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.691709844559585, + "grad_norm": 4.157460450264975, + "kl": 0.10272216796875, + "learning_rate": 4.310880829015544e-07, + "loss": 0.001, + "reward": 2.4999924898147583, + "reward_std": 7.367076932496275e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924302101135, + "step": 2197 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.694300518134715, + "grad_norm": 2.0254568886724633, + "kl": 0.092529296875, + "learning_rate": 4.3082901554404145e-07, + "loss": -0.0007, + "reward": 1.9981070756912231, + "reward_std": 5.517061708815163e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981070756912231, + "step": 2198 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.696891191709844, + "grad_norm": 1.0029385563081883, + "kl": 0.090087890625, + "learning_rate": 4.3056994818652845e-07, + "loss": 0.0008, + "reward": 2.499942898750305, + "reward_std": 7.17654359050357e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999428987503052, + "step": 2199 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.699481865284974, + "grad_norm": 12.033391575075846, + "kl": 0.13916015625, + "learning_rate": 4.303108808290155e-07, + "loss": 0.001, + "reward": 1.9995281100273132, + "reward_std": 4.938871052218019e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995281100273132, + "step": 2200 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.7020725388601035, + "grad_norm": 3.2803528889727325, + "kl": 0.117919921875, + "learning_rate": 4.3005181347150256e-07, + "loss": -0.0002, + "reward": 2.4999881982803345, + "reward_std": 9.067382279681624e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988317489624, + "step": 2201 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.704663212435233, + "grad_norm": 0.19412235106860593, + "kl": 0.0777587890625, + "learning_rate": 4.297927461139896e-07, + "loss": 0.0, + "reward": 2.499997138977051, + "reward_std": 2.6640018973012047e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2202 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.7072538860103625, + "grad_norm": 1.605406201031802, + "kl": 0.1142578125, + "learning_rate": 4.2953367875647666e-07, + "loss": -0.0001, + "reward": 2.499987006187439, + "reward_std": 5.200048690312542e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 2203 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.709844559585492, + "grad_norm": 6.311186207393324, + "kl": 0.14990234375, + "learning_rate": 4.292746113989637e-07, + "loss": 0.0004, + "reward": 1.868963897228241, + "reward_std": 0.00039028138451158156, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.368963897228241, + "step": 2204 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.712435233160622, + "grad_norm": 0.5214205528297102, + "kl": 0.0902099609375, + "learning_rate": 4.290155440414507e-07, + "loss": -0.0005, + "reward": 2.4999823570251465, + "reward_std": 6.897247658343986e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999982476234436, + "step": 2205 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.715025906735751, + "grad_norm": 0.0781231611815751, + "kl": 0.097412109375, + "learning_rate": 4.287564766839378e-07, + "loss": 0.001, + "reward": 2.4999947547912598, + "reward_std": 1.2366082131620715e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 2206 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.717616580310881, + "grad_norm": 10.094539178504037, + "kl": 0.1123046875, + "learning_rate": 4.284974093264249e-07, + "loss": 0.0008, + "reward": 2.4998950958251953, + "reward_std": 4.9064010909205535e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998952150344849, + "step": 2207 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.72020725388601, + "grad_norm": 3.582528177144623, + "kl": 0.074462890625, + "learning_rate": 4.282383419689119e-07, + "loss": -0.0007, + "reward": 2.4999806880950928, + "reward_std": 9.421323397873493e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999808073043823, + "step": 2208 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.72279792746114, + "grad_norm": 0.3510864351194618, + "kl": 0.040283203125, + "learning_rate": 4.2797927461139893e-07, + "loss": -0.0001, + "reward": 2.499995708465576, + "reward_std": 3.179863938385097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 2209 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.725388601036269, + "grad_norm": 1.277277386059591, + "kl": 0.089111328125, + "learning_rate": 4.27720207253886e-07, + "loss": 0.0004, + "reward": 2.499984622001648, + "reward_std": 8.090762321444345e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999845623970032, + "step": 2210 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.727979274611399, + "grad_norm": 0.15247013370314996, + "kl": 0.0655517578125, + "learning_rate": 4.2746113989637303e-07, + "loss": 0.0003, + "reward": 2.4999969005584717, + "reward_std": 1.8476799255040532e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 2211 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.730569948186528, + "grad_norm": 1.1020447186868272, + "kl": 0.048583984375, + "learning_rate": 4.272020725388601e-07, + "loss": -0.0006, + "reward": 2.499983787536621, + "reward_std": 9.898189318846562e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839067459106, + "step": 2212 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.733160621761658, + "grad_norm": 0.15327195532187002, + "kl": 0.1552734375, + "learning_rate": 4.2694300518134714e-07, + "loss": 0.0009, + "reward": 2.4999947547912598, + "reward_std": 5.267787059892726e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 2213 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.7357512953367875, + "grad_norm": 0.5682223176142653, + "kl": 0.10302734375, + "learning_rate": 4.2668393782383414e-07, + "loss": 0.001, + "reward": 2.499992847442627, + "reward_std": 5.841190215960523e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 2214 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.738341968911917, + "grad_norm": 0.09345128084580233, + "kl": 0.08514404296875, + "learning_rate": 4.2642487046632124e-07, + "loss": -0.0002, + "reward": 2.499997615814209, + "reward_std": 2.6362197331764037e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2215 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.740932642487047, + "grad_norm": 1.2229681281213352, + "kl": 0.063720703125, + "learning_rate": 4.261658031088083e-07, + "loss": -0.0004, + "reward": 2.4999890327453613, + "reward_std": 1.2146653489253367e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999890327453613, + "step": 2216 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.743523316062176, + "grad_norm": 0.4383863720346704, + "kl": 0.103759765625, + "learning_rate": 4.259067357512953e-07, + "loss": 0.001, + "reward": 2.4999791383743286, + "reward_std": 7.784605941196787e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999790787696838, + "step": 2217 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.746113989637306, + "grad_norm": 2.984746817418707, + "kl": 0.145263671875, + "learning_rate": 4.2564766839378235e-07, + "loss": 0.0007, + "reward": 1.8090960383415222, + "reward_std": 0.00031346285140898544, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3090960681438446, + "step": 2218 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 5.748704663212435, + "grad_norm": 545.6949707212282, + "kl": 0.277587890625, + "learning_rate": 4.253886010362694e-07, + "loss": 0.0011, + "reward": 1.2964731454849243, + "reward_std": 0.09693628415698186, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7964731454849243, + "step": 2219 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.751295336787565, + "grad_norm": 1.897796598245334, + "kl": 0.037841796875, + "learning_rate": 4.2512953367875645e-07, + "loss": 0.0, + "reward": 2.499986171722412, + "reward_std": 9.532710123494326e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986171722412, + "step": 2220 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.753886010362694, + "grad_norm": 1.812933322130175, + "kl": 0.123779296875, + "learning_rate": 4.248704663212435e-07, + "loss": 0.0007, + "reward": 2.4999877214431763, + "reward_std": 1.2107628720059438e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999878406524658, + "step": 2221 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.756476683937824, + "grad_norm": 0.07720415276560684, + "kl": 0.063232421875, + "learning_rate": 4.2461139896373056e-07, + "loss": 0.0007, + "reward": 2.499998927116394, + "reward_std": 9.553977236009814e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 2222 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.759067357512953, + "grad_norm": 2.158549640568623, + "kl": 0.0694580078125, + "learning_rate": 4.2435233160621756e-07, + "loss": 0.0005, + "reward": 2.499944567680359, + "reward_std": 1.4589605143555673e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999445676803589, + "step": 2223 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.761658031088083, + "grad_norm": 45.97381184687504, + "kl": 0.1533203125, + "learning_rate": 4.240932642487046e-07, + "loss": 0.0006, + "reward": 1.9166672229766846, + "reward_std": 0.17812484328169376, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4166671633720398, + "step": 2224 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.7642487046632125, + "grad_norm": 0.08539629948295398, + "kl": 0.0830078125, + "learning_rate": 4.238341968911917e-07, + "loss": -0.0004, + "reward": 2.4999974966049194, + "reward_std": 2.157066035124444e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2225 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.766839378238342, + "grad_norm": 0.544541626422927, + "kl": 0.1572265625, + "learning_rate": 4.235751295336787e-07, + "loss": 0.0001, + "reward": 2.499993085861206, + "reward_std": 3.6420831293071387e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 2226 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.769430051813472, + "grad_norm": 1.8255674847350292, + "kl": 0.099365234375, + "learning_rate": 4.2331606217616577e-07, + "loss": -0.0007, + "reward": 2.499986410140991, + "reward_std": 1.2053063187522639e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999865889549255, + "step": 2227 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.772020725388601, + "grad_norm": 3.4899082103546353, + "kl": 0.15283203125, + "learning_rate": 4.230569948186528e-07, + "loss": 0.0009, + "reward": 1.8856375217437744, + "reward_std": 0.00019633736792457057, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3856375217437744, + "step": 2228 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.774611398963731, + "grad_norm": 0.5562292713872523, + "kl": 0.09326171875, + "learning_rate": 4.2279792746113993e-07, + "loss": 0.0009, + "reward": 2.4999953508377075, + "reward_std": 4.360593266028445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 2229 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.77720207253886, + "grad_norm": 32.21773563182162, + "kl": 0.0943603515625, + "learning_rate": 4.2253886010362693e-07, + "loss": 0.0002, + "reward": 2.4999661445617676, + "reward_std": 1.1921830491701257e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999661445617676, + "step": 2230 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.77979274611399, + "grad_norm": 4.591035329567472, + "kl": 0.1094970703125, + "learning_rate": 4.22279792746114e-07, + "loss": 0.0015, + "reward": 2.4999637603759766, + "reward_std": 9.263614174415125e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999635219573975, + "step": 2231 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.782383419689119, + "grad_norm": 1.5780693898666545, + "kl": 0.1279296875, + "learning_rate": 4.22020725388601e-07, + "loss": 0.001, + "reward": 2.499992609024048, + "reward_std": 8.420148901677749e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924898147583, + "step": 2232 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.784974093264249, + "grad_norm": 0.898988183455102, + "kl": 0.13623046875, + "learning_rate": 4.2176165803108803e-07, + "loss": -0.0003, + "reward": 2.4999775886535645, + "reward_std": 5.201562998990994e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999777674674988, + "step": 2233 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.787564766839378, + "grad_norm": 0.14780538723510805, + "kl": 0.093994140625, + "learning_rate": 4.2150259067357514e-07, + "loss": 0.0014, + "reward": 2.4999959468841553, + "reward_std": 2.723076590882556e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 2234 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.790155440414508, + "grad_norm": 0.4833607605083104, + "kl": 0.083740234375, + "learning_rate": 4.212435233160622e-07, + "loss": 0.0009, + "reward": 2.4999765157699585, + "reward_std": 5.81231142859906e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976634979248, + "step": 2235 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.7927461139896375, + "grad_norm": 0.6182751287784096, + "kl": 0.1510009765625, + "learning_rate": 4.209844559585492e-07, + "loss": 0.0001, + "reward": 2.4999940395355225, + "reward_std": 5.23471095448258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 2236 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.795336787564767, + "grad_norm": 8.251118998147856, + "kl": 0.09051513671875, + "learning_rate": 4.2072538860103624e-07, + "loss": 0.0002, + "reward": 1.9537217617034912, + "reward_std": 0.0004934084658998472, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4537217020988464, + "step": 2237 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.7979274611398965, + "grad_norm": 0.24647324225388167, + "kl": 0.16015625, + "learning_rate": 4.2046632124352324e-07, + "loss": 0.0011, + "reward": 2.4999977350234985, + "reward_std": 1.7614371472518542e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2238 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.800518134715026, + "grad_norm": 2.1113569291233176, + "kl": 0.0869140625, + "learning_rate": 4.2020725388601035e-07, + "loss": 0.0015, + "reward": 2.4999892711639404, + "reward_std": 7.539571015513502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999890327453613, + "step": 2239 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.803108808290156, + "grad_norm": 0.9203982096512624, + "kl": 0.05743408203125, + "learning_rate": 4.199481865284974e-07, + "loss": 0.0011, + "reward": 2.499996781349182, + "reward_std": 2.2391941172372753e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 2240 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 5.805699481865285, + "grad_norm": 1.4809847645762941, + "kl": 0.080322265625, + "learning_rate": 4.1968911917098445e-07, + "loss": 0.0013, + "reward": 2.4999871253967285, + "reward_std": 4.650490609492408e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999869465827942, + "step": 2241 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 5.808290155440415, + "grad_norm": 0.12401939803459215, + "kl": 0.135498046875, + "learning_rate": 4.1943005181347145e-07, + "loss": 0.0008, + "reward": 2.4999990463256836, + "reward_std": 9.293904383866902e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 2242 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.810880829015544, + "grad_norm": 0.07209419887980668, + "kl": 0.0638427734375, + "learning_rate": 4.1917098445595856e-07, + "loss": 0.0007, + "reward": 2.4999972581863403, + "reward_std": 1.411313760968369e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 2243 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 5.813471502590674, + "grad_norm": 25.501845866337284, + "kl": 0.1220703125, + "learning_rate": 4.189119170984456e-07, + "loss": 0.0005, + "reward": 2.0199482440948486, + "reward_std": 0.19396816765572567, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5199483633041382, + "step": 2244 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.816062176165803, + "grad_norm": 0.6430320063363314, + "kl": 0.07391357421875, + "learning_rate": 4.186528497409326e-07, + "loss": 0.0005, + "reward": 2.4999845027923584, + "reward_std": 5.088538159725431e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999845027923584, + "step": 2245 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 5.818652849740933, + "grad_norm": 0.5210581394426964, + "kl": 0.106201171875, + "learning_rate": 4.1839378238341967e-07, + "loss": -0.0005, + "reward": 2.49999737739563, + "reward_std": 1.0632743681071588e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2246 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.821243523316062, + "grad_norm": 0.4816873869851018, + "kl": 0.144287109375, + "learning_rate": 4.181347150259067e-07, + "loss": 0.0014, + "reward": 2.499993324279785, + "reward_std": 3.965686005358293e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 2247 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.823834196891192, + "grad_norm": 0.3129965153917525, + "kl": 0.16015625, + "learning_rate": 4.1787564766839377e-07, + "loss": 0.0005, + "reward": 2.499985694885254, + "reward_std": 3.4691705650402582e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985694885254, + "step": 2248 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.8264248704663215, + "grad_norm": 0.10789318782576139, + "kl": 0.07763671875, + "learning_rate": 4.176165803108808e-07, + "loss": 0.0009, + "reward": 2.4999983310699463, + "reward_std": 1.131927888309292e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2249 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.829015544041451, + "grad_norm": 0.21243175156405217, + "kl": 0.062255859375, + "learning_rate": 4.173575129533679e-07, + "loss": 0.0011, + "reward": 2.499998092651367, + "reward_std": 1.3531694946777861e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.831606217616581, + "grad_norm": 0.18002468027140417, + "kl": 0.084716796875, + "learning_rate": 4.170984455958549e-07, + "loss": 0.0015, + "reward": 2.499990940093994, + "reward_std": 2.5257869538108935e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990999698639, + "step": 2251 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.83419689119171, + "grad_norm": 1.3388184311853821, + "kl": 0.096435546875, + "learning_rate": 4.16839378238342e-07, + "loss": 0.0015, + "reward": 2.499991297721863, + "reward_std": 1.028898668664624e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991238117218, + "step": 2252 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.83678756476684, + "grad_norm": 1.1999952049082878, + "kl": 0.0960693359375, + "learning_rate": 4.1658031088082903e-07, + "loss": -0.0, + "reward": 2.499968409538269, + "reward_std": 1.1583674222492846e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999682903289795, + "step": 2253 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.839378238341969, + "grad_norm": 39.17763780741709, + "kl": 0.125, + "learning_rate": 4.1632124352331603e-07, + "loss": 0.0009, + "reward": 1.9999246001243591, + "reward_std": 1.668057370807219e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999245703220367, + "step": 2254 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.841968911917099, + "grad_norm": 0.468784929670208, + "kl": 0.088134765625, + "learning_rate": 4.160621761658031e-07, + "loss": 0.0005, + "reward": 2.499991536140442, + "reward_std": 3.6417177966541203e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 2255 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.844559585492228, + "grad_norm": 1.7126957627828425, + "kl": 0.049072265625, + "learning_rate": 4.1580310880829014e-07, + "loss": -0.0001, + "reward": 2.499963164329529, + "reward_std": 7.067627279866429e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999632239341736, + "step": 2256 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.847150259067358, + "grad_norm": 0.8308825496417839, + "kl": 0.054931640625, + "learning_rate": 4.155440414507772e-07, + "loss": -0.0008, + "reward": 2.49999463558197, + "reward_std": 5.599392125077429e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 2257 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.849740932642487, + "grad_norm": 0.11325478470594708, + "kl": 0.0869140625, + "learning_rate": 4.1528497409326424e-07, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 1.4786303097480413e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2258 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.852331606217617, + "grad_norm": 2.4059508683216317, + "kl": 0.0400390625, + "learning_rate": 4.150259067357513e-07, + "loss": -0.0001, + "reward": 2.49998939037323, + "reward_std": 1.198946165459347e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998939037323, + "step": 2259 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 5.8549222797927465, + "grad_norm": 64.81923064237134, + "kl": 0.10791015625, + "learning_rate": 4.147668393782383e-07, + "loss": 0.0, + "reward": 1.9406970143318176, + "reward_std": 0.0027064363960107585, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4406971037387848, + "step": 2260 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 5.857512953367876, + "grad_norm": 18.61717993551045, + "kl": 0.077392578125, + "learning_rate": 4.1450777202072535e-07, + "loss": -0.0005, + "reward": 2.4374488592147827, + "reward_std": 0.176842640918494, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374489188194275, + "step": 2261 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.860103626943005, + "grad_norm": 0.3839268155528407, + "kl": 0.115966796875, + "learning_rate": 4.1424870466321246e-07, + "loss": 0.0012, + "reward": 2.49999737739563, + "reward_std": 1.5238573496390018e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2262 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.862694300518134, + "grad_norm": 0.9451527751307454, + "kl": 0.146484375, + "learning_rate": 4.1398963730569945e-07, + "loss": 0.0005, + "reward": 2.499990463256836, + "reward_std": 9.460845376452198e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 2263 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.865284974093264, + "grad_norm": 0.2375805343856624, + "kl": 0.093017578125, + "learning_rate": 4.137305699481865e-07, + "loss": 0.0012, + "reward": 2.499996781349182, + "reward_std": 1.9475403121305135e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2264 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.867875647668393, + "grad_norm": 2.259284429718665, + "kl": 0.0482177734375, + "learning_rate": 4.1347150259067356e-07, + "loss": 0.0005, + "reward": 2.499988555908203, + "reward_std": 9.087720968636859e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 2265 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.870466321243523, + "grad_norm": 13.354644445230344, + "kl": 0.197265625, + "learning_rate": 4.132124352331606e-07, + "loss": -0.0009, + "reward": 2.4999935626983643, + "reward_std": 5.236945298747742e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 2266 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.873056994818652, + "grad_norm": 0.29850912234813987, + "kl": 0.078857421875, + "learning_rate": 4.1295336787564767e-07, + "loss": 0.0003, + "reward": 2.4999918937683105, + "reward_std": 3.7276245166140143e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 2267 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.875647668393782, + "grad_norm": 0.22231653539422577, + "kl": 0.08349609375, + "learning_rate": 4.126943005181347e-07, + "loss": -0.0007, + "reward": 2.4999966621398926, + "reward_std": 3.2675197871867567e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2268 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.8782383419689115, + "grad_norm": 1.0605581151537387, + "kl": 0.0435791015625, + "learning_rate": 4.124352331606217e-07, + "loss": -0.0008, + "reward": 2.499995231628418, + "reward_std": 6.7132351659893175e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 2269 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.880829015544041, + "grad_norm": 0.8931260941604258, + "kl": 0.10205078125, + "learning_rate": 4.1217616580310877e-07, + "loss": 0.0004, + "reward": 2.499995708465576, + "reward_std": 3.7294799994924688e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2270 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.883419689119171, + "grad_norm": 1.9131796728206538, + "kl": 0.0816650390625, + "learning_rate": 4.119170984455959e-07, + "loss": 0.0002, + "reward": 2.4999866485595703, + "reward_std": 5.00178100537596e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986708164215, + "step": 2271 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.8860103626943, + "grad_norm": 0.3482697047420862, + "kl": 0.097900390625, + "learning_rate": 4.116580310880829e-07, + "loss": -0.0013, + "reward": 2.4999961853027344, + "reward_std": 3.3883881087604095e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 2272 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.88860103626943, + "grad_norm": 0.0913272102319602, + "kl": 0.0654296875, + "learning_rate": 4.1139896373056993e-07, + "loss": -0.0002, + "reward": 2.49999737739563, + "reward_std": 1.5297123070467933e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2273 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.891191709844559, + "grad_norm": 10.6582253283067, + "kl": 0.22900390625, + "learning_rate": 4.11139896373057e-07, + "loss": 0.0008, + "reward": 2.4999969005584717, + "reward_std": 3.437034706621489e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2274 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.893782383419689, + "grad_norm": 7.713193642665512, + "kl": 0.0203857421875, + "learning_rate": 4.1088082901554403e-07, + "loss": 0.0002, + "reward": 2.499983787536621, + "reward_std": 6.7634414335771e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999983787536621, + "step": 2275 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.896373056994818, + "grad_norm": 0.06318554239502905, + "kl": 0.09375, + "learning_rate": 4.106217616580311e-07, + "loss": -0.0007, + "reward": 2.49999737739563, + "reward_std": 1.5709822491771774e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2276 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 5.898963730569948, + "grad_norm": 0.7832830448124017, + "kl": 0.08428955078125, + "learning_rate": 4.1036269430051814e-07, + "loss": 0.0002, + "reward": 2.499984622001648, + "reward_std": 5.6210509455922875e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984622001648, + "step": 2277 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.901554404145077, + "grad_norm": 5.082952120669084, + "kl": 0.1495361328125, + "learning_rate": 4.1010362694300514e-07, + "loss": 0.0016, + "reward": 1.9785445928573608, + "reward_std": 0.00015241097628404532, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4785443544387817, + "step": 2278 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.904145077720207, + "grad_norm": 3.2575195808339767, + "kl": 0.08544921875, + "learning_rate": 4.098445595854922e-07, + "loss": 0.0009, + "reward": 2.499987006187439, + "reward_std": 1.7935386040335288e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999868273735046, + "step": 2279 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.9067357512953365, + "grad_norm": 0.2516948122473572, + "kl": 0.11328125, + "learning_rate": 4.095854922279793e-07, + "loss": 0.0002, + "reward": 2.4999977350234985, + "reward_std": 1.964046646207862e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2280 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.8125, + "epoch": 5.909326424870466, + "grad_norm": 4.390730158597342, + "kl": 0.1083984375, + "learning_rate": 4.093264248704663e-07, + "loss": -0.0009, + "reward": 2.499969244003296, + "reward_std": 2.5009020191646414e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999694228172302, + "step": 2281 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.9119170984455955, + "grad_norm": 0.0976951117432104, + "kl": 0.04888916015625, + "learning_rate": 4.0906735751295335e-07, + "loss": 0.0003, + "reward": 2.499997854232788, + "reward_std": 1.458956290889546e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2282 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.914507772020725, + "grad_norm": 3.9081106543482758, + "kl": 0.102783203125, + "learning_rate": 4.088082901554404e-07, + "loss": 0.0015, + "reward": 2.49997615814209, + "reward_std": 1.367864120993545e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999760389328003, + "step": 2283 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.917098445595855, + "grad_norm": 0.15136605964031002, + "kl": 0.08642578125, + "learning_rate": 4.085492227979274e-07, + "loss": -0.0001, + "reward": 2.4999895095825195, + "reward_std": 1.8938719108518853e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895691871643, + "step": 2284 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.919689119170984, + "grad_norm": 0.38461717284815056, + "kl": 0.080810546875, + "learning_rate": 4.082901554404145e-07, + "loss": 0.0003, + "reward": 2.4999969005584717, + "reward_std": 2.6105044526048005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967217445374, + "step": 2285 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.922279792746114, + "grad_norm": 0.9744700946403966, + "kl": 0.05615234375, + "learning_rate": 4.0803108808290156e-07, + "loss": 0.0011, + "reward": 2.499990463256836, + "reward_std": 6.561147756656283e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905228614807, + "step": 2286 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.924870466321243, + "grad_norm": 10.03995885014358, + "kl": 0.212890625, + "learning_rate": 4.0777202072538856e-07, + "loss": 0.0014, + "reward": 2.2499719858169556, + "reward_std": 0.26727062811175983, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.74997216463089, + "step": 2287 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.927461139896373, + "grad_norm": 1.3152257354057781, + "kl": 0.12646484375, + "learning_rate": 4.075129533678756e-07, + "loss": 0.0022, + "reward": 2.499967098236084, + "reward_std": 7.947738765778922e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999967098236084, + "step": 2288 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 5.930051813471502, + "grad_norm": 2.213297686456236, + "kl": 0.07470703125, + "learning_rate": 4.072538860103627e-07, + "loss": 0.0008, + "reward": 2.4999608993530273, + "reward_std": 6.7524352971304324e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999608397483826, + "step": 2289 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.932642487046632, + "grad_norm": 0.3785804313791765, + "kl": 0.093505859375, + "learning_rate": 4.069948186528497e-07, + "loss": 0.0002, + "reward": 2.4999969005584717, + "reward_std": 2.443111895900074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 2290 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.935233160621761, + "grad_norm": 0.8576274897768291, + "kl": 0.17724609375, + "learning_rate": 4.0673575129533677e-07, + "loss": 0.0006, + "reward": 2.4999879598617554, + "reward_std": 3.983369765592215e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999880194664001, + "step": 2291 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.937823834196891, + "grad_norm": 0.16636253356570307, + "kl": 0.067626953125, + "learning_rate": 4.064766839378238e-07, + "loss": -0.0, + "reward": 2.4999966621398926, + "reward_std": 3.2034688501880737e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967217445374, + "step": 2292 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.9404145077720205, + "grad_norm": 0.2857911595868281, + "kl": 0.123046875, + "learning_rate": 4.062176165803108e-07, + "loss": 0.0006, + "reward": 2.499984383583069, + "reward_std": 3.349826101839426e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984323978424, + "step": 2293 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.94300518134715, + "grad_norm": 2.9405861084546485, + "kl": 0.0599365234375, + "learning_rate": 4.0595854922279793e-07, + "loss": -0.0006, + "reward": 1.791157841682434, + "reward_std": 0.00025315592938568443, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2911579608917236, + "step": 2294 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.94559585492228, + "grad_norm": 1.8151398737084097, + "kl": 0.2303466796875, + "learning_rate": 4.05699481865285e-07, + "loss": 0.0024, + "reward": 2.499994993209839, + "reward_std": 3.7977763724938995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 2295 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.948186528497409, + "grad_norm": 0.2462071206252658, + "kl": 0.163330078125, + "learning_rate": 4.05440414507772e-07, + "loss": 0.0011, + "reward": 2.4999958276748657, + "reward_std": 2.7629565693132463e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2296 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.950777202072539, + "grad_norm": 0.265764427615182, + "kl": 0.193359375, + "learning_rate": 4.0518134715025903e-07, + "loss": 0.0014, + "reward": 2.4999979734420776, + "reward_std": 1.1679443900902697e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2297 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.953367875647668, + "grad_norm": 3.182257190997831, + "kl": 0.073974609375, + "learning_rate": 4.049222797927461e-07, + "loss": 0.0, + "reward": 2.499982237815857, + "reward_std": 1.1860133668051276e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 2298 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.955958549222798, + "grad_norm": 1.140015160951158, + "kl": 0.0635986328125, + "learning_rate": 4.0466321243523314e-07, + "loss": -0.0009, + "reward": 1.9999282360076904, + "reward_std": 9.737196592141117e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999284446239471, + "step": 2299 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.958549222797927, + "grad_norm": 4.912176966295773, + "kl": 0.12744140625, + "learning_rate": 4.044041450777202e-07, + "loss": -0.0, + "reward": 1.9980394840240479, + "reward_std": 7.877700500102947e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498039573431015, + "step": 2300 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.961139896373057, + "grad_norm": 1.4535986993672234, + "kl": 0.142822265625, + "learning_rate": 4.0414507772020724e-07, + "loss": 0.0012, + "reward": 1.9970449209213257, + "reward_std": 2.6568860562292684e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4970448017120361, + "step": 2301 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.963730569948186, + "grad_norm": 0.9690604957954418, + "kl": 0.02960205078125, + "learning_rate": 4.0388601036269424e-07, + "loss": 0.0004, + "reward": 2.499995708465576, + "reward_std": 5.5196461516970885e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 2302 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.966321243523316, + "grad_norm": 0.2586744370585928, + "kl": 0.0372314453125, + "learning_rate": 4.0362694300518135e-07, + "loss": 0.0004, + "reward": 2.499997854232788, + "reward_std": 1.919745244549631e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2303 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.9689119170984455, + "grad_norm": 3.213623711635333, + "kl": 0.10986328125, + "learning_rate": 4.033678756476684e-07, + "loss": 0.0004, + "reward": 2.4999728202819824, + "reward_std": 1.3485929230228066e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999728798866272, + "step": 2304 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 5.971502590673575, + "grad_norm": 0.14360237534431436, + "kl": 0.087646484375, + "learning_rate": 4.031088082901554e-07, + "loss": -0.0007, + "reward": 2.4999977350234985, + "reward_std": 1.5765271541567927e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2305 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.974093264248705, + "grad_norm": 0.32422052069237134, + "kl": 0.083251953125, + "learning_rate": 4.0284974093264246e-07, + "loss": 0.0001, + "reward": 2.499976634979248, + "reward_std": 4.600564125212259e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976634979248, + "step": 2306 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 5.976683937823834, + "grad_norm": 1.2502654641318598, + "kl": 0.144287109375, + "learning_rate": 4.025906735751295e-07, + "loss": 0.0003, + "reward": 0.9997566938400269, + "reward_std": 1.2651558790821582e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.49975669384002686, + "step": 2307 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 5.979274611398964, + "grad_norm": 2.36640930693435, + "kl": 0.0880126953125, + "learning_rate": 4.0233160621761656e-07, + "loss": 0.0005, + "reward": 1.992644727230072, + "reward_std": 8.817717161946348e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4926447868347168, + "step": 2308 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.981865284974093, + "grad_norm": 207.6614615362066, + "kl": 0.1142578125, + "learning_rate": 4.020725388601036e-07, + "loss": 0.0007, + "reward": 1.8027977347373962, + "reward_std": 0.0013666564709637896, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3027976751327515, + "step": 2309 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.984455958549223, + "grad_norm": 0.5044073531371609, + "kl": 0.2529296875, + "learning_rate": 4.0181347150259067e-07, + "loss": 0.0016, + "reward": 1.9999481439590454, + "reward_std": 8.086421985353809e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999480247497559, + "step": 2310 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.987046632124352, + "grad_norm": 1.722295266207358, + "kl": 0.06842041015625, + "learning_rate": 4.0155440414507767e-07, + "loss": 0.0004, + "reward": 1.9998886585235596, + "reward_std": 1.4046738670003833e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998886287212372, + "step": 2311 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 5.989637305699482, + "grad_norm": 48.613036036560196, + "kl": 0.1282958984375, + "learning_rate": 4.0129533678756477e-07, + "loss": 0.0007, + "reward": 1.9983825087547302, + "reward_std": 0.0025517589630794646, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983823895454407, + "step": 2312 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.992227979274611, + "grad_norm": 0.07756872166757646, + "kl": 0.05615234375, + "learning_rate": 4.010362694300518e-07, + "loss": 0.0008, + "reward": 2.4999988079071045, + "reward_std": 9.127613225246023e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 2313 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 5.994818652849741, + "grad_norm": 10.617348882922455, + "kl": 0.130126953125, + "learning_rate": 4.007772020725388e-07, + "loss": 0.0008, + "reward": 1.9964189529418945, + "reward_std": 0.004077342422419861, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4964188933372498, + "step": 2314 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 5.9974093264248705, + "grad_norm": 8.826440221321544, + "kl": 0.140380859375, + "learning_rate": 4.005181347150259e-07, + "loss": 0.0003, + "reward": 1.4999750852584839, + "reward_std": 1.787094515748322e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999750852584839, + "step": 2315 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 6.0, + "grad_norm": 618.0269634301646, + "kl": 0.094482421875, + "learning_rate": 4.0025906735751293e-07, + "loss": 0.0003, + "reward": 1.9447551369667053, + "reward_std": 0.011802842628526378, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4447550773620605, + "step": 2316 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.0025906735751295, + "grad_norm": 10.654795789281325, + "kl": 0.134765625, + "learning_rate": 4e-07, + "loss": 0.0004, + "reward": 1.9747610688209534, + "reward_std": 0.0006670133816442103, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4747611284255981, + "step": 2317 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.005181347150259, + "grad_norm": 1.0559056475824102, + "kl": 0.057861328125, + "learning_rate": 3.9974093264248703e-07, + "loss": 0.0003, + "reward": 1.9999327659606934, + "reward_std": 1.10519104055129e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999327063560486, + "step": 2318 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 6.007772020725389, + "grad_norm": 8.760096470912242, + "kl": 0.20947265625, + "learning_rate": 3.994818652849741e-07, + "loss": 0.001, + "reward": 2.3123667240142822, + "reward_std": 0.25893301262658497, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8123666644096375, + "step": 2319 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.010362694300518, + "grad_norm": 0.3458068214964244, + "kl": 0.0950927734375, + "learning_rate": 3.992227979274611e-07, + "loss": 0.0011, + "reward": 2.4999972581863403, + "reward_std": 1.749113096138899e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2320 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 6.012953367875648, + "grad_norm": 0.4965606278819686, + "kl": 0.0626220703125, + "learning_rate": 3.9896373056994814e-07, + "loss": -0.0003, + "reward": 2.4999942779541016, + "reward_std": 5.895196181882056e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 2321 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.015544041450777, + "grad_norm": 0.8356507835178411, + "kl": 0.0273284912109375, + "learning_rate": 3.9870466321243525e-07, + "loss": 0.0001, + "reward": 2.499987840652466, + "reward_std": 7.145436939026695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987781047821, + "step": 2322 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.018134715025907, + "grad_norm": 4.985023765354507, + "kl": 0.07708740234375, + "learning_rate": 3.9844559585492225e-07, + "loss": -0.0004, + "reward": 1.885767936706543, + "reward_std": 0.00013869718259229558, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3857680559158325, + "step": 2323 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.020725388601036, + "grad_norm": 15.54581644938938, + "kl": 0.224365234375, + "learning_rate": 3.981865284974093e-07, + "loss": 0.0009, + "reward": 1.8862226009368896, + "reward_std": 0.0003212960599512371, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3862226009368896, + "step": 2324 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.023316062176166, + "grad_norm": 0.10624159047556597, + "kl": 0.1002197265625, + "learning_rate": 3.9792746113989635e-07, + "loss": 0.0008, + "reward": 2.4999964237213135, + "reward_std": 2.0552841419885226e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 2325 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.025906735751295, + "grad_norm": 3.5611348229942386, + "kl": 0.092041015625, + "learning_rate": 3.976683937823834e-07, + "loss": -0.0001, + "reward": 1.9867271184921265, + "reward_std": 0.00010750261378689174, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4867271184921265, + "step": 2326 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.028497409326425, + "grad_norm": 0.1755119947228922, + "kl": 0.078857421875, + "learning_rate": 3.9740932642487046e-07, + "loss": 0.0003, + "reward": 2.499992609024048, + "reward_std": 2.1198463535654355e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 2327 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.0310880829015545, + "grad_norm": 4.3307376001626245, + "kl": 0.05517578125, + "learning_rate": 3.971502590673575e-07, + "loss": -0.0004, + "reward": 2.499932289123535, + "reward_std": 2.8998786547163036e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999324679374695, + "step": 2328 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.033678756476684, + "grad_norm": 5.295231778550412, + "kl": 0.078369140625, + "learning_rate": 3.968911917098445e-07, + "loss": -0.0001, + "reward": 2.499976634979248, + "reward_std": 5.20184062224871e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976933002472, + "step": 2329 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.036269430051814, + "grad_norm": 0.6789033840119268, + "kl": 0.150390625, + "learning_rate": 3.9663212435233156e-07, + "loss": 0.0005, + "reward": 2.499988079071045, + "reward_std": 9.061225227924297e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988079071045, + "step": 2330 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.038860103626943, + "grad_norm": 0.11302243359273849, + "kl": 0.080078125, + "learning_rate": 3.9637305699481867e-07, + "loss": 0.0003, + "reward": 2.4999940395355225, + "reward_std": 2.437762191220827e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 2331 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.041450777202073, + "grad_norm": 9.723115227050537, + "kl": 0.03411865234375, + "learning_rate": 3.9611398963730567e-07, + "loss": 0.0001, + "reward": 2.062425136566162, + "reward_std": 0.17679953361931666, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562425136566162, + "step": 2332 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.044041450777202, + "grad_norm": 8.45268253530421, + "kl": 0.0794677734375, + "learning_rate": 3.958549222797927e-07, + "loss": 0.001, + "reward": 1.999755084514618, + "reward_std": 5.181346568861045e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499755084514618, + "step": 2333 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.046632124352332, + "grad_norm": 0.4397343116828608, + "kl": 0.069580078125, + "learning_rate": 3.9559585492227977e-07, + "loss": -0.0008, + "reward": 2.4999847412109375, + "reward_std": 4.293986080483592e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999846816062927, + "step": 2334 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.049222797927461, + "grad_norm": 0.1385904049681267, + "kl": 0.03155517578125, + "learning_rate": 3.9533678756476677e-07, + "loss": -0.0003, + "reward": 2.4999935626983643, + "reward_std": 2.5664792246971047e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 2335 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.051813471502591, + "grad_norm": 0.10960552888821755, + "kl": 0.120361328125, + "learning_rate": 3.950777202072539e-07, + "loss": 0.0018, + "reward": 2.499997854232788, + "reward_std": 1.786342750165204e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2336 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 6.05440414507772, + "grad_norm": 4.950872204943883, + "kl": 0.0396728515625, + "learning_rate": 3.9481865284974093e-07, + "loss": -0.0005, + "reward": 2.437424659729004, + "reward_std": 0.17678508458675424, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374247193336487, + "step": 2337 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.05699481865285, + "grad_norm": 0.7291686825902829, + "kl": 0.0693359375, + "learning_rate": 3.9455958549222793e-07, + "loss": -0.0002, + "reward": 1.9998353719711304, + "reward_std": 1.0741312280515558e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49983549118042, + "step": 2338 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.0595854922279795, + "grad_norm": 0.36851781306139475, + "kl": 0.068115234375, + "learning_rate": 3.94300518134715e-07, + "loss": -0.0, + "reward": 2.4999977350234985, + "reward_std": 1.7464092820773658e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2339 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.062176165803109, + "grad_norm": 2.2244372405694235, + "kl": 0.155029296875, + "learning_rate": 3.940414507772021e-07, + "loss": -0.0008, + "reward": 2.4999849796295166, + "reward_std": 8.654305815980479e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852180480957, + "step": 2340 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.064766839378239, + "grad_norm": 1.2045213300003665, + "kl": 0.081787109375, + "learning_rate": 3.937823834196891e-07, + "loss": -0.0003, + "reward": 2.499994397163391, + "reward_std": 3.8123297372294473e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 2341 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4375, + "epoch": 6.067357512953368, + "grad_norm": 0.46707770838607143, + "kl": 0.0535888671875, + "learning_rate": 3.9352331606217614e-07, + "loss": 0.0011, + "reward": 2.4999784231185913, + "reward_std": 3.493684573641076e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781847000122, + "step": 2342 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.069948186528498, + "grad_norm": 0.35306793860459146, + "kl": 0.126953125, + "learning_rate": 3.932642487046632e-07, + "loss": 0.0018, + "reward": 2.4999961853027344, + "reward_std": 2.3344006194747635e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 2343 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.072538860103627, + "grad_norm": 15.222216378318482, + "kl": 0.4169921875, + "learning_rate": 3.930051813471502e-07, + "loss": 0.002, + "reward": 1.9559217691421509, + "reward_std": 0.00022551812344318023, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.455921620130539, + "step": 2344 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.075129533678757, + "grad_norm": 15.289137682587517, + "kl": 0.12353515625, + "learning_rate": 3.927461139896373e-07, + "loss": 0.0005, + "reward": 2.124322831630707, + "reward_std": 0.2316927479822084, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6243227124214172, + "step": 2345 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.077720207253886, + "grad_norm": 1.9474240143237185, + "kl": 0.15673828125, + "learning_rate": 3.9248704663212435e-07, + "loss": -0.0004, + "reward": 2.4999842643737793, + "reward_std": 9.061502169060986e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999842643737793, + "step": 2346 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.080310880829016, + "grad_norm": 6.800640322470947, + "kl": 0.056640625, + "learning_rate": 3.9222797927461135e-07, + "loss": -0.0005, + "reward": 2.499996542930603, + "reward_std": 5.2583898764169135e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2347 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.082901554404145, + "grad_norm": 4.318061329351835, + "kl": 0.0677490234375, + "learning_rate": 3.919689119170984e-07, + "loss": 0.0004, + "reward": 2.4998890161514282, + "reward_std": 3.4543336823844584e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998890161514282, + "step": 2348 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.085492227979275, + "grad_norm": 0.8762614128269265, + "kl": 0.096923828125, + "learning_rate": 3.917098445595855e-07, + "loss": 0.0008, + "reward": 2.499986171722412, + "reward_std": 6.506035333586624e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999861121177673, + "step": 2349 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.0880829015544045, + "grad_norm": 1.330985053445535, + "kl": 0.11767578125, + "learning_rate": 3.9145077720207256e-07, + "loss": 0.0, + "reward": 2.4999892711639404, + "reward_std": 5.188443111592278e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999893307685852, + "step": 2350 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.090673575129534, + "grad_norm": 0.511445006321932, + "kl": 0.236328125, + "learning_rate": 3.9119170984455956e-07, + "loss": 0.0011, + "reward": 2.499994993209839, + "reward_std": 3.287688855380111e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 2351 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.0932642487046635, + "grad_norm": 0.24903680533846317, + "kl": 0.09375, + "learning_rate": 3.909326424870466e-07, + "loss": 0.001, + "reward": 2.499991297721863, + "reward_std": 3.5320894085089094e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 2352 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.095854922279793, + "grad_norm": 0.23272059628696898, + "kl": 0.075439453125, + "learning_rate": 3.906735751295336e-07, + "loss": 0.0006, + "reward": 2.499997138977051, + "reward_std": 1.9101866541859636e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 2353 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.098445595854923, + "grad_norm": 3.9302737178453357, + "kl": 0.1279296875, + "learning_rate": 3.904145077720207e-07, + "loss": 0.0012, + "reward": 2.499966025352478, + "reward_std": 1.6063233488239348e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999658465385437, + "step": 2354 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.101036269430052, + "grad_norm": 0.8034716883525865, + "kl": 0.08642578125, + "learning_rate": 3.9015544041450777e-07, + "loss": 0.0012, + "reward": 2.499990701675415, + "reward_std": 8.419689038419165e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999907612800598, + "step": 2355 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.103626943005182, + "grad_norm": 1.135556782867008, + "kl": 0.0830078125, + "learning_rate": 3.898963730569948e-07, + "loss": -0.0004, + "reward": 2.4999920129776, + "reward_std": 3.968956320932193e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 2356 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.106217616580311, + "grad_norm": 11.357492140788507, + "kl": 0.165283203125, + "learning_rate": 3.896373056994818e-07, + "loss": 0.0016, + "reward": 1.9946427941322327, + "reward_std": 0.00024436906829805594, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4946427941322327, + "step": 2357 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 6.108808290155441, + "grad_norm": 16.039766478649344, + "kl": 0.0693359375, + "learning_rate": 3.893782383419689e-07, + "loss": 0.0001, + "reward": 2.0578293800354004, + "reward_std": 0.17866243754724564, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5578293204307556, + "step": 2358 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.11139896373057, + "grad_norm": 0.9359369537647699, + "kl": 0.23046875, + "learning_rate": 3.89119170984456e-07, + "loss": 0.0001, + "reward": 2.4999769926071167, + "reward_std": 1.186994541058084e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999769926071167, + "step": 2359 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.1139896373057, + "grad_norm": 1.4206029868885972, + "kl": 0.0797119140625, + "learning_rate": 3.88860103626943e-07, + "loss": 0.0002, + "reward": 2.4999903440475464, + "reward_std": 4.670081082736033e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 2360 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.116580310880829, + "grad_norm": 0.10276242879487937, + "kl": 0.0360107421875, + "learning_rate": 3.8860103626943004e-07, + "loss": -0.0004, + "reward": 2.499998688697815, + "reward_std": 1.355124410906683e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 2361 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.119170984455959, + "grad_norm": 1.6755711852549475, + "kl": 0.10498046875, + "learning_rate": 3.883419689119171e-07, + "loss": 0.0001, + "reward": 1.9995163083076477, + "reward_std": 2.0942025912518147e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995163977146149, + "step": 2362 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.1217616580310885, + "grad_norm": 0.17905306664353854, + "kl": 0.0947265625, + "learning_rate": 3.8808290155440414e-07, + "loss": 0.0008, + "reward": 2.499996542930603, + "reward_std": 2.2216927391127683e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 2363 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.124352331606218, + "grad_norm": 2.366258023022168, + "kl": 0.1270751953125, + "learning_rate": 3.878238341968912e-07, + "loss": 0.0009, + "reward": 2.4999701976776123, + "reward_std": 1.4471586666786607e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999701976776123, + "step": 2364 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.126943005181348, + "grad_norm": 0.11137391593984437, + "kl": 0.05615234375, + "learning_rate": 3.8756476683937825e-07, + "loss": 0.0009, + "reward": 2.4999961853027344, + "reward_std": 2.0130917732785747e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2365 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.129533678756476, + "grad_norm": 0.23382651128442436, + "kl": 0.082275390625, + "learning_rate": 3.8730569948186525e-07, + "loss": 0.0006, + "reward": 2.499998450279236, + "reward_std": 1.8829093164640653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2366 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.132124352331606, + "grad_norm": 0.43113087082188417, + "kl": 0.118408203125, + "learning_rate": 3.870466321243523e-07, + "loss": 0.0012, + "reward": 2.4999966621398926, + "reward_std": 4.101755166630028e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 2367 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.134715025906735, + "grad_norm": 7.311207451974099, + "kl": 0.158203125, + "learning_rate": 3.867875647668394e-07, + "loss": 0.0008, + "reward": 1.8704302310943604, + "reward_std": 0.0007156892678494842, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.370430201292038, + "step": 2368 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.137305699481865, + "grad_norm": 4.355334700177657, + "kl": 0.066009521484375, + "learning_rate": 3.865284974093264e-07, + "loss": 0.0001, + "reward": 2.499995231628418, + "reward_std": 3.986485751283908e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 2369 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.139896373056994, + "grad_norm": 2.7361862959037926, + "kl": 0.2919921875, + "learning_rate": 3.8626943005181346e-07, + "loss": 0.0012, + "reward": 2.499988555908203, + "reward_std": 1.8688807813305175e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 2370 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.142487046632124, + "grad_norm": 0.3804187901716751, + "kl": 0.0870361328125, + "learning_rate": 3.860103626943005e-07, + "loss": 0.0009, + "reward": 2.4999749660491943, + "reward_std": 4.024402869617916e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999750256538391, + "step": 2371 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.1450777202072535, + "grad_norm": 1.4386344215519704, + "kl": 0.106689453125, + "learning_rate": 3.8575129533678756e-07, + "loss": 0.0, + "reward": 2.4999932050704956, + "reward_std": 5.657253211666102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 2372 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.147668393782383, + "grad_norm": 0.422438328427756, + "kl": 0.0399169921875, + "learning_rate": 3.854922279792746e-07, + "loss": 0.0, + "reward": 2.4999982118606567, + "reward_std": 1.5368167396445642e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 2373 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.150259067357513, + "grad_norm": 0.0672043231013455, + "kl": 0.0595703125, + "learning_rate": 3.8523316062176167e-07, + "loss": -0.0001, + "reward": 2.499997615814209, + "reward_std": 1.1967109116994834e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2374 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.152849740932642, + "grad_norm": 24.710676548637306, + "kl": 0.078857421875, + "learning_rate": 3.8497409326424867e-07, + "loss": -0.0001, + "reward": 2.12496554851532, + "reward_std": 0.23146837626444494, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6249657273292542, + "step": 2375 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.155440414507772, + "grad_norm": 0.1475410465621916, + "kl": 0.0604248046875, + "learning_rate": 3.847150259067357e-07, + "loss": -0.0003, + "reward": 2.499997615814209, + "reward_std": 1.4631035014645022e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2376 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.158031088082901, + "grad_norm": 4.839287582198097, + "kl": 0.0743408203125, + "learning_rate": 3.844559585492228e-07, + "loss": 0.0003, + "reward": 1.9983254671096802, + "reward_std": 8.763026744418312e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983254373073578, + "step": 2377 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.160621761658031, + "grad_norm": 0.7249268727229805, + "kl": 0.08544921875, + "learning_rate": 3.841968911917098e-07, + "loss": 0.001, + "reward": 2.4999899864196777, + "reward_std": 6.038415222064941e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999899864196777, + "step": 2378 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.16321243523316, + "grad_norm": 17.197988421710985, + "kl": 0.0782470703125, + "learning_rate": 3.839378238341969e-07, + "loss": 0.0006, + "reward": 2.1870521306991577, + "reward_std": 0.2590118725811408, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6870522499084473, + "step": 2379 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.16580310880829, + "grad_norm": 2.3385333135440547, + "kl": 0.05126953125, + "learning_rate": 3.8367875647668393e-07, + "loss": 0.0006, + "reward": 2.4999810457229614, + "reward_std": 1.3160841263015755e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999809265136719, + "step": 2380 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.168393782383419, + "grad_norm": 0.2032526934099008, + "kl": 0.0830078125, + "learning_rate": 3.8341968911917093e-07, + "loss": 0.0006, + "reward": 2.499993681907654, + "reward_std": 2.9676965596081573e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 2381 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.170984455958549, + "grad_norm": 0.2744820953494261, + "kl": 0.141357421875, + "learning_rate": 3.8316062176165804e-07, + "loss": 0.0009, + "reward": 1.9998565316200256, + "reward_std": 7.248964379868994e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998565316200256, + "step": 2382 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.1735751295336785, + "grad_norm": 7.786644955397623, + "kl": 0.098388671875, + "learning_rate": 3.829015544041451e-07, + "loss": 0.0013, + "reward": 2.499973773956299, + "reward_std": 2.4796958967954197e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999736547470093, + "step": 2383 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.176165803108808, + "grad_norm": 0.3896390050694339, + "kl": 0.05902099609375, + "learning_rate": 3.826424870466321e-07, + "loss": -0.0001, + "reward": 2.499996066093445, + "reward_std": 4.187430249658064e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2384 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.178756476683938, + "grad_norm": 8.01139434095062, + "kl": 0.1357421875, + "learning_rate": 3.8238341968911914e-07, + "loss": 0.0008, + "reward": 2.4998979568481445, + "reward_std": 6.750745450290196e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998978972434998, + "step": 2385 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.181347150259067, + "grad_norm": 1.3089080578138994, + "kl": 0.081298828125, + "learning_rate": 3.8212435233160625e-07, + "loss": 0.001, + "reward": 2.499968647956848, + "reward_std": 5.661656530264736e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999687671661377, + "step": 2386 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.183937823834197, + "grad_norm": 0.8460749804933171, + "kl": 0.0560302734375, + "learning_rate": 3.8186528497409325e-07, + "loss": -0.0002, + "reward": 2.499961256980896, + "reward_std": 5.853392508470279e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999961495399475, + "step": 2387 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.186528497409326, + "grad_norm": 413.435949378909, + "kl": 28.94873046875, + "learning_rate": 3.816062176165803e-07, + "loss": 0.1159, + "reward": 1.9697168469429016, + "reward_std": 0.005213464294797632, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4697167873382568, + "step": 2388 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.189119170984456, + "grad_norm": 1.7810366446013317, + "kl": 0.05810546875, + "learning_rate": 3.8134715025906735e-07, + "loss": -0.0001, + "reward": 1.9998552203178406, + "reward_std": 1.3732143401057328e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998551905155182, + "step": 2389 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.191709844559585, + "grad_norm": 0.23300915999837846, + "kl": 0.17578125, + "learning_rate": 3.8108808290155435e-07, + "loss": -0.0002, + "reward": 1.9999127388000488, + "reward_std": 5.1483686434039555e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999129474163055, + "step": 2390 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.194300518134715, + "grad_norm": 1.8944082449054434, + "kl": 0.0712890625, + "learning_rate": 3.8082901554404146e-07, + "loss": 0.0001, + "reward": 2.4999886751174927, + "reward_std": 7.113919338053165e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 2391 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.196891191709844, + "grad_norm": 3.3598542923491004, + "kl": 0.068603515625, + "learning_rate": 3.805699481865285e-07, + "loss": -0.0005, + "reward": 1.995642900466919, + "reward_std": 4.7626547484469484e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4956431090831757, + "step": 2392 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.199481865284974, + "grad_norm": 4.920157378325246, + "kl": 0.16455078125, + "learning_rate": 3.803108808290155e-07, + "loss": 0.0005, + "reward": 1.9938519597053528, + "reward_std": 6.516680218737747e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4938518404960632, + "step": 2393 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.2020725388601035, + "grad_norm": 0.8087563064292018, + "kl": 0.1531982421875, + "learning_rate": 3.8005181347150256e-07, + "loss": 0.0005, + "reward": 2.4999749660491943, + "reward_std": 5.67804681850248e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999749660491943, + "step": 2394 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.125, + "epoch": 6.204663212435233, + "grad_norm": 34.140869391999026, + "kl": 0.07080078125, + "learning_rate": 3.797927461139896e-07, + "loss": 0.0007, + "reward": 2.437418222427368, + "reward_std": 0.17700132422399406, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374181032180786, + "step": 2395 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.2072538860103625, + "grad_norm": 0.1946493762075473, + "kl": 0.0528564453125, + "learning_rate": 3.7953367875647667e-07, + "loss": 0.0005, + "reward": 2.499996066093445, + "reward_std": 2.4461089083160914e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 2396 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 6.209844559585492, + "grad_norm": 15.771830647704563, + "kl": 0.17236328125, + "learning_rate": 3.792746113989637e-07, + "loss": -0.0004, + "reward": 2.4374654293060303, + "reward_std": 0.17682881184941834, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374655485153198, + "step": 2397 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.212435233160622, + "grad_norm": 0.06170751764246492, + "kl": 0.114013671875, + "learning_rate": 3.7901554404145077e-07, + "loss": 0.0006, + "reward": 2.499998927116394, + "reward_std": 1.6137463489940274e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999989867210388, + "step": 2398 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.215025906735751, + "grad_norm": 8.549851188408928, + "kl": 0.0947265625, + "learning_rate": 3.7875647668393777e-07, + "loss": -0.0007, + "reward": 2.4999760389328003, + "reward_std": 1.4691032561131578e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976098537445, + "step": 2399 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.217616580310881, + "grad_norm": 2.1660914798541238, + "kl": 0.04266357421875, + "learning_rate": 3.784974093264249e-07, + "loss": 0.0011, + "reward": 1.9037814736366272, + "reward_std": 0.00017656411864663824, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4037814140319824, + "step": 2400 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5625, + "epoch": 6.22020725388601, + "grad_norm": 0.6186428060767203, + "kl": 0.222900390625, + "learning_rate": 3.7823834196891193e-07, + "loss": 0.0009, + "reward": 2.499996066093445, + "reward_std": 1.0282419395934994e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 2401 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.22279792746114, + "grad_norm": 4.970433436994633, + "kl": 0.421875, + "learning_rate": 3.7797927461139893e-07, + "loss": 0.0016, + "reward": 2.4999921321868896, + "reward_std": 7.805063319210603e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 2402 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.225388601036269, + "grad_norm": 52.44956473278859, + "kl": 0.208984375, + "learning_rate": 3.77720207253886e-07, + "loss": 0.0017, + "reward": 1.9954881072044373, + "reward_std": 0.0034883645831769172, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4954880475997925, + "step": 2403 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.227979274611399, + "grad_norm": 0.4716565998080233, + "kl": 0.1103515625, + "learning_rate": 3.7746113989637304e-07, + "loss": 0.0001, + "reward": 2.499995470046997, + "reward_std": 3.080721768355943e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 2404 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.230569948186528, + "grad_norm": 0.3172445475524281, + "kl": 0.112945556640625, + "learning_rate": 3.772020725388601e-07, + "loss": 0.0006, + "reward": 2.4999889135360718, + "reward_std": 3.829602064797655e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988853931427, + "step": 2405 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.233160621761658, + "grad_norm": 0.2661678027079654, + "kl": 0.0830078125, + "learning_rate": 3.7694300518134714e-07, + "loss": -0.0007, + "reward": 2.4999966621398926, + "reward_std": 2.497837044757034e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2406 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.2357512953367875, + "grad_norm": 0.30059252457158375, + "kl": 0.064208984375, + "learning_rate": 3.766839378238342e-07, + "loss": -0.0, + "reward": 2.4999938011169434, + "reward_std": 3.6139878147878335e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 2407 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.238341968911917, + "grad_norm": 0.16956674398434016, + "kl": 0.0830078125, + "learning_rate": 3.764248704663212e-07, + "loss": 0.0016, + "reward": 2.4999899864196777, + "reward_std": 2.0162952409918944e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999897480010986, + "step": 2408 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.240932642487047, + "grad_norm": 0.12519663363735328, + "kl": 0.0648193359375, + "learning_rate": 3.761658031088083e-07, + "loss": 0.0009, + "reward": 2.499998688697815, + "reward_std": 1.0215590577900002e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 2409 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.243523316062176, + "grad_norm": 1.7237133933044158, + "kl": 0.075439453125, + "learning_rate": 3.7590673575129535e-07, + "loss": -0.0001, + "reward": 2.499948263168335, + "reward_std": 1.0183679023612058e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999948263168335, + "step": 2410 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.246113989637306, + "grad_norm": 1.961533630093995, + "kl": 0.06494140625, + "learning_rate": 3.7564766839378235e-07, + "loss": -0.0004, + "reward": 2.4999901056289673, + "reward_std": 6.170386313897325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 2411 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.248704663212435, + "grad_norm": 0.360889730102968, + "kl": 0.0986328125, + "learning_rate": 3.753886010362694e-07, + "loss": -0.0015, + "reward": 2.499983072280884, + "reward_std": 3.5920837717640097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999832510948181, + "step": 2412 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.251295336787565, + "grad_norm": 10.450852956119045, + "kl": 0.107666015625, + "learning_rate": 3.7512953367875646e-07, + "loss": 0.0017, + "reward": 2.499994993209839, + "reward_std": 2.394451257714536e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 2413 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.253886010362694, + "grad_norm": 2.526678788309054, + "kl": 0.1138916015625, + "learning_rate": 3.748704663212435e-07, + "loss": 0.0011, + "reward": 1.9975848197937012, + "reward_std": 4.312438233000648e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975847899913788, + "step": 2414 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.256476683937824, + "grad_norm": 0.4066458451787601, + "kl": 0.0849609375, + "learning_rate": 3.7461139896373056e-07, + "loss": 0.001, + "reward": 2.49999463558197, + "reward_std": 4.263770051693427e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 2415 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.259067357512953, + "grad_norm": 1.3257820718628153, + "kl": 0.0426025390625, + "learning_rate": 3.743523316062176e-07, + "loss": 0.0011, + "reward": 2.499985933303833, + "reward_std": 6.4517505506955786e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999858140945435, + "step": 2416 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.261658031088083, + "grad_norm": 0.11876501143050537, + "kl": 0.0478515625, + "learning_rate": 3.740932642487046e-07, + "loss": 0.0002, + "reward": 2.499997615814209, + "reward_std": 1.4190826505000587e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2417 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.2642487046632125, + "grad_norm": 0.1581385295322136, + "kl": 0.125, + "learning_rate": 3.7383419689119167e-07, + "loss": 0.0007, + "reward": 2.4999938011169434, + "reward_std": 1.5311854895116994e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 2418 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.266839378238342, + "grad_norm": 31.227476040820765, + "kl": 0.1396484375, + "learning_rate": 3.7357512953367877e-07, + "loss": 0.0006, + "reward": 2.0551145672798157, + "reward_std": 0.17975615236883868, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5551146268844604, + "step": 2419 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.269430051813472, + "grad_norm": 3.9667463256526245, + "kl": 0.087890625, + "learning_rate": 3.7331606217616577e-07, + "loss": 0.0015, + "reward": 2.4999780654907227, + "reward_std": 1.2078240274604468e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999977946281433, + "step": 2420 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.272020725388601, + "grad_norm": 0.23347583948390563, + "kl": 0.0792236328125, + "learning_rate": 3.730569948186528e-07, + "loss": 0.0002, + "reward": 2.499984860420227, + "reward_std": 2.497193008821341e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 2421 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 6.274611398963731, + "grad_norm": 120.61255256389367, + "kl": 0.15771484375, + "learning_rate": 3.727979274611399e-07, + "loss": 0.0008, + "reward": 1.2116374969482422, + "reward_std": 0.035305225504998816, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7116374969482422, + "step": 2422 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.27720207253886, + "grad_norm": 2.437600293856105, + "kl": 0.111328125, + "learning_rate": 3.7253886010362693e-07, + "loss": 0.0007, + "reward": 2.4999897480010986, + "reward_std": 6.014159225742333e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989628791809, + "step": 2423 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.27979274611399, + "grad_norm": 0.4290821560071071, + "kl": 0.0833740234375, + "learning_rate": 3.72279792746114e-07, + "loss": -0.0014, + "reward": 2.4999876022338867, + "reward_std": 5.146757075635833e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 2424 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.282383419689119, + "grad_norm": 0.1600028867259204, + "kl": 0.09637451171875, + "learning_rate": 3.7202072538860104e-07, + "loss": 0.0004, + "reward": 2.4999983310699463, + "reward_std": 2.7850650781147124e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2425 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.284974093264249, + "grad_norm": 22.826085820946783, + "kl": 0.18798828125, + "learning_rate": 3.7176165803108804e-07, + "loss": 0.001, + "reward": 1.9742602109909058, + "reward_std": 0.00019707344745256705, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.474260151386261, + "step": 2426 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.287564766839378, + "grad_norm": 0.22442671006552858, + "kl": 0.134033203125, + "learning_rate": 3.715025906735751e-07, + "loss": 0.0001, + "reward": 1.4999973773956299, + "reward_std": 1.2898391332782921e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999974966049194, + "step": 2427 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.290155440414508, + "grad_norm": 1.3144386377961415, + "kl": 0.063720703125, + "learning_rate": 3.712435233160622e-07, + "loss": -0.0006, + "reward": 2.499995470046997, + "reward_std": 3.925233215795743e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 2428 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.2927461139896375, + "grad_norm": 0.7125709381731127, + "kl": 0.18603515625, + "learning_rate": 3.709844559585492e-07, + "loss": 0.0019, + "reward": 2.4999938011169434, + "reward_std": 6.47274009679677e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 2429 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.295336787564767, + "grad_norm": 2.1572894455514775, + "kl": 0.11083984375, + "learning_rate": 3.7072538860103625e-07, + "loss": 0.0004, + "reward": 2.4999969005584717, + "reward_std": 2.5959349159165868e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2430 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.2979274611398965, + "grad_norm": 0.33131636709164686, + "kl": 0.133056640625, + "learning_rate": 3.704663212435233e-07, + "loss": 0.0011, + "reward": 2.499998688697815, + "reward_std": 1.119217273526374e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 2431 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.300518134715026, + "grad_norm": 0.31916247505055684, + "kl": 0.05419921875, + "learning_rate": 3.7020725388601035e-07, + "loss": -0.0007, + "reward": 2.4999982118606567, + "reward_std": 1.874835731996427e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 2432 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.303108808290156, + "grad_norm": 0.7371616213570794, + "kl": 0.14599609375, + "learning_rate": 3.699481865284974e-07, + "loss": 0.0012, + "reward": 2.499993324279785, + "reward_std": 3.091249539011187e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 2433 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.305699481865285, + "grad_norm": 7.84420019990756, + "kl": 0.0811767578125, + "learning_rate": 3.6968911917098446e-07, + "loss": 0.0004, + "reward": 2.499984860420227, + "reward_std": 4.193630019244665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984860420227, + "step": 2434 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.308290155440415, + "grad_norm": 10.086134480363164, + "kl": 0.071044921875, + "learning_rate": 3.6943005181347146e-07, + "loss": -0.0005, + "reward": 1.8229429721832275, + "reward_std": 0.00032223474909187644, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3229431211948395, + "step": 2435 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.310880829015544, + "grad_norm": 2.147529720812094, + "kl": 0.15478515625, + "learning_rate": 3.691709844559585e-07, + "loss": 0.001, + "reward": 2.4999783039093018, + "reward_std": 9.526685516902944e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999978244304657, + "step": 2436 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.313471502590674, + "grad_norm": 0.3526405909633918, + "kl": 0.0701904296875, + "learning_rate": 3.689119170984456e-07, + "loss": 0.0001, + "reward": 2.4999940395355225, + "reward_std": 4.587311536852212e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 2437 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.316062176165803, + "grad_norm": 2.506128111942887, + "kl": 0.0531005859375, + "learning_rate": 3.686528497409326e-07, + "loss": -0.0001, + "reward": 2.4999901056289673, + "reward_std": 1.081242282907624e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999901056289673, + "step": 2438 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.318652849740933, + "grad_norm": 12.715547924923909, + "kl": 0.25390625, + "learning_rate": 3.6839378238341967e-07, + "loss": 0.0012, + "reward": 1.9230295419692993, + "reward_std": 0.0002926438269241771, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4230294525623322, + "step": 2439 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.321243523316062, + "grad_norm": 0.13738235770219653, + "kl": 0.063232421875, + "learning_rate": 3.681347150259067e-07, + "loss": 0.0008, + "reward": 2.4999979734420776, + "reward_std": 1.6053691069828346e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2440 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.323834196891192, + "grad_norm": 0.17818110736491416, + "kl": 0.068115234375, + "learning_rate": 3.678756476683937e-07, + "loss": 0.0006, + "reward": 2.4999961853027344, + "reward_std": 1.9105841886357666e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 2441 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.3264248704663215, + "grad_norm": 0.5909619139462969, + "kl": 0.07861328125, + "learning_rate": 3.676165803108808e-07, + "loss": 0.0005, + "reward": 2.4999940395355225, + "reward_std": 3.80162748570001e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 2442 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.329015544041451, + "grad_norm": 0.8296793834573871, + "kl": 0.0791015625, + "learning_rate": 3.673575129533679e-07, + "loss": 0.0006, + "reward": 2.4999938011169434, + "reward_std": 3.2873880400074995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 2443 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.331606217616581, + "grad_norm": 7.329533403850742, + "kl": 0.0517578125, + "learning_rate": 3.670984455958549e-07, + "loss": -0.0005, + "reward": 1.791582703590393, + "reward_std": 0.0005982934924304573, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2915826737880707, + "step": 2444 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.33419689119171, + "grad_norm": 2.633197294463325, + "kl": 0.136962890625, + "learning_rate": 3.6683937823834193e-07, + "loss": 0.0015, + "reward": 2.499969005584717, + "reward_std": 1.4033648312761215e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999691247940063, + "step": 2445 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.33678756476684, + "grad_norm": 0.08326390705019363, + "kl": 0.02740478515625, + "learning_rate": 3.6658031088082904e-07, + "loss": 0.0003, + "reward": 2.499998092651367, + "reward_std": 1.101278087389801e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2446 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.339378238341969, + "grad_norm": 15.838106354699958, + "kl": 0.0924072265625, + "learning_rate": 3.6632124352331604e-07, + "loss": 0.0005, + "reward": 1.9271252155303955, + "reward_std": 0.0009331158395866623, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4271252751350403, + "step": 2447 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.341968911917099, + "grad_norm": 23.853398046024505, + "kl": 0.0771484375, + "learning_rate": 3.660621761658031e-07, + "loss": 0.0011, + "reward": 2.187396287918091, + "reward_std": 0.2588264785908905, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6873961091041565, + "step": 2448 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.344559585492228, + "grad_norm": 8.159731537392343, + "kl": 0.11962890625, + "learning_rate": 3.6580310880829014e-07, + "loss": 0.0007, + "reward": 1.9991435408592224, + "reward_std": 7.163561326706258e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991436004638672, + "step": 2449 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.347150259067358, + "grad_norm": 0.4735492137615518, + "kl": 0.0771484375, + "learning_rate": 3.6554404145077714e-07, + "loss": 0.0005, + "reward": 2.4990309476852417, + "reward_std": 1.1790068015216093e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9990310668945312, + "step": 2450 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.349740932642487, + "grad_norm": 1.0193662662405758, + "kl": 0.1568603515625, + "learning_rate": 3.6528497409326425e-07, + "loss": 0.0011, + "reward": 2.4999947547912598, + "reward_std": 5.042333100391261e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 2451 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.352331606217617, + "grad_norm": 2.666664856836925, + "kl": 0.1982421875, + "learning_rate": 3.650259067357513e-07, + "loss": 0.0013, + "reward": 1.9991455078125, + "reward_std": 3.205962684660335e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991456270217896, + "step": 2452 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.3549222797927465, + "grad_norm": 0.03463561128019328, + "kl": 0.11865234375, + "learning_rate": 3.647668393782383e-07, + "loss": -0.0006, + "reward": 2.4999985694885254, + "reward_std": 1.1470037861727178e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 2453 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.357512953367876, + "grad_norm": 0.46676428969552564, + "kl": 0.032470703125, + "learning_rate": 3.6450777202072535e-07, + "loss": 0.0014, + "reward": 2.4999947547912598, + "reward_std": 3.23801043577987e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945759773254, + "step": 2454 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.360103626943006, + "grad_norm": 1.8227157687944535, + "kl": 0.107177734375, + "learning_rate": 3.642487046632124e-07, + "loss": 0.0007, + "reward": 2.4999886751174927, + "reward_std": 1.1016374628525227e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 2455 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.362694300518135, + "grad_norm": 2.5570426477307215, + "kl": 0.051513671875, + "learning_rate": 3.6398963730569946e-07, + "loss": -0.0002, + "reward": 2.499990224838257, + "reward_std": 7.5415306355353096e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902844429016, + "step": 2456 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.365284974093265, + "grad_norm": 0.18748823802567555, + "kl": 0.146728515625, + "learning_rate": 3.637305699481865e-07, + "loss": 0.0, + "reward": 2.4999970197677612, + "reward_std": 3.3792812246247195e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2457 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.367875647668393, + "grad_norm": 0.6499323497772118, + "kl": 0.08154296875, + "learning_rate": 3.6347150259067356e-07, + "loss": 0.0018, + "reward": 2.499994993209839, + "reward_std": 4.347981644059473e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 2458 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.370466321243523, + "grad_norm": 3.597028279956446, + "kl": 0.093505859375, + "learning_rate": 3.6321243523316056e-07, + "loss": 0.001, + "reward": 2.4999821186065674, + "reward_std": 9.831991519604344e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999820590019226, + "step": 2459 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.373056994818652, + "grad_norm": 5.566670097994189, + "kl": 0.204345703125, + "learning_rate": 3.6295336787564767e-07, + "loss": 0.0015, + "reward": 1.979424774646759, + "reward_std": 5.3479671919376415e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.479424774646759, + "step": 2460 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.375647668393782, + "grad_norm": 10.283264284399745, + "kl": 0.142822265625, + "learning_rate": 3.626943005181347e-07, + "loss": -0.0003, + "reward": 1.8927792310714722, + "reward_std": 0.0007888623107419335, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3927792310714722, + "step": 2461 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.3782383419689115, + "grad_norm": 1.234001027261854, + "kl": 0.124267578125, + "learning_rate": 3.624352331606217e-07, + "loss": -0.0004, + "reward": 2.499982237815857, + "reward_std": 4.899201940133935e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 2462 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.380829015544041, + "grad_norm": 0.10500287054248686, + "kl": 0.088623046875, + "learning_rate": 3.6217616580310877e-07, + "loss": 0.0004, + "reward": 2.4999994039535522, + "reward_std": 6.879108980228921e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999463558197, + "step": 2463 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.383419689119171, + "grad_norm": 0.15258111595970084, + "kl": 0.0953369140625, + "learning_rate": 3.619170984455958e-07, + "loss": 0.0003, + "reward": 2.499996781349182, + "reward_std": 2.691415545541531e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2464 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.3860103626943, + "grad_norm": 1124.8147276016118, + "kl": 0.15478515625, + "learning_rate": 3.616580310880829e-07, + "loss": 0.0006, + "reward": 1.4555084705352783, + "reward_std": 0.0015149621322052553, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9555085301399231, + "step": 2465 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.38860103626943, + "grad_norm": 9.759358771214643, + "kl": 0.146484375, + "learning_rate": 3.6139896373056993e-07, + "loss": 0.0005, + "reward": 2.499912142753601, + "reward_std": 3.632959021615534e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999912142753601, + "step": 2466 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.391191709844559, + "grad_norm": 16.94473998607458, + "kl": 0.115478515625, + "learning_rate": 3.61139896373057e-07, + "loss": 0.0013, + "reward": 2.1247339248657227, + "reward_std": 0.23157209117198363, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6247336268424988, + "step": 2467 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.393782383419689, + "grad_norm": 2.187412014394854, + "kl": 0.129364013671875, + "learning_rate": 3.60880829015544e-07, + "loss": 0.0011, + "reward": 1.9939700365066528, + "reward_std": 6.716111795412871e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4939699172973633, + "step": 2468 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.396373056994818, + "grad_norm": 14.410662138216031, + "kl": 0.17724609375, + "learning_rate": 3.606217616580311e-07, + "loss": 0.0005, + "reward": 1.49062180519104, + "reward_std": 0.00018573750276118517, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9906218647956848, + "step": 2469 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.398963730569948, + "grad_norm": 5.023354767778658, + "kl": 0.15966796875, + "learning_rate": 3.6036269430051814e-07, + "loss": 0.0007, + "reward": 1.9221270084381104, + "reward_std": 0.0004075120028232959, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4221270680427551, + "step": 2470 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.401554404145077, + "grad_norm": 0.18416621253352136, + "kl": 0.061767578125, + "learning_rate": 3.6010362694300514e-07, + "loss": -0.0003, + "reward": 2.4999927282333374, + "reward_std": 3.200827734417544e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 2471 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 6.404145077720207, + "grad_norm": 0.24448431737808432, + "kl": 0.16650390625, + "learning_rate": 3.598445595854922e-07, + "loss": -0.0002, + "reward": 2.499997615814209, + "reward_std": 1.3951118376098748e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2472 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.4067357512953365, + "grad_norm": 0.4065492776650983, + "kl": 0.0543212890625, + "learning_rate": 3.5958549222797925e-07, + "loss": 0.0008, + "reward": 2.4999881982803345, + "reward_std": 3.962604182561336e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988079071045, + "step": 2473 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.409326424870466, + "grad_norm": 0.5924591078475367, + "kl": 0.08251953125, + "learning_rate": 3.5932642487046635e-07, + "loss": 0.0003, + "reward": 2.499992609024048, + "reward_std": 3.624367423071817e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 2474 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.4119170984455955, + "grad_norm": 0.17777448753462252, + "kl": 0.0545654296875, + "learning_rate": 3.5906735751295335e-07, + "loss": 0.0002, + "reward": 2.499995470046997, + "reward_std": 2.0276543750696874e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2475 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.414507772020725, + "grad_norm": 4.241399127895043, + "kl": 0.102783203125, + "learning_rate": 3.588082901554404e-07, + "loss": 0.001, + "reward": 1.9991992712020874, + "reward_std": 2.7507052550390654e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991993308067322, + "step": 2476 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.417098445595855, + "grad_norm": 5.043212407548967, + "kl": 0.14697265625, + "learning_rate": 3.585492227979274e-07, + "loss": 0.001, + "reward": 1.945802927017212, + "reward_std": 0.00028971461244964303, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4458030462265015, + "step": 2477 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.419689119170984, + "grad_norm": 0.10136268824768102, + "kl": 0.052520751953125, + "learning_rate": 3.5829015544041446e-07, + "loss": 0.0006, + "reward": 2.4999966621398926, + "reward_std": 2.0284311403884203e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 2478 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.422279792746114, + "grad_norm": 0.9288212187305855, + "kl": 0.08917236328125, + "learning_rate": 3.5803108808290156e-07, + "loss": 0.0007, + "reward": 1.9948766827583313, + "reward_std": 4.2529829471504854e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4948766231536865, + "step": 2479 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.424870466321243, + "grad_norm": 0.8232738179695006, + "kl": 0.13232421875, + "learning_rate": 3.577720207253886e-07, + "loss": -0.0001, + "reward": 2.499993681907654, + "reward_std": 5.1286891675772495e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 2480 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.427461139896373, + "grad_norm": 0.2988021908278537, + "kl": 0.08984375, + "learning_rate": 3.575129533678756e-07, + "loss": 0.0008, + "reward": 2.4999938011169434, + "reward_std": 3.1458180274057668e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 2481 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.430051813471502, + "grad_norm": 5.66954608155545, + "kl": 0.07861328125, + "learning_rate": 3.5725388601036267e-07, + "loss": 0.0008, + "reward": 2.4999682903289795, + "reward_std": 1.5323442141834676e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99996817111969, + "step": 2482 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.432642487046632, + "grad_norm": 1.209233448016897, + "kl": 0.23095703125, + "learning_rate": 3.569948186528498e-07, + "loss": 0.0005, + "reward": 2.4999887943267822, + "reward_std": 4.82934797219059e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999886751174927, + "step": 2483 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.435233160621761, + "grad_norm": 6.516685874216108, + "kl": 0.1953125, + "learning_rate": 3.5673575129533677e-07, + "loss": 0.0012, + "reward": 2.4999921321868896, + "reward_std": 8.983871452983294e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 2484 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 6.437823834196891, + "grad_norm": 5.117095363995409, + "kl": 0.124267578125, + "learning_rate": 3.564766839378238e-07, + "loss": 0.0007, + "reward": 1.9897173047065735, + "reward_std": 0.00017291619121806434, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.489717185497284, + "step": 2485 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.4404145077720205, + "grad_norm": 0.21611763766553907, + "kl": 0.1279296875, + "learning_rate": 3.562176165803109e-07, + "loss": -0.0002, + "reward": 2.4999966621398926, + "reward_std": 2.982501371207036e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2486 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.44300518134715, + "grad_norm": 0.4154861495119402, + "kl": 0.1083984375, + "learning_rate": 3.559585492227979e-07, + "loss": 0.0007, + "reward": 2.4999972581863403, + "reward_std": 2.285566210957768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 2487 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.44559585492228, + "grad_norm": 0.06999640727161804, + "kl": 0.0211181640625, + "learning_rate": 3.55699481865285e-07, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 1.401394683853141e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2488 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.448186528497409, + "grad_norm": 7.025781840204633, + "kl": 0.3515625, + "learning_rate": 3.5544041450777204e-07, + "loss": 0.002, + "reward": 2.437487006187439, + "reward_std": 0.17680287108987613, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374868869781494, + "step": 2489 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.450777202072539, + "grad_norm": 0.30873417114579993, + "kl": 0.17138671875, + "learning_rate": 3.5518134715025904e-07, + "loss": 0.0001, + "reward": 2.499990701675415, + "reward_std": 3.035928870076532e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999907612800598, + "step": 2490 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.453367875647668, + "grad_norm": 0.04727541108296201, + "kl": 0.14892578125, + "learning_rate": 3.549222797927461e-07, + "loss": 0.0003, + "reward": 2.4999985694885254, + "reward_std": 1.1884452817412239e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 2491 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.455958549222798, + "grad_norm": 0.7723420878791865, + "kl": 0.1102294921875, + "learning_rate": 3.546632124352332e-07, + "loss": 0.0006, + "reward": 1.9998581409454346, + "reward_std": 1.1336409784235002e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998580515384674, + "step": 2492 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.458549222797927, + "grad_norm": 0.3431969364012201, + "kl": 0.03411865234375, + "learning_rate": 3.544041450777202e-07, + "loss": 0.0002, + "reward": 2.499978542327881, + "reward_std": 3.934985301157212e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999978482723236, + "step": 2493 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.461139896373057, + "grad_norm": 8.362519966517624, + "kl": 0.19232177734375, + "learning_rate": 3.5414507772020725e-07, + "loss": -0.0003, + "reward": 1.8156248331069946, + "reward_std": 0.0006790688453293114, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3156249821186066, + "step": 2494 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.463730569948186, + "grad_norm": 1.0179848336918629, + "kl": 0.076171875, + "learning_rate": 3.538860103626943e-07, + "loss": 0.0001, + "reward": 2.499988079071045, + "reward_std": 7.239053502416937e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881982803345, + "step": 2495 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.466321243523316, + "grad_norm": 0.10190849027749872, + "kl": 0.107177734375, + "learning_rate": 3.536269430051813e-07, + "loss": -0.0, + "reward": 2.4999970197677612, + "reward_std": 1.4506032925964973e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 2496 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.4689119170984455, + "grad_norm": 1.921338769853351, + "kl": 0.126220703125, + "learning_rate": 3.533678756476684e-07, + "loss": 0.0014, + "reward": 1.9989900588989258, + "reward_std": 3.6439418408917845e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989900588989258, + "step": 2497 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.471502590673575, + "grad_norm": 5.43500575648351, + "kl": 0.0938720703125, + "learning_rate": 3.5310880829015546e-07, + "loss": 0.0008, + "reward": 2.4999852180480957, + "reward_std": 6.389600457623601e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999850392341614, + "step": 2498 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.474093264248705, + "grad_norm": 0.456136647459687, + "kl": 0.131591796875, + "learning_rate": 3.5284974093264246e-07, + "loss": 0.0014, + "reward": 2.4999964237213135, + "reward_std": 1.9534279260824405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 2499 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.476683937823834, + "grad_norm": 0.6653165844467438, + "kl": 0.0982666015625, + "learning_rate": 3.525906735751295e-07, + "loss": -0.0005, + "reward": 2.4999977350234985, + "reward_std": 1.1656431979645276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2500 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.479274611398964, + "grad_norm": 4.676079110721336, + "kl": 0.173828125, + "learning_rate": 3.5233160621761656e-07, + "loss": 0.0006, + "reward": 1.8215047717094421, + "reward_std": 0.0002594917013993836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3215046525001526, + "step": 2501 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.481865284974093, + "grad_norm": 0.1904071271254002, + "kl": 0.06256103515625, + "learning_rate": 3.520725388601036e-07, + "loss": 0.0008, + "reward": 2.499995231628418, + "reward_std": 2.1115497474966105e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 2502 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.484455958549223, + "grad_norm": 0.9582368079046227, + "kl": 0.0849609375, + "learning_rate": 3.5181347150259067e-07, + "loss": 0.0011, + "reward": 1.99985933303833, + "reward_std": 7.085975425979996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998592138290405, + "step": 2503 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.487046632124352, + "grad_norm": 1.4032192165095838, + "kl": 0.1337890625, + "learning_rate": 3.515544041450777e-07, + "loss": 0.0014, + "reward": 2.4999920129776, + "reward_std": 7.278455768755521e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920725822449, + "step": 2504 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.489637305699482, + "grad_norm": 5.436980265188576, + "kl": 0.1051025390625, + "learning_rate": 3.512953367875647e-07, + "loss": 0.0, + "reward": 1.9927936792373657, + "reward_std": 0.00010940871715092726, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4927937984466553, + "step": 2505 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.492227979274611, + "grad_norm": 6.830120845765552, + "kl": 0.123291015625, + "learning_rate": 3.510362694300518e-07, + "loss": 0.0006, + "reward": 2.4999560117721558, + "reward_std": 2.4696539639990078e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999560117721558, + "step": 2506 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.494818652849741, + "grad_norm": 0.6074762408356196, + "kl": 0.04150390625, + "learning_rate": 3.507772020725389e-07, + "loss": -0.0003, + "reward": 2.499990701675415, + "reward_std": 3.3898651281560888e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 2507 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.4974093264248705, + "grad_norm": 16.485288993691785, + "kl": 0.3707275390625, + "learning_rate": 3.505181347150259e-07, + "loss": 0.0012, + "reward": 1.7229715585708618, + "reward_std": 0.0013687875010646167, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2229715287685394, + "step": 2508 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.5, + "grad_norm": 1.2223625477946523, + "kl": 0.134033203125, + "learning_rate": 3.5025906735751293e-07, + "loss": 0.0006, + "reward": 2.499992847442627, + "reward_std": 2.7371163184852776e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 2509 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.5025906735751295, + "grad_norm": 1.1046335186564094, + "kl": 0.1435546875, + "learning_rate": 3.5e-07, + "loss": -0.0003, + "reward": 2.4999951124191284, + "reward_std": 2.900464892263699e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 2510 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.505181347150259, + "grad_norm": 0.14623453489234367, + "kl": 0.04571533203125, + "learning_rate": 3.4974093264248704e-07, + "loss": 0.0, + "reward": 2.499994993209839, + "reward_std": 3.6770769042959728e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 2511 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.507772020725389, + "grad_norm": 0.5481370736079011, + "kl": 0.0771484375, + "learning_rate": 3.494818652849741e-07, + "loss": 0.0, + "reward": 2.4999842643737793, + "reward_std": 3.924493796603201e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999844431877136, + "step": 2512 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.510362694300518, + "grad_norm": 101.93761773473722, + "kl": 0.07391357421875, + "learning_rate": 3.4922279792746114e-07, + "loss": -0.0001, + "reward": 1.999802052974701, + "reward_std": 8.644338197427714e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499802052974701, + "step": 2513 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.512953367875648, + "grad_norm": 2.545884411476245, + "kl": 0.0694580078125, + "learning_rate": 3.4896373056994814e-07, + "loss": 0.0004, + "reward": 1.9975576400756836, + "reward_std": 5.358162525226362e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975575804710388, + "step": 2514 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.515544041450777, + "grad_norm": 8.78644266455707, + "kl": 0.091064453125, + "learning_rate": 3.487046632124352e-07, + "loss": 0.0002, + "reward": 1.9999077320098877, + "reward_std": 3.9093000793855026e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999077320098877, + "step": 2515 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.518134715025907, + "grad_norm": 1.5901307709542127, + "kl": 0.111328125, + "learning_rate": 3.484455958549223e-07, + "loss": 0.0007, + "reward": 2.4999910593032837, + "reward_std": 7.532677500421414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911189079285, + "step": 2516 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.520725388601036, + "grad_norm": 0.8223064164311363, + "kl": 0.114013671875, + "learning_rate": 3.481865284974093e-07, + "loss": 0.0007, + "reward": 1.9999272227287292, + "reward_std": 1.2687341040873434e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999272525310516, + "step": 2517 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.523316062176166, + "grad_norm": 19.107232142727003, + "kl": 0.13525390625, + "learning_rate": 3.4792746113989635e-07, + "loss": -0.0002, + "reward": 2.1249073147773743, + "reward_std": 0.23146861664690732, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6249073147773743, + "step": 2518 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.525906735751295, + "grad_norm": 1.2222527845680367, + "kl": 0.05718994140625, + "learning_rate": 3.476683937823834e-07, + "loss": -0.0, + "reward": 2.4999825954437256, + "reward_std": 8.119004633044824e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998277425766, + "step": 2519 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.528497409326425, + "grad_norm": 0.3239109550155984, + "kl": 0.2890625, + "learning_rate": 3.4740932642487046e-07, + "loss": 0.0013, + "reward": 2.499995470046997, + "reward_std": 3.252491694638593e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 2520 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.5310880829015545, + "grad_norm": 1.4294652002689472, + "kl": 0.13427734375, + "learning_rate": 3.471502590673575e-07, + "loss": -0.0005, + "reward": 2.4999959468841553, + "reward_std": 2.497928875300204e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 2521 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.533678756476684, + "grad_norm": 0.8170950027030504, + "kl": 0.113525390625, + "learning_rate": 3.4689119170984456e-07, + "loss": 0.0006, + "reward": 2.4999778270721436, + "reward_std": 7.645261462130293e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999775886535645, + "step": 2522 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.536269430051814, + "grad_norm": 0.9580082169122023, + "kl": 0.0557861328125, + "learning_rate": 3.4663212435233156e-07, + "loss": -0.0005, + "reward": 2.499991297721863, + "reward_std": 3.0062224141147453e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911189079285, + "step": 2523 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.538860103626943, + "grad_norm": 0.08720168262521778, + "kl": 0.040313720703125, + "learning_rate": 3.463730569948186e-07, + "loss": -0.0001, + "reward": 2.4999953508377075, + "reward_std": 1.8311926623937325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 2524 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.541450777202073, + "grad_norm": 0.7274433384858028, + "kl": 0.12744140625, + "learning_rate": 3.461139896373057e-07, + "loss": 0.0001, + "reward": 2.4999914169311523, + "reward_std": 4.852705046687333e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 2525 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.3125, + "epoch": 6.544041450777202, + "grad_norm": 1.5010685730062852, + "kl": 0.094482421875, + "learning_rate": 3.458549222797927e-07, + "loss": 0.0013, + "reward": 1.9996663331985474, + "reward_std": 2.3781557274560328e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996660649776459, + "step": 2526 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.546632124352332, + "grad_norm": 1.1055764122934053, + "kl": 0.120849609375, + "learning_rate": 3.455958549222798e-07, + "loss": 0.0009, + "reward": 2.499983072280884, + "reward_std": 8.954774557423661e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831914901733, + "step": 2527 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.549222797927461, + "grad_norm": 0.865542578841052, + "kl": 0.09759521484375, + "learning_rate": 3.4533678756476683e-07, + "loss": 0.0009, + "reward": 2.499947190284729, + "reward_std": 1.0347350809070122e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999471306800842, + "step": 2528 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.551813471502591, + "grad_norm": 0.977959619526184, + "kl": 0.0465087890625, + "learning_rate": 3.450777202072539e-07, + "loss": -0.0007, + "reward": 2.499993324279785, + "reward_std": 3.94907112877263e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935030937195, + "step": 2529 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.55440414507772, + "grad_norm": 0.6678437443647307, + "kl": 0.127197265625, + "learning_rate": 3.4481865284974093e-07, + "loss": -0.0004, + "reward": 2.499995708465576, + "reward_std": 2.1067400481911136e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2530 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.55699481865285, + "grad_norm": 3.6456110181342396, + "kl": 0.064697265625, + "learning_rate": 3.44559585492228e-07, + "loss": 0.0003, + "reward": 2.499939203262329, + "reward_std": 2.6638364033715334e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999390244483948, + "step": 2531 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.5595854922279795, + "grad_norm": 0.3527599401622835, + "kl": 0.10107421875, + "learning_rate": 3.44300518134715e-07, + "loss": 0.0006, + "reward": 2.4999788999557495, + "reward_std": 3.5101359685540956e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999788999557495, + "step": 2532 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.562176165803109, + "grad_norm": 6.090586393845502, + "kl": 0.10400390625, + "learning_rate": 3.4404145077720204e-07, + "loss": 0.0001, + "reward": 1.99794602394104, + "reward_std": 8.972414013896923e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4979462921619415, + "step": 2533 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.564766839378239, + "grad_norm": 1.3184491507538538, + "kl": 0.13671875, + "learning_rate": 3.4378238341968914e-07, + "loss": 0.0004, + "reward": 2.4999947547912598, + "reward_std": 4.746955141854414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 2534 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.567357512953368, + "grad_norm": 0.5655989079483129, + "kl": 0.0626220703125, + "learning_rate": 3.4352331606217614e-07, + "loss": 0.0007, + "reward": 2.4999818801879883, + "reward_std": 5.632232671359816e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999821186065674, + "step": 2535 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.569948186528498, + "grad_norm": 3.0175480439418374, + "kl": 0.0521240234375, + "learning_rate": 3.432642487046632e-07, + "loss": -0.0008, + "reward": 2.4996767044067383, + "reward_std": 3.12128609039064e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996768236160278, + "step": 2536 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.572538860103627, + "grad_norm": 0.9810304665101179, + "kl": 0.123046875, + "learning_rate": 3.4300518134715025e-07, + "loss": 0.0019, + "reward": 2.4999947547912598, + "reward_std": 7.406094823636522e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945759773254, + "step": 2537 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.575129533678757, + "grad_norm": 0.5111089931830906, + "kl": 0.166015625, + "learning_rate": 3.4274611398963725e-07, + "loss": 0.0007, + "reward": 2.499996781349182, + "reward_std": 5.223699588441377e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 2538 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.577720207253886, + "grad_norm": 5.024948400004436, + "kl": 0.0771484375, + "learning_rate": 3.4248704663212435e-07, + "loss": 0.0004, + "reward": 2.4999821186065674, + "reward_std": 2.0641108676500153e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999820590019226, + "step": 2539 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.580310880829016, + "grad_norm": 0.5011336252554351, + "kl": 0.0384521484375, + "learning_rate": 3.422279792746114e-07, + "loss": 0.001, + "reward": 2.4999841451644897, + "reward_std": 3.6930124451828306e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999841451644897, + "step": 2540 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.582901554404145, + "grad_norm": 0.32258080489855623, + "kl": 0.08056640625, + "learning_rate": 3.419689119170984e-07, + "loss": 0.0008, + "reward": 2.4999977350234985, + "reward_std": 2.5091250677178323e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2541 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 6.585492227979275, + "grad_norm": 3.500541262290584, + "kl": 0.08349609375, + "learning_rate": 3.4170984455958546e-07, + "loss": -0.0, + "reward": 2.4999858140945435, + "reward_std": 1.0171859941010553e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999985933303833, + "step": 2542 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.5880829015544045, + "grad_norm": 0.025574555781819382, + "kl": 0.0804443359375, + "learning_rate": 3.4145077720207256e-07, + "loss": 0.0, + "reward": 2.4999988079071045, + "reward_std": 8.707054064416297e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 2543 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.590673575129534, + "grad_norm": 8.658231050972505, + "kl": 0.109375, + "learning_rate": 3.4119170984455956e-07, + "loss": -0.0003, + "reward": 1.9913583397865295, + "reward_std": 6.762993143638596e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4913585186004639, + "step": 2544 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.5932642487046635, + "grad_norm": 0.4019373659612134, + "kl": 0.1199951171875, + "learning_rate": 3.409326424870466e-07, + "loss": 0.0005, + "reward": 2.4999972581863403, + "reward_std": 1.6889496805561066e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2545 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.595854922279793, + "grad_norm": 15.970817610139505, + "kl": 0.14599609375, + "learning_rate": 3.4067357512953367e-07, + "loss": 0.0009, + "reward": 1.7710894346237183, + "reward_std": 0.004858827194368587, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2710894346237183, + "step": 2546 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.598445595854923, + "grad_norm": 0.5030692796067407, + "kl": 0.09765625, + "learning_rate": 3.4041450777202067e-07, + "loss": 0.0004, + "reward": 2.499997138977051, + "reward_std": 2.6677773803385207e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 2547 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.25, + "epoch": 6.601036269430052, + "grad_norm": 0.2122647258348281, + "kl": 0.13818359375, + "learning_rate": 3.401554404145078e-07, + "loss": 0.0013, + "reward": 2.499997615814209, + "reward_std": 2.4651604064729327e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2548 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.603626943005182, + "grad_norm": 0.1707471944684384, + "kl": 0.102783203125, + "learning_rate": 3.3989637305699483e-07, + "loss": 0.001, + "reward": 2.499997854232788, + "reward_std": 1.980917318178399e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2549 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.606217616580311, + "grad_norm": 16.78280135512644, + "kl": 0.0711669921875, + "learning_rate": 3.3963730569948183e-07, + "loss": 0.0001, + "reward": 1.9838308095932007, + "reward_std": 0.0002846343312086219, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4838309288024902, + "step": 2550 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.608808290155441, + "grad_norm": 0.6444172842181465, + "kl": 0.177001953125, + "learning_rate": 3.393782383419689e-07, + "loss": 0.0011, + "reward": 2.4999654293060303, + "reward_std": 5.451105607789941e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999653697013855, + "step": 2551 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.61139896373057, + "grad_norm": 2.5282296933824373, + "kl": 0.03839111328125, + "learning_rate": 3.3911917098445593e-07, + "loss": -0.0001, + "reward": 2.4999762773513794, + "reward_std": 2.1663740881194826e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999976396560669, + "step": 2552 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.6139896373057, + "grad_norm": 2.4770855001431684, + "kl": 0.1201171875, + "learning_rate": 3.38860103626943e-07, + "loss": 0.0011, + "reward": 2.499989628791809, + "reward_std": 9.335331469628727e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999896883964539, + "step": 2553 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.616580310880829, + "grad_norm": 0.044213214230401206, + "kl": 0.0555419921875, + "learning_rate": 3.3860103626943004e-07, + "loss": 0.0004, + "reward": 2.499998450279236, + "reward_std": 1.042365596504169e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 2554 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.619170984455959, + "grad_norm": 1.7987260942067969, + "kl": 0.02813720703125, + "learning_rate": 3.383419689119171e-07, + "loss": 0.0016, + "reward": 2.4999959468841553, + "reward_std": 3.3024931553882197e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2555 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 6.6217616580310885, + "grad_norm": 0.152239583443508, + "kl": 0.096435546875, + "learning_rate": 3.380829015544041e-07, + "loss": 0.001, + "reward": 1.499999761581421, + "reward_std": 2.6272448394593084e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999997615814209, + "step": 2556 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.624352331606218, + "grad_norm": 0.1273675226901063, + "kl": 0.087158203125, + "learning_rate": 3.378238341968912e-07, + "loss": 0.0006, + "reward": 2.4999961853027344, + "reward_std": 1.720830056228806e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 2557 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.626943005181348, + "grad_norm": 0.043721086557300434, + "kl": 0.06103515625, + "learning_rate": 3.3756476683937825e-07, + "loss": 0.0004, + "reward": 2.499998927116394, + "reward_std": 1.226148668820315e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 2558 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.629533678756477, + "grad_norm": 0.2996362017119053, + "kl": 0.133056640625, + "learning_rate": 3.3730569948186525e-07, + "loss": 0.0005, + "reward": 2.499995470046997, + "reward_std": 2.144779955415288e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 2559 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.632124352331607, + "grad_norm": 0.6158580946389499, + "kl": 0.13427734375, + "learning_rate": 3.370466321243523e-07, + "loss": 0.0004, + "reward": 2.499993324279785, + "reward_std": 5.970400707155932e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 2560 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.634715025906736, + "grad_norm": 0.15388333980600602, + "kl": 0.112060546875, + "learning_rate": 3.3678756476683935e-07, + "loss": -0.0009, + "reward": 2.4999908208847046, + "reward_std": 2.836057319655083e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 2561 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.637305699481866, + "grad_norm": 0.10990972930863668, + "kl": 0.0772705078125, + "learning_rate": 3.365284974093264e-07, + "loss": 0.0007, + "reward": 2.4999988079071045, + "reward_std": 1.0265251830787747e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 2562 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.639896373056995, + "grad_norm": 1.3542892525788086, + "kl": 0.1151123046875, + "learning_rate": 3.3626943005181346e-07, + "loss": -0.0008, + "reward": 2.499992847442627, + "reward_std": 5.880863227503141e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 2563 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.642487046632124, + "grad_norm": 0.18121276195124655, + "kl": 0.0982666015625, + "learning_rate": 3.360103626943005e-07, + "loss": -0.0009, + "reward": 2.49999737739563, + "reward_std": 3.0438080500516662e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2564 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.6450777202072535, + "grad_norm": 2.7826533831249978, + "kl": 0.1494140625, + "learning_rate": 3.357512953367875e-07, + "loss": 0.0015, + "reward": 1.9944143295288086, + "reward_std": 0.00010370887298449816, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4944142997264862, + "step": 2565 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.647668393782383, + "grad_norm": 0.24931137862369113, + "kl": 0.072998046875, + "learning_rate": 3.354922279792746e-07, + "loss": -0.0006, + "reward": 2.499996066093445, + "reward_std": 2.4226490040746285e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 2566 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.650259067357513, + "grad_norm": 0.08998975069236005, + "kl": 0.126708984375, + "learning_rate": 3.3523316062176167e-07, + "loss": 0.0009, + "reward": 2.499996304512024, + "reward_std": 2.631036409184162e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 2567 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.652849740932642, + "grad_norm": 54.74399876562132, + "kl": 0.16552734375, + "learning_rate": 3.3497409326424867e-07, + "loss": 0.0006, + "reward": 1.4062859416007996, + "reward_std": 0.0006608423718716949, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9062860012054443, + "step": 2568 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.655440414507772, + "grad_norm": 5.888225080680928, + "kl": 0.07958984375, + "learning_rate": 3.347150259067357e-07, + "loss": 0.0008, + "reward": 1.997098982334137, + "reward_std": 8.813368509663633e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4970990121364594, + "step": 2569 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.658031088082901, + "grad_norm": 0.06937523364798394, + "kl": 0.07666015625, + "learning_rate": 3.344559585492228e-07, + "loss": 0.0006, + "reward": 2.4999964237213135, + "reward_std": 2.1866613906240673e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 2570 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.660621761658031, + "grad_norm": 0.06299063375986161, + "kl": 0.05572509765625, + "learning_rate": 3.3419689119170983e-07, + "loss": 0.0001, + "reward": 2.4999990463256836, + "reward_std": 9.253759998273381e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 2571 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.66321243523316, + "grad_norm": 0.3266392956997573, + "kl": 0.077880859375, + "learning_rate": 3.339378238341969e-07, + "loss": 0.0007, + "reward": 2.499995470046997, + "reward_std": 4.508622623689007e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2572 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.66580310880829, + "grad_norm": 1.0246611668257095, + "kl": 0.05322265625, + "learning_rate": 3.3367875647668393e-07, + "loss": 0.0011, + "reward": 2.4999932050704956, + "reward_std": 4.374227955850074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 2573 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.668393782383419, + "grad_norm": 1.5635892045251365, + "kl": 0.04510498046875, + "learning_rate": 3.3341968911917093e-07, + "loss": -0.0009, + "reward": 2.4999852180480957, + "reward_std": 8.368838507522014e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 2574 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.670984455958549, + "grad_norm": 0.12284749032219659, + "kl": 0.060302734375, + "learning_rate": 3.33160621761658e-07, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 8.914735758480674e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2575 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.6735751295336785, + "grad_norm": 0.2167900567170482, + "kl": 0.0321044921875, + "learning_rate": 3.329015544041451e-07, + "loss": 0.001, + "reward": 2.499997138977051, + "reward_std": 1.994280410144711e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 2576 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.676165803108808, + "grad_norm": 0.40959551545454664, + "kl": 0.106201171875, + "learning_rate": 3.326424870466321e-07, + "loss": 0.0009, + "reward": 2.4999942779541016, + "reward_std": 4.334254072091426e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 2577 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.678756476683938, + "grad_norm": 0.04152460592243156, + "kl": 0.02154541015625, + "learning_rate": 3.3238341968911914e-07, + "loss": 0.001, + "reward": 2.499999523162842, + "reward_std": 6.770477511963691e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999995231628418, + "step": 2578 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.681347150259067, + "grad_norm": 0.29413180286570545, + "kl": 0.126220703125, + "learning_rate": 3.321243523316062e-07, + "loss": -0.0004, + "reward": 2.4999905824661255, + "reward_std": 3.9297768807955435e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 2579 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.683937823834197, + "grad_norm": 0.8858348914551994, + "kl": 0.11767578125, + "learning_rate": 3.3186528497409325e-07, + "loss": 0.0008, + "reward": 2.499980092048645, + "reward_std": 5.515488737728447e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999802112579346, + "step": 2580 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.686528497409326, + "grad_norm": 0.30480880480466105, + "kl": 0.065185546875, + "learning_rate": 3.316062176165803e-07, + "loss": 0.0008, + "reward": 2.4999889135360718, + "reward_std": 4.7448878035538655e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988615512848, + "step": 2581 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.689119170984456, + "grad_norm": 0.14469637750250963, + "kl": 0.0732421875, + "learning_rate": 3.3134715025906735e-07, + "loss": 0.0012, + "reward": 2.499997854232788, + "reward_std": 1.5480442812076944e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 2582 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.691709844559585, + "grad_norm": 0.4863310397988113, + "kl": 0.04205322265625, + "learning_rate": 3.3108808290155435e-07, + "loss": -0.0001, + "reward": 2.499995708465576, + "reward_std": 2.421361855908799e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 2583 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.694300518134715, + "grad_norm": 2.0665532768566486, + "kl": 0.10693359375, + "learning_rate": 3.308290155440414e-07, + "loss": 0.0003, + "reward": 2.4999419450759888, + "reward_std": 1.3593727999250405e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999419450759888, + "step": 2584 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.696891191709844, + "grad_norm": 0.22176447194368717, + "kl": 0.0517578125, + "learning_rate": 3.305699481865285e-07, + "loss": 0.0002, + "reward": 2.499997854232788, + "reward_std": 1.0115137172306277e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2585 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.699481865284974, + "grad_norm": 3.1241740446208515, + "kl": 0.074951171875, + "learning_rate": 3.303108808290155e-07, + "loss": -0.0004, + "reward": 2.499956727027893, + "reward_std": 1.3763042602477071e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999567866325378, + "step": 2586 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.7020725388601035, + "grad_norm": 46.915002143867554, + "kl": 0.14404296875, + "learning_rate": 3.3005181347150256e-07, + "loss": 0.0006, + "reward": 2.062428116798401, + "reward_std": 0.4082653373479843, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.5624281167984009, + "step": 2587 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.704663212435233, + "grad_norm": 27.51949064118649, + "kl": 0.17138671875, + "learning_rate": 3.297927461139896e-07, + "loss": 0.0001, + "reward": 2.4999958276748657, + "reward_std": 2.1905597122895415e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 2588 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.7072538860103625, + "grad_norm": 1.1729738096444962, + "kl": 0.0908203125, + "learning_rate": 3.295336787564767e-07, + "loss": -0.0005, + "reward": 2.499990940093994, + "reward_std": 6.02474869992875e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999910593032837, + "step": 2589 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.709844559585492, + "grad_norm": 5.367834367783082, + "kl": 0.14453125, + "learning_rate": 3.292746113989637e-07, + "loss": 0.0012, + "reward": 1.958688735961914, + "reward_std": 0.00023893316739531656, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4586885571479797, + "step": 2590 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.712435233160622, + "grad_norm": 0.1611130610032333, + "kl": 0.0506591796875, + "learning_rate": 3.290155440414508e-07, + "loss": 0.0012, + "reward": 2.4999784231185913, + "reward_std": 2.699215713164449e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999784231185913, + "step": 2591 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.715025906735751, + "grad_norm": 2.9858569906842156, + "kl": 0.11181640625, + "learning_rate": 3.287564766839378e-07, + "loss": -0.0001, + "reward": 2.499988555908203, + "reward_std": 6.575066322511702e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 2592 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.717616580310881, + "grad_norm": 0.5733795975332774, + "kl": 0.06622314453125, + "learning_rate": 3.2849740932642483e-07, + "loss": 0.0014, + "reward": 2.499981641769409, + "reward_std": 4.8018978304753546e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999814629554749, + "step": 2593 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.72020725388601, + "grad_norm": 4.523967176226092, + "kl": 0.07470703125, + "learning_rate": 3.2823834196891193e-07, + "loss": -0.0003, + "reward": 2.499959111213684, + "reward_std": 2.8898726668558083e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999591708183289, + "step": 2594 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.72279792746114, + "grad_norm": 0.2156022940407872, + "kl": 0.103515625, + "learning_rate": 3.27979274611399e-07, + "loss": -0.0002, + "reward": 2.499997138977051, + "reward_std": 4.089306912646862e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 2595 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.725388601036269, + "grad_norm": 5.832168770665035, + "kl": 0.1201171875, + "learning_rate": 3.27720207253886e-07, + "loss": -0.0006, + "reward": 2.49998939037323, + "reward_std": 1.0515403801036882e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895095825195, + "step": 2596 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.727979274611399, + "grad_norm": 2.4478152866866343, + "kl": 0.0650634765625, + "learning_rate": 3.2746113989637304e-07, + "loss": 0.0007, + "reward": 1.9986910820007324, + "reward_std": 2.5327576963718457e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986909627914429, + "step": 2597 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.730569948186528, + "grad_norm": 0.04532129039756897, + "kl": 0.166015625, + "learning_rate": 3.2720207253886004e-07, + "loss": 0.0012, + "reward": 2.4999974966049194, + "reward_std": 2.7054037445850554e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2598 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.733160621761658, + "grad_norm": 10.973102361316224, + "kl": 0.15673828125, + "learning_rate": 3.2694300518134714e-07, + "loss": 0.0007, + "reward": 1.4764549732208252, + "reward_std": 0.0002484492943040095, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9764549136161804, + "step": 2599 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.7357512953367875, + "grad_norm": 0.2264396397533005, + "kl": 0.1065673828125, + "learning_rate": 3.266839378238342e-07, + "loss": 0.001, + "reward": 2.4999961853027344, + "reward_std": 2.894307726819534e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 2600 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.738341968911917, + "grad_norm": 0.781237039712985, + "kl": 0.1259765625, + "learning_rate": 3.2642487046632125e-07, + "loss": 0.0006, + "reward": 1.9998623728752136, + "reward_std": 1.5107755643839482e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998623132705688, + "step": 2601 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.740932642487047, + "grad_norm": 34.845933669790405, + "kl": 0.21875, + "learning_rate": 3.2616580310880825e-07, + "loss": 0.0008, + "reward": 1.3091520071029663, + "reward_std": 0.0005192816606722772, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8091520667076111, + "step": 2602 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 6.743523316062176, + "grad_norm": 0.08520314699728021, + "kl": 0.184814453125, + "learning_rate": 3.2590673575129535e-07, + "loss": 0.0013, + "reward": 2.499998092651367, + "reward_std": 1.6865800489540561e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2603 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.746113989637306, + "grad_norm": 0.3741604703786764, + "kl": 0.091552734375, + "learning_rate": 3.256476683937824e-07, + "loss": -0.0005, + "reward": 2.4999966621398926, + "reward_std": 2.834721101407922e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 2604 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.748704663212435, + "grad_norm": 0.1902905735375944, + "kl": 0.0758056640625, + "learning_rate": 3.253886010362694e-07, + "loss": -0.0001, + "reward": 2.499998688697815, + "reward_std": 1.013394211213381e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 2605 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.751295336787565, + "grad_norm": 0.1304177983406284, + "kl": 0.0758056640625, + "learning_rate": 3.2512953367875646e-07, + "loss": 0.0012, + "reward": 2.499995470046997, + "reward_std": 1.469007244736531e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 2606 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.753886010362694, + "grad_norm": 0.8783267999037194, + "kl": 0.122802734375, + "learning_rate": 3.248704663212435e-07, + "loss": -0.0007, + "reward": 2.4999829530715942, + "reward_std": 4.364902679299121e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831914901733, + "step": 2607 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.756476683937824, + "grad_norm": 2.90818863991794, + "kl": 0.099365234375, + "learning_rate": 3.2461139896373056e-07, + "loss": 0.0008, + "reward": 2.4999921321868896, + "reward_std": 5.05780303683423e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 2608 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.759067357512953, + "grad_norm": 0.20439311313186867, + "kl": 0.116455078125, + "learning_rate": 3.243523316062176e-07, + "loss": 0.0005, + "reward": 2.499997138977051, + "reward_std": 2.5065187543305e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2609 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.761658031088083, + "grad_norm": 0.4276889929631774, + "kl": 0.0665283203125, + "learning_rate": 3.2409326424870467e-07, + "loss": 0.001, + "reward": 2.499543309211731, + "reward_std": 7.075920109400613e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999543309211731, + "step": 2610 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75, + "epoch": 6.7642487046632125, + "grad_norm": 4.988801345871367, + "kl": 0.05108642578125, + "learning_rate": 3.2383419689119167e-07, + "loss": -0.0002, + "reward": 1.9499186277389526, + "reward_std": 0.011643597628108182, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4499186873435974, + "step": 2611 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.766839378238342, + "grad_norm": 1.8455174535086045, + "kl": 0.103759765625, + "learning_rate": 3.235751295336787e-07, + "loss": -0.0012, + "reward": 2.499996304512024, + "reward_std": 2.8104585680921446e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 2612 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.769430051813472, + "grad_norm": 0.061923812131794265, + "kl": 0.047698974609375, + "learning_rate": 3.2331606217616583e-07, + "loss": 0.0002, + "reward": 2.4999982118606567, + "reward_std": 1.0370044094543118e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2613 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.772020725388601, + "grad_norm": 0.1100203967726687, + "kl": 0.0789794921875, + "learning_rate": 3.2305699481865283e-07, + "loss": 0.0008, + "reward": 2.4999974966049194, + "reward_std": 2.333548877686553e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2614 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.774611398963731, + "grad_norm": 68.5712464189081, + "kl": 0.1494140625, + "learning_rate": 3.227979274611399e-07, + "loss": 0.0013, + "reward": 2.499662399291992, + "reward_std": 0.00015250110163833597, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996622204780579, + "step": 2615 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.77720207253886, + "grad_norm": 0.611371759608612, + "kl": 0.10888671875, + "learning_rate": 3.2253886010362693e-07, + "loss": -0.0004, + "reward": 2.4999964237213135, + "reward_std": 3.337859652674524e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 2616 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.77979274611399, + "grad_norm": 0.2662854166121708, + "kl": 0.047119140625, + "learning_rate": 3.22279792746114e-07, + "loss": 0.0, + "reward": 2.4999959468841553, + "reward_std": 2.4996460865622794e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2617 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.782383419689119, + "grad_norm": 0.2118951697335687, + "kl": 0.097412109375, + "learning_rate": 3.2202072538860104e-07, + "loss": 0.0021, + "reward": 2.4999985694885254, + "reward_std": 1.8294264236828894e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2618 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.784974093264249, + "grad_norm": 4.618919710401948, + "kl": 0.117919921875, + "learning_rate": 3.217616580310881e-07, + "loss": 0.0002, + "reward": 1.8100414276123047, + "reward_std": 0.0006752505371991901, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3100415766239166, + "step": 2619 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.787564766839378, + "grad_norm": 0.652593847585454, + "kl": 0.0992431640625, + "learning_rate": 3.215025906735751e-07, + "loss": 0.0005, + "reward": 2.4999969005584717, + "reward_std": 4.2823880903597455e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 2620 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.790155440414508, + "grad_norm": 0.4895500447608739, + "kl": 0.088134765625, + "learning_rate": 3.2124352331606214e-07, + "loss": 0.0006, + "reward": 2.499996781349182, + "reward_std": 1.3708473858287107e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 2621 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.7927461139896375, + "grad_norm": 0.2610555544379071, + "kl": 0.177734375, + "learning_rate": 3.2098445595854925e-07, + "loss": 0.001, + "reward": 2.4999972581863403, + "reward_std": 3.101004210748215e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2622 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.795336787564767, + "grad_norm": 2.183783771907094, + "kl": 0.08935546875, + "learning_rate": 3.2072538860103625e-07, + "loss": -0.0004, + "reward": 2.4999841451644897, + "reward_std": 8.437664376970133e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984323978424, + "step": 2623 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.7979274611398965, + "grad_norm": 0.8010101354810155, + "kl": 0.06585693359375, + "learning_rate": 3.204663212435233e-07, + "loss": -0.0003, + "reward": 2.4999754428863525, + "reward_std": 7.238565785883111e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999753832817078, + "step": 2624 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.800518134715026, + "grad_norm": 0.32472030281790437, + "kl": 0.084716796875, + "learning_rate": 3.2020725388601035e-07, + "loss": -0.0001, + "reward": 2.499993324279785, + "reward_std": 4.868169071414741e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 2625 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.803108808290156, + "grad_norm": 0.1472680932200785, + "kl": 0.17333984375, + "learning_rate": 3.199481865284974e-07, + "loss": 0.0001, + "reward": 2.499996542930603, + "reward_std": 2.135551767423749e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 2626 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.805699481865285, + "grad_norm": 115.40396773481149, + "kl": 0.1104736328125, + "learning_rate": 3.1968911917098446e-07, + "loss": 0.0012, + "reward": 1.9436487555503845, + "reward_std": 0.09334890798777451, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4436488151550293, + "step": 2627 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.808290155440415, + "grad_norm": 0.2383380515966591, + "kl": 0.066162109375, + "learning_rate": 3.194300518134715e-07, + "loss": -0.0004, + "reward": 2.499975085258484, + "reward_std": 3.6756970303031267e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999752044677734, + "step": 2628 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.810880829015544, + "grad_norm": 0.9329502207555056, + "kl": 0.0625, + "learning_rate": 3.191709844559585e-07, + "loss": -0.0005, + "reward": 2.499995470046997, + "reward_std": 5.685469091076811e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 2629 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.813471502590674, + "grad_norm": 0.3690295133439186, + "kl": 0.08209228515625, + "learning_rate": 3.1891191709844556e-07, + "loss": 0.0015, + "reward": 2.499992609024048, + "reward_std": 4.3634497615130385e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 2630 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.816062176165803, + "grad_norm": 0.5946824987419771, + "kl": 0.24755859375, + "learning_rate": 3.1865284974093267e-07, + "loss": 0.0014, + "reward": 2.4999654293060303, + "reward_std": 6.409323987099924e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999654293060303, + "step": 2631 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.818652849740933, + "grad_norm": 0.731038373227442, + "kl": 0.116943359375, + "learning_rate": 3.1839378238341967e-07, + "loss": 0.0002, + "reward": 2.4999780654907227, + "reward_std": 7.960647053550929e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999781847000122, + "step": 2632 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.821243523316062, + "grad_norm": 25.188331497738304, + "kl": 0.098876953125, + "learning_rate": 3.181347150259067e-07, + "loss": 0.001, + "reward": 2.3124375343322754, + "reward_std": 0.2588223617028689, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124375343322754, + "step": 2633 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.823834196891192, + "grad_norm": 0.18043945675785886, + "kl": 0.116455078125, + "learning_rate": 3.178756476683938e-07, + "loss": -0.0004, + "reward": 2.4999990463256836, + "reward_std": 7.49549172951447e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 2634 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.8264248704663215, + "grad_norm": 1.022727259914892, + "kl": 0.11572265625, + "learning_rate": 3.176165803108808e-07, + "loss": 0.0006, + "reward": 2.4999947547912598, + "reward_std": 3.2792924002933432e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 2635 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.829015544041451, + "grad_norm": 0.11858756300010036, + "kl": 0.0614013671875, + "learning_rate": 3.173575129533679e-07, + "loss": -0.0, + "reward": 2.499998092651367, + "reward_std": 1.5976047791355086e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 2636 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.831606217616581, + "grad_norm": 0.8406515143486687, + "kl": 0.0477294921875, + "learning_rate": 3.1709844559585493e-07, + "loss": 0.0004, + "reward": 1.999927043914795, + "reward_std": 9.489662829764711e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999271035194397, + "step": 2637 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.83419689119171, + "grad_norm": 9.551721300417965, + "kl": 0.3466796875, + "learning_rate": 3.1683937823834193e-07, + "loss": 0.0023, + "reward": 1.9975675344467163, + "reward_std": 0.00047034373449150735, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975674152374268, + "step": 2638 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.83678756476684, + "grad_norm": 0.08414711455513105, + "kl": 0.02960205078125, + "learning_rate": 3.16580310880829e-07, + "loss": -0.0, + "reward": 2.4999982118606567, + "reward_std": 1.4587311625291477e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 2639 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 6.839378238341969, + "grad_norm": 0.042768534414627316, + "kl": 0.072265625, + "learning_rate": 3.163212435233161e-07, + "loss": -0.0003, + "reward": 2.4999992847442627, + "reward_std": 1.0190064756443462e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999992847442627, + "step": 2640 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.841968911917099, + "grad_norm": 0.05676457056419735, + "kl": 0.07806396484375, + "learning_rate": 3.160621761658031e-07, + "loss": -0.0007, + "reward": 2.499994158744812, + "reward_std": 1.5781853335283813e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 2641 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.844559585492228, + "grad_norm": 0.129515423086182, + "kl": 0.0552978515625, + "learning_rate": 3.1580310880829014e-07, + "loss": 0.0015, + "reward": 2.49999737739563, + "reward_std": 1.4731791964095464e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2642 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.847150259067358, + "grad_norm": 0.5711241261433098, + "kl": 0.11572265625, + "learning_rate": 3.155440414507772e-07, + "loss": 0.0005, + "reward": 2.4999923706054688, + "reward_std": 3.3822458647136955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999923706054688, + "step": 2643 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.849740932642487, + "grad_norm": 33.511374255696, + "kl": 0.16064453125, + "learning_rate": 3.152849740932642e-07, + "loss": 0.0006, + "reward": 2.3124029636383057, + "reward_std": 0.25889289929045844, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8124029636383057, + "step": 2644 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.852331606217617, + "grad_norm": 0.783777532982119, + "kl": 0.0760498046875, + "learning_rate": 3.150259067357513e-07, + "loss": 0.001, + "reward": 2.499992609024048, + "reward_std": 4.621174866770161e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 2645 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.8549222797927465, + "grad_norm": 0.10734646715621524, + "kl": 0.13671875, + "learning_rate": 3.1476683937823835e-07, + "loss": 0.0017, + "reward": 2.49999737739563, + "reward_std": 1.8049130403596791e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2646 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.857512953367876, + "grad_norm": 0.5221572295646971, + "kl": 0.081298828125, + "learning_rate": 3.1450777202072535e-07, + "loss": 0.0002, + "reward": 2.4999958276748657, + "reward_std": 3.1969009341992205e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 2647 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.860103626943005, + "grad_norm": 3.254478770384534, + "kl": 0.092041015625, + "learning_rate": 3.142487046632124e-07, + "loss": 0.0011, + "reward": 2.4999899864196777, + "reward_std": 7.808321470292867e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999898672103882, + "step": 2648 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.862694300518134, + "grad_norm": 0.2786716513463218, + "kl": 0.06884765625, + "learning_rate": 3.139896373056995e-07, + "loss": -0.0005, + "reward": 2.4999934434890747, + "reward_std": 2.3536198909823725e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 2649 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.865284974093264, + "grad_norm": 0.6035286577444154, + "kl": 0.0621337890625, + "learning_rate": 3.137305699481865e-07, + "loss": 0.0008, + "reward": 2.4999942779541016, + "reward_std": 4.760535887271544e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 2650 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.867875647668393, + "grad_norm": 0.353326948384391, + "kl": 0.07366943359375, + "learning_rate": 3.1347150259067356e-07, + "loss": 0.0022, + "reward": 2.499995231628418, + "reward_std": 2.312569620244176e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 2651 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.870466321243523, + "grad_norm": 0.050495615388355466, + "kl": 0.07275390625, + "learning_rate": 3.132124352331606e-07, + "loss": -0.001, + "reward": 2.4999985694885254, + "reward_std": 9.676915055933932e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 2652 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.873056994818652, + "grad_norm": 16.279411889064498, + "kl": 0.10498046875, + "learning_rate": 3.129533678756476e-07, + "loss": 0.0013, + "reward": 2.499994993209839, + "reward_std": 4.3705274492822355e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 2653 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.875647668393782, + "grad_norm": 1.349154768096174, + "kl": 0.0841064453125, + "learning_rate": 3.126943005181347e-07, + "loss": 0.001, + "reward": 2.499995470046997, + "reward_std": 5.87809245189419e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2654 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.8782383419689115, + "grad_norm": 0.24559773068145477, + "kl": 0.13720703125, + "learning_rate": 3.124352331606218e-07, + "loss": 0.0005, + "reward": 2.4999899864196777, + "reward_std": 2.947581776879815e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989926815033, + "step": 2655 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.880829015544041, + "grad_norm": 1.1636827874950308, + "kl": 0.111083984375, + "learning_rate": 3.121761658031088e-07, + "loss": -0.0007, + "reward": 2.4999771118164062, + "reward_std": 7.452856266354502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999773502349854, + "step": 2656 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.883419689119171, + "grad_norm": 0.3537371367868099, + "kl": 0.080810546875, + "learning_rate": 3.1191709844559583e-07, + "loss": 0.0002, + "reward": 2.4999964237213135, + "reward_std": 2.1272351773404807e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 2657 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.8860103626943, + "grad_norm": 2.461943748115084, + "kl": 0.08837890625, + "learning_rate": 3.116580310880829e-07, + "loss": 0.0014, + "reward": 1.9984063506126404, + "reward_std": 2.0161690827080747e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984063506126404, + "step": 2658 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.88860103626943, + "grad_norm": 7.638852823792572, + "kl": 0.11181640625, + "learning_rate": 3.1139896373056993e-07, + "loss": 0.0003, + "reward": 1.999787151813507, + "reward_std": 1.5002544600406509e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997871816158295, + "step": 2659 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.891191709844559, + "grad_norm": 20.81141839824204, + "kl": 0.123046875, + "learning_rate": 3.11139896373057e-07, + "loss": -0.0005, + "reward": 2.499915838241577, + "reward_std": 1.6071899835878867e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999159574508667, + "step": 2660 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.893782383419689, + "grad_norm": 0.025103119686933323, + "kl": 0.147705078125, + "learning_rate": 3.1088082901554404e-07, + "loss": 0.0012, + "reward": 2.4999990463256836, + "reward_std": 1.04565532410561e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 2661 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.896373056994818, + "grad_norm": 16.731027577564163, + "kl": 0.066314697265625, + "learning_rate": 3.1062176165803104e-07, + "loss": -0.0007, + "reward": 1.9989325404167175, + "reward_std": 0.00012794942733762582, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498932808637619, + "step": 2662 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.898963730569948, + "grad_norm": 0.12302329768003824, + "kl": 0.090576171875, + "learning_rate": 3.1036269430051814e-07, + "loss": 0.0008, + "reward": 2.49999737739563, + "reward_std": 2.3493118419537495e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 2663 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.901554404145077, + "grad_norm": 9.644437676921989, + "kl": 0.140380859375, + "learning_rate": 3.101036269430052e-07, + "loss": -0.0003, + "reward": 1.9942524433135986, + "reward_std": 0.00011496849128889153, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4942525029182434, + "step": 2664 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.904145077720207, + "grad_norm": 3.85087150639492, + "kl": 0.1103515625, + "learning_rate": 3.098445595854922e-07, + "loss": 0.0004, + "reward": 2.4999680519104004, + "reward_std": 8.630609954707325e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99996817111969, + "step": 2665 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.9067357512953365, + "grad_norm": 0.17982488842231498, + "kl": 0.136474609375, + "learning_rate": 3.0958549222797925e-07, + "loss": 0.0004, + "reward": 2.499998092651367, + "reward_std": 1.4715184306623996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 2666 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.909326424870466, + "grad_norm": 3.184643650047152, + "kl": 0.16064453125, + "learning_rate": 3.093264248704663e-07, + "loss": 0.0015, + "reward": 1.9993248581886292, + "reward_std": 2.929641897253532e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993248283863068, + "step": 2667 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.9119170984455955, + "grad_norm": 3.749083443359404, + "kl": 0.17041015625, + "learning_rate": 3.0906735751295335e-07, + "loss": 0.0012, + "reward": 1.9960463047027588, + "reward_std": 6.586990798496117e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4960463047027588, + "step": 2668 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.914507772020725, + "grad_norm": 28.25249429623852, + "kl": 0.182373046875, + "learning_rate": 3.088082901554404e-07, + "loss": 0.0005, + "reward": 1.8841991424560547, + "reward_std": 0.0017215177658727043, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3841991424560547, + "step": 2669 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 6.917098445595855, + "grad_norm": 0.1671544557216127, + "kl": 0.109130859375, + "learning_rate": 3.0854922279792746e-07, + "loss": 0.0003, + "reward": 2.499998450279236, + "reward_std": 1.5015943972684909e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 2670 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.919689119170984, + "grad_norm": 0.6707549083372544, + "kl": 0.055419921875, + "learning_rate": 3.0829015544041446e-07, + "loss": -0.0002, + "reward": 2.4999911785125732, + "reward_std": 4.955255462846253e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 2671 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.922279792746114, + "grad_norm": 0.14356887574468113, + "kl": 0.02410888671875, + "learning_rate": 3.080310880829015e-07, + "loss": 0.0001, + "reward": 2.499998092651367, + "reward_std": 1.5409833906687709e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 2672 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.924870466321243, + "grad_norm": 0.20441933320945194, + "kl": 0.0369873046875, + "learning_rate": 3.077720207253886e-07, + "loss": -0.0004, + "reward": 2.49999737739563, + "reward_std": 2.084319817186042e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2673 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 6.927461139896373, + "grad_norm": 0.6683067182430974, + "kl": 0.041259765625, + "learning_rate": 3.075129533678756e-07, + "loss": 0.0003, + "reward": 2.4999892711639404, + "reward_std": 6.496505193354096e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999892711639404, + "step": 2674 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.930051813471502, + "grad_norm": 0.7210554501054945, + "kl": 0.091064453125, + "learning_rate": 3.0725388601036267e-07, + "loss": -0.0009, + "reward": 2.4999945163726807, + "reward_std": 3.858439754367282e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 2675 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.932642487046632, + "grad_norm": 0.17552386108248752, + "kl": 0.079071044921875, + "learning_rate": 3.069948186528497e-07, + "loss": 0.0005, + "reward": 2.4999982118606567, + "reward_std": 1.9856996118505776e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2676 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.935233160621761, + "grad_norm": 0.41605490710069404, + "kl": 0.0755615234375, + "learning_rate": 3.067357512953368e-07, + "loss": 0.001, + "reward": 2.4999942779541016, + "reward_std": 3.8180478441063315e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 2677 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.937823834196891, + "grad_norm": 0.06875053222427291, + "kl": 0.083740234375, + "learning_rate": 3.0647668393782383e-07, + "loss": 0.0002, + "reward": 2.499997854232788, + "reward_std": 1.1512554465298308e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2678 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.9404145077720205, + "grad_norm": 0.6915553577948074, + "kl": 0.0787353515625, + "learning_rate": 3.062176165803109e-07, + "loss": -0.0005, + "reward": 2.4999877214431763, + "reward_std": 5.598813686447102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879002571106, + "step": 2679 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.94300518134715, + "grad_norm": 0.025111769794718897, + "kl": 0.1043701171875, + "learning_rate": 3.059585492227979e-07, + "loss": 0.001, + "reward": 2.4999988079071045, + "reward_std": 6.237595471247914e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 2680 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.94559585492228, + "grad_norm": 0.17851908374925143, + "kl": 0.0947265625, + "learning_rate": 3.0569948186528493e-07, + "loss": 0.0005, + "reward": 2.499997854232788, + "reward_std": 2.293884051596251e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2681 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.948186528497409, + "grad_norm": 19.291777146679912, + "kl": 0.0992431640625, + "learning_rate": 3.0544041450777204e-07, + "loss": 0.0001, + "reward": 2.49980092048645, + "reward_std": 6.13935790170217e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998009204864502, + "step": 2682 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.950777202072539, + "grad_norm": 0.10200411389718572, + "kl": 0.26318359375, + "learning_rate": 3.0518134715025904e-07, + "loss": 0.0008, + "reward": 2.4999979734420776, + "reward_std": 1.4975389603932854e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2683 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.953367875647668, + "grad_norm": 0.2851126572805129, + "kl": 0.02642822265625, + "learning_rate": 3.049222797927461e-07, + "loss": 0.0007, + "reward": 2.499997138977051, + "reward_std": 2.274980147376482e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 2684 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 6.955958549222798, + "grad_norm": 154.70547890032248, + "kl": 0.17724609375, + "learning_rate": 3.0466321243523314e-07, + "loss": 0.0003, + "reward": 1.9351143836975098, + "reward_std": 0.03371609011310284, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4351144433021545, + "step": 2685 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.958549222797927, + "grad_norm": 0.3745528835900668, + "kl": 0.120849609375, + "learning_rate": 3.044041450777202e-07, + "loss": 0.0011, + "reward": 2.49999463558197, + "reward_std": 2.4771332505224564e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 2686 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.961139896373057, + "grad_norm": 0.14974097947057258, + "kl": 0.153564453125, + "learning_rate": 3.0414507772020725e-07, + "loss": 0.0, + "reward": 2.499995708465576, + "reward_std": 2.8539324148368905e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 2687 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.963730569948186, + "grad_norm": 0.049723135099217594, + "kl": 0.0645751953125, + "learning_rate": 3.038860103626943e-07, + "loss": -0.0006, + "reward": 2.499998092651367, + "reward_std": 1.7072521814043284e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2688 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 6.966321243523316, + "grad_norm": 327.7835062572855, + "kl": 0.106689453125, + "learning_rate": 3.036269430051813e-07, + "loss": 0.0003, + "reward": 2.1122639179229736, + "reward_std": 0.2393150636217456, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6122637391090393, + "step": 2689 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.9689119170984455, + "grad_norm": 4.090484424047936, + "kl": 0.0791015625, + "learning_rate": 3.0336787564766835e-07, + "loss": 0.0005, + "reward": 2.4999462366104126, + "reward_std": 2.6773177523864433e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999462366104126, + "step": 2690 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 6.971502590673575, + "grad_norm": 6.65203856546593, + "kl": 0.098876953125, + "learning_rate": 3.0310880829015546e-07, + "loss": 0.0009, + "reward": 1.9991925358772278, + "reward_std": 4.446475134045613e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991923868656158, + "step": 2691 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.974093264248705, + "grad_norm": 9.91009941138378, + "kl": 0.060791015625, + "learning_rate": 3.0284974093264246e-07, + "loss": 0.0009, + "reward": 1.9985364079475403, + "reward_std": 9.513014697404287e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985363483428955, + "step": 2692 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 6.976683937823834, + "grad_norm": 0.8474915033213317, + "kl": 0.094970703125, + "learning_rate": 3.025906735751295e-07, + "loss": 0.0005, + "reward": 2.4999932050704956, + "reward_std": 7.302572157641407e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 2693 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.979274611398964, + "grad_norm": 0.6968910664154169, + "kl": 0.0989990234375, + "learning_rate": 3.0233160621761657e-07, + "loss": -0.0002, + "reward": 2.499959707260132, + "reward_std": 7.119760539353592e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999597668647766, + "step": 2694 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 6.981865284974093, + "grad_norm": 59.2766435770167, + "kl": 0.126953125, + "learning_rate": 3.0207253886010356e-07, + "loss": 0.0003, + "reward": 1.9794762134552002, + "reward_std": 0.0003094483907943868, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4794762134552002, + "step": 2695 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 6.984455958549223, + "grad_norm": 14.079648594312262, + "kl": 0.091796875, + "learning_rate": 3.0181347150259067e-07, + "loss": 0.001, + "reward": 1.9983248710632324, + "reward_std": 4.997884479962522e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4983248114585876, + "step": 2696 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.987046632124352, + "grad_norm": 15.917772612262366, + "kl": 0.06982421875, + "learning_rate": 3.015544041450777e-07, + "loss": 0.0009, + "reward": 2.2499858140945435, + "reward_std": 0.2672700790443514, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499855756759644, + "step": 2697 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.989637305699482, + "grad_norm": 0.6641314836799243, + "kl": 0.184814453125, + "learning_rate": 3.012953367875647e-07, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 4.912595159112243e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 2698 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 6.992227979274611, + "grad_norm": 130.1776848173821, + "kl": 0.153076171875, + "learning_rate": 3.010362694300518e-07, + "loss": 0.0009, + "reward": 1.8121325969696045, + "reward_std": 0.0006616853706873371, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3121325969696045, + "step": 2699 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.994818652849741, + "grad_norm": 15.337846712055693, + "kl": 0.0677490234375, + "learning_rate": 3.007772020725389e-07, + "loss": 0.0005, + "reward": 2.4999635219573975, + "reward_std": 9.49482387113676e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999635219573975, + "step": 2700 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 6.9974093264248705, + "grad_norm": 0.5169906681313127, + "kl": 0.138916015625, + "learning_rate": 3.005181347150259e-07, + "loss": 0.0013, + "reward": 2.499998092651367, + "reward_std": 1.2903511219519714e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2701 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.0, + "grad_norm": 34.47839291440497, + "kl": 0.13916015625, + "learning_rate": 3.0025906735751293e-07, + "loss": 0.0, + "reward": 1.9861319661140442, + "reward_std": 0.0010465293445349744, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4861319661140442, + "step": 2702 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.0025906735751295, + "grad_norm": 1.277131039149812, + "kl": 0.0606689453125, + "learning_rate": 3e-07, + "loss": 0.0005, + "reward": 2.499986171722412, + "reward_std": 8.472314448226825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986171722412, + "step": 2703 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.005181347150259, + "grad_norm": 1.2340353345381787, + "kl": 0.3319091796875, + "learning_rate": 2.99740932642487e-07, + "loss": 0.0, + "reward": 2.499998092651367, + "reward_std": 1.7651769894655445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 2704 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.007772020725389, + "grad_norm": 4.877864512255514, + "kl": 0.0762939453125, + "learning_rate": 2.994818652849741e-07, + "loss": 0.0009, + "reward": 1.992798089981079, + "reward_std": 7.922134147975157e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4927980303764343, + "step": 2705 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.010362694300518, + "grad_norm": 0.27964349637474945, + "kl": 0.07421875, + "learning_rate": 2.9922279792746114e-07, + "loss": 0.0006, + "reward": 2.49999737739563, + "reward_std": 2.3559888404633966e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2706 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.012953367875648, + "grad_norm": 0.5124919284345575, + "kl": 0.0625, + "learning_rate": 2.9896373056994814e-07, + "loss": 0.0005, + "reward": 2.499992609024048, + "reward_std": 3.1695838629275386e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924302101135, + "step": 2707 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.015544041450777, + "grad_norm": 3.7156472708440442, + "kl": 0.17138671875, + "learning_rate": 2.987046632124352e-07, + "loss": 0.0009, + "reward": 1.9946449995040894, + "reward_std": 5.9174433090447565e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4946449398994446, + "step": 2708 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.018134715025907, + "grad_norm": 0.08315801026027068, + "kl": 0.043182373046875, + "learning_rate": 2.9844559585492225e-07, + "loss": 0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.1314137111639866e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2709 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.020725388601036, + "grad_norm": 0.32078519601146804, + "kl": 0.068359375, + "learning_rate": 2.981865284974093e-07, + "loss": -0.0005, + "reward": 2.499997615814209, + "reward_std": 1.475461999689287e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2710 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.023316062176166, + "grad_norm": 0.19433534024513685, + "kl": 0.0665283203125, + "learning_rate": 2.9792746113989635e-07, + "loss": 0.0003, + "reward": 2.499998688697815, + "reward_std": 1.1276677156502046e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 2711 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.025906735751295, + "grad_norm": 0.4825264724252194, + "kl": 0.104248046875, + "learning_rate": 2.976683937823834e-07, + "loss": -0.0003, + "reward": 2.4999959468841553, + "reward_std": 3.408521195069625e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 2712 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.028497409326425, + "grad_norm": 0.06405915603598082, + "kl": 0.10205078125, + "learning_rate": 2.974093264248704e-07, + "loss": -0.0005, + "reward": 2.49999737739563, + "reward_std": 1.5113402014321764e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 2713 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.0310880829015545, + "grad_norm": 5.577678789698276, + "kl": 0.0933837890625, + "learning_rate": 2.971502590673575e-07, + "loss": 0.0002, + "reward": 2.4999349117279053, + "reward_std": 3.971588563445039e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999350905418396, + "step": 2714 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.033678756476684, + "grad_norm": 0.11001574450722675, + "kl": 0.1019287109375, + "learning_rate": 2.9689119170984457e-07, + "loss": 0.0007, + "reward": 2.4999990463256836, + "reward_std": 1.1054847561808856e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 2715 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.036269430051814, + "grad_norm": 71.70600192367583, + "kl": 0.1875, + "learning_rate": 2.966321243523316e-07, + "loss": 0.001, + "reward": 1.967581868171692, + "reward_std": 0.0009742489457948977, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.467581868171692, + "step": 2716 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.038860103626943, + "grad_norm": 0.20403869466989022, + "kl": 0.097412109375, + "learning_rate": 2.963730569948186e-07, + "loss": -0.0, + "reward": 2.499983787536621, + "reward_std": 5.1294148306624265e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999836683273315, + "step": 2717 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.041450777202073, + "grad_norm": 1.217487505050398, + "kl": 0.077880859375, + "learning_rate": 2.9611398963730567e-07, + "loss": 0.0016, + "reward": 2.4999985694885254, + "reward_std": 1.4931237615201098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2718 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.044041450777202, + "grad_norm": 0.0693020862081694, + "kl": 0.084930419921875, + "learning_rate": 2.958549222797928e-07, + "loss": 0.0014, + "reward": 2.4999982118606567, + "reward_std": 1.022311977294521e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2719 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.046632124352332, + "grad_norm": 13.674053274344969, + "kl": 0.1298828125, + "learning_rate": 2.955958549222798e-07, + "loss": 0.0007, + "reward": 2.4374618530273438, + "reward_std": 0.1768199337666374, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374619126319885, + "step": 2720 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.049222797927461, + "grad_norm": 0.4625309994534999, + "kl": 0.136474609375, + "learning_rate": 2.9533678756476683e-07, + "loss": 0.0014, + "reward": 2.4999799728393555, + "reward_std": 3.261184474467882e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999799132347107, + "step": 2721 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.051813471502591, + "grad_norm": 0.1372337190525816, + "kl": 0.07861328125, + "learning_rate": 2.950777202072539e-07, + "loss": 0.0, + "reward": 2.4999979734420776, + "reward_std": 1.3631250226353586e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2722 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.05440414507772, + "grad_norm": 25.18686999134842, + "kl": 0.2718505859375, + "learning_rate": 2.9481865284974093e-07, + "loss": 0.0018, + "reward": 2.437284469604492, + "reward_std": 0.1773731542743917, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9372843503952026, + "step": 2723 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.05699481865285, + "grad_norm": 0.3556508216997095, + "kl": 0.0194091796875, + "learning_rate": 2.94559585492228e-07, + "loss": 0.0011, + "reward": 2.499992609024048, + "reward_std": 2.989542565501324e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 2724 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.0595854922279795, + "grad_norm": 0.4025261057990681, + "kl": 0.111328125, + "learning_rate": 2.9430051813471504e-07, + "loss": 0.0007, + "reward": 2.4999940395355225, + "reward_std": 2.723148497807415e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 2725 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.062176165803109, + "grad_norm": 0.31274000025680493, + "kl": 0.11669921875, + "learning_rate": 2.9404145077720204e-07, + "loss": 0.0016, + "reward": 2.499998092651367, + "reward_std": 1.863392071754788e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2726 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.064766839378239, + "grad_norm": 0.4397637648749812, + "kl": 0.0701904296875, + "learning_rate": 2.937823834196891e-07, + "loss": 0.0003, + "reward": 2.4999921321868896, + "reward_std": 4.154978114456753e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 2727 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.067357512953368, + "grad_norm": 19.276821566765786, + "kl": 0.31298828125, + "learning_rate": 2.935233160621762e-07, + "loss": 0.0015, + "reward": 1.9954688549041748, + "reward_std": 0.006783579270944529, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4954689741134644, + "step": 2728 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.069948186528498, + "grad_norm": 3.6106080650429635, + "kl": 0.12939453125, + "learning_rate": 2.932642487046632e-07, + "loss": 0.0002, + "reward": 1.4974743127822876, + "reward_std": 8.686909495736472e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9974744319915771, + "step": 2729 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.072538860103627, + "grad_norm": 0.8302622536573409, + "kl": 0.095458984375, + "learning_rate": 2.9300518134715025e-07, + "loss": 0.0015, + "reward": 1.9991748332977295, + "reward_std": 1.608480630466147e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499174565076828, + "step": 2730 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.075129533678757, + "grad_norm": 0.0748136620697045, + "kl": 0.133544921875, + "learning_rate": 2.927461139896373e-07, + "loss": -0.0001, + "reward": 2.499996304512024, + "reward_std": 1.7537465737405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 2731 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.077720207253886, + "grad_norm": 3.9856052055470923, + "kl": 0.0654296875, + "learning_rate": 2.924870466321243e-07, + "loss": 0.0001, + "reward": 1.999738335609436, + "reward_std": 2.8957919056438186e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499738484621048, + "step": 2732 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.080310880829016, + "grad_norm": 1.9355060975773766, + "kl": 0.087646484375, + "learning_rate": 2.922279792746114e-07, + "loss": 0.0008, + "reward": 2.49998140335083, + "reward_std": 8.472973490825098e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998140335083, + "step": 2733 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.082901554404145, + "grad_norm": 0.5181264329567411, + "kl": 0.09130859375, + "learning_rate": 2.9196891191709846e-07, + "loss": 0.0009, + "reward": 2.4999959468841553, + "reward_std": 3.9163684846243996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2734 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.085492227979275, + "grad_norm": 0.07651231618356177, + "kl": 0.109619140625, + "learning_rate": 2.9170984455958546e-07, + "loss": 0.0028, + "reward": 2.4999983310699463, + "reward_std": 1.3678148320650507e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2735 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.0880829015544045, + "grad_norm": 0.4407890458861952, + "kl": 0.04296875, + "learning_rate": 2.914507772020725e-07, + "loss": 0.0, + "reward": 2.4999935626983643, + "reward_std": 3.190141228515131e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 2736 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.090673575129534, + "grad_norm": 0.04223191398393086, + "kl": 0.08203125, + "learning_rate": 2.911917098445596e-07, + "loss": -0.0009, + "reward": 2.4999979734420776, + "reward_std": 1.1574470590858255e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2737 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.0932642487046635, + "grad_norm": 0.5977924135306688, + "kl": 0.1138916015625, + "learning_rate": 2.909326424870466e-07, + "loss": 0.0002, + "reward": 2.4999958276748657, + "reward_std": 2.531654786253057e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2738 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.095854922279793, + "grad_norm": 1.1187856816444965, + "kl": 0.087615966796875, + "learning_rate": 2.9067357512953367e-07, + "loss": -0.0004, + "reward": 2.4999934434890747, + "reward_std": 5.441550456453115e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 2739 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.098445595854923, + "grad_norm": 0.181339794909064, + "kl": 0.07666015625, + "learning_rate": 2.904145077720207e-07, + "loss": 0.0009, + "reward": 2.499997138977051, + "reward_std": 2.8207938385094167e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2740 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.101036269430052, + "grad_norm": 1.8952011768885928, + "kl": 0.098388671875, + "learning_rate": 2.901554404145077e-07, + "loss": 0.0014, + "reward": 1.9879651069641113, + "reward_std": 7.876610010271179e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.487965077161789, + "step": 2741 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.103626943005182, + "grad_norm": 0.13989088415350082, + "kl": 0.0423583984375, + "learning_rate": 2.8989637305699483e-07, + "loss": 0.0, + "reward": 2.4999983310699463, + "reward_std": 1.060065471847338e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 2742 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.106217616580311, + "grad_norm": 0.0847527845110646, + "kl": 0.094879150390625, + "learning_rate": 2.896373056994819e-07, + "loss": 0.0008, + "reward": 2.499994158744812, + "reward_std": 1.615656515241426e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 2743 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.108808290155441, + "grad_norm": 1.0443295628308287, + "kl": 0.0889892578125, + "learning_rate": 2.893782383419689e-07, + "loss": 0.0015, + "reward": 2.4999905824661255, + "reward_std": 5.229881651303003e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999905824661255, + "step": 2744 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.11139896373057, + "grad_norm": 0.19017654597433994, + "kl": 0.0601806640625, + "learning_rate": 2.8911917098445593e-07, + "loss": -0.0001, + "reward": 2.4999947547912598, + "reward_std": 3.2905971920627053e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 2745 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.1139896373057, + "grad_norm": 7.252029356117923, + "kl": 0.3916015625, + "learning_rate": 2.8886010362694304e-07, + "loss": 0.0014, + "reward": 0.9966006278991699, + "reward_std": 4.635082223103382e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.4966006875038147, + "step": 2746 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.116580310880829, + "grad_norm": 1.0658280584179984, + "kl": 0.12939453125, + "learning_rate": 2.8860103626943004e-07, + "loss": 0.0005, + "reward": 2.499993920326233, + "reward_std": 7.56164649828861e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 2747 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.119170984455959, + "grad_norm": 0.06060123665867208, + "kl": 0.0599365234375, + "learning_rate": 2.883419689119171e-07, + "loss": -0.0001, + "reward": 2.4999985694885254, + "reward_std": 1.1397911805488548e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 2748 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.1217616580310885, + "grad_norm": 1.7808468213637838, + "kl": 0.0894775390625, + "learning_rate": 2.8808290155440414e-07, + "loss": -0.0002, + "reward": 2.4999860525131226, + "reward_std": 1.2167330396550824e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999861121177673, + "step": 2749 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.124352331606218, + "grad_norm": 0.2592779569913539, + "kl": 0.02996826171875, + "learning_rate": 2.8782383419689114e-07, + "loss": 0.0006, + "reward": 2.4999935626983643, + "reward_std": 3.3797180662986648e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 2750 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.126943005181348, + "grad_norm": 0.9574018356140368, + "kl": 0.13037109375, + "learning_rate": 2.8756476683937825e-07, + "loss": 0.0012, + "reward": 1.9998550415039062, + "reward_std": 8.921896551328246e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998550713062286, + "step": 2751 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.129533678756476, + "grad_norm": 0.8898146314958704, + "kl": 0.0654296875, + "learning_rate": 2.873056994818653e-07, + "loss": -0.0006, + "reward": 2.4999972581863403, + "reward_std": 2.577872464826214e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 2752 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.132124352331606, + "grad_norm": 0.35463335869827056, + "kl": 0.0654296875, + "learning_rate": 2.870466321243523e-07, + "loss": 0.0001, + "reward": 2.499986171722412, + "reward_std": 2.66814964788864e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999860525131226, + "step": 2753 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.134715025906735, + "grad_norm": 5.288680850824816, + "kl": 0.0601806640625, + "learning_rate": 2.8678756476683936e-07, + "loss": 0.0012, + "reward": 2.4999794960021973, + "reward_std": 9.578609251548187e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999979555606842, + "step": 2754 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.137305699481865, + "grad_norm": 0.07022497342829802, + "kl": 0.07275390625, + "learning_rate": 2.865284974093264e-07, + "loss": 0.0009, + "reward": 2.4999988079071045, + "reward_std": 1.129997087900847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 2755 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.139896373056994, + "grad_norm": 1.3765696080231349, + "kl": 0.080322265625, + "learning_rate": 2.8626943005181346e-07, + "loss": 0.0009, + "reward": 2.499979019165039, + "reward_std": 7.035282578726765e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999790787696838, + "step": 2756 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.142487046632124, + "grad_norm": 2.29781823152281, + "kl": 0.12158203125, + "learning_rate": 2.860103626943005e-07, + "loss": 0.0011, + "reward": 2.499990940093994, + "reward_std": 8.408339567722578e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990999698639, + "step": 2757 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.1450777202072535, + "grad_norm": 0.20674407442124407, + "kl": 0.0584716796875, + "learning_rate": 2.8575129533678757e-07, + "loss": 0.0002, + "reward": 2.4999942779541016, + "reward_std": 3.489466394057672e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943375587463, + "step": 2758 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.147668393782383, + "grad_norm": 0.7257799207809277, + "kl": 0.11083984375, + "learning_rate": 2.8549222797927457e-07, + "loss": 0.0015, + "reward": 2.4999923706054688, + "reward_std": 3.956153989292943e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 2759 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.150259067357513, + "grad_norm": 29.63300940171338, + "kl": 0.177978515625, + "learning_rate": 2.8523316062176167e-07, + "loss": 0.0016, + "reward": 1.8276203870773315, + "reward_std": 0.0012136940254094952, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3276202380657196, + "step": 2760 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.152849740932642, + "grad_norm": 0.19096093063476446, + "kl": 0.04608154296875, + "learning_rate": 2.849740932642487e-07, + "loss": 0.0003, + "reward": 2.4999955892562866, + "reward_std": 2.302110999607976e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2761 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.155440414507772, + "grad_norm": 0.5649633649540509, + "kl": 0.20733642578125, + "learning_rate": 2.847150259067357e-07, + "loss": 0.0014, + "reward": 2.499995708465576, + "reward_std": 2.163669307719829e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 2762 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.158031088082901, + "grad_norm": 0.22806776661027514, + "kl": 0.085205078125, + "learning_rate": 2.844559585492228e-07, + "loss": 0.0016, + "reward": 2.4999929666519165, + "reward_std": 2.473032054695068e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929070472717, + "step": 2763 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.160621761658031, + "grad_norm": 0.09213882372241625, + "kl": 0.0931396484375, + "learning_rate": 2.8419689119170983e-07, + "loss": 0.0011, + "reward": 2.499998092651367, + "reward_std": 1.9166524793945428e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2764 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.16321243523316, + "grad_norm": 2.1349187459979397, + "kl": 0.0631103515625, + "learning_rate": 2.839378238341969e-07, + "loss": 0.0001, + "reward": 2.4999959468841553, + "reward_std": 3.810806560977653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 2765 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.16580310880829, + "grad_norm": 2.107267908411277, + "kl": 0.028533935546875, + "learning_rate": 2.8367875647668393e-07, + "loss": -0.0003, + "reward": 2.499994158744812, + "reward_std": 8.372982563287223e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 2766 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.168393782383419, + "grad_norm": 0.5832772870712557, + "kl": 0.3056640625, + "learning_rate": 2.83419689119171e-07, + "loss": 0.0015, + "reward": 1.9997992515563965, + "reward_std": 5.329875023107888e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997993111610413, + "step": 2767 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.170984455958549, + "grad_norm": 0.14402685031763152, + "kl": 0.0499267578125, + "learning_rate": 2.83160621761658e-07, + "loss": -0.0011, + "reward": 2.499997615814209, + "reward_std": 1.049370325745258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 2768 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.1735751295336785, + "grad_norm": 0.10125029651250043, + "kl": 0.059814453125, + "learning_rate": 2.8290155440414504e-07, + "loss": 0.0008, + "reward": 2.4999985694885254, + "reward_std": 1.4915083283995045e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 2769 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.176165803108808, + "grad_norm": 2.214707444395702, + "kl": 0.064697265625, + "learning_rate": 2.8264248704663215e-07, + "loss": 0.0007, + "reward": 2.4999823570251465, + "reward_std": 7.293636826943839e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 2770 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.178756476683938, + "grad_norm": 4.312535342757309, + "kl": 0.13623046875, + "learning_rate": 2.8238341968911915e-07, + "loss": 0.0012, + "reward": 2.4999715089797974, + "reward_std": 2.0916557105010725e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999715089797974, + "step": 2771 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.181347150259067, + "grad_norm": 0.11622333618380938, + "kl": 0.119873046875, + "learning_rate": 2.821243523316062e-07, + "loss": 0.0009, + "reward": 2.499998450279236, + "reward_std": 1.5286501593436697e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2772 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.183937823834197, + "grad_norm": 4.389352263912077, + "kl": 0.02313232421875, + "learning_rate": 2.8186528497409325e-07, + "loss": 0.0014, + "reward": 2.499983787536621, + "reward_std": 2.306203077750979e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999837279319763, + "step": 2773 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.186528497409326, + "grad_norm": 1.9015968840341313, + "kl": 0.128662109375, + "learning_rate": 2.816062176165803e-07, + "loss": -0.0002, + "reward": 2.4999916553497314, + "reward_std": 9.00349857602123e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 2774 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.189119170984456, + "grad_norm": 0.38769995654805667, + "kl": 0.16650390625, + "learning_rate": 2.8134715025906736e-07, + "loss": 0.0004, + "reward": 2.4999953508377075, + "reward_std": 1.5121324423716942e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 2775 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.191709844559585, + "grad_norm": 0.24253696915121573, + "kl": 0.060546875, + "learning_rate": 2.810880829015544e-07, + "loss": 0.0011, + "reward": 2.49999737739563, + "reward_std": 2.2189463493305084e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 2776 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.194300518134715, + "grad_norm": 0.9144746431960473, + "kl": 0.05029296875, + "learning_rate": 2.808290155440414e-07, + "loss": -0.0005, + "reward": 2.4999969005584717, + "reward_std": 2.0379158058858593e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 2777 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.196891191709844, + "grad_norm": 1.1453546550528448, + "kl": 0.0640869140625, + "learning_rate": 2.8056994818652846e-07, + "loss": -0.0, + "reward": 2.4999929666519165, + "reward_std": 6.042962752417225e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 2778 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.199481865284974, + "grad_norm": 0.2956146693268651, + "kl": 0.0657958984375, + "learning_rate": 2.8031088082901557e-07, + "loss": 0.0009, + "reward": 2.499996066093445, + "reward_std": 2.339157163078198e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2779 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.2020725388601035, + "grad_norm": 1.7690700844613938, + "kl": 0.060791015625, + "learning_rate": 2.8005181347150257e-07, + "loss": 0.0, + "reward": 2.4999927282333374, + "reward_std": 6.084984988774522e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 2780 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.204663212435233, + "grad_norm": 2.73677534225513, + "kl": 0.0924072265625, + "learning_rate": 2.797927461139896e-07, + "loss": -0.0007, + "reward": 2.4999855756759644, + "reward_std": 1.0224089237453882e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999855756759644, + "step": 2781 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.2072538860103625, + "grad_norm": 1.1981998138571988, + "kl": 0.094482421875, + "learning_rate": 2.7953367875647667e-07, + "loss": 0.0001, + "reward": 2.4999942779541016, + "reward_std": 4.210624297229515e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 2782 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.209844559585492, + "grad_norm": 5.968806271937427, + "kl": 0.056396484375, + "learning_rate": 2.792746113989637e-07, + "loss": -0.0005, + "reward": 2.4999773502349854, + "reward_std": 1.6791575944807846e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999977469444275, + "step": 2783 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.212435233160622, + "grad_norm": 8.778902647680013, + "kl": 0.1219482421875, + "learning_rate": 2.790155440414508e-07, + "loss": 0.0, + "reward": 2.4999791383743286, + "reward_std": 5.396075772523545e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791383743286, + "step": 2784 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 7.215025906735751, + "grad_norm": 24.861887796320637, + "kl": 0.121826171875, + "learning_rate": 2.7875647668393783e-07, + "loss": -0.0003, + "reward": 1.9924234747886658, + "reward_std": 8.40917636537597e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4924237132072449, + "step": 2785 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.217616580310881, + "grad_norm": 4.481088771903548, + "kl": 0.082763671875, + "learning_rate": 2.7849740932642483e-07, + "loss": 0.0003, + "reward": 2.499975562095642, + "reward_std": 2.1934815777058247e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999975562095642, + "step": 2786 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.22020725388601, + "grad_norm": 0.1936703331833292, + "kl": 0.115234375, + "learning_rate": 2.782383419689119e-07, + "loss": 0.0009, + "reward": 2.4999983310699463, + "reward_std": 1.1752094621897413e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 2787 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.22279792746114, + "grad_norm": 2.8082239298362524, + "kl": 0.0716552734375, + "learning_rate": 2.77979274611399e-07, + "loss": -0.0004, + "reward": 2.4999840259552, + "reward_std": 9.378438960538915e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999841451644897, + "step": 2788 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.225388601036269, + "grad_norm": 0.09971364772098981, + "kl": 0.0672607421875, + "learning_rate": 2.77720207253886e-07, + "loss": -0.0, + "reward": 2.499996066093445, + "reward_std": 1.9368695802768343e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 2789 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.227979274611399, + "grad_norm": 0.3689860131457773, + "kl": 0.08251953125, + "learning_rate": 2.7746113989637304e-07, + "loss": 0.0016, + "reward": 2.4999940395355225, + "reward_std": 3.666489533316053e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 2790 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.230569948186528, + "grad_norm": 0.647121482479705, + "kl": 0.212158203125, + "learning_rate": 2.772020725388601e-07, + "loss": 0.0023, + "reward": 2.499993681907654, + "reward_std": 5.7994617463918985e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 2791 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.233160621761658, + "grad_norm": 0.673668075156263, + "kl": 0.100830078125, + "learning_rate": 2.769430051813471e-07, + "loss": -0.0003, + "reward": 2.4999879598617554, + "reward_std": 3.9994324652070645e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881386756897, + "step": 2792 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.2357512953367875, + "grad_norm": 0.6408799456541552, + "kl": 0.1552734375, + "learning_rate": 2.766839378238342e-07, + "loss": 0.0008, + "reward": 2.4999784231185913, + "reward_std": 8.780219104664866e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999784231185913, + "step": 2793 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.238341968911917, + "grad_norm": 0.21763030173056042, + "kl": 0.1025390625, + "learning_rate": 2.7642487046632125e-07, + "loss": 0.0006, + "reward": 2.499996066093445, + "reward_std": 2.3939119273563847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 2794 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.240932642487047, + "grad_norm": 285.6986164805541, + "kl": 0.096923828125, + "learning_rate": 2.7616580310880825e-07, + "loss": 0.0001, + "reward": 1.8122249841690063, + "reward_std": 0.0006545914177422674, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3122249841690063, + "step": 2795 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.243523316062176, + "grad_norm": 1.1424234195270353, + "kl": 0.154052734375, + "learning_rate": 2.759067357512953e-07, + "loss": 0.0016, + "reward": 2.4999942779541016, + "reward_std": 5.0994269713555695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943375587463, + "step": 2796 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.246113989637306, + "grad_norm": 0.21947202862188261, + "kl": 0.070068359375, + "learning_rate": 2.756476683937824e-07, + "loss": 0.0001, + "reward": 2.499995708465576, + "reward_std": 3.2211644338531187e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2797 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.248704663212435, + "grad_norm": 0.09555870706533301, + "kl": 0.050048828125, + "learning_rate": 2.753886010362694e-07, + "loss": -0.0008, + "reward": 2.499997854232788, + "reward_std": 1.7976130379793176e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2798 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.251295336787565, + "grad_norm": 0.0807983102705631, + "kl": 0.12646484375, + "learning_rate": 2.7512953367875646e-07, + "loss": 0.0001, + "reward": 2.499994993209839, + "reward_std": 2.084456241391308e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994933605194, + "step": 2799 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.253886010362694, + "grad_norm": 0.3113020915404871, + "kl": 0.0484619140625, + "learning_rate": 2.748704663212435e-07, + "loss": -0.0002, + "reward": 2.499987483024597, + "reward_std": 4.419826723278675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999874830245972, + "step": 2800 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.256476683937824, + "grad_norm": 1.045442689806256, + "kl": 0.10205078125, + "learning_rate": 2.746113989637305e-07, + "loss": -0.0006, + "reward": 2.4999945163726807, + "reward_std": 4.731215085485019e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 2801 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.259067357512953, + "grad_norm": 0.8406275106284683, + "kl": 0.1376953125, + "learning_rate": 2.743523316062176e-07, + "loss": 0.0005, + "reward": 2.499984383583069, + "reward_std": 3.909989288786164e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984323978424, + "step": 2802 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.261658031088083, + "grad_norm": 0.4249658280498966, + "kl": 0.076904296875, + "learning_rate": 2.7409326424870467e-07, + "loss": 0.0009, + "reward": 2.4999955892562866, + "reward_std": 2.975777761093923e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 2803 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.2642487046632125, + "grad_norm": 0.14715451922718406, + "kl": 0.12939453125, + "learning_rate": 2.7383419689119167e-07, + "loss": -0.0002, + "reward": 2.499997615814209, + "reward_std": 9.016723936383642e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 2804 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.266839378238342, + "grad_norm": 2.1630849775720002, + "kl": 0.17138671875, + "learning_rate": 2.735751295336787e-07, + "loss": 0.0001, + "reward": 1.9972917437553406, + "reward_std": 3.55871564465815e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4972918629646301, + "step": 2805 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.269430051813472, + "grad_norm": 0.11947457944026632, + "kl": 0.099853515625, + "learning_rate": 2.7331606217616583e-07, + "loss": 0.0007, + "reward": 2.499997854232788, + "reward_std": 1.527620099750493e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2806 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.272020725388601, + "grad_norm": 69.84226701683895, + "kl": 1.21722412109375, + "learning_rate": 2.7305699481865283e-07, + "loss": 0.0037, + "reward": 1.946344256401062, + "reward_std": 0.002567103309502272, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4463444352149963, + "step": 2807 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.274611398963731, + "grad_norm": 0.30733205815695913, + "kl": 0.0859375, + "learning_rate": 2.727979274611399e-07, + "loss": 0.0001, + "reward": 2.499997615814209, + "reward_std": 3.0245171274145832e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2808 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.27720207253886, + "grad_norm": 0.12606911107675, + "kl": 0.112548828125, + "learning_rate": 2.7253886010362694e-07, + "loss": 0.0, + "reward": 2.499998092651367, + "reward_std": 1.3479630638357776e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2809 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.27979274611399, + "grad_norm": 0.4592249794749426, + "kl": 0.098388671875, + "learning_rate": 2.7227979274611393e-07, + "loss": 0.0002, + "reward": 2.499995708465576, + "reward_std": 3.842052819891251e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 2810 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.282383419689119, + "grad_norm": 9.682477477219402, + "kl": 0.137451171875, + "learning_rate": 2.7202072538860104e-07, + "loss": -0.0002, + "reward": 1.8723435401916504, + "reward_std": 0.0008437733761752497, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3723435997962952, + "step": 2811 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.284974093264249, + "grad_norm": 0.17003074781660135, + "kl": 0.09912109375, + "learning_rate": 2.717616580310881e-07, + "loss": 0.001, + "reward": 2.499994158744812, + "reward_std": 1.7374125889091374e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 2812 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.287564766839378, + "grad_norm": 0.12318931287536186, + "kl": 0.0751953125, + "learning_rate": 2.715025906735751e-07, + "loss": 0.001, + "reward": 2.4999974966049194, + "reward_std": 2.005502267365955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 2813 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.290155440414508, + "grad_norm": 0.08174513535293738, + "kl": 0.09326171875, + "learning_rate": 2.7124352331606215e-07, + "loss": 0.0018, + "reward": 2.4999969005584717, + "reward_std": 1.9077306774306635e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2814 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.2927461139896375, + "grad_norm": 0.04045268804357678, + "kl": 0.1337890625, + "learning_rate": 2.709844559585492e-07, + "loss": -0.0001, + "reward": 2.4999942779541016, + "reward_std": 1.423873243311391e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 2815 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.295336787564767, + "grad_norm": 0.3050327924472231, + "kl": 0.13232421875, + "learning_rate": 2.7072538860103625e-07, + "loss": -0.0002, + "reward": 2.4999977350234985, + "reward_std": 1.9395337176320027e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 2816 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.2979274611398965, + "grad_norm": 0.12169573122756865, + "kl": 0.101287841796875, + "learning_rate": 2.704663212435233e-07, + "loss": 0.0005, + "reward": 2.4999970197677612, + "reward_std": 2.3166951450548368e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 2817 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.300518134715026, + "grad_norm": 53.71103215649146, + "kl": 0.08984375, + "learning_rate": 2.7020725388601036e-07, + "loss": 0.0005, + "reward": 2.4999101161956787, + "reward_std": 2.539685272040515e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999101758003235, + "step": 2818 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.303108808290156, + "grad_norm": 9.489807047133581, + "kl": 0.2138671875, + "learning_rate": 2.6994818652849736e-07, + "loss": 0.0009, + "reward": 1.8930606842041016, + "reward_std": 0.000643064509404212, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3930606842041016, + "step": 2819 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.305699481865285, + "grad_norm": 0.11036868030547739, + "kl": 0.127685546875, + "learning_rate": 2.6968911917098446e-07, + "loss": -0.0002, + "reward": 2.4999959468841553, + "reward_std": 2.034824944985303e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 2820 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.308290155440415, + "grad_norm": 0.0978229157023234, + "kl": 0.02239990234375, + "learning_rate": 2.694300518134715e-07, + "loss": -0.0003, + "reward": 2.4999887943267822, + "reward_std": 1.3666916061083612e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999889135360718, + "step": 2821 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 7.310880829015544, + "grad_norm": 37.09775342321396, + "kl": 0.0986328125, + "learning_rate": 2.691709844559585e-07, + "loss": -0.0, + "reward": 1.96336430311203, + "reward_std": 0.011848652355411105, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4633644223213196, + "step": 2822 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.313471502590674, + "grad_norm": 0.536238910666692, + "kl": 0.05078125, + "learning_rate": 2.6891191709844557e-07, + "loss": -0.0007, + "reward": 2.499995470046997, + "reward_std": 3.5237948168287403e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 2823 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.316062176165803, + "grad_norm": 5.597527111946226, + "kl": 1.6572265625, + "learning_rate": 2.686528497409326e-07, + "loss": 0.0074, + "reward": 2.49999737739563, + "reward_std": 4.774386638928263e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2824 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.318652849740933, + "grad_norm": 0.062350314297028864, + "kl": 0.04925537109375, + "learning_rate": 2.6839378238341967e-07, + "loss": 0.0005, + "reward": 2.4999988079071045, + "reward_std": 8.074643460531661e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 2825 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.321243523316062, + "grad_norm": 1.5731215355287473, + "kl": 0.1396484375, + "learning_rate": 2.681347150259067e-07, + "loss": 0.0009, + "reward": 1.9988993406295776, + "reward_std": 5.0001223939943884e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988993704319, + "step": 2826 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.323834196891192, + "grad_norm": 0.020898047196269424, + "kl": 0.080810546875, + "learning_rate": 2.678756476683938e-07, + "loss": -0.0001, + "reward": 2.49999737739563, + "reward_std": 4.913568147912883e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2827 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.3264248704663215, + "grad_norm": 6.411696719267693, + "kl": 0.1953125, + "learning_rate": 2.676165803108808e-07, + "loss": 0.0003, + "reward": 1.9919129610061646, + "reward_std": 0.00013535398539943344, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4919130206108093, + "step": 2828 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.329015544041451, + "grad_norm": 0.1719146435688225, + "kl": 0.10693359375, + "learning_rate": 2.6735751295336783e-07, + "loss": 0.0017, + "reward": 2.4999977350234985, + "reward_std": 1.578277363023517e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2829 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.331606217616581, + "grad_norm": 0.09190455201668839, + "kl": 0.027435302734375, + "learning_rate": 2.6709844559585494e-07, + "loss": -0.0011, + "reward": 2.4999961853027344, + "reward_std": 1.9547299530131568e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 2830 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.33419689119171, + "grad_norm": 19.49882902581193, + "kl": 0.2447509765625, + "learning_rate": 2.6683937823834194e-07, + "loss": 0.0008, + "reward": 1.8961536884307861, + "reward_std": 0.000317055290679491, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3961536884307861, + "step": 2831 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.33678756476684, + "grad_norm": 0.11290942689365722, + "kl": 0.1258544921875, + "learning_rate": 2.66580310880829e-07, + "loss": 0.0011, + "reward": 2.499996781349182, + "reward_std": 1.9968296669503616e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 2832 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.339378238341969, + "grad_norm": 7.711760924911983, + "kl": 2.45703125, + "learning_rate": 2.6632124352331604e-07, + "loss": 0.0093, + "reward": 1.9983811378479004, + "reward_std": 2.9145689211418357e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49838125705719, + "step": 2833 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.341968911917099, + "grad_norm": 0.07971531553980153, + "kl": 0.04437255859375, + "learning_rate": 2.6606217616580315e-07, + "loss": -0.0001, + "reward": 2.499996781349182, + "reward_std": 1.6447785355921951e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2834 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.344559585492228, + "grad_norm": 1.256762118337873, + "kl": 0.0859375, + "learning_rate": 2.6580310880829015e-07, + "loss": 0.0005, + "reward": 2.4999905824661255, + "reward_std": 5.3548238270195725e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 2835 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.347150259067358, + "grad_norm": 2.4568200693367257, + "kl": 0.1611328125, + "learning_rate": 2.655440414507772e-07, + "loss": 0.0011, + "reward": 1.9940448999404907, + "reward_std": 4.381606322567677e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4940448701381683, + "step": 2836 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.349740932642487, + "grad_norm": 5.807578965526504, + "kl": 0.1845703125, + "learning_rate": 2.652849740932642e-07, + "loss": 0.0007, + "reward": 1.974083662033081, + "reward_std": 0.00013516989685058434, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4740836024284363, + "step": 2837 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.352331606217617, + "grad_norm": 0.28473482585784043, + "kl": 0.09698486328125, + "learning_rate": 2.6502590673575125e-07, + "loss": 0.0005, + "reward": 2.4999853372573853, + "reward_std": 4.394527081785782e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999852180480957, + "step": 2838 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.3549222797927465, + "grad_norm": 0.8149931408037281, + "kl": 0.05291748046875, + "learning_rate": 2.6476683937823836e-07, + "loss": -0.0002, + "reward": 2.499993324279785, + "reward_std": 2.9201254392319242e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 2839 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 7.357512953367876, + "grad_norm": 0.23416833104159884, + "kl": 0.093017578125, + "learning_rate": 2.645077720207254e-07, + "loss": 0.0014, + "reward": 2.4999958276748657, + "reward_std": 2.33332400512154e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 2840 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.360103626943006, + "grad_norm": 0.9815129000860943, + "kl": 0.0684814453125, + "learning_rate": 2.642487046632124e-07, + "loss": 0.0007, + "reward": 2.4999871253967285, + "reward_std": 7.650276756976382e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999871850013733, + "step": 2841 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.362694300518135, + "grad_norm": 0.13989188310038264, + "kl": 0.17578125, + "learning_rate": 2.6398963730569946e-07, + "loss": 0.0001, + "reward": 2.4999905824661255, + "reward_std": 4.645750323106768e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990701675415, + "step": 2842 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.365284974093265, + "grad_norm": 3.318404663999125, + "kl": 0.125244140625, + "learning_rate": 2.6373056994818657e-07, + "loss": 0.0012, + "reward": 2.249942898750305, + "reward_std": 0.26726726118313593, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499428391456604, + "step": 2843 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.367875647668393, + "grad_norm": 0.15279163201083765, + "kl": 0.091552734375, + "learning_rate": 2.6347150259067357e-07, + "loss": 0.0004, + "reward": 2.499998450279236, + "reward_std": 1.2477871109695116e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2844 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.370466321243523, + "grad_norm": 0.10827347688617023, + "kl": 0.126708984375, + "learning_rate": 2.632124352331606e-07, + "loss": 0.0002, + "reward": 2.499998092651367, + "reward_std": 1.55017215774933e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 2845 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 7.373056994818652, + "grad_norm": 0.35876915240341717, + "kl": 0.112060546875, + "learning_rate": 2.6295336787564767e-07, + "loss": 0.001, + "reward": 2.4999982118606567, + "reward_std": 1.2198702279420104e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 2846 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.375647668393782, + "grad_norm": 0.4763062837390196, + "kl": 0.17822265625, + "learning_rate": 2.6269430051813467e-07, + "loss": 0.0015, + "reward": 2.499993920326233, + "reward_std": 4.087596380486502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 2847 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.3782383419689115, + "grad_norm": 0.5201992278869353, + "kl": 0.04827880859375, + "learning_rate": 2.624352331606218e-07, + "loss": -0.0009, + "reward": 2.499990940093994, + "reward_std": 3.921487831348713e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991238117218, + "step": 2848 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.380829015544041, + "grad_norm": 0.2649265145299683, + "kl": 0.069091796875, + "learning_rate": 2.6217616580310883e-07, + "loss": -0.0, + "reward": 2.499995231628418, + "reward_std": 2.3153948518483958e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 2849 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.383419689119171, + "grad_norm": 288.0289965221196, + "kl": 0.1019287109375, + "learning_rate": 2.6191709844559583e-07, + "loss": 0.0007, + "reward": 1.9900956749916077, + "reward_std": 0.002839759652260909, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4900956749916077, + "step": 2850 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.3860103626943, + "grad_norm": 0.1173526664802589, + "kl": 0.0849609375, + "learning_rate": 2.616580310880829e-07, + "loss": -0.0004, + "reward": 2.499998450279236, + "reward_std": 1.1589862651817384e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 2851 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.38860103626943, + "grad_norm": 1.9513600650137177, + "kl": 0.08984375, + "learning_rate": 2.6139896373056994e-07, + "loss": -0.0008, + "reward": 2.499987006187439, + "reward_std": 7.780848591210088e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999869465827942, + "step": 2852 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.391191709844559, + "grad_norm": 1.44185550229638, + "kl": 0.09326171875, + "learning_rate": 2.61139896373057e-07, + "loss": 0.0013, + "reward": 2.499987483024597, + "reward_std": 7.910099952823657e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999874234199524, + "step": 2853 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.393782383419689, + "grad_norm": 0.06556698368029117, + "kl": 0.0498046875, + "learning_rate": 2.6088082901554404e-07, + "loss": 0.0001, + "reward": 2.4999953508377075, + "reward_std": 1.8432132549150992e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 2854 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.396373056994818, + "grad_norm": 2.621511905400639, + "kl": 0.0667724609375, + "learning_rate": 2.606217616580311e-07, + "loss": -0.0012, + "reward": 2.4999759197235107, + "reward_std": 1.0496055665498716e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999761581420898, + "step": 2855 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.398963730569948, + "grad_norm": 0.8053155909446992, + "kl": 0.0458984375, + "learning_rate": 2.603626943005181e-07, + "loss": 0.0012, + "reward": 2.499986171722412, + "reward_std": 7.358378184108005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999860525131226, + "step": 2856 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.401554404145077, + "grad_norm": 0.06987777668149876, + "kl": 0.142578125, + "learning_rate": 2.601036269430052e-07, + "loss": 0.0003, + "reward": 2.49999737739563, + "reward_std": 1.764308791507574e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 2857 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.404145077720207, + "grad_norm": 37.506404446012226, + "kl": 0.5703125, + "learning_rate": 2.5984455958549225e-07, + "loss": 0.0023, + "reward": 1.3604300022125244, + "reward_std": 0.0005023884441470727, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8604300618171692, + "step": 2858 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.4067357512953365, + "grad_norm": 0.1109195552672259, + "kl": 0.134521484375, + "learning_rate": 2.5958549222797925e-07, + "loss": 0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.3841492432220548e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2859 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.409326424870466, + "grad_norm": 0.15158453087956889, + "kl": 0.04876708984375, + "learning_rate": 2.593264248704663e-07, + "loss": -0.0007, + "reward": 2.49999463558197, + "reward_std": 1.3965968150841945e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 2860 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.4119170984455955, + "grad_norm": 5.299770011902868, + "kl": 0.130615234375, + "learning_rate": 2.5906735751295336e-07, + "loss": 0.0005, + "reward": 1.2727726697921753, + "reward_std": 0.0013119927025400102, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7727727293968201, + "step": 2861 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.414507772020725, + "grad_norm": 1.4925445805339939, + "kl": 0.0908203125, + "learning_rate": 2.588082901554404e-07, + "loss": 0.0011, + "reward": 2.4999911785125732, + "reward_std": 8.532417041351437e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999910593032837, + "step": 2862 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.417098445595855, + "grad_norm": 8.510300410315063, + "kl": 0.13916015625, + "learning_rate": 2.5854922279792746e-07, + "loss": 0.0007, + "reward": 1.9913722276687622, + "reward_std": 0.00010243407575671881, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4913722276687622, + "step": 2863 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.419689119170984, + "grad_norm": 19.04575181563735, + "kl": 0.0484619140625, + "learning_rate": 2.582901554404145e-07, + "loss": 0.0008, + "reward": 1.8201225996017456, + "reward_std": 0.0006248899346701364, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3201225996017456, + "step": 2864 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.422279792746114, + "grad_norm": 0.20395293465420408, + "kl": 0.093505859375, + "learning_rate": 2.580310880829015e-07, + "loss": -0.0008, + "reward": 2.4999972581863403, + "reward_std": 3.359025754434697e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2865 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.424870466321243, + "grad_norm": 0.5647746461380362, + "kl": 0.0791015625, + "learning_rate": 2.577720207253886e-07, + "loss": 0.0006, + "reward": 1.9998557567596436, + "reward_std": 1.1571712093427777e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998558163642883, + "step": 2866 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.427461139896373, + "grad_norm": 0.21336476858258724, + "kl": 0.1552734375, + "learning_rate": 2.5751295336787567e-07, + "loss": -0.0004, + "reward": 2.49999737739563, + "reward_std": 2.7400626549933804e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 2867 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.430051813471502, + "grad_norm": 0.2979459148216057, + "kl": 0.06512451171875, + "learning_rate": 2.5725388601036267e-07, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 3.4764248653118557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 2868 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.432642487046632, + "grad_norm": 0.8884117150896411, + "kl": 0.11572265625, + "learning_rate": 2.569948186528497e-07, + "loss": -0.0002, + "reward": 2.4999927282333374, + "reward_std": 5.68606296269536e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 2869 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.435233160621761, + "grad_norm": 0.2898326676854063, + "kl": 0.0966796875, + "learning_rate": 2.567357512953368e-07, + "loss": -0.0, + "reward": 2.4999918937683105, + "reward_std": 3.7975955962110675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 2870 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.437823834196891, + "grad_norm": 0.07210369819765641, + "kl": 0.052001953125, + "learning_rate": 2.5647668393782383e-07, + "loss": -0.0001, + "reward": 2.4999966621398926, + "reward_std": 2.071714789053658e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 2871 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.4404145077720205, + "grad_norm": 0.013627292162816393, + "kl": 0.044921875, + "learning_rate": 2.562176165803109e-07, + "loss": 0.0, + "reward": 2.499998927116394, + "reward_std": 8.017928365688931e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999989867210388, + "step": 2872 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.44300518134715, + "grad_norm": 0.6524422920561326, + "kl": 0.04150390625, + "learning_rate": 2.5595854922279794e-07, + "loss": -0.0001, + "reward": 2.499988317489624, + "reward_std": 4.971472264969634e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999884366989136, + "step": 2873 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.44559585492228, + "grad_norm": 1.4979663538093417, + "kl": 0.125, + "learning_rate": 2.5569948186528494e-07, + "loss": 0.0012, + "reward": 2.4999948740005493, + "reward_std": 3.2993613103826647e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 2874 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.448186528497409, + "grad_norm": 4.017957786491282, + "kl": 0.1334228515625, + "learning_rate": 2.55440414507772e-07, + "loss": 0.0002, + "reward": 1.9960945844650269, + "reward_std": 9.592810960157294e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.496094524860382, + "step": 2875 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.450777202072539, + "grad_norm": 0.1080066074399531, + "kl": 0.12646484375, + "learning_rate": 2.551813471502591e-07, + "loss": 0.0015, + "reward": 2.499992609024048, + "reward_std": 1.7547014294905239e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 2876 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.453367875647668, + "grad_norm": 0.8093916810949638, + "kl": 0.0733642578125, + "learning_rate": 2.549222797927461e-07, + "loss": -0.0013, + "reward": 2.4999940395355225, + "reward_std": 3.627881937973143e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 2877 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.455958549222798, + "grad_norm": 2.1230621929526965, + "kl": 0.54541015625, + "learning_rate": 2.5466321243523315e-07, + "loss": 0.0027, + "reward": 2.499990224838257, + "reward_std": 1.284029417547572e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990165233612, + "step": 2878 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.458549222797927, + "grad_norm": 4.1443684538405865, + "kl": 0.114013671875, + "learning_rate": 2.544041450777202e-07, + "loss": -0.0005, + "reward": 1.9984803795814514, + "reward_std": 4.798860277333006e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984804391860962, + "step": 2879 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.461139896373057, + "grad_norm": 1.70317119220248, + "kl": 0.14111328125, + "learning_rate": 2.5414507772020725e-07, + "loss": 0.001, + "reward": 1.6802762150764465, + "reward_std": 0.0002713516473704658, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1802761852741241, + "step": 2880 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.463730569948186, + "grad_norm": 0.30216547334881655, + "kl": 0.0543212890625, + "learning_rate": 2.538860103626943e-07, + "loss": 0.0012, + "reward": 2.499996542930603, + "reward_std": 3.7160941133151937e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 2881 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 7.466321243523316, + "grad_norm": 25.731217502639, + "kl": 0.1552734375, + "learning_rate": 2.5362694300518136e-07, + "loss": 0.0006, + "reward": 1.9172991514205933, + "reward_std": 0.20589225432195235, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4172993302345276, + "step": 2882 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.4689119170984455, + "grad_norm": 0.2907952205794895, + "kl": 0.15673828125, + "learning_rate": 2.5336787564766836e-07, + "loss": 0.0014, + "reward": 2.499997615814209, + "reward_std": 1.9038781999825005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2883 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.471502590673575, + "grad_norm": 0.8748229999303786, + "kl": 0.21337890625, + "learning_rate": 2.531088082901554e-07, + "loss": 0.0011, + "reward": 2.499997854232788, + "reward_std": 1.8593351569506922e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2884 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.474093264248705, + "grad_norm": 0.3785352925742591, + "kl": 0.10205078125, + "learning_rate": 2.528497409326425e-07, + "loss": -0.0005, + "reward": 2.4999947547912598, + "reward_std": 3.2770421398709004e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 2885 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.476683937823834, + "grad_norm": 0.09845310238529756, + "kl": 0.095062255859375, + "learning_rate": 2.525906735751295e-07, + "loss": 0.0002, + "reward": 2.499998092651367, + "reward_std": 1.4673870225578867e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2886 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.479274611398964, + "grad_norm": 14.307335584948014, + "kl": 0.0794677734375, + "learning_rate": 2.5233160621761657e-07, + "loss": 0.0008, + "reward": 1.9043012857437134, + "reward_std": 0.00037970663970554597, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.404301255941391, + "step": 2887 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.481865284974093, + "grad_norm": 0.2184688879426897, + "kl": 0.056640625, + "learning_rate": 2.520725388601036e-07, + "loss": 0.0011, + "reward": 2.499990224838257, + "reward_std": 3.5880742643712438e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999901056289673, + "step": 2888 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.484455958549223, + "grad_norm": 0.42497084977107596, + "kl": 0.091796875, + "learning_rate": 2.518134715025906e-07, + "loss": -0.0005, + "reward": 2.4999884366989136, + "reward_std": 4.58022714155959e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999885559082031, + "step": 2889 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.487046632124352, + "grad_norm": 7.952990113739974, + "kl": 0.0823974609375, + "learning_rate": 2.515544041450777e-07, + "loss": -0.0001, + "reward": 1.9968880414962769, + "reward_std": 0.0001731456113702734, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4968880712985992, + "step": 2890 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.489637305699482, + "grad_norm": 1.0640255431697039, + "kl": 0.052978515625, + "learning_rate": 2.512953367875648e-07, + "loss": -0.0008, + "reward": 2.4999929666519165, + "reward_std": 7.728891546321393e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 2891 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.492227979274611, + "grad_norm": 0.8704568599676987, + "kl": 0.105224609375, + "learning_rate": 2.510362694300518e-07, + "loss": 0.0014, + "reward": 2.499979019165039, + "reward_std": 1.1728319805115461e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999787211418152, + "step": 2892 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.494818652849741, + "grad_norm": 0.13026421165973223, + "kl": 0.06805419921875, + "learning_rate": 2.5077720207253883e-07, + "loss": 0.001, + "reward": 2.499996781349182, + "reward_std": 2.5490197117505886e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2893 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.4974093264248705, + "grad_norm": 0.9377082327832156, + "kl": 0.15380859375, + "learning_rate": 2.5051813471502594e-07, + "loss": -0.0002, + "reward": 2.499985933303833, + "reward_std": 6.524010359498789e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999860525131226, + "step": 2894 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.5, + "grad_norm": 0.13471864888755444, + "kl": 0.0858154296875, + "learning_rate": 2.5025906735751294e-07, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 2.9680402349185897e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 2895 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.3125, + "epoch": 7.5025906735751295, + "grad_norm": 1.8846261151917099, + "kl": 0.3349609375, + "learning_rate": 2.5e-07, + "loss": 0.0027, + "reward": 2.4999828338623047, + "reward_std": 1.1658462881314335e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999825954437256, + "step": 2896 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.505181347150259, + "grad_norm": 0.2708193347710579, + "kl": 0.061279296875, + "learning_rate": 2.4974093264248704e-07, + "loss": 0.0001, + "reward": 2.4999938011169434, + "reward_std": 2.8499486006694497e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 2897 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.507772020725389, + "grad_norm": 0.04815339326362499, + "kl": 0.140869140625, + "learning_rate": 2.494818652849741e-07, + "loss": 0.0014, + "reward": 2.499998450279236, + "reward_std": 1.033983778597758e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2898 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.510362694300518, + "grad_norm": 2.7722791148945602, + "kl": 0.1064453125, + "learning_rate": 2.4922279792746115e-07, + "loss": 0.0009, + "reward": 1.9997743368148804, + "reward_std": 2.119692885571567e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997743368148804, + "step": 2899 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.512953367875648, + "grad_norm": 0.5106796919303661, + "kl": 0.0389404296875, + "learning_rate": 2.489637305699482e-07, + "loss": 0.0002, + "reward": 2.4999722242355347, + "reward_std": 6.893227919135825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999720454216003, + "step": 2900 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.515544041450777, + "grad_norm": 1.1406057113302723, + "kl": 0.0628662109375, + "learning_rate": 2.487046632124352e-07, + "loss": -0.0001, + "reward": 1.997680902481079, + "reward_std": 5.234319459646031e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4976809322834015, + "step": 2901 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.518134715025907, + "grad_norm": 0.09218057556949211, + "kl": 0.138427734375, + "learning_rate": 2.484455958549223e-07, + "loss": -0.0, + "reward": 2.4999964237213135, + "reward_std": 1.6774215509940404e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 2902 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.520725388601036, + "grad_norm": 0.6248003374584781, + "kl": 0.091064453125, + "learning_rate": 2.481865284974093e-07, + "loss": 0.001, + "reward": 2.4999860525131226, + "reward_std": 5.481745461111132e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999861121177673, + "step": 2903 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.523316062176166, + "grad_norm": 0.4463045347032181, + "kl": 0.1474609375, + "learning_rate": 2.4792746113989636e-07, + "loss": 0.0, + "reward": 2.499995470046997, + "reward_std": 3.0378723749890924e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 2904 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.525906735751295, + "grad_norm": 28.84757306523627, + "kl": 0.15673828125, + "learning_rate": 2.476683937823834e-07, + "loss": 0.0006, + "reward": 2.249874472618103, + "reward_std": 0.2673055271868634, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.749874472618103, + "step": 2905 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.528497409326425, + "grad_norm": 0.15072331791402271, + "kl": 0.04351806640625, + "learning_rate": 2.4740932642487046e-07, + "loss": 0.0013, + "reward": 2.4999953508377075, + "reward_std": 2.764710757219291e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 2906 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.5310880829015545, + "grad_norm": 26.758720319038975, + "kl": 0.0947265625, + "learning_rate": 2.471502590673575e-07, + "loss": 0.0014, + "reward": 1.887969434261322, + "reward_std": 0.0015671426558583335, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3879693150520325, + "step": 2907 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.533678756476684, + "grad_norm": 1.9170584304445994, + "kl": 0.082275390625, + "learning_rate": 2.4689119170984457e-07, + "loss": -0.0005, + "reward": 2.499989867210388, + "reward_std": 4.5294372057469445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999900460243225, + "step": 2908 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.536269430051814, + "grad_norm": 0.2996648625795484, + "kl": 0.05059814453125, + "learning_rate": 2.466321243523316e-07, + "loss": -0.0004, + "reward": 2.4999942779541016, + "reward_std": 2.558760286319739e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945163726807, + "step": 2909 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.538860103626943, + "grad_norm": 5.79745374157687, + "kl": 0.17822265625, + "learning_rate": 2.463730569948186e-07, + "loss": 0.0009, + "reward": 2.499985456466675, + "reward_std": 6.584182983715436e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 2910 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.541450777202073, + "grad_norm": 0.5010945560815133, + "kl": 0.079345703125, + "learning_rate": 2.4611398963730567e-07, + "loss": -0.0003, + "reward": 2.499997138977051, + "reward_std": 2.179568355131778e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 2911 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.544041450777202, + "grad_norm": 13.564812207755466, + "kl": 0.099853515625, + "learning_rate": 2.458549222797927e-07, + "loss": 0.0006, + "reward": 1.994426965713501, + "reward_std": 0.00014078659461347343, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4944270253181458, + "step": 2912 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.546632124352332, + "grad_norm": 37.44207645858213, + "kl": 0.169921875, + "learning_rate": 2.455958549222798e-07, + "loss": 0.0005, + "reward": 1.996111810207367, + "reward_std": 6.63758300589734e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.496111810207367, + "step": 2913 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.549222797927461, + "grad_norm": 6.510331738359985, + "kl": 0.084228515625, + "learning_rate": 2.4533678756476683e-07, + "loss": 0.0001, + "reward": 2.4999890327453613, + "reward_std": 6.922352440597024e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999889135360718, + "step": 2914 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.551813471502591, + "grad_norm": 0.422331861546378, + "kl": 0.05718994140625, + "learning_rate": 2.450777202072539e-07, + "loss": 0.0011, + "reward": 2.49998140335083, + "reward_std": 5.3374250228444e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999815225601196, + "step": 2915 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.55440414507772, + "grad_norm": 5.557996168557265, + "kl": 0.154541015625, + "learning_rate": 2.4481865284974094e-07, + "loss": 0.0003, + "reward": 1.7923298478126526, + "reward_std": 0.0005259371980628202, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2923299670219421, + "step": 2916 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.55699481865285, + "grad_norm": 24.26915595716572, + "kl": 0.0728759765625, + "learning_rate": 2.44559585492228e-07, + "loss": 0.0013, + "reward": 2.1874327659606934, + "reward_std": 0.25881336485167594, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.687432587146759, + "step": 2917 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.5595854922279795, + "grad_norm": 1.1142526531240982, + "kl": 0.155517578125, + "learning_rate": 2.4430051813471504e-07, + "loss": 0.0009, + "reward": 2.4999958276748657, + "reward_std": 5.5810438652770245e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 2918 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.562176165803109, + "grad_norm": 0.771435129953307, + "kl": 0.0540771484375, + "learning_rate": 2.4404145077720204e-07, + "loss": 0.0006, + "reward": 2.4999858140945435, + "reward_std": 6.237439947653911e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999860525131226, + "step": 2919 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.564766839378239, + "grad_norm": 1.5585308077868703, + "kl": 0.08203125, + "learning_rate": 2.437823834196891e-07, + "loss": 0.0012, + "reward": 2.499992609024048, + "reward_std": 7.950610552143189e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 2920 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.567357512953368, + "grad_norm": 0.550049483203788, + "kl": 0.08050537109375, + "learning_rate": 2.4352331606217615e-07, + "loss": -0.0011, + "reward": 2.499992847442627, + "reward_std": 5.1874315545319405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927878379822, + "step": 2921 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.569948186528498, + "grad_norm": 6.266307374449151, + "kl": 0.11712646484375, + "learning_rate": 2.432642487046632e-07, + "loss": 0.0006, + "reward": 1.998780608177185, + "reward_std": 3.144343020267115e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987806379795074, + "step": 2922 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.572538860103627, + "grad_norm": 10.941306914803079, + "kl": 0.15478515625, + "learning_rate": 2.4300518134715025e-07, + "loss": 0.0006, + "reward": 1.9986087679862976, + "reward_std": 0.00010592287355848384, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49860879778862, + "step": 2923 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.575129533678757, + "grad_norm": 0.6572794132883955, + "kl": 0.03253173828125, + "learning_rate": 2.427461139896373e-07, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 1.7473318507654767e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 2924 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.577720207253886, + "grad_norm": 0.18651234912245118, + "kl": 0.083984375, + "learning_rate": 2.4248704663212436e-07, + "loss": 0.0002, + "reward": 2.4999860525131226, + "reward_std": 2.688864469746477e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999986171722412, + "step": 2925 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.580310880829016, + "grad_norm": 0.12622445689324088, + "kl": 0.0965576171875, + "learning_rate": 2.422279792746114e-07, + "loss": 0.0014, + "reward": 2.4999990463256836, + "reward_std": 8.986923774045863e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999989867210388, + "step": 2926 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.582901554404145, + "grad_norm": 0.15322209983474763, + "kl": 0.083984375, + "learning_rate": 2.419689119170984e-07, + "loss": 0.0012, + "reward": 2.499995708465576, + "reward_std": 2.3700629299128195e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 2927 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.585492227979275, + "grad_norm": 1.4186418613317566, + "kl": 0.0859375, + "learning_rate": 2.4170984455958546e-07, + "loss": 0.001, + "reward": 2.4999895095825195, + "reward_std": 7.445257551808027e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999892711639404, + "step": 2928 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.5880829015544045, + "grad_norm": 0.08167462650580458, + "kl": 0.09521484375, + "learning_rate": 2.414507772020725e-07, + "loss": -0.0009, + "reward": 2.4999983310699463, + "reward_std": 8.38005377090667e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 2929 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.590673575129534, + "grad_norm": 0.09231319213669713, + "kl": 0.12158203125, + "learning_rate": 2.4119170984455957e-07, + "loss": -0.0001, + "reward": 2.4999979734420776, + "reward_std": 1.8866881532630941e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 2930 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.5932642487046635, + "grad_norm": 0.043819879162593965, + "kl": 0.069580078125, + "learning_rate": 2.409326424870466e-07, + "loss": -0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.5304378280234232e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2931 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.595854922279793, + "grad_norm": 0.6299510461029266, + "kl": 0.0718994140625, + "learning_rate": 2.4067357512953367e-07, + "loss": 0.001, + "reward": 2.4999934434890747, + "reward_std": 6.542053256453073e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 2932 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.598445595854923, + "grad_norm": 0.3244237620881964, + "kl": 0.08203125, + "learning_rate": 2.404145077720207e-07, + "loss": 0.0009, + "reward": 2.499996542930603, + "reward_std": 2.8834252816523076e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 2933 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.601036269430052, + "grad_norm": 0.3657266534319599, + "kl": 0.069580078125, + "learning_rate": 2.401554404145077e-07, + "loss": 0.0006, + "reward": 2.499991774559021, + "reward_std": 2.7079482265435217e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991774559021, + "step": 2934 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 7.603626943005182, + "grad_norm": 5.334883455254238, + "kl": 0.1826171875, + "learning_rate": 2.3989637305699483e-07, + "loss": 0.0007, + "reward": 1.477147400379181, + "reward_std": 0.0001751736162987072, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9771474301815033, + "step": 2935 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.606217616580311, + "grad_norm": 0.11719036022112504, + "kl": 0.0330810546875, + "learning_rate": 2.3963730569948183e-07, + "loss": 0.0006, + "reward": 2.4999985694885254, + "reward_std": 1.4707190700846695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 2936 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.608808290155441, + "grad_norm": 20.06380237042844, + "kl": 0.126708984375, + "learning_rate": 2.393782383419689e-07, + "loss": 0.001, + "reward": 1.956397533416748, + "reward_std": 0.00039138655671422384, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4563975036144257, + "step": 2937 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.61139896373057, + "grad_norm": 0.7280260370050832, + "kl": 0.07806396484375, + "learning_rate": 2.3911917098445594e-07, + "loss": 0.0006, + "reward": 2.499986171722412, + "reward_std": 8.108246220217552e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999859929084778, + "step": 2938 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.6139896373057, + "grad_norm": 0.13651773338593337, + "kl": 0.10498046875, + "learning_rate": 2.38860103626943e-07, + "loss": 0.0014, + "reward": 2.49999737739563, + "reward_std": 2.0275131191738183e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 2939 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 7.616580310880829, + "grad_norm": 4.7458727693058345, + "kl": 0.189697265625, + "learning_rate": 2.3860103626943004e-07, + "loss": 0.0012, + "reward": 1.9937777519226074, + "reward_std": 5.8767218433786184e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4937776029109955, + "step": 2940 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.619170984455959, + "grad_norm": 2.520524567806283, + "kl": 0.14532470703125, + "learning_rate": 2.3834196891191707e-07, + "loss": 0.0006, + "reward": 2.499991536140442, + "reward_std": 8.95783091436897e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999915957450867, + "step": 2941 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.6217616580310885, + "grad_norm": 0.38115085090060286, + "kl": 0.07666015625, + "learning_rate": 2.3808290155440415e-07, + "loss": -0.0001, + "reward": 2.4999923706054688, + "reward_std": 3.84506120099104e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 2942 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.624352331606218, + "grad_norm": 0.1694456372319709, + "kl": 0.114990234375, + "learning_rate": 2.3782383419689117e-07, + "loss": 0.0011, + "reward": 2.4999982118606567, + "reward_std": 1.1256768459588784e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2943 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.626943005181348, + "grad_norm": 0.11068370268093096, + "kl": 0.20703125, + "learning_rate": 2.3756476683937823e-07, + "loss": 0.0014, + "reward": 2.499998092651367, + "reward_std": 2.1066209114906087e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2944 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.629533678756477, + "grad_norm": 0.49938166988170235, + "kl": 0.082763671875, + "learning_rate": 2.3730569948186528e-07, + "loss": 0.0006, + "reward": 2.4999959468841553, + "reward_std": 3.007643186947462e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 2945 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.632124352331607, + "grad_norm": 2.452864325596358, + "kl": 0.0775146484375, + "learning_rate": 2.3704663212435233e-07, + "loss": 0.0006, + "reward": 2.499997854232788, + "reward_std": 1.7866414623313176e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 2946 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.634715025906736, + "grad_norm": 8.601088281170098, + "kl": 0.25927734375, + "learning_rate": 2.3678756476683936e-07, + "loss": 0.0019, + "reward": 1.995637059211731, + "reward_std": 0.00017986779039347311, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4956370294094086, + "step": 2947 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.637305699481866, + "grad_norm": 0.243646272165451, + "kl": 0.095703125, + "learning_rate": 2.3652849740932644e-07, + "loss": 0.0004, + "reward": 2.4999974966049194, + "reward_std": 1.397219818954909e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 2948 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.639896373056995, + "grad_norm": 4.145585553461657, + "kl": 0.195068359375, + "learning_rate": 2.3626943005181346e-07, + "loss": 0.0007, + "reward": 0.9837982654571533, + "reward_std": 0.0001389335229760036, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.4837982654571533, + "step": 2949 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.642487046632124, + "grad_norm": 43.298590077828365, + "kl": 0.1026611328125, + "learning_rate": 2.360103626943005e-07, + "loss": 0.0008, + "reward": 2.499996304512024, + "reward_std": 2.4233220301539404e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 2950 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.6450777202072535, + "grad_norm": 195.98909328407052, + "kl": 0.077178955078125, + "learning_rate": 2.3575129533678757e-07, + "loss": -0.0006, + "reward": 1.9590587615966797, + "reward_std": 0.00023839250070523121, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4590588808059692, + "step": 2951 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.647668393782383, + "grad_norm": 3.611542925439923, + "kl": 0.105712890625, + "learning_rate": 2.354922279792746e-07, + "loss": -0.0001, + "reward": 2.499990940093994, + "reward_std": 4.250309189046675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999909400939941, + "step": 2952 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.650259067357513, + "grad_norm": 1.1790297547408761, + "kl": 0.080322265625, + "learning_rate": 2.3523316062176165e-07, + "loss": 0.0005, + "reward": 2.4999793767929077, + "reward_std": 6.002086593070999e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999791979789734, + "step": 2953 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.652849740932642, + "grad_norm": 0.6367357057118201, + "kl": 0.19482421875, + "learning_rate": 2.349740932642487e-07, + "loss": 0.0004, + "reward": 2.4999942779541016, + "reward_std": 4.539891961030662e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 2954 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.655440414507772, + "grad_norm": 0.1469044631302598, + "kl": 0.07470703125, + "learning_rate": 2.3471502590673575e-07, + "loss": 0.0016, + "reward": 2.499998450279236, + "reward_std": 1.8066745610667567e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 2955 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.658031088082901, + "grad_norm": 17.18381012780644, + "kl": 0.07177734375, + "learning_rate": 2.3445595854922278e-07, + "loss": -0.0003, + "reward": 2.3745037317276, + "reward_std": 0.23148750290306452, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8745037913322449, + "step": 2956 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.660621761658031, + "grad_norm": 0.13598828339221825, + "kl": 0.15673828125, + "learning_rate": 2.3419689119170983e-07, + "loss": 0.0005, + "reward": 2.4999979734420776, + "reward_std": 1.6425097442152037e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 2957 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.66321243523316, + "grad_norm": 0.5607035626085307, + "kl": 0.094482421875, + "learning_rate": 2.3393782383419688e-07, + "loss": -0.0001, + "reward": 2.4999938011169434, + "reward_std": 3.2749887282079726e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 2958 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.66580310880829, + "grad_norm": 0.22616883688488423, + "kl": 0.0823974609375, + "learning_rate": 2.336787564766839e-07, + "loss": 0.0006, + "reward": 2.4999940395355225, + "reward_std": 2.4444340738227766e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 2959 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.668393782383419, + "grad_norm": 0.04940128090796161, + "kl": 0.07275390625, + "learning_rate": 2.33419689119171e-07, + "loss": 0.0009, + "reward": 2.499998688697815, + "reward_std": 7.59293158125729e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 2960 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.670984455958549, + "grad_norm": 0.2590110179005871, + "kl": 0.098388671875, + "learning_rate": 2.3316062176165802e-07, + "loss": 0.001, + "reward": 2.4999932050704956, + "reward_std": 5.325743927642179e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 2961 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.6735751295336785, + "grad_norm": 6.061628499129426, + "kl": 0.30859375, + "learning_rate": 2.3290155440414507e-07, + "loss": 0.0012, + "reward": 1.4023630023002625, + "reward_std": 0.00030712188163306564, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9023629426956177, + "step": 2962 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.676165803108808, + "grad_norm": 2.047964944312291, + "kl": 0.1474609375, + "learning_rate": 2.3264248704663212e-07, + "loss": 0.0006, + "reward": 2.4999921321868896, + "reward_std": 6.337079412332969e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921917915344, + "step": 2963 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.678756476683938, + "grad_norm": 0.3081661714282979, + "kl": 0.0531005859375, + "learning_rate": 2.3238341968911915e-07, + "loss": 0.0008, + "reward": 2.499997138977051, + "reward_std": 3.591041831896291e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 2964 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.681347150259067, + "grad_norm": 0.22279129819831683, + "kl": 0.0390625, + "learning_rate": 2.321243523316062e-07, + "loss": 0.0001, + "reward": 2.4999966621398926, + "reward_std": 2.9706216082558967e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 2965 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.683937823834197, + "grad_norm": 0.057244872882196304, + "kl": 0.0802001953125, + "learning_rate": 2.3186528497409325e-07, + "loss": 0.0008, + "reward": 1.9984210133552551, + "reward_std": 9.29600219023996e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984210431575775, + "step": 2966 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.686528497409326, + "grad_norm": 0.6704997509583145, + "kl": 0.1007080078125, + "learning_rate": 2.316062176165803e-07, + "loss": 0.0004, + "reward": 2.499019742012024, + "reward_std": 3.267810623697187e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9990198612213135, + "step": 2967 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 7.689119170984456, + "grad_norm": 1.2754302774991817, + "kl": 0.0908203125, + "learning_rate": 2.3134715025906733e-07, + "loss": -0.0001, + "reward": 2.4999780654907227, + "reward_std": 9.924450296239229e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999978244304657, + "step": 2968 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.691709844559585, + "grad_norm": 1.4850388037850724, + "kl": 0.12109375, + "learning_rate": 2.310880829015544e-07, + "loss": 0.0001, + "reward": 2.4999935626983643, + "reward_std": 5.476312480823253e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 2969 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.694300518134715, + "grad_norm": 0.16404457944293435, + "kl": 0.0599365234375, + "learning_rate": 2.3082901554404144e-07, + "loss": 0.0015, + "reward": 2.4999983310699463, + "reward_std": 2.295492720350012e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 2970 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.696891191709844, + "grad_norm": 0.0965994030064735, + "kl": 0.0533447265625, + "learning_rate": 2.3056994818652846e-07, + "loss": 0.0001, + "reward": 2.499997615814209, + "reward_std": 1.1713038361449435e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2971 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.699481865284974, + "grad_norm": 0.3055841607363679, + "kl": 0.0931396484375, + "learning_rate": 2.3031088082901554e-07, + "loss": 0.001, + "reward": 2.499997615814209, + "reward_std": 1.9341566712682834e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 2972 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.7020725388601035, + "grad_norm": 0.6607677650557111, + "kl": 0.0716552734375, + "learning_rate": 2.3005181347150257e-07, + "loss": 0.0003, + "reward": 2.4999901056289673, + "reward_std": 6.174843520057038e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902248382568, + "step": 2973 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.704663212435233, + "grad_norm": 0.14703651007227939, + "kl": 0.033447265625, + "learning_rate": 2.2979274611398962e-07, + "loss": -0.0007, + "reward": 2.4999977350234985, + "reward_std": 1.6590072391409194e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 2974 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.7072538860103625, + "grad_norm": 0.6836893906731341, + "kl": 0.098876953125, + "learning_rate": 2.2953367875647667e-07, + "loss": 0.0008, + "reward": 2.4999966621398926, + "reward_std": 6.666901185781171e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 2975 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.709844559585492, + "grad_norm": 4.513052508156367, + "kl": 0.1060791015625, + "learning_rate": 2.2927461139896373e-07, + "loss": 0.0003, + "reward": 1.9882862567901611, + "reward_std": 0.00016485343704175648, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4882861971855164, + "step": 2976 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 7.712435233160622, + "grad_norm": 0.060720704936753946, + "kl": 0.09765625, + "learning_rate": 2.2901554404145075e-07, + "loss": 0.0011, + "reward": 2.4999994039535522, + "reward_std": 6.702452424178773e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999994039535522, + "step": 2977 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.715025906735751, + "grad_norm": 0.6004266718491046, + "kl": 0.24609375, + "learning_rate": 2.2875647668393783e-07, + "loss": 0.0008, + "reward": 2.4999974966049194, + "reward_std": 3.527352106402759e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 2978 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.717616580310881, + "grad_norm": 0.01812897126848882, + "kl": 0.0642242431640625, + "learning_rate": 2.2849740932642486e-07, + "loss": 0.0007, + "reward": 2.499999523162842, + "reward_std": 5.310442077188782e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999997019767761, + "step": 2979 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.72020725388601, + "grad_norm": 0.048094224914001525, + "kl": 0.05621337890625, + "learning_rate": 2.2823834196891188e-07, + "loss": -0.0011, + "reward": 2.4999990463256836, + "reward_std": 6.808981254380342e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999994039535522, + "step": 2980 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.72279792746114, + "grad_norm": 0.232807928441412, + "kl": 0.05938720703125, + "learning_rate": 2.2797927461139896e-07, + "loss": -0.0, + "reward": 2.499997138977051, + "reward_std": 2.2767966925130168e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 2981 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.725388601036269, + "grad_norm": 0.11633539879680475, + "kl": 0.1195068359375, + "learning_rate": 2.27720207253886e-07, + "loss": -0.0001, + "reward": 1.953113853931427, + "reward_std": 8.382568097431431e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4531138837337494, + "step": 2982 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.727979274611399, + "grad_norm": 0.03658213069447342, + "kl": 0.05908203125, + "learning_rate": 2.2746113989637304e-07, + "loss": 0.0014, + "reward": 2.4999983310699463, + "reward_std": 1.4331392890198913e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2983 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.730569948186528, + "grad_norm": 2.5249971790989147, + "kl": 0.092529296875, + "learning_rate": 2.272020725388601e-07, + "loss": 0.0005, + "reward": 2.499990940093994, + "reward_std": 6.312982350209495e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999907612800598, + "step": 2984 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.733160621761658, + "grad_norm": 2.2048208802356037, + "kl": 0.1181640625, + "learning_rate": 2.2694300518134715e-07, + "loss": 0.0014, + "reward": 1.8863747119903564, + "reward_std": 0.00018447764458073834, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.386374592781067, + "step": 2985 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.7357512953367875, + "grad_norm": 0.7929573325592978, + "kl": 0.10009765625, + "learning_rate": 2.2668393782383417e-07, + "loss": 0.0004, + "reward": 1.9995321035385132, + "reward_std": 1.4098423093855672e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995321333408356, + "step": 2986 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 7.738341968911917, + "grad_norm": 0.3306776670372298, + "kl": 0.063232421875, + "learning_rate": 2.2642487046632123e-07, + "loss": -0.0003, + "reward": 2.499997854232788, + "reward_std": 2.5592077008695924e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 2987 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.740932642487047, + "grad_norm": 5.439705292379303, + "kl": 0.166748046875, + "learning_rate": 2.2616580310880828e-07, + "loss": 0.001, + "reward": 1.919031023979187, + "reward_std": 0.00045640194349516605, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4190311133861542, + "step": 2988 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.743523316062176, + "grad_norm": 0.17917557863645966, + "kl": 0.0482177734375, + "learning_rate": 2.2590673575129533e-07, + "loss": 0.0005, + "reward": 2.4999982118606567, + "reward_std": 1.1641488981695147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 2989 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.746113989637306, + "grad_norm": 14.472835750069892, + "kl": 0.093017578125, + "learning_rate": 2.2564766839378238e-07, + "loss": 0.0006, + "reward": 1.9872830510139465, + "reward_std": 0.0003825480937393877, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4872829914093018, + "step": 2990 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.748704663212435, + "grad_norm": 4.4554873041098535, + "kl": 0.20068359375, + "learning_rate": 2.253886010362694e-07, + "loss": 0.0013, + "reward": 1.7720575332641602, + "reward_std": 0.0003944761579077749, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2720574736595154, + "step": 2991 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 7.751295336787565, + "grad_norm": 3.304391616781944, + "kl": 0.197906494140625, + "learning_rate": 2.251295336787565e-07, + "loss": 0.0003, + "reward": 1.9799120426177979, + "reward_std": 0.00010527110788416394, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4799121618270874, + "step": 2992 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.753886010362694, + "grad_norm": 0.10017255035828425, + "kl": 0.1455078125, + "learning_rate": 2.2487046632124352e-07, + "loss": -0.0009, + "reward": 2.499997854232788, + "reward_std": 1.9528069401530956e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 2993 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.756476683937824, + "grad_norm": 1.775040479956177, + "kl": 0.1041259765625, + "learning_rate": 2.2461139896373054e-07, + "loss": -0.0008, + "reward": 2.4999914169311523, + "reward_std": 7.702289167355048e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914765357971, + "step": 2994 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 7.759067357512953, + "grad_norm": 8.498881243725833, + "kl": 0.144775390625, + "learning_rate": 2.2435233160621762e-07, + "loss": 0.0012, + "reward": 1.9859219789505005, + "reward_std": 0.0010527848915558025, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4859220683574677, + "step": 2995 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.761658031088083, + "grad_norm": 9.330293986048604, + "kl": 0.11083984375, + "learning_rate": 2.2409326424870465e-07, + "loss": 0.0007, + "reward": 2.437412738800049, + "reward_std": 0.17701938969071307, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374128580093384, + "step": 2996 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.7642487046632125, + "grad_norm": 0.06290617920521173, + "kl": 0.054779052734375, + "learning_rate": 2.238341968911917e-07, + "loss": 0.0002, + "reward": 2.499997615814209, + "reward_std": 1.6040725654420385e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 2997 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.766839378238342, + "grad_norm": 0.4512121547133678, + "kl": 0.12353515625, + "learning_rate": 2.2357512953367875e-07, + "loss": 0.0008, + "reward": 1.9997767210006714, + "reward_std": 9.132969353231601e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997766017913818, + "step": 2998 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.769430051813472, + "grad_norm": 17.139758720842824, + "kl": 0.1806640625, + "learning_rate": 2.233160621761658e-07, + "loss": 0.0012, + "reward": 1.9556805491447449, + "reward_std": 0.0003215945748706872, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4556803703308105, + "step": 2999 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.772020725388601, + "grad_norm": 0.5188629484731856, + "kl": 0.0849609375, + "learning_rate": 2.2305699481865283e-07, + "loss": 0.0012, + "reward": 2.499979019165039, + "reward_std": 7.1322385792882415e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999788999557495, + "step": 3000 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.774611398963731, + "grad_norm": 0.06310146586443106, + "kl": 0.12939453125, + "learning_rate": 2.2279792746113988e-07, + "loss": 0.0002, + "reward": 2.4999988079071045, + "reward_std": 1.619227816718194e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3001 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.77720207253886, + "grad_norm": 7.039274589696187, + "kl": 0.09375, + "learning_rate": 2.2253886010362694e-07, + "loss": 0.0012, + "reward": 2.4997451305389404, + "reward_std": 5.299113354340079e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999744951725006, + "step": 3002 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.77979274611399, + "grad_norm": 3.715489128935168, + "kl": 0.085693359375, + "learning_rate": 2.2227979274611396e-07, + "loss": 0.0012, + "reward": 1.822838544845581, + "reward_std": 0.00023595143682086928, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3228385746479034, + "step": 3003 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.782383419689119, + "grad_norm": 35.80697681801513, + "kl": 0.11663818359375, + "learning_rate": 2.2202072538860104e-07, + "loss": -0.0004, + "reward": 2.432840347290039, + "reward_std": 0.18995128316512933, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9328404664993286, + "step": 3004 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.784974093264249, + "grad_norm": 0.10969858925322229, + "kl": 0.061370849609375, + "learning_rate": 2.2176165803108807e-07, + "loss": -0.0003, + "reward": 2.4999982118606567, + "reward_std": 1.3260830939998414e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3005 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.787564766839378, + "grad_norm": 0.16203893010107592, + "kl": 0.1087646484375, + "learning_rate": 2.2150259067357512e-07, + "loss": 0.0009, + "reward": 2.4999974966049194, + "reward_std": 3.211570628991467e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3006 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.790155440414508, + "grad_norm": 5.183751642412059, + "kl": 0.205078125, + "learning_rate": 2.2124352331606217e-07, + "loss": 0.0003, + "reward": 2.4998831748962402, + "reward_std": 2.3188613681668357e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998834133148193, + "step": 3007 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.7927461139896375, + "grad_norm": 0.6309927054160139, + "kl": 0.093994140625, + "learning_rate": 2.2098445595854923e-07, + "loss": 0.001, + "reward": 2.499986410140991, + "reward_std": 3.516961101013294e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999862909317017, + "step": 3008 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.795336787564767, + "grad_norm": 0.29405330002990454, + "kl": 0.159912109375, + "learning_rate": 2.2072538860103625e-07, + "loss": 0.0009, + "reward": 2.4999964237213135, + "reward_std": 2.5809340513660572e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3009 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.7979274611398965, + "grad_norm": 0.5081163984497482, + "kl": 0.072998046875, + "learning_rate": 2.204663212435233e-07, + "loss": 0.0004, + "reward": 2.4999911785125732, + "reward_std": 5.08629477735667e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 3010 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.800518134715026, + "grad_norm": 5.611138051750514, + "kl": 0.093505859375, + "learning_rate": 2.2020725388601036e-07, + "loss": 0.0004, + "reward": 1.9998716115951538, + "reward_std": 0.0001064846621829929, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998716115951538, + "step": 3011 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.803108808290156, + "grad_norm": 7.267496427067268, + "kl": 0.050048828125, + "learning_rate": 2.1994818652849738e-07, + "loss": -0.0, + "reward": 1.9986516237258911, + "reward_std": 0.00013987819954763836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986517131328583, + "step": 3012 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.805699481865285, + "grad_norm": 0.15318169809999016, + "kl": 0.11328125, + "learning_rate": 2.1968911917098446e-07, + "loss": -0.0, + "reward": 2.4999839067459106, + "reward_std": 3.0926872938152883e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999840259552002, + "step": 3013 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.808290155440415, + "grad_norm": 0.16374576222824022, + "kl": 0.113037109375, + "learning_rate": 2.194300518134715e-07, + "loss": 0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.3417750039934617e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3014 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.810880829015544, + "grad_norm": 1.3150495833201583, + "kl": 0.0657958984375, + "learning_rate": 2.1917098445595854e-07, + "loss": 0.0005, + "reward": 2.4999908208847046, + "reward_std": 6.791120540583506e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908208847046, + "step": 3015 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.813471502590674, + "grad_norm": 2.479092858002278, + "kl": 0.128173828125, + "learning_rate": 2.189119170984456e-07, + "loss": 0.0009, + "reward": 1.9447592496871948, + "reward_std": 0.00013159715604160738, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4447591304779053, + "step": 3016 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.816062176165803, + "grad_norm": 0.07084762494339814, + "kl": 0.04296875, + "learning_rate": 2.1865284974093262e-07, + "loss": 0.0011, + "reward": 2.4999990463256836, + "reward_std": 9.151464155365829e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 3017 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.818652849740933, + "grad_norm": 3.846169558405578, + "kl": 0.06298828125, + "learning_rate": 2.1839378238341967e-07, + "loss": -0.0002, + "reward": 1.979655385017395, + "reward_std": 0.000159777092449076, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4796555042266846, + "step": 3018 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.821243523316062, + "grad_norm": 1.960973022993101, + "kl": 0.0634765625, + "learning_rate": 2.1813471502590673e-07, + "loss": -0.0, + "reward": 2.499996304512024, + "reward_std": 4.883814824552246e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3019 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.823834196891192, + "grad_norm": 0.9377522933956081, + "kl": 0.34228515625, + "learning_rate": 2.1787564766839378e-07, + "loss": 0.0011, + "reward": 2.4999818801879883, + "reward_std": 8.19701540422102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818205833435, + "step": 3020 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.8264248704663215, + "grad_norm": 1.1907806486277712, + "kl": 0.0693359375, + "learning_rate": 2.176165803108808e-07, + "loss": -0.0006, + "reward": 2.499992847442627, + "reward_std": 5.587518216998433e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 3021 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.829015544041451, + "grad_norm": 1.7518464325649516, + "kl": 0.128173828125, + "learning_rate": 2.1735751295336789e-07, + "loss": -0.0002, + "reward": 2.49997341632843, + "reward_std": 6.6879167661682e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973475933075, + "step": 3022 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.831606217616581, + "grad_norm": 0.41234468921795614, + "kl": 0.24462890625, + "learning_rate": 2.170984455958549e-07, + "loss": 0.002, + "reward": 2.499993324279785, + "reward_std": 2.963234692288097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932646751404, + "step": 3023 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.83419689119171, + "grad_norm": 1.5262717657511273, + "kl": 0.1123046875, + "learning_rate": 2.1683937823834194e-07, + "loss": 0.0002, + "reward": 2.499981999397278, + "reward_std": 8.631451549945268e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999820590019226, + "step": 3024 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.83678756476684, + "grad_norm": 0.06133610351617526, + "kl": 0.06689453125, + "learning_rate": 2.1658031088082902e-07, + "loss": -0.0011, + "reward": 2.4999983310699463, + "reward_std": 8.810840768092021e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3025 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.839378238341969, + "grad_norm": 0.41373936099062025, + "kl": 0.069091796875, + "learning_rate": 2.1632124352331604e-07, + "loss": 0.0015, + "reward": 2.499998688697815, + "reward_std": 1.190975950748907e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3026 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.841968911917099, + "grad_norm": 4.5704246137062, + "kl": 0.04931640625, + "learning_rate": 2.160621761658031e-07, + "loss": -0.0002, + "reward": 1.9998024106025696, + "reward_std": 3.9728092929181e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998025596141815, + "step": 3027 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.844559585492228, + "grad_norm": 1.506214091610807, + "kl": 0.144775390625, + "learning_rate": 2.1580310880829015e-07, + "loss": -0.0, + "reward": 1.9998642206192017, + "reward_std": 1.6321398561558453e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998642802238464, + "step": 3028 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.847150259067358, + "grad_norm": 2.840281677666068, + "kl": 0.174072265625, + "learning_rate": 2.155440414507772e-07, + "loss": 0.0008, + "reward": 1.821478247642517, + "reward_std": 0.00034726609214885684, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3214781284332275, + "step": 3029 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.849740932642487, + "grad_norm": 0.7188994295197779, + "kl": 0.064453125, + "learning_rate": 2.1528497409326423e-07, + "loss": -0.0002, + "reward": 2.4999929666519165, + "reward_std": 4.578446805680869e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 3030 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.852331606217617, + "grad_norm": 1.370770481823312, + "kl": 0.108154296875, + "learning_rate": 2.1502590673575128e-07, + "loss": 0.0007, + "reward": 2.49998939037323, + "reward_std": 6.488994245046342e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999893307685852, + "step": 3031 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 7.8549222797927465, + "grad_norm": 2.117732787684715, + "kl": 0.052001953125, + "learning_rate": 2.1476683937823833e-07, + "loss": 0.0011, + "reward": 2.4998621940612793, + "reward_std": 2.08686252562984e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998621940612793, + "step": 3032 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.857512953367876, + "grad_norm": 0.3700367947794776, + "kl": 0.07379150390625, + "learning_rate": 2.1450777202072536e-07, + "loss": 0.0009, + "reward": 2.499994158744812, + "reward_std": 2.8616719589535933e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 3033 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.860103626943005, + "grad_norm": 1.6195198913674689, + "kl": 0.1171875, + "learning_rate": 2.1424870466321244e-07, + "loss": -0.0003, + "reward": 2.499969482421875, + "reward_std": 9.028870351812657e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999695420265198, + "step": 3034 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.862694300518134, + "grad_norm": 0.4882771074124476, + "kl": 0.0601806640625, + "learning_rate": 2.1398963730569946e-07, + "loss": -0.0005, + "reward": 2.4999966621398926, + "reward_std": 3.512521175252914e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3035 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.865284974093264, + "grad_norm": 0.2621269532102978, + "kl": 0.236572265625, + "learning_rate": 2.1373056994818652e-07, + "loss": -0.0002, + "reward": 2.4999958276748657, + "reward_std": 4.095677013538079e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3036 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 7.867875647668393, + "grad_norm": 0.6894657108170018, + "kl": 0.2275390625, + "learning_rate": 2.1347150259067357e-07, + "loss": -0.0001, + "reward": 2.4999892711639404, + "reward_std": 4.097757937415736e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999893307685852, + "step": 3037 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.870466321243523, + "grad_norm": 0.32286157224931034, + "kl": 0.08935546875, + "learning_rate": 2.1321243523316062e-07, + "loss": 0.0024, + "reward": 2.4999969005584717, + "reward_std": 2.5663498490757775e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 3038 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.873056994818652, + "grad_norm": 1.192669783119955, + "kl": 0.1072998046875, + "learning_rate": 2.1295336787564765e-07, + "loss": 0.0006, + "reward": 2.4999961853027344, + "reward_std": 4.818610932488809e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3039 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.875647668393782, + "grad_norm": 0.14040132101163627, + "kl": 0.020477294921875, + "learning_rate": 2.126943005181347e-07, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 1.620960517811909e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3040 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.8782383419689115, + "grad_norm": 0.28557804175723084, + "kl": 0.13818359375, + "learning_rate": 2.1243523316062175e-07, + "loss": 0.0011, + "reward": 2.499998092651367, + "reward_std": 1.621204319235403e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3041 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.880829015544041, + "grad_norm": 32.47139845341851, + "kl": 3.42333984375, + "learning_rate": 2.1217616580310878e-07, + "loss": 0.014, + "reward": 2.433580756187439, + "reward_std": 0.18785653862721574, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.933580756187439, + "step": 3042 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.883419689119171, + "grad_norm": 0.29161598149629786, + "kl": 0.133544921875, + "learning_rate": 2.1191709844559586e-07, + "loss": -0.0, + "reward": 2.4999969005584717, + "reward_std": 1.7866369717012276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3043 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.8860103626943, + "grad_norm": 0.9610300212534616, + "kl": 0.1171875, + "learning_rate": 2.1165803108808289e-07, + "loss": 0.0006, + "reward": 2.4999794960021973, + "reward_std": 5.429509201348992e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999793767929077, + "step": 3044 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.88860103626943, + "grad_norm": 4.219844876982593, + "kl": 0.156982421875, + "learning_rate": 2.1139896373056996e-07, + "loss": 0.0008, + "reward": 1.9997655749320984, + "reward_std": 2.8689085411315318e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4997656345367432, + "step": 3045 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.891191709844559, + "grad_norm": 0.6850453688738123, + "kl": 0.171142578125, + "learning_rate": 2.11139896373057e-07, + "loss": -0.0007, + "reward": 2.4999877214431763, + "reward_std": 5.104976366965275e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999877214431763, + "step": 3046 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.893782383419689, + "grad_norm": 0.3771100775471277, + "kl": 0.08203125, + "learning_rate": 2.1088082901554402e-07, + "loss": -0.0007, + "reward": 2.499996304512024, + "reward_std": 4.827522730010969e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3047 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.896373056994818, + "grad_norm": 0.655211727901186, + "kl": 0.060760498046875, + "learning_rate": 2.106217616580311e-07, + "loss": 0.001, + "reward": 2.4999940395355225, + "reward_std": 5.3634421135484445e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993920326233, + "step": 3048 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.898963730569948, + "grad_norm": 30.260415676334414, + "kl": 0.074951171875, + "learning_rate": 2.1036269430051812e-07, + "loss": 0.0004, + "reward": 1.9996639490127563, + "reward_std": 2.3242853558258503e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996639490127563, + "step": 3049 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.901554404145077, + "grad_norm": 2.7657426971767904, + "kl": 0.050537109375, + "learning_rate": 2.1010362694300517e-07, + "loss": 0.0004, + "reward": 1.9999040365219116, + "reward_std": 1.4257862460453907e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499904066324234, + "step": 3050 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.904145077720207, + "grad_norm": 0.1574594416802252, + "kl": 0.04388427734375, + "learning_rate": 2.0984455958549223e-07, + "loss": 0.0005, + "reward": 2.499995231628418, + "reward_std": 1.847226315021544e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 3051 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.9067357512953365, + "grad_norm": 0.12804410972138083, + "kl": 0.1669921875, + "learning_rate": 2.0958549222797928e-07, + "loss": 0.001, + "reward": 2.4999955892562866, + "reward_std": 1.8536298966864706e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 3052 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.909326424870466, + "grad_norm": 3.0548242980664795, + "kl": 0.15380859375, + "learning_rate": 2.093264248704663e-07, + "loss": 0.0001, + "reward": 1.99904066324234, + "reward_std": 5.545262655459737e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49904066324234, + "step": 3053 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.9119170984455955, + "grad_norm": 6.256809713309946, + "kl": 0.131103515625, + "learning_rate": 2.0906735751295336e-07, + "loss": 0.0004, + "reward": 1.9948559999465942, + "reward_std": 0.00011882455555678462, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4948559701442719, + "step": 3054 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.914507772020725, + "grad_norm": 0.10216406313540978, + "kl": 0.083251953125, + "learning_rate": 2.088082901554404e-07, + "loss": 0.0006, + "reward": 2.499998092651367, + "reward_std": 2.050543855602882e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3055 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.917098445595855, + "grad_norm": 0.2851786198113071, + "kl": 0.0911865234375, + "learning_rate": 2.0854922279792744e-07, + "loss": -0.0001, + "reward": 2.4999958276748657, + "reward_std": 2.097099411457748e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 3056 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.919689119170984, + "grad_norm": 3.5818919836221954, + "kl": 0.18359375, + "learning_rate": 2.0829015544041452e-07, + "loss": 0.0003, + "reward": 2.499968409538269, + "reward_std": 5.582526000580401e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999683499336243, + "step": 3057 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.922279792746114, + "grad_norm": 0.17737390143464368, + "kl": 0.113037109375, + "learning_rate": 2.0803108808290154e-07, + "loss": -0.0003, + "reward": 2.4999964237213135, + "reward_std": 1.8129984482584405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3058 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.924870466321243, + "grad_norm": 2.0247490285339027, + "kl": 0.23681640625, + "learning_rate": 2.077720207253886e-07, + "loss": 0.0014, + "reward": 1.9987263083457947, + "reward_std": 6.473113160154753e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4987261295318604, + "step": 3059 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.927461139896373, + "grad_norm": 1.1506908007150278, + "kl": 0.087890625, + "learning_rate": 2.0751295336787565e-07, + "loss": 0.0016, + "reward": 2.4999799728393555, + "reward_std": 8.695652923051966e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999797344207764, + "step": 3060 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.930051813471502, + "grad_norm": 5.83986424841263, + "kl": 0.143798828125, + "learning_rate": 2.0725388601036267e-07, + "loss": 0.0007, + "reward": 1.8943517804145813, + "reward_std": 0.0006579617829061135, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3943516314029694, + "step": 3061 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.932642487046632, + "grad_norm": 0.10170327880293734, + "kl": 0.12255859375, + "learning_rate": 2.0699481865284973e-07, + "loss": -0.0004, + "reward": 2.4999977350234985, + "reward_std": 2.202723635491566e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3062 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.935233160621761, + "grad_norm": 1.5760768946373525, + "kl": 0.0771484375, + "learning_rate": 2.0673575129533678e-07, + "loss": -0.0006, + "reward": 1.9999316930770874, + "reward_std": 1.3120281323608651e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999319314956665, + "step": 3063 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 7.937823834196891, + "grad_norm": 0.03566346779275998, + "kl": 0.0628662109375, + "learning_rate": 2.0647668393782383e-07, + "loss": 0.0004, + "reward": 2.4999964237213135, + "reward_std": 1.0423624132727127e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3064 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.9404145077720205, + "grad_norm": 0.16240862840114378, + "kl": 0.056396484375, + "learning_rate": 2.0621761658031086e-07, + "loss": 0.0011, + "reward": 2.4999959468841553, + "reward_std": 2.3705815124230867e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 3065 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.94300518134715, + "grad_norm": 0.277360373152522, + "kl": 0.03338623046875, + "learning_rate": 2.0595854922279794e-07, + "loss": 0.0003, + "reward": 2.4999942779541016, + "reward_std": 2.7561879960558144e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 3066 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.94559585492228, + "grad_norm": 1.0893218613989033, + "kl": 0.0775146484375, + "learning_rate": 2.0569948186528496e-07, + "loss": -0.0006, + "reward": 2.499997138977051, + "reward_std": 3.194425858055183e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3067 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.948186528497409, + "grad_norm": 0.16023194017835296, + "kl": 0.0628662109375, + "learning_rate": 2.0544041450777202e-07, + "loss": -0.0, + "reward": 2.4999982118606567, + "reward_std": 1.4650185846676322e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3068 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.950777202072539, + "grad_norm": 3.879061204044804, + "kl": 0.149169921875, + "learning_rate": 2.0518134715025907e-07, + "loss": 0.0014, + "reward": 1.9994211196899414, + "reward_std": 4.467247589445833e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4994210004806519, + "step": 3069 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.953367875647668, + "grad_norm": 18.56001308259635, + "kl": 0.142578125, + "learning_rate": 2.049222797927461e-07, + "loss": 0.0009, + "reward": 2.4374929666519165, + "reward_std": 0.17678898698068224, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374930262565613, + "step": 3070 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.955958549222798, + "grad_norm": 0.05355617242731596, + "kl": 0.0408935546875, + "learning_rate": 2.0466321243523315e-07, + "loss": 0.0, + "reward": 2.4999974966049194, + "reward_std": 1.1455659887360525e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3071 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.958549222797927, + "grad_norm": 0.587690700379261, + "kl": 0.070556640625, + "learning_rate": 2.044041450777202e-07, + "loss": -0.0008, + "reward": 2.499994397163391, + "reward_std": 6.273919552768348e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 3072 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.961139896373057, + "grad_norm": 0.7380677137303501, + "kl": 0.14501953125, + "learning_rate": 2.0414507772020725e-07, + "loss": 0.0001, + "reward": 2.4999955892562866, + "reward_std": 2.104386055634677e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3073 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.963730569948186, + "grad_norm": 0.2588980691836974, + "kl": 0.112548828125, + "learning_rate": 2.0388601036269428e-07, + "loss": 0.0008, + "reward": 2.4999825954437256, + "reward_std": 5.611572589714342e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999982476234436, + "step": 3074 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 7.966321243523316, + "grad_norm": 2.572605515516186, + "kl": 0.158935546875, + "learning_rate": 2.0362694300518136e-07, + "loss": 0.0011, + "reward": 1.9984492659568787, + "reward_std": 4.848407400004362e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984492659568787, + "step": 3075 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 7.9689119170984455, + "grad_norm": 2.665702570226113, + "kl": 0.0643310546875, + "learning_rate": 2.0336787564766839e-07, + "loss": 0.0007, + "reward": 2.49998140335083, + "reward_std": 7.662841227329409e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999815821647644, + "step": 3076 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.971502590673575, + "grad_norm": 18.970656294062735, + "kl": 0.203369140625, + "learning_rate": 2.031088082901554e-07, + "loss": 0.0011, + "reward": 1.9487608671188354, + "reward_std": 0.0007219227768473502, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4487608969211578, + "step": 3077 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 7.974093264248705, + "grad_norm": 0.29214332981303415, + "kl": 0.099365234375, + "learning_rate": 2.028497409326425e-07, + "loss": 0.001, + "reward": 2.4999948740005493, + "reward_std": 2.9205231157902745e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 3078 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.976683937823834, + "grad_norm": 0.06903466222955096, + "kl": 0.07177734375, + "learning_rate": 2.0259067357512952e-07, + "loss": 0.0006, + "reward": 2.4999988079071045, + "reward_std": 9.816939154916327e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 3079 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.979274611398964, + "grad_norm": 8.612252967738119, + "kl": 0.0958251953125, + "learning_rate": 2.0233160621761657e-07, + "loss": 0.0005, + "reward": 1.7707089185714722, + "reward_std": 0.0003922329296983662, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.270709067583084, + "step": 3080 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.981865284974093, + "grad_norm": 0.06965903585808235, + "kl": 0.09033203125, + "learning_rate": 2.0207253886010362e-07, + "loss": 0.0005, + "reward": 2.499998092651367, + "reward_std": 1.221670515860751e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3081 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 7.984455958549223, + "grad_norm": 61.8232118057449, + "kl": 0.123382568359375, + "learning_rate": 2.0181347150259068e-07, + "loss": -0.0004, + "reward": 2.1868947744369507, + "reward_std": 0.2592725643578433, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6868947744369507, + "step": 3082 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.987046632124352, + "grad_norm": 2.1478023117879985, + "kl": 0.189697265625, + "learning_rate": 2.015544041450777e-07, + "loss": 0.0016, + "reward": 2.4999862909317017, + "reward_std": 1.615830603896029e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999861121177673, + "step": 3083 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.989637305699482, + "grad_norm": 0.20232547123761707, + "kl": 0.061279296875, + "learning_rate": 2.0129533678756475e-07, + "loss": -0.0001, + "reward": 2.499997615814209, + "reward_std": 2.117779388299823e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3084 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.992227979274611, + "grad_norm": 4.503614777407574, + "kl": 0.093017578125, + "learning_rate": 2.010362694300518e-07, + "loss": 0.0004, + "reward": 1.9998580813407898, + "reward_std": 4.70124298317387e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998580813407898, + "step": 3085 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 7.994818652849741, + "grad_norm": 0.20226327912536518, + "kl": 0.0780029296875, + "learning_rate": 2.0077720207253883e-07, + "loss": -0.0006, + "reward": 2.4999945163726807, + "reward_std": 2.061266286546015e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999944567680359, + "step": 3086 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 7.9974093264248705, + "grad_norm": 0.17327887352374288, + "kl": 0.093017578125, + "learning_rate": 2.005181347150259e-07, + "loss": -0.0009, + "reward": 2.4999959468841553, + "reward_std": 3.4762794598464097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3087 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.0, + "grad_norm": 95.68303039745064, + "kl": 0.083984375, + "learning_rate": 2.0025906735751294e-07, + "loss": 0.0004, + "reward": 2.3749685287475586, + "reward_std": 0.2673270872874127, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8749684691429138, + "step": 3088 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.00259067357513, + "grad_norm": 0.27654166941512687, + "kl": 0.1065673828125, + "learning_rate": 2e-07, + "loss": 0.0019, + "reward": 2.4999955892562866, + "reward_std": 1.8276711273301771e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3089 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.005181347150259, + "grad_norm": 0.8058061970490968, + "kl": 0.16650390625, + "learning_rate": 1.9974093264248704e-07, + "loss": 0.001, + "reward": 1.9984174370765686, + "reward_std": 3.3430058010708308e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498417317867279, + "step": 3090 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.007772020725389, + "grad_norm": 4.038848217616547, + "kl": 0.0692138671875, + "learning_rate": 1.9948186528497407e-07, + "loss": 0.0013, + "reward": 2.4999942779541016, + "reward_std": 8.745244258534512e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943375587463, + "step": 3091 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.010362694300518, + "grad_norm": 1.3382015064909978, + "kl": 0.04241943359375, + "learning_rate": 1.9922279792746112e-07, + "loss": 0.0002, + "reward": 2.4999924898147583, + "reward_std": 6.398024424925097e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924898147583, + "step": 3092 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.012953367875648, + "grad_norm": 0.08791842291493944, + "kl": 0.087158203125, + "learning_rate": 1.9896373056994818e-07, + "loss": 0.0004, + "reward": 2.499995708465576, + "reward_std": 2.1252000124150072e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 3093 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.015544041450777, + "grad_norm": 4.562210568370996, + "kl": 0.19482421875, + "learning_rate": 1.9870466321243523e-07, + "loss": 0.0007, + "reward": 1.9321624040603638, + "reward_std": 0.17684750015177997, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4321625232696533, + "step": 3094 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.018134715025907, + "grad_norm": 0.040378968450249915, + "kl": 0.080078125, + "learning_rate": 1.9844559585492225e-07, + "loss": 0.0011, + "reward": 2.499996304512024, + "reward_std": 1.162184503300523e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3095 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.020725388601036, + "grad_norm": 0.13822927361758697, + "kl": 0.1259765625, + "learning_rate": 1.9818652849740933e-07, + "loss": 0.0001, + "reward": 2.4999974966049194, + "reward_std": 1.4583371807930234e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3096 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.023316062176166, + "grad_norm": 0.0967403499014125, + "kl": 0.09765625, + "learning_rate": 1.9792746113989636e-07, + "loss": 0.0005, + "reward": 2.4999970197677612, + "reward_std": 1.7078815801596647e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3097 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.025906735751295, + "grad_norm": 0.3835122558516601, + "kl": 0.03228759765625, + "learning_rate": 1.9766839378238339e-07, + "loss": 0.0004, + "reward": 2.4999773502349854, + "reward_std": 5.233016963757109e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999772906303406, + "step": 3098 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 8.028497409326425, + "grad_norm": 0.09802395547562452, + "kl": 0.109130859375, + "learning_rate": 1.9740932642487046e-07, + "loss": 0.0001, + "reward": 2.4999983310699463, + "reward_std": 7.796735701504076e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3099 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.031088082901555, + "grad_norm": 0.6203831105531014, + "kl": 0.081787109375, + "learning_rate": 1.971502590673575e-07, + "loss": 0.0013, + "reward": 2.4999953508377075, + "reward_std": 4.10243467285909e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 3100 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.033678756476684, + "grad_norm": 0.5969753783399765, + "kl": 0.114013671875, + "learning_rate": 1.9689119170984454e-07, + "loss": -0.0005, + "reward": 2.4999924898147583, + "reward_std": 7.148053214223182e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926090240479, + "step": 3101 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.036269430051814, + "grad_norm": 0.1087624186499036, + "kl": 0.052490234375, + "learning_rate": 1.966321243523316e-07, + "loss": 0.0002, + "reward": 2.499998092651367, + "reward_std": 1.8846645275516494e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3102 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.038860103626943, + "grad_norm": 0.15494918170888425, + "kl": 0.102783203125, + "learning_rate": 1.9637305699481865e-07, + "loss": 0.0011, + "reward": 2.499995708465576, + "reward_std": 1.6875885648914846e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 3103 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.041450777202073, + "grad_norm": 7.457069789882459, + "kl": 0.12451171875, + "learning_rate": 1.9611398963730568e-07, + "loss": -0.0007, + "reward": 1.89161217212677, + "reward_std": 0.0005261765377895244, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3916123509407043, + "step": 3104 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.044041450777202, + "grad_norm": 0.19427065900490728, + "kl": 0.0604248046875, + "learning_rate": 1.9585492227979275e-07, + "loss": -0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.6826643332024105e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3105 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.046632124352332, + "grad_norm": 8.275001637240742, + "kl": 0.064697265625, + "learning_rate": 1.9559585492227978e-07, + "loss": 0.0006, + "reward": 2.4999817609786987, + "reward_std": 1.6396952958075417e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999818205833435, + "step": 3106 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.049222797927461, + "grad_norm": 0.5196029343535595, + "kl": 0.100341796875, + "learning_rate": 1.953367875647668e-07, + "loss": -0.0, + "reward": 2.499995708465576, + "reward_std": 4.982454356650123e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3107 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.05181347150259, + "grad_norm": 0.2783073834819255, + "kl": 0.07763671875, + "learning_rate": 1.9507772020725389e-07, + "loss": 0.0003, + "reward": 2.4999959468841553, + "reward_std": 1.8774072714222712e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3108 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.05440414507772, + "grad_norm": 0.25248659524932354, + "kl": 0.094482421875, + "learning_rate": 1.948186528497409e-07, + "loss": 0.0008, + "reward": 2.4999901056289673, + "reward_std": 3.0526043701684102e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990165233612, + "step": 3109 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.05699481865285, + "grad_norm": 2.246502080804671, + "kl": 0.0938720703125, + "learning_rate": 1.94559585492228e-07, + "loss": 0.0001, + "reward": 1.984150767326355, + "reward_std": 0.0001390464283304027, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4841507077217102, + "step": 3110 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.05958549222798, + "grad_norm": 0.18038625345606876, + "kl": 0.07568359375, + "learning_rate": 1.9430051813471502e-07, + "loss": 0.0004, + "reward": 2.499997138977051, + "reward_std": 1.8937988102152303e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 3111 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.062176165803109, + "grad_norm": 0.3286993763777389, + "kl": 0.0477294921875, + "learning_rate": 1.9404145077720207e-07, + "loss": 0.0002, + "reward": 2.4999966621398926, + "reward_std": 3.4049578516714973e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967217445374, + "step": 3112 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.064766839378239, + "grad_norm": 6.257669181126466, + "kl": 0.137939453125, + "learning_rate": 1.9378238341968912e-07, + "loss": 0.0003, + "reward": 1.955582857131958, + "reward_std": 0.00019773694336322478, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.45558300614357, + "step": 3113 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.067357512953368, + "grad_norm": 0.08663451372792426, + "kl": 0.0616455078125, + "learning_rate": 1.9352331606217615e-07, + "loss": 0.0007, + "reward": 2.4999966621398926, + "reward_std": 1.774986714053739e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 3114 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.069948186528498, + "grad_norm": 0.2981492449917144, + "kl": 0.16748046875, + "learning_rate": 1.932642487046632e-07, + "loss": 0.0004, + "reward": 2.4999914169311523, + "reward_std": 4.9929566330320085e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 3115 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 8.072538860103627, + "grad_norm": 146.84593466136192, + "kl": 0.1099853515625, + "learning_rate": 1.9300518134715025e-07, + "loss": 0.0013, + "reward": 1.9553672671318054, + "reward_std": 0.012412821104760496, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4553673267364502, + "step": 3116 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.075129533678757, + "grad_norm": 0.22145177154849138, + "kl": 0.124267578125, + "learning_rate": 1.927461139896373e-07, + "loss": 0.0008, + "reward": 2.4999983310699463, + "reward_std": 1.2121317922719754e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3117 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.077720207253886, + "grad_norm": 0.4041656044683656, + "kl": 0.102783203125, + "learning_rate": 1.9248704663212433e-07, + "loss": -0.0003, + "reward": 2.499995708465576, + "reward_std": 2.398298306616198e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3118 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.080310880829016, + "grad_norm": 3.1182064659979662, + "kl": 0.083740234375, + "learning_rate": 1.922279792746114e-07, + "loss": 0.0003, + "reward": 2.4999865293502808, + "reward_std": 6.549222916873987e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999866485595703, + "step": 3119 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.082901554404145, + "grad_norm": 14.633934730479668, + "kl": 0.11962890625, + "learning_rate": 1.9196891191709844e-07, + "loss": -0.0003, + "reward": 1.9491404294967651, + "reward_std": 0.0006368438778565633, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4491405487060547, + "step": 3120 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.085492227979275, + "grad_norm": 15.298543720282403, + "kl": 0.137939453125, + "learning_rate": 1.9170984455958546e-07, + "loss": -0.0003, + "reward": 2.4374752044677734, + "reward_std": 0.1767812859033029, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937475323677063, + "step": 3121 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.088082901554404, + "grad_norm": 2.9459160388900956, + "kl": 0.4010009765625, + "learning_rate": 1.9145077720207254e-07, + "loss": 0.0016, + "reward": 1.9927970170974731, + "reward_std": 8.757973955653142e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4927969872951508, + "step": 3122 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.875, + "epoch": 8.090673575129534, + "grad_norm": 0.10951777395781458, + "kl": 0.0723876953125, + "learning_rate": 1.9119170984455957e-07, + "loss": 0.0004, + "reward": 2.4999977350234985, + "reward_std": 9.914180054693134e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3123 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.093264248704664, + "grad_norm": 0.5977959768713302, + "kl": 0.094970703125, + "learning_rate": 1.9093264248704662e-07, + "loss": 0.0007, + "reward": 2.499989867210388, + "reward_std": 3.2869019719328207e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999897480010986, + "step": 3124 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.095854922279793, + "grad_norm": 0.33441473571998426, + "kl": 0.08740234375, + "learning_rate": 1.9067357512953368e-07, + "loss": 0.0011, + "reward": 2.499997854232788, + "reward_std": 1.7746012304087344e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.098445595854923, + "grad_norm": 0.2208797906705792, + "kl": 0.0545654296875, + "learning_rate": 1.9041450777202073e-07, + "loss": -0.0012, + "reward": 2.4999959468841553, + "reward_std": 2.518478424917703e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3126 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.101036269430052, + "grad_norm": 0.12088424964711521, + "kl": 0.131591796875, + "learning_rate": 1.9015544041450775e-07, + "loss": 0.0014, + "reward": 2.4999979734420776, + "reward_std": 2.1100069034218905e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3127 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.103626943005182, + "grad_norm": 0.14451249603973737, + "kl": 0.1025390625, + "learning_rate": 1.898963730569948e-07, + "loss": -0.0003, + "reward": 2.4999948740005493, + "reward_std": 2.015277232203516e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 3128 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.106217616580311, + "grad_norm": 0.3020402215721515, + "kl": 0.15673828125, + "learning_rate": 1.8963730569948186e-07, + "loss": 0.0013, + "reward": 2.499970555305481, + "reward_std": 4.650092932934058e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999704360961914, + "step": 3129 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.10880829015544, + "grad_norm": 1.3645610531940051, + "kl": 0.292236328125, + "learning_rate": 1.8937823834196889e-07, + "loss": 0.0011, + "reward": 2.499994397163391, + "reward_std": 5.65704476684914e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999943971633911, + "step": 3130 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.11139896373057, + "grad_norm": 1.1187198055290284, + "kl": 0.0947265625, + "learning_rate": 1.8911917098445597e-07, + "loss": 0.0003, + "reward": 2.4998831748962402, + "reward_std": 1.1185610333086515e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998833537101746, + "step": 3131 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.1139896373057, + "grad_norm": 11.053215892568899, + "kl": 0.0592041015625, + "learning_rate": 1.88860103626943e-07, + "loss": 0.0003, + "reward": 1.9865660667419434, + "reward_std": 0.00017148313304460316, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.486566036939621, + "step": 3132 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.11658031088083, + "grad_norm": 1.0770346841812293, + "kl": 0.126220703125, + "learning_rate": 1.8860103626943004e-07, + "loss": 0.0007, + "reward": 2.4999959468841553, + "reward_std": 3.7938548871352396e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3133 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.119170984455959, + "grad_norm": 0.06260182223385052, + "kl": 0.063232421875, + "learning_rate": 1.883419689119171e-07, + "loss": -0.0006, + "reward": 2.4999985694885254, + "reward_std": 1.2781838734099438e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3134 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.121761658031089, + "grad_norm": 0.9148089307791176, + "kl": 0.07958984375, + "learning_rate": 1.8808290155440415e-07, + "loss": 0.0012, + "reward": 2.4999934434890747, + "reward_std": 3.838800864741643e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999934434890747, + "step": 3135 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.124352331606218, + "grad_norm": 0.06794014032604263, + "kl": 0.15087890625, + "learning_rate": 1.8782383419689118e-07, + "loss": 0.0008, + "reward": 2.4999977350234985, + "reward_std": 1.860860294300437e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3136 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.126943005181348, + "grad_norm": 1.715661609603444, + "kl": 0.15966796875, + "learning_rate": 1.8756476683937823e-07, + "loss": 0.002, + "reward": 2.499993681907654, + "reward_std": 7.149828434194205e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 3137 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.129533678756477, + "grad_norm": 0.29388496699105326, + "kl": 0.1199951171875, + "learning_rate": 1.8730569948186528e-07, + "loss": 0.0009, + "reward": 2.4999974966049194, + "reward_std": 1.6710372392481077e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3138 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.132124352331607, + "grad_norm": 8.956326765142041, + "kl": 0.1494140625, + "learning_rate": 1.870466321243523e-07, + "loss": 0.0013, + "reward": 1.893113374710083, + "reward_std": 0.0005703785835180497, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.393113374710083, + "step": 3139 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.134715025906736, + "grad_norm": 1.8511737222706086, + "kl": 0.0673828125, + "learning_rate": 1.8678756476683939e-07, + "loss": 0.0012, + "reward": 2.499923348426819, + "reward_std": 1.6954223610810004e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999232292175293, + "step": 3140 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.137305699481866, + "grad_norm": 0.5563064592937442, + "kl": 0.09765625, + "learning_rate": 1.865284974093264e-07, + "loss": 0.0005, + "reward": 2.4999970197677612, + "reward_std": 3.406056521271239e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3141 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.139896373056995, + "grad_norm": 0.16434538043521746, + "kl": 0.06640625, + "learning_rate": 1.8626943005181347e-07, + "loss": 0.0003, + "reward": 2.499997615814209, + "reward_std": 1.5512517848037533e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3142 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.142487046632125, + "grad_norm": 0.45062434986212524, + "kl": 0.1502685546875, + "learning_rate": 1.8601036269430052e-07, + "loss": 0.0006, + "reward": 2.499995470046997, + "reward_std": 3.1197882321976067e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 3143 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.145077720207254, + "grad_norm": 2.94622335806529, + "kl": 0.08447265625, + "learning_rate": 1.8575129533678754e-07, + "loss": -0.001, + "reward": 2.499998092651367, + "reward_std": 1.9509256503624783e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3144 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.147668393782384, + "grad_norm": 0.7310549388314892, + "kl": 0.08251953125, + "learning_rate": 1.854922279792746e-07, + "loss": 0.0006, + "reward": 2.4999958276748657, + "reward_std": 3.7773199892399134e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 3145 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.150259067357513, + "grad_norm": 5.416096981349559, + "kl": 0.1904296875, + "learning_rate": 1.8523316062176165e-07, + "loss": -0.0004, + "reward": 1.9998480081558228, + "reward_std": 2.187202250070186e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998481273651123, + "step": 3146 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.152849740932643, + "grad_norm": 0.22841564104835865, + "kl": 0.15966796875, + "learning_rate": 1.849740932642487e-07, + "loss": 0.0012, + "reward": 2.499998688697815, + "reward_std": 1.6061570136116643e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3147 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.155440414507773, + "grad_norm": 0.026950303971103606, + "kl": 0.02691650390625, + "learning_rate": 1.8471502590673573e-07, + "loss": -0.0006, + "reward": 2.499998927116394, + "reward_std": 8.433520122252958e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 3148 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.158031088082902, + "grad_norm": 24.674175850704486, + "kl": 0.0599365234375, + "learning_rate": 1.844559585492228e-07, + "loss": 0.0005, + "reward": 2.4374873638153076, + "reward_std": 0.1767964861218161, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.937487244606018, + "step": 3149 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.160621761658032, + "grad_norm": 0.11261020228082523, + "kl": 0.056884765625, + "learning_rate": 1.8419689119170983e-07, + "loss": 0.0005, + "reward": 2.4999985694885254, + "reward_std": 1.6432234133390011e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3150 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5625, + "epoch": 8.163212435233161, + "grad_norm": 0.47404691759261275, + "kl": 0.1177978515625, + "learning_rate": 1.8393782383419686e-07, + "loss": 0.0006, + "reward": 2.4999961853027344, + "reward_std": 2.001675397877989e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3151 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 8.16580310880829, + "grad_norm": 0.6225218120001401, + "kl": 0.1572265625, + "learning_rate": 1.8367875647668394e-07, + "loss": 0.0015, + "reward": 2.4999966621398926, + "reward_std": 3.235610051888216e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3152 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.16839378238342, + "grad_norm": 14.124217296588778, + "kl": 0.0859375, + "learning_rate": 1.8341968911917097e-07, + "loss": -0.0003, + "reward": 2.4998735189437866, + "reward_std": 5.106800938392553e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998736381530762, + "step": 3153 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.17098445595855, + "grad_norm": 2.6514014136130823, + "kl": 0.07763671875, + "learning_rate": 1.8316062176165802e-07, + "loss": 0.0012, + "reward": 2.499994158744812, + "reward_std": 3.814357114606537e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 3154 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.17357512953368, + "grad_norm": 5.63601193786011, + "kl": 0.157958984375, + "learning_rate": 1.8290155440414507e-07, + "loss": 0.0005, + "reward": 2.49994158744812, + "reward_std": 1.3565708542273569e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999415278434753, + "step": 3155 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.176165803108809, + "grad_norm": 0.40740725090935975, + "kl": 0.26708984375, + "learning_rate": 1.8264248704663212e-07, + "loss": 0.0009, + "reward": 2.4999970197677612, + "reward_std": 2.193285922658106e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 3156 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.178756476683938, + "grad_norm": 1.606842154843521, + "kl": 0.0782470703125, + "learning_rate": 1.8238341968911915e-07, + "loss": 0.0002, + "reward": 2.4998984336853027, + "reward_std": 1.4305656236501818e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998984932899475, + "step": 3157 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.181347150259068, + "grad_norm": 0.028530366933778497, + "kl": 0.103240966796875, + "learning_rate": 1.821243523316062e-07, + "loss": 0.0001, + "reward": 2.4999992847442627, + "reward_std": 5.331021810661696e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999463558197, + "step": 3158 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.183937823834198, + "grad_norm": 0.6194798767092272, + "kl": 0.0675048828125, + "learning_rate": 1.8186528497409325e-07, + "loss": -0.0002, + "reward": 2.499990940093994, + "reward_std": 4.0072045806027745e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990999698639, + "step": 3159 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.186528497409327, + "grad_norm": 0.08955910761199547, + "kl": 0.05010986328125, + "learning_rate": 1.8160621761658028e-07, + "loss": 0.0003, + "reward": 2.4999974966049194, + "reward_std": 1.7598734416424122e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3160 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.189119170984457, + "grad_norm": 19.591692519743233, + "kl": 0.2783203125, + "learning_rate": 1.8134715025906736e-07, + "loss": 0.001, + "reward": 1.895760178565979, + "reward_std": 0.1790335430414416, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3957601189613342, + "step": 3161 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.191709844559586, + "grad_norm": 9.84818316453554, + "kl": 0.093505859375, + "learning_rate": 1.8108808290155439e-07, + "loss": -0.0004, + "reward": 2.437490224838257, + "reward_std": 0.17679470868677072, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374903440475464, + "step": 3162 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.194300518134716, + "grad_norm": 5.757630431670943, + "kl": 0.15234375, + "learning_rate": 1.8082901554404144e-07, + "loss": 0.0002, + "reward": 1.99784916639328, + "reward_std": 5.932166192224031e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4978492856025696, + "step": 3163 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.196891191709845, + "grad_norm": 0.7914224891039795, + "kl": 0.031280517578125, + "learning_rate": 1.805699481865285e-07, + "loss": 0.0011, + "reward": 2.499995470046997, + "reward_std": 3.156705020046502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3164 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.199481865284975, + "grad_norm": 1.2763777362126756, + "kl": 0.132568359375, + "learning_rate": 1.8031088082901554e-07, + "loss": 0.0004, + "reward": 2.4999934434890747, + "reward_std": 7.576135601539136e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935030937195, + "step": 3165 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 8.202072538860104, + "grad_norm": 5.328442605531321, + "kl": 0.071533203125, + "learning_rate": 1.8005181347150257e-07, + "loss": -0.0008, + "reward": 2.4999911785125732, + "reward_std": 1.9563993191695772e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 3166 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.204663212435234, + "grad_norm": 0.10827715775895738, + "kl": 0.138427734375, + "learning_rate": 1.7979274611398962e-07, + "loss": 0.0004, + "reward": 2.4999985694885254, + "reward_std": 1.2430288052200922e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3167 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.207253886010363, + "grad_norm": 112.12876758785984, + "kl": 0.13623046875, + "learning_rate": 1.7953367875647668e-07, + "loss": 0.0007, + "reward": 1.9093554615974426, + "reward_std": 0.0021399448464762827, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4093554019927979, + "step": 3168 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.209844559585493, + "grad_norm": 3.164246502147246, + "kl": 0.127685546875, + "learning_rate": 1.792746113989637e-07, + "loss": -0.0001, + "reward": 2.4999929666519165, + "reward_std": 8.70200028657564e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 3169 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.212435233160623, + "grad_norm": 0.08608571929338761, + "kl": 0.08941650390625, + "learning_rate": 1.7901554404145078e-07, + "loss": -0.0007, + "reward": 2.499998450279236, + "reward_std": 1.0259028613290866e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3170 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.215025906735752, + "grad_norm": 0.9030541234213424, + "kl": 0.148193359375, + "learning_rate": 1.787564766839378e-07, + "loss": 0.0013, + "reward": 2.4999970197677612, + "reward_std": 3.1579944561599405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3171 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.217616580310882, + "grad_norm": 4.401430294703537, + "kl": 0.232421875, + "learning_rate": 1.784974093264249e-07, + "loss": 0.0015, + "reward": 1.8292223811149597, + "reward_std": 0.0004880217588834057, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3292223811149597, + "step": 3172 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 8.220207253886011, + "grad_norm": 78.62242857220241, + "kl": 0.103515625, + "learning_rate": 1.782383419689119e-07, + "loss": 0.0009, + "reward": 2.418749451637268, + "reward_std": 0.22980716611471053, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9187493920326233, + "step": 3173 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.22279792746114, + "grad_norm": 0.1921341405034419, + "kl": 0.13525390625, + "learning_rate": 1.7797927461139894e-07, + "loss": -0.0005, + "reward": 2.499997138977051, + "reward_std": 2.932723759840883e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3174 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.22538860103627, + "grad_norm": 4.5646627228989765, + "kl": 0.14697265625, + "learning_rate": 1.7772020725388602e-07, + "loss": 0.0012, + "reward": 1.8232365846633911, + "reward_std": 0.00029939485773411434, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3232364058494568, + "step": 3175 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.2279792746114, + "grad_norm": 0.3390435865079533, + "kl": 0.156494140625, + "learning_rate": 1.7746113989637304e-07, + "loss": 0.0012, + "reward": 2.499997615814209, + "reward_std": 1.4843980693513004e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3176 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.23056994818653, + "grad_norm": 0.4156584054894673, + "kl": 0.03814697265625, + "learning_rate": 1.772020725388601e-07, + "loss": 0.0009, + "reward": 2.4999972581863403, + "reward_std": 2.199124139679043e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 3177 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.233160621761659, + "grad_norm": 2.75713145280005, + "kl": 0.08990478515625, + "learning_rate": 1.7694300518134715e-07, + "loss": 0.0005, + "reward": 1.9898239374160767, + "reward_std": 0.00011030767427655519, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4898239374160767, + "step": 3178 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.235751295336788, + "grad_norm": 0.5804819653866367, + "kl": 0.0433349609375, + "learning_rate": 1.766839378238342e-07, + "loss": -0.0004, + "reward": 2.4999924898147583, + "reward_std": 2.4749996825335074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999924898147583, + "step": 3179 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.375, + "epoch": 8.238341968911918, + "grad_norm": 61.26426281086702, + "kl": 0.124755859375, + "learning_rate": 1.7642487046632123e-07, + "loss": -0.0002, + "reward": 1.9530822038650513, + "reward_std": 0.3375635552838503, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4530822932720184, + "step": 3180 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.240932642487047, + "grad_norm": 0.706148499646634, + "kl": 0.0645751953125, + "learning_rate": 1.7616580310880828e-07, + "loss": 0.0004, + "reward": 2.499987006187439, + "reward_std": 3.5811082170766895e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 3181 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.243523316062177, + "grad_norm": 0.4031269658075301, + "kl": 0.0467529296875, + "learning_rate": 1.7590673575129533e-07, + "loss": 0.0011, + "reward": 2.499997615814209, + "reward_std": 1.925787387335731e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3182 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.246113989637305, + "grad_norm": 0.10684306024162532, + "kl": 0.125, + "learning_rate": 1.7564766839378236e-07, + "loss": 0.0005, + "reward": 2.499998450279236, + "reward_std": 1.6925793318023352e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3183 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 8.248704663212436, + "grad_norm": 0.1248106103613605, + "kl": 0.0657958984375, + "learning_rate": 1.7538860103626944e-07, + "loss": -0.0005, + "reward": 2.4999966621398926, + "reward_std": 2.004274335831724e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 3184 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.251295336787564, + "grad_norm": 0.17905178420023277, + "kl": 0.140625, + "learning_rate": 1.7512953367875647e-07, + "loss": 0.0007, + "reward": 2.49999737739563, + "reward_std": 2.679004865058232e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 3185 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.253886010362695, + "grad_norm": 0.0770172521660824, + "kl": 0.0577392578125, + "learning_rate": 1.7487046632124352e-07, + "loss": 0.0004, + "reward": 2.4999990463256836, + "reward_std": 7.139431517089179e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3186 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.256476683937823, + "grad_norm": 0.27198644668703004, + "kl": 0.117919921875, + "learning_rate": 1.7461139896373057e-07, + "loss": 0.0019, + "reward": 2.499997854232788, + "reward_std": 3.172386755068146e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3187 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.259067357512953, + "grad_norm": 0.6302493528208652, + "kl": 0.09375, + "learning_rate": 1.743523316062176e-07, + "loss": -0.0001, + "reward": 1.9999113082885742, + "reward_std": 1.0783291259031103e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999111890792847, + "step": 3188 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.261658031088082, + "grad_norm": 0.15868275659507172, + "kl": 0.093994140625, + "learning_rate": 1.7409326424870465e-07, + "loss": 0.0015, + "reward": 2.499998450279236, + "reward_std": 1.6560301787649223e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3189 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.264248704663212, + "grad_norm": 0.5528136867642289, + "kl": 0.26123046875, + "learning_rate": 1.738341968911917e-07, + "loss": 0.0022, + "reward": 2.499996066093445, + "reward_std": 3.767788598452171e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3190 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.266839378238341, + "grad_norm": 0.12798082023375656, + "kl": 0.06103515625, + "learning_rate": 1.7357512953367876e-07, + "loss": 0.0002, + "reward": 2.499996781349182, + "reward_std": 2.8758530561390216e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3191 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.26943005181347, + "grad_norm": 41.452763438953845, + "kl": 0.081787109375, + "learning_rate": 1.7331606217616578e-07, + "loss": 0.0003, + "reward": 1.9954372644424438, + "reward_std": 0.00017896358451707783, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4954373240470886, + "step": 3192 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.2720207253886, + "grad_norm": 0.6773730054031986, + "kl": 0.09912109375, + "learning_rate": 1.7305699481865286e-07, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 3.818687446255353e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3193 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.27461139896373, + "grad_norm": 0.3930135528304384, + "kl": 0.161865234375, + "learning_rate": 1.727979274611399e-07, + "loss": 0.0015, + "reward": 2.499970555305481, + "reward_std": 5.137363700669084e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999704360961914, + "step": 3194 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.27720207253886, + "grad_norm": 0.09960299696722485, + "kl": 0.076171875, + "learning_rate": 1.7253886010362694e-07, + "loss": 0.0002, + "reward": 2.499995708465576, + "reward_std": 1.4667808159174456e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3195 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.279792746113989, + "grad_norm": 0.1589247901078338, + "kl": 0.0526123046875, + "learning_rate": 1.72279792746114e-07, + "loss": -0.0004, + "reward": 2.4999983310699463, + "reward_std": 2.04998809749668e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3196 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.282383419689118, + "grad_norm": 55.14298222425625, + "kl": 0.22607421875, + "learning_rate": 1.7202072538860102e-07, + "loss": 0.0012, + "reward": 1.814697265625, + "reward_std": 0.0668470896450799, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3146972060203552, + "step": 3197 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.284974093264248, + "grad_norm": 0.3711200936635606, + "kl": 0.0582275390625, + "learning_rate": 1.7176165803108807e-07, + "loss": -0.0003, + "reward": 2.4999959468841553, + "reward_std": 3.7261788747855462e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3198 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.287564766839377, + "grad_norm": 0.17464923663992982, + "kl": 0.10992431640625, + "learning_rate": 1.7150259067357512e-07, + "loss": 0.0015, + "reward": 2.4999969005584717, + "reward_std": 2.3411973870679503e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3199 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.875, + "epoch": 8.290155440414507, + "grad_norm": 5.1900507839793235, + "kl": 0.111328125, + "learning_rate": 1.7124352331606218e-07, + "loss": 0.0005, + "reward": 1.9924770593643188, + "reward_std": 0.00012739573179487707, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4924769699573517, + "step": 3200 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.292746113989637, + "grad_norm": 1.196564466134288, + "kl": 0.079345703125, + "learning_rate": 1.709844559585492e-07, + "loss": 0.0007, + "reward": 2.499959707260132, + "reward_std": 1.264332422579173e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999597668647766, + "step": 3201 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.295336787564766, + "grad_norm": 0.5509604968231264, + "kl": 0.058837890625, + "learning_rate": 1.7072538860103628e-07, + "loss": -0.0002, + "reward": 2.4999970197677612, + "reward_std": 3.7912963648523146e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3202 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.297927461139896, + "grad_norm": 0.18974085068456928, + "kl": 0.09326171875, + "learning_rate": 1.704663212435233e-07, + "loss": 0.0002, + "reward": 2.499997854232788, + "reward_std": 1.5254385630214529e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3203 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.300518134715025, + "grad_norm": 2.6871452717421347, + "kl": 0.0576171875, + "learning_rate": 1.7020725388601033e-07, + "loss": 0.0004, + "reward": 2.4999730587005615, + "reward_std": 2.546031603856136e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999972939491272, + "step": 3204 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.303108808290155, + "grad_norm": 0.46416535826530864, + "kl": 0.114013671875, + "learning_rate": 1.6994818652849741e-07, + "loss": 0.0007, + "reward": 2.4999958276748657, + "reward_std": 4.411420036376512e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 3205 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.305699481865284, + "grad_norm": 5.507740085804502, + "kl": 0.068115234375, + "learning_rate": 1.6968911917098444e-07, + "loss": 0.0003, + "reward": 2.4999477863311768, + "reward_std": 1.1819582141470164e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999478459358215, + "step": 3206 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.308290155440414, + "grad_norm": 1.9837395328087248, + "kl": 0.0574951171875, + "learning_rate": 1.694300518134715e-07, + "loss": 0.0011, + "reward": 1.999694585800171, + "reward_std": 1.8257314195579966e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4996943473815918, + "step": 3207 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.310880829015543, + "grad_norm": 0.04771270350003987, + "kl": 0.07830810546875, + "learning_rate": 1.6917098445595854e-07, + "loss": 0.0011, + "reward": 2.499998927116394, + "reward_std": 7.389864435936033e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3208 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.313471502590673, + "grad_norm": 0.25175274540640064, + "kl": 0.0579833984375, + "learning_rate": 1.689119170984456e-07, + "loss": 0.0004, + "reward": 2.499994397163391, + "reward_std": 2.7393241737172502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945163726807, + "step": 3209 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.316062176165802, + "grad_norm": 0.2873621850839714, + "kl": 0.0936279296875, + "learning_rate": 1.6865284974093262e-07, + "loss": 0.0005, + "reward": 2.4999983310699463, + "reward_std": 1.3285900308801502e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3210 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.318652849740932, + "grad_norm": 0.4044842859906627, + "kl": 0.22021484375, + "learning_rate": 1.6839378238341968e-07, + "loss": 0.0003, + "reward": 2.4999961853027344, + "reward_std": 5.079967422716436e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3211 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.321243523316062, + "grad_norm": 0.08779252572221974, + "kl": 0.10302734375, + "learning_rate": 1.6813471502590673e-07, + "loss": 0.0004, + "reward": 2.4999970197677612, + "reward_std": 1.6046812447712e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3212 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.323834196891191, + "grad_norm": 2.3270036307642306, + "kl": 0.15380859375, + "learning_rate": 1.6787564766839376e-07, + "loss": -0.0002, + "reward": 2.4999914169311523, + "reward_std": 5.598668735729007e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914765357971, + "step": 3213 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.32642487046632, + "grad_norm": 0.38672817708323914, + "kl": 0.10302734375, + "learning_rate": 1.6761658031088083e-07, + "loss": 0.0004, + "reward": 2.4999840259552, + "reward_std": 4.8193414841080084e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999840259552002, + "step": 3214 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.32901554404145, + "grad_norm": 3.6283125586118645, + "kl": 0.089111328125, + "learning_rate": 1.6735751295336786e-07, + "loss": -0.0005, + "reward": 2.4999771118164062, + "reward_std": 1.2718164953184896e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999772906303406, + "step": 3215 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.33160621761658, + "grad_norm": 3.5250134204386807, + "kl": 0.0859375, + "learning_rate": 1.6709844559585491e-07, + "loss": -0.0001, + "reward": 1.9801177978515625, + "reward_std": 0.00020493308011282352, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4801177680492401, + "step": 3216 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.33419689119171, + "grad_norm": 0.38752575444168663, + "kl": 0.060546875, + "learning_rate": 1.6683937823834197e-07, + "loss": -0.0002, + "reward": 2.4999988079071045, + "reward_std": 1.037070063603096e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 3217 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.336787564766839, + "grad_norm": 0.7300713622872141, + "kl": 0.07666015625, + "learning_rate": 1.66580310880829e-07, + "loss": 0.0, + "reward": 2.4999929666519165, + "reward_std": 4.953808229402057e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 3218 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.339378238341968, + "grad_norm": 10.606544763115886, + "kl": 0.2266845703125, + "learning_rate": 1.6632124352331605e-07, + "loss": 0.0018, + "reward": 1.9905086755752563, + "reward_std": 0.00011018002430773777, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4905085563659668, + "step": 3219 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.341968911917098, + "grad_norm": 0.5209999027051346, + "kl": 0.120849609375, + "learning_rate": 1.660621761658031e-07, + "loss": 0.0008, + "reward": 2.4999935626983643, + "reward_std": 3.124012891930761e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 3220 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.344559585492227, + "grad_norm": 0.1573626252593982, + "kl": 0.044921875, + "learning_rate": 1.6580310880829015e-07, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 2.0584985236382636e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3221 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.347150259067357, + "grad_norm": 5.004434905306625, + "kl": 0.1611328125, + "learning_rate": 1.6554404145077718e-07, + "loss": -0.0003, + "reward": 1.9988044500350952, + "reward_std": 6.160921429909649e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988045692443848, + "step": 3222 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.349740932642487, + "grad_norm": 0.49659734021737595, + "kl": 0.031829833984375, + "learning_rate": 1.6528497409326426e-07, + "loss": -0.0002, + "reward": 2.499996781349182, + "reward_std": 2.486671633050719e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3223 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.352331606217616, + "grad_norm": 0.5465098105777704, + "kl": 0.15576171875, + "learning_rate": 1.6502590673575128e-07, + "loss": -0.0011, + "reward": 2.4999961853027344, + "reward_std": 2.6978894993590075e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3224 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.354922279792746, + "grad_norm": 4.365680888853067, + "kl": 0.08984375, + "learning_rate": 1.6476683937823836e-07, + "loss": 0.0005, + "reward": 2.499997615814209, + "reward_std": 2.0819348378608993e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3225 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.357512953367875, + "grad_norm": 0.9122588700547437, + "kl": 0.079833984375, + "learning_rate": 1.645077720207254e-07, + "loss": -0.0001, + "reward": 1.9988069534301758, + "reward_std": 2.5916017477811693e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498807042837143, + "step": 3226 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.360103626943005, + "grad_norm": 2.840972529513588, + "kl": 0.13037109375, + "learning_rate": 1.6424870466321241e-07, + "loss": 0.0011, + "reward": 1.8726292848587036, + "reward_std": 0.0002964056578775853, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3726292252540588, + "step": 3227 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.362694300518134, + "grad_norm": 0.1604427777396762, + "kl": 0.0936279296875, + "learning_rate": 1.639896373056995e-07, + "loss": -0.0005, + "reward": 2.4999935626983643, + "reward_std": 2.405010377515282e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 3228 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.365284974093264, + "grad_norm": 0.08633581676479131, + "kl": 0.089111328125, + "learning_rate": 1.6373056994818652e-07, + "loss": -0.0001, + "reward": 2.499998450279236, + "reward_std": 1.1789842062626121e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3229 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 8.367875647668393, + "grad_norm": 1.4393329729158233, + "kl": 0.568359375, + "learning_rate": 1.6347150259067357e-07, + "loss": 0.0021, + "reward": 2.4999966621398926, + "reward_std": 4.821636252927419e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 3230 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.370466321243523, + "grad_norm": 0.41139015268834717, + "kl": 0.0634765625, + "learning_rate": 1.6321243523316062e-07, + "loss": 0.0005, + "reward": 2.4999823570251465, + "reward_std": 6.151422212496982e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999822974205017, + "step": 3231 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.373056994818652, + "grad_norm": 0.025479270067198755, + "kl": 0.0599365234375, + "learning_rate": 1.6295336787564768e-07, + "loss": 0.0001, + "reward": 2.499999165534973, + "reward_std": 7.754772184398462e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999993443489075, + "step": 3232 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.375647668393782, + "grad_norm": 1.4660702928844889, + "kl": 0.0849609375, + "learning_rate": 1.626943005181347e-07, + "loss": 0.0001, + "reward": 1.9989567995071411, + "reward_std": 4.1266665164130245e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989569187164307, + "step": 3233 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.378238341968911, + "grad_norm": 3.3307177770463445, + "kl": 0.169921875, + "learning_rate": 1.6243523316062176e-07, + "loss": 0.0002, + "reward": 1.998131275177002, + "reward_std": 7.804911570019613e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4981312453746796, + "step": 3234 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.380829015544041, + "grad_norm": 0.08126447326729487, + "kl": 0.061767578125, + "learning_rate": 1.621761658031088e-07, + "loss": 0.0003, + "reward": 2.4999961853027344, + "reward_std": 2.5046920768545533e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3235 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.38341968911917, + "grad_norm": 0.05105797867997188, + "kl": 0.08917236328125, + "learning_rate": 1.6191709844559583e-07, + "loss": -0.0001, + "reward": 2.4999983310699463, + "reward_std": 8.156730189057271e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3236 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.3860103626943, + "grad_norm": 0.33430201719343583, + "kl": 0.0751953125, + "learning_rate": 1.6165803108808291e-07, + "loss": 0.0002, + "reward": 2.499993920326233, + "reward_std": 4.37150174548151e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 3237 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.38860103626943, + "grad_norm": 10.517502015421686, + "kl": 0.127197265625, + "learning_rate": 1.6139896373056994e-07, + "loss": 0.0004, + "reward": 1.9886282682418823, + "reward_std": 0.0002502906878021349, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.48862823843956, + "step": 3238 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.39119170984456, + "grad_norm": 4.753382783369989, + "kl": 0.104736328125, + "learning_rate": 1.61139896373057e-07, + "loss": 0.0009, + "reward": 2.4996172189712524, + "reward_std": 3.332807591505116e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996171593666077, + "step": 3239 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.393782383419689, + "grad_norm": 0.1077857479588344, + "kl": 0.091552734375, + "learning_rate": 1.6088082901554405e-07, + "loss": -0.0011, + "reward": 2.4999972581863403, + "reward_std": 1.978998000140564e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3240 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.396373056994818, + "grad_norm": 0.17387818343761, + "kl": 0.089599609375, + "learning_rate": 1.6062176165803107e-07, + "loss": -0.0001, + "reward": 2.4999972581863403, + "reward_std": 2.1541280261772044e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3241 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.398963730569948, + "grad_norm": 0.07814533365428449, + "kl": 0.0535888671875, + "learning_rate": 1.6036269430051812e-07, + "loss": 0.0001, + "reward": 2.4999990463256836, + "reward_std": 1.040197076918048e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 3242 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.401554404145077, + "grad_norm": 0.37308276998310835, + "kl": 0.117919921875, + "learning_rate": 1.6010362694300518e-07, + "loss": 0.0015, + "reward": 2.4999940395355225, + "reward_std": 2.4687550705948524e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 3243 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.125, + "epoch": 8.404145077720207, + "grad_norm": 0.34881767606382474, + "kl": 0.146484375, + "learning_rate": 1.5984455958549223e-07, + "loss": 0.0003, + "reward": 2.49999463558197, + "reward_std": 3.860333890770562e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 3244 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.406735751295336, + "grad_norm": 1.3354813148493538, + "kl": 0.083251953125, + "learning_rate": 1.5958549222797926e-07, + "loss": 0.0002, + "reward": 2.499993681907654, + "reward_std": 4.841173563363554e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999936819076538, + "step": 3245 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.409326424870466, + "grad_norm": 0.08592330904696557, + "kl": 0.10400390625, + "learning_rate": 1.5932642487046634e-07, + "loss": 0.0006, + "reward": 2.4999985694885254, + "reward_std": 1.1443437415437074e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3246 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.411917098445596, + "grad_norm": 2.3490496448675167, + "kl": 0.100830078125, + "learning_rate": 1.5906735751295336e-07, + "loss": 0.0, + "reward": 1.9984028339385986, + "reward_std": 2.9188460757723078e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984028935432434, + "step": 3247 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.414507772020725, + "grad_norm": 0.06966504060667966, + "kl": 0.13427734375, + "learning_rate": 1.588082901554404e-07, + "loss": 0.0011, + "reward": 2.499997138977051, + "reward_std": 1.6413511332302733e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3248 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.417098445595855, + "grad_norm": 0.17927949218509098, + "kl": 0.08056640625, + "learning_rate": 1.5854922279792747e-07, + "loss": 0.0013, + "reward": 2.4999983310699463, + "reward_std": 2.2327928377308126e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3249 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.419689119170984, + "grad_norm": 0.037824058687407194, + "kl": 0.16748046875, + "learning_rate": 1.582901554404145e-07, + "loss": 0.001, + "reward": 2.499998450279236, + "reward_std": 7.957415704140658e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3250 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.422279792746114, + "grad_norm": 12.865394574033488, + "kl": 0.1265869140625, + "learning_rate": 1.5803108808290155e-07, + "loss": 0.0001, + "reward": 1.9134883284568787, + "reward_std": 0.0003624329820013372, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4134882986545563, + "step": 3251 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.424870466321243, + "grad_norm": 0.4403927991964329, + "kl": 0.16259765625, + "learning_rate": 1.577720207253886e-07, + "loss": 0.0002, + "reward": 1.9976167678833008, + "reward_std": 1.7445513776692678e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4976167678833008, + "step": 3252 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.427461139896373, + "grad_norm": 0.14228288015250842, + "kl": 0.1326904296875, + "learning_rate": 1.5751295336787565e-07, + "loss": 0.0007, + "reward": 2.4999982118606567, + "reward_std": 1.1551248348951049e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3253 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.430051813471502, + "grad_norm": 2.0148836197609494, + "kl": 0.1904296875, + "learning_rate": 1.5725388601036268e-07, + "loss": 0.0, + "reward": 2.499996304512024, + "reward_std": 3.282113539171405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3254 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.432642487046632, + "grad_norm": 3.57542976314061, + "kl": 0.125244140625, + "learning_rate": 1.5699481865284976e-07, + "loss": -0.001, + "reward": 2.499990224838257, + "reward_std": 7.6941531688135e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990463256836, + "step": 3255 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.435233160621761, + "grad_norm": 0.7637507544783885, + "kl": 0.1806640625, + "learning_rate": 1.5673575129533678e-07, + "loss": 0.0006, + "reward": 2.4999953508377075, + "reward_std": 6.560696192536852e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3256 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.437823834196891, + "grad_norm": 0.21408403832204953, + "kl": 0.0789794921875, + "learning_rate": 1.564766839378238e-07, + "loss": -0.0001, + "reward": 2.4999988079071045, + "reward_std": 7.541374600350537e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 3257 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.44041450777202, + "grad_norm": 0.3785837802152817, + "kl": 0.06884765625, + "learning_rate": 1.562176165803109e-07, + "loss": 0.0002, + "reward": 2.499995231628418, + "reward_std": 3.249091946599947e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 3258 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.44300518134715, + "grad_norm": 249.56412273715605, + "kl": 0.412109375, + "learning_rate": 1.5595854922279791e-07, + "loss": 0.0019, + "reward": 1.9868806600570679, + "reward_std": 0.000564672772085828, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4868807792663574, + "step": 3259 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.44559585492228, + "grad_norm": 0.15525033110824002, + "kl": 0.08642578125, + "learning_rate": 1.5569948186528497e-07, + "loss": 0.0007, + "reward": 2.4999985694885254, + "reward_std": 1.7415273134702147e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3260 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 8.44818652849741, + "grad_norm": 0.1675682962837026, + "kl": 0.0406494140625, + "learning_rate": 1.5544041450777202e-07, + "loss": 0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.4315731391434383e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3261 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.450777202072539, + "grad_norm": 0.28081459854541224, + "kl": 0.0960693359375, + "learning_rate": 1.5518134715025907e-07, + "loss": 0.001, + "reward": 2.4999982118606567, + "reward_std": 1.058569125689246e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3262 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.453367875647668, + "grad_norm": 0.5980743679822971, + "kl": 0.048095703125, + "learning_rate": 1.549222797927461e-07, + "loss": 0.0004, + "reward": 2.4999959468841553, + "reward_std": 3.200311198270356e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3263 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.455958549222798, + "grad_norm": 1.140386029914927, + "kl": 0.1435546875, + "learning_rate": 1.5466321243523315e-07, + "loss": 0.0004, + "reward": 2.4999970197677612, + "reward_std": 3.2830278087203624e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 3264 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.458549222797927, + "grad_norm": 1.673615121006343, + "kl": 0.084716796875, + "learning_rate": 1.544041450777202e-07, + "loss": 0.0001, + "reward": 2.499990224838257, + "reward_std": 6.327432458874682e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999903440475464, + "step": 3265 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.461139896373057, + "grad_norm": 0.2208932094048126, + "kl": 0.145751953125, + "learning_rate": 1.5414507772020723e-07, + "loss": 0.0002, + "reward": 2.499972105026245, + "reward_std": 4.598367240760126e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999722242355347, + "step": 3266 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.463730569948186, + "grad_norm": 0.1965320442956369, + "kl": 0.09619140625, + "learning_rate": 1.538860103626943e-07, + "loss": 0.0002, + "reward": 2.499996066093445, + "reward_std": 2.536736644742632e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3267 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 8.466321243523316, + "grad_norm": 18.037481291621404, + "kl": 0.210693359375, + "learning_rate": 1.5362694300518134e-07, + "loss": 0.0011, + "reward": 1.9484704732894897, + "reward_std": 0.0016775362241787661, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4484704732894897, + "step": 3268 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.468911917098445, + "grad_norm": 0.25522864478434343, + "kl": 0.072265625, + "learning_rate": 1.533678756476684e-07, + "loss": 0.0009, + "reward": 2.499997138977051, + "reward_std": 1.963451666142646e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 3269 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.471502590673575, + "grad_norm": 0.09003623791447991, + "kl": 0.02203369140625, + "learning_rate": 1.5310880829015544e-07, + "loss": 0.0001, + "reward": 2.4999969005584717, + "reward_std": 1.6059943277468847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3270 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.474093264248705, + "grad_norm": 0.20795154777892821, + "kl": 0.0360107421875, + "learning_rate": 1.5284974093264247e-07, + "loss": -0.0007, + "reward": 2.499996781349182, + "reward_std": 3.3171672839671373e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3271 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.476683937823834, + "grad_norm": 0.21255969204162925, + "kl": 0.08123779296875, + "learning_rate": 1.5259067357512952e-07, + "loss": 0.0012, + "reward": 2.4999959468841553, + "reward_std": 2.565349518590665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3272 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.479274611398964, + "grad_norm": 0.05256515725884933, + "kl": 0.081298828125, + "learning_rate": 1.5233160621761657e-07, + "loss": -0.0001, + "reward": 2.499995470046997, + "reward_std": 1.0015135103458306e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 3273 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.481865284974093, + "grad_norm": 0.422305608252991, + "kl": 0.063232421875, + "learning_rate": 1.5207253886010362e-07, + "loss": 0.0004, + "reward": 2.4999953508377075, + "reward_std": 2.56870990256175e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3274 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.484455958549223, + "grad_norm": 0.11164640534135978, + "kl": 0.204345703125, + "learning_rate": 1.5181347150259065e-07, + "loss": 0.0018, + "reward": 2.4999979734420776, + "reward_std": 1.6274793779302854e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3275 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.487046632124352, + "grad_norm": 1.7973177731477983, + "kl": 0.06671142578125, + "learning_rate": 1.5155440414507773e-07, + "loss": 0.0003, + "reward": 2.4999920129776, + "reward_std": 5.526389429633127e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 3276 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.489637305699482, + "grad_norm": 1.7222743390429385, + "kl": 0.07763671875, + "learning_rate": 1.5129533678756476e-07, + "loss": 0.0005, + "reward": 2.4999918937683105, + "reward_std": 7.834495420411258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999918341636658, + "step": 3277 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 8.492227979274611, + "grad_norm": 0.3201813158632273, + "kl": 0.1708984375, + "learning_rate": 1.5103626943005178e-07, + "loss": 0.0004, + "reward": 2.499995231628418, + "reward_std": 3.447210815465951e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 3278 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 8.494818652849741, + "grad_norm": 13.389776839579126, + "kl": 0.1268310546875, + "learning_rate": 1.5077720207253886e-07, + "loss": 0.001, + "reward": 1.8866075277328491, + "reward_std": 0.00038939237373369906, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.386607587337494, + "step": 3279 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.49740932642487, + "grad_norm": 888.290757360333, + "kl": 0.09588623046875, + "learning_rate": 1.505181347150259e-07, + "loss": 0.0004, + "reward": 1.983170986175537, + "reward_std": 0.0002128577453959224, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4831709265708923, + "step": 3280 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.5, + "grad_norm": 8.183365356282343, + "kl": 0.070556640625, + "learning_rate": 1.5025906735751294e-07, + "loss": 0.0009, + "reward": 2.499973773956299, + "reward_std": 2.2136076040624175e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999973714351654, + "step": 3281 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.50259067357513, + "grad_norm": 0.7334981416727272, + "kl": 0.049072265625, + "learning_rate": 1.5e-07, + "loss": 0.0, + "reward": 2.499978542327881, + "reward_std": 4.261434241925599e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999786615371704, + "step": 3282 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.505181347150259, + "grad_norm": 0.20178492205725573, + "kl": 0.0592041015625, + "learning_rate": 1.4974093264248705e-07, + "loss": 0.0005, + "reward": 2.499996304512024, + "reward_std": 2.4263547402370023e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3283 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 8.507772020725389, + "grad_norm": 40.50431362763718, + "kl": 0.089111328125, + "learning_rate": 1.4948186528497407e-07, + "loss": 0.0011, + "reward": 2.1100460290908813, + "reward_std": 0.24016640169040215, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6100459098815918, + "step": 3284 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.510362694300518, + "grad_norm": 0.4338287372722202, + "kl": 0.0908203125, + "learning_rate": 1.4922279792746112e-07, + "loss": 0.0014, + "reward": 2.4999974966049194, + "reward_std": 2.4449706188534037e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3285 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.512953367875648, + "grad_norm": 0.04874827423322555, + "kl": 0.014404296875, + "learning_rate": 1.4896373056994818e-07, + "loss": 0.0005, + "reward": 2.4999982118606567, + "reward_std": 1.0233412126581243e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3286 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.515544041450777, + "grad_norm": 0.47325646519770215, + "kl": 0.0994873046875, + "learning_rate": 1.487046632124352e-07, + "loss": 0.0011, + "reward": 2.4999959468841553, + "reward_std": 4.1753178265935276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3287 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.518134715025907, + "grad_norm": 2.6325875621177306, + "kl": 0.140380859375, + "learning_rate": 1.4844559585492228e-07, + "loss": 0.0006, + "reward": 2.4999921321868896, + "reward_std": 5.6654648687981535e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920725822449, + "step": 3288 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.520725388601036, + "grad_norm": 0.048611474378057934, + "kl": 0.060546875, + "learning_rate": 1.481865284974093e-07, + "loss": -0.0005, + "reward": 2.4999988079071045, + "reward_std": 7.428212995819194e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3289 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.523316062176166, + "grad_norm": 32.34324384763752, + "kl": 0.103271484375, + "learning_rate": 1.479274611398964e-07, + "loss": 0.0001, + "reward": 1.8813066482543945, + "reward_std": 0.0010774543043226004, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3813066482543945, + "step": 3290 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.525906735751295, + "grad_norm": 0.05185809583555954, + "kl": 0.041259765625, + "learning_rate": 1.4766839378238341e-07, + "loss": 0.0012, + "reward": 2.4999969005584717, + "reward_std": 1.1581141450278665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 3291 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.528497409326425, + "grad_norm": 0.05662225260365626, + "kl": 0.06640625, + "learning_rate": 1.4740932642487047e-07, + "loss": 0.0006, + "reward": 2.4999982118606567, + "reward_std": 1.2409597047735588e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3292 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.531088082901555, + "grad_norm": 0.3393649641395822, + "kl": 0.084625244140625, + "learning_rate": 1.4715025906735752e-07, + "loss": -0.0004, + "reward": 2.4999873638153076, + "reward_std": 3.285905791017285e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999873638153076, + "step": 3293 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.533678756476684, + "grad_norm": 0.36928055302912044, + "kl": 0.162109375, + "learning_rate": 1.4689119170984455e-07, + "loss": -0.0001, + "reward": 2.499995708465576, + "reward_std": 3.145457355913095e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3294 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.536269430051814, + "grad_norm": 0.23091887539162143, + "kl": 0.091796875, + "learning_rate": 1.466321243523316e-07, + "loss": 0.0009, + "reward": 2.499981999397278, + "reward_std": 3.843254546609387e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999820590019226, + "step": 3295 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.538860103626943, + "grad_norm": 0.33369829537695067, + "kl": 0.0446929931640625, + "learning_rate": 1.4637305699481865e-07, + "loss": 0.0005, + "reward": 2.4990296363830566, + "reward_std": 1.2910204077343224e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9990296363830566, + "step": 3296 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.541450777202073, + "grad_norm": 2.099774127720425, + "kl": 0.1669921875, + "learning_rate": 1.461139896373057e-07, + "loss": 0.0006, + "reward": 1.9991334080696106, + "reward_std": 4.378776657176786e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991332590579987, + "step": 3297 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.544041450777202, + "grad_norm": 0.38143977304861826, + "kl": 0.0496826171875, + "learning_rate": 1.4585492227979273e-07, + "loss": 0.0016, + "reward": 2.4999948740005493, + "reward_std": 3.284486552956878e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 3298 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.546632124352332, + "grad_norm": 3.0523892841509848, + "kl": 0.06005859375, + "learning_rate": 1.455958549222798e-07, + "loss": 0.0006, + "reward": 1.7927755117416382, + "reward_std": 0.0002894496790872836, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.2927754819393158, + "step": 3299 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.549222797927461, + "grad_norm": 0.30116251332339555, + "kl": 0.0726318359375, + "learning_rate": 1.4533678756476684e-07, + "loss": 0.0001, + "reward": 2.4999985694885254, + "reward_std": 1.3621807113395334e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 3300 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.55181347150259, + "grad_norm": 1.0737550503837174, + "kl": 0.087646484375, + "learning_rate": 1.4507772020725386e-07, + "loss": -0.0001, + "reward": 2.499998688697815, + "reward_std": 1.0256756866056094e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3301 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.55440414507772, + "grad_norm": 0.3029410961554839, + "kl": 0.1630859375, + "learning_rate": 1.4481865284974094e-07, + "loss": 0.0005, + "reward": 2.4999921321868896, + "reward_std": 3.6748403999808943e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999922513961792, + "step": 3302 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.55699481865285, + "grad_norm": 7.290144631469494, + "kl": 0.1336669921875, + "learning_rate": 1.4455958549222797e-07, + "loss": 0.0004, + "reward": 1.8220319151878357, + "reward_std": 0.00028328357723239606, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3220319747924805, + "step": 3303 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.55958549222798, + "grad_norm": 0.2332791276731152, + "kl": 0.092529296875, + "learning_rate": 1.4430051813471502e-07, + "loss": 0.0015, + "reward": 2.4999961853027344, + "reward_std": 2.628421157169214e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3304 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.562176165803109, + "grad_norm": 0.18279067045772782, + "kl": 0.047119140625, + "learning_rate": 1.4404145077720207e-07, + "loss": 0.0008, + "reward": 2.4999961853027344, + "reward_std": 2.2781329107601778e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3305 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.564766839378239, + "grad_norm": 0.15022808522873954, + "kl": 0.18212890625, + "learning_rate": 1.4378238341968913e-07, + "loss": 0.0013, + "reward": 2.4999979734420776, + "reward_std": 1.3701589693937422e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3306 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.567357512953368, + "grad_norm": 0.3644989581211705, + "kl": 0.094482421875, + "learning_rate": 1.4352331606217615e-07, + "loss": 0.0004, + "reward": 2.4999947547912598, + "reward_std": 3.124775275864522e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 3307 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.569948186528498, + "grad_norm": 0.13362739170870355, + "kl": 0.161376953125, + "learning_rate": 1.432642487046632e-07, + "loss": 0.0012, + "reward": 2.4999983310699463, + "reward_std": 1.5128913446460501e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3308 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.572538860103627, + "grad_norm": 0.026391820226991686, + "kl": 0.12060546875, + "learning_rate": 1.4300518134715026e-07, + "loss": -0.0001, + "reward": 2.4999983310699463, + "reward_std": 9.363924675653834e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 3309 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.575129533678757, + "grad_norm": 2.239090648428185, + "kl": 0.075439453125, + "learning_rate": 1.4274611398963728e-07, + "loss": -0.0011, + "reward": 2.4999871253967285, + "reward_std": 7.743637638668588e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999871850013733, + "step": 3310 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.577720207253886, + "grad_norm": 0.09145996380471086, + "kl": 0.0572509765625, + "learning_rate": 1.4248704663212436e-07, + "loss": 0.0007, + "reward": 2.4999990463256836, + "reward_std": 1.0969515926717577e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988675117493, + "step": 3311 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.580310880829016, + "grad_norm": 0.15629243860708478, + "kl": 0.12060546875, + "learning_rate": 1.422279792746114e-07, + "loss": 0.0012, + "reward": 2.499998092651367, + "reward_std": 1.8979206402036652e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3312 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 8.582901554404145, + "grad_norm": 6.242838938005496, + "kl": 0.16650390625, + "learning_rate": 1.4196891191709844e-07, + "loss": 0.0008, + "reward": 1.9489158987998962, + "reward_std": 0.028667759289874084, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4489157497882843, + "step": 3313 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.585492227979275, + "grad_norm": 0.18985441770229575, + "kl": 0.13818359375, + "learning_rate": 1.417098445595855e-07, + "loss": -0.0002, + "reward": 2.499998450279236, + "reward_std": 1.1788168876591953e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3314 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.588082901554404, + "grad_norm": 0.23791420129562021, + "kl": 0.04461669921875, + "learning_rate": 1.4145077720207252e-07, + "loss": 0.0006, + "reward": 2.4999972581863403, + "reward_std": 2.777960872890617e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3315 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.590673575129534, + "grad_norm": 0.09437937366899564, + "kl": 0.062255859375, + "learning_rate": 1.4119170984455957e-07, + "loss": 0.0005, + "reward": 2.4999959468841553, + "reward_std": 2.295070885338646e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 3316 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.593264248704664, + "grad_norm": 0.16615883619676566, + "kl": 0.047210693359375, + "learning_rate": 1.4093264248704663e-07, + "loss": 0.0018, + "reward": 2.499997854232788, + "reward_std": 1.4581210621145146e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3317 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.595854922279793, + "grad_norm": 3.537813508311042, + "kl": 0.13720703125, + "learning_rate": 1.4067357512953368e-07, + "loss": -0.0002, + "reward": 2.4998974800109863, + "reward_std": 2.1676318624486157e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998974204063416, + "step": 3318 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.598445595854923, + "grad_norm": 0.2470497325019201, + "kl": 0.069580078125, + "learning_rate": 1.404145077720207e-07, + "loss": 0.0016, + "reward": 2.4999940395355225, + "reward_std": 2.088128894683905e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 3319 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.601036269430052, + "grad_norm": 0.26909185339872677, + "kl": 0.13134765625, + "learning_rate": 1.4015544041450778e-07, + "loss": 0.0011, + "reward": 2.499991774559021, + "reward_std": 6.6673069341049995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999917149543762, + "step": 3320 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.603626943005182, + "grad_norm": 2.146610413069479, + "kl": 0.1396484375, + "learning_rate": 1.398963730569948e-07, + "loss": 0.0006, + "reward": 1.9963128566741943, + "reward_std": 5.926458806015944e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4963128864765167, + "step": 3321 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.606217616580311, + "grad_norm": 0.12705175401992627, + "kl": 0.084716796875, + "learning_rate": 1.3963730569948186e-07, + "loss": 0.0005, + "reward": 2.4999988079071045, + "reward_std": 1.0816191320373036e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3322 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.60880829015544, + "grad_norm": 9.672743152946575, + "kl": 0.09765625, + "learning_rate": 1.3937823834196891e-07, + "loss": 0.001, + "reward": 1.957108497619629, + "reward_std": 0.00017531489743305428, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4571084380149841, + "step": 3323 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.61139896373057, + "grad_norm": 0.052663728324344836, + "kl": 0.08056640625, + "learning_rate": 1.3911917098445594e-07, + "loss": 0.0012, + "reward": 2.4999958276748657, + "reward_std": 1.8165787878388073e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3324 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.6139896373057, + "grad_norm": 5.056037938743275, + "kl": 0.130615234375, + "learning_rate": 1.38860103626943e-07, + "loss": 0.0001, + "reward": 1.9528579711914062, + "reward_std": 0.00015353200802792344, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4528579711914062, + "step": 3325 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.61658031088083, + "grad_norm": 0.12023500340132035, + "kl": 0.07177734375, + "learning_rate": 1.3860103626943005e-07, + "loss": -0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.720802060845017e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3326 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.619170984455959, + "grad_norm": 0.32352513103022656, + "kl": 0.01611328125, + "learning_rate": 1.383419689119171e-07, + "loss": 0.0004, + "reward": 2.4999964237213135, + "reward_std": 3.2249220112134935e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3327 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.621761658031089, + "grad_norm": 87.49888650333517, + "kl": 0.139404296875, + "learning_rate": 1.3808290155440413e-07, + "loss": 0.0016, + "reward": 2.187451958656311, + "reward_std": 0.25881180025001527, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6874518990516663, + "step": 3328 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.624352331606218, + "grad_norm": 0.1192222284628396, + "kl": 0.0692138671875, + "learning_rate": 1.378238341968912e-07, + "loss": -0.0012, + "reward": 2.499995231628418, + "reward_std": 2.032030351983849e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 3329 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.626943005181348, + "grad_norm": 0.2373709354580264, + "kl": 0.0755615234375, + "learning_rate": 1.3756476683937823e-07, + "loss": -0.0008, + "reward": 1.9998705387115479, + "reward_std": 6.1315019479479815e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998705387115479, + "step": 3330 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.629533678756477, + "grad_norm": 0.10628699498629816, + "kl": 0.1171875, + "learning_rate": 1.3730569948186526e-07, + "loss": -0.0006, + "reward": 2.499998092651367, + "reward_std": 1.4112170845237415e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3331 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 8.632124352331607, + "grad_norm": 8.231748297252688, + "kl": 0.2783203125, + "learning_rate": 1.3704663212435234e-07, + "loss": 0.0019, + "reward": 1.9945073127746582, + "reward_std": 9.54220321318644e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4945071935653687, + "step": 3332 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.634715025906736, + "grad_norm": 0.1999669731604068, + "kl": 0.0706787109375, + "learning_rate": 1.3678756476683936e-07, + "loss": 0.0011, + "reward": 2.4999959468841553, + "reward_std": 2.630793062508019e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995768070221, + "step": 3333 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.637305699481866, + "grad_norm": 1.6101286665730745, + "kl": 0.097412109375, + "learning_rate": 1.3652849740932641e-07, + "loss": 0.0012, + "reward": 1.9999347925186157, + "reward_std": 1.3890817399442312e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999346137046814, + "step": 3334 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.639896373056995, + "grad_norm": 2.141494039933273, + "kl": 0.265625, + "learning_rate": 1.3626943005181347e-07, + "loss": 0.001, + "reward": 1.999935269355774, + "reward_std": 1.1870372759403836e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999352097511292, + "step": 3335 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.642487046632125, + "grad_norm": 0.4323880814192606, + "kl": 0.04364013671875, + "learning_rate": 1.3601036269430052e-07, + "loss": -0.0001, + "reward": 2.4999862909317017, + "reward_std": 5.350068477127934e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999864101409912, + "step": 3336 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.645077720207254, + "grad_norm": 0.3379876749555578, + "kl": 0.095458984375, + "learning_rate": 1.3575129533678755e-07, + "loss": 0.0001, + "reward": 2.4999951124191284, + "reward_std": 3.665665417429409e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 3337 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.647668393782384, + "grad_norm": 0.17462334661822623, + "kl": 0.0657958984375, + "learning_rate": 1.354922279792746e-07, + "loss": -0.0002, + "reward": 2.499997615814209, + "reward_std": 2.1958989009362995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3338 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.650259067357513, + "grad_norm": 2.7550804531142985, + "kl": 0.16357421875, + "learning_rate": 1.3523316062176165e-07, + "loss": -0.0005, + "reward": 2.4999966621398926, + "reward_std": 2.5040316131708096e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3339 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.652849740932643, + "grad_norm": 0.222883162863987, + "kl": 0.0333251953125, + "learning_rate": 1.3497409326424868e-07, + "loss": -0.0, + "reward": 2.4999974966049194, + "reward_std": 2.5459948460593296e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3340 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.655440414507773, + "grad_norm": 1.918787228508702, + "kl": 0.1190185546875, + "learning_rate": 1.3471502590673576e-07, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 2.8247826548977173e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3341 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.625, + "epoch": 8.658031088082902, + "grad_norm": 2.349144252092865, + "kl": 0.14666748046875, + "learning_rate": 1.3445595854922278e-07, + "loss": 0.0001, + "reward": 1.9939061403274536, + "reward_std": 6.075625594803569e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4939061105251312, + "step": 3342 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.660621761658032, + "grad_norm": 2.5255275075531296, + "kl": 0.113037109375, + "learning_rate": 1.3419689119170984e-07, + "loss": 0.0003, + "reward": 1.9957151412963867, + "reward_std": 8.075975711108185e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4957151412963867, + "step": 3343 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.663212435233161, + "grad_norm": 4.800438367423024, + "kl": 0.07861328125, + "learning_rate": 1.339378238341969e-07, + "loss": 0.0004, + "reward": 1.9972680807113647, + "reward_std": 0.00010621248952702445, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4972682297229767, + "step": 3344 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.66580310880829, + "grad_norm": 1.2786170110597592, + "kl": 0.09130859375, + "learning_rate": 1.3367875647668391e-07, + "loss": 0.0004, + "reward": 2.4999678134918213, + "reward_std": 6.944649157958338e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999680519104004, + "step": 3345 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 8.66839378238342, + "grad_norm": 0.11792085335870053, + "kl": 0.08251953125, + "learning_rate": 1.3341968911917097e-07, + "loss": 0.0006, + "reward": 2.499996781349182, + "reward_std": 2.474585585332534e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3346 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.67098445595855, + "grad_norm": 1.3058351917696434, + "kl": 0.30322265625, + "learning_rate": 1.3316062176165802e-07, + "loss": 0.0015, + "reward": 1.4999983310699463, + "reward_std": 2.2568951862922404e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999983310699463, + "step": 3347 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.67357512953368, + "grad_norm": 1.1579950551518363, + "kl": 0.720703125, + "learning_rate": 1.3290155440414507e-07, + "loss": 0.0033, + "reward": 2.4999979734420776, + "reward_std": 1.8917681359198468e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3348 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.676165803108809, + "grad_norm": 0.17728445270097165, + "kl": 0.1552734375, + "learning_rate": 1.326424870466321e-07, + "loss": 0.0021, + "reward": 2.4999985694885254, + "reward_std": 1.0873091582652705e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3349 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.678756476683938, + "grad_norm": 0.052987725168563635, + "kl": 0.0985107421875, + "learning_rate": 1.3238341968911918e-07, + "loss": 0.0007, + "reward": 2.4999979734420776, + "reward_std": 1.2178700217191363e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3350 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.681347150259068, + "grad_norm": 0.6457747909234716, + "kl": 0.085205078125, + "learning_rate": 1.321243523316062e-07, + "loss": 0.0011, + "reward": 2.499997854232788, + "reward_std": 1.68019590773838e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3351 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.683937823834198, + "grad_norm": 0.2160131321836147, + "kl": 0.072509765625, + "learning_rate": 1.3186528497409328e-07, + "loss": 0.0006, + "reward": 2.499992251396179, + "reward_std": 3.113205821136944e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 3352 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.686528497409327, + "grad_norm": 0.05455523936309753, + "kl": 0.0693359375, + "learning_rate": 1.316062176165803e-07, + "loss": 0.0007, + "reward": 2.4999992847442627, + "reward_std": 7.580394765227538e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999994039535522, + "step": 3353 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.689119170984457, + "grad_norm": 0.41548623545963126, + "kl": 0.102020263671875, + "learning_rate": 1.3134715025906734e-07, + "loss": -0.0008, + "reward": 2.499995708465576, + "reward_std": 2.1361069002523436e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3354 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.691709844559586, + "grad_norm": 0.44876892370022803, + "kl": 0.14111328125, + "learning_rate": 1.3108808290155442e-07, + "loss": 0.001, + "reward": 2.4999840259552, + "reward_std": 3.816244543486391e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999839663505554, + "step": 3355 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 8.694300518134716, + "grad_norm": 0.04852392060888601, + "kl": 0.054046630859375, + "learning_rate": 1.3082901554404144e-07, + "loss": -0.0011, + "reward": 2.499998688697815, + "reward_std": 8.175171331004094e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3356 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.696891191709845, + "grad_norm": 0.06473490215329131, + "kl": 0.14453125, + "learning_rate": 1.305699481865285e-07, + "loss": 0.0003, + "reward": 2.4999985694885254, + "reward_std": 9.483069618454465e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3357 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.699481865284975, + "grad_norm": 0.2484608496586633, + "kl": 0.087890625, + "learning_rate": 1.3031088082901555e-07, + "loss": 0.0002, + "reward": 2.499997138977051, + "reward_std": 3.37998426402919e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3358 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.702072538860104, + "grad_norm": 0.09615785421311752, + "kl": 0.11181640625, + "learning_rate": 1.300518134715026e-07, + "loss": -0.0006, + "reward": 2.49999737739563, + "reward_std": 1.6123695445457997e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3359 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.704663212435234, + "grad_norm": 0.09051185820935538, + "kl": 0.0614013671875, + "learning_rate": 1.2979274611398963e-07, + "loss": 0.0013, + "reward": 2.499997854232788, + "reward_std": 1.7361154505124432e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3360 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 8.707253886010363, + "grad_norm": 2.484006206452572, + "kl": 0.578125, + "learning_rate": 1.2953367875647668e-07, + "loss": 0.0022, + "reward": 2.499996066093445, + "reward_std": 2.655085609148955e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3361 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.709844559585493, + "grad_norm": 0.2882526084433037, + "kl": 0.09716796875, + "learning_rate": 1.2927461139896373e-07, + "loss": -0.0003, + "reward": 2.499996304512024, + "reward_std": 2.4396557023464993e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3362 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.712435233160623, + "grad_norm": 6.732846598057207, + "kl": 0.138427734375, + "learning_rate": 1.2901554404145076e-07, + "loss": 0.0005, + "reward": 1.4908652901649475, + "reward_std": 9.542973202769645e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9908652901649475, + "step": 3363 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.715025906735752, + "grad_norm": 0.10592116197476391, + "kl": 0.120361328125, + "learning_rate": 1.2875647668393784e-07, + "loss": 0.0006, + "reward": 2.4999977350234985, + "reward_std": 1.4710834079778579e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3364 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.717616580310882, + "grad_norm": 0.04345875062291035, + "kl": 0.096923828125, + "learning_rate": 1.2849740932642486e-07, + "loss": 0.0008, + "reward": 2.4999988079071045, + "reward_std": 9.374131195727387e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3365 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.720207253886011, + "grad_norm": 0.19711894869061544, + "kl": 0.048828125, + "learning_rate": 1.2823834196891192e-07, + "loss": 0.0003, + "reward": 2.4999979734420776, + "reward_std": 1.7973872559196025e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3366 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.72279792746114, + "grad_norm": 1.7044919418024835, + "kl": 0.10009765625, + "learning_rate": 1.2797927461139897e-07, + "loss": 0.0004, + "reward": 2.4999847412109375, + "reward_std": 3.951132839574711e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999846816062927, + "step": 3367 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.72538860103627, + "grad_norm": 0.5077745611054267, + "kl": 0.06103515625, + "learning_rate": 1.27720207253886e-07, + "loss": 0.0007, + "reward": 2.499977231025696, + "reward_std": 4.563202509189068e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999773502349854, + "step": 3368 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.7279792746114, + "grad_norm": 0.19792402413868987, + "kl": 0.085693359375, + "learning_rate": 1.2746113989637305e-07, + "loss": 0.0011, + "reward": 2.499993085861206, + "reward_std": 2.66647657554131e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 3369 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.73056994818653, + "grad_norm": 0.19891273474068036, + "kl": 0.11865234375, + "learning_rate": 1.272020725388601e-07, + "loss": 0.0002, + "reward": 2.499997138977051, + "reward_std": 2.8615055498448783e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3370 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.733160621761659, + "grad_norm": 0.546508278415925, + "kl": 0.069091796875, + "learning_rate": 1.2694300518134715e-07, + "loss": 0.0, + "reward": 2.4999932050704956, + "reward_std": 3.0885859132467886e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999993085861206, + "step": 3371 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.735751295336787, + "grad_norm": 2.475547396579339, + "kl": 0.28125, + "learning_rate": 1.2668393782383418e-07, + "loss": 0.0017, + "reward": 2.4999942779541016, + "reward_std": 6.81601545693411e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 3372 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.738341968911918, + "grad_norm": 0.11417566788695352, + "kl": 0.104736328125, + "learning_rate": 1.2642487046632126e-07, + "loss": 0.0001, + "reward": 2.4999983310699463, + "reward_std": 1.6251477745754528e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 3373 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.740932642487046, + "grad_norm": 0.11320119569271186, + "kl": 0.038330078125, + "learning_rate": 1.2616580310880828e-07, + "loss": 0.0006, + "reward": 2.499997615814209, + "reward_std": 1.8581488347990671e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3374 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.1875, + "epoch": 8.743523316062177, + "grad_norm": 2.389833879077716, + "kl": 0.112060546875, + "learning_rate": 1.259067357512953e-07, + "loss": 0.0004, + "reward": 0.9998639822006226, + "reward_std": 1.3021407539781649e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.49986398220062256, + "step": 3375 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.746113989637305, + "grad_norm": 13.207390194210728, + "kl": 0.072021484375, + "learning_rate": 1.256476683937824e-07, + "loss": 0.0012, + "reward": 2.4999855756759644, + "reward_std": 6.485303856607061e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998539686203, + "step": 3376 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.748704663212436, + "grad_norm": 0.11587323707575481, + "kl": 0.070068359375, + "learning_rate": 1.2538860103626942e-07, + "loss": 0.0012, + "reward": 2.4999961853027344, + "reward_std": 1.9090615950290157e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3377 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.751295336787564, + "grad_norm": 0.5112303400714473, + "kl": 0.129638671875, + "learning_rate": 1.2512953367875647e-07, + "loss": 0.0007, + "reward": 2.499993324279785, + "reward_std": 4.512670443546085e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 3378 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.753886010362695, + "grad_norm": 1.24867209094399, + "kl": 0.1376953125, + "learning_rate": 1.2487046632124352e-07, + "loss": 0.001, + "reward": 2.499967575073242, + "reward_std": 9.27197379496647e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999675154685974, + "step": 3379 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.756476683937823, + "grad_norm": 0.21552721080437914, + "kl": 0.1318359375, + "learning_rate": 1.2461139896373057e-07, + "loss": -0.0001, + "reward": 2.4999966621398926, + "reward_std": 2.618967869238986e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3380 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.759067357512954, + "grad_norm": 0.0809784140780734, + "kl": 0.0799560546875, + "learning_rate": 1.243523316062176e-07, + "loss": 0.0, + "reward": 2.4999983310699463, + "reward_std": 9.920318433387365e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3381 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.761658031088082, + "grad_norm": 0.07781043681765573, + "kl": 0.0799560546875, + "learning_rate": 1.2409326424870465e-07, + "loss": 0.0, + "reward": 2.499997615814209, + "reward_std": 1.3446606033085118e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3382 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.764248704663213, + "grad_norm": 8.14002278469235, + "kl": 0.931640625, + "learning_rate": 1.238341968911917e-07, + "loss": 0.0059, + "reward": 2.49999737739563, + "reward_std": 3.793397951312727e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3383 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.766839378238341, + "grad_norm": 0.19466277075521876, + "kl": 0.0657958984375, + "learning_rate": 1.2357512953367876e-07, + "loss": -0.0011, + "reward": 2.4999983310699463, + "reward_std": 1.0766609648271697e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3384 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.76943005181347, + "grad_norm": 0.04759774304791635, + "kl": 0.13623046875, + "learning_rate": 1.233160621761658e-07, + "loss": 0.0004, + "reward": 2.4999988079071045, + "reward_std": 1.1763380598495132e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3385 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.7720207253886, + "grad_norm": 0.02926639369798685, + "kl": 0.0863037109375, + "learning_rate": 1.2305699481865284e-07, + "loss": 0.0005, + "reward": 2.4999985694885254, + "reward_std": 8.522496557361592e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3386 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.77461139896373, + "grad_norm": 0.12333520307934347, + "kl": 0.080078125, + "learning_rate": 1.227979274611399e-07, + "loss": 0.0006, + "reward": 2.499995708465576, + "reward_std": 2.22814196604304e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 3387 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.77720207253886, + "grad_norm": 0.6794672691516138, + "kl": 0.140380859375, + "learning_rate": 1.2253886010362694e-07, + "loss": 0.0011, + "reward": 2.499993920326233, + "reward_std": 4.174207731466595e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 3388 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.779792746113989, + "grad_norm": 0.10605964928966936, + "kl": 0.0911865234375, + "learning_rate": 1.22279792746114e-07, + "loss": 0.0015, + "reward": 2.499997854232788, + "reward_std": 2.2739724272469175e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3389 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.782383419689118, + "grad_norm": 0.3280459363386293, + "kl": 0.0562744140625, + "learning_rate": 1.2202072538860102e-07, + "loss": 0.0002, + "reward": 2.4999983310699463, + "reward_std": 2.0961283553333487e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3390 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.784974093264248, + "grad_norm": 0.8735789536020454, + "kl": 0.201171875, + "learning_rate": 1.2176165803108807e-07, + "loss": 0.0017, + "reward": 2.499995470046997, + "reward_std": 4.059769423747639e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3391 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.787564766839377, + "grad_norm": 0.30267270359878357, + "kl": 0.0721435546875, + "learning_rate": 1.2150259067357513e-07, + "loss": -0.0009, + "reward": 2.4999920129776, + "reward_std": 5.083951919004903e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999919533729553, + "step": 3392 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.790155440414507, + "grad_norm": 0.1246194083405312, + "kl": 0.11529541015625, + "learning_rate": 1.2124352331606218e-07, + "loss": 0.0005, + "reward": 2.4999961853027344, + "reward_std": 1.7325169210380409e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999962449073792, + "step": 3393 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.792746113989637, + "grad_norm": 1.2250946380890442, + "kl": 0.1865234375, + "learning_rate": 1.209844559585492e-07, + "loss": 0.0, + "reward": 2.4999911785125732, + "reward_std": 5.700353085558163e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999911785125732, + "step": 3394 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.795336787564766, + "grad_norm": 0.9204697416740735, + "kl": 0.094482421875, + "learning_rate": 1.2072538860103626e-07, + "loss": 0.0004, + "reward": 2.4999964237213135, + "reward_std": 2.5394680278623127e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3395 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.797927461139896, + "grad_norm": 0.20703529222158454, + "kl": 0.130615234375, + "learning_rate": 1.204663212435233e-07, + "loss": 0.002, + "reward": 2.4999985694885254, + "reward_std": 2.137543049229862e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3396 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.800518134715025, + "grad_norm": 0.3575262502211687, + "kl": 0.092529296875, + "learning_rate": 1.2020725388601036e-07, + "loss": 0.0001, + "reward": 2.499993324279785, + "reward_std": 4.768395683640847e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 3397 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.803108808290155, + "grad_norm": 0.05870856918951048, + "kl": 0.0718994140625, + "learning_rate": 1.1994818652849742e-07, + "loss": 0.0004, + "reward": 2.4999983310699463, + "reward_std": 1.7038536270774784e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3398 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.805699481865284, + "grad_norm": 2.0867867410200565, + "kl": 0.172119140625, + "learning_rate": 1.1968911917098444e-07, + "loss": 0.0004, + "reward": 1.9940532445907593, + "reward_std": 7.350150809770639e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4940531849861145, + "step": 3399 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.808290155440414, + "grad_norm": 0.8632239035603295, + "kl": 0.120849609375, + "learning_rate": 1.194300518134715e-07, + "loss": 0.0001, + "reward": 2.4999959468841553, + "reward_std": 4.580572635859426e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3400 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.810880829015543, + "grad_norm": 0.0729098832607847, + "kl": 0.08251953125, + "learning_rate": 1.1917098445595853e-07, + "loss": -0.0006, + "reward": 2.499996542930603, + "reward_std": 1.7641883118812984e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3401 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.813471502590673, + "grad_norm": 0.23990813180418225, + "kl": 0.080322265625, + "learning_rate": 1.1891191709844559e-07, + "loss": -0.0007, + "reward": 2.4999945163726807, + "reward_std": 2.2739394012205594e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999946355819702, + "step": 3402 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.816062176165802, + "grad_norm": 2.030009971004229, + "kl": 0.093994140625, + "learning_rate": 1.1865284974093264e-07, + "loss": 0.0003, + "reward": 2.4999921321868896, + "reward_std": 6.937393322914431e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 3403 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.818652849740932, + "grad_norm": 0.21628166150201608, + "kl": 0.0340576171875, + "learning_rate": 1.1839378238341968e-07, + "loss": -0.0002, + "reward": 2.49998140335083, + "reward_std": 3.5874165860150242e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999812245368958, + "step": 3404 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.821243523316062, + "grad_norm": 1.4789656100594275, + "kl": 0.0611572265625, + "learning_rate": 1.1813471502590673e-07, + "loss": 0.0004, + "reward": 1.9987174272537231, + "reward_std": 2.506039834315743e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498717337846756, + "step": 3405 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.823834196891191, + "grad_norm": 0.6584813261616805, + "kl": 0.15283203125, + "learning_rate": 1.1787564766839378e-07, + "loss": 0.0006, + "reward": 2.4999940395355225, + "reward_std": 6.349759019030898e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942183494568, + "step": 3406 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.82642487046632, + "grad_norm": 0.903754639628972, + "kl": 0.08319091796875, + "learning_rate": 1.1761658031088082e-07, + "loss": 0.0001, + "reward": 2.499992251396179, + "reward_std": 6.14923578723392e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992311000824, + "step": 3407 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.82901554404145, + "grad_norm": 0.8044773421614164, + "kl": 0.0947265625, + "learning_rate": 1.1735751295336788e-07, + "loss": 0.0018, + "reward": 2.4999932050704956, + "reward_std": 2.198234057004811e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 3408 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.83160621761658, + "grad_norm": 3.5446106159507984, + "kl": 0.082122802734375, + "learning_rate": 1.1709844559585492e-07, + "loss": 0.0001, + "reward": 1.9997522830963135, + "reward_std": 3.200868434305448e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499752402305603, + "step": 3409 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.83419689119171, + "grad_norm": 37.09705470061804, + "kl": 0.16943359375, + "learning_rate": 1.1683937823834196e-07, + "loss": 0.0008, + "reward": 1.4967734217643738, + "reward_std": 0.0031877163237368222, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9967733323574066, + "step": 3410 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.836787564766839, + "grad_norm": 1.994434347956502, + "kl": 0.096435546875, + "learning_rate": 1.1658031088082901e-07, + "loss": -0.0001, + "reward": 1.9998699426651, + "reward_std": 1.3150222002877854e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998700320720673, + "step": 3411 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.839378238341968, + "grad_norm": 20.869051909831594, + "kl": 0.12841796875, + "learning_rate": 1.1632124352331606e-07, + "loss": -0.0002, + "reward": 2.4999842643737793, + "reward_std": 8.628997193227406e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984323978424, + "step": 3412 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.841968911917098, + "grad_norm": 0.09278710863050392, + "kl": 0.0345458984375, + "learning_rate": 1.160621761658031e-07, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 2.149674003248947e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3413 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.844559585492227, + "grad_norm": 0.04247926486749104, + "kl": 0.072509765625, + "learning_rate": 1.1580310880829015e-07, + "loss": -0.0004, + "reward": 2.4999992847442627, + "reward_std": 8.513656837294548e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999993443489075, + "step": 3414 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.847150259067357, + "grad_norm": 0.4014779174639785, + "kl": 0.04632568359375, + "learning_rate": 1.155440414507772e-07, + "loss": -0.0001, + "reward": 2.499996304512024, + "reward_std": 3.181224201398436e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3415 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.849740932642487, + "grad_norm": 15.656839243266678, + "kl": 0.068359375, + "learning_rate": 1.1528497409326423e-07, + "loss": 0.0, + "reward": 2.4998550415039062, + "reward_std": 2.529534822315327e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998552203178406, + "step": 3416 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.852331606217616, + "grad_norm": 0.35280788167109844, + "kl": 0.0791015625, + "learning_rate": 1.1502590673575128e-07, + "loss": 0.001, + "reward": 2.499988079071045, + "reward_std": 4.463617074179638e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999988079071045, + "step": 3417 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.854922279792746, + "grad_norm": 17.988626912078182, + "kl": 0.3271484375, + "learning_rate": 1.1476683937823834e-07, + "loss": 0.0013, + "reward": 1.8640047311782837, + "reward_std": 0.00017136085398306022, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3640047311782837, + "step": 3418 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.857512953367875, + "grad_norm": 58.52317483085468, + "kl": 0.201171875, + "learning_rate": 1.1450777202072538e-07, + "loss": 0.0002, + "reward": 1.9781255722045898, + "reward_std": 0.00012090897871530615, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4781257510185242, + "step": 3419 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.860103626943005, + "grad_norm": 0.03440881905499051, + "kl": 0.0496826171875, + "learning_rate": 1.1424870466321243e-07, + "loss": 0.0006, + "reward": 2.4999990463256836, + "reward_std": 9.499318878170016e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991655349731, + "step": 3420 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.862694300518134, + "grad_norm": 0.44400735209983677, + "kl": 0.119140625, + "learning_rate": 1.1398963730569948e-07, + "loss": 0.0005, + "reward": 2.4999955892562866, + "reward_std": 5.478663979374687e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 3421 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.865284974093264, + "grad_norm": 0.4163958862861426, + "kl": 0.05712890625, + "learning_rate": 1.1373056994818652e-07, + "loss": -0.0009, + "reward": 2.499996304512024, + "reward_std": 2.6456235673322226e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3422 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.867875647668393, + "grad_norm": 0.32583217337005665, + "kl": 0.05718994140625, + "learning_rate": 1.1347150259067357e-07, + "loss": -0.0015, + "reward": 2.4999964237213135, + "reward_std": 2.366817142274158e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3423 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.870466321243523, + "grad_norm": 0.10256511655733214, + "kl": 0.080078125, + "learning_rate": 1.1321243523316061e-07, + "loss": 0.0011, + "reward": 2.4999966621398926, + "reward_std": 1.6849464117285606e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3424 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.873056994818652, + "grad_norm": 0.12580832656158075, + "kl": 0.100341796875, + "learning_rate": 1.1295336787564767e-07, + "loss": 0.0004, + "reward": 2.4999982118606567, + "reward_std": 1.3801606542074296e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3425 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.875647668393782, + "grad_norm": 0.05120077976122874, + "kl": 0.070556640625, + "learning_rate": 1.126943005181347e-07, + "loss": -0.0008, + "reward": 2.499999523162842, + "reward_std": 3.466466154122827e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999998211860657, + "step": 3426 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.878238341968911, + "grad_norm": 0.048049701245307976, + "kl": 0.1005859375, + "learning_rate": 1.1243523316062176e-07, + "loss": -0.0, + "reward": 2.4999985694885254, + "reward_std": 1.0201388533914724e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3427 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.880829015544041, + "grad_norm": 0.29481804680087365, + "kl": 0.0462646484375, + "learning_rate": 1.1217616580310881e-07, + "loss": 0.0002, + "reward": 2.49999737739563, + "reward_std": 2.4502935502823675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 3428 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.88341968911917, + "grad_norm": 5.797691448128938, + "kl": 0.093505859375, + "learning_rate": 1.1191709844559585e-07, + "loss": 0.0005, + "reward": 2.4999908208847046, + "reward_std": 9.91011756923399e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999908804893494, + "step": 3429 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.8860103626943, + "grad_norm": 5.16368005322399, + "kl": 0.134765625, + "learning_rate": 1.116580310880829e-07, + "loss": 0.0001, + "reward": 1.9451864361763, + "reward_std": 0.00021720198492403142, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4451864361763, + "step": 3430 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.88860103626943, + "grad_norm": 0.5404804688142195, + "kl": 0.095458984375, + "learning_rate": 1.1139896373056994e-07, + "loss": 0.0002, + "reward": 2.499993324279785, + "reward_std": 5.418317641670001e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 3431 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.89119170984456, + "grad_norm": 0.1579308146987876, + "kl": 0.0567626953125, + "learning_rate": 1.1113989637305698e-07, + "loss": 0.0011, + "reward": 2.4999969005584717, + "reward_std": 1.4467167090970179e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3432 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.893782383419689, + "grad_norm": 0.47639674865865966, + "kl": 0.094970703125, + "learning_rate": 1.1088082901554403e-07, + "loss": 0.0002, + "reward": 2.4999961853027344, + "reward_std": 3.112467084065429e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3433 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.896373056994818, + "grad_norm": 2.811170479359201, + "kl": 0.13720703125, + "learning_rate": 1.1062176165803109e-07, + "loss": 0.0003, + "reward": 1.9984723925590515, + "reward_std": 4.5855091002522386e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984723627567291, + "step": 3434 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.898963730569948, + "grad_norm": 4.107889865313289, + "kl": 0.080078125, + "learning_rate": 1.1036269430051813e-07, + "loss": -0.0008, + "reward": 2.499996542930603, + "reward_std": 2.4765067223597725e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3435 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.901554404145077, + "grad_norm": 4.3220549238242665, + "kl": 0.12841796875, + "learning_rate": 1.1010362694300518e-07, + "loss": -0.0005, + "reward": 1.9741721153259277, + "reward_std": 0.00016704053552984988, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.474172204732895, + "step": 3436 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.904145077720207, + "grad_norm": 0.16821721461119385, + "kl": 0.056396484375, + "learning_rate": 1.0984455958549223e-07, + "loss": 0.0007, + "reward": 2.4999847412109375, + "reward_std": 2.912349188477492e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 3437 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.906735751295336, + "grad_norm": 0.04496954180309151, + "kl": 0.0660400390625, + "learning_rate": 1.0958549222797927e-07, + "loss": 0.0004, + "reward": 2.4999972581863403, + "reward_std": 1.2920062886223604e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 3438 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.909326424870466, + "grad_norm": 53.79083783966578, + "kl": 0.13232421875, + "learning_rate": 1.0932642487046631e-07, + "loss": -0.0005, + "reward": 1.9989625811576843, + "reward_std": 0.00023589314776018, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989627003669739, + "step": 3439 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.911917098445596, + "grad_norm": 1.6221286608116776, + "kl": 0.17919921875, + "learning_rate": 1.0906735751295336e-07, + "loss": -0.0004, + "reward": 1.9985601902008057, + "reward_std": 2.8334295166132506e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985601902008057, + "step": 3440 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.914507772020725, + "grad_norm": 0.07045610481541544, + "kl": 0.133056640625, + "learning_rate": 1.088082901554404e-07, + "loss": 0.0016, + "reward": 2.4999982118606567, + "reward_std": 1.3275659398459538e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3441 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.917098445595855, + "grad_norm": 0.11411398842343769, + "kl": 0.145263671875, + "learning_rate": 1.0854922279792746e-07, + "loss": 0.0005, + "reward": 2.4998862743377686, + "reward_std": 4.5016962531008176e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999886155128479, + "step": 3442 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.919689119170984, + "grad_norm": 0.08906425637105181, + "kl": 0.0830078125, + "learning_rate": 1.0829015544041451e-07, + "loss": 0.0005, + "reward": 2.4999994039535522, + "reward_std": 4.821480814598544e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999993443489075, + "step": 3443 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 8.922279792746114, + "grad_norm": 0.18617213821420595, + "kl": 0.04833984375, + "learning_rate": 1.0803108808290155e-07, + "loss": 0.0001, + "reward": 2.4999961853027344, + "reward_std": 1.417240852674695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3444 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.924870466321243, + "grad_norm": 0.1431428173596342, + "kl": 0.05224609375, + "learning_rate": 1.077720207253886e-07, + "loss": -0.0007, + "reward": 2.4999969005584717, + "reward_std": 2.547374265304825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3445 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.927461139896373, + "grad_norm": 1.1937464296417282, + "kl": 0.084228515625, + "learning_rate": 1.0751295336787564e-07, + "loss": 0.0007, + "reward": 2.4999951124191284, + "reward_std": 5.58721046672872e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 3446 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.930051813471502, + "grad_norm": 4.780715840024769, + "kl": 0.0665283203125, + "learning_rate": 1.0725388601036268e-07, + "loss": -0.0001, + "reward": 1.9979389905929565, + "reward_std": 5.5960548252187436e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497939020395279, + "step": 3447 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.932642487046632, + "grad_norm": 1.0849708617331706, + "kl": 0.1328125, + "learning_rate": 1.0699481865284973e-07, + "loss": -0.0001, + "reward": 1.9995614290237427, + "reward_std": 1.5484875859783642e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995616972446442, + "step": 3448 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.935233160621761, + "grad_norm": 0.17426210578945972, + "kl": 0.085693359375, + "learning_rate": 1.0673575129533678e-07, + "loss": -0.0004, + "reward": 2.4999953508377075, + "reward_std": 2.6784234705701238e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3449 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.937823834196891, + "grad_norm": 4.112937615738767, + "kl": 0.09521484375, + "learning_rate": 1.0647668393782382e-07, + "loss": 0.0006, + "reward": 1.9779855012893677, + "reward_std": 0.00013884341632319774, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4779855608940125, + "step": 3450 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.94041450777202, + "grad_norm": 1.2829217034006948, + "kl": 0.096435546875, + "learning_rate": 1.0621761658031088e-07, + "loss": -0.0005, + "reward": 1.999855101108551, + "reward_std": 1.3056008356215898e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998551905155182, + "step": 3451 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.94300518134715, + "grad_norm": 0.43185681731887043, + "kl": 0.05645751953125, + "learning_rate": 1.0595854922279793e-07, + "loss": 0.001, + "reward": 1.999801516532898, + "reward_std": 9.453267409753607e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998016357421875, + "step": 3452 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.94559585492228, + "grad_norm": 0.09068131547420637, + "kl": 0.0489501953125, + "learning_rate": 1.0569948186528498e-07, + "loss": -0.0007, + "reward": 2.499997615814209, + "reward_std": 1.5699802986546274e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3453 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.94818652849741, + "grad_norm": 0.20139149054237102, + "kl": 0.1064453125, + "learning_rate": 1.0544041450777201e-07, + "loss": 0.0012, + "reward": 2.4999974966049194, + "reward_std": 1.6089898622340115e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3454 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.950777202072539, + "grad_norm": 1.6649257056288238, + "kl": 0.08099365234375, + "learning_rate": 1.0518134715025906e-07, + "loss": 0.0009, + "reward": 2.4999940395355225, + "reward_std": 5.416392070856091e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999939799308777, + "step": 3455 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.953367875647668, + "grad_norm": 0.06201081254614163, + "kl": 0.0523681640625, + "learning_rate": 1.0492227979274611e-07, + "loss": -0.0001, + "reward": 2.4999990463256836, + "reward_std": 8.919098775095335e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3456 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.125, + "epoch": 8.955958549222798, + "grad_norm": 0.06398299704463684, + "kl": 0.04156494140625, + "learning_rate": 1.0466321243523315e-07, + "loss": 0.0003, + "reward": 2.499998450279236, + "reward_std": 1.4224654023564653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3457 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.958549222797927, + "grad_norm": 14.559187055911945, + "kl": 0.14111328125, + "learning_rate": 1.044041450777202e-07, + "loss": 0.0006, + "reward": 1.2252216935157776, + "reward_std": 0.28386136662447825, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.7252216935157776, + "step": 3458 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.961139896373057, + "grad_norm": 0.050963070702148984, + "kl": 0.016204833984375, + "learning_rate": 1.0414507772020726e-07, + "loss": -0.0016, + "reward": 2.499992847442627, + "reward_std": 1.4169637552186032e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929666519165, + "step": 3459 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.963730569948186, + "grad_norm": 139.76525629377764, + "kl": 0.0703125, + "learning_rate": 1.038860103626943e-07, + "loss": -0.0003, + "reward": 1.8044943809509277, + "reward_std": 0.00312106445653626, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3044945001602173, + "step": 3460 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.966321243523316, + "grad_norm": 0.19717132028313283, + "kl": 0.08935546875, + "learning_rate": 1.0362694300518134e-07, + "loss": 0.0012, + "reward": 2.499994397163391, + "reward_std": 3.114770720458182e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999942779541016, + "step": 3461 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.968911917098445, + "grad_norm": 1.1157861237311544, + "kl": 0.0848388671875, + "learning_rate": 1.0336787564766839e-07, + "loss": 0.0001, + "reward": 2.4999642372131348, + "reward_std": 1.1197418757546984e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999642372131348, + "step": 3462 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 8.971502590673575, + "grad_norm": 0.09207391835601894, + "kl": 0.07281494140625, + "learning_rate": 1.0310880829015543e-07, + "loss": 0.0012, + "reward": 2.4999983310699463, + "reward_std": 1.3727566283705528e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 3463 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 8.974093264248705, + "grad_norm": 5.98109888321172, + "kl": 0.1348876953125, + "learning_rate": 1.0284974093264248e-07, + "loss": 0.001, + "reward": 1.895218312740326, + "reward_std": 0.0007560910520396646, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3952181339263916, + "step": 3464 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 8.976683937823834, + "grad_norm": 0.20376526679965523, + "kl": 0.16748046875, + "learning_rate": 1.0259067357512953e-07, + "loss": 0.002, + "reward": 2.4999977350234985, + "reward_std": 1.7385539194947341e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3465 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.979274611398964, + "grad_norm": 0.06407895937600917, + "kl": 0.1123046875, + "learning_rate": 1.0233160621761657e-07, + "loss": 0.0, + "reward": 2.4999985694885254, + "reward_std": 9.888409522318398e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3466 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.981865284974093, + "grad_norm": 1.3565622858874267, + "kl": 0.1240234375, + "learning_rate": 1.0207253886010363e-07, + "loss": 0.0015, + "reward": 2.499993085861206, + "reward_std": 5.913242773658567e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 3467 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.984455958549223, + "grad_norm": 2.273181171456491, + "kl": 0.127685546875, + "learning_rate": 1.0181347150259068e-07, + "loss": 0.0002, + "reward": 1.9992265701293945, + "reward_std": 4.288488003112434e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992266595363617, + "step": 3468 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.987046632124352, + "grad_norm": 2.78575176240552, + "kl": 0.1275634765625, + "learning_rate": 1.015544041450777e-07, + "loss": 0.0001, + "reward": 1.999359905719757, + "reward_std": 2.7430876500034174e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4993600249290466, + "step": 3469 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.989637305699482, + "grad_norm": 1.5601966406933825, + "kl": 0.235107421875, + "learning_rate": 1.0129533678756476e-07, + "loss": 0.0017, + "reward": 2.499997854232788, + "reward_std": 2.147631334992184e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3470 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.992227979274611, + "grad_norm": 0.5308453818860998, + "kl": 0.046630859375, + "learning_rate": 1.0103626943005181e-07, + "loss": 0.0005, + "reward": 2.499993085861206, + "reward_std": 4.354723614596878e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999930262565613, + "step": 3471 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 8.994818652849741, + "grad_norm": 0.38284448902362284, + "kl": 0.0379638671875, + "learning_rate": 1.0077720207253885e-07, + "loss": -0.0011, + "reward": 2.4999964237213135, + "reward_std": 2.533268229854002e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3472 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 8.99740932642487, + "grad_norm": 2.677295087261124, + "kl": 0.11669921875, + "learning_rate": 1.005181347150259e-07, + "loss": 0.0004, + "reward": 1.9998379945755005, + "reward_std": 2.7647049819279346e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998379349708557, + "step": 3473 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.0, + "grad_norm": 3.001279584051759, + "kl": 0.159423828125, + "learning_rate": 1.0025906735751296e-07, + "loss": 0.0003, + "reward": 1.9966130256652832, + "reward_std": 5.19800078109256e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4966131150722504, + "step": 3474 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.00259067357513, + "grad_norm": 0.6593848763127426, + "kl": 0.348876953125, + "learning_rate": 1e-07, + "loss": 0.0014, + "reward": 2.499997615814209, + "reward_std": 2.3731427774009717e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3475 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.005181347150259, + "grad_norm": 0.1514879877640662, + "kl": 0.08349609375, + "learning_rate": 9.974093264248703e-08, + "loss": -0.0005, + "reward": 2.499997615814209, + "reward_std": 2.2573627802557894e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3476 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.007772020725389, + "grad_norm": 0.9536487934199707, + "kl": 0.101318359375, + "learning_rate": 9.948186528497409e-08, + "loss": 0.0002, + "reward": 2.4999868869781494, + "reward_std": 7.404875304928282e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999868869781494, + "step": 3477 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.010362694300518, + "grad_norm": 13.645935854741467, + "kl": 0.107177734375, + "learning_rate": 9.922279792746113e-08, + "loss": -0.0007, + "reward": 2.4999918937683105, + "reward_std": 1.0237132300972007e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 3478 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.012953367875648, + "grad_norm": 0.44118924229686024, + "kl": 0.03759765625, + "learning_rate": 9.896373056994818e-08, + "loss": 0.0002, + "reward": 2.4999955892562866, + "reward_std": 2.61203285845113e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 3479 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.015544041450777, + "grad_norm": 0.10497714823184288, + "kl": 0.0947265625, + "learning_rate": 9.870466321243523e-08, + "loss": 0.0004, + "reward": 2.4999955892562866, + "reward_std": 1.8635282401646691e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3480 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.018134715025907, + "grad_norm": 0.10328823100313811, + "kl": 0.119873046875, + "learning_rate": 9.844559585492227e-08, + "loss": 0.0, + "reward": 2.499997615814209, + "reward_std": 2.059570306300884e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3481 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.020725388601036, + "grad_norm": 5.61715145313828, + "kl": 0.14111328125, + "learning_rate": 9.818652849740932e-08, + "loss": 0.0008, + "reward": 1.999222993850708, + "reward_std": 7.085562697284331e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4992229342460632, + "step": 3482 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.023316062176166, + "grad_norm": 0.07036952884612498, + "kl": 0.0518798828125, + "learning_rate": 9.792746113989638e-08, + "loss": 0.0008, + "reward": 2.4999988079071045, + "reward_std": 1.11159255311577e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 3483 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.025906735751295, + "grad_norm": 0.4380365066412515, + "kl": 0.064697265625, + "learning_rate": 9.76683937823834e-08, + "loss": -0.0003, + "reward": 2.499990224838257, + "reward_std": 4.9006062852186005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999903440475464, + "step": 3484 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.028497409326425, + "grad_norm": 2.495643749107821, + "kl": 0.17626953125, + "learning_rate": 9.740932642487046e-08, + "loss": 0.0009, + "reward": 1.4108569025993347, + "reward_std": 0.00014331345437312848, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9108568429946899, + "step": 3485 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.031088082901555, + "grad_norm": 0.3345279109883954, + "kl": 0.1015625, + "learning_rate": 9.715025906735751e-08, + "loss": 0.001, + "reward": 1.998820960521698, + "reward_std": 1.0031901013007882e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4988210499286652, + "step": 3486 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.033678756476684, + "grad_norm": 1.2693213301170225, + "kl": 0.147216796875, + "learning_rate": 9.689119170984456e-08, + "loss": -0.0001, + "reward": 2.499998092651367, + "reward_std": 1.9309435970171762e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 3487 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.036269430051814, + "grad_norm": 1.1066697222495046, + "kl": 0.17333984375, + "learning_rate": 9.66321243523316e-08, + "loss": -0.001, + "reward": 2.4999959468841553, + "reward_std": 3.144507900287863e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3488 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.038860103626943, + "grad_norm": 6.743277964024613, + "kl": 0.1875, + "learning_rate": 9.637305699481865e-08, + "loss": 0.0008, + "reward": 1.9139612913131714, + "reward_std": 0.0004165237122037979, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4139613509178162, + "step": 3489 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.041450777202073, + "grad_norm": 0.42999072864916266, + "kl": 0.1298828125, + "learning_rate": 9.61139896373057e-08, + "loss": 0.0007, + "reward": 2.4999910593032837, + "reward_std": 2.8959943847439718e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999910593032837, + "step": 3490 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 9.044041450777202, + "grad_norm": 0.2901025006281915, + "kl": 0.0859375, + "learning_rate": 9.585492227979273e-08, + "loss": 0.0012, + "reward": 2.4999964237213135, + "reward_std": 2.591831446352444e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3491 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.046632124352332, + "grad_norm": 0.14821971298915504, + "kl": 0.106689453125, + "learning_rate": 9.559585492227979e-08, + "loss": 0.0001, + "reward": 2.4999940395355225, + "reward_std": 2.6334391804994084e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940991401672, + "step": 3492 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.049222797927461, + "grad_norm": 36.833069799240164, + "kl": 0.105712890625, + "learning_rate": 9.533678756476684e-08, + "loss": 0.0006, + "reward": 1.9517080783843994, + "reward_std": 0.000696948525614971, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4517080783843994, + "step": 3493 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.05181347150259, + "grad_norm": 7.408022671647803, + "kl": 0.150390625, + "learning_rate": 9.507772020725388e-08, + "loss": 0.0007, + "reward": 1.9843990802764893, + "reward_std": 0.00019531736870703753, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4843991994857788, + "step": 3494 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.05440414507772, + "grad_norm": 0.23193896606616435, + "kl": 0.08624267578125, + "learning_rate": 9.481865284974093e-08, + "loss": 0.0003, + "reward": 2.499997138977051, + "reward_std": 2.67501661710412e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 3495 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.05699481865285, + "grad_norm": 1.5772107997975955, + "kl": 0.0438232421875, + "learning_rate": 9.455958549222798e-08, + "loss": 0.0001, + "reward": 2.499995470046997, + "reward_std": 3.7123882918876916e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 3496 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.05958549222798, + "grad_norm": 0.882971446673172, + "kl": 0.0677490234375, + "learning_rate": 9.430051813471502e-08, + "loss": -0.0, + "reward": 2.499984383583069, + "reward_std": 6.3757099724170985e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999843835830688, + "step": 3497 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.062176165803109, + "grad_norm": 0.08620103946244546, + "kl": 0.080078125, + "learning_rate": 9.404145077720207e-08, + "loss": 0.0003, + "reward": 2.499996066093445, + "reward_std": 2.149706432419407e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3498 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.064766839378239, + "grad_norm": 0.11164462883929452, + "kl": 0.094482421875, + "learning_rate": 9.378238341968911e-08, + "loss": 0.0002, + "reward": 2.499997615814209, + "reward_std": 1.7218991388290306e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3499 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.067357512953368, + "grad_norm": 0.041141774413814156, + "kl": 0.072021484375, + "learning_rate": 9.352331606217615e-08, + "loss": -0.0003, + "reward": 2.499998927116394, + "reward_std": 1.111408181486695e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 3500 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.069948186528498, + "grad_norm": 0.14079751258954698, + "kl": 0.083984375, + "learning_rate": 9.32642487046632e-08, + "loss": 0.0008, + "reward": 2.4999992847442627, + "reward_std": 8.097625254777086e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999993443489075, + "step": 3501 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.072538860103627, + "grad_norm": 0.5327683758851042, + "kl": 0.0828857421875, + "learning_rate": 9.300518134715026e-08, + "loss": 0.0018, + "reward": 2.499993920326233, + "reward_std": 2.314488796173464e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938607215881, + "step": 3502 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.075129533678757, + "grad_norm": 0.19671401761087878, + "kl": 0.0712890625, + "learning_rate": 9.27461139896373e-08, + "loss": 0.0003, + "reward": 2.499994993209839, + "reward_std": 2.544074845900468e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951720237732, + "step": 3503 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.077720207253886, + "grad_norm": 0.6099216761443084, + "kl": 0.132568359375, + "learning_rate": 9.248704663212435e-08, + "loss": 0.0012, + "reward": 2.499996304512024, + "reward_std": 3.42810790243675e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3504 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.080310880829016, + "grad_norm": 0.03981533333921242, + "kl": 0.05352783203125, + "learning_rate": 9.22279792746114e-08, + "loss": 0.0005, + "reward": 2.499997138977051, + "reward_std": 1.4315186831481697e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 3505 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.082901554404145, + "grad_norm": 0.06132848935988859, + "kl": 0.14892578125, + "learning_rate": 9.196891191709843e-08, + "loss": 0.0005, + "reward": 2.4999905824661255, + "reward_std": 2.222905777671258e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 3506 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.085492227979275, + "grad_norm": 0.16083386118952564, + "kl": 0.086181640625, + "learning_rate": 9.170984455958548e-08, + "loss": -0.0001, + "reward": 2.4999974966049194, + "reward_std": 1.9145832084177528e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3507 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.088082901554404, + "grad_norm": 0.33732688001838157, + "kl": 0.076416015625, + "learning_rate": 9.145077720207254e-08, + "loss": -0.0016, + "reward": 2.4999964237213135, + "reward_std": 3.306870041797083e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3508 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.090673575129534, + "grad_norm": 0.06172871315433009, + "kl": 0.058837890625, + "learning_rate": 9.119170984455957e-08, + "loss": -0.0004, + "reward": 2.4999985694885254, + "reward_std": 1.1362897396338667e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3509 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 9.093264248704664, + "grad_norm": 0.1221763012444763, + "kl": 0.0654296875, + "learning_rate": 9.093264248704663e-08, + "loss": 0.0006, + "reward": 2.4999964237213135, + "reward_std": 1.9944716882491775e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3510 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.095854922279793, + "grad_norm": 0.10759559405656546, + "kl": 0.1484375, + "learning_rate": 9.067357512953368e-08, + "loss": 0.0009, + "reward": 2.4999983310699463, + "reward_std": 1.3774486546935805e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3511 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.098445595854923, + "grad_norm": 4.517936598874727, + "kl": 0.114990234375, + "learning_rate": 9.041450777202072e-08, + "loss": -0.0006, + "reward": 1.9975407123565674, + "reward_std": 7.587642562612018e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4975408017635345, + "step": 3512 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.101036269430052, + "grad_norm": 3.4552481314677723, + "kl": 0.0921630859375, + "learning_rate": 9.015544041450777e-08, + "loss": -0.0002, + "reward": 1.998643696308136, + "reward_std": 5.254331631476816e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4986439645290375, + "step": 3513 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.103626943005182, + "grad_norm": 0.1564918929933876, + "kl": 0.1358642578125, + "learning_rate": 8.989637305699481e-08, + "loss": 0.0003, + "reward": 2.4999961853027344, + "reward_std": 3.0495034479827154e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3514 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.106217616580311, + "grad_norm": 0.09146741348489058, + "kl": 0.0478515625, + "learning_rate": 8.963730569948185e-08, + "loss": -0.001, + "reward": 2.499997138977051, + "reward_std": 1.5287637040728441e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3515 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.10880829015544, + "grad_norm": 0.09953748251622452, + "kl": 0.086669921875, + "learning_rate": 8.93782383419689e-08, + "loss": 0.0003, + "reward": 2.49999737739563, + "reward_std": 1.5666132640035357e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3516 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.11139896373057, + "grad_norm": 0.44628701048263403, + "kl": 0.0845947265625, + "learning_rate": 8.911917098445596e-08, + "loss": 0.0005, + "reward": 2.499992609024048, + "reward_std": 1.0490711019883747e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 3517 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.1139896373057, + "grad_norm": 4.290890938745377, + "kl": 0.1033935546875, + "learning_rate": 8.886010362694301e-08, + "loss": 0.0015, + "reward": 2.49998939037323, + "reward_std": 6.804335072274625e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99998939037323, + "step": 3518 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.11658031088083, + "grad_norm": 0.08609135237311949, + "kl": 0.14453125, + "learning_rate": 8.860103626943005e-08, + "loss": 0.0004, + "reward": 2.499998092651367, + "reward_std": 9.932541900070646e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3519 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.119170984455959, + "grad_norm": 0.08522076308170079, + "kl": 0.060546875, + "learning_rate": 8.83419689119171e-08, + "loss": -0.0015, + "reward": 2.4999970197677612, + "reward_std": 1.883630062593511e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3520 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.121761658031089, + "grad_norm": 0.13492581941467, + "kl": 0.1041259765625, + "learning_rate": 8.808290155440414e-08, + "loss": -0.0002, + "reward": 2.4999977350234985, + "reward_std": 1.5803756809873448e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3521 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.124352331606218, + "grad_norm": 0.2602465144856269, + "kl": 0.10205078125, + "learning_rate": 8.782383419689118e-08, + "loss": 0.0008, + "reward": 2.499997854232788, + "reward_std": 1.2697054216914694e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3522 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.4375, + "epoch": 9.126943005181348, + "grad_norm": 22.897018034269614, + "kl": 0.1806640625, + "learning_rate": 8.756476683937823e-08, + "loss": 0.0008, + "reward": 1.917019248008728, + "reward_std": 0.17721241320396075, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4170193076133728, + "step": 3523 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.129533678756477, + "grad_norm": 1.0593552148541703, + "kl": 0.05712890625, + "learning_rate": 8.730569948186529e-08, + "loss": -0.0006, + "reward": 2.4999853372573853, + "reward_std": 7.5435145845403895e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999854564666748, + "step": 3524 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.132124352331607, + "grad_norm": 3.8027444160685127, + "kl": 0.079345703125, + "learning_rate": 8.704663212435232e-08, + "loss": 0.0011, + "reward": 1.999875783920288, + "reward_std": 1.8557726207291125e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998756647109985, + "step": 3525 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.134715025906736, + "grad_norm": 0.2178348476107066, + "kl": 0.0589599609375, + "learning_rate": 8.678756476683938e-08, + "loss": -0.0006, + "reward": 2.4999955892562866, + "reward_std": 1.9775399096033652e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 3526 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.137305699481866, + "grad_norm": 0.2545155255759983, + "kl": 0.0849609375, + "learning_rate": 8.652849740932643e-08, + "loss": 0.0003, + "reward": 2.499998450279236, + "reward_std": 1.0359680970850604e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3527 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.139896373056995, + "grad_norm": 382.3894202326106, + "kl": 0.160400390625, + "learning_rate": 8.626943005181347e-08, + "loss": 0.0006, + "reward": 1.5881276726722717, + "reward_std": 0.2603294017026201, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.0881277322769165, + "step": 3528 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.142487046632125, + "grad_norm": 0.1452822484063602, + "kl": 0.103271484375, + "learning_rate": 8.601036269430051e-08, + "loss": 0.0007, + "reward": 2.49999737739563, + "reward_std": 1.812907555631682e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3529 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.145077720207254, + "grad_norm": 0.1718717387852575, + "kl": 0.121337890625, + "learning_rate": 8.575129533678756e-08, + "loss": -0.0005, + "reward": 2.499996304512024, + "reward_std": 2.6523757696850225e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3530 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.147668393782384, + "grad_norm": 3.427584256839214, + "kl": 0.091552734375, + "learning_rate": 8.54922279792746e-08, + "loss": 0.0004, + "reward": 1.9929132461547852, + "reward_std": 0.00010289954838071935, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49291330575943, + "step": 3531 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.150259067357513, + "grad_norm": 0.17352339974584874, + "kl": 0.19677734375, + "learning_rate": 8.523316062176165e-08, + "loss": -0.0002, + "reward": 2.4999953508377075, + "reward_std": 2.938101118843406e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3532 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.25, + "epoch": 9.152849740932643, + "grad_norm": 29.424021399266465, + "kl": 0.148193359375, + "learning_rate": 8.497409326424871e-08, + "loss": 0.0003, + "reward": 1.905815839767456, + "reward_std": 0.0246942967598045, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4058158993721008, + "step": 3533 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.155440414507773, + "grad_norm": 0.02242225058304249, + "kl": 0.15625, + "learning_rate": 8.471502590673575e-08, + "loss": -0.0001, + "reward": 2.4999990463256836, + "reward_std": 6.671853896023094e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999992847442627, + "step": 3534 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.158031088082902, + "grad_norm": 0.3866209538219895, + "kl": 0.072509765625, + "learning_rate": 8.44559585492228e-08, + "loss": 0.001, + "reward": 2.499995231628418, + "reward_std": 3.4890496181105846e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995231628418, + "step": 3535 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.160621761658032, + "grad_norm": 1.8572890997567275, + "kl": 0.0772705078125, + "learning_rate": 8.419689119170984e-08, + "loss": 0.0014, + "reward": 2.4999942779541016, + "reward_std": 9.465374660067027e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994158744812, + "step": 3536 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.163212435233161, + "grad_norm": 0.24919291666141827, + "kl": 0.044189453125, + "learning_rate": 8.393782383419688e-08, + "loss": -0.001, + "reward": 2.499993681907654, + "reward_std": 4.164619667790248e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999937415122986, + "step": 3537 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.16580310880829, + "grad_norm": 0.08768595879822642, + "kl": 0.0816650390625, + "learning_rate": 8.367875647668393e-08, + "loss": 0.0, + "reward": 2.4999974966049194, + "reward_std": 1.355705023797782e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 3538 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.16839378238342, + "grad_norm": 40.88632913457326, + "kl": 0.1669921875, + "learning_rate": 8.341968911917098e-08, + "loss": 0.0005, + "reward": 1.9617998600006104, + "reward_std": 0.005931414394581225, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4617998600006104, + "step": 3539 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.17098445595855, + "grad_norm": 0.19199105593260407, + "kl": 0.1142578125, + "learning_rate": 8.316062176165802e-08, + "loss": 0.0015, + "reward": 2.4999982118606567, + "reward_std": 1.6946941627793422e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3540 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.17357512953368, + "grad_norm": 0.23844546466452848, + "kl": 0.066162109375, + "learning_rate": 8.290155440414508e-08, + "loss": -0.0, + "reward": 2.499996066093445, + "reward_std": 2.2397210841518245e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3541 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.176165803108809, + "grad_norm": 44.01359906022211, + "kl": 0.13525390625, + "learning_rate": 8.264248704663213e-08, + "loss": 0.0004, + "reward": 2.2498685717582703, + "reward_std": 0.2673964417733714, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7498685121536255, + "step": 3542 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.178756476683938, + "grad_norm": 0.3352071062343994, + "kl": 0.131591796875, + "learning_rate": 8.238341968911918e-08, + "loss": -0.0009, + "reward": 2.4999728202819824, + "reward_std": 5.211686584516428e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999730587005615, + "step": 3543 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.181347150259068, + "grad_norm": 10.411243708551389, + "kl": 0.09765625, + "learning_rate": 8.212435233160621e-08, + "loss": 0.0011, + "reward": 1.9981681108474731, + "reward_std": 0.00020398637303742362, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498168170452118, + "step": 3544 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.183937823834198, + "grad_norm": 0.12909051668991595, + "kl": 0.115234375, + "learning_rate": 8.186528497409326e-08, + "loss": 0.0016, + "reward": 2.499998092651367, + "reward_std": 1.796446099433524e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3545 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.186528497409327, + "grad_norm": 0.6215682147978437, + "kl": 0.126953125, + "learning_rate": 8.160621761658031e-08, + "loss": -0.0006, + "reward": 2.49998140335083, + "reward_std": 5.958425390417688e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999812841415405, + "step": 3546 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.189119170984457, + "grad_norm": 0.07262286035581746, + "kl": 0.0701904296875, + "learning_rate": 8.134715025906735e-08, + "loss": 0.0001, + "reward": 2.4999985694885254, + "reward_std": 1.2613791682269948e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3547 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.191709844559586, + "grad_norm": 0.05769615047668933, + "kl": 0.146240234375, + "learning_rate": 8.10880829015544e-08, + "loss": 0.0001, + "reward": 2.499993085861206, + "reward_std": 1.6173233916561003e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999931454658508, + "step": 3548 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.194300518134716, + "grad_norm": 4.731128879029373, + "kl": 0.129638671875, + "learning_rate": 8.082901554404146e-08, + "loss": 0.0002, + "reward": 1.9995192289352417, + "reward_std": 4.111810568474539e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499519169330597, + "step": 3549 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.196891191709845, + "grad_norm": 51.568961297176244, + "kl": 0.1708984375, + "learning_rate": 8.05699481865285e-08, + "loss": 0.001, + "reward": 1.9793486595153809, + "reward_std": 0.002978515777613211, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.479348599910736, + "step": 3550 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.199481865284975, + "grad_norm": 0.39112334347163097, + "kl": 0.07745361328125, + "learning_rate": 8.031088082901554e-08, + "loss": 0.0002, + "reward": 2.4999947547912598, + "reward_std": 3.431943355280964e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994695186615, + "step": 3551 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.625, + "epoch": 9.202072538860104, + "grad_norm": 59.83366888962206, + "kl": 0.205078125, + "learning_rate": 8.005181347150259e-08, + "loss": 0.0004, + "reward": 1.999030590057373, + "reward_std": 0.0003342139985988979, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4990307092666626, + "step": 3552 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.204663212435234, + "grad_norm": 22.18644158980627, + "kl": 0.171875, + "learning_rate": 7.979274611398963e-08, + "loss": 0.0009, + "reward": 1.6872472763061523, + "reward_std": 0.258824537369037, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1872472763061523, + "step": 3553 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.207253886010363, + "grad_norm": 0.30171082233586743, + "kl": 0.0665283203125, + "learning_rate": 7.953367875647668e-08, + "loss": -0.0005, + "reward": 2.499996542930603, + "reward_std": 2.6652656686110276e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3554 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.209844559585493, + "grad_norm": 0.1899271042364944, + "kl": 0.0611572265625, + "learning_rate": 7.927461139896373e-08, + "loss": 0.0008, + "reward": 2.4999970197677612, + "reward_std": 2.2224454028219043e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999970197677612, + "step": 3555 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.212435233160623, + "grad_norm": 0.19070259585616608, + "kl": 0.0970458984375, + "learning_rate": 7.901554404145077e-08, + "loss": 0.0002, + "reward": 2.49997341632843, + "reward_std": 3.711537260642217e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999734163284302, + "step": 3556 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.215025906735752, + "grad_norm": 66.18177249721631, + "kl": 0.144287109375, + "learning_rate": 7.875647668393783e-08, + "loss": 0.0006, + "reward": 1.9833200573921204, + "reward_std": 0.018461486029764274, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4833199977874756, + "step": 3557 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.217616580310882, + "grad_norm": 0.11933941238438604, + "kl": 0.030670166015625, + "learning_rate": 7.849740932642488e-08, + "loss": -0.0, + "reward": 2.499998092651367, + "reward_std": 9.453159748318285e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3558 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.220207253886011, + "grad_norm": 0.049305228015503536, + "kl": 0.064453125, + "learning_rate": 7.82383419689119e-08, + "loss": 0.0001, + "reward": 2.4999990463256836, + "reward_std": 7.960260575146094e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3559 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.22279792746114, + "grad_norm": 4.478637212503904, + "kl": 0.1455078125, + "learning_rate": 7.797927461139896e-08, + "loss": 0.0008, + "reward": 1.9800052642822266, + "reward_std": 0.00016660140261137713, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.480005145072937, + "step": 3560 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.22538860103627, + "grad_norm": 0.05543515393198338, + "kl": 0.12353515625, + "learning_rate": 7.772020725388601e-08, + "loss": 0.0, + "reward": 2.499998450279236, + "reward_std": 1.0431233761210024e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3561 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.2279792746114, + "grad_norm": 0.11266086440535066, + "kl": 0.0458984375, + "learning_rate": 7.746113989637305e-08, + "loss": 0.0006, + "reward": 2.499998927116394, + "reward_std": 1.146177766031542e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999990463256836, + "step": 3562 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.23056994818653, + "grad_norm": 0.08830150656926208, + "kl": 0.0635986328125, + "learning_rate": 7.72020725388601e-08, + "loss": -0.0006, + "reward": 2.499998092651367, + "reward_std": 1.956743631126301e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3563 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.233160621761659, + "grad_norm": 0.30706877443173847, + "kl": 0.1435546875, + "learning_rate": 7.694300518134715e-08, + "loss": -0.0002, + "reward": 2.4999972581863403, + "reward_std": 2.4336883370779105e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3564 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.235751295336788, + "grad_norm": 2.1614918213791796, + "kl": 0.0849609375, + "learning_rate": 7.66839378238342e-08, + "loss": -0.0001, + "reward": 1.9962731003761292, + "reward_std": 5.611177903119824e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4962729811668396, + "step": 3565 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.238341968911918, + "grad_norm": 0.18784275754313717, + "kl": 0.129638671875, + "learning_rate": 7.642487046632123e-08, + "loss": 0.0002, + "reward": 2.499996304512024, + "reward_std": 2.130454390680825e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3566 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.240932642487047, + "grad_norm": 6.149302593866085, + "kl": 0.121337890625, + "learning_rate": 7.616580310880829e-08, + "loss": 0.0011, + "reward": 2.49999737739563, + "reward_std": 2.6487966806598706e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 3567 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.243523316062177, + "grad_norm": 9.463312319003382, + "kl": 0.195068359375, + "learning_rate": 7.590673575129533e-08, + "loss": 0.0015, + "reward": 1.8640063405036926, + "reward_std": 0.00027915232055875094, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3640061616897583, + "step": 3568 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.246113989637305, + "grad_norm": 0.18809941974911043, + "kl": 0.0909423828125, + "learning_rate": 7.564766839378238e-08, + "loss": -0.0003, + "reward": 2.4999616146087646, + "reward_std": 4.8008005251176655e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999616146087646, + "step": 3569 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.248704663212436, + "grad_norm": 0.0638839898521669, + "kl": 0.082275390625, + "learning_rate": 7.538860103626943e-08, + "loss": 0.0005, + "reward": 2.499997854232788, + "reward_std": 1.3220947323588916e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3570 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.251295336787564, + "grad_norm": 0.08531829099653306, + "kl": 0.0477294921875, + "learning_rate": 7.512953367875647e-08, + "loss": -0.0008, + "reward": 2.4999977350234985, + "reward_std": 1.5818905580999854e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3571 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.8125, + "epoch": 9.253886010362695, + "grad_norm": 7.348409319816751, + "kl": 0.102783203125, + "learning_rate": 7.487046632124352e-08, + "loss": 0.0005, + "reward": 2.343742251396179, + "reward_std": 0.44194799704609977, + "rewards/format_reward_rec": 0.9375, + "rewards/point_reward": 1.8749921917915344, + "step": 3572 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.256476683937823, + "grad_norm": 0.060739984690772277, + "kl": 0.07080078125, + "learning_rate": 7.461139896373056e-08, + "loss": -0.0003, + "reward": 2.499997615814209, + "reward_std": 1.0749919425734333e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3573 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.259067357512953, + "grad_norm": 5.698604906448317, + "kl": 0.1328125, + "learning_rate": 7.43523316062176e-08, + "loss": -0.0002, + "reward": 2.4374738931655884, + "reward_std": 0.1767980893737331, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9374739527702332, + "step": 3574 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.261658031088082, + "grad_norm": 85.61837482286279, + "kl": 0.096435546875, + "learning_rate": 7.409326424870465e-08, + "loss": 0.0011, + "reward": 1.9946054220199585, + "reward_std": 9.352893789582595e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4946053624153137, + "step": 3575 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.264248704663212, + "grad_norm": 5.105707741972881, + "kl": 0.0498046875, + "learning_rate": 7.383419689119171e-08, + "loss": 0.0003, + "reward": 1.9958272576332092, + "reward_std": 7.92200760599826e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4958274364471436, + "step": 3576 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.266839378238341, + "grad_norm": 0.054476045106411054, + "kl": 0.03497314453125, + "learning_rate": 7.357512953367876e-08, + "loss": -0.0015, + "reward": 2.4999988079071045, + "reward_std": 9.290054094890365e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3577 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.26943005181347, + "grad_norm": 11.530677403527715, + "kl": 0.1611328125, + "learning_rate": 7.33160621761658e-08, + "loss": 0.001, + "reward": 1.9456124305725098, + "reward_std": 0.00033581948025585007, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.445612370967865, + "step": 3578 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.2720207253886, + "grad_norm": 0.3580396667956142, + "kl": 0.07958984375, + "learning_rate": 7.305699481865285e-08, + "loss": 0.0018, + "reward": 2.499995708465576, + "reward_std": 2.5989182859120774e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3579 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.27461139896373, + "grad_norm": 0.18230858504597722, + "kl": 0.073974609375, + "learning_rate": 7.27979274611399e-08, + "loss": 0.0007, + "reward": 2.4999969005584717, + "reward_std": 1.9945116491726367e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969005584717, + "step": 3580 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.27720207253886, + "grad_norm": 0.7284622978894505, + "kl": 0.067138671875, + "learning_rate": 7.253886010362693e-08, + "loss": 0.001, + "reward": 2.4999784231185913, + "reward_std": 4.856955342802394e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999783635139465, + "step": 3581 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.279792746113989, + "grad_norm": 0.06370671993094809, + "kl": 0.106689453125, + "learning_rate": 7.227979274611398e-08, + "loss": -0.0011, + "reward": 2.499998688697815, + "reward_std": 8.49075803444066e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3582 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.6875, + "epoch": 9.282383419689118, + "grad_norm": 0.08272396935298938, + "kl": 0.13690185546875, + "learning_rate": 7.202072538860104e-08, + "loss": 0.0008, + "reward": 2.4999977350234985, + "reward_std": 1.4120806781647843e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3583 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 9.284974093264248, + "grad_norm": 3.356436285295582, + "kl": 0.119873046875, + "learning_rate": 7.176165803108808e-08, + "loss": -0.0001, + "reward": 1.9991903901100159, + "reward_std": 1.2754125123137783e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4991905093193054, + "step": 3584 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.287564766839377, + "grad_norm": 0.10836975461592056, + "kl": 0.119140625, + "learning_rate": 7.150259067357513e-08, + "loss": 0.0, + "reward": 2.49999737739563, + "reward_std": 2.6355995146332134e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3585 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.290155440414507, + "grad_norm": 4.814612523173493, + "kl": 0.127685546875, + "learning_rate": 7.124352331606218e-08, + "loss": 0.0015, + "reward": 1.6810356974601746, + "reward_std": 0.0003449342570149838, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.1810354590415955, + "step": 3586 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.292746113989637, + "grad_norm": 0.12166511666571837, + "kl": 0.0565185546875, + "learning_rate": 7.098445595854922e-08, + "loss": -0.0005, + "reward": 2.4999990463256836, + "reward_std": 9.061700723123067e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999225139618, + "step": 3587 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.295336787564766, + "grad_norm": 15.828515390982098, + "kl": 0.1455078125, + "learning_rate": 7.072538860103626e-08, + "loss": 0.0003, + "reward": 1.874607801437378, + "reward_std": 0.0008432363763404283, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3746077418327332, + "step": 3588 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.297927461139896, + "grad_norm": 35.63865253352593, + "kl": 0.08935546875, + "learning_rate": 7.046632124352331e-08, + "loss": 0.0005, + "reward": 1.3149770498275757, + "reward_std": 0.0009969968605219037, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8149770498275757, + "step": 3589 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.300518134715025, + "grad_norm": 0.12180622721924536, + "kl": 0.081298828125, + "learning_rate": 7.020725388601035e-08, + "loss": -0.0007, + "reward": 2.4999974966049194, + "reward_std": 1.7089608377318655e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3590 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.303108808290155, + "grad_norm": 0.2387823283182586, + "kl": 0.094482421875, + "learning_rate": 6.99481865284974e-08, + "loss": -0.0001, + "reward": 2.499997615814209, + "reward_std": 1.6343295783372014e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3591 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.305699481865284, + "grad_norm": 5.121165870513497, + "kl": 0.412353515625, + "learning_rate": 6.968911917098446e-08, + "loss": 0.0022, + "reward": 1.989859938621521, + "reward_std": 0.00019694055595209647, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4898599088191986, + "step": 3592 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.308290155440414, + "grad_norm": 0.09566503437215755, + "kl": 0.057373046875, + "learning_rate": 6.94300518134715e-08, + "loss": 0.0005, + "reward": 2.4999961853027344, + "reward_std": 1.6610765385394188e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3593 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.310880829015543, + "grad_norm": 8.127659945758918, + "kl": 0.174560546875, + "learning_rate": 6.917098445595855e-08, + "loss": 0.0008, + "reward": 1.8960446119308472, + "reward_std": 0.0008942767196344903, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3960447311401367, + "step": 3594 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.313471502590673, + "grad_norm": 2.029263531863521, + "kl": 0.28759765625, + "learning_rate": 6.89119170984456e-08, + "loss": 0.0004, + "reward": 1.9993433952331543, + "reward_std": 3.0199084676496568e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499343454837799, + "step": 3595 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.316062176165802, + "grad_norm": 0.5439808564060447, + "kl": 0.1435546875, + "learning_rate": 6.865284974093263e-08, + "loss": 0.0007, + "reward": 2.499991297721863, + "reward_std": 4.223207042741706e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999912977218628, + "step": 3596 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.318652849740932, + "grad_norm": 0.28356501550044233, + "kl": 0.0523681640625, + "learning_rate": 6.839378238341968e-08, + "loss": 0.0003, + "reward": 2.4999924898147583, + "reward_std": 4.477263473745552e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999922513961792, + "step": 3597 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.321243523316062, + "grad_norm": 1.0108189392601912, + "kl": 0.13818359375, + "learning_rate": 6.813471502590673e-08, + "loss": -0.0, + "reward": 2.499977946281433, + "reward_std": 6.1509495026257355e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999780654907227, + "step": 3598 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.323834196891191, + "grad_norm": 0.13533786568396786, + "kl": 0.08099365234375, + "learning_rate": 6.787564766839377e-08, + "loss": 0.0003, + "reward": 2.49999737739563, + "reward_std": 2.015069412664161e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3599 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.8125, + "epoch": 9.32642487046632, + "grad_norm": 32.67606220041922, + "kl": 0.0662841796875, + "learning_rate": 6.761658031088083e-08, + "loss": 0.001, + "reward": 1.9711965322494507, + "reward_std": 0.011381687509015137, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.471196472644806, + "step": 3600 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.32901554404145, + "grad_norm": 0.5901533549375167, + "kl": 0.1220703125, + "learning_rate": 6.735751295336788e-08, + "loss": 0.0012, + "reward": 2.4999947547912598, + "reward_std": 3.092614804245386e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 3601 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.33160621761658, + "grad_norm": 4.801469099867553, + "kl": 0.1083984375, + "learning_rate": 6.709844559585492e-08, + "loss": 0.0006, + "reward": 1.957410216331482, + "reward_std": 0.00013167196721042274, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.457410216331482, + "step": 3602 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.33419689119171, + "grad_norm": 14.110597411739683, + "kl": 0.135986328125, + "learning_rate": 6.683937823834196e-08, + "loss": 0.0013, + "reward": 1.9545118808746338, + "reward_std": 0.0002989301599427563, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.454511821269989, + "step": 3603 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.336787564766839, + "grad_norm": 0.295404537875881, + "kl": 0.0694580078125, + "learning_rate": 6.658031088082901e-08, + "loss": 0.0, + "reward": 2.4998905658721924, + "reward_std": 5.8229151136401924e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999890685081482, + "step": 3604 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.339378238341968, + "grad_norm": 0.2724704061190899, + "kl": 0.104248046875, + "learning_rate": 6.632124352331605e-08, + "loss": 0.0007, + "reward": 2.499996542930603, + "reward_std": 2.671846004886902e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3605 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.341968911917098, + "grad_norm": 0.46305444844303134, + "kl": 0.050537109375, + "learning_rate": 6.60621761658031e-08, + "loss": 0.0001, + "reward": 2.49999463558197, + "reward_std": 3.4339132071181666e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 3606 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.344559585492227, + "grad_norm": 0.09952458202139255, + "kl": 0.1064453125, + "learning_rate": 6.580310880829015e-08, + "loss": -0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.0012257547487025e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 3607 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.347150259067357, + "grad_norm": 2.4675747010684383, + "kl": 0.133056640625, + "learning_rate": 6.554404145077721e-08, + "loss": 0.0014, + "reward": 1.9998574256896973, + "reward_std": 2.1428109960197617e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998573064804077, + "step": 3608 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.349740932642487, + "grad_norm": 0.24816116122884752, + "kl": 0.1107177734375, + "learning_rate": 6.528497409326425e-08, + "loss": 0.0002, + "reward": 2.4999969005584717, + "reward_std": 2.031754263498442e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3609 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.352331606217616, + "grad_norm": 0.11201659391185052, + "kl": 0.0660400390625, + "learning_rate": 6.50259067357513e-08, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 1.7613856471143663e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3610 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.354922279792746, + "grad_norm": 0.5843346425668478, + "kl": 0.36279296875, + "learning_rate": 6.476683937823834e-08, + "loss": 0.002, + "reward": 2.4999959468841553, + "reward_std": 3.3874634937092196e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3611 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.357512953367875, + "grad_norm": 0.5104489376957605, + "kl": 0.0595703125, + "learning_rate": 6.450777202072538e-08, + "loss": 0.001, + "reward": 2.499995470046997, + "reward_std": 2.5525324360842205e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 3612 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.360103626943005, + "grad_norm": 35.61516971864077, + "kl": 0.0833740234375, + "learning_rate": 6.424870466321243e-08, + "loss": 0.0, + "reward": 2.1873852014541626, + "reward_std": 0.25886790680993954, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6873852610588074, + "step": 3613 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.362694300518134, + "grad_norm": 0.2584191548961745, + "kl": 0.07379150390625, + "learning_rate": 6.398963730569948e-08, + "loss": 0.0003, + "reward": 2.499998450279236, + "reward_std": 2.033572002346773e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3614 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.365284974093264, + "grad_norm": 0.3458065985445571, + "kl": 0.10986328125, + "learning_rate": 6.373056994818652e-08, + "loss": 0.0011, + "reward": 2.499996066093445, + "reward_std": 3.680582778997632e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3615 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.367875647668393, + "grad_norm": 0.4211593887114756, + "kl": 0.1151123046875, + "learning_rate": 6.347150259067358e-08, + "loss": -0.0003, + "reward": 2.499997854232788, + "reward_std": 2.4314286974913557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3616 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.370466321243523, + "grad_norm": 0.03900944177296487, + "kl": 0.104736328125, + "learning_rate": 6.321243523316063e-08, + "loss": -0.0, + "reward": 2.499998927116394, + "reward_std": 8.31658951483405e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999225139618, + "step": 3617 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.373056994818652, + "grad_norm": 0.2761578371508049, + "kl": 0.078125, + "learning_rate": 6.295336787564765e-08, + "loss": -0.0007, + "reward": 2.4999964237213135, + "reward_std": 2.360036205573124e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3618 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.375647668393782, + "grad_norm": 0.3030193111357959, + "kl": 0.0391845703125, + "learning_rate": 6.269430051813471e-08, + "loss": 0.001, + "reward": 2.4999979734420776, + "reward_std": 1.9500905068525753e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3619 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3125, + "epoch": 9.378238341968911, + "grad_norm": 0.08436944271601664, + "kl": 0.19384765625, + "learning_rate": 6.243523316062176e-08, + "loss": 0.0021, + "reward": 2.4999964237213135, + "reward_std": 1.2308389045756485e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961256980896, + "step": 3620 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.380829015544041, + "grad_norm": 7.081680726591933, + "kl": 0.0772705078125, + "learning_rate": 6.21761658031088e-08, + "loss": 0.0, + "reward": 2.124963700771332, + "reward_std": 0.23146752042606522, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.6249637007713318, + "step": 3621 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.38341968911917, + "grad_norm": 0.3266231682797563, + "kl": 0.1112060546875, + "learning_rate": 6.191709844559585e-08, + "loss": 0.0, + "reward": 2.499989151954651, + "reward_std": 2.3708434468971973e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999891519546509, + "step": 3622 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.3860103626943, + "grad_norm": 0.39307842836447787, + "kl": 0.072509765625, + "learning_rate": 6.16580310880829e-08, + "loss": -0.0003, + "reward": 2.499995708465576, + "reward_std": 3.032876861652767e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 3623 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.38860103626943, + "grad_norm": 0.10557029022008539, + "kl": 0.084625244140625, + "learning_rate": 6.139896373056994e-08, + "loss": 0.0009, + "reward": 2.4999935626983643, + "reward_std": 1.7066317070657533e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935626983643, + "step": 3624 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.39119170984456, + "grad_norm": 0.15421166946179368, + "kl": 0.074462890625, + "learning_rate": 6.1139896373057e-08, + "loss": 0.0012, + "reward": 2.499983072280884, + "reward_std": 2.8139048708908376e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831318855286, + "step": 3625 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.393782383419689, + "grad_norm": 0.10513957015954604, + "kl": 0.06451416015625, + "learning_rate": 6.088082901554404e-08, + "loss": -0.0006, + "reward": 2.4999945163726807, + "reward_std": 2.189960468967911e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999994695186615, + "step": 3626 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.396373056994818, + "grad_norm": 1.417496060470017, + "kl": 0.103515625, + "learning_rate": 6.062176165803109e-08, + "loss": 0.0009, + "reward": 2.499958038330078, + "reward_std": 1.1851030990328582e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999580383300781, + "step": 3627 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.398963730569948, + "grad_norm": 0.1918987255820987, + "kl": 0.103515625, + "learning_rate": 6.036269430051813e-08, + "loss": -0.0, + "reward": 2.49999737739563, + "reward_std": 2.124066099895572e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 3628 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.0, + "epoch": 9.401554404145077, + "grad_norm": 4.440452023652074, + "kl": 0.054443359375, + "learning_rate": 6.010362694300518e-08, + "loss": 0.0008, + "reward": 2.49999737739563, + "reward_std": 2.110358764184639e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3629 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.404145077720207, + "grad_norm": 1.450613671842467, + "kl": 0.341796875, + "learning_rate": 5.984455958549222e-08, + "loss": 0.0011, + "reward": 2.4999969005584717, + "reward_std": 3.0125004286674084e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3630 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.406735751295336, + "grad_norm": 0.058651296796458065, + "kl": 0.0384521484375, + "learning_rate": 5.958549222797927e-08, + "loss": -0.0012, + "reward": 2.4999979734420776, + "reward_std": 1.5050289050577703e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3631 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.409326424870466, + "grad_norm": 3.0731405616430374, + "kl": 0.236572265625, + "learning_rate": 5.932642487046632e-08, + "loss": 0.0007, + "reward": 2.499989628791809, + "reward_std": 1.2769949989888119e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999895095825195, + "step": 3632 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.411917098445596, + "grad_norm": 13.293527059216853, + "kl": 0.09375, + "learning_rate": 5.9067357512953366e-08, + "loss": 0.0, + "reward": 2.499670386314392, + "reward_std": 3.277077087204816e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9996705055236816, + "step": 3633 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.414507772020725, + "grad_norm": 0.19345136768039403, + "kl": 0.067138671875, + "learning_rate": 5.880829015544041e-08, + "loss": -0.0003, + "reward": 2.49999737739563, + "reward_std": 1.4691054843751772e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999973773956299, + "step": 3634 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.417098445595855, + "grad_norm": 8.747969626891217, + "kl": 0.21630859375, + "learning_rate": 5.854922279792746e-08, + "loss": 0.0013, + "reward": 1.9231637716293335, + "reward_std": 0.00021430487868201453, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4231637716293335, + "step": 3635 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.419689119170984, + "grad_norm": 0.47868159716290337, + "kl": 0.128662109375, + "learning_rate": 5.8290155440414504e-08, + "loss": 0.0007, + "reward": 2.4999938011169434, + "reward_std": 3.062269684050989e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 3636 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.422279792746114, + "grad_norm": 0.09541574476558122, + "kl": 0.0611572265625, + "learning_rate": 5.803108808290155e-08, + "loss": 0.0008, + "reward": 2.499998092651367, + "reward_std": 1.047329249104223e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3637 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.424870466321243, + "grad_norm": 0.11362960260906027, + "kl": 0.0628662109375, + "learning_rate": 5.77720207253886e-08, + "loss": 0.001, + "reward": 2.499998092651367, + "reward_std": 2.2992157937551383e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3638 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.427461139896373, + "grad_norm": 6.134415838458829, + "kl": 0.08642578125, + "learning_rate": 5.751295336787564e-08, + "loss": 0.0003, + "reward": 2.499982714653015, + "reward_std": 8.890999652066967e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999826550483704, + "step": 3639 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.430051813471502, + "grad_norm": 0.49040575881683257, + "kl": 0.09130859375, + "learning_rate": 5.725388601036269e-08, + "loss": 0.0008, + "reward": 2.4999970197677612, + "reward_std": 2.80291072840555e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3640 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.432642487046632, + "grad_norm": 3.5243030754447786, + "kl": 0.2208251953125, + "learning_rate": 5.699481865284974e-08, + "loss": 0.0008, + "reward": 2.4999918937683105, + "reward_std": 1.4580912818473735e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999920129776, + "step": 3641 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.435233160621761, + "grad_norm": 0.5889110884267945, + "kl": 0.0775146484375, + "learning_rate": 5.673575129533679e-08, + "loss": 0.0006, + "reward": 2.4999964237213135, + "reward_std": 3.330167032800091e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3642 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.437823834196891, + "grad_norm": 1.240213488667874, + "kl": 0.0423583984375, + "learning_rate": 5.647668393782383e-08, + "loss": 0.0006, + "reward": 2.4999924898147583, + "reward_std": 5.2077742793699144e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992549419403, + "step": 3643 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.44041450777202, + "grad_norm": 1.467725232439466, + "kl": 0.23291015625, + "learning_rate": 5.621761658031088e-08, + "loss": 0.0014, + "reward": 1.8303985595703125, + "reward_std": 0.0004249216890457319, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3303985595703125, + "step": 3644 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.44300518134715, + "grad_norm": 3.26458823444125, + "kl": 0.154296875, + "learning_rate": 5.5958549222797925e-08, + "loss": 0.0007, + "reward": 1.4976417422294617, + "reward_std": 4.918265858577797e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9976416826248169, + "step": 3645 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.8125, + "epoch": 9.44559585492228, + "grad_norm": 3.8004642679733864, + "kl": 0.164794921875, + "learning_rate": 5.569948186528497e-08, + "loss": 0.0007, + "reward": 1.9941259622573853, + "reward_std": 8.015541732220299e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.49412602186203, + "step": 3646 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.44818652849741, + "grad_norm": 0.13173752653409068, + "kl": 0.080810546875, + "learning_rate": 5.544041450777202e-08, + "loss": 0.0013, + "reward": 2.499998092651367, + "reward_std": 1.6107695159917057e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3647 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.450777202072539, + "grad_norm": 0.17450077222535135, + "kl": 0.069580078125, + "learning_rate": 5.518134715025906e-08, + "loss": 0.001, + "reward": 2.4999953508377075, + "reward_std": 2.054399175221988e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999952912330627, + "step": 3648 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.453367875647668, + "grad_norm": 14.506631067124342, + "kl": 0.0673828125, + "learning_rate": 5.4922279792746116e-08, + "loss": 0.0012, + "reward": 1.9867181181907654, + "reward_std": 0.0002071681576865103, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4867179989814758, + "step": 3649 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.455958549222798, + "grad_norm": 4.7276289690884425, + "kl": 0.0369873046875, + "learning_rate": 5.4663212435233155e-08, + "loss": 0.0006, + "reward": 2.499996542930603, + "reward_std": 3.926990416402987e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3650 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.458549222797927, + "grad_norm": 3.1064158327449993, + "kl": 0.17919921875, + "learning_rate": 5.44041450777202e-08, + "loss": -0.0, + "reward": 1.9979050159454346, + "reward_std": 3.69726496955991e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.497905194759369, + "step": 3651 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.461139896373057, + "grad_norm": 0.13947908017912858, + "kl": 0.09521484375, + "learning_rate": 5.4145077720207254e-08, + "loss": 0.0002, + "reward": 2.4999985694885254, + "reward_std": 1.6200368690988398e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999987483024597, + "step": 3652 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.463730569948186, + "grad_norm": 19.222377050730202, + "kl": 0.0992431640625, + "learning_rate": 5.38860103626943e-08, + "loss": 0.0009, + "reward": 1.9998762607574463, + "reward_std": 3.6580888263415545e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499876320362091, + "step": 3653 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.466321243523316, + "grad_norm": 1.9369239234206173, + "kl": 0.09619140625, + "learning_rate": 5.362694300518134e-08, + "loss": 0.0009, + "reward": 1.9996901154518127, + "reward_std": 1.6046079622356046e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499690055847168, + "step": 3654 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.468911917098445, + "grad_norm": 0.429748900711671, + "kl": 0.0440673828125, + "learning_rate": 5.336787564766839e-08, + "loss": -0.0006, + "reward": 2.4999961853027344, + "reward_std": 4.332306730248092e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3655 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.471502590673575, + "grad_norm": 0.36464921725739535, + "kl": 0.0703125, + "learning_rate": 5.310880829015544e-08, + "loss": 0.0003, + "reward": 2.499996781349182, + "reward_std": 2.34987766134509e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3656 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.474093264248705, + "grad_norm": 2.207658454892464, + "kl": 0.100341796875, + "learning_rate": 5.284974093264249e-08, + "loss": 0.0001, + "reward": 2.499990940093994, + "reward_std": 7.557295589322166e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999990999698639, + "step": 3657 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.476683937823834, + "grad_norm": 0.10123703854278315, + "kl": 0.0601806640625, + "learning_rate": 5.259067357512953e-08, + "loss": -0.0004, + "reward": 2.4999974966049194, + "reward_std": 1.8297976112080505e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3658 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.479274611398964, + "grad_norm": 0.4829703801768173, + "kl": 0.0499267578125, + "learning_rate": 5.2331606217616577e-08, + "loss": 0.0009, + "reward": 2.4999966621398926, + "reward_std": 1.916402482038393e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3659 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.481865284974093, + "grad_norm": 0.10825806037919034, + "kl": 0.075439453125, + "learning_rate": 5.207253886010363e-08, + "loss": 0.001, + "reward": 2.4999990463256836, + "reward_std": 1.1330558180588923e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999999225139618, + "step": 3660 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.484455958549223, + "grad_norm": 0.8525804929619037, + "kl": 0.131103515625, + "learning_rate": 5.181347150259067e-08, + "loss": 0.001, + "reward": 1.9998544454574585, + "reward_std": 1.0202433102790565e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998544752597809, + "step": 3661 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.487046632124352, + "grad_norm": 0.13608136650897315, + "kl": 0.072265625, + "learning_rate": 5.1554404145077715e-08, + "loss": 0.0009, + "reward": 2.4999974966049194, + "reward_std": 1.3136293972593194e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3662 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.489637305699482, + "grad_norm": 0.0538713501862259, + "kl": 0.0609130859375, + "learning_rate": 5.129533678756477e-08, + "loss": 0.0002, + "reward": 2.4999990463256836, + "reward_std": 1.1685926608606678e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3663 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.492227979274611, + "grad_norm": 2.899910100498706, + "kl": 0.2216796875, + "learning_rate": 5.1036269430051813e-08, + "loss": 0.0011, + "reward": 1.9963432550430298, + "reward_std": 4.696190148933965e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4963432550430298, + "step": 3664 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.494818652849741, + "grad_norm": 0.1981563280646292, + "kl": 0.09619140625, + "learning_rate": 5.077720207253885e-08, + "loss": -0.0011, + "reward": 2.499997615814209, + "reward_std": 1.6911328089008748e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3665 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.49740932642487, + "grad_norm": 0.13234890318077616, + "kl": 0.16943359375, + "learning_rate": 5.0518134715025906e-08, + "loss": 0.0007, + "reward": 2.499997138977051, + "reward_std": 3.024118313987856e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3666 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5625, + "epoch": 9.5, + "grad_norm": 0.11170018025077244, + "kl": 0.059814453125, + "learning_rate": 5.025906735751295e-08, + "loss": 0.0016, + "reward": 2.4999983310699463, + "reward_std": 1.312384256380028e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3667 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.50259067357513, + "grad_norm": 0.21624890008862552, + "kl": 0.094970703125, + "learning_rate": 5e-08, + "loss": -0.0001, + "reward": 2.4999983310699463, + "reward_std": 1.6742280308790214e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 3668 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.505181347150259, + "grad_norm": 0.1278872385181617, + "kl": 0.05206298828125, + "learning_rate": 4.9740932642487044e-08, + "loss": 0.0006, + "reward": 2.4999979734420776, + "reward_std": 1.7489659853708872e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3669 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.507772020725389, + "grad_norm": 1.0912035333022394, + "kl": 0.07861328125, + "learning_rate": 4.948186528497409e-08, + "loss": -0.0013, + "reward": 2.499992609024048, + "reward_std": 4.081167844560696e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999929070472717, + "step": 3670 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.510362694300518, + "grad_norm": 0.3137507044354098, + "kl": 0.053985595703125, + "learning_rate": 4.9222797927461136e-08, + "loss": 0.0001, + "reward": 2.4999972581863403, + "reward_std": 2.11338402777983e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3671 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.512953367875648, + "grad_norm": 0.041073730948850966, + "kl": 0.07342529296875, + "learning_rate": 4.896373056994819e-08, + "loss": -0.0002, + "reward": 2.4999988079071045, + "reward_std": 8.696125348706119e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999988079071045, + "step": 3672 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.515544041450777, + "grad_norm": 30.72660172007966, + "kl": 0.1531982421875, + "learning_rate": 4.870466321243523e-08, + "loss": 0.0001, + "reward": 1.9995241165161133, + "reward_std": 9.257665772111068e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995240569114685, + "step": 3673 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.518134715025907, + "grad_norm": 0.2642352667876703, + "kl": 0.13818359375, + "learning_rate": 4.844559585492228e-08, + "loss": 0.0003, + "reward": 2.4999903440475464, + "reward_std": 3.7532283272412315e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999902844429016, + "step": 3674 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.520725388601036, + "grad_norm": 0.16631709231888067, + "kl": 0.14208984375, + "learning_rate": 4.818652849740933e-08, + "loss": 0.0013, + "reward": 2.4999961853027344, + "reward_std": 1.4295671917352593e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960660934448, + "step": 3675 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 9.523316062176166, + "grad_norm": 2.08541732953423, + "kl": 0.084228515625, + "learning_rate": 4.7927461139896366e-08, + "loss": 0.0005, + "reward": 1.999800682067871, + "reward_std": 1.675287882108023e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.499800682067871, + "step": 3676 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.525906735751295, + "grad_norm": 0.05609196010699713, + "kl": 0.07025146484375, + "learning_rate": 4.766839378238342e-08, + "loss": 0.0004, + "reward": 2.499998450279236, + "reward_std": 1.1969843285442039e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 3677 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.528497409326425, + "grad_norm": 1.1533177048416927, + "kl": 0.04803466796875, + "learning_rate": 4.7409326424870465e-08, + "loss": 0.0003, + "reward": 2.4999929666519165, + "reward_std": 5.892880835745018e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999926686286926, + "step": 3678 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.531088082901555, + "grad_norm": 0.46260536400298147, + "kl": 0.05987548828125, + "learning_rate": 4.715025906735751e-08, + "loss": 0.0001, + "reward": 2.499995231628418, + "reward_std": 3.915376282748184e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999953508377075, + "step": 3679 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.533678756476684, + "grad_norm": 0.29252599112808325, + "kl": 0.140380859375, + "learning_rate": 4.689119170984456e-08, + "loss": 0.0012, + "reward": 1.9999163150787354, + "reward_std": 5.6195223692157015e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4999162256717682, + "step": 3680 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5625, + "epoch": 9.536269430051814, + "grad_norm": 44.67028084156893, + "kl": 0.08837890625, + "learning_rate": 4.66321243523316e-08, + "loss": 0.0003, + "reward": 1.3175342679023743, + "reward_std": 0.2011123927659355, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.8175342977046967, + "step": 3681 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.538860103626943, + "grad_norm": 0.7972440528709072, + "kl": 0.1826171875, + "learning_rate": 4.637305699481865e-08, + "loss": 0.001, + "reward": 2.4999953508377075, + "reward_std": 5.055637643636146e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3682 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.541450777202073, + "grad_norm": 0.08480518440858621, + "kl": 0.045562744140625, + "learning_rate": 4.61139896373057e-08, + "loss": 0.0, + "reward": 2.499998092651367, + "reward_std": 1.068411563664995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3683 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.544041450777202, + "grad_norm": 2.9252045969231006, + "kl": 0.15283203125, + "learning_rate": 4.585492227979274e-08, + "loss": 0.001, + "reward": 1.8866026401519775, + "reward_std": 0.00025362599205891456, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3866024613380432, + "step": 3684 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.546632124352332, + "grad_norm": 2.4330944849459915, + "kl": 0.17138671875, + "learning_rate": 4.559585492227979e-08, + "loss": 0.0008, + "reward": 2.499996304512024, + "reward_std": 5.3129729167267215e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3685 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.549222797927461, + "grad_norm": 0.30175427060245463, + "kl": 0.092529296875, + "learning_rate": 4.533678756476684e-08, + "loss": 0.0007, + "reward": 2.499994993209839, + "reward_std": 3.3093471074607805e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 3686 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.55181347150259, + "grad_norm": 1.6565226912264372, + "kl": 0.080078125, + "learning_rate": 4.5077720207253886e-08, + "loss": -0.0004, + "reward": 2.4999821186065674, + "reward_std": 8.794115728960605e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999821186065674, + "step": 3687 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.55440414507772, + "grad_norm": 0.43844046473212195, + "kl": 0.09033203125, + "learning_rate": 4.4818652849740926e-08, + "loss": 0.0007, + "reward": 2.4999982118606567, + "reward_std": 2.3450434696314915e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998152256012, + "step": 3688 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.55699481865285, + "grad_norm": 0.2765550217688546, + "kl": 0.069580078125, + "learning_rate": 4.455958549222798e-08, + "loss": 0.0001, + "reward": 2.4999977350234985, + "reward_std": 2.760797855216879e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3689 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6875, + "epoch": 9.55958549222798, + "grad_norm": 0.2689603541677444, + "kl": 0.375, + "learning_rate": 4.4300518134715024e-08, + "loss": 0.0015, + "reward": 1.4999990463256836, + "reward_std": 1.0095153584188665e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999991059303284, + "step": 3690 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.562176165803109, + "grad_norm": 0.10081432188371749, + "kl": 0.048828125, + "learning_rate": 4.404145077720207e-08, + "loss": -0.0002, + "reward": 2.49999737739563, + "reward_std": 2.1196274246904068e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 3691 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.564766839378239, + "grad_norm": 0.09022589428825767, + "kl": 0.095947265625, + "learning_rate": 4.3782383419689116e-08, + "loss": 0.001, + "reward": 2.499998450279236, + "reward_std": 9.024412861435849e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3692 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.567357512953368, + "grad_norm": 0.06521363436271893, + "kl": 0.10595703125, + "learning_rate": 4.352331606217616e-08, + "loss": 0.001, + "reward": 2.499998688697815, + "reward_std": 1.3152069300303992e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999986290931702, + "step": 3693 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.569948186528498, + "grad_norm": 0.4996267812739185, + "kl": 0.17529296875, + "learning_rate": 4.3264248704663215e-08, + "loss": 0.0007, + "reward": 2.499996304512024, + "reward_std": 3.1918979175316053e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3694 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.572538860103627, + "grad_norm": 0.03102964250792375, + "kl": 0.062255859375, + "learning_rate": 4.3005181347150255e-08, + "loss": 0.0002, + "reward": 2.499998688697815, + "reward_std": 8.088461527222535e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3695 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.575129533678757, + "grad_norm": 0.5968775988121061, + "kl": 0.0966796875, + "learning_rate": 4.27461139896373e-08, + "loss": 0.0004, + "reward": 2.499991774559021, + "reward_std": 7.86671148489404e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 3696 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.577720207253886, + "grad_norm": 4.124912297052042, + "kl": 0.2158203125, + "learning_rate": 4.2487046632124353e-08, + "loss": 0.0002, + "reward": 1.9745756387710571, + "reward_std": 0.0001279445177715388, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4745756387710571, + "step": 3697 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.580310880829016, + "grad_norm": 0.11706880833138766, + "kl": 0.0556640625, + "learning_rate": 4.22279792746114e-08, + "loss": -0.0008, + "reward": 2.4999895095825195, + "reward_std": 1.6584326658630744e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999989628791809, + "step": 3698 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.582901554404145, + "grad_norm": 0.04730237244760757, + "kl": 0.10577392578125, + "learning_rate": 4.196891191709844e-08, + "loss": 0.0001, + "reward": 2.4999983310699463, + "reward_std": 1.1811566480446345e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3699 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.585492227979275, + "grad_norm": 0.024967484034547332, + "kl": 0.107147216796875, + "learning_rate": 4.170984455958549e-08, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 9.510072516150103e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3700 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.588082901554404, + "grad_norm": 121.17959767493448, + "kl": 0.14208984375, + "learning_rate": 4.145077720207254e-08, + "loss": 0.0015, + "reward": 1.9833029508590698, + "reward_std": 0.00023415576535512628, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4833029210567474, + "step": 3701 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.590673575129534, + "grad_norm": 0.1787512941554565, + "kl": 0.070068359375, + "learning_rate": 4.119170984455959e-08, + "loss": -0.0, + "reward": 2.4999985694885254, + "reward_std": 1.3462990580137557e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998688697815, + "step": 3702 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.593264248704664, + "grad_norm": 0.11947837538790254, + "kl": 0.0731201171875, + "learning_rate": 4.093264248704663e-08, + "loss": 0.0006, + "reward": 2.4999983310699463, + "reward_std": 1.4272621342570346e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3703 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.595854922279793, + "grad_norm": 2.0841880998719566, + "kl": 0.1092529296875, + "learning_rate": 4.0673575129533676e-08, + "loss": 0.0004, + "reward": 2.499931573867798, + "reward_std": 1.6917330569299338e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999931812286377, + "step": 3704 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.598445595854923, + "grad_norm": 0.4801868486753808, + "kl": 0.18701171875, + "learning_rate": 4.041450777202073e-08, + "loss": 0.0009, + "reward": 2.4999905824661255, + "reward_std": 1.6072878850081906e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999906420707703, + "step": 3705 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.601036269430052, + "grad_norm": 0.09784307702882991, + "kl": 0.0555419921875, + "learning_rate": 4.015544041450777e-08, + "loss": 0.0005, + "reward": 2.499998927116394, + "reward_std": 5.41505286832944e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998927116394, + "step": 3706 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.603626943005182, + "grad_norm": 0.2432695154534563, + "kl": 0.077880859375, + "learning_rate": 3.9896373056994814e-08, + "loss": 0.0002, + "reward": 2.4999951124191284, + "reward_std": 3.6479295886238106e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999951124191284, + "step": 3707 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.606217616580311, + "grad_norm": 0.2059775111423504, + "kl": 0.1259765625, + "learning_rate": 3.9637305699481867e-08, + "loss": 0.0005, + "reward": 2.4999964237213135, + "reward_std": 1.7242742842427106e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3708 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.60880829015544, + "grad_norm": 0.4729351482880418, + "kl": 0.067626953125, + "learning_rate": 3.937823834196891e-08, + "loss": -0.0005, + "reward": 2.499995470046997, + "reward_std": 1.924216746829188e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3709 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.61139896373057, + "grad_norm": 0.06302451880046785, + "kl": 0.01739501953125, + "learning_rate": 3.911917098445595e-08, + "loss": -0.0004, + "reward": 2.499998092651367, + "reward_std": 1.1708349632044701e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3710 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.6139896373057, + "grad_norm": 1.2908286275947682, + "kl": 0.109375, + "learning_rate": 3.8860103626943005e-08, + "loss": 0.0012, + "reward": 2.4999916553497314, + "reward_std": 5.72933799958264e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999914169311523, + "step": 3711 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.61658031088083, + "grad_norm": 0.16426915608836806, + "kl": 0.05474853515625, + "learning_rate": 3.860103626943005e-08, + "loss": -0.0002, + "reward": 2.499995470046997, + "reward_std": 1.9299259577110206e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3712 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.619170984455959, + "grad_norm": 0.3283778988085709, + "kl": 0.04254150390625, + "learning_rate": 3.83419689119171e-08, + "loss": 0.0006, + "reward": 2.499997138977051, + "reward_std": 2.7839587346534245e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 3713 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.621761658031089, + "grad_norm": 0.6653877930392398, + "kl": 0.083251953125, + "learning_rate": 3.808290155440414e-08, + "loss": -0.0002, + "reward": 2.499993324279785, + "reward_std": 4.290542960916355e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999935030937195, + "step": 3714 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0625, + "epoch": 9.624352331606218, + "grad_norm": 3.837787270262384, + "kl": 0.15380859375, + "learning_rate": 3.782383419689119e-08, + "loss": 0.0011, + "reward": 2.499161958694458, + "reward_std": 1.9703396219483693e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9991618990898132, + "step": 3715 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.626943005181348, + "grad_norm": 2.7144190963613366, + "kl": 0.06256103515625, + "learning_rate": 3.7564766839378235e-08, + "loss": -0.0002, + "reward": 2.4999953508377075, + "reward_std": 8.641868362246896e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955296516418, + "step": 3716 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.629533678756477, + "grad_norm": 10.60515196088003, + "kl": 0.11834716796875, + "learning_rate": 3.730569948186528e-08, + "loss": 0.0014, + "reward": 1.9977213144302368, + "reward_std": 9.293627499573631e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4977212250232697, + "step": 3717 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.632124352331607, + "grad_norm": 5.168370874686422, + "kl": 0.120697021484375, + "learning_rate": 3.704663212435233e-08, + "loss": 0.0002, + "reward": 1.822607696056366, + "reward_std": 0.00047588109543994506, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3226077854633331, + "step": 3718 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.634715025906736, + "grad_norm": 1.1372807076535765, + "kl": 0.0670166015625, + "learning_rate": 3.678756476683938e-08, + "loss": 0.0004, + "reward": 2.499993920326233, + "reward_std": 5.527530674953596e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 3719 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.637305699481866, + "grad_norm": 0.8630472185584769, + "kl": 0.10107421875, + "learning_rate": 3.6528497409326426e-08, + "loss": 0.0004, + "reward": 2.4999948740005493, + "reward_std": 6.338167395369965e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999947547912598, + "step": 3720 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.639896373056995, + "grad_norm": 0.14228762130486428, + "kl": 0.080322265625, + "learning_rate": 3.6269430051813465e-08, + "loss": 0.0009, + "reward": 2.4999951124191284, + "reward_std": 3.8539049569408235e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999949932098389, + "step": 3721 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.642487046632125, + "grad_norm": 0.7850317170906873, + "kl": 0.079833984375, + "learning_rate": 3.601036269430052e-08, + "loss": 0.0009, + "reward": 2.4999932050704956, + "reward_std": 4.227991894367733e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 3722 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.645077720207254, + "grad_norm": 1.776056240170037, + "kl": 0.101806640625, + "learning_rate": 3.5751295336787564e-08, + "loss": 0.0009, + "reward": 1.9998681545257568, + "reward_std": 8.81521782503114e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4998682141304016, + "step": 3723 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.647668393782384, + "grad_norm": 0.06614170299848908, + "kl": 0.068359375, + "learning_rate": 3.549222797927461e-08, + "loss": 0.0004, + "reward": 2.4999380111694336, + "reward_std": 2.5189199277519947e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999937891960144, + "step": 3724 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.650259067357513, + "grad_norm": 62.65769396126327, + "kl": 0.107421875, + "learning_rate": 3.5233160621761656e-08, + "loss": 0.0006, + "reward": 1.8926363587379456, + "reward_std": 0.00041003531885053235, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3926363289356232, + "step": 3725 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.652849740932643, + "grad_norm": 1.902947433846418, + "kl": 0.130615234375, + "learning_rate": 3.49740932642487e-08, + "loss": 0.0002, + "reward": 1.8233218789100647, + "reward_std": 0.00016650663928885479, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.3233221173286438, + "step": 3726 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.655440414507773, + "grad_norm": 0.3671349059924395, + "kl": 0.091552734375, + "learning_rate": 3.471502590673575e-08, + "loss": 0.0012, + "reward": 2.499998092651367, + "reward_std": 1.5305606950732908e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3727 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.658031088082902, + "grad_norm": 0.2572672720997696, + "kl": 0.13037109375, + "learning_rate": 3.44559585492228e-08, + "loss": 0.0007, + "reward": 2.4999889135360718, + "reward_std": 2.941994694083405e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887347221375, + "step": 3728 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.660621761658032, + "grad_norm": 0.35747154345498017, + "kl": 0.181640625, + "learning_rate": 3.419689119170984e-08, + "loss": 0.0017, + "reward": 2.4999948740005493, + "reward_std": 3.3435303521400783e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 3729 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.663212435233161, + "grad_norm": 0.11123236866937876, + "kl": 0.05322265625, + "learning_rate": 3.3937823834196887e-08, + "loss": 0.0005, + "reward": 2.499996066093445, + "reward_std": 1.737238562782295e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958872795105, + "step": 3730 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.66580310880829, + "grad_norm": 1.717227705973132, + "kl": 0.0797119140625, + "learning_rate": 3.367875647668394e-08, + "loss": 0.0002, + "reward": 2.499983072280884, + "reward_std": 1.2239142506587086e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999831914901733, + "step": 3731 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.75, + "epoch": 9.66839378238342, + "grad_norm": 0.10325904713031216, + "kl": 0.074951171875, + "learning_rate": 3.341968911917098e-08, + "loss": -0.0006, + "reward": 2.4999977350234985, + "reward_std": 1.1165362252540945e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979138374329, + "step": 3732 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.875, + "epoch": 9.67098445595855, + "grad_norm": 140.2845593175123, + "kl": 0.1151123046875, + "learning_rate": 3.3160621761658025e-08, + "loss": 0.0, + "reward": 2.353145122528076, + "reward_std": 0.2719174511029223, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.8531451225280762, + "step": 3733 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.67357512953368, + "grad_norm": 0.06890593482295215, + "kl": 0.172607421875, + "learning_rate": 3.290155440414508e-08, + "loss": 0.001, + "reward": 2.4999979734420776, + "reward_std": 1.6682928389855078e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3734 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.676165803108809, + "grad_norm": 0.06258540429877894, + "kl": 0.04083251953125, + "learning_rate": 3.2642487046632124e-08, + "loss": -0.0001, + "reward": 2.4999982118606567, + "reward_std": 1.3081356655675336e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3735 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.678756476683938, + "grad_norm": 0.06107534896524303, + "kl": 0.043670654296875, + "learning_rate": 3.238341968911917e-08, + "loss": 0.0009, + "reward": 2.4999979734420776, + "reward_std": 8.629858569975113e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3736 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.681347150259068, + "grad_norm": 0.9024815467901885, + "kl": 0.04522705078125, + "learning_rate": 3.2124352331606216e-08, + "loss": 0.0, + "reward": 2.499995231628418, + "reward_std": 4.063416781718843e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3737 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.683937823834198, + "grad_norm": 0.2986592789776921, + "kl": 0.0631103515625, + "learning_rate": 3.186528497409326e-08, + "loss": 0.0005, + "reward": 2.499997854232788, + "reward_std": 1.7263938616451924e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3738 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.686528497409327, + "grad_norm": 58.65350870185054, + "kl": 0.100830078125, + "learning_rate": 3.1606217616580314e-08, + "loss": -0.0001, + "reward": 2.4999840259552, + "reward_std": 2.5987753360823262e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999984085559845, + "step": 3739 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.689119170984457, + "grad_norm": 0.14544432949484318, + "kl": 0.075439453125, + "learning_rate": 3.1347150259067354e-08, + "loss": 0.0004, + "reward": 2.4999977350234985, + "reward_std": 2.395315050307545e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3740 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.691709844559586, + "grad_norm": 0.22318860519853667, + "kl": 0.049072265625, + "learning_rate": 3.10880829015544e-08, + "loss": 0.0005, + "reward": 2.4999990463256836, + "reward_std": 8.763437051584333e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999992847442627, + "step": 3741 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.694300518134716, + "grad_norm": 0.05357131573647331, + "kl": 0.063720703125, + "learning_rate": 3.082901554404145e-08, + "loss": 0.0007, + "reward": 2.499997615814209, + "reward_std": 1.2371273498956725e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3742 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.696891191709845, + "grad_norm": 0.11461086108174581, + "kl": 0.20068359375, + "learning_rate": 3.05699481865285e-08, + "loss": 0.0014, + "reward": 2.4999977350234985, + "reward_std": 1.5782950981702015e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3743 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.5, + "epoch": 9.699481865284975, + "grad_norm": 0.07200265168897, + "kl": 0.1234130859375, + "learning_rate": 3.0310880829015545e-08, + "loss": 0.0005, + "reward": 2.4999983310699463, + "reward_std": 1.5007875617811806e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985694885254, + "step": 3744 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0625, + "epoch": 9.702072538860104, + "grad_norm": 29.351599642170918, + "kl": 0.24072265625, + "learning_rate": 3.005181347150259e-08, + "loss": 0.0001, + "reward": 1.9989938139915466, + "reward_std": 0.002668613646619633, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4989937841892242, + "step": 3745 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.704663212435234, + "grad_norm": 14.557297987329388, + "kl": 0.098876953125, + "learning_rate": 2.9792746113989634e-08, + "loss": -0.0002, + "reward": 2.249961197376251, + "reward_std": 0.2672802810629378, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.7499613761901855, + "step": 3746 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.707253886010363, + "grad_norm": 0.724214303768482, + "kl": 0.13836669921875, + "learning_rate": 2.9533678756476683e-08, + "loss": 0.0015, + "reward": 2.499993324279785, + "reward_std": 5.992733804305317e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999933242797852, + "step": 3747 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.709844559585493, + "grad_norm": 0.07465356914804408, + "kl": 0.103271484375, + "learning_rate": 2.927461139896373e-08, + "loss": 0.0007, + "reward": 2.49999737739563, + "reward_std": 1.1126558661089803e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3748 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 9.712435233160623, + "grad_norm": 0.212835517991201, + "kl": 0.075439453125, + "learning_rate": 2.9015544041450775e-08, + "loss": 0.0012, + "reward": 2.4999966621398926, + "reward_std": 1.6554947137592535e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3749 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.715025906735752, + "grad_norm": 0.2615156912937327, + "kl": 0.1568603515625, + "learning_rate": 2.875647668393782e-08, + "loss": 0.0012, + "reward": 2.499997854232788, + "reward_std": 2.2649705329058634e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3750 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.717616580310882, + "grad_norm": 1.6987034779112902, + "kl": 0.05908203125, + "learning_rate": 2.849740932642487e-08, + "loss": -0.0002, + "reward": 2.4999903440475464, + "reward_std": 6.948768202619249e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999903440475464, + "step": 3751 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.720207253886011, + "grad_norm": 0.23577031258330824, + "kl": 0.072998046875, + "learning_rate": 2.8238341968911916e-08, + "loss": -0.001, + "reward": 2.4999966621398926, + "reward_std": 3.0685955607623328e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999967813491821, + "step": 3752 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.72279792746114, + "grad_norm": 0.2249468532602767, + "kl": 0.04443359375, + "learning_rate": 2.7979274611398963e-08, + "loss": 0.0002, + "reward": 2.499997138977051, + "reward_std": 3.64227867066802e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3753 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.72538860103627, + "grad_norm": 0.28003767041851363, + "kl": 0.101806640625, + "learning_rate": 2.772020725388601e-08, + "loss": 0.0001, + "reward": 2.4999959468841553, + "reward_std": 1.938026485959199e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3754 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.7279792746114, + "grad_norm": 0.1199662666180605, + "kl": 0.047607421875, + "learning_rate": 2.7461139896373058e-08, + "loss": 0.0002, + "reward": 2.4999979734420776, + "reward_std": 1.6259044741673279e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980330467224, + "step": 3755 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.73056994818653, + "grad_norm": 0.44208760584098933, + "kl": 0.111328125, + "learning_rate": 2.72020725388601e-08, + "loss": -0.0, + "reward": 2.4999959468841553, + "reward_std": 2.95609214617798e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999960064888, + "step": 3756 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.733160621761659, + "grad_norm": 4.426274242345284, + "kl": 0.13623046875, + "learning_rate": 2.694300518134715e-08, + "loss": 0.0001, + "reward": 1.9985175132751465, + "reward_std": 5.957219400443137e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4985175430774689, + "step": 3757 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.735751295336787, + "grad_norm": 0.05787084388679763, + "kl": 0.041748046875, + "learning_rate": 2.6683937823834196e-08, + "loss": -0.0001, + "reward": 2.499998450279236, + "reward_std": 7.203493765928215e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999984502792358, + "step": 3758 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.738341968911918, + "grad_norm": 1.9230994470593699, + "kl": 0.061065673828125, + "learning_rate": 2.6424870466321246e-08, + "loss": -0.0001, + "reward": 2.4990181922912598, + "reward_std": 2.2464121911980328e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9990183115005493, + "step": 3759 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.740932642487046, + "grad_norm": 2.2353399304193444, + "kl": 0.064208984375, + "learning_rate": 2.6165803108808288e-08, + "loss": 0.0001, + "reward": 2.4999868869781494, + "reward_std": 1.0103552313012187e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987006187439, + "step": 3760 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.743523316062177, + "grad_norm": 1.3670086812315756, + "kl": 0.0726318359375, + "learning_rate": 2.5906735751295334e-08, + "loss": -0.0009, + "reward": 2.49999737739563, + "reward_std": 2.848078565875767e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999976754188538, + "step": 3761 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.746113989637305, + "grad_norm": 0.11133539059427926, + "kl": 0.086669921875, + "learning_rate": 2.5647668393782384e-08, + "loss": 0.0006, + "reward": 2.4999887943267822, + "reward_std": 2.2894830919995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999887943267822, + "step": 3762 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.5, + "epoch": 9.748704663212436, + "grad_norm": 0.31928769101512533, + "kl": 0.0579833984375, + "learning_rate": 2.5388601036269426e-08, + "loss": 0.0012, + "reward": 2.4999940395355225, + "reward_std": 3.0099064360911143e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999940395355225, + "step": 3763 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.751295336787564, + "grad_norm": 0.19435776073540492, + "kl": 0.077880859375, + "learning_rate": 2.5129533678756476e-08, + "loss": -0.001, + "reward": 2.4999982118606567, + "reward_std": 1.1160927329001424e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3764 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.753886010362695, + "grad_norm": 2.655197153259587, + "kl": 0.51904296875, + "learning_rate": 2.4870466321243522e-08, + "loss": 0.0022, + "reward": 2.499995708465576, + "reward_std": 3.656208150459861e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3765 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.756476683937823, + "grad_norm": 0.03164583537337646, + "kl": 0.07550048828125, + "learning_rate": 2.4611398963730568e-08, + "loss": -0.0006, + "reward": 2.499998092651367, + "reward_std": 9.895479706756305e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3766 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.759067357512954, + "grad_norm": 0.11328718472620572, + "kl": 0.04443359375, + "learning_rate": 2.4352331606217614e-08, + "loss": -0.0011, + "reward": 2.4999977350234985, + "reward_std": 1.6928198078858259e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977946281433, + "step": 3767 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.761658031088082, + "grad_norm": 0.17890890954896987, + "kl": 0.1036376953125, + "learning_rate": 2.4093264248704663e-08, + "loss": 0.0005, + "reward": 2.4999966621398926, + "reward_std": 3.90333059385739e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966025352478, + "step": 3768 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.764248704663213, + "grad_norm": 0.3315109014319285, + "kl": 0.018463134765625, + "learning_rate": 2.383419689119171e-08, + "loss": -0.0011, + "reward": 2.4999982118606567, + "reward_std": 1.782329604793631e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3769 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.766839378238341, + "grad_norm": 0.7381150860547785, + "kl": 0.10205078125, + "learning_rate": 2.3575129533678756e-08, + "loss": 0.0003, + "reward": 2.499989867210388, + "reward_std": 6.525874823637423e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999897480010986, + "step": 3770 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.76943005181347, + "grad_norm": 0.7626784689468261, + "kl": 0.15625, + "learning_rate": 2.33160621761658e-08, + "loss": 0.0002, + "reward": 2.4999961853027344, + "reward_std": 4.870618653285419e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999963641166687, + "step": 3771 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.7720207253886, + "grad_norm": 0.272732177240815, + "kl": 0.099609375, + "learning_rate": 2.305699481865285e-08, + "loss": -0.0002, + "reward": 2.499997138977051, + "reward_std": 2.0385086827445775e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971985816956, + "step": 3772 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.77461139896373, + "grad_norm": 0.08370503369204937, + "kl": 0.046722412109375, + "learning_rate": 2.2797927461139894e-08, + "loss": 0.0006, + "reward": 2.499997138977051, + "reward_std": 1.516330769391061e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3773 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.77720207253886, + "grad_norm": 0.1132083772557839, + "kl": 0.03887939453125, + "learning_rate": 2.2538860103626943e-08, + "loss": 0.0003, + "reward": 2.499998092651367, + "reward_std": 1.2136746931901143e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3774 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.779792746113989, + "grad_norm": 0.863955062801784, + "kl": 0.0833740234375, + "learning_rate": 2.227979274611399e-08, + "loss": 0.0016, + "reward": 2.499988079071045, + "reward_std": 4.65600027155233e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999987781047821, + "step": 3775 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.782383419689118, + "grad_norm": 0.16093304143443202, + "kl": 0.05908203125, + "learning_rate": 2.2020725388601035e-08, + "loss": 0.0016, + "reward": 2.4999970197677612, + "reward_std": 2.4928035031734908e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999969601631165, + "step": 3776 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.784974093264248, + "grad_norm": 0.02913282490116848, + "kl": 0.077392578125, + "learning_rate": 2.176165803108808e-08, + "loss": 0.0008, + "reward": 2.499998092651367, + "reward_std": 5.691192512813359e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3777 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.787564766839377, + "grad_norm": 0.1849690693220191, + "kl": 0.0546875, + "learning_rate": 2.1502590673575127e-08, + "loss": -0.0001, + "reward": 2.4999972581863403, + "reward_std": 3.0798167927059694e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3778 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.790155440414507, + "grad_norm": 0.5128423876435827, + "kl": 0.09375, + "learning_rate": 2.1243523316062177e-08, + "loss": 0.0011, + "reward": 2.499995708465576, + "reward_std": 4.422236997925211e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999957084655762, + "step": 3779 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.792746113989637, + "grad_norm": 0.763581929274989, + "kl": 0.08154296875, + "learning_rate": 2.098445595854922e-08, + "loss": -0.0001, + "reward": 2.499995231628418, + "reward_std": 4.636290782400465e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3780 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.795336787564766, + "grad_norm": 0.11112680421646717, + "kl": 0.058349609375, + "learning_rate": 2.072538860103627e-08, + "loss": -0.0001, + "reward": 2.499988079071045, + "reward_std": 3.1279002996598138e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999881982803345, + "step": 3781 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.797927461139896, + "grad_norm": 0.13329195261908325, + "kl": 0.1043701171875, + "learning_rate": 2.0466321243523315e-08, + "loss": 0.0007, + "reward": 2.4999974966049194, + "reward_std": 1.7682224324744311e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 3782 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.800518134715025, + "grad_norm": 0.20427653188969785, + "kl": 0.083740234375, + "learning_rate": 2.0207253886010364e-08, + "loss": 0.0008, + "reward": 2.4999983310699463, + "reward_std": 1.6481406532875553e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982714653015, + "step": 3783 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.3125, + "epoch": 9.803108808290155, + "grad_norm": 4.727734870010785, + "kl": 0.1689453125, + "learning_rate": 1.9948186528497407e-08, + "loss": 0.0008, + "reward": 1.9518914222717285, + "reward_std": 0.019570964314993944, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4518914818763733, + "step": 3784 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.805699481865284, + "grad_norm": 0.17130678263956142, + "kl": 0.0391845703125, + "learning_rate": 1.9689119170984456e-08, + "loss": 0.0003, + "reward": 2.499996304512024, + "reward_std": 1.8100616898664157e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964237213135, + "step": 3785 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.808290155440414, + "grad_norm": 7.425114394693029, + "kl": 0.0794677734375, + "learning_rate": 1.9430051813471502e-08, + "loss": 0.0005, + "reward": 2.499907612800598, + "reward_std": 2.0567517594827223e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999075531959534, + "step": 3786 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.810880829015543, + "grad_norm": 0.79838899036328, + "kl": 0.1065673828125, + "learning_rate": 1.917098445595855e-08, + "loss": -0.0, + "reward": 1.9995591640472412, + "reward_std": 1.4305381114354532e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4995591640472412, + "step": 3787 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.813471502590673, + "grad_norm": 0.19777259723981988, + "kl": 0.15185546875, + "learning_rate": 1.8911917098445595e-08, + "loss": -0.0006, + "reward": 2.4999972581863403, + "reward_std": 1.697441803116817e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974370002747, + "step": 3788 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.816062176165802, + "grad_norm": 0.13136457951880087, + "kl": 0.061767578125, + "learning_rate": 1.865284974093264e-08, + "loss": 0.0007, + "reward": 2.4999970197677612, + "reward_std": 1.8212828649666335e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 3789 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.818652849740932, + "grad_norm": 0.8746056997016923, + "kl": 0.11474609375, + "learning_rate": 1.839378238341969e-08, + "loss": -0.0004, + "reward": 2.4999914169311523, + "reward_std": 7.2712532528385054e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999916553497314, + "step": 3790 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.821243523316062, + "grad_norm": 0.08716659230327739, + "kl": 0.12255859375, + "learning_rate": 1.8134715025906733e-08, + "loss": 0.0008, + "reward": 2.499998450279236, + "reward_std": 1.1145666576339863e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999985098838806, + "step": 3791 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.823834196891191, + "grad_norm": 0.15748695723144712, + "kl": 0.091796875, + "learning_rate": 1.7875647668393782e-08, + "loss": 0.0012, + "reward": 2.499886155128479, + "reward_std": 6.795312401663978e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9998859763145447, + "step": 3792 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.82642487046632, + "grad_norm": 0.3217983693923802, + "kl": 0.08447265625, + "learning_rate": 1.7616580310880828e-08, + "loss": 0.0002, + "reward": 2.499997854232788, + "reward_std": 2.048841679425095e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3793 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.82901554404145, + "grad_norm": 0.35092534128675484, + "kl": 0.096435546875, + "learning_rate": 1.7357512953367874e-08, + "loss": 0.0007, + "reward": 2.4999568462371826, + "reward_std": 4.996159248094045e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999956727027893, + "step": 3794 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.83160621761658, + "grad_norm": 0.12016096421217412, + "kl": 0.04119873046875, + "learning_rate": 1.709844559585492e-08, + "loss": -0.0001, + "reward": 2.499995708465576, + "reward_std": 2.6271003434885642e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3795 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.83419689119171, + "grad_norm": 24.764060376249354, + "kl": 0.144775390625, + "learning_rate": 1.683937823834197e-08, + "loss": 0.0002, + "reward": 1.9948484897613525, + "reward_std": 0.0002595675226189087, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4948484897613525, + "step": 3796 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.836787564766839, + "grad_norm": 0.14882883155472126, + "kl": 0.1468505859375, + "learning_rate": 1.6580310880829012e-08, + "loss": -0.0009, + "reward": 2.4999970197677612, + "reward_std": 1.866126240201993e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3797 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.839378238341968, + "grad_norm": 0.20597652662593988, + "kl": 0.0849609375, + "learning_rate": 1.6321243523316062e-08, + "loss": -0.0004, + "reward": 2.4999947547912598, + "reward_std": 2.076449561627669e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948143959045, + "step": 3798 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.841968911917098, + "grad_norm": 5.752662767418689, + "kl": 0.052001953125, + "learning_rate": 1.6062176165803108e-08, + "loss": -0.0001, + "reward": 2.4999842643737793, + "reward_std": 1.34437277665711e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999842047691345, + "step": 3799 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.844559585492227, + "grad_norm": 0.06997755952859776, + "kl": 0.13623046875, + "learning_rate": 1.5803108808290157e-08, + "loss": 0.001, + "reward": 2.4999974966049194, + "reward_std": 1.7268565102313005e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3800 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.847150259067357, + "grad_norm": 0.11732727315727917, + "kl": 0.029541015625, + "learning_rate": 1.55440414507772e-08, + "loss": 0.0001, + "reward": 2.4999974966049194, + "reward_std": 2.2852736378808913e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3801 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.849740932642487, + "grad_norm": 0.16314470026953798, + "kl": 0.0380859375, + "learning_rate": 1.528497409326425e-08, + "loss": -0.0004, + "reward": 2.4999979734420776, + "reward_std": 1.5315859513975738e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3802 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.852331606217616, + "grad_norm": 0.7560856415784727, + "kl": 0.12841796875, + "learning_rate": 1.5025906735751295e-08, + "loss": 0.0012, + "reward": 2.4999938011169434, + "reward_std": 4.063801270604017e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938607215881, + "step": 3803 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.854922279792746, + "grad_norm": 0.30494854393259924, + "kl": 0.103759765625, + "learning_rate": 1.4766839378238341e-08, + "loss": 0.0011, + "reward": 2.499996304512024, + "reward_std": 2.9327789548005967e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996304512024, + "step": 3804 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.857512953367875, + "grad_norm": 4.3085895928548865, + "kl": 0.14697265625, + "learning_rate": 1.4507772020725387e-08, + "loss": 0.0005, + "reward": 1.4902091026306152, + "reward_std": 0.00010176981845688715, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9902092516422272, + "step": 3805 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.860103626943005, + "grad_norm": 0.8968894549671418, + "kl": 0.36865234375, + "learning_rate": 1.4248704663212435e-08, + "loss": 0.0018, + "reward": 2.4999927282333374, + "reward_std": 3.3152416563098086e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999927282333374, + "step": 3806 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.862694300518134, + "grad_norm": 0.129771885776785, + "kl": 0.113037109375, + "learning_rate": 1.3989637305699481e-08, + "loss": 0.001, + "reward": 2.4999953508377075, + "reward_std": 1.5411479239446635e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999954104423523, + "step": 3807 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.865284974093264, + "grad_norm": 0.9655226563755117, + "kl": 0.092041015625, + "learning_rate": 1.3730569948186529e-08, + "loss": -0.0, + "reward": 2.4999953508377075, + "reward_std": 4.0481413634552155e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999995470046997, + "step": 3808 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.867875647668393, + "grad_norm": 0.06693227961757948, + "kl": 0.07958984375, + "learning_rate": 1.3471502590673575e-08, + "loss": -0.0003, + "reward": 2.4999974966049194, + "reward_std": 1.5637396586498653e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999975562095642, + "step": 3809 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.870466321243523, + "grad_norm": 0.7752756350370011, + "kl": 0.03509521484375, + "learning_rate": 1.3212435233160623e-08, + "loss": -0.0004, + "reward": 2.4999945163726807, + "reward_std": 5.215747933107195e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999945163726807, + "step": 3810 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.873056994818652, + "grad_norm": 0.46323596757288926, + "kl": 0.085205078125, + "learning_rate": 1.2953367875647667e-08, + "loss": -0.0005, + "reward": 2.4999982118606567, + "reward_std": 1.5863279259065166e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999983310699463, + "step": 3811 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.875647668393782, + "grad_norm": 10.814180108034106, + "kl": 0.1588134765625, + "learning_rate": 1.2694300518134713e-08, + "loss": 0.0003, + "reward": 1.987221598625183, + "reward_std": 0.000504313197097872, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.487221747636795, + "step": 3812 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.878238341968911, + "grad_norm": 0.07930464692349637, + "kl": 0.040283203125, + "learning_rate": 1.2435233160621761e-08, + "loss": -0.0012, + "reward": 2.4999988079071045, + "reward_std": 7.870119702602096e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999991059303284, + "step": 3813 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.880829015544041, + "grad_norm": 5.360684976662539, + "kl": 0.148681640625, + "learning_rate": 1.2176165803108807e-08, + "loss": 0.0006, + "reward": 1.9530180096626282, + "reward_std": 0.0001904405777111151, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.453018307685852, + "step": 3814 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.88341968911917, + "grad_norm": 0.7090085498795189, + "kl": 0.103515625, + "learning_rate": 1.1917098445595855e-08, + "loss": 0.0013, + "reward": 2.4999959468841553, + "reward_std": 4.313541751344019e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3815 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.8860103626943, + "grad_norm": 0.1961318160795609, + "kl": 0.0203857421875, + "learning_rate": 1.16580310880829e-08, + "loss": 0.0013, + "reward": 2.499997138977051, + "reward_std": 1.7680698078947898e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997079372406, + "step": 3816 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.88860103626943, + "grad_norm": 0.1563864673191935, + "kl": 0.0791015625, + "learning_rate": 1.1398963730569947e-08, + "loss": -0.0008, + "reward": 1.9984179735183716, + "reward_std": 1.602254400268066e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4984181225299835, + "step": 3817 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.89119170984456, + "grad_norm": 0.8464112951686901, + "kl": 0.098388671875, + "learning_rate": 1.1139896373056995e-08, + "loss": 0.0008, + "reward": 2.4999929666519165, + "reward_std": 5.945623769321173e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999992847442627, + "step": 3818 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.893782383419689, + "grad_norm": 0.16882770215270507, + "kl": 0.0828857421875, + "learning_rate": 1.088082901554404e-08, + "loss": 0.0003, + "reward": 2.4999964237213135, + "reward_std": 3.1638215887141996e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3819 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.896373056994818, + "grad_norm": 0.25704251731201067, + "kl": 0.103515625, + "learning_rate": 1.0621761658031088e-08, + "loss": 0.0007, + "reward": 2.499995708465576, + "reward_std": 2.1841456145921256e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999955892562866, + "step": 3820 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.898963730569948, + "grad_norm": 0.22854452296968955, + "kl": 0.0755615234375, + "learning_rate": 1.0362694300518134e-08, + "loss": 0.0002, + "reward": 2.4999964237213135, + "reward_std": 2.3129233568397467e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999964833259583, + "step": 3821 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.901554404145077, + "grad_norm": 0.2780902670483471, + "kl": 0.1239013671875, + "learning_rate": 1.0103626943005182e-08, + "loss": 0.0004, + "reward": 2.4999935626983643, + "reward_std": 3.199704337930598e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.99999338388443, + "step": 3822 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.904145077720207, + "grad_norm": 0.15069164553065906, + "kl": 0.05926513671875, + "learning_rate": 9.844559585492228e-09, + "loss": 0.0008, + "reward": 2.499997854232788, + "reward_std": 1.690750877969549e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997854232788, + "step": 3823 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.906735751295336, + "grad_norm": 0.111681637497529, + "kl": 0.06256103515625, + "learning_rate": 9.585492227979274e-09, + "loss": -0.0009, + "reward": 2.4999979734420776, + "reward_std": 1.5126065022741386e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3824 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.909326424870466, + "grad_norm": 0.14119331272713803, + "kl": 0.15283203125, + "learning_rate": 9.32642487046632e-09, + "loss": 0.0009, + "reward": 2.499996066093445, + "reward_std": 2.101251425301598e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999959468841553, + "step": 3825 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.911917098445596, + "grad_norm": 2.2590338934111807, + "kl": 0.147216796875, + "learning_rate": 9.067357512953366e-09, + "loss": 0.0003, + "reward": 1.9966699481010437, + "reward_std": 4.792127870700824e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4966698288917542, + "step": 3826 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.914507772020725, + "grad_norm": 0.16014649811724047, + "kl": 0.1083984375, + "learning_rate": 8.808290155440414e-09, + "loss": 0.0001, + "reward": 2.499997138977051, + "reward_std": 1.3942812984168995e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999971389770508, + "step": 3827 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.9375, + "epoch": 9.917098445595855, + "grad_norm": 11.941350974126527, + "kl": 0.0726318359375, + "learning_rate": 8.54922279792746e-09, + "loss": 0.0003, + "reward": 1.978324294090271, + "reward_std": 0.000299758665448735, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4783242344856262, + "step": 3828 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.919689119170984, + "grad_norm": 2.814335991610363, + "kl": 0.0833740234375, + "learning_rate": 8.290155440414506e-09, + "loss": -0.0006, + "reward": 2.4999910593032837, + "reward_std": 9.726326425152365e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999991238117218, + "step": 3829 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.922279792746114, + "grad_norm": 0.06296384186332242, + "kl": 0.1533203125, + "learning_rate": 8.031088082901554e-09, + "loss": -0.0008, + "reward": 2.499997854232788, + "reward_std": 1.9600353766691114e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999979734420776, + "step": 3830 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.924870466321243, + "grad_norm": 0.7612342519876283, + "kl": 0.122802734375, + "learning_rate": 7.7720207253886e-09, + "loss": 0.0008, + "reward": 2.4999938011169434, + "reward_std": 6.335151084613244e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999938011169434, + "step": 3831 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.927461139896373, + "grad_norm": 0.06541253092886004, + "kl": 0.143798828125, + "learning_rate": 7.512953367875648e-09, + "loss": 0.0003, + "reward": 2.4999983310699463, + "reward_std": 1.511202754045371e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999998390674591, + "step": 3832 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.930051813471502, + "grad_norm": 0.41704762680744645, + "kl": 0.108642578125, + "learning_rate": 7.253886010362694e-09, + "loss": 0.0004, + "reward": 1.998159408569336, + "reward_std": 1.640926217305605e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.498159408569336, + "step": 3833 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.932642487046632, + "grad_norm": 0.557653966923304, + "kl": 0.044342041015625, + "learning_rate": 6.994818652849741e-09, + "loss": 0.0003, + "reward": 2.4999966621398926, + "reward_std": 2.9428587140500895e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999966621398926, + "step": 3834 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.935233160621761, + "grad_norm": 0.3879970647839044, + "kl": 0.05755615234375, + "learning_rate": 6.7357512953367875e-09, + "loss": -0.0005, + "reward": 2.4999977350234985, + "reward_std": 2.1188032235386345e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999977350234985, + "step": 3835 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.937823834196891, + "grad_norm": 0.10700355409757978, + "kl": 0.054931640625, + "learning_rate": 6.476683937823834e-09, + "loss": -0.0001, + "reward": 2.499996781349182, + "reward_std": 1.9007566720574687e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996840953827, + "step": 3836 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.4375, + "epoch": 9.94041450777202, + "grad_norm": 1.0721553571189373, + "kl": 0.1796875, + "learning_rate": 6.2176165803108805e-09, + "loss": 0.0002, + "reward": 1.9939517974853516, + "reward_std": 2.099954613754562e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4939518868923187, + "step": 3837 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.94300518134715, + "grad_norm": 18.29581105921374, + "kl": 0.098388671875, + "learning_rate": 5.958549222797927e-09, + "loss": 0.0008, + "reward": 1.9990041255950928, + "reward_std": 3.2652942422828346e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4990040957927704, + "step": 3838 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.94559585492228, + "grad_norm": 0.9642019634985963, + "kl": 0.0869140625, + "learning_rate": 5.6994818652849734e-09, + "loss": -0.0009, + "reward": 2.4999358654022217, + "reward_std": 1.2356736306173843e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999359846115112, + "step": 3839 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.94818652849741, + "grad_norm": 0.3194617642396292, + "kl": 0.11474609375, + "learning_rate": 5.44041450777202e-09, + "loss": -0.0008, + "reward": 2.499995470046997, + "reward_std": 3.246663709433051e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999958276748657, + "step": 3840 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.875, + "epoch": 9.950777202072539, + "grad_norm": 0.15313760652201278, + "kl": 0.04736328125, + "learning_rate": 5.181347150259067e-09, + "loss": 0.0003, + "reward": 1.499997854232788, + "reward_std": 1.3699809642275795e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 0.9999979138374329, + "step": 3841 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.953367875647668, + "grad_norm": 0.4697979344373448, + "kl": 0.263671875, + "learning_rate": 4.922279792746114e-09, + "loss": -0.0002, + "reward": 2.4999961853027344, + "reward_std": 4.25611506216228e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999961853027344, + "step": 3842 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.955958549222798, + "grad_norm": 0.22517328883265977, + "kl": 0.0875244140625, + "learning_rate": 4.66321243523316e-09, + "loss": 0.0016, + "reward": 2.4999845027923584, + "reward_std": 3.266640874244331e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999844431877136, + "step": 3843 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.958549222797927, + "grad_norm": 0.2749985512302947, + "kl": 0.1087646484375, + "learning_rate": 4.404145077720207e-09, + "loss": 0.0003, + "reward": 2.4999977350234985, + "reward_std": 1.714500086791304e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997615814209, + "step": 3844 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.961139896373057, + "grad_norm": 22.4368799242481, + "kl": 0.14013671875, + "learning_rate": 4.145077720207253e-09, + "loss": 0.001, + "reward": 2.0622167587280273, + "reward_std": 0.17688181239589085, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.562216877937317, + "step": 3845 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.963730569948186, + "grad_norm": 0.01691129601873481, + "kl": 0.0653076171875, + "learning_rate": 3.8860103626943e-09, + "loss": -0.0003, + "reward": 2.4999990463256836, + "reward_std": 6.274361794567085e-07, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999992847442627, + "step": 3846 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.966321243523316, + "grad_norm": 1.0312766984474269, + "kl": 0.145263671875, + "learning_rate": 3.626943005181347e-09, + "loss": 0.0014, + "reward": 2.4999921321868896, + "reward_std": 6.374732151925855e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999921321868896, + "step": 3847 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.968911917098445, + "grad_norm": 0.9028845076930376, + "kl": 0.06884765625, + "learning_rate": 3.3678756476683938e-09, + "loss": -0.0009, + "reward": 2.499984622001648, + "reward_std": 5.207399226492271e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999847412109375, + "step": 3848 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.971502590673575, + "grad_norm": 0.11931827729087574, + "kl": 0.091064453125, + "learning_rate": 3.1088082901554402e-09, + "loss": 0.002, + "reward": 2.499998092651367, + "reward_std": 1.489894657424884e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999980926513672, + "step": 3849 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 9.974093264248705, + "grad_norm": 0.48138690088248665, + "kl": 0.103759765625, + "learning_rate": 2.8497409326424867e-09, + "loss": 0.0003, + "reward": 2.4999932050704956, + "reward_std": 4.633439800727501e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999932050704956, + "step": 3850 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0, + "epoch": 9.976683937823834, + "grad_norm": 2.031699084663647, + "kl": 0.126953125, + "learning_rate": 2.5906735751295336e-09, + "loss": 0.0006, + "reward": 2.499987840652466, + "reward_std": 1.0673266388039337e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999879002571106, + "step": 3851 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.979274611398964, + "grad_norm": 0.25072852257349465, + "kl": 0.1142578125, + "learning_rate": 2.33160621761658e-09, + "loss": 0.0008, + "reward": 2.4999982118606567, + "reward_std": 1.5817508369764255e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999982118606567, + "step": 3852 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.981865284974093, + "grad_norm": 0.5885331075726128, + "kl": 0.078125, + "learning_rate": 2.0725388601036265e-09, + "loss": -0.0004, + "reward": 2.499995470046997, + "reward_std": 2.6784398983181745e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999956488609314, + "step": 3853 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.984455958549223, + "grad_norm": 6.080897936325093, + "kl": 0.1163330078125, + "learning_rate": 1.8134715025906734e-09, + "loss": 0.0015, + "reward": 1.9884246587753296, + "reward_std": 0.00017769218521834773, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.4884245991706848, + "step": 3854 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.987046632124352, + "grad_norm": 2.4257752618732398, + "kl": 0.0537109375, + "learning_rate": 1.5544041450777201e-09, + "loss": -0.0004, + "reward": 2.499968409538269, + "reward_std": 1.0920646218437469e-05, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999685287475586, + "step": 3855 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.5, + "epoch": 9.989637305699482, + "grad_norm": 0.19120126786828592, + "kl": 0.041259765625, + "learning_rate": 1.2953367875647668e-09, + "loss": -0.0003, + "reward": 2.49999737739563, + "reward_std": 1.6218893961195135e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999972581863403, + "step": 3856 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.992227979274611, + "grad_norm": 1.238971487551735, + "kl": 0.427734375, + "learning_rate": 1.0362694300518133e-09, + "loss": 0.0004, + "reward": 2.4999948740005493, + "reward_std": 5.353793312679045e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999948740005493, + "step": 3857 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.5, + "epoch": 9.994818652849741, + "grad_norm": 0.3150135809566399, + "kl": 0.087158203125, + "learning_rate": 7.772020725388601e-10, + "loss": -0.0008, + "reward": 2.49999737739563, + "reward_std": 2.7057578790845582e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.9999974966049194, + "step": 3858 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.0, + "epoch": 9.99740932642487, + "grad_norm": 0.9723404840528603, + "kl": 0.21240234375, + "learning_rate": 5.181347150259066e-10, + "loss": 0.0011, + "reward": 2.49999737739563, + "reward_std": 1.815899395296583e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999997317790985, + "step": 3859 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.0, + "epoch": 10.0, + "grad_norm": 0.15840352572206634, + "kl": 0.0562744140625, + "learning_rate": 2.590673575129533e-10, + "loss": -0.0003, + "reward": 2.499996542930603, + "reward_std": 2.304913465422942e-06, + "rewards/format_reward_rec": 1.0, + "rewards/point_reward": 1.999996542930603, + "step": 3860 + } + ], + "logging_steps": 1.0, + "max_steps": 3860, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}