{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 3860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 74.75, "epoch": 0.0025906735751295338, "grad_norm": 23.654455434537084, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0, "reward": 0.7312208712100983, "reward_std": 0.3606285899877548, "rewards/format_reward_rec": 0.625, "rewards/point_reward": 0.41872087121009827, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 84.5, "epoch": 0.0051813471502590676, "grad_norm": 116.16598588589672, "kl": 0.0008373260498046875, "learning_rate": 9.99740932642487e-07, "loss": 0.0, "reward": 1.4451560974121094, "reward_std": 0.5504895597696304, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9451561868190765, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 46.5625, "epoch": 0.007772020725388601, "grad_norm": 25.86309876662035, "kl": 0.0008907318115234375, "learning_rate": 9.99481865284974e-07, "loss": 0.0, "reward": 1.7835919260978699, "reward_std": 0.2786169648170471, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2835918068885803, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 53.625, "epoch": 0.010362694300518135, "grad_norm": 10.949192686118256, "kl": 0.0010128021240234375, "learning_rate": 9.992227979274612e-07, "loss": 0.0, "reward": 2.124643087387085, "reward_std": 0.518008291721344, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.624643325805664, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.012953367875647668, "grad_norm": 4.030706300181012, "kl": 0.0002632737159729004, "learning_rate": 9.989637305699482e-07, "loss": 0.0, "reward": 2.3121966123580933, "reward_std": 0.2590037997251784, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8121966123580933, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 98.25, "epoch": 0.015544041450777202, "grad_norm": 22.143321089193286, "kl": 0.0044403076171875, "learning_rate": 9.987046632124352e-07, "loss": 0.0, "reward": 1.305199384689331, "reward_std": 0.7584892809391022, "rewards/format_reward_rec": 0.8125, "rewards/point_reward": 0.8989493250846863, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 69.125, "epoch": 0.018134715025906734, "grad_norm": 58.97024600782642, "kl": 0.01251220703125, "learning_rate": 9.984455958549224e-07, "loss": 0.0, "reward": 1.372310757637024, "reward_std": 0.2639093187171966, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 0.9035606682300568, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 46.1875, "epoch": 0.02072538860103627, "grad_norm": 11.096583407786104, "kl": 0.0140533447265625, "learning_rate": 9.981865284974092e-07, "loss": 0.0008, "reward": 2.3708999156951904, "reward_std": 0.23891268002080324, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8708999156951904, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 0.023316062176165803, "grad_norm": 31.15085797685794, "kl": 0.0009918212890625, "learning_rate": 9.979274611398964e-07, "loss": 0.0002, "reward": 2.277616500854492, "reward_std": 0.3077518731145119, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7776165008544922, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 59.25, "epoch": 0.025906735751295335, "grad_norm": 11.944424418473368, "kl": 0.0010204315185546875, "learning_rate": 9.976683937823834e-07, "loss": 0.0, "reward": 1.9366641640663147, "reward_std": 0.5268431305885315, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.43666410446167, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.02849740932642487, "grad_norm": 1.9135861015702393, "kl": 0.00021076202392578125, "learning_rate": 9.974093264248704e-07, "loss": -0.0, "reward": 2.4998377561569214, "reward_std": 0.00018063169272863888, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999837875366211, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.031088082901554404, "grad_norm": 40.10387354729789, "kl": 0.0004906654357910156, "learning_rate": 9.971502590673576e-07, "loss": 0.0, "reward": 1.9681448936462402, "reward_std": 0.37788383662700653, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4681448340415955, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 0.03367875647668394, "grad_norm": 24.163294004736, "kl": 0.00337982177734375, "learning_rate": 9.968911917098446e-07, "loss": 0.0, "reward": 1.4076377749443054, "reward_std": 0.12149995937943459, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9076377749443054, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 76.4375, "epoch": 0.03626943005181347, "grad_norm": 11.714799652216705, "kl": 0.00435638427734375, "learning_rate": 9.966321243523316e-07, "loss": 0.0, "reward": 1.733027458190918, "reward_std": 0.5577484518289566, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.2642774283885956, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.038860103626943004, "grad_norm": 50.83535728252055, "kl": 0.0007543563842773438, "learning_rate": 9.963730569948186e-07, "loss": -0.0001, "reward": 2.0652049779891968, "reward_std": 0.275846501095657, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5652050375938416, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.04145077720207254, "grad_norm": 4.241215850948085, "kl": 0.0006628036499023438, "learning_rate": 9.961139896373056e-07, "loss": -0.0004, "reward": 1.9996973872184753, "reward_std": 0.00032231332988885697, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996973872184753, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.04404145077720207, "grad_norm": 7.52715415012867, "kl": 0.010478973388671875, "learning_rate": 9.958549222797928e-07, "loss": 0.0, "reward": 1.852938175201416, "reward_std": 0.20573098585009575, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3529380559921265, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.046632124352331605, "grad_norm": 35.13336574259517, "kl": 0.00276947021484375, "learning_rate": 9.955958549222798e-07, "loss": 0.0, "reward": 1.7542864084243774, "reward_std": 0.30061637982726097, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.254286527633667, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.04922279792746114, "grad_norm": 5.882698374562189, "kl": 0.0012226104736328125, "learning_rate": 9.953367875647668e-07, "loss": 0.0004, "reward": 2.3103703260421753, "reward_std": 0.2616851614402549, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.810370147228241, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.05181347150259067, "grad_norm": 7.897835461185853, "kl": 0.001800537109375, "learning_rate": 9.950777202072538e-07, "loss": 0.0, "reward": 1.8443708419799805, "reward_std": 0.25012848898768425, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.344370722770691, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.054404145077720206, "grad_norm": 89.63936792842128, "kl": 0.0028400421142578125, "learning_rate": 9.948186528497408e-07, "loss": 0.0, "reward": 2.170512616634369, "reward_std": 0.27396823112940183, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6705125570297241, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 78.0, "epoch": 0.05699481865284974, "grad_norm": 17.490461875054127, "kl": 0.00286102294921875, "learning_rate": 9.94559585492228e-07, "loss": 0.0, "reward": 1.7859807014465332, "reward_std": 0.3069179803133011, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.285980761051178, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.05958549222797927, "grad_norm": 26.625503930558537, "kl": 0.0762939453125, "learning_rate": 9.94300518134715e-07, "loss": 0.0003, "reward": 1.976489543914795, "reward_std": 0.5621593296527863, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.476489543914795, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.06217616580310881, "grad_norm": 29.10365061874257, "kl": 0.0052947998046875, "learning_rate": 9.94041450777202e-07, "loss": 0.0, "reward": 1.4800074696540833, "reward_std": 0.21575853787362576, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9800075888633728, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.06476683937823834, "grad_norm": 9.194639911178484, "kl": 0.0052337646484375, "learning_rate": 9.937823834196892e-07, "loss": 0.0, "reward": 1.7446673512458801, "reward_std": 0.4378291368484497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2446672916412354, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.06735751295336788, "grad_norm": 23.585911751853935, "kl": 0.002048492431640625, "learning_rate": 9.93523316062176e-07, "loss": 0.0, "reward": 1.5376622080802917, "reward_std": 0.1939506810158491, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0376621782779694, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.06994818652849741, "grad_norm": 42.03244741480483, "kl": 0.01849365234375, "learning_rate": 9.932642487046632e-07, "loss": 0.0001, "reward": 1.9917437434196472, "reward_std": 0.3577324002981186, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4917437434196472, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 0.07253886010362694, "grad_norm": 7.6125781540856075, "kl": 0.01397705078125, "learning_rate": 9.930051813471502e-07, "loss": 0.0003, "reward": 2.1210156679153442, "reward_std": 0.2339139638661436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6210156679153442, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 0.07512953367875648, "grad_norm": 14.594557537197575, "kl": 0.0027618408203125, "learning_rate": 9.927461139896372e-07, "loss": 0.0, "reward": 2.1707316040992737, "reward_std": 0.4666369557380676, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6707317233085632, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 41.9375, "epoch": 0.07772020725388601, "grad_norm": 78.64780253448063, "kl": 0.01177978515625, "learning_rate": 9.924870466321244e-07, "loss": 0.0, "reward": 2.1214077472686768, "reward_std": 0.2336481891979929, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6214078664779663, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.08031088082901554, "grad_norm": 15.346949280838764, "kl": 0.020843505859375, "learning_rate": 9.922279792746114e-07, "loss": -0.0, "reward": 2.105500817298889, "reward_std": 0.24920060485601425, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.605500876903534, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 0.08290155440414508, "grad_norm": 28.799379071515, "kl": 0.013671875, "learning_rate": 9.919689119170984e-07, "loss": 0.0001, "reward": 1.8799359798431396, "reward_std": 0.20554822124540806, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3799359798431396, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.08549222797927461, "grad_norm": 30.874748179392196, "kl": 0.055267333984375, "learning_rate": 9.917098445595854e-07, "loss": 0.0001, "reward": 2.1147689819335938, "reward_std": 0.23852075126254135, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.614769160747528, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 33.0, "epoch": 0.08808290155440414, "grad_norm": 21.413339376847578, "kl": 0.037353515625, "learning_rate": 9.914507772020724e-07, "loss": -0.0001, "reward": 2.4374749660491943, "reward_std": 0.1768060198804733, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374751448631287, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.09067357512953368, "grad_norm": 11.735518776968457, "kl": 0.01519775390625, "learning_rate": 9.911917098445596e-07, "loss": 0.0001, "reward": 2.1221890449523926, "reward_std": 0.4398344159126282, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6221890449523926, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.09326424870466321, "grad_norm": 19.228717380259603, "kl": 0.047607421875, "learning_rate": 9.909326424870466e-07, "loss": 0.0002, "reward": 1.8104677200317383, "reward_std": 0.2632200005464256, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3104676604270935, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.09585492227979274, "grad_norm": 40.120842681212686, "kl": 0.01470184326171875, "learning_rate": 9.906735751295336e-07, "loss": 0.0001, "reward": 2.0619872212409973, "reward_std": 0.5265199542045593, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5619871020317078, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 41.75, "epoch": 0.09844559585492228, "grad_norm": 22.11093878199278, "kl": 0.01458740234375, "learning_rate": 9.904145077720206e-07, "loss": 0.0001, "reward": 2.207720160484314, "reward_std": 0.5386560559272766, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7077201008796692, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.10103626943005181, "grad_norm": 14.788404143155422, "kl": 0.00701904296875, "learning_rate": 9.901554404145076e-07, "loss": -0.0007, "reward": 2.310261368751526, "reward_std": 0.26141046830311154, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8102614879608154, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.10362694300518134, "grad_norm": 1.7337144036341858, "kl": 0.0750732421875, "learning_rate": 9.898963730569949e-07, "loss": 0.0008, "reward": 2.4999849796295166, "reward_std": 1.147488831065857e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999849200248718, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 122.375, "epoch": 0.10621761658031088, "grad_norm": 7.857568618104513, "kl": 0.011962890625, "learning_rate": 9.896373056994819e-07, "loss": 0.0, "reward": 2.2498103380203247, "reward_std": 0.4357842653989792, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498104572296143, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.10880829015544041, "grad_norm": 26.42632814430377, "kl": 0.0374755859375, "learning_rate": 9.893782383419688e-07, "loss": 0.0002, "reward": 1.9356898665428162, "reward_std": 0.17737593466881663, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4356898367404938, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 0.11139896373056994, "grad_norm": 19.882775926206442, "kl": 0.0291748046875, "learning_rate": 9.89119170984456e-07, "loss": 0.0001, "reward": 1.6723498702049255, "reward_std": 0.4186931699514389, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1723498702049255, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.11398963730569948, "grad_norm": 25.952155665863092, "kl": 0.10906982421875, "learning_rate": 9.888601036269428e-07, "loss": 0.0004, "reward": 1.4955262541770935, "reward_std": 0.003271562047302723, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9955263733863831, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.11658031088082901, "grad_norm": 440.21761847167636, "kl": 0.021759033203125, "learning_rate": 9.8860103626943e-07, "loss": -0.0007, "reward": 2.3124141693115234, "reward_std": 0.2588567412449265, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124142289161682, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 37.0625, "epoch": 0.11917098445595854, "grad_norm": 12.536078202267825, "kl": 0.05230712890625, "learning_rate": 9.88341968911917e-07, "loss": 0.0, "reward": 1.990914225578308, "reward_std": 0.022514314281579573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4909144341945648, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 37.4375, "epoch": 0.12176165803108809, "grad_norm": 223.592433529062, "kl": 0.07373046875, "learning_rate": 9.88082901554404e-07, "loss": 0.0003, "reward": 2.2822933197021484, "reward_std": 0.4742845743894577, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7822932600975037, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.12435233160621761, "grad_norm": 32.36471243650293, "kl": 0.011932373046875, "learning_rate": 9.878238341968913e-07, "loss": -0.0001, "reward": 2.3654046058654785, "reward_std": 0.2500063071856857, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8654048442840576, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.12694300518134716, "grad_norm": 71.24853896041392, "kl": 0.011688232421875, "learning_rate": 9.875647668393783e-07, "loss": 0.0003, "reward": 2.1613352298736572, "reward_std": 0.28758263627048564, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6613351106643677, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.12953367875647667, "grad_norm": 21.11629785246926, "kl": 0.0303955078125, "learning_rate": 9.873056994818653e-07, "loss": 0.0002, "reward": 2.2455525398254395, "reward_std": 0.272227193647268, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7455525994300842, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 0.13212435233160622, "grad_norm": 18.664911497214902, "kl": 0.02557373046875, "learning_rate": 9.870466321243523e-07, "loss": -0.0003, "reward": 1.7885136008262634, "reward_std": 0.2233308469039912, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.3197636902332306, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.13471502590673576, "grad_norm": 99.49804222052887, "kl": 0.0521240234375, "learning_rate": 9.867875647668393e-07, "loss": 0.0004, "reward": 1.9976779222488403, "reward_std": 0.006293868353168364, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497677743434906, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.13730569948186527, "grad_norm": 17.244449800342725, "kl": 0.010955810546875, "learning_rate": 9.865284974093265e-07, "loss": -0.0, "reward": 2.4999852180480957, "reward_std": 1.7544890738463437e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850988388062, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 0.13989637305699482, "grad_norm": 32.37944592695846, "kl": 0.027099609375, "learning_rate": 9.862694300518135e-07, "loss": -0.0, "reward": 1.8870325088500977, "reward_std": 0.1153705872293358, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3870326280593872, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.14248704663212436, "grad_norm": 19.480430245726204, "kl": 0.02496337890625, "learning_rate": 9.860103626943005e-07, "loss": -0.0, "reward": 2.4373843669891357, "reward_std": 0.17695820160315634, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373842477798462, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.14507772020725387, "grad_norm": 16.989833103971012, "kl": 0.12823486328125, "learning_rate": 9.857512953367875e-07, "loss": 0.0005, "reward": 1.8051514029502869, "reward_std": 0.45061106979846954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3051514625549316, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 0.14766839378238342, "grad_norm": 31.02175686915555, "kl": 0.01904296875, "learning_rate": 9.854922279792745e-07, "loss": 0.0001, "reward": 1.969981074333191, "reward_std": 0.6297044456005096, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4699811935424805, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.15025906735751296, "grad_norm": 10.235665746059144, "kl": 0.0089874267578125, "learning_rate": 9.852331606217617e-07, "loss": -0.0004, "reward": 2.0624454021453857, "reward_std": 0.1768003513025178, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.56244558095932, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.15284974093264247, "grad_norm": 2.272577397842421, "kl": 0.0142364501953125, "learning_rate": 9.849740932642487e-07, "loss": -0.0006, "reward": 1.9982001185417175, "reward_std": 3.705472454385017e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982001781463623, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.15544041450777202, "grad_norm": 9.843036544453048, "kl": 0.0830078125, "learning_rate": 9.847150259067357e-07, "loss": -0.0001, "reward": 2.437375068664551, "reward_std": 0.17711830667053619, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373750686645508, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.15803108808290156, "grad_norm": 158.25531896737053, "kl": 0.0689697265625, "learning_rate": 9.844559585492227e-07, "loss": 0.0003, "reward": 1.7639578580856323, "reward_std": 0.3546312153339386, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2639578580856323, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.16062176165803108, "grad_norm": 3.0447356599273157, "kl": 0.03656005859375, "learning_rate": 9.841968911917097e-07, "loss": 0.0004, "reward": 2.4999831914901733, "reward_std": 1.480638820794411e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.16321243523316062, "grad_norm": 4.393337788792737, "kl": 0.0108184814453125, "learning_rate": 9.83937823834197e-07, "loss": 0.0001, "reward": 2.4374507665634155, "reward_std": 0.17682846984826028, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374507069587708, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.16580310880829016, "grad_norm": 27.899095141519734, "kl": 0.066162109375, "learning_rate": 9.83678756476684e-07, "loss": 0.001, "reward": 2.168538749217987, "reward_std": 0.2762052078041961, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6685386896133423, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.16839378238341968, "grad_norm": 5.751242569952515, "kl": 0.03411865234375, "learning_rate": 9.83419689119171e-07, "loss": 0.0004, "reward": 2.499942421913147, "reward_std": 5.2554929425241426e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999422430992126, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.17098445595854922, "grad_norm": 477.3372362698823, "kl": 0.074462890625, "learning_rate": 9.831606217616581e-07, "loss": 0.0003, "reward": 1.779470443725586, "reward_std": 0.3095453269779682, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.279470443725586, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.17357512953367876, "grad_norm": 6.576840283054051, "kl": 0.02679443359375, "learning_rate": 9.829015544041451e-07, "loss": 0.0001, "reward": 1.9997954368591309, "reward_std": 0.0003149642052449053, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997954964637756, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.17616580310880828, "grad_norm": 2.520279174644408, "kl": 0.022705078125, "learning_rate": 9.826424870466321e-07, "loss": 0.0001, "reward": 2.4999767541885376, "reward_std": 1.9422466266405536e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999768733978271, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 52.4375, "epoch": 0.17875647668393782, "grad_norm": 8.888223161773142, "kl": 0.0467529296875, "learning_rate": 9.823834196891191e-07, "loss": 0.0008, "reward": 2.3098913431167603, "reward_std": 0.2624187994371141, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8098912835121155, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.18134715025906736, "grad_norm": 16.12291142201368, "kl": 0.18121337890625, "learning_rate": 9.821243523316061e-07, "loss": 0.0012, "reward": 2.37497341632843, "reward_std": 0.2314961755520244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874973475933075, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.18393782383419688, "grad_norm": 48.154326938669676, "kl": 0.01806640625, "learning_rate": 9.818652849740933e-07, "loss": 0.0009, "reward": 2.4999828338623047, "reward_std": 1.1697421086864779e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999827146530151, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.18652849740932642, "grad_norm": 0.45065452719536897, "kl": 0.04071807861328125, "learning_rate": 9.816062176165803e-07, "loss": 0.0, "reward": 2.4999879598617554, "reward_std": 3.2788135513328598e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.18911917098445596, "grad_norm": 8.04305170993719, "kl": 0.04095458984375, "learning_rate": 9.813471502590673e-07, "loss": -0.0001, "reward": 2.437377095222473, "reward_std": 0.17711324256833905, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373770952224731, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 47.0, "epoch": 0.19170984455958548, "grad_norm": 10.295445088160703, "kl": 0.073486328125, "learning_rate": 9.810880829015543e-07, "loss": -0.0001, "reward": 2.4372644424438477, "reward_std": 0.1773603390867038, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372645020484924, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.19430051813471502, "grad_norm": 45.24804202411124, "kl": 0.01751708984375, "learning_rate": 9.808290155440413e-07, "loss": 0.0002, "reward": 2.1242417097091675, "reward_std": 0.23192374470909272, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6242417097091675, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.19689119170984457, "grad_norm": 7.3772506622853244, "kl": 0.03271484375, "learning_rate": 9.805699481865285e-07, "loss": -0.0005, "reward": 1.9985507726669312, "reward_std": 3.881182465192978e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985509514808655, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.19948186528497408, "grad_norm": 9.351526793083345, "kl": 0.0634765625, "learning_rate": 9.803108808290155e-07, "loss": 0.0001, "reward": 2.4997767210006714, "reward_std": 5.034307181972508e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997768998146057, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 0.20207253886010362, "grad_norm": 52.616566611429114, "kl": 0.061767578125, "learning_rate": 9.800518134715025e-07, "loss": 0.0005, "reward": 2.124193847179413, "reward_std": 0.23194783757702453, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.624193787574768, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.20466321243523317, "grad_norm": 3.326113527311215, "kl": 0.064453125, "learning_rate": 9.797927461139895e-07, "loss": -0.001, "reward": 2.4999313354492188, "reward_std": 3.320495090974873e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999314546585083, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 0.20725388601036268, "grad_norm": 42.73536840883849, "kl": 0.0462646484375, "learning_rate": 9.795336787564765e-07, "loss": 0.0006, "reward": 1.9181513786315918, "reward_std": 0.03343612557546294, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4181513786315918, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 33.9375, "epoch": 0.20984455958549222, "grad_norm": 36.79017488467275, "kl": 0.06671142578125, "learning_rate": 9.792746113989637e-07, "loss": 0.0003, "reward": 1.7636473178863525, "reward_std": 0.3487439304590225, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2636472582817078, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.21243523316062177, "grad_norm": 32.742835637572625, "kl": 0.04803466796875, "learning_rate": 9.790155440414507e-07, "loss": -0.0, "reward": 1.9988058805465698, "reward_std": 0.0005170259537408128, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988059401512146, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.21502590673575128, "grad_norm": 3.1685855541401837, "kl": 0.04400634765625, "learning_rate": 9.787564766839377e-07, "loss": -0.0004, "reward": 2.499987483024597, "reward_std": 1.1137093110846763e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987542629242, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 38.875, "epoch": 0.21761658031088082, "grad_norm": 11.364834215005686, "kl": 0.02508544921875, "learning_rate": 9.784974093264247e-07, "loss": -0.0002, "reward": 2.4373650550842285, "reward_std": 0.1770545343403569, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373650550842285, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.22020725388601037, "grad_norm": 15.226201692955948, "kl": 0.075439453125, "learning_rate": 9.78238341968912e-07, "loss": 0.0003, "reward": 2.4999794960021973, "reward_std": 1.823106958909193e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979555606842, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.22279792746113988, "grad_norm": 28.408544891966834, "kl": 0.1396484375, "learning_rate": 9.77979274611399e-07, "loss": 0.0003, "reward": 2.312247633934021, "reward_std": 0.2590706118817252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122477531433105, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.22538860103626943, "grad_norm": 4.878505935956497, "kl": 0.1602783203125, "learning_rate": 9.77720207253886e-07, "loss": 0.0009, "reward": 2.499765992164612, "reward_std": 2.0208295097745577e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999765932559967, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 38.375, "epoch": 0.22797927461139897, "grad_norm": 97.4598946672193, "kl": 0.08349609375, "learning_rate": 9.77461139896373e-07, "loss": 0.0003, "reward": 2.046730399131775, "reward_std": 0.5622504651546478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5467304587364197, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.23056994818652848, "grad_norm": 128.12050745642023, "kl": 0.15771484375, "learning_rate": 9.772020725388602e-07, "loss": 0.0007, "reward": 1.9192970991134644, "reward_std": 0.016997356389765628, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4192971885204315, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.23316062176165803, "grad_norm": 7.872235002892737, "kl": 0.076171875, "learning_rate": 9.769430051813472e-07, "loss": 0.0007, "reward": 1.4971758127212524, "reward_std": 0.0002577869486231066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.99717578291893, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.23575129533678757, "grad_norm": 4.245160239550034, "kl": 0.04241943359375, "learning_rate": 9.766839378238342e-07, "loss": -0.0008, "reward": 2.4997618198394775, "reward_std": 8.034832262637792e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999761939048767, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.23834196891191708, "grad_norm": 19.856309903921574, "kl": 0.140869140625, "learning_rate": 9.764248704663212e-07, "loss": 0.0004, "reward": 2.4374531507492065, "reward_std": 0.1768775479206397, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374531507492065, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.24093264248704663, "grad_norm": 1.1579663764472719, "kl": 0.0400390625, "learning_rate": 9.761658031088082e-07, "loss": -0.0004, "reward": 2.4999840259552, "reward_std": 5.818617864861153e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999842047691345, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.24352331606217617, "grad_norm": 1.204567997031168, "kl": 0.088134765625, "learning_rate": 9.759067357512954e-07, "loss": 0.0006, "reward": 2.499993324279785, "reward_std": 5.7656898206914775e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.24611398963730569, "grad_norm": 21.05544770088383, "kl": 0.126983642578125, "learning_rate": 9.756476683937824e-07, "loss": 0.0003, "reward": 1.9998608827590942, "reward_std": 4.4506206535288584e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998611509799957, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.24870466321243523, "grad_norm": 32.751306013317446, "kl": 0.10589599609375, "learning_rate": 9.753886010362694e-07, "loss": 0.0004, "reward": 2.1240326166152954, "reward_std": 0.5003382712602615, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6240326166152954, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 40.375, "epoch": 0.25129533678756477, "grad_norm": 79.12597534103841, "kl": 0.04833984375, "learning_rate": 9.751295336787564e-07, "loss": 0.0005, "reward": 1.9821822047233582, "reward_std": 0.003102937228504743, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4821821451187134, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 39.625, "epoch": 0.2538860103626943, "grad_norm": 27.473384832506696, "kl": 0.093017578125, "learning_rate": 9.748704663212434e-07, "loss": 0.0004, "reward": 1.7839117050170898, "reward_std": 0.462258443236351, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2839117050170898, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.25647668393782386, "grad_norm": 0.5434623478158119, "kl": 0.028076171875, "learning_rate": 9.746113989637306e-07, "loss": 0.0002, "reward": 2.4999914169311523, "reward_std": 5.65391962936701e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.25906735751295334, "grad_norm": 38.41368275142002, "kl": 0.05322265625, "learning_rate": 9.743523316062176e-07, "loss": -0.0002, "reward": 1.8973362445831299, "reward_std": 0.24990517766491394, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.397336184978485, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.2616580310880829, "grad_norm": 7.520692088913739, "kl": 0.037841796875, "learning_rate": 9.740932642487046e-07, "loss": -0.0002, "reward": 2.4999399185180664, "reward_std": 4.300871478335466e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999399185180664, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.26424870466321243, "grad_norm": 16.04694790274661, "kl": 0.03875732421875, "learning_rate": 9.738341968911916e-07, "loss": -0.0003, "reward": 2.3124669790267944, "reward_std": 0.25882471234763216, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812467098236084, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 108.3125, "epoch": 0.266839378238342, "grad_norm": 45.86080375277014, "kl": 0.0531005859375, "learning_rate": 9.735751295336788e-07, "loss": 0.0002, "reward": 1.9918814897537231, "reward_std": 0.005455598112348525, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4918816089630127, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.2694300518134715, "grad_norm": 29.67608753082564, "kl": 0.02471923828125, "learning_rate": 9.733160621761658e-07, "loss": -0.0008, "reward": 2.437479257583618, "reward_std": 0.1768319597942991, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374792575836182, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.27202072538860106, "grad_norm": 27.205285721754063, "kl": 0.0614013671875, "learning_rate": 9.730569948186528e-07, "loss": 0.0002, "reward": 1.999179720878601, "reward_std": 6.279310655088466e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499179720878601, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.27461139896373055, "grad_norm": 23.54038740688223, "kl": 0.0628662109375, "learning_rate": 9.727979274611398e-07, "loss": 0.0001, "reward": 2.499894142150879, "reward_std": 0.00012795658221875783, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999894142150879, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.2772020725388601, "grad_norm": 26.079557447642664, "kl": 0.04766845703125, "learning_rate": 9.725388601036268e-07, "loss": -0.0001, "reward": 2.499938726425171, "reward_std": 5.921874435443897e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999386668205261, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.27979274611398963, "grad_norm": 3.2653955008843685, "kl": 0.042724609375, "learning_rate": 9.72279792746114e-07, "loss": 0.0006, "reward": 2.4999871253967285, "reward_std": 1.7469743397668935e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871253967285, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.2823834196891192, "grad_norm": 0.77165706731983, "kl": 0.0784912109375, "learning_rate": 9.72020725388601e-07, "loss": 0.0, "reward": 2.4999911785125732, "reward_std": 4.967317295268003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.2849740932642487, "grad_norm": 32.97546419525307, "kl": 0.04766845703125, "learning_rate": 9.71761658031088e-07, "loss": -0.0, "reward": 2.4998066425323486, "reward_std": 0.0001222240025526844, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998065829277039, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.28756476683937826, "grad_norm": 26.18065551862498, "kl": 0.4599609375, "learning_rate": 9.71502590673575e-07, "loss": 0.0018, "reward": 1.5519845485687256, "reward_std": 0.5864746570587158, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.051984578371048, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 0.29015544041450775, "grad_norm": 11.987697989962049, "kl": 0.0703125, "learning_rate": 9.712435233160622e-07, "loss": 0.0003, "reward": 2.3124321699142456, "reward_std": 0.4082608222961426, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124321699142456, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.2927461139896373, "grad_norm": 14.93133180939127, "kl": 0.076171875, "learning_rate": 9.709844559585492e-07, "loss": 0.0003, "reward": 1.4983919262886047, "reward_std": 0.0004074995667906478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9983920156955719, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.29533678756476683, "grad_norm": 17.387245286485452, "kl": 0.1103515625, "learning_rate": 9.707253886010362e-07, "loss": 0.0003, "reward": 1.9763047695159912, "reward_std": 0.0003290466993348673, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.476304829120636, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.2979274611398964, "grad_norm": 1.0608114568618325, "kl": 0.0572509765625, "learning_rate": 9.704663212435232e-07, "loss": -0.0001, "reward": 2.499966025352478, "reward_std": 7.64698111765938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999966025352478, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3005181347150259, "grad_norm": 3.8357029735696653, "kl": 0.0596923828125, "learning_rate": 9.702072538860102e-07, "loss": -0.0006, "reward": 2.499981641769409, "reward_std": 1.4318110288513708e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.30310880829015546, "grad_norm": 28.50617927919992, "kl": 0.11328125, "learning_rate": 9.699481865284974e-07, "loss": 0.0009, "reward": 2.0620652437210083, "reward_std": 0.17692725664164755, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5620651841163635, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.30569948186528495, "grad_norm": 58.630081061738984, "kl": 0.249908447265625, "learning_rate": 9.696891191709844e-07, "loss": 0.0011, "reward": 2.437382221221924, "reward_std": 0.17709060247125308, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373822808265686, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.3082901554404145, "grad_norm": 22.45252294313057, "kl": 0.0509033203125, "learning_rate": 9.694300518134714e-07, "loss": 0.0005, "reward": 2.374982476234436, "reward_std": 0.23147946750691517, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749824166297913, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.31088082901554404, "grad_norm": 14.221096544137483, "kl": 0.07177734375, "learning_rate": 9.691709844559584e-07, "loss": 0.0003, "reward": 2.249211013317108, "reward_std": 0.2680665226096153, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749211072921753, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3134715025906736, "grad_norm": 0.4332777968107589, "kl": 0.0465087890625, "learning_rate": 9.689119170984456e-07, "loss": -0.0001, "reward": 2.4999940395355225, "reward_std": 3.8427937170126825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3160621761658031, "grad_norm": 4.081538918236511, "kl": 0.06640625, "learning_rate": 9.686528497409326e-07, "loss": 0.0004, "reward": 2.4999542236328125, "reward_std": 3.70945485883567e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999954342842102, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.31865284974093266, "grad_norm": 3.057592866617649, "kl": 0.16015625, "learning_rate": 9.683937823834196e-07, "loss": 0.0005, "reward": 2.499969244003296, "reward_std": 2.595815294625936e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999693632125854, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 58.375, "epoch": 0.32124352331606215, "grad_norm": 30.824703542855783, "kl": 0.062255859375, "learning_rate": 9.681347150259066e-07, "loss": 0.0002, "reward": 1.3506267666816711, "reward_std": 0.17951567331328988, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8506268262863159, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 50.8125, "epoch": 0.3238341968911917, "grad_norm": 1.7148905769763336, "kl": 0.283203125, "learning_rate": 9.678756476683936e-07, "loss": 0.0018, "reward": 2.4999903440475464, "reward_std": 4.849738161283312e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.32642487046632124, "grad_norm": 2.8495573272255075, "kl": 0.068359375, "learning_rate": 9.676165803108809e-07, "loss": 0.0008, "reward": 2.499986410140991, "reward_std": 2.3382104700431228e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999862909317017, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 51.0, "epoch": 0.3290155440414508, "grad_norm": 0.18756382569230234, "kl": 0.125, "learning_rate": 9.673575129533679e-07, "loss": 0.0001, "reward": 2.4999979734420776, "reward_std": 8.522496557361592e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.3316062176165803, "grad_norm": 778.9687555253842, "kl": 0.0472412109375, "learning_rate": 9.670984455958549e-07, "loss": 0.0003, "reward": 2.4368550777435303, "reward_std": 0.1785650884781944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9368551969528198, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.33419689119170987, "grad_norm": 7.71128449770404, "kl": 0.063232421875, "learning_rate": 9.668393782383419e-07, "loss": 0.0005, "reward": 1.6834460496902466, "reward_std": 0.00025988063225668157, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1834460347890854, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.33678756476683935, "grad_norm": 0.12321829598157957, "kl": 0.1025390625, "learning_rate": 9.665803108808289e-07, "loss": -0.0003, "reward": 2.4999972581863403, "reward_std": 1.781640165177123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 38.9375, "epoch": 0.3393782383419689, "grad_norm": 20.68392232619108, "kl": 0.2828369140625, "learning_rate": 9.66321243523316e-07, "loss": 0.0016, "reward": 2.1773712038993835, "reward_std": 0.2682817817618002, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6773712635040283, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.34196891191709844, "grad_norm": 6.78757121716985, "kl": 0.0501708984375, "learning_rate": 9.66062176165803e-07, "loss": 0.0007, "reward": 2.4999715089797974, "reward_std": 1.4924235529178986e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715089797974, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 0.344559585492228, "grad_norm": 0.6309569083353512, "kl": 0.11572265625, "learning_rate": 9.6580310880829e-07, "loss": 0.0011, "reward": 2.4999911785125732, "reward_std": 8.16447891338612e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 40.0, "epoch": 0.3471502590673575, "grad_norm": 17.051855534402925, "kl": 0.278076171875, "learning_rate": 9.655440414507773e-07, "loss": 0.0011, "reward": 1.4844253063201904, "reward_std": 0.0008330822111020098, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9844253659248352, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.34974093264248707, "grad_norm": 0.8087106122070162, "kl": 0.0784912109375, "learning_rate": 9.652849740932643e-07, "loss": 0.0002, "reward": 2.499993324279785, "reward_std": 2.6888594675256172e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.35233160621761656, "grad_norm": 2.00244009573784, "kl": 0.050537109375, "learning_rate": 9.650259067357513e-07, "loss": 0.0009, "reward": 2.499998092651367, "reward_std": 1.2544903142952535e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 40.875, "epoch": 0.3549222797927461, "grad_norm": 86.40512913043219, "kl": 0.15673828125, "learning_rate": 9.647668393782383e-07, "loss": 0.0006, "reward": 1.9180601835250854, "reward_std": 0.23149480670690536, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.418060064315796, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 47.25, "epoch": 0.35751295336787564, "grad_norm": 1.0172748127032223, "kl": 0.08349609375, "learning_rate": 9.645077720207253e-07, "loss": 0.0001, "reward": 2.4999914169311523, "reward_std": 8.198011016702367e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.3601036269430052, "grad_norm": 9.077370178758427, "kl": 0.0606689453125, "learning_rate": 9.642487046632125e-07, "loss": 0.0011, "reward": 2.4999910593032837, "reward_std": 1.1874815385226611e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 34.25, "epoch": 0.3626943005181347, "grad_norm": 13.330073377494674, "kl": 0.075439453125, "learning_rate": 9.639896373056995e-07, "loss": -0.0001, "reward": 1.8144383430480957, "reward_std": 0.03691703302320093, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.314438372850418, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.36528497409326427, "grad_norm": 1.4454602214883316, "kl": 0.104736328125, "learning_rate": 9.637305699481865e-07, "loss": 0.0006, "reward": 1.9923112392425537, "reward_std": 4.800811608163258e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4923112392425537, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.36787564766839376, "grad_norm": 3.462416950574483, "kl": 0.0382080078125, "learning_rate": 9.634715025906735e-07, "loss": 0.0011, "reward": 2.499909281730652, "reward_std": 2.855958700820338e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999091625213623, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.3704663212435233, "grad_norm": 60.28321024389314, "kl": 0.236328125, "learning_rate": 9.632124352331605e-07, "loss": 0.0009, "reward": 2.290057420730591, "reward_std": 0.47163158655166626, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7900574207305908, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.37305699481865284, "grad_norm": 54.576736762263884, "kl": 0.15625, "learning_rate": 9.629533678756477e-07, "loss": 0.0006, "reward": 1.8109760880470276, "reward_std": 0.4447221904993057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3109761476516724, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.3756476683937824, "grad_norm": 44.47467040389592, "kl": 0.0899658203125, "learning_rate": 9.626943005181347e-07, "loss": 0.0004, "reward": 1.9187846779823303, "reward_std": 0.22809253074228764, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4187846779823303, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.37823834196891193, "grad_norm": 96.56119524359266, "kl": 0.1026611328125, "learning_rate": 9.624352331606217e-07, "loss": 0.0003, "reward": 1.6752909421920776, "reward_std": 0.26892834760656115, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1752910017967224, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.38082901554404147, "grad_norm": 1.301215497093169, "kl": 0.100830078125, "learning_rate": 9.621761658031087e-07, "loss": 0.0004, "reward": 1.4999969005584717, "reward_std": 2.882935859815916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999970197677612, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.38341968911917096, "grad_norm": 0.1716691461971046, "kl": 0.079345703125, "learning_rate": 9.619170984455957e-07, "loss": 0.0008, "reward": 2.499995470046997, "reward_std": 1.2729808815947763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.3860103626943005, "grad_norm": 4.61720601482355, "kl": 0.091064453125, "learning_rate": 9.61658031088083e-07, "loss": -0.0002, "reward": 2.499967575073242, "reward_std": 1.9026708287128713e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967634677887, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.38860103626943004, "grad_norm": 2.7234764612443025, "kl": 0.106689453125, "learning_rate": 9.6139896373057e-07, "loss": 0.0004, "reward": 1.99959796667099, "reward_std": 3.200235278200125e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995981454849243, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.3911917098445596, "grad_norm": 27.998260941191134, "kl": 0.22216796875, "learning_rate": 9.61139896373057e-07, "loss": 0.0014, "reward": 2.4323630332946777, "reward_std": 0.19104300345202319, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.932362973690033, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.39378238341968913, "grad_norm": 25.226219173819224, "kl": 0.0477294921875, "learning_rate": 9.608808290155441e-07, "loss": 0.0005, "reward": 2.0568660497665405, "reward_std": 0.17905486353720335, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5568661093711853, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.3963730569948187, "grad_norm": 0.3675034917975679, "kl": 0.1015625, "learning_rate": 9.60621761658031e-07, "loss": 0.001, "reward": 2.499988555908203, "reward_std": 4.78217737054365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884366989136, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.39896373056994816, "grad_norm": 2.1595096987913247, "kl": 0.1748046875, "learning_rate": 9.603626943005181e-07, "loss": 0.0004, "reward": 2.4999899864196777, "reward_std": 9.273415173538524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989926815033, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.4015544041450777, "grad_norm": 9.850022537964898, "kl": 0.07366943359375, "learning_rate": 9.601036269430051e-07, "loss": -0.0003, "reward": 2.4999306201934814, "reward_std": 6.255799416976515e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999307990074158, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.40414507772020725, "grad_norm": 2.86835444425271, "kl": 0.060791015625, "learning_rate": 9.598445595854921e-07, "loss": 0.0004, "reward": 1.9994065761566162, "reward_std": 1.8171605915995315e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994065165519714, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.4067357512953368, "grad_norm": 38.0932528609861, "kl": 0.1790771484375, "learning_rate": 9.595854922279793e-07, "loss": 0.0001, "reward": 2.3749828338623047, "reward_std": 0.23148394337079026, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749828338623047, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.40932642487046633, "grad_norm": 1.357234341081675, "kl": 0.0810546875, "learning_rate": 9.593264248704663e-07, "loss": 0.0003, "reward": 2.499979019165039, "reward_std": 7.719876748524257e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999790787696838, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.4119170984455959, "grad_norm": 27.267310751150433, "kl": 0.119873046875, "learning_rate": 9.590673575129533e-07, "loss": 0.0006, "reward": 1.298931360244751, "reward_std": 0.052430289819767495, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7989313006401062, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.41450777202072536, "grad_norm": 88.60653699281134, "kl": 0.059326171875, "learning_rate": 9.588082901554403e-07, "loss": 0.0002, "reward": 1.8697761297225952, "reward_std": 0.029779866188619053, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3697762191295624, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.4170984455958549, "grad_norm": 131.16604901041597, "kl": 0.28564453125, "learning_rate": 9.585492227979273e-07, "loss": 0.001, "reward": 1.811118245124817, "reward_std": 0.259682998766948, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3111181557178497, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.41968911917098445, "grad_norm": 16.611391592845404, "kl": 0.059326171875, "learning_rate": 9.582901554404145e-07, "loss": 0.0002, "reward": 1.9990431666374207, "reward_std": 0.0003346551386584906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499043047428131, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.422279792746114, "grad_norm": 3.8076856023797783, "kl": 0.0833740234375, "learning_rate": 9.580310880829015e-07, "loss": 0.001, "reward": 1.999815583229065, "reward_std": 2.8820414001984318e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998155236244202, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.42487046632124353, "grad_norm": 0.22775821733372068, "kl": 0.0638427734375, "learning_rate": 9.577720207253885e-07, "loss": -0.0009, "reward": 2.499998092651367, "reward_std": 7.127880792268115e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.4274611398963731, "grad_norm": 43.12820905419924, "kl": 0.165771484375, "learning_rate": 9.575129533678755e-07, "loss": 0.0007, "reward": 1.300038456916809, "reward_std": 0.3335072639383725, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8000384271144867, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.43005181347150256, "grad_norm": 80.73777443500676, "kl": 0.1455078125, "learning_rate": 9.572538860103625e-07, "loss": 0.0005, "reward": 2.1822879910469055, "reward_std": 0.26322523210546933, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6822880506515503, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4326424870466321, "grad_norm": 1.3935043950254644, "kl": 0.097412109375, "learning_rate": 9.569948186528497e-07, "loss": 0.0001, "reward": 2.499988317489624, "reward_std": 1.0444003009979497e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.43523316062176165, "grad_norm": 34.34007157339578, "kl": 0.066162109375, "learning_rate": 9.567357512953367e-07, "loss": -0.0005, "reward": 2.2498570680618286, "reward_std": 0.2673697262735004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498571276664734, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 0.4378238341968912, "grad_norm": 94.54455842469612, "kl": 0.1337890625, "learning_rate": 9.564766839378237e-07, "loss": 0.0005, "reward": 1.9985234141349792, "reward_std": 0.48838643729686737, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498523473739624, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.44041450777202074, "grad_norm": 6.9718706237515224, "kl": 0.048095703125, "learning_rate": 9.56217616580311e-07, "loss": 0.0003, "reward": 2.4999825954437256, "reward_std": 1.2279092800326907e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999827146530151, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4430051813471503, "grad_norm": 0.5937469623650423, "kl": 0.086669921875, "learning_rate": 9.559585492227977e-07, "loss": 0.0018, "reward": 2.4999914169311523, "reward_std": 5.022113668928796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.44559585492227977, "grad_norm": 2.9376354932508058, "kl": 0.105224609375, "learning_rate": 9.55699481865285e-07, "loss": 0.0016, "reward": 2.4999964237213135, "reward_std": 3.2443115287605906e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.4481865284974093, "grad_norm": 193.67818409560059, "kl": 0.077880859375, "learning_rate": 9.55440414507772e-07, "loss": 0.0009, "reward": 2.0455445051193237, "reward_std": 0.1841201360700211, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5455445051193237, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.45077720207253885, "grad_norm": 6.210457951400512, "kl": 0.078857421875, "learning_rate": 9.55181347150259e-07, "loss": -0.0004, "reward": 1.9924424290657043, "reward_std": 6.803595104543092e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492442548274994, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.4533678756476684, "grad_norm": 169.98982672715994, "kl": 0.0977783203125, "learning_rate": 9.549222797927462e-07, "loss": 0.0001, "reward": 2.49991774559021, "reward_std": 6.647250665992033e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999178647994995, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.45595854922279794, "grad_norm": 39.87688544866592, "kl": 0.064453125, "learning_rate": 9.546632124352332e-07, "loss": 0.0004, "reward": 2.436307191848755, "reward_std": 0.17962476573848107, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9363073706626892, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4585492227979275, "grad_norm": 9.503614910000897, "kl": 0.03948974609375, "learning_rate": 9.544041450777202e-07, "loss": -0.0003, "reward": 1.998598575592041, "reward_std": 0.00019181770039722323, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498598724603653, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.46113989637305697, "grad_norm": 5.92166567877022, "kl": 0.0601806640625, "learning_rate": 9.541450777202072e-07, "loss": 0.0006, "reward": 2.4999635219573975, "reward_std": 3.4532062727521406e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999634623527527, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.4637305699481865, "grad_norm": 62.87930005310863, "kl": 0.109375, "learning_rate": 9.538860103626942e-07, "loss": 0.0003, "reward": 2.1871920824050903, "reward_std": 0.2590183729181206, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687192142009735, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.46632124352331605, "grad_norm": 5.077639078676416, "kl": 0.1060791015625, "learning_rate": 9.536269430051813e-07, "loss": -0.0004, "reward": 2.4999769926071167, "reward_std": 4.302172237657942e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999770522117615, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4689119170984456, "grad_norm": 12.874689131272246, "kl": 0.0732421875, "learning_rate": 9.533678756476683e-07, "loss": 0.0004, "reward": 2.374955177307129, "reward_std": 0.23149113605359162, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749550580978394, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.47150259067357514, "grad_norm": 3.2785029112945425, "kl": 0.068359375, "learning_rate": 9.531088082901554e-07, "loss": 0.0007, "reward": 2.499966263771057, "reward_std": 1.5721283034508815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999662041664124, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.4740932642487047, "grad_norm": 11.298488720107546, "kl": 0.024169921875, "learning_rate": 9.528497409326425e-07, "loss": 0.0001, "reward": 1.4906189441680908, "reward_std": 0.008024983624636661, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9906189739704132, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.47668393782383417, "grad_norm": 1.0958165073828254, "kl": 0.17236328125, "learning_rate": 9.525906735751295e-07, "loss": 0.0004, "reward": 2.4999895095825195, "reward_std": 1.201426994157373e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4792746113989637, "grad_norm": 39.416709054286926, "kl": 0.076904296875, "learning_rate": 9.523316062176166e-07, "loss": 0.0007, "reward": 2.37497878074646, "reward_std": 0.23148569122906792, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.87497878074646, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.48186528497409326, "grad_norm": 0.02431102659126464, "kl": 0.0242919921875, "learning_rate": 9.520725388601036e-07, "loss": 0.0007, "reward": 2.499998927116394, "reward_std": 5.839314098921022e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.4844559585492228, "grad_norm": 52.11459878771558, "kl": 0.040985107421875, "learning_rate": 9.518134715025906e-07, "loss": 0.0005, "reward": 2.2339224815368652, "reward_std": 0.3672207622189205, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.73392254114151, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.48704663212435234, "grad_norm": 63.02844422973685, "kl": 0.06390380859375, "learning_rate": 9.515544041450777e-07, "loss": -0.0002, "reward": 1.997809886932373, "reward_std": 0.0007182962772276369, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978099763393402, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.4896373056994819, "grad_norm": 4.561146785274667, "kl": 0.031494140625, "learning_rate": 9.512953367875647e-07, "loss": 0.0, "reward": 2.499797224998474, "reward_std": 7.290885696420446e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997972249984741, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.49222797927461137, "grad_norm": 15.794183433983163, "kl": 0.107421875, "learning_rate": 9.510362694300518e-07, "loss": -0.0004, "reward": 1.775613248348236, "reward_std": 0.07763574089835856, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2756133675575256, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.4948186528497409, "grad_norm": 53.794635804377414, "kl": 0.0758056640625, "learning_rate": 9.507772020725389e-07, "loss": 0.0003, "reward": 2.4373844861984253, "reward_std": 0.17706387546706992, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373846054077148, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.49740932642487046, "grad_norm": 13.339812350333533, "kl": 0.156494140625, "learning_rate": 9.505181347150258e-07, "loss": 0.0004, "reward": 2.4374749660491943, "reward_std": 0.17682266998508567, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374749064445496, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5, "grad_norm": 7.709688807481148, "kl": 0.05029296875, "learning_rate": 9.502590673575129e-07, "loss": 0.0011, "reward": 2.4999923706054688, "reward_std": 8.9534473772801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5025906735751295, "grad_norm": 12.292379892079113, "kl": 0.03564453125, "learning_rate": 9.499999999999999e-07, "loss": 0.0006, "reward": 2.4999698400497437, "reward_std": 3.4964389669767115e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999698400497437, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 34.3125, "epoch": 0.5051813471502591, "grad_norm": 30.23930669613842, "kl": 0.2294921875, "learning_rate": 9.49740932642487e-07, "loss": 0.0012, "reward": 1.9902034997940063, "reward_std": 0.007483473378670169, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4902033805847168, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5077720207253886, "grad_norm": 23.608654428483174, "kl": 0.06787109375, "learning_rate": 9.494818652849741e-07, "loss": 0.0003, "reward": 2.1249433755874634, "reward_std": 0.23149006214407564, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249432563781738, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5103626943005182, "grad_norm": 1.3304126719372689, "kl": 0.033935546875, "learning_rate": 9.492227979274611e-07, "loss": 0.0003, "reward": 2.4999794960021973, "reward_std": 4.905148443867802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979555606842, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5129533678756477, "grad_norm": 3.506059228911601, "kl": 0.132080078125, "learning_rate": 9.489637305699481e-07, "loss": 0.0, "reward": 2.499938488006592, "reward_std": 2.3219374085670097e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999384880065918, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 0.5155440414507773, "grad_norm": 40.38588040361022, "kl": 0.0552978515625, "learning_rate": 9.487046632124351e-07, "loss": -0.0005, "reward": 2.0623568296432495, "reward_std": 0.41730798021671944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623570084571838, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5181347150259067, "grad_norm": 14.54155105301667, "kl": 0.1279296875, "learning_rate": 9.484455958549222e-07, "loss": 0.0011, "reward": 2.0565385818481445, "reward_std": 0.17906291717235945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5565386414527893, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.5207253886010362, "grad_norm": 3.00118081433762, "kl": 0.05029296875, "learning_rate": 9.481865284974093e-07, "loss": 0.0003, "reward": 2.4999457597732544, "reward_std": 2.9745723395535606e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999459385871887, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5233160621761658, "grad_norm": 116.19563637018368, "kl": 0.153564453125, "learning_rate": 9.479274611398963e-07, "loss": 0.0, "reward": 2.12401682138443, "reward_std": 0.23206061124801636, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.62401682138443, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.5259067357512953, "grad_norm": 0.7202451802861973, "kl": 0.0352783203125, "learning_rate": 9.476683937823834e-07, "loss": -0.0006, "reward": 2.4999974966049194, "reward_std": 3.306661483293283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5284974093264249, "grad_norm": 7.46545829789412, "kl": 0.0384521484375, "learning_rate": 9.474093264248703e-07, "loss": -0.0, "reward": 2.499893307685852, "reward_std": 8.678888480062596e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998934268951416, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.5310880829015544, "grad_norm": 22.99799119427596, "kl": 0.0867919921875, "learning_rate": 9.471502590673574e-07, "loss": 0.0008, "reward": 2.030556797981262, "reward_std": 0.4455295194460689, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.561806857585907, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.533678756476684, "grad_norm": 62.826438018148934, "kl": 0.134765625, "learning_rate": 9.468911917098445e-07, "loss": 0.0011, "reward": 2.434236764907837, "reward_std": 0.1859855586524759, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9342365264892578, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5362694300518135, "grad_norm": 33.98111377019834, "kl": 0.076416015625, "learning_rate": 9.466321243523315e-07, "loss": 0.0006, "reward": 1.9978904724121094, "reward_std": 0.00040473006356478436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978904128074646, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.538860103626943, "grad_norm": 3.9583399192142825, "kl": 0.0478515625, "learning_rate": 9.463730569948186e-07, "loss": 0.0, "reward": 1.9968501925468445, "reward_std": 6.582876631000545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968501925468445, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5414507772020726, "grad_norm": 0.2445416189198319, "kl": 0.0552978515625, "learning_rate": 9.461139896373057e-07, "loss": 0.0002, "reward": 2.499974489212036, "reward_std": 2.6608826431129273e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999743700027466, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.5440414507772021, "grad_norm": 13.848213173762856, "kl": 0.1322021484375, "learning_rate": 9.458549222797926e-07, "loss": 0.0004, "reward": 2.097993493080139, "reward_std": 0.24885972872152706, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5979933738708496, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5466321243523317, "grad_norm": 28.14177244645597, "kl": 0.07958984375, "learning_rate": 9.455958549222797e-07, "loss": 0.0011, "reward": 1.9448403716087341, "reward_std": 0.020275432882044697, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4448402523994446, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5492227979274611, "grad_norm": 79.27496052370239, "kl": 0.111083984375, "learning_rate": 9.453367875647667e-07, "loss": 0.0006, "reward": 1.8673059940338135, "reward_std": 0.09618017942284496, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3673060834407806, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5518134715025906, "grad_norm": 77.89653881866029, "kl": 0.0592041015625, "learning_rate": 9.450777202072539e-07, "loss": -0.0002, "reward": 2.4999282360076904, "reward_std": 3.5500268495525233e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999928057193756, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5544041450777202, "grad_norm": 3.2375878032635215, "kl": 0.0303955078125, "learning_rate": 9.44818652849741e-07, "loss": 0.0002, "reward": 2.4998496770858765, "reward_std": 1.796979404389276e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999849796295166, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.5569948186528497, "grad_norm": 103.02440756730583, "kl": 0.1485595703125, "learning_rate": 9.44559585492228e-07, "loss": -0.0006, "reward": 1.681231051683426, "reward_std": 0.11251461816493702, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1812311708927155, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5595854922279793, "grad_norm": 0.8614838950564742, "kl": 0.16357421875, "learning_rate": 9.44300518134715e-07, "loss": 0.0009, "reward": 2.499993681907654, "reward_std": 5.377873947054468e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5621761658031088, "grad_norm": 26.26413385590952, "kl": 0.067626953125, "learning_rate": 9.44041450777202e-07, "loss": 0.0003, "reward": 1.4598759412765503, "reward_std": 0.011151136626722291, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9598759412765503, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5647668393782384, "grad_norm": 0.4612586134457508, "kl": 0.12109375, "learning_rate": 9.437823834196891e-07, "loss": 0.0002, "reward": 2.499996542930603, "reward_std": 1.909071158934239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.5673575129533679, "grad_norm": 46.14937279464682, "kl": 0.1630859375, "learning_rate": 9.435233160621762e-07, "loss": 0.0007, "reward": 1.9984716176986694, "reward_std": 0.35650962591171265, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498471736907959, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5699481865284974, "grad_norm": 61.21426229992077, "kl": 0.160888671875, "learning_rate": 9.432642487046632e-07, "loss": 0.0005, "reward": 1.975698173046112, "reward_std": 0.00016388069985850962, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4756982028484344, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.572538860103627, "grad_norm": 63.00321654029741, "kl": 0.0494384765625, "learning_rate": 9.430051813471503e-07, "loss": -0.0004, "reward": 2.4999773502349854, "reward_std": 2.2917482510820264e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977469444275, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5751295336787565, "grad_norm": 46.67434402950752, "kl": 0.10546875, "learning_rate": 9.427461139896372e-07, "loss": 0.0004, "reward": 1.7007672786712646, "reward_std": 0.29875028878450394, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.200767308473587, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5777202072538861, "grad_norm": 19.6623794446999, "kl": 0.10791015625, "learning_rate": 9.424870466321243e-07, "loss": 0.001, "reward": 1.9999017715454102, "reward_std": 6.802845064157736e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499901831150055, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.5803108808290155, "grad_norm": 14.095627736677576, "kl": 0.1168212890625, "learning_rate": 9.422279792746114e-07, "loss": 0.0008, "reward": 2.4999054670333862, "reward_std": 7.989784353412688e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999054670333862, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.582901554404145, "grad_norm": 42.1967457306829, "kl": 0.07763671875, "learning_rate": 9.419689119170984e-07, "loss": 0.0007, "reward": 2.1219332218170166, "reward_std": 0.2333798172276147, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6219332218170166, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.5854922279792746, "grad_norm": 0.8337090023946528, "kl": 0.0643310546875, "learning_rate": 9.417098445595855e-07, "loss": -0.0, "reward": 2.4999881982803345, "reward_std": 5.433427077150554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5880829015544041, "grad_norm": 6.560086811318631, "kl": 0.0528564453125, "learning_rate": 9.414507772020725e-07, "loss": 0.0001, "reward": 1.9985014200210571, "reward_std": 6.0344076473484165e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985015094280243, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.5906735751295337, "grad_norm": 13.593222108662275, "kl": 0.03106689453125, "learning_rate": 9.411917098445595e-07, "loss": 0.0006, "reward": 2.4998584985733032, "reward_std": 9.021737014336395e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999858319759369, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.5932642487046632, "grad_norm": 21.91224493587665, "kl": 0.252197265625, "learning_rate": 9.409326424870466e-07, "loss": 0.0017, "reward": 1.997519612312317, "reward_std": 0.0008209343400267244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975194931030273, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.5958549222797928, "grad_norm": 17.488410459500873, "kl": 0.099609375, "learning_rate": 9.406735751295336e-07, "loss": 0.0009, "reward": 2.2499552965164185, "reward_std": 0.26730949130433146, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749955177307129, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.5984455958549223, "grad_norm": 41.68772485061926, "kl": 2.037109375, "learning_rate": 9.404145077720207e-07, "loss": 0.0087, "reward": 1.8484117984771729, "reward_std": 0.00011486069325883363, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3484117984771729, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6010362694300518, "grad_norm": 1.106281771503951, "kl": 0.09521484375, "learning_rate": 9.401554404145078e-07, "loss": 0.0009, "reward": 1.9999032020568848, "reward_std": 1.5472111954295542e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999030530452728, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6036269430051814, "grad_norm": 14.44885688858764, "kl": 0.080078125, "learning_rate": 9.398963730569948e-07, "loss": 0.0009, "reward": 1.8022709488868713, "reward_std": 0.0014438453572438448, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3022708296775818, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6062176165803109, "grad_norm": 0.11546445822861898, "kl": 0.1259765625, "learning_rate": 9.396373056994819e-07, "loss": 0.0009, "reward": 2.499995470046997, "reward_std": 1.2460781704248802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6088082901554405, "grad_norm": 41.4292529400018, "kl": 0.1171875, "learning_rate": 9.393782383419688e-07, "loss": 0.0005, "reward": 1.686115801334381, "reward_std": 0.25914314383408055, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.186115801334381, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6113989637305699, "grad_norm": 32.05389070781718, "kl": 0.03466796875, "learning_rate": 9.391191709844559e-07, "loss": -0.0002, "reward": 2.436464309692383, "reward_std": 0.17833977332188056, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9364644885063171, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 0.6139896373056994, "grad_norm": 23.54191375273367, "kl": 0.115478515625, "learning_rate": 9.38860103626943e-07, "loss": 0.0006, "reward": 1.9444739818572998, "reward_std": 0.019849272669233642, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4444738030433655, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.616580310880829, "grad_norm": 39.32229435436085, "kl": 0.0594482421875, "learning_rate": 9.3860103626943e-07, "loss": -0.0002, "reward": 2.1870386600494385, "reward_std": 0.2591146485837612, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687038779258728, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6191709844559585, "grad_norm": 25.469284876150283, "kl": 0.0361328125, "learning_rate": 9.383419689119171e-07, "loss": 0.0001, "reward": 2.49897837638855, "reward_std": 0.0004042975363063306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989783763885498, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6217616580310881, "grad_norm": 5.608123288148271, "kl": 0.0927734375, "learning_rate": 9.38082901554404e-07, "loss": 0.0009, "reward": 2.4998337030410767, "reward_std": 4.3772628799843005e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998336434364319, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6243523316062176, "grad_norm": 0.5900102682819381, "kl": 0.0611572265625, "learning_rate": 9.378238341968911e-07, "loss": -0.0006, "reward": 2.499992847442627, "reward_std": 3.861457003040414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6269430051813472, "grad_norm": 15.233115618658832, "kl": 0.086669921875, "learning_rate": 9.375647668393782e-07, "loss": -0.0003, "reward": 1.9760212898254395, "reward_std": 0.0007327220682782354, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.476021409034729, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6295336787564767, "grad_norm": 89.63523003423197, "kl": 0.15966796875, "learning_rate": 9.373056994818652e-07, "loss": 0.0003, "reward": 1.9901127815246582, "reward_std": 0.001632723069860731, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4901129603385925, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6321243523316062, "grad_norm": 6.017951740763496, "kl": 0.7403564453125, "learning_rate": 9.370466321243523e-07, "loss": 0.0038, "reward": 2.4999749660491943, "reward_std": 7.510585191994323e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749660491943, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6347150259067358, "grad_norm": 0.33611723178730435, "kl": 0.03533935546875, "learning_rate": 9.367875647668393e-07, "loss": -0.0009, "reward": 2.4999945163726807, "reward_std": 3.2188998773108324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6373056994818653, "grad_norm": 1.6092350101526522, "kl": 0.05596923828125, "learning_rate": 9.365284974093264e-07, "loss": 0.0002, "reward": 2.4999927282333374, "reward_std": 4.545140654954594e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6398963730569949, "grad_norm": 2.5295088538671244, "kl": 0.079833984375, "learning_rate": 9.362694300518134e-07, "loss": 0.0003, "reward": 2.4999828338623047, "reward_std": 1.1938268286826315e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999828934669495, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6424870466321243, "grad_norm": 1.859501078280743, "kl": 0.0389404296875, "learning_rate": 9.360103626943004e-07, "loss": 0.0005, "reward": 2.4999754428863525, "reward_std": 2.1986423050179837e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975562095642, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6450777202072538, "grad_norm": 2.697440450301362, "kl": 0.0865478515625, "learning_rate": 9.357512953367875e-07, "loss": 0.0001, "reward": 2.499985933303833, "reward_std": 2.1488343918463215e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858736991882, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6476683937823834, "grad_norm": 39.343466209925005, "kl": 0.080810546875, "learning_rate": 9.354922279792745e-07, "loss": 0.0003, "reward": 2.2499629259109497, "reward_std": 0.26729267278676616, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499629259109497, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.6502590673575129, "grad_norm": 38.7713187585943, "kl": 0.066009521484375, "learning_rate": 9.352331606217616e-07, "loss": -0.0001, "reward": 2.499959111213684, "reward_std": 4.292089155910617e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999591708183289, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6528497409326425, "grad_norm": 33.689628441748106, "kl": 0.0758056640625, "learning_rate": 9.349740932642487e-07, "loss": 0.0002, "reward": 2.2461527585983276, "reward_std": 0.2713722139735637, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7461528778076172, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.655440414507772, "grad_norm": 0.2080987426799511, "kl": 0.091156005859375, "learning_rate": 9.347150259067356e-07, "loss": 0.0001, "reward": 2.4999970197677612, "reward_std": 3.7653973095075344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 0.6580310880829016, "grad_norm": 15.77335774260603, "kl": 0.0772705078125, "learning_rate": 9.344559585492227e-07, "loss": 0.0, "reward": 2.0636579394340515, "reward_std": 0.2693043718799828, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5636579394340515, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.6606217616580311, "grad_norm": 102.94629211528023, "kl": 0.056884765625, "learning_rate": 9.341968911917099e-07, "loss": 0.0002, "reward": 2.025260090827942, "reward_std": 0.39463518345746706, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.525260090827942, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6632124352331606, "grad_norm": 24.710556727036387, "kl": 0.1859130859375, "learning_rate": 9.339378238341969e-07, "loss": 0.0002, "reward": 2.1874470710754395, "reward_std": 0.25881719600437236, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687447190284729, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6658031088082902, "grad_norm": 1.5521769698253913, "kl": 0.052734375, "learning_rate": 9.33678756476684e-07, "loss": 0.0012, "reward": 2.499988555908203, "reward_std": 1.2101602123948396e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6683937823834197, "grad_norm": 63.022558527622344, "kl": 0.07275390625, "learning_rate": 9.33419689119171e-07, "loss": 0.0007, "reward": 1.99865061044693, "reward_std": 0.0002238482548762022, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986506700515747, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.6709844559585493, "grad_norm": 46.7628067020373, "kl": 0.111572265625, "learning_rate": 9.33160621761658e-07, "loss": 0.0004, "reward": 1.8104197978973389, "reward_std": 0.6800253987312317, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3104197978973389, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 0.6735751295336787, "grad_norm": 30.072863939071013, "kl": 0.118896484375, "learning_rate": 9.329015544041451e-07, "loss": 0.0008, "reward": 2.057658076286316, "reward_std": 0.17922852502466924, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5576579570770264, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6761658031088082, "grad_norm": 172.33709551413486, "kl": 0.1025390625, "learning_rate": 9.326424870466321e-07, "loss": 0.0004, "reward": 1.2484757900238037, "reward_std": 0.003200551262125373, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7484757602214813, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.6787564766839378, "grad_norm": 54.55324273751358, "kl": 0.068603515625, "learning_rate": 9.323834196891192e-07, "loss": 0.0004, "reward": 2.078890562057495, "reward_std": 0.2600107304310768, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5788904428482056, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6813471502590673, "grad_norm": 14.233670491997772, "kl": 0.15234375, "learning_rate": 9.321243523316062e-07, "loss": 0.0007, "reward": 2.1872631311416626, "reward_std": 0.2588548979751977, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6872629523277283, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.6839378238341969, "grad_norm": 50.997230760582205, "kl": 0.0169677734375, "learning_rate": 9.318652849740933e-07, "loss": -0.0008, "reward": 2.312475085258484, "reward_std": 0.25880399473894045, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124749660491943, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.6865284974093264, "grad_norm": 0.33892982180117925, "kl": 0.10888671875, "learning_rate": 9.316062176165803e-07, "loss": 0.0004, "reward": 1.9999109506607056, "reward_std": 5.484715074999258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999110102653503, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.689119170984456, "grad_norm": 1.3070331629686442, "kl": 0.03485107421875, "learning_rate": 9.313471502590673e-07, "loss": 0.0013, "reward": 2.4999672174453735, "reward_std": 1.4274331988417543e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967098236084, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6917098445595855, "grad_norm": 1.5513637564228338, "kl": 0.05224609375, "learning_rate": 9.310880829015544e-07, "loss": -0.0002, "reward": 2.49995756149292, "reward_std": 1.746729623164356e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999576210975647, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.694300518134715, "grad_norm": 112.77053232255466, "kl": 0.04449462890625, "learning_rate": 9.308290155440414e-07, "loss": 0.0005, "reward": 2.49994158744812, "reward_std": 0.0001064716659016085, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999941647052765, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.6968911917098446, "grad_norm": 52.051932382301246, "kl": 0.069580078125, "learning_rate": 9.305699481865285e-07, "loss": 0.0004, "reward": 1.8965779542922974, "reward_std": 0.003717041016841449, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3965781033039093, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.6994818652849741, "grad_norm": 105.24779156133805, "kl": 0.0260467529296875, "learning_rate": 9.303108808290156e-07, "loss": 0.0001, "reward": 1.99981689453125, "reward_std": 0.00018113442831690918, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49981689453125, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.7020725388601037, "grad_norm": 49.51360356628827, "kl": 0.058837890625, "learning_rate": 9.300518134715025e-07, "loss": 0.0002, "reward": 1.2127676010131836, "reward_std": 0.2958464545663446, "rewards/format_reward_rec": 0.875, "rewards/point_reward": 0.7752676904201508, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.7046632124352331, "grad_norm": 26.210604022267137, "kl": 0.03125, "learning_rate": 9.297927461139896e-07, "loss": 0.0001, "reward": 2.4371660947799683, "reward_std": 0.17677150324652757, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9371660947799683, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7072538860103627, "grad_norm": 5.61723189884251, "kl": 0.099365234375, "learning_rate": 9.295336787564766e-07, "loss": 0.0002, "reward": 2.499822735786438, "reward_std": 8.499576142639853e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998226165771484, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 0.7098445595854922, "grad_norm": 6.112321727623437, "kl": 0.27496337890625, "learning_rate": 9.292746113989637e-07, "loss": 0.0006, "reward": 1.9968271851539612, "reward_std": 0.0001886479646486805, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968271255493164, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7124352331606217, "grad_norm": 9.319562290667678, "kl": 0.093994140625, "learning_rate": 9.290155440414508e-07, "loss": 0.0011, "reward": 2.4994486570358276, "reward_std": 0.00023547241016785847, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994484186172485, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7150259067357513, "grad_norm": 13.749548931709745, "kl": 0.107666015625, "learning_rate": 9.287564766839378e-07, "loss": 0.0004, "reward": 1.7446227669715881, "reward_std": 0.2328637728933245, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2446226477622986, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7176165803108808, "grad_norm": 0.29424144760711574, "kl": 0.010528564453125, "learning_rate": 9.284974093264248e-07, "loss": 0.0005, "reward": 2.4999974966049194, "reward_std": 2.2441956843977096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7202072538860104, "grad_norm": 0.9127220665936261, "kl": 0.09619140625, "learning_rate": 9.282383419689118e-07, "loss": 0.0005, "reward": 2.4999959468841553, "reward_std": 5.73918987356592e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7227979274611399, "grad_norm": 2.7414774325603277, "kl": 0.05029296875, "learning_rate": 9.279792746113989e-07, "loss": -0.0002, "reward": 2.4994759559631348, "reward_std": 4.502509227677365e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9994759559631348, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7253886010362695, "grad_norm": 224.78166497058007, "kl": 0.168701171875, "learning_rate": 9.27720207253886e-07, "loss": 0.0007, "reward": 1.4996460676193237, "reward_std": 0.5346071789172129, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9996460974216461, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.727979274611399, "grad_norm": 29.767806976453638, "kl": 0.093505859375, "learning_rate": 9.27461139896373e-07, "loss": 0.0005, "reward": 2.499904751777649, "reward_std": 0.00014851183732389472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999048709869385, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.7305699481865285, "grad_norm": 2.451734746053496, "kl": 0.058837890625, "learning_rate": 9.272020725388601e-07, "loss": 0.001, "reward": 2.4999754428863525, "reward_std": 1.5983257071638945e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999753832817078, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7331606217616581, "grad_norm": 1.8951235023993949, "kl": 0.0263671875, "learning_rate": 9.269430051813471e-07, "loss": -0.0001, "reward": 2.4999849796295166, "reward_std": 1.7635909728141996e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850392341614, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.7357512953367875, "grad_norm": 20.1055651487715, "kl": 0.1324462890625, "learning_rate": 9.266839378238341e-07, "loss": 0.0004, "reward": 2.029202103614807, "reward_std": 0.19023123945163434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5292021036148071, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7383419689119171, "grad_norm": 2.3221891552741885, "kl": 0.0855712890625, "learning_rate": 9.264248704663212e-07, "loss": -0.0003, "reward": 2.4999667406082153, "reward_std": 1.8513579334467067e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999666810035706, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.7409326424870466, "grad_norm": 0.16696801434953953, "kl": 0.062744140625, "learning_rate": 9.261658031088082e-07, "loss": -0.0005, "reward": 2.4999759197235107, "reward_std": 2.578143323717086e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999759793281555, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7435233160621761, "grad_norm": 13.229863535225716, "kl": 0.16259765625, "learning_rate": 9.259067357512953e-07, "loss": 0.0005, "reward": 1.9019799828529358, "reward_std": 0.00041689237696118653, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.401980072259903, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7461139896373057, "grad_norm": 0.7634927553439628, "kl": 0.0386962890625, "learning_rate": 9.256476683937824e-07, "loss": 0.001, "reward": 2.499953866004944, "reward_std": 4.253313250046631e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999536275863647, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7487046632124352, "grad_norm": 97.37085161469062, "kl": 0.0625, "learning_rate": 9.253886010362693e-07, "loss": 0.0006, "reward": 1.9984523057937622, "reward_std": 0.0002610503869391323, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984521865844727, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7512953367875648, "grad_norm": 1.2218454461480925, "kl": 0.0859375, "learning_rate": 9.251295336787564e-07, "loss": 0.0004, "reward": 2.499990701675415, "reward_std": 1.2970205034434912e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7538860103626943, "grad_norm": 12.851982334525493, "kl": 0.126220703125, "learning_rate": 9.248704663212434e-07, "loss": 0.0006, "reward": 2.0624111890792847, "reward_std": 0.17680646463577432, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624110698699951, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7564766839378239, "grad_norm": 2.206207729074567, "kl": 0.0709228515625, "learning_rate": 9.246113989637305e-07, "loss": -0.0004, "reward": 2.4999241828918457, "reward_std": 3.138162901450414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999243021011353, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7590673575129534, "grad_norm": 0.31928147397504814, "kl": 0.0975341796875, "learning_rate": 9.243523316062176e-07, "loss": 0.0005, "reward": 2.499974846839905, "reward_std": 4.535467724053888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749660491943, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.7616580310880829, "grad_norm": 30.018005918619796, "kl": 0.1025390625, "learning_rate": 9.240932642487046e-07, "loss": 0.0, "reward": 1.9774422645568848, "reward_std": 0.007441002624545945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4774422645568848, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 0.7642487046632125, "grad_norm": 33.47510945410008, "kl": 0.037841796875, "learning_rate": 9.238341968911916e-07, "loss": 0.0, "reward": 1.8465708494186401, "reward_std": 0.02236446195274766, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3465709686279297, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7668393782383419, "grad_norm": 9.842075489774253, "kl": 0.125, "learning_rate": 9.235751295336786e-07, "loss": 0.0001, "reward": 2.499991536140442, "reward_std": 7.91910986208677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7694300518134715, "grad_norm": 0.3672338026132165, "kl": 0.117431640625, "learning_rate": 9.233160621761657e-07, "loss": 0.0013, "reward": 1.4999854564666748, "reward_std": 3.936485882150009e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999852180480957, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.772020725388601, "grad_norm": 0.817556391913545, "kl": 0.0640869140625, "learning_rate": 9.230569948186529e-07, "loss": -0.0015, "reward": 2.4999568462371826, "reward_std": 9.18282648854074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999569058418274, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7746113989637305, "grad_norm": 19.612597597260386, "kl": 0.0623779296875, "learning_rate": 9.227979274611399e-07, "loss": 0.0005, "reward": 2.3749282360076904, "reward_std": 0.23157261063533952, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749281764030457, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7772020725388601, "grad_norm": 16.9686411351817, "kl": 0.037353515625, "learning_rate": 9.22538860103627e-07, "loss": -0.0002, "reward": 2.0622934103012085, "reward_std": 0.17680109746561357, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5622934699058533, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.7797927461139896, "grad_norm": 0.38592383620459214, "kl": 0.072021484375, "learning_rate": 9.222797927461139e-07, "loss": 0.0012, "reward": 2.499995470046997, "reward_std": 2.571936761341931e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.7823834196891192, "grad_norm": 9.529719471129177, "kl": 0.120361328125, "learning_rate": 9.22020725388601e-07, "loss": 0.0006, "reward": 2.4999918937683105, "reward_std": 1.1086000029081333e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.7849740932642487, "grad_norm": 3.5491984364146902, "kl": 0.070068359375, "learning_rate": 9.217616580310881e-07, "loss": 0.0009, "reward": 2.4999492168426514, "reward_std": 2.715591938340367e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999490976333618, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 0.7875647668393783, "grad_norm": 142.82475624809567, "kl": 0.166259765625, "learning_rate": 9.215025906735751e-07, "loss": 0.0008, "reward": 1.7831117510795593, "reward_std": 0.08312539157242327, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2831116318702698, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.7901554404145078, "grad_norm": 2.689398487678334, "kl": 0.061767578125, "learning_rate": 9.212435233160622e-07, "loss": 0.0005, "reward": 1.9999322295188904, "reward_std": 1.3019260563851276e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999322593212128, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7927461139896373, "grad_norm": 3.6373240126760895, "kl": 0.0777587890625, "learning_rate": 9.209844559585493e-07, "loss": 0.0012, "reward": 2.4999879598617554, "reward_std": 1.44535373749477e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876618385315, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7953367875647669, "grad_norm": 5.630702097725107, "kl": 0.09228515625, "learning_rate": 9.207253886010362e-07, "loss": -0.0004, "reward": 2.4999611377716064, "reward_std": 7.7118709214119e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999961256980896, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.7979274611398963, "grad_norm": 17.864836840406202, "kl": 0.0830078125, "learning_rate": 9.204663212435233e-07, "loss": 0.0011, "reward": 2.499875545501709, "reward_std": 8.189815252990229e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998753070831299, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.8005181347150259, "grad_norm": 70.47198735069966, "kl": 0.08544921875, "learning_rate": 9.202072538860103e-07, "loss": 0.0003, "reward": 1.9197113513946533, "reward_std": 0.04908056743443012, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4197113513946533, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8031088082901554, "grad_norm": 36.96407934017639, "kl": 0.063232421875, "learning_rate": 9.199481865284974e-07, "loss": -0.0003, "reward": 1.8900312185287476, "reward_std": 0.06787262001910221, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3900312185287476, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.805699481865285, "grad_norm": 16.32845384721268, "kl": 0.0328369140625, "learning_rate": 9.196891191709845e-07, "loss": -0.0008, "reward": 2.4999873638153076, "reward_std": 2.0436304765780733e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987542629242, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8082901554404145, "grad_norm": 0.6457197661261482, "kl": 0.0504150390625, "learning_rate": 9.194300518134715e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 4.167168214053163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 0.810880829015544, "grad_norm": 20.680251075732304, "kl": 0.0853271484375, "learning_rate": 9.191709844559585e-07, "loss": 0.0001, "reward": 1.0502718091011047, "reward_std": 0.1444510220644588, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.5502718463540077, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8134715025906736, "grad_norm": 20.340583633613686, "kl": 0.052978515625, "learning_rate": 9.189119170984455e-07, "loss": -0.0011, "reward": 2.4999191761016846, "reward_std": 4.008891755802324e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999192357063293, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8160621761658031, "grad_norm": 4.878211534311483, "kl": 0.0787353515625, "learning_rate": 9.186528497409326e-07, "loss": -0.0003, "reward": 2.4999196529388428, "reward_std": 2.0191217117826454e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999196529388428, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8186528497409327, "grad_norm": 82.92206731218685, "kl": 0.080322265625, "learning_rate": 9.183937823834197e-07, "loss": -0.0002, "reward": 2.2498987317085266, "reward_std": 0.2673699298443353, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498987913131714, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8212435233160622, "grad_norm": 19.816342034729935, "kl": 0.120361328125, "learning_rate": 9.181347150259067e-07, "loss": 0.0012, "reward": 2.4374876022338867, "reward_std": 0.17680068753747946, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374876618385315, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8238341968911918, "grad_norm": 11.442529464702275, "kl": 0.054168701171875, "learning_rate": 9.178756476683938e-07, "loss": 0.0006, "reward": 1.9945263862609863, "reward_std": 0.01433517888881397, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945263266563416, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8264248704663213, "grad_norm": 15.37448789211719, "kl": 0.097900390625, "learning_rate": 9.176165803108807e-07, "loss": 0.0005, "reward": 1.9372231364250183, "reward_std": 0.17718149179563625, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.437223196029663, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8290155440414507, "grad_norm": 49.80289094742806, "kl": 0.205078125, "learning_rate": 9.173575129533678e-07, "loss": 0.0007, "reward": 1.5752655267715454, "reward_std": 0.23202923552889843, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0752655863761902, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8316062176165803, "grad_norm": 47.59199698919065, "kl": 0.0540771484375, "learning_rate": 9.170984455958549e-07, "loss": 0.0001, "reward": 2.3747940063476562, "reward_std": 0.23171583090010017, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8747938871383667, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 0.8341968911917098, "grad_norm": 24.09727537643488, "kl": 0.1103515625, "learning_rate": 9.168393782383419e-07, "loss": -0.0001, "reward": 2.4130406379699707, "reward_std": 0.2459504969729096, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.913040816783905, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 0.8367875647668394, "grad_norm": 288.91013853276496, "kl": 0.079833984375, "learning_rate": 9.16580310880829e-07, "loss": 0.0005, "reward": 1.9372283816337585, "reward_std": 0.17721589557436346, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.437228262424469, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 0.8393782383419689, "grad_norm": 0.2791848515241179, "kl": 0.193359375, "learning_rate": 9.16321243523316e-07, "loss": 0.0009, "reward": 2.499991536140442, "reward_std": 3.674611775750236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8419689119170984, "grad_norm": 0.3806673610876601, "kl": 0.1790771484375, "learning_rate": 9.16062176165803e-07, "loss": 0.0013, "reward": 2.4999841451644897, "reward_std": 4.476777007766941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839067459106, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.844559585492228, "grad_norm": 1.3217356707992471, "kl": 0.0582275390625, "learning_rate": 9.158031088082901e-07, "loss": 0.0005, "reward": 2.4999953508377075, "reward_std": 5.780481842521112e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.8471502590673575, "grad_norm": 5.226186957656574, "kl": 0.084716796875, "learning_rate": 9.155440414507771e-07, "loss": 0.0006, "reward": 2.4999561309814453, "reward_std": 3.9707473519001724e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99995619058609, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.8497409326424871, "grad_norm": 8.03176763596765, "kl": 0.103515625, "learning_rate": 9.152849740932642e-07, "loss": -0.0003, "reward": 1.9965597987174988, "reward_std": 0.0002733978952846883, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4965597987174988, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8523316062176166, "grad_norm": 3.67055638756576, "kl": 0.05072021484375, "learning_rate": 9.150259067357513e-07, "loss": 0.0002, "reward": 2.4999865293502808, "reward_std": 1.6127025901369052e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999865293502808, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8549222797927462, "grad_norm": 1.789042247604925, "kl": 0.0400390625, "learning_rate": 9.147668393782383e-07, "loss": 0.0005, "reward": 2.4999849796295166, "reward_std": 1.2387885362841189e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850392341614, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8575129533678757, "grad_norm": 1.0957686274891045, "kl": 0.0782470703125, "learning_rate": 9.145077720207253e-07, "loss": -0.0005, "reward": 2.4999908208847046, "reward_std": 5.092277120866129e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8601036269430051, "grad_norm": 0.29619015825042827, "kl": 0.073974609375, "learning_rate": 9.142487046632123e-07, "loss": -0.0003, "reward": 2.499997615814209, "reward_std": 1.081261942204037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8626943005181347, "grad_norm": 32.1185168344218, "kl": 0.100341796875, "learning_rate": 9.139896373056994e-07, "loss": 0.0006, "reward": 2.437234878540039, "reward_std": 0.17700892945867963, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937234878540039, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8652849740932642, "grad_norm": 16.46751519181484, "kl": 0.1240234375, "learning_rate": 9.137305699481865e-07, "loss": 0.0003, "reward": 1.9995706677436829, "reward_std": 0.0005153576767042978, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995706379413605, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8678756476683938, "grad_norm": 0.11411298308077232, "kl": 0.09136962890625, "learning_rate": 9.134715025906735e-07, "loss": 0.0005, "reward": 2.4999969005584717, "reward_std": 1.47363857649907e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8704663212435233, "grad_norm": 8.194679909853198, "kl": 0.0601806640625, "learning_rate": 9.132124352331606e-07, "loss": 0.0002, "reward": 1.9984994530677795, "reward_std": 7.842617560527287e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984994530677795, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8730569948186528, "grad_norm": 23.164207991034793, "kl": 0.07598876953125, "learning_rate": 9.129533678756475e-07, "loss": 0.0001, "reward": 2.4998586177825928, "reward_std": 0.00011591546832789845, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998587369918823, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8756476683937824, "grad_norm": 1.472658752182045, "kl": 0.267578125, "learning_rate": 9.126943005181346e-07, "loss": 0.0006, "reward": 2.4999876022338867, "reward_std": 5.665144087174667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876022338867, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.8782383419689119, "grad_norm": 30.440232194276742, "kl": 0.0848388671875, "learning_rate": 9.124352331606217e-07, "loss": 0.0005, "reward": 1.4549660682678223, "reward_std": 0.00027229699480812997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9549659788608551, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.8808290155440415, "grad_norm": 6.091295098475844, "kl": 0.078125, "learning_rate": 9.121761658031087e-07, "loss": 0.0003, "reward": 2.4999663829803467, "reward_std": 3.496112094580894e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999966561794281, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.883419689119171, "grad_norm": 1.9190445297757928, "kl": 0.130859375, "learning_rate": 9.119170984455959e-07, "loss": -0.0, "reward": 2.499989867210388, "reward_std": 6.505336699547115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989926815033, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.8860103626943006, "grad_norm": 14.081115069683012, "kl": 0.125, "learning_rate": 9.116580310880829e-07, "loss": 0.0002, "reward": 2.4998468160629272, "reward_std": 0.00034713813329290133, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998469352722168, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 0.8886010362694301, "grad_norm": 171.41683522292834, "kl": 0.1173095703125, "learning_rate": 9.113989637305699e-07, "loss": 0.001, "reward": 2.0622345209121704, "reward_std": 0.17686631905849026, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5622344613075256, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8911917098445595, "grad_norm": 7.446769607232678, "kl": 0.1229248046875, "learning_rate": 9.11139896373057e-07, "loss": 0.0008, "reward": 1.7495030164718628, "reward_std": 0.00013970469404966934, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.249502956867218, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.8937823834196891, "grad_norm": 8.91402227900581, "kl": 0.1009521484375, "learning_rate": 9.10880829015544e-07, "loss": 0.0006, "reward": 2.435795307159424, "reward_std": 0.18159472515071684, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9357953071594238, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.8963730569948186, "grad_norm": 29.332694146258852, "kl": 0.08294677734375, "learning_rate": 9.106217616580311e-07, "loss": 0.0004, "reward": 1.9988747239112854, "reward_std": 7.921733703142309e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498874545097351, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.8989637305699482, "grad_norm": 51.401515984558365, "kl": 0.070068359375, "learning_rate": 9.103626943005181e-07, "loss": -0.0007, "reward": 2.1312029361724854, "reward_std": 0.312970283900313, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6312029957771301, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 0.9015544041450777, "grad_norm": 17.99775269493687, "kl": 0.075439453125, "learning_rate": 9.101036269430052e-07, "loss": 0.0002, "reward": 1.5596943497657776, "reward_std": 0.17774493167962646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0596943497657776, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9041450777202072, "grad_norm": 1.6418529985520844, "kl": 0.3291015625, "learning_rate": 9.098445595854922e-07, "loss": 0.0003, "reward": 1.999531626701355, "reward_std": 3.672952215083569e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995318055152893, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9067357512953368, "grad_norm": 22.791540540252825, "kl": 0.19677734375, "learning_rate": 9.095854922279792e-07, "loss": 0.0008, "reward": 2.0622240900993347, "reward_std": 0.1768865605378096, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.56222403049469, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 0.9093264248704663, "grad_norm": 2.296782992922078, "kl": 0.1171875, "learning_rate": 9.093264248704663e-07, "loss": -0.0002, "reward": 2.4999877214431763, "reward_std": 7.551020416940446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9119170984455959, "grad_norm": 3.3214137122798286, "kl": 0.045867919921875, "learning_rate": 9.090673575129534e-07, "loss": 0.0004, "reward": 2.4999700784683228, "reward_std": 1.883913378719626e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999700784683228, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.9145077720207254, "grad_norm": 56.831647032202035, "kl": 0.2109375, "learning_rate": 9.088082901554404e-07, "loss": 0.0011, "reward": 1.492597222328186, "reward_std": 0.000533302802068647, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9925971627235413, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.917098445595855, "grad_norm": 0.5821065240202035, "kl": 0.05194091796875, "learning_rate": 9.085492227979275e-07, "loss": -0.0001, "reward": 2.499998688697815, "reward_std": 1.1890253972524079e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9196891191709845, "grad_norm": 1.0197640860720332, "kl": 0.108001708984375, "learning_rate": 9.082901554404144e-07, "loss": 0.0007, "reward": 2.4999961853027344, "reward_std": 5.204545459491783e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9222797927461139, "grad_norm": 0.6594272411473232, "kl": 0.03082275390625, "learning_rate": 9.080310880829015e-07, "loss": 0.0007, "reward": 2.499996066093445, "reward_std": 3.3537564831931377e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9248704663212435, "grad_norm": 16.270326827422107, "kl": 0.136474609375, "learning_rate": 9.077720207253886e-07, "loss": 0.0005, "reward": 2.4999510049819946, "reward_std": 3.0186531603249023e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999508261680603, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.927461139896373, "grad_norm": 3.13298576907092, "kl": 0.05780029296875, "learning_rate": 9.075129533678756e-07, "loss": 0.0008, "reward": 2.4994637966156006, "reward_std": 1.5863175576669164e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999463677406311, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 0.9300518134715026, "grad_norm": 72.89154611616695, "kl": 0.160888671875, "learning_rate": 9.072538860103627e-07, "loss": 0.0006, "reward": 1.580840289592743, "reward_std": 0.20294279605150223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0808402746915817, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.9326424870466321, "grad_norm": 2.6762654987413073, "kl": 0.09521484375, "learning_rate": 9.069948186528497e-07, "loss": 0.0009, "reward": 1.4995691776275635, "reward_std": 2.641398987179855e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9995691776275635, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 0.9352331606217616, "grad_norm": 3.7400608704217895, "kl": 0.117431640625, "learning_rate": 9.067357512953367e-07, "loss": -0.0, "reward": 1.9623454809188843, "reward_std": 0.00015472333325305954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4623453915119171, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.9378238341968912, "grad_norm": 0.08400091985170403, "kl": 0.07373046875, "learning_rate": 9.064766839378238e-07, "loss": 0.0006, "reward": 2.4999977350234985, "reward_std": 9.096793291973881e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9404145077720207, "grad_norm": 89.9617485294265, "kl": 0.1248779296875, "learning_rate": 9.062176165803108e-07, "loss": 0.0, "reward": 1.9987398982048035, "reward_std": 0.0006881913602967415, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498740017414093, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 0.9430051813471503, "grad_norm": 50.30079137772382, "kl": 0.40478515625, "learning_rate": 9.059585492227979e-07, "loss": 0.0018, "reward": 1.937167227268219, "reward_std": 0.17718254558712943, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4371671676635742, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 0.9455958549222798, "grad_norm": 104.06780316144935, "kl": 0.07373046875, "learning_rate": 9.056994818652849e-07, "loss": 0.0012, "reward": 1.8179743885993958, "reward_std": 0.1400307110416179, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3179743885993958, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 0.9481865284974094, "grad_norm": 83.27529841185802, "kl": 0.11328125, "learning_rate": 9.05440414507772e-07, "loss": 0.0011, "reward": 2.1393807530403137, "reward_std": 0.29861046785254075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6393807530403137, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9507772020725389, "grad_norm": 21.582388339329956, "kl": 0.0723876953125, "learning_rate": 9.051813471502591e-07, "loss": -0.0006, "reward": 1.9987910985946655, "reward_std": 1.44862519846356e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987912774085999, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9533678756476683, "grad_norm": 1.5179477933274999, "kl": 0.0594482421875, "learning_rate": 9.04922279792746e-07, "loss": 0.0, "reward": 1.9997722506523132, "reward_std": 8.410090231336653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997723698616028, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9559585492227979, "grad_norm": 29.422458287839596, "kl": 0.05364990234375, "learning_rate": 9.046632124352331e-07, "loss": 0.0009, "reward": 1.9995509386062622, "reward_std": 0.0003094946463306769, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995508790016174, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9585492227979274, "grad_norm": 0.5454563402653053, "kl": 0.1219482421875, "learning_rate": 9.044041450777201e-07, "loss": -0.0004, "reward": 2.499989628791809, "reward_std": 3.767214707295352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.961139896373057, "grad_norm": 0.9719703172323875, "kl": 0.0689697265625, "learning_rate": 9.041450777202072e-07, "loss": 0.0004, "reward": 2.4999566078186035, "reward_std": 1.0819015187735204e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999566078186035, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9637305699481865, "grad_norm": 0.48035294193102046, "kl": 0.1015625, "learning_rate": 9.038860103626943e-07, "loss": -0.0002, "reward": 2.4999879598617554, "reward_std": 3.2277424111271102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.966321243523316, "grad_norm": 14.174581515389617, "kl": 0.149658203125, "learning_rate": 9.036269430051813e-07, "loss": -0.0012, "reward": 2.499988317489624, "reward_std": 1.088481485567172e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9689119170984456, "grad_norm": 12.685892717933179, "kl": 0.0614013671875, "learning_rate": 9.033678756476683e-07, "loss": -0.0001, "reward": 1.9979270696640015, "reward_std": 3.1579536880599335e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497927188873291, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.9715025906735751, "grad_norm": 2.778538104881977, "kl": 0.075439453125, "learning_rate": 9.031088082901554e-07, "loss": 0.0006, "reward": 2.4998711347579956, "reward_std": 2.0717866846098332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998711347579956, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 0.9740932642487047, "grad_norm": 7.72037507535947, "kl": 0.0986328125, "learning_rate": 9.028497409326424e-07, "loss": 0.0001, "reward": 2.4999849796295166, "reward_std": 5.301843089000613e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850392341614, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9766839378238342, "grad_norm": 60.81711602963703, "kl": 0.136962890625, "learning_rate": 9.025906735751295e-07, "loss": 0.0007, "reward": 2.3437013626098633, "reward_std": 0.4419448544445004, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.874951422214508, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9792746113989638, "grad_norm": 0.3299355142878522, "kl": 0.1153564453125, "learning_rate": 9.023316062176165e-07, "loss": 0.0005, "reward": 2.499995708465576, "reward_std": 1.985798633086233e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 0.9818652849740933, "grad_norm": 0.31302396124451454, "kl": 0.1279296875, "learning_rate": 9.020725388601036e-07, "loss": 0.0005, "reward": 2.4999958276748657, "reward_std": 1.7543504782224772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.9844559585492227, "grad_norm": 24.325849383190917, "kl": 0.11181640625, "learning_rate": 9.018134715025906e-07, "loss": 0.0005, "reward": 2.437013030052185, "reward_std": 0.17696355968655553, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9370129108428955, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.9870466321243523, "grad_norm": 7.892168497896422, "kl": 0.134765625, "learning_rate": 9.015544041450776e-07, "loss": 0.0008, "reward": 1.9984101057052612, "reward_std": 2.598165337985847e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984100759029388, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 0.9896373056994818, "grad_norm": 0.12639145848943747, "kl": 0.03680419921875, "learning_rate": 9.012953367875647e-07, "loss": -0.0004, "reward": 2.4999992847442627, "reward_std": 7.902503398327099e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999995231628418, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 0.9922279792746114, "grad_norm": 6.604066305838429, "kl": 0.064208984375, "learning_rate": 9.010362694300517e-07, "loss": -0.0002, "reward": 2.49996554851532, "reward_std": 2.8886501922897878e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999655485153198, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 34.25, "epoch": 0.9948186528497409, "grad_norm": 35.23091864141904, "kl": 0.14068603515625, "learning_rate": 9.007772020725389e-07, "loss": 0.0009, "reward": 1.995453953742981, "reward_std": 0.0029676412481194347, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4954538643360138, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 0.9974093264248705, "grad_norm": 77.43882101210683, "kl": 0.08740234375, "learning_rate": 9.00518134715026e-07, "loss": 0.0004, "reward": 1.9996147155761719, "reward_std": 0.3537828028202057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996147751808167, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.0, "grad_norm": 0.39455606315651964, "kl": 0.14013671875, "learning_rate": 9.002590673575129e-07, "loss": 0.0004, "reward": 2.4999988079071045, "reward_std": 1.5925973571029317e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0025906735751295, "grad_norm": 27.93992197213771, "kl": 0.1181640625, "learning_rate": 9e-07, "loss": 0.0005, "reward": 1.8049081563949585, "reward_std": 0.26289142668247223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.304908275604248, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.005181347150259, "grad_norm": 78.09932939848163, "kl": 0.15283203125, "learning_rate": 8.99740932642487e-07, "loss": 0.0012, "reward": 2.062077045440674, "reward_std": 0.1768815812278035, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5620769262313843, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0077720207253886, "grad_norm": 0.42449608013010753, "kl": 0.115478515625, "learning_rate": 8.994818652849741e-07, "loss": 0.0005, "reward": 2.4999442100524902, "reward_std": 4.055633780808421e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999443292617798, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0103626943005182, "grad_norm": 5.91364886020048, "kl": 0.08642578125, "learning_rate": 8.992227979274612e-07, "loss": -0.0004, "reward": 2.499927043914795, "reward_std": 1.4638307163750142e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999269843101501, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0129533678756477, "grad_norm": 407.99797627916894, "kl": 0.09033203125, "learning_rate": 8.989637305699482e-07, "loss": -0.0005, "reward": 2.4367836713790894, "reward_std": 0.17879251341669544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9367839097976685, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0155440414507773, "grad_norm": 1.5800756359407757, "kl": 0.0679931640625, "learning_rate": 8.987046632124352e-07, "loss": 0.0008, "reward": 2.4999600648880005, "reward_std": 1.0461450870025146e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999601244926453, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0181347150259068, "grad_norm": 0.30021282422788814, "kl": 0.115478515625, "learning_rate": 8.984455958549222e-07, "loss": 0.0006, "reward": 2.499990463256836, "reward_std": 3.2136514391822857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0207253886010363, "grad_norm": 4.4393182473073285, "kl": 0.049560546875, "learning_rate": 8.981865284974093e-07, "loss": -0.0001, "reward": 2.499978542327881, "reward_std": 9.545959187562403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999785423278809, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0233160621761659, "grad_norm": 4.048947856971386, "kl": 0.08056640625, "learning_rate": 8.979274611398964e-07, "loss": -0.0002, "reward": 2.4999908208847046, "reward_std": 9.642950658417249e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0259067357512954, "grad_norm": 0.10696531565404, "kl": 0.08447265625, "learning_rate": 8.976683937823834e-07, "loss": 0.0009, "reward": 2.4999988079071045, "reward_std": 9.770348015081254e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.028497409326425, "grad_norm": 28.26823879348509, "kl": 4.21484375, "learning_rate": 8.974093264248705e-07, "loss": 0.0161, "reward": 2.12465238571167, "reward_std": 0.5670378761615211, "rewards/format_reward_rec": 0.875, "rewards/point_reward": 1.6871524453163147, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 1.0310880829015545, "grad_norm": 44.52193042461309, "kl": 0.4208984375, "learning_rate": 8.971502590673574e-07, "loss": 0.0009, "reward": 2.020863175392151, "reward_std": 0.19359701108623995, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5208631753921509, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.0336787564766838, "grad_norm": 15.466613267417598, "kl": 0.132080078125, "learning_rate": 8.968911917098445e-07, "loss": 0.0008, "reward": 2.2499775886535645, "reward_std": 0.26726902516281825, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499774098396301, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.0362694300518134, "grad_norm": 31.74878005086349, "kl": 0.1572265625, "learning_rate": 8.966321243523316e-07, "loss": 0.0008, "reward": 2.4999704360961914, "reward_std": 4.443721172719961e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.038860103626943, "grad_norm": 0.7896260532318355, "kl": 0.48193359375, "learning_rate": 8.963730569948186e-07, "loss": 0.0021, "reward": 2.4999923706054688, "reward_std": 1.165152184512408e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0414507772020725, "grad_norm": 0.07177787029553469, "kl": 0.0709228515625, "learning_rate": 8.961139896373057e-07, "loss": -0.0007, "reward": 2.4999982118606567, "reward_std": 1.2793494192919752e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.044041450777202, "grad_norm": 2.6256143014851956, "kl": 0.12548828125, "learning_rate": 8.958549222797928e-07, "loss": 0.0002, "reward": 2.499992847442627, "reward_std": 1.1741248272301164e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0466321243523315, "grad_norm": 10.497894106375623, "kl": 0.1708984375, "learning_rate": 8.955958549222797e-07, "loss": 0.0002, "reward": 2.4303700923919678, "reward_std": 0.19693740084039746, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9303700923919678, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.049222797927461, "grad_norm": 6.896369178768509, "kl": 0.051513671875, "learning_rate": 8.953367875647668e-07, "loss": 0.0002, "reward": 2.0565009713172913, "reward_std": 0.17914094313570672, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5565009117126465, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0518134715025906, "grad_norm": 43.64665359616527, "kl": 0.105712890625, "learning_rate": 8.950777202072538e-07, "loss": 0.001, "reward": 1.9859212636947632, "reward_std": 0.00043847410643138574, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4859214425086975, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.0544041450777202, "grad_norm": 3.7514510778873214, "kl": 0.140625, "learning_rate": 8.948186528497409e-07, "loss": 0.0009, "reward": 2.4999618530273438, "reward_std": 1.215601493242957e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999618530273438, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0569948186528497, "grad_norm": 0.33350622045235956, "kl": 0.092041015625, "learning_rate": 8.94559585492228e-07, "loss": 0.0014, "reward": 2.499992847442627, "reward_std": 2.475089104336803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0595854922279793, "grad_norm": 0.22041465229516474, "kl": 0.12158203125, "learning_rate": 8.94300518134715e-07, "loss": 0.0002, "reward": 2.4999959468841553, "reward_std": 1.6094693933155213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0621761658031088, "grad_norm": 81.53732926487953, "kl": 0.0968017578125, "learning_rate": 8.94041450777202e-07, "loss": 0.001, "reward": 1.9990376234054565, "reward_std": 0.0004494970630730677, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990374445915222, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0647668393782384, "grad_norm": 417.94326039059274, "kl": 0.150390625, "learning_rate": 8.93782383419689e-07, "loss": 0.0007, "reward": 1.739211082458496, "reward_std": 0.2786605658211556, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2392111420631409, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.067357512953368, "grad_norm": 12.622790924108282, "kl": 0.170166015625, "learning_rate": 8.935233160621761e-07, "loss": 0.001, "reward": 1.998087465763092, "reward_std": 3.326684196736096e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980872869491577, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.0699481865284974, "grad_norm": 58.75209329973357, "kl": 0.115478515625, "learning_rate": 8.932642487046632e-07, "loss": 0.0006, "reward": 2.400259017944336, "reward_std": 0.2820926400289636, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9002589583396912, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.072538860103627, "grad_norm": 9.20560961431424, "kl": 0.0828857421875, "learning_rate": 8.930051813471502e-07, "loss": 0.0004, "reward": 2.4999626874923706, "reward_std": 2.5570667162355676e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999626874923706, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 1.0751295336787565, "grad_norm": 22.774684195504737, "kl": 0.13720703125, "learning_rate": 8.927461139896373e-07, "loss": 0.0003, "reward": 1.9740850925445557, "reward_std": 0.025580175279174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4740851521492004, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.077720207253886, "grad_norm": 0.4648953456206473, "kl": 0.0804443359375, "learning_rate": 8.924870466321242e-07, "loss": 0.0002, "reward": 2.499995470046997, "reward_std": 2.4615105758130085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0803108808290156, "grad_norm": 26.40377335945731, "kl": 0.10107421875, "learning_rate": 8.922279792746113e-07, "loss": -0.0003, "reward": 1.9838183522224426, "reward_std": 0.015582584572257474, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4838182926177979, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0829015544041452, "grad_norm": 90.75785815686979, "kl": 0.155029296875, "learning_rate": 8.919689119170984e-07, "loss": 0.0008, "reward": 1.9772456884384155, "reward_std": 0.008828636850012117, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4772456884384155, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0854922279792747, "grad_norm": 257.53600374977816, "kl": 0.07568359375, "learning_rate": 8.917098445595854e-07, "loss": 0.0003, "reward": 1.7357445359230042, "reward_std": 0.2677098226849921, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.235744595527649, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.0880829015544042, "grad_norm": 0.29778990538808714, "kl": 0.0621337890625, "learning_rate": 8.914507772020725e-07, "loss": 0.0005, "reward": 2.4999656677246094, "reward_std": 3.783144506996905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999656081199646, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0906735751295338, "grad_norm": 72.02617062099873, "kl": 0.09375, "learning_rate": 8.911917098445595e-07, "loss": 0.0005, "reward": 2.3123912811279297, "reward_std": 0.2589216949972979, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8123913407325745, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.093264248704663, "grad_norm": 7.580179253156264, "kl": 0.1044921875, "learning_rate": 8.909326424870465e-07, "loss": 0.0006, "reward": 1.9999219179153442, "reward_std": 1.1002060546161374e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49992173910141, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.0958549222797926, "grad_norm": 2.4919894278984396, "kl": 0.0460205078125, "learning_rate": 8.906735751295336e-07, "loss": -0.0002, "reward": 2.49999463558197, "reward_std": 7.017026547373462e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.0984455958549222, "grad_norm": 13.513076731364253, "kl": 0.092041015625, "learning_rate": 8.904145077720206e-07, "loss": 0.0003, "reward": 1.7202502489089966, "reward_std": 0.0008438759050477529, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2202502489089966, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.1010362694300517, "grad_norm": 0.6303763190684515, "kl": 0.157958984375, "learning_rate": 8.901554404145077e-07, "loss": 0.0006, "reward": 2.499995470046997, "reward_std": 4.150169672811899e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 1.1036269430051813, "grad_norm": 130.92885317581852, "kl": 0.07666015625, "learning_rate": 8.898963730569949e-07, "loss": 0.001, "reward": 1.8611122965812683, "reward_std": 0.2581528257969694, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.361112117767334, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1062176165803108, "grad_norm": 2.327998252567789, "kl": 0.077880859375, "learning_rate": 8.896373056994819e-07, "loss": 0.0001, "reward": 2.4999914169311523, "reward_std": 8.450096345313796e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 1.1088082901554404, "grad_norm": 4.498881975499488, "kl": 0.16650390625, "learning_rate": 8.893782383419689e-07, "loss": 0.0004, "reward": 2.499961018562317, "reward_std": 2.298750814588857e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999611377716064, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.11139896373057, "grad_norm": 0.1715745423148994, "kl": 0.0966796875, "learning_rate": 8.891191709844559e-07, "loss": 0.0004, "reward": 2.4999970197677612, "reward_std": 2.584576861863752e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.1139896373056994, "grad_norm": 0.2587929534288508, "kl": 0.127197265625, "learning_rate": 8.88860103626943e-07, "loss": 0.0001, "reward": 2.4999966621398926, "reward_std": 1.0830531067540505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.116580310880829, "grad_norm": 3.392796252003056, "kl": 0.09521484375, "learning_rate": 8.886010362694301e-07, "loss": -0.0002, "reward": 2.4999715089797974, "reward_std": 1.6701092931725725e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715685844421, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 1.1191709844559585, "grad_norm": 66.24620245667927, "kl": 0.29638671875, "learning_rate": 8.883419689119171e-07, "loss": 0.0012, "reward": 1.4852866530418396, "reward_std": 0.013451165985316038, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9852865636348724, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.121761658031088, "grad_norm": 0.8980414182681852, "kl": 0.0533447265625, "learning_rate": 8.880829015544042e-07, "loss": -0.0003, "reward": 2.49997615814209, "reward_std": 8.914175850804895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999761581420898, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1243523316062176, "grad_norm": 0.10466006766160264, "kl": 0.0859375, "learning_rate": 8.878238341968911e-07, "loss": 0.0003, "reward": 2.499997138977051, "reward_std": 1.197195842905785e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1269430051813472, "grad_norm": 56.64892976696522, "kl": 0.0986328125, "learning_rate": 8.875647668393782e-07, "loss": 0.0002, "reward": 1.892016589641571, "reward_std": 0.3051476856244335, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3920166790485382, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.1295336787564767, "grad_norm": 86.73903645096914, "kl": 0.156005859375, "learning_rate": 8.873056994818653e-07, "loss": 0.0006, "reward": 1.9997649192810059, "reward_std": 9.56018175202189e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499765008687973, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1321243523316062, "grad_norm": 0.6464813836860527, "kl": 0.097412109375, "learning_rate": 8.870466321243523e-07, "loss": 0.0013, "reward": 2.4999921321868896, "reward_std": 3.5397972624195972e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1347150259067358, "grad_norm": 0.24719754877821865, "kl": 0.068359375, "learning_rate": 8.867875647668394e-07, "loss": 0.0012, "reward": 2.499998092651367, "reward_std": 2.3967554625414778e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.1373056994818653, "grad_norm": 13.79375585716135, "kl": 0.09375, "learning_rate": 8.865284974093264e-07, "loss": 0.0008, "reward": 1.9852967262268066, "reward_std": 0.001258290941677842, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4852966964244843, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1398963730569949, "grad_norm": 4.905607975061321, "kl": 0.067138671875, "learning_rate": 8.862694300518134e-07, "loss": 0.0007, "reward": 1.9001922607421875, "reward_std": 0.0001536047930130735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.400191992521286, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1424870466321244, "grad_norm": 2.1023873650679272, "kl": 0.054443359375, "learning_rate": 8.860103626943005e-07, "loss": 0.0003, "reward": 2.499992847442627, "reward_std": 6.464971988862089e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.145077720207254, "grad_norm": 0.13724521017587, "kl": 0.0723876953125, "learning_rate": 8.857512953367875e-07, "loss": -0.0006, "reward": 2.4999979734420776, "reward_std": 9.475184441498641e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1476683937823835, "grad_norm": 34.30903798601372, "kl": 0.1748046875, "learning_rate": 8.854922279792746e-07, "loss": 0.0019, "reward": 2.4999849796295166, "reward_std": 2.4904329166020034e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999848008155823, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.150259067357513, "grad_norm": 4.45419009327568, "kl": 0.103759765625, "learning_rate": 8.852331606217616e-07, "loss": 0.0014, "reward": 2.499996542930603, "reward_std": 4.7254719390821265e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.1528497409326426, "grad_norm": 0.6394649645583231, "kl": 0.08172607421875, "learning_rate": 8.849740932642487e-07, "loss": 0.0015, "reward": 2.4999897480010986, "reward_std": 1.7348398841932067e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 1.1554404145077721, "grad_norm": 73.74568146481987, "kl": 0.094970703125, "learning_rate": 8.847150259067357e-07, "loss": 0.0004, "reward": 1.674540638923645, "reward_std": 0.3161500170826912, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1745406091213226, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 1.1580310880829017, "grad_norm": 90.57154886616541, "kl": 0.0751953125, "learning_rate": 8.844559585492227e-07, "loss": 0.0003, "reward": 1.3530938029289246, "reward_std": 0.18028530236915685, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8530937731266022, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.160621761658031, "grad_norm": 0.17610403103940198, "kl": 0.0701904296875, "learning_rate": 8.841968911917098e-07, "loss": 0.0003, "reward": 2.4999970197677612, "reward_std": 1.3332831656498456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1632124352331605, "grad_norm": 0.6657898278232692, "kl": 0.21649169921875, "learning_rate": 8.839378238341969e-07, "loss": -0.0006, "reward": 2.499995708465576, "reward_std": 4.492713230774825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.16580310880829, "grad_norm": 13.036435514484477, "kl": 0.09765625, "learning_rate": 8.836787564766839e-07, "loss": 0.0002, "reward": 2.4374842643737793, "reward_std": 0.17680859067291976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374842047691345, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1683937823834196, "grad_norm": 1.1405571470651923, "kl": 0.101318359375, "learning_rate": 8.83419689119171e-07, "loss": 0.0008, "reward": 2.4999808073043823, "reward_std": 6.84925453242613e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999808073043823, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1709844559585492, "grad_norm": 0.21700780087226967, "kl": 0.11767578125, "learning_rate": 8.831606217616579e-07, "loss": 0.0008, "reward": 2.4999958276748657, "reward_std": 2.805690996865451e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1735751295336787, "grad_norm": 28.29878720243635, "kl": 0.0584716796875, "learning_rate": 8.82901554404145e-07, "loss": 0.0005, "reward": 1.8530709147453308, "reward_std": 0.02618713528443095, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3530707955360413, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1761658031088082, "grad_norm": 23.0520703043227, "kl": 0.1270751953125, "learning_rate": 8.826424870466321e-07, "loss": 0.0012, "reward": 2.0300532579421997, "reward_std": 0.18988699556376787, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5300532579421997, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.1787564766839378, "grad_norm": 42.59667017674438, "kl": 0.2431640625, "learning_rate": 8.823834196891191e-07, "loss": 0.001, "reward": 1.9351577162742615, "reward_std": 0.1813949552597478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4351578652858734, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1813471502590673, "grad_norm": 3.1976299761041713, "kl": 0.097564697265625, "learning_rate": 8.821243523316062e-07, "loss": 0.0008, "reward": 2.4999502897262573, "reward_std": 2.2653130599792348e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999950349330902, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.1839378238341969, "grad_norm": 1.7616212918628507, "kl": 0.14697265625, "learning_rate": 8.818652849740932e-07, "loss": -0.0005, "reward": 2.499990940093994, "reward_std": 7.570316370220098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911189079285, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 1.1865284974093264, "grad_norm": 17.806489766423955, "kl": 0.357177734375, "learning_rate": 8.816062176165802e-07, "loss": 0.0009, "reward": 2.436825752258301, "reward_std": 0.17811651074771362, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9368258714675903, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 1.189119170984456, "grad_norm": 45.06925648484607, "kl": 0.16015625, "learning_rate": 8.813471502590673e-07, "loss": -0.0002, "reward": 2.3625733852386475, "reward_std": 0.2544594280141155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.862573504447937, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.1917098445595855, "grad_norm": 0.2893301664645233, "kl": 0.0760498046875, "learning_rate": 8.810880829015543e-07, "loss": 0.0016, "reward": 2.4999966621398926, "reward_std": 3.371640161731193e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.194300518134715, "grad_norm": 0.3122314874399316, "kl": 0.06396484375, "learning_rate": 8.808290155440414e-07, "loss": 0.0001, "reward": 2.49999463558197, "reward_std": 5.715547217732819e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.1968911917098446, "grad_norm": 0.9788897999342062, "kl": 0.0633544921875, "learning_rate": 8.805699481865284e-07, "loss": 0.0002, "reward": 2.4999680519104004, "reward_std": 6.617727194679901e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999680519104004, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.1994818652849741, "grad_norm": 0.18570239241635855, "kl": 0.087158203125, "learning_rate": 8.803108808290155e-07, "loss": -0.0008, "reward": 2.499997854232788, "reward_std": 1.1072355903252173e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.2020725388601037, "grad_norm": 0.10444988339313646, "kl": 0.011444091796875, "learning_rate": 8.800518134715025e-07, "loss": 0.0018, "reward": 2.4999985694885254, "reward_std": 1.4295797541308275e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2046632124352332, "grad_norm": 38.26699080789075, "kl": 0.1170654296875, "learning_rate": 8.797927461139895e-07, "loss": 0.0004, "reward": 1.9998422861099243, "reward_std": 2.3600002492685235e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998424053192139, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2072538860103628, "grad_norm": 17.141246542347844, "kl": 0.09912109375, "learning_rate": 8.795336787564766e-07, "loss": 0.0003, "reward": 1.979422926902771, "reward_std": 0.00020118385555178975, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4794228076934814, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2098445595854923, "grad_norm": 0.2526893822344503, "kl": 0.220703125, "learning_rate": 8.792746113989636e-07, "loss": 0.0015, "reward": 2.499997854232788, "reward_std": 1.947460702922399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 1.2124352331606219, "grad_norm": 31.852021532818366, "kl": 0.14569091796875, "learning_rate": 8.790155440414507e-07, "loss": 0.0013, "reward": 1.9521268010139465, "reward_std": 0.0022770423521478733, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4521267116069794, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2150259067357512, "grad_norm": 1.0281744349665374, "kl": 0.103271484375, "learning_rate": 8.787564766839379e-07, "loss": 0.001, "reward": 1.9993879795074463, "reward_std": 1.2945804314767884e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993878901004791, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.2176165803108807, "grad_norm": 0.43410273744995626, "kl": 0.0716552734375, "learning_rate": 8.784974093264247e-07, "loss": 0.0004, "reward": 2.4999778270721436, "reward_std": 7.111955937944003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999777674674988, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2202072538860103, "grad_norm": 50.59271171474758, "kl": 0.077392578125, "learning_rate": 8.782383419689119e-07, "loss": 0.0003, "reward": 2.3740792274475098, "reward_std": 0.3561514914035797, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874079406261444, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2227979274611398, "grad_norm": 0.7713101538824298, "kl": 0.0872802734375, "learning_rate": 8.77979274611399e-07, "loss": 0.0006, "reward": 2.4999958276748657, "reward_std": 3.7912550112650933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2253886010362693, "grad_norm": 21.082820584773735, "kl": 0.17462158203125, "learning_rate": 8.77720207253886e-07, "loss": -0.0002, "reward": 1.9570937156677246, "reward_std": 0.025583247080476212, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4570938348770142, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2279792746113989, "grad_norm": 12.332295672892807, "kl": 0.12548828125, "learning_rate": 8.774611398963731e-07, "loss": 0.0004, "reward": 1.9994518756866455, "reward_std": 2.252896024401707e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994519352912903, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2305699481865284, "grad_norm": 12.085946387998836, "kl": 0.074951171875, "learning_rate": 8.772020725388601e-07, "loss": 0.0, "reward": 2.499893546104431, "reward_std": 4.6301764086820185e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999893605709076, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.233160621761658, "grad_norm": 8.910241412807926, "kl": 0.0848388671875, "learning_rate": 8.769430051813471e-07, "loss": 0.0005, "reward": 2.499955177307129, "reward_std": 6.667927027592668e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999550580978394, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2357512953367875, "grad_norm": 18.492399334575982, "kl": 0.1121826171875, "learning_rate": 8.766839378238342e-07, "loss": 0.0007, "reward": 2.437414765357971, "reward_std": 0.1769445626981394, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374147057533264, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 1.238341968911917, "grad_norm": 15.299927589119523, "kl": 0.13818359375, "learning_rate": 8.764248704663212e-07, "loss": 0.0007, "reward": 1.8968449831008911, "reward_std": 0.04106860855790728, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3968449234962463, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 33.6875, "epoch": 1.2409326424870466, "grad_norm": 77.00274354527039, "kl": 0.1083984375, "learning_rate": 8.761658031088083e-07, "loss": 0.0004, "reward": 2.3105857372283936, "reward_std": 0.26142529569824546, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8105856776237488, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2435233160621761, "grad_norm": 0.6616120982092444, "kl": 0.089599609375, "learning_rate": 8.759067357512953e-07, "loss": 0.0009, "reward": 2.4999921321868896, "reward_std": 4.195728251943365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.2461139896373057, "grad_norm": 3.2939424001045627, "kl": 0.082275390625, "learning_rate": 8.756476683937824e-07, "loss": 0.001, "reward": 2.4999918937683105, "reward_std": 2.6570181432816753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2487046632124352, "grad_norm": 28.58802241475212, "kl": 0.072998046875, "learning_rate": 8.753886010362695e-07, "loss": 0.0006, "reward": 1.7892868518829346, "reward_std": 0.00017940342308975232, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.289286881685257, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2512953367875648, "grad_norm": 6.292983622135817, "kl": 0.1826171875, "learning_rate": 8.751295336787564e-07, "loss": 0.0008, "reward": 1.4994452595710754, "reward_std": 5.78963736188598e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9994453489780426, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2538860103626943, "grad_norm": 2.064790038323725, "kl": 0.0653076171875, "learning_rate": 8.748704663212435e-07, "loss": 0.0005, "reward": 2.499991297721863, "reward_std": 1.1068362482546945e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2564766839378239, "grad_norm": 38.42058225974973, "kl": 0.2515869140625, "learning_rate": 8.746113989637305e-07, "loss": 0.0014, "reward": 2.3749624490737915, "reward_std": 0.23151093343540197, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874962329864502, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.2590673575129534, "grad_norm": 21.885704126952792, "kl": 0.218505859375, "learning_rate": 8.743523316062176e-07, "loss": 0.0003, "reward": 1.7940752506256104, "reward_std": 0.03148522444280388, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2940754890441895, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.261658031088083, "grad_norm": 1.1747920062815884, "kl": 0.13525390625, "learning_rate": 8.740932642487047e-07, "loss": -0.0007, "reward": 2.4999890327453613, "reward_std": 1.1638317744200322e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 1.2642487046632125, "grad_norm": 24.412740604956966, "kl": 0.1484375, "learning_rate": 8.738341968911916e-07, "loss": 0.0006, "reward": 2.3748098611831665, "reward_std": 0.35388536751270294, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748098015785217, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.266839378238342, "grad_norm": 32.23039537284305, "kl": 0.139892578125, "learning_rate": 8.735751295336787e-07, "loss": 0.0001, "reward": 2.312269926071167, "reward_std": 0.25883071099815425, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122699856758118, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2694300518134716, "grad_norm": 3.342467593195098, "kl": 0.10400390625, "learning_rate": 8.733160621761657e-07, "loss": 0.0014, "reward": 2.499990940093994, "reward_std": 1.0768087861379172e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.2720207253886011, "grad_norm": 6.411567545415011, "kl": 0.0732421875, "learning_rate": 8.730569948186528e-07, "loss": 0.0003, "reward": 2.4999295473098755, "reward_std": 5.353304868549458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999294877052307, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.2746113989637307, "grad_norm": 154.2222808395924, "kl": 0.11083984375, "learning_rate": 8.727979274611399e-07, "loss": 0.0009, "reward": 2.374971866607666, "reward_std": 0.23148871652870184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749717473983765, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 1.2772020725388602, "grad_norm": 35.835700660601134, "kl": 0.0828857421875, "learning_rate": 8.725388601036269e-07, "loss": 0.0, "reward": 2.1736336946487427, "reward_std": 0.27024298158335114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6736337542533875, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2797927461139897, "grad_norm": 2.676829314252972, "kl": 0.15869140625, "learning_rate": 8.72279792746114e-07, "loss": 0.0004, "reward": 2.499984860420227, "reward_std": 1.3475339756041649e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999849200248718, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2823834196891193, "grad_norm": 7.331169443033464, "kl": 0.14501953125, "learning_rate": 8.720207253886009e-07, "loss": 0.0002, "reward": 2.4999111890792847, "reward_std": 3.078954341617646e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99991112947464, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2849740932642488, "grad_norm": 45.42929805913324, "kl": 0.1204833984375, "learning_rate": 8.71761658031088e-07, "loss": 0.0006, "reward": 1.4907369017601013, "reward_std": 0.00031247303559212014, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9907369017601013, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2875647668393784, "grad_norm": 32.718935169936096, "kl": 0.135498046875, "learning_rate": 8.715025906735751e-07, "loss": 0.0005, "reward": 1.4990538954734802, "reward_std": 0.00022890909895068035, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9990538656711578, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.2901554404145077, "grad_norm": 4.111037862460617, "kl": 0.1314697265625, "learning_rate": 8.712435233160621e-07, "loss": 0.0007, "reward": 1.9187270402908325, "reward_std": 0.0003245865591452457, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4187270402908325, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.2927461139896372, "grad_norm": 0.21846995165919592, "kl": 0.072998046875, "learning_rate": 8.709844559585492e-07, "loss": -0.0004, "reward": 2.4999977350234985, "reward_std": 1.2924656971335935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.2953367875647668, "grad_norm": 0.22255898956838358, "kl": 0.14990234375, "learning_rate": 8.707253886010363e-07, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 1.551056698190223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.2979274611398963, "grad_norm": 19.646926760605268, "kl": 0.15966796875, "learning_rate": 8.704663212435232e-07, "loss": 0.0006, "reward": 2.4998767375946045, "reward_std": 5.471971962833777e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998767375946045, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3005181347150259, "grad_norm": 91.19680000980189, "kl": 0.086669921875, "learning_rate": 8.702072538860103e-07, "loss": -0.0002, "reward": 2.374948740005493, "reward_std": 0.2315441130643876, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749488592147827, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.3031088082901554, "grad_norm": 1.6592695638499924, "kl": 0.052490234375, "learning_rate": 8.699481865284973e-07, "loss": -0.0017, "reward": 2.499985456466675, "reward_std": 8.230225830629934e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985694885254, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.305699481865285, "grad_norm": 63.17525684276914, "kl": 0.12225341796875, "learning_rate": 8.696891191709844e-07, "loss": 0.0009, "reward": 2.249861478805542, "reward_std": 0.2674036819310004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498613595962524, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3082901554404145, "grad_norm": 4.3793934638945995, "kl": 0.1055908203125, "learning_rate": 8.694300518134715e-07, "loss": 0.0001, "reward": 2.499991536140442, "reward_std": 9.018640184876858e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.310880829015544, "grad_norm": 14.794700219470856, "kl": 0.105224609375, "learning_rate": 8.691709844559585e-07, "loss": 0.0001, "reward": 2.4374682903289795, "reward_std": 0.17682419877246502, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374682903289795, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.3134715025906736, "grad_norm": 237.4234872129574, "kl": 0.1923828125, "learning_rate": 8.689119170984455e-07, "loss": 0.0008, "reward": 1.8867421746253967, "reward_std": 0.27173711359500885, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3867421448230743, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3160621761658031, "grad_norm": 112.39196712868807, "kl": 0.085205078125, "learning_rate": 8.686528497409325e-07, "loss": 0.0003, "reward": 2.437467336654663, "reward_std": 0.1768370179406702, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374674558639526, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3186528497409327, "grad_norm": 1.2578278527512097, "kl": 0.1103515625, "learning_rate": 8.683937823834196e-07, "loss": 0.0006, "reward": 2.499987840652466, "reward_std": 5.7642498632048955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3212435233160622, "grad_norm": 0.43174782510600457, "kl": 0.0540771484375, "learning_rate": 8.681347150259068e-07, "loss": 0.0008, "reward": 2.4999935626983643, "reward_std": 5.187495048630808e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3238341968911918, "grad_norm": 5.038778187731034, "kl": 0.114501953125, "learning_rate": 8.678756476683938e-07, "loss": 0.0003, "reward": 1.8212202191352844, "reward_std": 0.00020774168297066353, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3212201595306396, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.3264248704663213, "grad_norm": 52.99931971886971, "kl": 0.1025390625, "learning_rate": 8.676165803108809e-07, "loss": 0.0005, "reward": 1.4864550828933716, "reward_std": 0.017443951954192016, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9864550232887268, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.3290155440414508, "grad_norm": 0.9693997196557682, "kl": 0.0501708984375, "learning_rate": 8.673575129533677e-07, "loss": 0.0008, "reward": 2.499989151954651, "reward_std": 7.076597967170528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989092350006, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3316062176165804, "grad_norm": 2.1328251997079617, "kl": 0.14794921875, "learning_rate": 8.670984455958549e-07, "loss": 0.0005, "reward": 1.9984451532363892, "reward_std": 3.9576490735271364e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984452426433563, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.33419689119171, "grad_norm": 1.03276895354779, "kl": 0.0906982421875, "learning_rate": 8.66839378238342e-07, "loss": -0.0004, "reward": 2.499991297721863, "reward_std": 2.4609428805888456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.3367875647668392, "grad_norm": 137.73385327658656, "kl": 0.0927734375, "learning_rate": 8.66580310880829e-07, "loss": 0.0004, "reward": 1.8059165477752686, "reward_std": 0.0012951576063642278, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.305916428565979, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3393782383419688, "grad_norm": 2.007081933441151, "kl": 0.121337890625, "learning_rate": 8.663212435233161e-07, "loss": 0.0009, "reward": 2.4999887943267822, "reward_std": 1.0784114465423045e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.3419689119170983, "grad_norm": 1.8378451582766666, "kl": 0.078369140625, "learning_rate": 8.660621761658031e-07, "loss": -0.0001, "reward": 2.49996280670166, "reward_std": 1.911813257038375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999628067016602, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.3445595854922279, "grad_norm": 0.3779511835398549, "kl": 0.06488037109375, "learning_rate": 8.658031088082901e-07, "loss": 0.0005, "reward": 2.499882221221924, "reward_std": 7.627893637618399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999882161617279, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3471502590673574, "grad_norm": 134.17842802835654, "kl": 0.06591796875, "learning_rate": 8.655440414507772e-07, "loss": 0.0009, "reward": 1.999657690525055, "reward_std": 8.30407136049871e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996576309204102, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.349740932642487, "grad_norm": 17.932654112913696, "kl": 0.17724609375, "learning_rate": 8.652849740932642e-07, "loss": 0.0007, "reward": 1.9529168605804443, "reward_std": 0.0004509269642767322, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4529170393943787, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3523316062176165, "grad_norm": 0.8370015530703874, "kl": 0.100341796875, "learning_rate": 8.650259067357513e-07, "loss": 0.0003, "reward": 2.4999746084213257, "reward_std": 3.4588146036185208e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999748468399048, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.354922279792746, "grad_norm": 1.1187172745983656, "kl": 0.0562744140625, "learning_rate": 8.647668393782384e-07, "loss": 0.0003, "reward": 2.4999914169311523, "reward_std": 5.636387641061447e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 1.3575129533678756, "grad_norm": 5.475814554760311, "kl": 0.111083984375, "learning_rate": 8.645077720207254e-07, "loss": -0.0006, "reward": 1.921807050704956, "reward_std": 0.04307932459528274, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4218071699142456, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 33.9375, "epoch": 1.3601036269430051, "grad_norm": 27.31737565998964, "kl": 0.34423828125, "learning_rate": 8.642487046632124e-07, "loss": 0.0015, "reward": 1.0622394680976868, "reward_std": 0.5786689094893518, "rewards/format_reward_rec": 0.625, "rewards/point_reward": 0.7497394382953644, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 1.3626943005181347, "grad_norm": 18.04920430476347, "kl": 0.218505859375, "learning_rate": 8.639896373056994e-07, "loss": 0.0013, "reward": 2.366421103477478, "reward_std": 0.24732503924860794, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.866421103477478, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.3652849740932642, "grad_norm": 1.5128707554351777, "kl": 0.06903076171875, "learning_rate": 8.637305699481865e-07, "loss": 0.0002, "reward": 1.9997016191482544, "reward_std": 1.0818416740221437e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997015297412872, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3678756476683938, "grad_norm": 17.892273651923272, "kl": 0.150146484375, "learning_rate": 8.634715025906736e-07, "loss": 0.0003, "reward": 1.9563751816749573, "reward_std": 0.08006607417064515, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4563751220703125, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3704663212435233, "grad_norm": 49.49759300623319, "kl": 0.121826171875, "learning_rate": 8.632124352331606e-07, "loss": 0.0004, "reward": 1.9985511898994446, "reward_std": 9.44735438679345e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985513389110565, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 33.625, "epoch": 1.3730569948186528, "grad_norm": 79.33228042959712, "kl": 0.102294921875, "learning_rate": 8.629533678756477e-07, "loss": 0.0006, "reward": 2.041730046272278, "reward_std": 0.18517519126544357, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.541729986667633, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3756476683937824, "grad_norm": 1.6022727320632366, "kl": 0.061279296875, "learning_rate": 8.626943005181346e-07, "loss": 0.0005, "reward": 2.499955415725708, "reward_std": 1.3513888006855268e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999955177307129, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.378238341968912, "grad_norm": 0.10301912494261409, "kl": 0.15478515625, "learning_rate": 8.624352331606217e-07, "loss": 0.0003, "reward": 2.4999966621398926, "reward_std": 1.4547185287483444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3808290155440415, "grad_norm": 16.777498631738734, "kl": 0.1572265625, "learning_rate": 8.621761658031088e-07, "loss": 0.0007, "reward": 1.9369465112686157, "reward_std": 0.17685012157380697, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4369465112686157, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.383419689119171, "grad_norm": 0.10887296905314073, "kl": 0.09130859375, "learning_rate": 8.619170984455958e-07, "loss": 0.0002, "reward": 2.4999756813049316, "reward_std": 2.958052277790557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999756217002869, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 36.375, "epoch": 1.3860103626943006, "grad_norm": 124.90872629360128, "kl": 0.1058349609375, "learning_rate": 8.616580310880829e-07, "loss": 0.0004, "reward": 2.2836925983428955, "reward_std": 0.4896298348903656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7836925983428955, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.38860103626943, "grad_norm": 197.87930575897707, "kl": 0.09130859375, "learning_rate": 8.613989637305699e-07, "loss": 0.0004, "reward": 1.623317539691925, "reward_std": 0.23245796479341152, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1233174204826355, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.3911917098445596, "grad_norm": 0.7908674683704721, "kl": 0.0897216796875, "learning_rate": 8.611398963730569e-07, "loss": 0.0009, "reward": 2.4999709129333496, "reward_std": 5.016916929889703e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999708533287048, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3937823834196892, "grad_norm": 0.5765236754049301, "kl": 0.080810546875, "learning_rate": 8.60880829015544e-07, "loss": 0.0004, "reward": 2.4999920129776, "reward_std": 4.623262384484406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 1.3963730569948187, "grad_norm": 28.323006954809713, "kl": 0.0325927734375, "learning_rate": 8.60621761658031e-07, "loss": 0.0002, "reward": 2.1874371767044067, "reward_std": 0.2588396147421008, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687437117099762, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.3989637305699483, "grad_norm": 0.5693864540919378, "kl": 0.13818359375, "learning_rate": 8.603626943005181e-07, "loss": 0.0008, "reward": 2.4999932050704956, "reward_std": 4.561453579299268e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4015544041450778, "grad_norm": 3.5784122996785745, "kl": 0.047607421875, "learning_rate": 8.601036269430051e-07, "loss": 0.0005, "reward": 2.4999213218688965, "reward_std": 1.5988063751137815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999213218688965, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 1.4041450777202074, "grad_norm": 48.66807689118439, "kl": 0.093017578125, "learning_rate": 8.598445595854922e-07, "loss": -0.0005, "reward": 2.0504003763198853, "reward_std": 0.2775210708193754, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.55040043592453, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.406735751295337, "grad_norm": 0.17729972908932326, "kl": 0.1243896484375, "learning_rate": 8.595854922279792e-07, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.2256335253368889e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4093264248704664, "grad_norm": 1.031157083983844, "kl": 0.057373046875, "learning_rate": 8.593264248704662e-07, "loss": -0.0002, "reward": 2.4999918937683105, "reward_std": 5.470992618938908e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.411917098445596, "grad_norm": 0.19099110947313502, "kl": 0.17041015625, "learning_rate": 8.590673575129533e-07, "loss": 0.0011, "reward": 2.4999969005584717, "reward_std": 2.149495315961758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.4145077720207253, "grad_norm": 15.449388787526583, "kl": 0.20703125, "learning_rate": 8.588082901554404e-07, "loss": 0.0005, "reward": 1.7858158349990845, "reward_std": 0.2423164664542128, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.285815954208374, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 1.4170984455958548, "grad_norm": 21.75149074064552, "kl": 0.28759765625, "learning_rate": 8.585492227979274e-07, "loss": 0.0007, "reward": 1.917125940322876, "reward_std": 0.257729121552984, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.417125940322876, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4196891191709844, "grad_norm": 2.9702775269444937, "kl": 0.0458984375, "learning_rate": 8.582901554404145e-07, "loss": 0.0, "reward": 2.499992251396179, "reward_std": 5.324892072167131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.422279792746114, "grad_norm": 3.810246069028726, "kl": 0.1162109375, "learning_rate": 8.580310880829014e-07, "loss": 0.0, "reward": 2.499987483024597, "reward_std": 1.6177102395431575e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 1.4248704663212435, "grad_norm": 15.54805209625986, "kl": 0.81787109375, "learning_rate": 8.577720207253885e-07, "loss": 0.0027, "reward": 2.2348156571388245, "reward_std": 0.28371294361329547, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7348155975341797, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 1.427461139896373, "grad_norm": 22.946519579451778, "kl": 0.10302734375, "learning_rate": 8.575129533678756e-07, "loss": 0.0001, "reward": 1.9609906673431396, "reward_std": 0.09214628321743135, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4609908163547516, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4300518134715026, "grad_norm": 20.4646282492878, "kl": 0.0438232421875, "learning_rate": 8.572538860103626e-07, "loss": 0.0005, "reward": 2.374976634979248, "reward_std": 0.23149483580868946, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874976634979248, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.432642487046632, "grad_norm": 0.39088450896059135, "kl": 0.05615234375, "learning_rate": 8.569948186528498e-07, "loss": 0.0002, "reward": 2.4999959468841553, "reward_std": 2.5724719421305053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4352331606217616, "grad_norm": 10.509556079498923, "kl": 0.0721435546875, "learning_rate": 8.567357512953368e-07, "loss": 0.0006, "reward": 1.9992655515670776, "reward_std": 3.6277227991377003e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992654919624329, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4378238341968912, "grad_norm": 0.6360938765933905, "kl": 0.10546875, "learning_rate": 8.564766839378238e-07, "loss": 0.0008, "reward": 2.4999654293060303, "reward_std": 7.2180823735834565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999655485153198, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4404145077720207, "grad_norm": 2.6689085455051393, "kl": 0.117431640625, "learning_rate": 8.562176165803109e-07, "loss": 0.0012, "reward": 2.4999722242355347, "reward_std": 9.116790636198857e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99997216463089, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4430051813471503, "grad_norm": 1.457392585775066, "kl": 0.0732421875, "learning_rate": 8.559585492227979e-07, "loss": -0.0005, "reward": 1.999876618385315, "reward_std": 6.7578973812487675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998767375946045, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4455958549222798, "grad_norm": 96.80446261646995, "kl": 0.0946044921875, "learning_rate": 8.55699481865285e-07, "loss": -0.0002, "reward": 2.296552300453186, "reward_std": 0.2808211346227836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7965522408485413, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4481865284974094, "grad_norm": 14.899907420105606, "kl": 0.205078125, "learning_rate": 8.55440414507772e-07, "loss": 0.0013, "reward": 2.437370538711548, "reward_std": 0.17711095710717473, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373704195022583, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.450777202072539, "grad_norm": 54.26997364778638, "kl": 0.06689453125, "learning_rate": 8.551813471502591e-07, "loss": 0.0005, "reward": 1.9200925827026367, "reward_std": 0.03077354779952657, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4200924038887024, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4533678756476685, "grad_norm": 170.31019157117908, "kl": 0.1650390625, "learning_rate": 8.549222797927461e-07, "loss": 0.0007, "reward": 1.8056821823120117, "reward_std": 0.0030451994288682727, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3056823015213013, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.455958549222798, "grad_norm": 0.9094885943424723, "kl": 0.125244140625, "learning_rate": 8.546632124352331e-07, "loss": 0.0021, "reward": 2.499968409538269, "reward_std": 1.0164353852815111e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999680519104004, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4585492227979275, "grad_norm": 13.893688669120996, "kl": 0.0482177734375, "learning_rate": 8.544041450777202e-07, "loss": 0.0002, "reward": 1.9989761114120483, "reward_std": 6.966816772546736e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498976081609726, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4611398963730569, "grad_norm": 0.24691892359347203, "kl": 0.0596923828125, "learning_rate": 8.541450777202072e-07, "loss": 0.0008, "reward": 2.4999735355377197, "reward_std": 4.274718094166019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973475933075, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4637305699481864, "grad_norm": 80.70132538685037, "kl": 0.2451171875, "learning_rate": 8.538860103626943e-07, "loss": 0.001, "reward": 2.119768977165222, "reward_std": 0.23466921336444102, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6197689771652222, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.466321243523316, "grad_norm": 2.2833243782909993, "kl": 0.161376953125, "learning_rate": 8.536269430051814e-07, "loss": 0.0013, "reward": 2.4999818801879883, "reward_std": 6.4477480350433325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981701374054, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.4689119170984455, "grad_norm": 0.17769710559204552, "kl": 0.0533447265625, "learning_rate": 8.533678756476683e-07, "loss": 0.0014, "reward": 2.499998927116394, "reward_std": 8.398082229632564e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.471502590673575, "grad_norm": 8.834662862773344, "kl": 0.11328125, "learning_rate": 8.531088082901554e-07, "loss": -0.0001, "reward": 1.8848623037338257, "reward_std": 0.0004728440103463072, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3848623931407928, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.4740932642487046, "grad_norm": 1.118903441198404, "kl": 0.0631103515625, "learning_rate": 8.528497409326425e-07, "loss": 0.0009, "reward": 2.499974250793457, "reward_std": 5.876585788655575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999974250793457, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4766839378238341, "grad_norm": 0.24970816334285134, "kl": 0.088134765625, "learning_rate": 8.525906735751295e-07, "loss": 0.0006, "reward": 2.4999921321868896, "reward_std": 2.663360589849617e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4792746113989637, "grad_norm": 25.24810264907173, "kl": 0.130859375, "learning_rate": 8.523316062176166e-07, "loss": 0.0012, "reward": 2.499987244606018, "reward_std": 5.787460963802005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999870657920837, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 1.4818652849740932, "grad_norm": 0.7292134978182698, "kl": 0.072998046875, "learning_rate": 8.520725388601036e-07, "loss": 0.0004, "reward": 2.499993681907654, "reward_std": 3.617406207467866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4844559585492227, "grad_norm": 10.11262971199148, "kl": 0.0626220703125, "learning_rate": 8.518134715025906e-07, "loss": 0.0008, "reward": 1.9956601858139038, "reward_std": 0.0004383593468446634, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.495660126209259, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4870466321243523, "grad_norm": 6.923407931749802, "kl": 0.100830078125, "learning_rate": 8.515544041450777e-07, "loss": 0.0013, "reward": 1.9345734119415283, "reward_std": 0.0016179480317077832, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4345735013484955, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.4896373056994818, "grad_norm": 3.210986057446249, "kl": 0.1279296875, "learning_rate": 8.512953367875647e-07, "loss": 0.0002, "reward": 1.998543381690979, "reward_std": 4.682441522163572e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985434114933014, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 1.4922279792746114, "grad_norm": 118.22210432319872, "kl": 0.4593505859375, "learning_rate": 8.510362694300518e-07, "loss": 0.002, "reward": 2.3113245964050293, "reward_std": 0.26039084413665137, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8113245964050293, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.494818652849741, "grad_norm": 16.97391608660472, "kl": 0.05487060546875, "learning_rate": 8.507772020725388e-07, "loss": -0.0001, "reward": 2.475416660308838, "reward_std": 0.010014679694904771, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9754165410995483, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.4974093264248705, "grad_norm": 115.60404174454798, "kl": 0.08074951171875, "learning_rate": 8.505181347150259e-07, "loss": 0.0003, "reward": 1.9988738298416138, "reward_std": 6.560475759442852e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498873770236969, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5, "grad_norm": 16.39430906170559, "kl": 0.159423828125, "learning_rate": 8.502590673575129e-07, "loss": 0.0011, "reward": 2.4999295473098755, "reward_std": 3.901746913470561e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999294877052307, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5025906735751295, "grad_norm": 3.709042866453429, "kl": 0.15106201171875, "learning_rate": 8.499999999999999e-07, "loss": 0.0016, "reward": 2.4999505281448364, "reward_std": 4.016945126750215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999504089355469, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.505181347150259, "grad_norm": 0.5414945430223663, "kl": 0.12841796875, "learning_rate": 8.49740932642487e-07, "loss": 0.0009, "reward": 2.499996542930603, "reward_std": 4.480836423681467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5077720207253886, "grad_norm": 0.04387608481162616, "kl": 0.0703125, "learning_rate": 8.49481865284974e-07, "loss": 0.0003, "reward": 2.4999992847442627, "reward_std": 5.225200823133491e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999463558197, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5103626943005182, "grad_norm": 15.262042537624769, "kl": 0.119384765625, "learning_rate": 8.492227979274611e-07, "loss": 0.0001, "reward": 2.4998769760131836, "reward_std": 7.171521201598807e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998770952224731, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 1.5129533678756477, "grad_norm": 49.153036232209615, "kl": 0.180419921875, "learning_rate": 8.489637305699482e-07, "loss": 0.0007, "reward": 1.92996084690094, "reward_std": 0.18613753374665976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4299608170986176, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5155440414507773, "grad_norm": 1.1414134842639188, "kl": 0.06103515625, "learning_rate": 8.487046632124351e-07, "loss": 0.0007, "reward": 2.499955654144287, "reward_std": 7.617499704792863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999555945396423, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.5181347150259068, "grad_norm": 0.1102302623226846, "kl": 0.08251953125, "learning_rate": 8.484455958549222e-07, "loss": -0.0003, "reward": 2.499996304512024, "reward_std": 1.9968337028331007e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5207253886010363, "grad_norm": 0.1853898651319823, "kl": 0.0579833984375, "learning_rate": 8.481865284974092e-07, "loss": 0.0004, "reward": 2.499997854232788, "reward_std": 1.4827681411588856e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5233160621761659, "grad_norm": 64.06439151411772, "kl": 0.091064453125, "learning_rate": 8.479274611398963e-07, "loss": -0.0001, "reward": 2.4995049238204956, "reward_std": 6.929871960892342e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99950510263443, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.5259067357512954, "grad_norm": 0.257982181347981, "kl": 0.22802734375, "learning_rate": 8.476683937823834e-07, "loss": 0.0015, "reward": 2.4999969005584717, "reward_std": 4.665788935653836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.528497409326425, "grad_norm": 0.6143889215696653, "kl": 0.1083984375, "learning_rate": 8.474093264248704e-07, "loss": -0.0003, "reward": 2.499996304512024, "reward_std": 2.9716395886225655e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.5310880829015545, "grad_norm": 33.29279444262581, "kl": 0.108154296875, "learning_rate": 8.471502590673574e-07, "loss": 0.0002, "reward": 2.3748679161071777, "reward_std": 0.23151225593392155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748680353164673, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.533678756476684, "grad_norm": 68.05081380146056, "kl": 0.108154296875, "learning_rate": 8.468911917098444e-07, "loss": 0.0011, "reward": 2.498900055885315, "reward_std": 0.00018090060984832235, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9988999366760254, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5362694300518136, "grad_norm": 0.5801958685987406, "kl": 0.04302978515625, "learning_rate": 8.466321243523315e-07, "loss": -0.0007, "reward": 2.499996304512024, "reward_std": 3.393692395547987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.5388601036269431, "grad_norm": 5.008065130916608, "kl": 0.100341796875, "learning_rate": 8.463730569948186e-07, "loss": -0.0006, "reward": 1.9089406728744507, "reward_std": 0.0002627334563385375, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4089407920837402, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 39.1875, "epoch": 1.5414507772020727, "grad_norm": 20.83614436655994, "kl": 0.09765625, "learning_rate": 8.461139896373056e-07, "loss": 0.0001, "reward": 2.1044222116470337, "reward_std": 0.24415511500592402, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6044222116470337, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.5440414507772022, "grad_norm": 21.50165598420066, "kl": 0.1143798828125, "learning_rate": 8.458549222797928e-07, "loss": 0.0009, "reward": 2.499922513961792, "reward_std": 8.435450786237197e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999922513961792, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.5466321243523318, "grad_norm": 21.63090716391322, "kl": 0.14208984375, "learning_rate": 8.455958549222799e-07, "loss": 0.0014, "reward": 2.4271273612976074, "reward_std": 0.20610876871108985, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9271273016929626, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.549222797927461, "grad_norm": 5.453637215366586, "kl": 0.099853515625, "learning_rate": 8.453367875647668e-07, "loss": 0.0001, "reward": 1.9999055862426758, "reward_std": 2.6277070901414845e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999056458473206, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5518134715025906, "grad_norm": 15.384719875272125, "kl": 0.116455078125, "learning_rate": 8.450777202072539e-07, "loss": 0.0004, "reward": 1.8122720122337341, "reward_std": 0.2587905696236703, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3122718930244446, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.5544041450777202, "grad_norm": 0.616532429867594, "kl": 0.0955810546875, "learning_rate": 8.448186528497409e-07, "loss": 0.0004, "reward": 2.499997854232788, "reward_std": 1.8724124970503908e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5569948186528497, "grad_norm": 4.341940647647111, "kl": 0.080810546875, "learning_rate": 8.44559585492228e-07, "loss": 0.0013, "reward": 2.4999802112579346, "reward_std": 1.88822182281001e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980092048645, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5595854922279793, "grad_norm": 40.68992921359032, "kl": 0.17626953125, "learning_rate": 8.443005181347151e-07, "loss": 0.0004, "reward": 1.9371178150177002, "reward_std": 0.1775104302305408, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4371178448200226, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5621761658031088, "grad_norm": 0.5326278638232943, "kl": 0.0965576171875, "learning_rate": 8.44041450777202e-07, "loss": -0.0006, "reward": 2.4999624490737915, "reward_std": 3.241361184791458e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999962568283081, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5647668393782384, "grad_norm": 15.690180864985187, "kl": 0.1494140625, "learning_rate": 8.437823834196891e-07, "loss": 0.0001, "reward": 2.499652147293091, "reward_std": 8.124888370275585e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996520280838013, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.567357512953368, "grad_norm": 24.444172968130243, "kl": 0.5166015625, "learning_rate": 8.435233160621761e-07, "loss": 0.0019, "reward": 2.2499672174453735, "reward_std": 0.2672818519795328, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749967098236084, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5699481865284974, "grad_norm": 13.404009997872135, "kl": 0.0596923828125, "learning_rate": 8.432642487046632e-07, "loss": -0.0, "reward": 2.4999488592147827, "reward_std": 3.0938746022002306e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999488592147827, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.572538860103627, "grad_norm": 77.39581247110588, "kl": 0.0791015625, "learning_rate": 8.430051813471503e-07, "loss": 0.0003, "reward": 2.36248779296875, "reward_std": 0.25471037908980065, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8624878525733948, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.5751295336787565, "grad_norm": 1.9008766491155655, "kl": 0.102783203125, "learning_rate": 8.427461139896373e-07, "loss": 0.0004, "reward": 1.9998862147331238, "reward_std": 1.2667527585108473e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998860359191895, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.577720207253886, "grad_norm": 1.9828803876834276, "kl": 0.12109375, "learning_rate": 8.424870466321244e-07, "loss": -0.0008, "reward": 2.4999622106552124, "reward_std": 8.237246333919757e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999626278877258, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5803108808290154, "grad_norm": 2.79710111954267, "kl": 0.0574951171875, "learning_rate": 8.422279792746113e-07, "loss": -0.0006, "reward": 1.9982621669769287, "reward_std": 5.059276645624777e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982622861862183, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.582901554404145, "grad_norm": 39.1047303736378, "kl": 0.0518798828125, "learning_rate": 8.419689119170984e-07, "loss": 0.0005, "reward": 2.062355160713196, "reward_std": 0.17683401459413517, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5623551607131958, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5854922279792745, "grad_norm": 9.460467075561967, "kl": 0.1142578125, "learning_rate": 8.417098445595855e-07, "loss": 0.0011, "reward": 2.4999747276306152, "reward_std": 4.3438417833385756e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999746680259705, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.588082901554404, "grad_norm": 0.9294845620488548, "kl": 0.0755615234375, "learning_rate": 8.414507772020725e-07, "loss": 0.0013, "reward": 2.499990463256836, "reward_std": 7.726110425210209e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.5906735751295336, "grad_norm": 12.931335461271505, "kl": 0.090576171875, "learning_rate": 8.411917098445596e-07, "loss": 0.0004, "reward": 1.6791605949401855, "reward_std": 0.2087959760101512, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1791605949401855, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.593264248704663, "grad_norm": 16.92869409317597, "kl": 0.0999755859375, "learning_rate": 8.409326424870465e-07, "loss": -0.0001, "reward": 2.499935746192932, "reward_std": 2.266797218908323e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999355673789978, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.5958549222797926, "grad_norm": 2.6611294623667967, "kl": 0.10400390625, "learning_rate": 8.406735751295336e-07, "loss": 0.0014, "reward": 1.999908208847046, "reward_std": 7.677721214349731e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499908059835434, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.5984455958549222, "grad_norm": 55.501815930975496, "kl": 0.039794921875, "learning_rate": 8.404145077720207e-07, "loss": -0.0003, "reward": 2.3749834299087524, "reward_std": 0.23147836945463496, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.874983549118042, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6010362694300517, "grad_norm": 5.879550773391461, "kl": 0.124267578125, "learning_rate": 8.401554404145077e-07, "loss": 0.0011, "reward": 2.4999661445617676, "reward_std": 2.994358374053263e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999660849571228, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6036269430051813, "grad_norm": 5.685433488905754, "kl": 0.043212890625, "learning_rate": 8.398963730569948e-07, "loss": -0.0005, "reward": 2.4999927282333374, "reward_std": 8.184799071386806e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6062176165803108, "grad_norm": 0.28759157221521486, "kl": 0.0986328125, "learning_rate": 8.396373056994819e-07, "loss": -0.0011, "reward": 2.4999990463256836, "reward_std": 5.998403480589332e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 1.6088082901554404, "grad_norm": 1.9602929056740013, "kl": 0.0538330078125, "learning_rate": 8.393782383419689e-07, "loss": 0.0009, "reward": 2.49999463558197, "reward_std": 5.903515841509943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.61139896373057, "grad_norm": 49.769841735673076, "kl": 0.03765869140625, "learning_rate": 8.391191709844559e-07, "loss": -0.0003, "reward": 2.3749756813049316, "reward_std": 0.23147984719605574, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749755024909973, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6139896373056994, "grad_norm": 5.227372879468587, "kl": 0.077392578125, "learning_rate": 8.388601036269429e-07, "loss": 0.0007, "reward": 2.2499500513076782, "reward_std": 0.26726438726723245, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499499917030334, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.616580310880829, "grad_norm": 19.784274226086808, "kl": 0.161865234375, "learning_rate": 8.3860103626943e-07, "loss": 0.0002, "reward": 2.3733383417129517, "reward_std": 0.23447314692953114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8733383417129517, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6191709844559585, "grad_norm": 0.7808235863543724, "kl": 0.115478515625, "learning_rate": 8.383419689119171e-07, "loss": 0.0006, "reward": 2.4999964237213135, "reward_std": 2.9871059723518556e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.621761658031088, "grad_norm": 27.70381217077696, "kl": 0.145263671875, "learning_rate": 8.380829015544041e-07, "loss": 0.0004, "reward": 1.9996799230575562, "reward_std": 4.7849290695012314e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996800422668457, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6243523316062176, "grad_norm": 32.867843698002496, "kl": 0.0699462890625, "learning_rate": 8.378238341968912e-07, "loss": -0.0005, "reward": 1.9922881126403809, "reward_std": 4.9706817662809044e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4922881126403809, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6269430051813472, "grad_norm": 16.520671273835518, "kl": 0.0830078125, "learning_rate": 8.375647668393781e-07, "loss": 0.0001, "reward": 2.1247638463974, "reward_std": 0.23160198168898205, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6247639060020447, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6295336787564767, "grad_norm": 237.61248171008873, "kl": 0.06488037109375, "learning_rate": 8.373056994818652e-07, "loss": 0.0006, "reward": 1.9715783596038818, "reward_std": 0.032443345795968526, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4715781807899475, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6321243523316062, "grad_norm": 0.15631348461910247, "kl": 0.08984375, "learning_rate": 8.370466321243523e-07, "loss": 0.0009, "reward": 2.4999732971191406, "reward_std": 2.459369682128454e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999732375144958, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6347150259067358, "grad_norm": 7.374614574113729, "kl": 0.099609375, "learning_rate": 8.367875647668393e-07, "loss": 0.0005, "reward": 2.4999386072158813, "reward_std": 3.54635722032981e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999386072158813, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6373056994818653, "grad_norm": 2.5733541818184524, "kl": 0.15673828125, "learning_rate": 8.365284974093264e-07, "loss": 0.0009, "reward": 1.9963146448135376, "reward_std": 3.914493510137618e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4963144659996033, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6398963730569949, "grad_norm": 3.3167630613463173, "kl": 0.0364990234375, "learning_rate": 8.362694300518134e-07, "loss": 0.0001, "reward": 2.499943733215332, "reward_std": 1.5723659146260616e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999943733215332, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 1.6424870466321244, "grad_norm": 293.017130043985, "kl": 0.48486328125, "learning_rate": 8.360103626943004e-07, "loss": 0.0019, "reward": 1.5701483488082886, "reward_std": 0.3352852957032155, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0701484084129333, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.645077720207254, "grad_norm": 0.15602157212016882, "kl": 0.083251953125, "learning_rate": 8.357512953367875e-07, "loss": -0.0001, "reward": 2.4999769926071167, "reward_std": 1.806196820552941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999771118164062, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6476683937823835, "grad_norm": 14.406833113026893, "kl": 0.1533203125, "learning_rate": 8.354922279792745e-07, "loss": 0.0003, "reward": 1.9809051752090454, "reward_std": 0.00011348181942594238, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4809053242206573, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.650259067357513, "grad_norm": 5.98867345829897, "kl": 0.0699462890625, "learning_rate": 8.352331606217616e-07, "loss": 0.0005, "reward": 2.49954617023468, "reward_std": 6.855746374867522e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995462894439697, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6528497409326426, "grad_norm": 0.38712691535412386, "kl": 0.113037109375, "learning_rate": 8.349740932642486e-07, "loss": 0.0005, "reward": 2.4999964237213135, "reward_std": 1.8679312177027896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6554404145077721, "grad_norm": 23.64713542225016, "kl": 0.1207275390625, "learning_rate": 8.347150259067358e-07, "loss": -0.0001, "reward": 1.9969990849494934, "reward_std": 0.00024986168754992377, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969991743564606, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6580310880829017, "grad_norm": 0.1307402912355439, "kl": 0.062255859375, "learning_rate": 8.344559585492228e-07, "loss": 0.0008, "reward": 2.4999979734420776, "reward_std": 1.503826325688351e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6606217616580312, "grad_norm": 17.10243951191007, "kl": 0.0982666015625, "learning_rate": 8.341968911917098e-07, "loss": 0.0001, "reward": 1.9117478132247925, "reward_std": 0.0007526968944375767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4117478728294373, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6632124352331608, "grad_norm": 42.48667614447326, "kl": 0.0953369140625, "learning_rate": 8.339378238341969e-07, "loss": -0.0003, "reward": 1.993226706981659, "reward_std": 0.0017277612910220341, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4932267665863037, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6658031088082903, "grad_norm": 87.54259575410254, "kl": 0.06396484375, "learning_rate": 8.33678756476684e-07, "loss": 0.0004, "reward": 2.499666690826416, "reward_std": 9.813916403800249e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996667504310608, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6683937823834198, "grad_norm": 0.13406492653191412, "kl": 0.0870361328125, "learning_rate": 8.33419689119171e-07, "loss": 0.0006, "reward": 2.4999979734420776, "reward_std": 2.1362269535529776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 1.6709844559585494, "grad_norm": 25.02333946235965, "kl": 0.0830078125, "learning_rate": 8.331606217616581e-07, "loss": 0.0003, "reward": 1.4643511176109314, "reward_std": 0.0011783490153902676, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9643511474132538, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6735751295336787, "grad_norm": 4.837086925046459, "kl": 0.0745849609375, "learning_rate": 8.32901554404145e-07, "loss": 0.0011, "reward": 2.499936103820801, "reward_std": 1.708796690991221e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999359846115112, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6761658031088082, "grad_norm": 24.50331106738798, "kl": 0.070068359375, "learning_rate": 8.326424870466321e-07, "loss": 0.0005, "reward": 1.8760124444961548, "reward_std": 0.0016876516601769254, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3760126233100891, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.6787564766839378, "grad_norm": 1494.805228643778, "kl": 86.022216796875, "learning_rate": 8.323834196891192e-07, "loss": 0.3466, "reward": 2.437464952468872, "reward_std": 0.17683711373479127, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374647736549377, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6813471502590673, "grad_norm": 0.18292799218872652, "kl": 0.121826171875, "learning_rate": 8.321243523316062e-07, "loss": 0.0009, "reward": 2.4999966621398926, "reward_std": 1.514225971277483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.6839378238341969, "grad_norm": 6.872451887462959, "kl": 0.070556640625, "learning_rate": 8.318652849740933e-07, "loss": 0.0002, "reward": 2.499855875968933, "reward_std": 3.7510570109589025e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998559355735779, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 1.6865284974093264, "grad_norm": 103.94521945653051, "kl": 0.128662109375, "learning_rate": 8.316062176165803e-07, "loss": 0.0005, "reward": 1.380523443222046, "reward_std": 0.29092546921310714, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8805235028266907, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.689119170984456, "grad_norm": 14.243931413628529, "kl": 0.0758056640625, "learning_rate": 8.313471502590673e-07, "loss": 0.0005, "reward": 1.9886736869812012, "reward_std": 0.00021353523194989066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4886736571788788, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6917098445595855, "grad_norm": 6.032961669977156, "kl": 0.19146728515625, "learning_rate": 8.310880829015544e-07, "loss": 0.0009, "reward": 1.9924865365028381, "reward_std": 6.780139489137582e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924865365028381, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.694300518134715, "grad_norm": 1.4330065175801183, "kl": 0.0816650390625, "learning_rate": 8.308290155440414e-07, "loss": 0.0004, "reward": 1.9978421330451965, "reward_std": 5.885148004836083e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978420734405518, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.6968911917098446, "grad_norm": 3.9100450515237894, "kl": 0.083740234375, "learning_rate": 8.305699481865285e-07, "loss": -0.0006, "reward": 2.499984622001648, "reward_std": 1.4000762348587159e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.6994818652849741, "grad_norm": 0.24514908651419975, "kl": 0.0926513671875, "learning_rate": 8.303108808290155e-07, "loss": 0.0011, "reward": 2.4999905824661255, "reward_std": 3.2575424313563417e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7020725388601037, "grad_norm": 0.296463991573155, "kl": 0.07708740234375, "learning_rate": 8.300518134715026e-07, "loss": 0.0006, "reward": 2.499985456466675, "reward_std": 3.853004614029487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.704663212435233, "grad_norm": 4.1706814690545775, "kl": 0.03399658203125, "learning_rate": 8.297927461139896e-07, "loss": -0.0003, "reward": 2.499993681907654, "reward_std": 7.77241325522482e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7072538860103625, "grad_norm": 0.17933129044869325, "kl": 0.069091796875, "learning_rate": 8.295336787564766e-07, "loss": 0.0003, "reward": 2.4999818801879883, "reward_std": 2.1753508860911097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.709844559585492, "grad_norm": 7.0882663586124774, "kl": 0.154052734375, "learning_rate": 8.292746113989637e-07, "loss": 0.0005, "reward": 2.4999516010284424, "reward_std": 3.499470352608114e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999516606330872, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7124352331606216, "grad_norm": 3.200128249666342, "kl": 0.278564453125, "learning_rate": 8.290155440414507e-07, "loss": 0.0018, "reward": 1.9994218349456787, "reward_std": 2.1387141259765485e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994217157363892, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7150259067357512, "grad_norm": 1.6315295872825968, "kl": 0.077392578125, "learning_rate": 8.287564766839378e-07, "loss": 0.0014, "reward": 2.499991297721863, "reward_std": 9.72662110143574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7176165803108807, "grad_norm": 0.3336101169691649, "kl": 0.0509033203125, "learning_rate": 8.284974093264249e-07, "loss": 0.0005, "reward": 2.499987006187439, "reward_std": 2.834276415342174e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7202072538860103, "grad_norm": 0.18875959208137574, "kl": 0.076416015625, "learning_rate": 8.282383419689118e-07, "loss": 0.0013, "reward": 2.499985456466675, "reward_std": 2.901064817706356e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7227979274611398, "grad_norm": 8.723644921781322, "kl": 0.0517578125, "learning_rate": 8.279792746113989e-07, "loss": 0.0001, "reward": 2.4999836683273315, "reward_std": 1.4262079162108421e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999834895133972, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7253886010362693, "grad_norm": 100.82072995520302, "kl": 0.1234130859375, "learning_rate": 8.27720207253886e-07, "loss": 0.0005, "reward": 1.739893913269043, "reward_std": 0.2734133796184324, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2398938536643982, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7279792746113989, "grad_norm": 4.886117296355842, "kl": 0.078125, "learning_rate": 8.27461139896373e-07, "loss": -0.0006, "reward": 1.9969100952148438, "reward_std": 4.0762413732409186e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496910274028778, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7305699481865284, "grad_norm": 0.6578307152263883, "kl": 0.16357421875, "learning_rate": 8.272020725388601e-07, "loss": 0.0002, "reward": 2.4999512434005737, "reward_std": 1.1279806813035975e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999512434005737, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.733160621761658, "grad_norm": 1.2918469594506938, "kl": 0.0655517578125, "learning_rate": 8.269430051813471e-07, "loss": 0.0002, "reward": 2.4999805688858032, "reward_std": 6.839307445716258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999805688858032, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.7357512953367875, "grad_norm": 1.7111373096069442, "kl": 0.0906982421875, "learning_rate": 8.266839378238341e-07, "loss": 0.0008, "reward": 2.499992251396179, "reward_std": 8.681702638568822e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.738341968911917, "grad_norm": 169.42935989989192, "kl": 0.1690673828125, "learning_rate": 8.264248704663212e-07, "loss": 0.0006, "reward": 2.436815023422241, "reward_std": 0.17759628169733332, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9368152022361755, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7409326424870466, "grad_norm": 3.601261270302685, "kl": 0.099609375, "learning_rate": 8.261658031088082e-07, "loss": 0.0005, "reward": 1.4999629259109497, "reward_std": 1.3058762306172866e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999629259109497, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7435233160621761, "grad_norm": 0.5871349378967271, "kl": 0.068695068359375, "learning_rate": 8.259067357512953e-07, "loss": 0.0004, "reward": 2.4999938011169434, "reward_std": 4.084634610990179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 1.7461139896373057, "grad_norm": 8.252815464613972, "kl": 0.092529296875, "learning_rate": 8.256476683937823e-07, "loss": 0.0002, "reward": 2.499964952468872, "reward_std": 2.659755870126901e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999964952468872, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7487046632124352, "grad_norm": 0.8702348833019585, "kl": 0.10546875, "learning_rate": 8.253886010362694e-07, "loss": 0.0003, "reward": 2.4999786615371704, "reward_std": 4.621068995902533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999786615371704, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7512953367875648, "grad_norm": 2.3631254993898363, "kl": 0.117431640625, "learning_rate": 8.251295336787564e-07, "loss": 0.0009, "reward": 2.4999940395355225, "reward_std": 3.846087565761991e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7538860103626943, "grad_norm": 0.7390971626284993, "kl": 0.043701171875, "learning_rate": 8.248704663212434e-07, "loss": -0.0001, "reward": 2.499966263771057, "reward_std": 4.501367087073049e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999664425849915, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7564766839378239, "grad_norm": 7.13467884184763, "kl": 0.02764892578125, "learning_rate": 8.246113989637305e-07, "loss": 0.0004, "reward": 1.9999184608459473, "reward_std": 1.642768256715499e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999184310436249, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7590673575129534, "grad_norm": 2.2789329105342073, "kl": 0.0408935546875, "learning_rate": 8.243523316062175e-07, "loss": 0.0002, "reward": 1.9997382164001465, "reward_std": 1.3057464570920274e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997382462024689, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.761658031088083, "grad_norm": 25.47869924499872, "kl": 0.0543212890625, "learning_rate": 8.240932642487046e-07, "loss": 0.0002, "reward": 2.2499643564224243, "reward_std": 0.267280383847492, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499642968177795, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7642487046632125, "grad_norm": 0.7338014517634445, "kl": 0.1080322265625, "learning_rate": 8.238341968911918e-07, "loss": 0.0009, "reward": 2.499995470046997, "reward_std": 2.1286450646584854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.766839378238342, "grad_norm": 5.6103634092675225, "kl": 0.13671875, "learning_rate": 8.235751295336786e-07, "loss": 0.0004, "reward": 1.4999582767486572, "reward_std": 2.030848372669425e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.999958336353302, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7694300518134716, "grad_norm": 2.875896173650571, "kl": 0.0655517578125, "learning_rate": 8.233160621761658e-07, "loss": -0.0001, "reward": 2.499988079071045, "reward_std": 1.4423132824958884e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999882578849792, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7720207253886011, "grad_norm": 1.2964359348906216, "kl": 0.109375, "learning_rate": 8.230569948186528e-07, "loss": 0.0009, "reward": 1.9980382919311523, "reward_std": 2.3528160681962618e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980381727218628, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7746113989637307, "grad_norm": 5.6102330524598845, "kl": 0.10028076171875, "learning_rate": 8.227979274611399e-07, "loss": -0.0001, "reward": 2.4999724626541138, "reward_std": 1.0118180000517896e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999725818634033, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7772020725388602, "grad_norm": 18.868082070745036, "kl": 0.080810546875, "learning_rate": 8.22538860103627e-07, "loss": 0.0011, "reward": 2.437487006187439, "reward_std": 0.1767846374967803, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374868273735046, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7797927461139897, "grad_norm": 2.7496592487582143, "kl": 0.4375, "learning_rate": 8.22279792746114e-07, "loss": 0.0002, "reward": 2.4999945163726807, "reward_std": 4.082221380485862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7823834196891193, "grad_norm": 0.35821479804678263, "kl": 0.06494140625, "learning_rate": 8.22020725388601e-07, "loss": 0.0008, "reward": 2.49999737739563, "reward_std": 1.715442664362854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.7849740932642488, "grad_norm": 1.4850700490124686, "kl": 0.0338134765625, "learning_rate": 8.217616580310881e-07, "loss": 0.0018, "reward": 2.4999871253967285, "reward_std": 4.870042914717487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 1.7875647668393784, "grad_norm": 27.31555528505093, "kl": 0.196044921875, "learning_rate": 8.215025906735751e-07, "loss": 0.0008, "reward": 1.8023179769515991, "reward_std": 0.0009480309454374947, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3023179769515991, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.790155440414508, "grad_norm": 6.709943353290983, "kl": 0.0533447265625, "learning_rate": 8.212435233160622e-07, "loss": 0.0009, "reward": 1.7894837260246277, "reward_std": 0.00020543435039144242, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2894836068153381, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.7927461139896375, "grad_norm": 11.944606816843935, "kl": 0.15283203125, "learning_rate": 8.209844559585492e-07, "loss": 0.0005, "reward": 2.4999181032180786, "reward_std": 1.5325039839808596e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999182224273682, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.795336787564767, "grad_norm": 42.41986173925618, "kl": 0.0887451171875, "learning_rate": 8.207253886010363e-07, "loss": 0.0006, "reward": 2.4374067783355713, "reward_std": 0.1769486431730911, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374067783355713, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.7979274611398963, "grad_norm": 0.0689479256162065, "kl": 0.09796142578125, "learning_rate": 8.204663212435233e-07, "loss": 0.0004, "reward": 2.499999165534973, "reward_std": 9.549086570359577e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8005181347150259, "grad_norm": 7.528320000260137, "kl": 0.0830078125, "learning_rate": 8.202072538860103e-07, "loss": 0.0009, "reward": 2.499980330467224, "reward_std": 1.7278460802572226e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999803304672241, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 1.8031088082901554, "grad_norm": 8.356065382587703, "kl": 0.131591796875, "learning_rate": 8.199481865284974e-07, "loss": 0.001, "reward": 2.343741536140442, "reward_std": 0.44194080871261576, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749914765357971, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.805699481865285, "grad_norm": 1.7944457368315816, "kl": 0.227294921875, "learning_rate": 8.196891191709844e-07, "loss": 0.0012, "reward": 2.499993920326233, "reward_std": 2.733200744842179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.8082901554404145, "grad_norm": 3.0144286949801637, "kl": 0.075439453125, "learning_rate": 8.194300518134715e-07, "loss": -0.0001, "reward": 1.9976152181625366, "reward_std": 5.661829597158885e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497615396976471, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.810880829015544, "grad_norm": 0.2759789092248253, "kl": 0.10595703125, "learning_rate": 8.191709844559586e-07, "loss": 0.0006, "reward": 2.4999958276748657, "reward_std": 2.1654390423009318e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 1.8134715025906736, "grad_norm": 42.67759353045895, "kl": 0.087890625, "learning_rate": 8.189119170984455e-07, "loss": 0.0005, "reward": 1.9869170188903809, "reward_std": 0.22009831677132752, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4869168996810913, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8160621761658031, "grad_norm": 32.656202050911965, "kl": 0.107666015625, "learning_rate": 8.186528497409326e-07, "loss": 0.0003, "reward": 1.5609017610549927, "reward_std": 0.17689743958544568, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0609017610549927, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8186528497409327, "grad_norm": 0.47028814155421916, "kl": 0.10107421875, "learning_rate": 8.183937823834196e-07, "loss": 0.0011, "reward": 2.499952554702759, "reward_std": 5.4636123252294055e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999526143074036, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 1.8212435233160622, "grad_norm": 37.75444367838349, "kl": 0.14404296875, "learning_rate": 8.181347150259067e-07, "loss": 0.0012, "reward": 1.7973414659500122, "reward_std": 0.28486732493939826, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2973413467407227, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 1.8238341968911918, "grad_norm": 149.79450039519122, "kl": 0.176025390625, "learning_rate": 8.178756476683938e-07, "loss": 0.0007, "reward": 1.9040983319282532, "reward_std": 0.21884164586663246, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4040984511375427, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.8264248704663213, "grad_norm": 4.607706106641456, "kl": 0.0474853515625, "learning_rate": 8.176165803108808e-07, "loss": -0.0002, "reward": 2.499961495399475, "reward_std": 2.263770451804703e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999615550041199, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8290155440414506, "grad_norm": 8.759813240442615, "kl": 0.255126953125, "learning_rate": 8.173575129533678e-07, "loss": 0.0013, "reward": 2.498636484146118, "reward_std": 0.00018744168619377888, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998636543750763, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.8316062176165802, "grad_norm": 5.432585947575143, "kl": 0.087158203125, "learning_rate": 8.170984455958548e-07, "loss": 0.0006, "reward": 1.4982079863548279, "reward_std": 3.8477059206343256e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9982079267501831, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8341968911917097, "grad_norm": 2.3682903525699808, "kl": 0.06787109375, "learning_rate": 8.168393782383419e-07, "loss": 0.0002, "reward": 1.9979745149612427, "reward_std": 4.085007344656333e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979746341705322, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8367875647668392, "grad_norm": 1.2874349952757198, "kl": 0.105224609375, "learning_rate": 8.16580310880829e-07, "loss": 0.0005, "reward": 2.499969720840454, "reward_std": 8.705598474989529e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999694228172302, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.8393782383419688, "grad_norm": 3.9606911970202816, "kl": 0.0487060546875, "learning_rate": 8.16321243523316e-07, "loss": -0.0003, "reward": 2.4999043941497803, "reward_std": 1.8315811303182272e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999045729637146, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8419689119170983, "grad_norm": 2.921244370159192, "kl": 0.040283203125, "learning_rate": 8.160621761658031e-07, "loss": 0.0003, "reward": 2.499955415725708, "reward_std": 2.1065489818283822e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999554753303528, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8445595854922279, "grad_norm": 0.4919493489252933, "kl": 0.096435546875, "learning_rate": 8.1580310880829e-07, "loss": 0.0016, "reward": 2.4999988079071045, "reward_std": 2.1324171086689603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8471502590673574, "grad_norm": 75.2656319553385, "kl": 0.0369873046875, "learning_rate": 8.155440414507771e-07, "loss": 0.0001, "reward": 1.9998453259468079, "reward_std": 3.05382527585607e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998455345630646, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.849740932642487, "grad_norm": 4.7017640442430775, "kl": 0.088623046875, "learning_rate": 8.152849740932642e-07, "loss": 0.0004, "reward": 1.9984892010688782, "reward_std": 2.6042447871077457e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984891414642334, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8523316062176165, "grad_norm": 6.222395499694789, "kl": 0.0528564453125, "learning_rate": 8.150259067357512e-07, "loss": -0.0004, "reward": 2.4998987913131714, "reward_std": 2.3090570721251424e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999898910522461, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.854922279792746, "grad_norm": 24.538056060265394, "kl": 0.1240234375, "learning_rate": 8.147668393782383e-07, "loss": 0.0007, "reward": 1.9928725957870483, "reward_std": 0.0006328953459160402, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4928724765777588, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.8575129533678756, "grad_norm": 14.278827744369918, "kl": 0.1180419921875, "learning_rate": 8.145077720207254e-07, "loss": 0.0001, "reward": 2.3446788787841797, "reward_std": 0.2946911964281753, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8446791172027588, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8601036269430051, "grad_norm": 1.3381890223202713, "kl": 0.044677734375, "learning_rate": 8.142487046632123e-07, "loss": -0.0002, "reward": 2.49999463558197, "reward_std": 5.366977916310134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8626943005181347, "grad_norm": 21.335956197159533, "kl": 0.146484375, "learning_rate": 8.139896373056994e-07, "loss": 0.0003, "reward": 2.499861240386963, "reward_std": 6.220497755293763e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998611807823181, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.8652849740932642, "grad_norm": 12.952669851343646, "kl": 0.16259765625, "learning_rate": 8.137305699481864e-07, "loss": 0.0007, "reward": 1.9994430541992188, "reward_std": 3.526075306581333e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994431734085083, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8678756476683938, "grad_norm": 0.30068170664250105, "kl": 0.042236328125, "learning_rate": 8.134715025906735e-07, "loss": -0.0004, "reward": 2.499974489212036, "reward_std": 4.600924171427323e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999744296073914, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8704663212435233, "grad_norm": 3.6812761982765556, "kl": 0.1201171875, "learning_rate": 8.132124352331606e-07, "loss": 0.0004, "reward": 2.4999918937683105, "reward_std": 6.380602542321867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8730569948186528, "grad_norm": 11.405570312476135, "kl": 0.08984375, "learning_rate": 8.129533678756476e-07, "loss": 0.0012, "reward": 1.8103687167167664, "reward_std": 0.001791583761473703, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3103685975074768, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8756476683937824, "grad_norm": 66.8366672331642, "kl": 0.055419921875, "learning_rate": 8.126943005181348e-07, "loss": 0.0005, "reward": 2.3749492168426514, "reward_std": 0.2315013756715416, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749491572380066, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.878238341968912, "grad_norm": 3.2126439837115353, "kl": 0.40869140625, "learning_rate": 8.124352331606216e-07, "loss": 0.0022, "reward": 2.499987840652466, "reward_std": 8.392602012463612e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8808290155440415, "grad_norm": 1.6722275786840681, "kl": 0.11480712890625, "learning_rate": 8.121761658031088e-07, "loss": 0.002, "reward": 2.499995708465576, "reward_std": 5.694277945167414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.883419689119171, "grad_norm": 1.5742542434766549, "kl": 0.1109619140625, "learning_rate": 8.119170984455959e-07, "loss": 0.0006, "reward": 2.4999829530715942, "reward_std": 7.644510560567142e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983012676239, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.8860103626943006, "grad_norm": 1.8219085341067554, "kl": 0.0302276611328125, "learning_rate": 8.116580310880829e-07, "loss": 0.0, "reward": 2.499990701675415, "reward_std": 1.2698666324695296e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.88860103626943, "grad_norm": 1.663450928179118, "kl": 0.090087890625, "learning_rate": 8.1139896373057e-07, "loss": 0.0017, "reward": 2.499966025352478, "reward_std": 1.1797412298619747e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999658465385437, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.8911917098445596, "grad_norm": 0.6994851516255558, "kl": 0.149658203125, "learning_rate": 8.111398963730569e-07, "loss": 0.0, "reward": 2.499577283859253, "reward_std": 1.5459396195183217e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995773434638977, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 1.8937823834196892, "grad_norm": 25.27344217750221, "kl": 0.13525390625, "learning_rate": 8.10880829015544e-07, "loss": 0.0007, "reward": 1.896426498889923, "reward_std": 0.16527703722675824, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.396426498889923, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.8963730569948187, "grad_norm": 3.9910486466040282, "kl": 0.09130859375, "learning_rate": 8.106217616580311e-07, "loss": 0.0008, "reward": 2.499955177307129, "reward_std": 2.780619524855865e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999552965164185, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.8989637305699483, "grad_norm": 0.7242378921876792, "kl": 0.1339111328125, "learning_rate": 8.103626943005181e-07, "loss": 0.0002, "reward": 1.9998878240585327, "reward_std": 8.981161045085173e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499887853860855, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9015544041450778, "grad_norm": 112.58840386757183, "kl": 0.09375, "learning_rate": 8.101036269430052e-07, "loss": 0.0002, "reward": 2.499970316886902, "reward_std": 9.514840883184661e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9041450777202074, "grad_norm": 4.212007998783463, "kl": 0.131591796875, "learning_rate": 8.098445595854922e-07, "loss": 0.0009, "reward": 1.9555780291557312, "reward_std": 0.00015077465263857448, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4555780291557312, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.906735751295337, "grad_norm": 34.98546349999838, "kl": 0.0682373046875, "learning_rate": 8.095854922279793e-07, "loss": -0.0008, "reward": 1.9997453093528748, "reward_std": 6.642441468329707e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997454285621643, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9093264248704664, "grad_norm": 1.1272303075796912, "kl": 0.0859375, "learning_rate": 8.093264248704663e-07, "loss": 0.0002, "reward": 2.4999903440475464, "reward_std": 6.589457939298882e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.911917098445596, "grad_norm": 13.276057693911596, "kl": 0.064208984375, "learning_rate": 8.090673575129533e-07, "loss": 0.0005, "reward": 2.4999653100967407, "reward_std": 2.632743871799903e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999965250492096, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.9145077720207255, "grad_norm": 90.23450325866312, "kl": 0.062255859375, "learning_rate": 8.088082901554404e-07, "loss": -0.0002, "reward": 1.9986023902893066, "reward_std": 0.003414489360409334, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986023902893066, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 1.917098445595855, "grad_norm": 371.11278429509724, "kl": 0.54541015625, "learning_rate": 8.085492227979275e-07, "loss": 0.0022, "reward": 1.5157282948493958, "reward_std": 0.5100982487201691, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0157283544540405, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.9196891191709846, "grad_norm": 1.6043536164560124, "kl": 0.09814453125, "learning_rate": 8.082901554404145e-07, "loss": -0.0001, "reward": 2.499995708465576, "reward_std": 6.733513032486371e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.922279792746114, "grad_norm": 0.11167726296119958, "kl": 0.040771484375, "learning_rate": 8.080310880829016e-07, "loss": 0.0002, "reward": 2.4999977350234985, "reward_std": 1.6510326474872272e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9248704663212435, "grad_norm": 11.803440694600411, "kl": 0.20703125, "learning_rate": 8.077720207253885e-07, "loss": 0.001, "reward": 1.3101013898849487, "reward_std": 0.0008895274622773286, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8101013898849487, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.927461139896373, "grad_norm": 1.080930120226716, "kl": 0.03631591796875, "learning_rate": 8.075129533678756e-07, "loss": -0.0006, "reward": 2.4999934434890747, "reward_std": 5.392120669966971e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9300518134715026, "grad_norm": 2.5205744223709647, "kl": 0.084228515625, "learning_rate": 8.072538860103627e-07, "loss": 0.0007, "reward": 2.499979257583618, "reward_std": 1.0524852768867277e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791383743286, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.932642487046632, "grad_norm": 5.2551240199063285, "kl": 0.0430908203125, "learning_rate": 8.069948186528497e-07, "loss": 0.0005, "reward": 1.9996366500854492, "reward_std": 3.7261974739521975e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996365010738373, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 1.9352331606217616, "grad_norm": 1.0978098646150014, "kl": 0.0548095703125, "learning_rate": 8.067357512953368e-07, "loss": -0.001, "reward": 2.4999940395355225, "reward_std": 6.356756273362407e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9378238341968912, "grad_norm": 2.4030625070168123, "kl": 0.114501953125, "learning_rate": 8.064766839378238e-07, "loss": 0.0009, "reward": 2.4999942779541016, "reward_std": 1.060055092239054e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9404145077720207, "grad_norm": 0.5733309245884627, "kl": 0.0908203125, "learning_rate": 8.062176165803108e-07, "loss": 0.0002, "reward": 2.499983787536621, "reward_std": 6.805420184718969e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839663505554, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9430051813471503, "grad_norm": 2.399209736219041, "kl": 0.1552734375, "learning_rate": 8.059585492227979e-07, "loss": 0.0012, "reward": 1.9999147057533264, "reward_std": 9.375180297865882e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999146461486816, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9455958549222798, "grad_norm": 1.2292729366081792, "kl": 0.13037109375, "learning_rate": 8.056994818652849e-07, "loss": 0.0001, "reward": 1.99888014793396, "reward_std": 3.06091478705639e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988802671432495, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9481865284974094, "grad_norm": 5.435845218415476, "kl": 0.032928466796875, "learning_rate": 8.05440414507772e-07, "loss": -0.0008, "reward": 2.4999637603759766, "reward_std": 2.1075928771097097e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999637603759766, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.950777202072539, "grad_norm": 0.6488012633788783, "kl": 0.190185546875, "learning_rate": 8.05181347150259e-07, "loss": -0.0002, "reward": 2.499993324279785, "reward_std": 6.480698630184634e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9533678756476682, "grad_norm": 0.07686287574520377, "kl": 0.0789794921875, "learning_rate": 8.049222797927461e-07, "loss": -0.001, "reward": 2.499997854232788, "reward_std": 1.4234396985557396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9559585492227978, "grad_norm": 3.5824155609827226, "kl": 0.0550537109375, "learning_rate": 8.046632124352331e-07, "loss": 0.0001, "reward": 2.4999858140945435, "reward_std": 6.218678777258901e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985694885254, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 1.9585492227979273, "grad_norm": 197.39829672714714, "kl": 0.6048583984375, "learning_rate": 8.044041450777201e-07, "loss": 0.0023, "reward": 1.9229825735092163, "reward_std": 0.007150859762987238, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4229826629161835, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9611398963730569, "grad_norm": 0.8367198565567382, "kl": 0.213623046875, "learning_rate": 8.041450777202072e-07, "loss": 0.0004, "reward": 2.499998092651367, "reward_std": 1.9431875557529565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9637305699481864, "grad_norm": 32.22400377472626, "kl": 0.12646484375, "learning_rate": 8.038860103626942e-07, "loss": 0.0011, "reward": 2.3746838569641113, "reward_std": 0.23150423855076951, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8746837973594666, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.966321243523316, "grad_norm": 0.11304688660412425, "kl": 0.062469482421875, "learning_rate": 8.036269430051813e-07, "loss": 0.0002, "reward": 2.4999979734420776, "reward_std": 1.46941570733361e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.9689119170984455, "grad_norm": 1.6148992685408439, "kl": 0.117431640625, "learning_rate": 8.033678756476684e-07, "loss": 0.0008, "reward": 1.9999431371688843, "reward_std": 5.542067810893059e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999430775642395, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 1.971502590673575, "grad_norm": 0.2475517092385738, "kl": 0.073486328125, "learning_rate": 8.031088082901553e-07, "loss": -0.0011, "reward": 2.4999730587005615, "reward_std": 4.552717655315064e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999732375144958, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9740932642487046, "grad_norm": 1.8326122249000079, "kl": 0.088623046875, "learning_rate": 8.028497409326424e-07, "loss": 0.0005, "reward": 2.499992609024048, "reward_std": 7.946578989503905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9766839378238341, "grad_norm": 0.11875450687150292, "kl": 0.05914306640625, "learning_rate": 8.025906735751295e-07, "loss": 0.0001, "reward": 2.4999966621398926, "reward_std": 1.966403488040669e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 1.9792746113989637, "grad_norm": 5.165333128227207, "kl": 0.04052734375, "learning_rate": 8.023316062176165e-07, "loss": 0.0007, "reward": 2.4999712705612183, "reward_std": 1.2886929425803828e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999714493751526, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9818652849740932, "grad_norm": 2.9431804361431197, "kl": 0.072998046875, "learning_rate": 8.020725388601036e-07, "loss": -0.0003, "reward": 2.4999146461486816, "reward_std": 2.159702012249909e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999147653579712, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 1.9844559585492227, "grad_norm": 59.74835066178494, "kl": 0.08935546875, "learning_rate": 8.018134715025906e-07, "loss": 0.0011, "reward": 2.374890089035034, "reward_std": 0.23161013678759446, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748899698257446, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9870466321243523, "grad_norm": 0.08220585831197426, "kl": 0.1021728515625, "learning_rate": 8.015544041450776e-07, "loss": -0.0009, "reward": 2.4999961853027344, "reward_std": 1.474141697599407e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 1.9896373056994818, "grad_norm": 71.57515418933694, "kl": 0.10943603515625, "learning_rate": 8.012953367875648e-07, "loss": 0.0003, "reward": 1.9380744695663452, "reward_std": 0.08540706582425628, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.438074678182602, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 1.9922279792746114, "grad_norm": 1.0831466786758148, "kl": 0.05633544921875, "learning_rate": 8.010362694300518e-07, "loss": 0.0, "reward": 2.4999940395355225, "reward_std": 5.40015429351115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 1.994818652849741, "grad_norm": 1.8830742937355367, "kl": 0.084228515625, "learning_rate": 8.007772020725389e-07, "loss": 0.001, "reward": 1.999828815460205, "reward_std": 1.2673829701270733e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499828815460205, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 1.9974093264248705, "grad_norm": 24.603749511656364, "kl": 0.14599609375, "learning_rate": 8.005181347150259e-07, "loss": 0.0008, "reward": 1.9988928437232971, "reward_std": 0.0001699219587862899, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988930523395538, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0, "grad_norm": 18.225488387914933, "kl": 0.12353515625, "learning_rate": 8.00259067357513e-07, "loss": 0.0005, "reward": 1.4656990766525269, "reward_std": 0.0006911862874403596, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9656990170478821, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0025906735751295, "grad_norm": 3.6961106629596263, "kl": 0.06597900390625, "learning_rate": 8e-07, "loss": -0.0001, "reward": 2.499965786933899, "reward_std": 1.881695811789541e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999658465385437, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.005181347150259, "grad_norm": 7.2558670423356295, "kl": 0.0767822265625, "learning_rate": 7.99740932642487e-07, "loss": 0.0004, "reward": 1.4988000392913818, "reward_std": 6.554352876264602e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9988000392913818, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0077720207253886, "grad_norm": 3.848726310667743, "kl": 0.1484375, "learning_rate": 7.994818652849741e-07, "loss": 0.0002, "reward": 2.499962329864502, "reward_std": 2.1448652205435792e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999623894691467, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.010362694300518, "grad_norm": 1.7495460596018095, "kl": 0.2198486328125, "learning_rate": 7.992227979274611e-07, "loss": 0.0016, "reward": 2.499992847442627, "reward_std": 5.8723562119666894e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0129533678756477, "grad_norm": 0.07181438494072484, "kl": 0.100341796875, "learning_rate": 7.989637305699482e-07, "loss": 0.0011, "reward": 2.4999982118606567, "reward_std": 1.4673998407488398e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.0155440414507773, "grad_norm": 99.34717539043967, "kl": 0.861083984375, "learning_rate": 7.987046632124353e-07, "loss": 0.0034, "reward": 1.9193508625030518, "reward_std": 0.22464617586228997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4193509817123413, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.018134715025907, "grad_norm": 0.2736149256989262, "kl": 0.1043701171875, "learning_rate": 7.984455958549222e-07, "loss": 0.0019, "reward": 2.4999940395355225, "reward_std": 3.2497503070771927e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.0207253886010363, "grad_norm": 27.774634743895458, "kl": 1.470703125, "learning_rate": 7.981865284974093e-07, "loss": 0.0059, "reward": 2.3124088048934937, "reward_std": 0.4082988053560257, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812408983707428, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.023316062176166, "grad_norm": 0.1494878068399389, "kl": 0.091552734375, "learning_rate": 7.979274611398963e-07, "loss": -0.0005, "reward": 2.499991774559021, "reward_std": 2.220222768301028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.0259067357512954, "grad_norm": 21.653173742938588, "kl": 0.05499267578125, "learning_rate": 7.976683937823834e-07, "loss": 0.0002, "reward": 2.4374780654907227, "reward_std": 0.17682574421314712, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374780654907227, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.028497409326425, "grad_norm": 0.44320321457665696, "kl": 0.113525390625, "learning_rate": 7.974093264248705e-07, "loss": 0.001, "reward": 2.4999852180480957, "reward_std": 4.378118092063232e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999849796295166, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0310880829015545, "grad_norm": 2.4012456155232487, "kl": 0.0958251953125, "learning_rate": 7.971502590673575e-07, "loss": 0.0008, "reward": 2.499974846839905, "reward_std": 1.4902695284035872e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99997478723526, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.033678756476684, "grad_norm": 78.8103895753835, "kl": 0.19970703125, "learning_rate": 7.968911917098445e-07, "loss": 0.0009, "reward": 1.9711123704910278, "reward_std": 0.010406928623552858, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4711123704910278, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0362694300518136, "grad_norm": 7.272051221335001, "kl": 0.4921875, "learning_rate": 7.966321243523316e-07, "loss": 0.002, "reward": 2.4996891021728516, "reward_std": 2.328877332047341e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999688982963562, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.038860103626943, "grad_norm": 0.6240978384693943, "kl": 0.042724609375, "learning_rate": 7.963730569948186e-07, "loss": 0.0007, "reward": 2.4999805688858032, "reward_std": 7.714426828897558e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999805092811584, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0414507772020727, "grad_norm": 1.7174991918507247, "kl": 0.096923828125, "learning_rate": 7.961139896373057e-07, "loss": 0.0004, "reward": 2.4372533559799194, "reward_std": 0.17679266391360215, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372533559799194, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0440414507772022, "grad_norm": 8.281741727533326, "kl": 0.08154296875, "learning_rate": 7.958549222797927e-07, "loss": 0.0006, "reward": 1.9990002512931824, "reward_std": 6.093075671742554e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49900022149086, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.0466321243523318, "grad_norm": 3.699676109780699, "kl": 0.0682373046875, "learning_rate": 7.955958549222798e-07, "loss": 0.0005, "reward": 2.499968409538269, "reward_std": 1.9681296180351637e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999682903289795, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.0492227979274613, "grad_norm": 2.414906691658386, "kl": 0.07421875, "learning_rate": 7.953367875647668e-07, "loss": 0.001, "reward": 2.4996109008789062, "reward_std": 2.106163833559549e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996107816696167, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.051813471502591, "grad_norm": 0.5725793795529477, "kl": 0.1915283203125, "learning_rate": 7.950777202072538e-07, "loss": 0.0016, "reward": 2.4999940395355225, "reward_std": 3.5649984511110233e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0544041450777204, "grad_norm": 0.03604955838150388, "kl": 0.0654296875, "learning_rate": 7.948186528497409e-07, "loss": -0.0005, "reward": 2.499998927116394, "reward_std": 1.0972140671583475e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.05699481865285, "grad_norm": 31.868527530811924, "kl": 0.11083984375, "learning_rate": 7.945595854922279e-07, "loss": 0.0007, "reward": 1.9483452439308167, "reward_std": 0.009741406350713078, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4483452439308167, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.0595854922279795, "grad_norm": 2.309197763477085, "kl": 0.08837890625, "learning_rate": 7.94300518134715e-07, "loss": -0.0001, "reward": 2.4999725818634033, "reward_std": 1.086189519128311e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999727010726929, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.062176165803109, "grad_norm": 3.6380421520174897, "kl": 0.57373046875, "learning_rate": 7.940414507772021e-07, "loss": 0.0024, "reward": 2.4999133348464966, "reward_std": 0.00023796794027930446, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999132752418518, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.064766839378238, "grad_norm": 37.97424849534484, "kl": 0.16455078125, "learning_rate": 7.93782383419689e-07, "loss": 0.0001, "reward": 1.7137579321861267, "reward_std": 0.0008719326960999751, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2137579023838043, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.0673575129533677, "grad_norm": 35.17132503857111, "kl": 0.06298828125, "learning_rate": 7.935233160621761e-07, "loss": 0.0003, "reward": 1.9983269572257996, "reward_std": 0.0009882468584692106, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983269572257996, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.069948186528497, "grad_norm": 54.62937239848824, "kl": 0.088623046875, "learning_rate": 7.932642487046631e-07, "loss": 0.0005, "reward": 1.9671945571899414, "reward_std": 0.002545786983318976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4671944081783295, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.0725388601036268, "grad_norm": 4.0722667510399875, "kl": 0.12969970703125, "learning_rate": 7.930051813471502e-07, "loss": 0.0006, "reward": 1.9984092116355896, "reward_std": 7.788836177269332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984091222286224, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0751295336787563, "grad_norm": 18.29493648267371, "kl": 0.0872802734375, "learning_rate": 7.927461139896373e-07, "loss": 0.0009, "reward": 1.9955613017082214, "reward_std": 0.0005434078050257085, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4955612421035767, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.077720207253886, "grad_norm": 0.709697541298915, "kl": 0.25390625, "learning_rate": 7.924870466321243e-07, "loss": 0.0013, "reward": 2.499990463256836, "reward_std": 7.249970394695993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.0803108808290154, "grad_norm": 2.598427823478953, "kl": 0.121337890625, "learning_rate": 7.922279792746113e-07, "loss": 0.0003, "reward": 1.9991042017936707, "reward_std": 3.372466289874865e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991041719913483, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.082901554404145, "grad_norm": 2.7433718160379645, "kl": 0.078369140625, "learning_rate": 7.919689119170983e-07, "loss": 0.0009, "reward": 2.4999966621398926, "reward_std": 4.335711537351017e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.0854922279792745, "grad_norm": 0.25199641692462155, "kl": 0.054931640625, "learning_rate": 7.917098445595854e-07, "loss": -0.0001, "reward": 2.499996781349182, "reward_std": 2.647911770736755e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.088082901554404, "grad_norm": 0.056184545774577904, "kl": 0.0426025390625, "learning_rate": 7.914507772020725e-07, "loss": -0.0006, "reward": 2.499998927116394, "reward_std": 8.558417619042302e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 2.0906735751295336, "grad_norm": 82.12138733630296, "kl": 0.0738525390625, "learning_rate": 7.911917098445595e-07, "loss": 0.0006, "reward": 2.306434154510498, "reward_std": 0.2671876762778993, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8064342141151428, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.093264248704663, "grad_norm": 39.75224721499368, "kl": 0.131591796875, "learning_rate": 7.909326424870466e-07, "loss": 0.0002, "reward": 2.374869227409363, "reward_std": 0.23160941991955042, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748692870140076, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 2.0958549222797926, "grad_norm": 12.512084781215352, "kl": 0.0728759765625, "learning_rate": 7.906735751295335e-07, "loss": -0.0001, "reward": 2.402372360229492, "reward_std": 0.2760960097671159, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9023725390434265, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.098445595854922, "grad_norm": 15.58087078184914, "kl": 0.0845947265625, "learning_rate": 7.904145077720206e-07, "loss": 0.0003, "reward": 1.7498490810394287, "reward_std": 0.26727011656885225, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2498490810394287, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.1010362694300517, "grad_norm": 10.028903177969555, "kl": 0.089111328125, "learning_rate": 7.901554404145078e-07, "loss": 0.0009, "reward": 2.4999066591262817, "reward_std": 9.397085034379415e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999064803123474, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.1036269430051813, "grad_norm": 42.53789859794629, "kl": 0.14892578125, "learning_rate": 7.898963730569948e-07, "loss": 0.0015, "reward": 1.9960054159164429, "reward_std": 0.00035896282710723426, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4960054159164429, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.106217616580311, "grad_norm": 69.72910435323216, "kl": 0.12451171875, "learning_rate": 7.896373056994819e-07, "loss": 0.0005, "reward": 2.298454165458679, "reward_std": 0.2781435576451372, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7984542846679688, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.1088082901554404, "grad_norm": 0.31617685399524487, "kl": 0.03387451171875, "learning_rate": 7.89378238341969e-07, "loss": 0.001, "reward": 2.499996542930603, "reward_std": 2.301369363522099e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.11139896373057, "grad_norm": 18.405205405846875, "kl": 0.072021484375, "learning_rate": 7.891191709844559e-07, "loss": 0.0004, "reward": 2.4995542764663696, "reward_std": 0.00011681799310281349, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995543956756592, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.1139896373056994, "grad_norm": 1.9553494809488006, "kl": 0.12890625, "learning_rate": 7.88860103626943e-07, "loss": 0.0006, "reward": 2.4999401569366455, "reward_std": 2.828069759175378e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999402165412903, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.116580310880829, "grad_norm": 3.0640260211442296, "kl": 0.1630859375, "learning_rate": 7.8860103626943e-07, "loss": 0.0004, "reward": 2.499991774559021, "reward_std": 9.657326614842532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.1191709844559585, "grad_norm": 160.147805385429, "kl": 0.0606689453125, "learning_rate": 7.883419689119171e-07, "loss": -0.0006, "reward": 1.999306559562683, "reward_std": 0.00039623541294986353, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499306559562683, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.121761658031088, "grad_norm": 3.886634685348415, "kl": 0.5113525390625, "learning_rate": 7.880829015544042e-07, "loss": 0.0019, "reward": 2.4106621742248535, "reward_std": 0.25268226398407023, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.910662293434143, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.1243523316062176, "grad_norm": 5.463729048763219, "kl": 0.0826416015625, "learning_rate": 7.878238341968912e-07, "loss": -0.0006, "reward": 2.499975085258484, "reward_std": 1.983281140383042e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975323677063, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.126943005181347, "grad_norm": 1.6968617898008451, "kl": 0.16357421875, "learning_rate": 7.875647668393782e-07, "loss": -0.0005, "reward": 1.9996801614761353, "reward_std": 2.749540878710377e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996803402900696, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1295336787564767, "grad_norm": 0.3584589275948276, "kl": 0.07513427734375, "learning_rate": 7.873056994818652e-07, "loss": -0.0004, "reward": 1.9999231696128845, "reward_std": 5.105447201003699e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999233186244965, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1321243523316062, "grad_norm": 1.1964297839352696, "kl": 0.1029052734375, "learning_rate": 7.870466321243523e-07, "loss": 0.0008, "reward": 1.9985453486442566, "reward_std": 1.607097510714084e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498545378446579, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.134715025906736, "grad_norm": 0.2555782625831436, "kl": 0.1787109375, "learning_rate": 7.867875647668394e-07, "loss": -0.0, "reward": 2.499994993209839, "reward_std": 2.2009395479472005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1373056994818653, "grad_norm": 1.185431365056362, "kl": 0.1474609375, "learning_rate": 7.865284974093264e-07, "loss": 0.0, "reward": 1.9999152421951294, "reward_std": 7.194373210950289e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999153912067413, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.139896373056995, "grad_norm": 81.46332385218038, "kl": 0.0782470703125, "learning_rate": 7.862694300518135e-07, "loss": 0.0006, "reward": 2.187281310558319, "reward_std": 0.25892766599395145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6872811913490295, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1424870466321244, "grad_norm": 2.4871128232772035, "kl": 0.8310546875, "learning_rate": 7.860103626943004e-07, "loss": 0.0033, "reward": 2.499991297721863, "reward_std": 8.376256118935999e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.145077720207254, "grad_norm": 0.2768657147885546, "kl": 0.0615234375, "learning_rate": 7.857512953367875e-07, "loss": -0.0, "reward": 2.4999730587005615, "reward_std": 3.175752226525219e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973177909851, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.1476683937823835, "grad_norm": 32.21696010075703, "kl": 0.1448974609375, "learning_rate": 7.854922279792746e-07, "loss": 0.0006, "reward": 1.8902733325958252, "reward_std": 0.17703182611148804, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3902733623981476, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.150259067357513, "grad_norm": 14.708848952487546, "kl": 0.132080078125, "learning_rate": 7.852331606217616e-07, "loss": 0.0003, "reward": 1.4905517101287842, "reward_std": 0.0002584439571364783, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9905517995357513, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1528497409326426, "grad_norm": 0.8821324462348575, "kl": 0.02239990234375, "learning_rate": 7.849740932642487e-07, "loss": -0.0, "reward": 2.4999929666519165, "reward_std": 5.975475801278662e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.155440414507772, "grad_norm": 0.45672177704669265, "kl": 0.052490234375, "learning_rate": 7.847150259067357e-07, "loss": 0.0003, "reward": 1.9999288320541382, "reward_std": 9.878930086415494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499928891658783, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1580310880829017, "grad_norm": 9.245308572334315, "kl": 0.1533203125, "learning_rate": 7.844559585492227e-07, "loss": 0.0013, "reward": 2.499931812286377, "reward_std": 5.431316446902201e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999931812286377, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.160621761658031, "grad_norm": 9.764383689176286, "kl": 0.15234375, "learning_rate": 7.841968911917098e-07, "loss": 0.0003, "reward": 2.499988555908203, "reward_std": 9.286042370604264e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1632124352331608, "grad_norm": 4.655281682587037, "kl": 0.129150390625, "learning_rate": 7.839378238341968e-07, "loss": 0.0013, "reward": 1.986701488494873, "reward_std": 0.0011662132325369612, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4867012798786163, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.1658031088082903, "grad_norm": 17.56675902588832, "kl": 0.08154296875, "learning_rate": 7.836787564766839e-07, "loss": -0.0003, "reward": 1.8743489384651184, "reward_std": 0.001236509110640327, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3743488788604736, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.16839378238342, "grad_norm": 2.747082448275533, "kl": 0.08544921875, "learning_rate": 7.83419689119171e-07, "loss": 0.0009, "reward": 1.998904287815094, "reward_std": 3.858891295749345e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989041984081268, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1709844559585494, "grad_norm": 3.9127029335257935, "kl": 0.7099609375, "learning_rate": 7.83160621761658e-07, "loss": 0.0032, "reward": 2.499981641769409, "reward_std": 1.0990483133355156e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815821647644, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 2.173575129533679, "grad_norm": 12.55711287538366, "kl": 0.088134765625, "learning_rate": 7.829015544041451e-07, "loss": 0.0002, "reward": 1.9975048303604126, "reward_std": 0.002005275209626234, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975048005580902, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1761658031088085, "grad_norm": 0.6343800195194895, "kl": 0.1490478515625, "learning_rate": 7.82642487046632e-07, "loss": 0.0008, "reward": 2.499981164932251, "reward_std": 1.1993369071205962e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999981164932251, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 34.25, "epoch": 2.178756476683938, "grad_norm": 32.69166364468496, "kl": 0.064453125, "learning_rate": 7.823834196891191e-07, "loss": 0.0008, "reward": 2.312381625175476, "reward_std": 0.2589312991796078, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8123815059661865, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1813471502590676, "grad_norm": 0.24945600525058792, "kl": 0.02716064453125, "learning_rate": 7.821243523316062e-07, "loss": 0.0004, "reward": 2.499980092048645, "reward_std": 4.777468404881802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979853630066, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.1839378238341967, "grad_norm": 0.7059778030870555, "kl": 0.109619140625, "learning_rate": 7.818652849740932e-07, "loss": 0.0009, "reward": 1.9999275207519531, "reward_std": 8.797935038273863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999276101589203, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.186528497409326, "grad_norm": 0.4589818445768626, "kl": 0.057861328125, "learning_rate": 7.816062176165803e-07, "loss": -0.0002, "reward": 2.4999959468841553, "reward_std": 2.5094516331591876e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 2.1891191709844557, "grad_norm": 9.165496137938725, "kl": 0.131103515625, "learning_rate": 7.813471502590672e-07, "loss": 0.0009, "reward": 1.9891877174377441, "reward_std": 0.00015411775825668883, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4891875684261322, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.1917098445595853, "grad_norm": 215.1622206840282, "kl": 0.1060791015625, "learning_rate": 7.810880829015543e-07, "loss": 0.0003, "reward": 1.9992945194244385, "reward_std": 0.0001992236175283324, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992945790290833, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.194300518134715, "grad_norm": 3.624025423979156, "kl": 0.0777587890625, "learning_rate": 7.808290155440414e-07, "loss": 0.0001, "reward": 2.499942898750305, "reward_std": 2.195755723732873e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999431371688843, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.1968911917098444, "grad_norm": 0.4654506824478735, "kl": 0.1259765625, "learning_rate": 7.805699481865284e-07, "loss": -0.0001, "reward": 2.499990701675415, "reward_std": 4.144567014918721e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.199481865284974, "grad_norm": 12.45759018396317, "kl": 0.080322265625, "learning_rate": 7.803108808290155e-07, "loss": -0.0007, "reward": 2.4999905824661255, "reward_std": 7.400932872769772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2020725388601035, "grad_norm": 1.0970616204339865, "kl": 0.1279296875, "learning_rate": 7.800518134715025e-07, "loss": 0.0016, "reward": 1.99956476688385, "reward_std": 2.758353741683095e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499564677476883, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.204663212435233, "grad_norm": 33.79485917037079, "kl": 0.0955810546875, "learning_rate": 7.797927461139896e-07, "loss": 0.0005, "reward": 2.1248949766159058, "reward_std": 0.23151709060863368, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6248949766159058, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2072538860103625, "grad_norm": 6.609977037423613, "kl": 0.120849609375, "learning_rate": 7.795336787564766e-07, "loss": -0.0002, "reward": 1.751718282699585, "reward_std": 0.0004842634275519231, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2517182528972626, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.209844559585492, "grad_norm": 0.08309396856625578, "kl": 0.072509765625, "learning_rate": 7.792746113989636e-07, "loss": -0.0, "reward": 2.499998092651367, "reward_std": 1.7144134574209602e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2124352331606216, "grad_norm": 0.752714079456917, "kl": 0.1435546875, "learning_rate": 7.790155440414508e-07, "loss": -0.0001, "reward": 1.9999275207519531, "reward_std": 8.801098829280818e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999276995658875, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.215025906735751, "grad_norm": 5.114989287280495, "kl": 0.072509765625, "learning_rate": 7.787564766839378e-07, "loss": 0.0004, "reward": 1.8816779851913452, "reward_std": 0.0003030264506378444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3816779851913452, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 2.2176165803108807, "grad_norm": 44.150714063554716, "kl": 0.1502685546875, "learning_rate": 7.784974093264249e-07, "loss": 0.0003, "reward": 1.956153929233551, "reward_std": 0.06839726611087826, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4561539888381958, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2202072538860103, "grad_norm": 7.571076160448421, "kl": 0.18896484375, "learning_rate": 7.78238341968912e-07, "loss": 0.0008, "reward": 2.3748414516448975, "reward_std": 0.23147132278410254, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748415112495422, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.22279792746114, "grad_norm": 0.16070589453556666, "kl": 0.0986328125, "learning_rate": 7.779792746113989e-07, "loss": -0.0002, "reward": 2.499997138977051, "reward_std": 1.8427381291985512e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2253886010362693, "grad_norm": 40.6289215658432, "kl": 0.136474609375, "learning_rate": 7.77720207253886e-07, "loss": 0.0005, "reward": 2.1645208597183228, "reward_std": 0.2799772632490658, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.664520800113678, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.227979274611399, "grad_norm": 15.523657132558238, "kl": 0.0343017578125, "learning_rate": 7.774611398963731e-07, "loss": 0.0005, "reward": 2.4373964071273804, "reward_std": 0.17682450337451883, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373964071273804, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.2305699481865284, "grad_norm": 25.211609753750515, "kl": 0.05999755859375, "learning_rate": 7.772020725388601e-07, "loss": -0.0001, "reward": 2.2499719858169556, "reward_std": 0.2672875080694723, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499721050262451, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.233160621761658, "grad_norm": 5.458538411563557, "kl": 0.0927734375, "learning_rate": 7.769430051813472e-07, "loss": 0.0009, "reward": 1.9925637245178223, "reward_std": 0.00011932382176382816, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4925637543201447, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2357512953367875, "grad_norm": 0.22704813565423648, "kl": 0.12744140625, "learning_rate": 7.766839378238342e-07, "loss": 0.0008, "reward": 2.499995231628418, "reward_std": 1.782142135198228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.238341968911917, "grad_norm": 21.577454476070237, "kl": 0.1119384765625, "learning_rate": 7.764248704663212e-07, "loss": 0.0001, "reward": 2.1076736450195312, "reward_std": 0.24214620607835968, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6076736450195312, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2409326424870466, "grad_norm": 6.782336823292774, "kl": 0.0589599609375, "learning_rate": 7.761658031088083e-07, "loss": 0.0002, "reward": 2.4999842643737793, "reward_std": 1.497648656823003e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999842643737793, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.243523316062176, "grad_norm": 0.5581986549913339, "kl": 0.0618896484375, "learning_rate": 7.759067357512953e-07, "loss": 0.0008, "reward": 2.499993324279785, "reward_std": 4.510141138780455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2461139896373057, "grad_norm": 49.75474340508649, "kl": 0.12847900390625, "learning_rate": 7.756476683937824e-07, "loss": 0.0003, "reward": 1.8833805322647095, "reward_std": 0.0005223634916546871, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3833806216716766, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2487046632124352, "grad_norm": 2.0896393892920107, "kl": 0.3172607421875, "learning_rate": 7.753886010362694e-07, "loss": 0.0008, "reward": 2.499990940093994, "reward_std": 2.5628186222093063e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2512953367875648, "grad_norm": 0.24861797004995637, "kl": 0.0809326171875, "learning_rate": 7.751295336787565e-07, "loss": -0.0007, "reward": 2.4999920129776, "reward_std": 3.441311719143414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2538860103626943, "grad_norm": 35.29801492876463, "kl": 0.112548828125, "learning_rate": 7.748704663212435e-07, "loss": -0.0009, "reward": 2.437470316886902, "reward_std": 0.17684745959616066, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937470555305481, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.256476683937824, "grad_norm": 2374.8087135124956, "kl": 0.1162109375, "learning_rate": 7.746113989637305e-07, "loss": 0.0006, "reward": 1.341384470462799, "reward_std": 0.00830282815877581, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8413844406604767, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.2590673575129534, "grad_norm": 1.612652360564506, "kl": 0.0404052734375, "learning_rate": 7.743523316062176e-07, "loss": 0.0003, "reward": 2.499964475631714, "reward_std": 1.1771840490837349e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999642968177795, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 2.261658031088083, "grad_norm": 281.0796457824024, "kl": 0.125, "learning_rate": 7.740932642487046e-07, "loss": 0.0006, "reward": 1.9147862792015076, "reward_std": 0.2406559771970933, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4147863686084747, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.2642487046632125, "grad_norm": 0.35357104327355054, "kl": 0.0709228515625, "learning_rate": 7.738341968911917e-07, "loss": -0.0002, "reward": 2.4999741315841675, "reward_std": 5.235112041646062e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999741315841675, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.266839378238342, "grad_norm": 21.598338438509938, "kl": 0.10302734375, "learning_rate": 7.735751295336788e-07, "loss": -0.0003, "reward": 1.9904950261116028, "reward_std": 0.00015373223186543328, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4904950857162476, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2694300518134716, "grad_norm": 1899.472018168331, "kl": 352.0703125, "learning_rate": 7.733160621761657e-07, "loss": 1.4073, "reward": 2.4369258880615234, "reward_std": 0.17730323061186937, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.936926007270813, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.272020725388601, "grad_norm": 7.177511414549619, "kl": 0.0517578125, "learning_rate": 7.730569948186528e-07, "loss": -0.0006, "reward": 2.4999337196350098, "reward_std": 2.6807001859197044e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999337792396545, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 2.2746113989637307, "grad_norm": 29.00678800812169, "kl": 0.163330078125, "learning_rate": 7.727979274611398e-07, "loss": 0.0001, "reward": 1.9999151825904846, "reward_std": 0.534536676856078, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499915361404419, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.27720207253886, "grad_norm": 2.734778269916389, "kl": 0.070068359375, "learning_rate": 7.725388601036269e-07, "loss": 0.0002, "reward": 2.4999901056289673, "reward_std": 2.320093244634336e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2797927461139897, "grad_norm": 189.42008065905318, "kl": 0.0594482421875, "learning_rate": 7.72279792746114e-07, "loss": 0.0005, "reward": 2.3124356269836426, "reward_std": 0.2588570897974023, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812435507774353, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.2823834196891193, "grad_norm": 2.3949283406082436, "kl": 0.0804443359375, "learning_rate": 7.72020725388601e-07, "loss": 0.0, "reward": 2.4999879598617554, "reward_std": 1.1736271403606224e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 2.284974093264249, "grad_norm": 17.854987868314215, "kl": 0.060791015625, "learning_rate": 7.71761658031088e-07, "loss": 0.0011, "reward": 2.4342384338378906, "reward_std": 0.18567730413087702, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9342381954193115, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 2.2875647668393784, "grad_norm": 24.498335405911746, "kl": 0.03338623046875, "learning_rate": 7.715025906735751e-07, "loss": -0.0002, "reward": 2.4994282722473145, "reward_std": 0.0010543846919972566, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999428391456604, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.290155440414508, "grad_norm": 9.373854962364303, "kl": 2.26220703125, "learning_rate": 7.712435233160621e-07, "loss": 0.0099, "reward": 2.4999752044677734, "reward_std": 1.0168109156438732e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999750852584839, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.2927461139896375, "grad_norm": 3.952252704619346, "kl": 0.2291259765625, "learning_rate": 7.709844559585492e-07, "loss": 0.0012, "reward": 1.718647539615631, "reward_std": 0.0004527728497123462, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.21864752471447, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.295336787564767, "grad_norm": 175.38611571985803, "kl": 0.08642578125, "learning_rate": 7.707253886010362e-07, "loss": 0.0005, "reward": 2.437479615211487, "reward_std": 0.17680806750206557, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937479555606842, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.2979274611398965, "grad_norm": 0.24521506533891216, "kl": 0.102783203125, "learning_rate": 7.704663212435233e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 2.487221991032129e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.300518134715026, "grad_norm": 0.45039286672375983, "kl": 0.074462890625, "learning_rate": 7.702072538860103e-07, "loss": 0.0007, "reward": 2.4999959468841553, "reward_std": 2.526859134377446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3031088082901556, "grad_norm": 2.412222213302715, "kl": 0.0927734375, "learning_rate": 7.699481865284973e-07, "loss": 0.0001, "reward": 2.4999910593032837, "reward_std": 7.825213742762571e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 2.305699481865285, "grad_norm": 0.42694143596131684, "kl": 0.0982666015625, "learning_rate": 7.696891191709844e-07, "loss": 0.0008, "reward": 2.4999938011169434, "reward_std": 2.936749979198794e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3082901554404147, "grad_norm": 2.295148355288371, "kl": 0.0457763671875, "learning_rate": 7.694300518134714e-07, "loss": 0.0011, "reward": 2.4999594688415527, "reward_std": 2.161479324058746e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999959409236908, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3108808290155443, "grad_norm": 1.7524269172846563, "kl": 0.0533447265625, "learning_rate": 7.691709844559585e-07, "loss": 0.0006, "reward": 2.4999853372573853, "reward_std": 8.951603717832768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852776527405, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.313471502590674, "grad_norm": 39.53500860615621, "kl": 0.3026123046875, "learning_rate": 7.689119170984456e-07, "loss": 0.0012, "reward": 1.9712463021278381, "reward_std": 0.0004931320399919059, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4712463021278381, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3160621761658033, "grad_norm": 92.22362739534795, "kl": 0.09173583984375, "learning_rate": 7.686528497409325e-07, "loss": 0.0004, "reward": 2.3743382692337036, "reward_std": 0.23187917500399635, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8743382096290588, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.318652849740933, "grad_norm": 1.3825839477738493, "kl": 0.07373046875, "learning_rate": 7.683937823834196e-07, "loss": 0.0006, "reward": 2.499995231628418, "reward_std": 2.7748931188398274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.321243523316062, "grad_norm": 3.516374480380441, "kl": 0.120849609375, "learning_rate": 7.681347150259066e-07, "loss": 0.0003, "reward": 2.499992251396179, "reward_std": 6.5902706865017535e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 2.3238341968911915, "grad_norm": 0.1115842924918186, "kl": 0.0286865234375, "learning_rate": 7.678756476683938e-07, "loss": 0.0006, "reward": 2.4999979734420776, "reward_std": 1.8105575918525574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.326424870466321, "grad_norm": 15.468787981921725, "kl": 0.099853515625, "learning_rate": 7.676165803108809e-07, "loss": -0.0001, "reward": 2.062446713447571, "reward_std": 0.17679192748755668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624468922615051, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3290155440414506, "grad_norm": 10.327332514225484, "kl": 0.096435546875, "learning_rate": 7.673575129533679e-07, "loss": -0.0004, "reward": 2.499949812889099, "reward_std": 2.4012788344407454e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999500513076782, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.33160621761658, "grad_norm": 1.4606335096405556, "kl": 0.1181640625, "learning_rate": 7.670984455958549e-07, "loss": -0.0005, "reward": 2.499962568283081, "reward_std": 1.0645215297699906e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999962568283081, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3341968911917097, "grad_norm": 19.277566538444752, "kl": 0.093017578125, "learning_rate": 7.668393782383419e-07, "loss": 0.0003, "reward": 1.453494369983673, "reward_std": 0.00031578161724610254, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9534944593906403, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3367875647668392, "grad_norm": 8.665589621672831, "kl": 0.06396484375, "learning_rate": 7.66580310880829e-07, "loss": 0.0008, "reward": 2.499971628189087, "reward_std": 2.0828777905990137e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715089797974, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.339378238341969, "grad_norm": 0.7536608764797862, "kl": 0.06280517578125, "learning_rate": 7.663212435233161e-07, "loss": 0.0015, "reward": 2.499990940093994, "reward_std": 6.6174688981845975e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3419689119170983, "grad_norm": 0.21060584059268717, "kl": 0.0931396484375, "learning_rate": 7.660621761658031e-07, "loss": 0.0014, "reward": 2.4999879598617554, "reward_std": 3.3825172067736275e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.344559585492228, "grad_norm": 1.5004328978075863, "kl": 0.0482177734375, "learning_rate": 7.658031088082902e-07, "loss": 0.0008, "reward": 2.499987244606018, "reward_std": 1.2173651271041308e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871850013733, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3471502590673574, "grad_norm": 33.07205757797885, "kl": 0.054931640625, "learning_rate": 7.655440414507772e-07, "loss": 0.0006, "reward": 2.2499086260795593, "reward_std": 0.26734173376803483, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499086260795593, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 2.349740932642487, "grad_norm": 2.18528303977387, "kl": 0.04974365234375, "learning_rate": 7.652849740932642e-07, "loss": 0.0005, "reward": 2.4999897480010986, "reward_std": 7.875904884713236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.3523316062176165, "grad_norm": 5.355271983729303, "kl": 0.0596923828125, "learning_rate": 7.650259067357513e-07, "loss": 0.0009, "reward": 1.9944500923156738, "reward_std": 5.3642829357158917e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4944500923156738, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.354922279792746, "grad_norm": 0.05280843014210008, "kl": 0.03253173828125, "learning_rate": 7.647668393782383e-07, "loss": -0.0009, "reward": 2.4999947547912598, "reward_std": 1.510143363248062e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3575129533678756, "grad_norm": 2.7276928409306374, "kl": 0.05859375, "learning_rate": 7.645077720207254e-07, "loss": 0.0001, "reward": 2.499974250793457, "reward_std": 1.9628562483831047e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999974250793457, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.360103626943005, "grad_norm": 154.93996035639665, "kl": 0.147705078125, "learning_rate": 7.642487046632125e-07, "loss": 0.0006, "reward": 1.9995468854904175, "reward_std": 0.5180795788764954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995468854904175, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.3626943005181347, "grad_norm": 0.6368972116481264, "kl": 0.06591796875, "learning_rate": 7.639896373056994e-07, "loss": 0.0003, "reward": 2.499996542930603, "reward_std": 5.453087851492455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.365284974093264, "grad_norm": 0.16255539230943153, "kl": 0.03662109375, "learning_rate": 7.637305699481865e-07, "loss": 0.0013, "reward": 2.4999961853027344, "reward_std": 1.5942108859690052e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 2.3678756476683938, "grad_norm": 108.95478457970114, "kl": 0.1787109375, "learning_rate": 7.634715025906735e-07, "loss": 0.0003, "reward": 1.9936450719833374, "reward_std": 0.00015028398547656252, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4936451017856598, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3704663212435233, "grad_norm": 24.053330752179644, "kl": 0.16162109375, "learning_rate": 7.632124352331606e-07, "loss": 0.0012, "reward": 2.437483787536621, "reward_std": 0.17678690779575845, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937483787536621, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.373056994818653, "grad_norm": 0.20616139926389984, "kl": 0.047607421875, "learning_rate": 7.629533678756477e-07, "loss": 0.0002, "reward": 2.49999737739563, "reward_std": 1.8199588680545276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3756476683937824, "grad_norm": 0.34134283785246666, "kl": 0.24176025390625, "learning_rate": 7.626943005181347e-07, "loss": 0.0016, "reward": 2.499991536140442, "reward_std": 8.67958056005591e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.378238341968912, "grad_norm": 7.2871362681377425, "kl": 0.0416259765625, "learning_rate": 7.624352331606217e-07, "loss": 0.0003, "reward": 2.4999892711639404, "reward_std": 1.1855105071845173e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.3808290155440415, "grad_norm": 5.71933266264719, "kl": 0.09912109375, "learning_rate": 7.621761658031087e-07, "loss": 0.0002, "reward": 2.4998656511306763, "reward_std": 7.985772032270688e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998657703399658, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.383419689119171, "grad_norm": 4.245496843338156, "kl": 0.1533203125, "learning_rate": 7.619170984455958e-07, "loss": 0.0012, "reward": 2.499936819076538, "reward_std": 1.3957213013782166e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999936580657959, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3860103626943006, "grad_norm": 17.353312604194592, "kl": 0.037841796875, "learning_rate": 7.616580310880829e-07, "loss": 0.0004, "reward": 2.4999512434005737, "reward_std": 1.4205514162313193e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999951183795929, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.38860103626943, "grad_norm": 15.241568110794935, "kl": 0.0609130859375, "learning_rate": 7.613989637305699e-07, "loss": 0.001, "reward": 1.997856318950653, "reward_std": 0.000340745203914139, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978562593460083, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.3911917098445596, "grad_norm": 0.3888835281284561, "kl": 0.099853515625, "learning_rate": 7.61139896373057e-07, "loss": 0.0007, "reward": 2.499985098838806, "reward_std": 4.012306021650147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999849200248718, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.393782383419689, "grad_norm": 2.001822140388647, "kl": 0.14794921875, "learning_rate": 7.608808290155439e-07, "loss": -0.0002, "reward": 2.4998509883880615, "reward_std": 3.331619564050925e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998509287834167, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3963730569948187, "grad_norm": 6.80795492525771, "kl": 0.05853271484375, "learning_rate": 7.60621761658031e-07, "loss": 0.0005, "reward": 2.4999635219573975, "reward_std": 3.1137228916122694e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999634623527527, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.3989637305699483, "grad_norm": 10.257577291307154, "kl": 0.101806640625, "learning_rate": 7.603626943005181e-07, "loss": 0.0007, "reward": 2.499861001968384, "reward_std": 4.529693251242861e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998611211776733, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.401554404145078, "grad_norm": 891.1018910361465, "kl": 0.135986328125, "learning_rate": 7.601036269430051e-07, "loss": 0.0001, "reward": 1.9697346687316895, "reward_std": 0.004559659611004463, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.469734638929367, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.4041450777202074, "grad_norm": 24.367917946252145, "kl": 0.0771484375, "learning_rate": 7.598445595854922e-07, "loss": 0.0004, "reward": 1.9998607635498047, "reward_std": 6.505734745587688e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998607635498047, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.406735751295337, "grad_norm": 0.8639193938537891, "kl": 0.215576171875, "learning_rate": 7.595854922279792e-07, "loss": 0.002, "reward": 1.499997854232788, "reward_std": 1.9369217625353485e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999979138374329, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4093264248704664, "grad_norm": 44.83049413648358, "kl": 0.14306640625, "learning_rate": 7.593264248704662e-07, "loss": 0.0005, "reward": 1.4576206803321838, "reward_std": 0.07640038783756609, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9576206803321838, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.411917098445596, "grad_norm": 9.925248483551462, "kl": 0.0693359375, "learning_rate": 7.590673575129533e-07, "loss": 0.001, "reward": 2.312297821044922, "reward_std": 0.2588079248100712, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122978210449219, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4145077720207255, "grad_norm": 10.712344689927866, "kl": 0.111572265625, "learning_rate": 7.588082901554403e-07, "loss": 0.0005, "reward": 1.910188615322113, "reward_std": 0.00021999774787673232, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4101886749267578, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 2.417098445595855, "grad_norm": 127.68799003037205, "kl": 0.083984375, "learning_rate": 7.585492227979274e-07, "loss": 0.0, "reward": 2.1846303939819336, "reward_std": 0.2610150386326211, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6846305131912231, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4196891191709846, "grad_norm": 135.10797732635731, "kl": 0.111328125, "learning_rate": 7.582901554404145e-07, "loss": 0.0012, "reward": 2.4999747276306152, "reward_std": 2.5614713194954675e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999747276306152, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.422279792746114, "grad_norm": 6.729782260653362, "kl": 0.086669921875, "learning_rate": 7.580310880829015e-07, "loss": 0.0009, "reward": 2.4998395442962646, "reward_std": 7.537870351370657e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998394846916199, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.4248704663212437, "grad_norm": 0.8001945007482641, "kl": 0.111572265625, "learning_rate": 7.577720207253885e-07, "loss": 0.0007, "reward": 1.999828815460205, "reward_std": 1.1383938954168116e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998287558555603, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4274611398963732, "grad_norm": 1.087960734243812, "kl": 0.06793212890625, "learning_rate": 7.575129533678755e-07, "loss": -0.0005, "reward": 2.4999839067459106, "reward_std": 4.708190999735962e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999840259552002, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.4300518134715023, "grad_norm": 46.59050477781762, "kl": 0.0738525390625, "learning_rate": 7.572538860103626e-07, "loss": 0.001, "reward": 2.4999462366104126, "reward_std": 3.068698788410984e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999946117401123, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.432642487046632, "grad_norm": 20.514791087029103, "kl": 0.16015625, "learning_rate": 7.569948186528498e-07, "loss": 0.001, "reward": 1.9996665716171265, "reward_std": 4.577649838211073e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996665120124817, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.4352331606217614, "grad_norm": 0.7944817643438287, "kl": 0.021881103515625, "learning_rate": 7.567357512953368e-07, "loss": -0.0002, "reward": 1.9991827011108398, "reward_std": 3.211699061012041e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991827309131622, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.437823834196891, "grad_norm": 1.3239401855227448, "kl": 0.1572265625, "learning_rate": 7.564766839378239e-07, "loss": 0.001, "reward": 2.4999938011169434, "reward_std": 6.030892564012902e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4404145077720205, "grad_norm": 0.713359777445594, "kl": 0.103515625, "learning_rate": 7.562176165803108e-07, "loss": 0.0004, "reward": 2.4999849796295166, "reward_std": 7.23069092600781e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.44300518134715, "grad_norm": 1.1428786990802968, "kl": 0.04083251953125, "learning_rate": 7.559585492227979e-07, "loss": 0.0005, "reward": 2.4999879598617554, "reward_std": 1.0063386071124114e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987781047821, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 2.4455958549222796, "grad_norm": 48.60938521682109, "kl": 0.103759765625, "learning_rate": 7.55699481865285e-07, "loss": -0.0001, "reward": 1.9995916485786438, "reward_std": 0.00011154991989315022, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995917081832886, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.448186528497409, "grad_norm": 16.40060561521158, "kl": 0.058837890625, "learning_rate": 7.55440414507772e-07, "loss": -0.0008, "reward": 1.8128111362457275, "reward_std": 0.00039212397177834646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3128113150596619, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 34.1875, "epoch": 2.4507772020725387, "grad_norm": 1.129085775623217, "kl": 0.0521240234375, "learning_rate": 7.551813471502591e-07, "loss": -0.0009, "reward": 2.4999914169311523, "reward_std": 8.720686196284078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.4533678756476682, "grad_norm": 11.128557764822244, "kl": 0.115234375, "learning_rate": 7.549222797927461e-07, "loss": -0.0006, "reward": 2.4998854398727417, "reward_std": 0.0002141268014383968, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998857975006104, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.4559585492227978, "grad_norm": 7.853008323011336, "kl": 0.2027587890625, "learning_rate": 7.546632124352331e-07, "loss": 0.0008, "reward": 1.9998813271522522, "reward_std": 3.5622451719063974e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499881386756897, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4585492227979273, "grad_norm": 136.70544011738588, "kl": 1.00128173828125, "learning_rate": 7.544041450777202e-07, "loss": 0.0049, "reward": 2.4353466033935547, "reward_std": 0.18270384752031532, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9353466033935547, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.461139896373057, "grad_norm": 0.6316161613629311, "kl": 0.154296875, "learning_rate": 7.541450777202072e-07, "loss": 0.0002, "reward": 2.4999895095825195, "reward_std": 7.198604862423963e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.4637305699481864, "grad_norm": 1.9242499639220416, "kl": 0.1513671875, "learning_rate": 7.538860103626943e-07, "loss": 0.0013, "reward": 2.4999783039093018, "reward_std": 7.100676043592102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781847000122, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.466321243523316, "grad_norm": 2.061044326704196, "kl": 0.1064453125, "learning_rate": 7.536269430051813e-07, "loss": 0.0012, "reward": 2.49991512298584, "reward_std": 1.4621544096371508e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999150037765503, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.4689119170984455, "grad_norm": 0.15424854191835632, "kl": 0.1015625, "learning_rate": 7.533678756476684e-07, "loss": 0.0006, "reward": 2.499998092651367, "reward_std": 2.2490693254439975e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 2.471502590673575, "grad_norm": 21.108187479284748, "kl": 0.0908203125, "learning_rate": 7.531088082901554e-07, "loss": 0.0002, "reward": 2.3197872638702393, "reward_std": 0.33367914653513253, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.819787323474884, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.4740932642487046, "grad_norm": 9.150908923727552, "kl": 0.126708984375, "learning_rate": 7.528497409326424e-07, "loss": 0.0005, "reward": 2.4998146295547485, "reward_std": 4.8566987743470236e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998146295547485, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.476683937823834, "grad_norm": 6.865160570357046, "kl": 0.11669921875, "learning_rate": 7.525906735751295e-07, "loss": 0.0004, "reward": 1.6648834943771362, "reward_std": 0.2318227205250878, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1648836135864258, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.4792746113989637, "grad_norm": 1.5753853531041222, "kl": 0.20654296875, "learning_rate": 7.523316062176166e-07, "loss": 0.0006, "reward": 2.499989867210388, "reward_std": 9.547110607854847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898672103882, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.481865284974093, "grad_norm": 1.7271326723198381, "kl": 0.0703125, "learning_rate": 7.520725388601036e-07, "loss": -0.0005, "reward": 2.499770760536194, "reward_std": 1.9740587504202267e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997708797454834, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.4844559585492227, "grad_norm": 23.183072704143797, "kl": 0.127197265625, "learning_rate": 7.518134715025907e-07, "loss": 0.0005, "reward": 2.4374130964279175, "reward_std": 0.17691215011655004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374129176139832, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 2.4870466321243523, "grad_norm": 13.736891689688672, "kl": 0.139404296875, "learning_rate": 7.515544041450776e-07, "loss": 0.0007, "reward": 2.3749916553497314, "reward_std": 0.3535528115900206, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749916553497314, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.489637305699482, "grad_norm": 0.4082768571347582, "kl": 0.0653076171875, "learning_rate": 7.512953367875647e-07, "loss": 0.0011, "reward": 2.499945878982544, "reward_std": 7.201321636784996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999457597732544, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.4922279792746114, "grad_norm": 1.5750787573387628, "kl": 0.070343017578125, "learning_rate": 7.510362694300518e-07, "loss": 0.0008, "reward": 2.499985098838806, "reward_std": 9.421758363714616e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.494818652849741, "grad_norm": 7.2659527944081415, "kl": 0.0777587890625, "learning_rate": 7.507772020725388e-07, "loss": -0.0006, "reward": 1.9931894540786743, "reward_std": 0.00021563546283687174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4931894838809967, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.4974093264248705, "grad_norm": 29.696451106721135, "kl": 0.1875, "learning_rate": 7.505181347150259e-07, "loss": 0.0014, "reward": 2.061533212661743, "reward_std": 0.17716989700329577, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5615330934524536, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.5, "grad_norm": 2.962726576057743, "kl": 0.04034423828125, "learning_rate": 7.502590673575129e-07, "loss": -0.0003, "reward": 2.499897003173828, "reward_std": 2.4297271693285438e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999897062778473, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5025906735751295, "grad_norm": 1.4385157729398097, "kl": 0.12890625, "learning_rate": 7.5e-07, "loss": 0.0022, "reward": 2.499987840652466, "reward_std": 7.570292268610501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.505181347150259, "grad_norm": 4.265232340889757, "kl": 0.069091796875, "learning_rate": 7.49740932642487e-07, "loss": 0.0001, "reward": 2.49997341632843, "reward_std": 1.600029895598709e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973475933075, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5077720207253886, "grad_norm": 35.721329691325536, "kl": 0.08642578125, "learning_rate": 7.49481865284974e-07, "loss": 0.0003, "reward": 2.1772468090057373, "reward_std": 0.5042813867330551, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6772468090057373, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.510362694300518, "grad_norm": 0.40477350161334574, "kl": 0.094970703125, "learning_rate": 7.492227979274611e-07, "loss": 0.0011, "reward": 2.4999948740005493, "reward_std": 3.273613515375473e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.5129533678756477, "grad_norm": 9.855248488100647, "kl": 0.04345703125, "learning_rate": 7.489637305699481e-07, "loss": 0.0006, "reward": 1.999349296092987, "reward_std": 4.114194223348022e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499349057674408, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5155440414507773, "grad_norm": 1.087687288678207, "kl": 0.109375, "learning_rate": 7.487046632124352e-07, "loss": -0.0003, "reward": 2.499978542327881, "reward_std": 6.4177226022366085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999786019325256, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.518134715025907, "grad_norm": 7.600420496587052, "kl": 0.1429443359375, "learning_rate": 7.484455958549223e-07, "loss": 0.0004, "reward": 1.8474342823028564, "reward_std": 0.0005369920093016844, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3474342823028564, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5207253886010363, "grad_norm": 0.5178823709045375, "kl": 0.09814453125, "learning_rate": 7.481865284974092e-07, "loss": 0.001, "reward": 2.499987244606018, "reward_std": 6.464601426614536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.523316062176166, "grad_norm": 15.363694591604116, "kl": 0.1513671875, "learning_rate": 7.479274611398963e-07, "loss": 0.0007, "reward": 2.041240632534027, "reward_std": 0.1853715334766548, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.541240632534027, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.5259067357512954, "grad_norm": 9.394894754562163, "kl": 0.1845703125, "learning_rate": 7.476683937823833e-07, "loss": 0.001, "reward": 0.9998171329498291, "reward_std": 3.0216364393709227e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.49981701374053955, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 2.528497409326425, "grad_norm": 2.413181954778754, "kl": 0.06842041015625, "learning_rate": 7.474093264248704e-07, "loss": 0.0015, "reward": 2.4999775886535645, "reward_std": 8.515007039022748e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999776482582092, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 2.5310880829015545, "grad_norm": 4.414030042445875, "kl": 0.77783203125, "learning_rate": 7.471502590673575e-07, "loss": 0.0034, "reward": 2.499969482421875, "reward_std": 1.346209126040776e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999695420265198, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.533678756476684, "grad_norm": 25.606485647980744, "kl": 0.115234375, "learning_rate": 7.468911917098445e-07, "loss": 0.0008, "reward": 1.9646154046058655, "reward_std": 0.005181904838082119, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4646154046058655, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.5362694300518136, "grad_norm": 12.262696342722542, "kl": 0.240234375, "learning_rate": 7.466321243523315e-07, "loss": 0.0011, "reward": 1.9309902787208557, "reward_std": 0.02750369685691112, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4309902787208557, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.538860103626943, "grad_norm": 1.9008284492847578, "kl": 0.089111328125, "learning_rate": 7.463730569948187e-07, "loss": 0.0, "reward": 2.499983072280884, "reward_std": 1.5114326970433467e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999829530715942, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.5414507772020727, "grad_norm": 33.828322402233084, "kl": 0.12628173828125, "learning_rate": 7.461139896373057e-07, "loss": 0.001, "reward": 1.854383409023285, "reward_std": 0.00570840007321749, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.354383409023285, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.5440414507772022, "grad_norm": 13.444261598852792, "kl": 0.085205078125, "learning_rate": 7.458549222797928e-07, "loss": 0.0011, "reward": 1.9873749017715454, "reward_std": 0.0007185070289779105, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4873749017715454, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 39.375, "epoch": 2.5466321243523318, "grad_norm": 1.0906646810171143, "kl": 0.0799560546875, "learning_rate": 7.455958549222798e-07, "loss": 0.001, "reward": 2.4999756813049316, "reward_std": 8.79958633959177e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999756217002869, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 2.5492227979274613, "grad_norm": 3.946590977551778, "kl": 0.150634765625, "learning_rate": 7.453367875647669e-07, "loss": 0.0002, "reward": 2.4999853372573853, "reward_std": 1.792325082305979e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998539686203, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 77.25, "epoch": 2.551813471502591, "grad_norm": 2.6119679904943442, "kl": 0.07208251953125, "learning_rate": 7.450777202072539e-07, "loss": 0.001, "reward": 2.499967098236084, "reward_std": 1.539709683129331e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999671578407288, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 41.0625, "epoch": 2.5544041450777204, "grad_norm": 0.24661692718260644, "kl": 0.03045654296875, "learning_rate": 7.448186528497409e-07, "loss": 0.0004, "reward": 2.499992251396179, "reward_std": 3.850713937936234e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 39.9375, "epoch": 2.55699481865285, "grad_norm": 0.3155136756021828, "kl": 0.122802734375, "learning_rate": 7.44559585492228e-07, "loss": 0.0009, "reward": 2.499970316886902, "reward_std": 3.5425043733994244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999701976776123, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 163.8125, "epoch": 2.5595854922279795, "grad_norm": 5.431564811546383, "kl": 0.185302734375, "learning_rate": 7.44300518134715e-07, "loss": 0.0003, "reward": 1.9998611211776733, "reward_std": 7.525141654696199e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998612105846405, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 60.6875, "epoch": 2.562176165803109, "grad_norm": 1.4166366231620315, "kl": 0.083251953125, "learning_rate": 7.440414507772021e-07, "loss": 0.0011, "reward": 2.499987244606018, "reward_std": 1.039934113578056e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 46.5625, "epoch": 2.5647668393782386, "grad_norm": 6.87959332455512, "kl": 0.123779296875, "learning_rate": 7.437823834196892e-07, "loss": 0.0002, "reward": 1.9982973337173462, "reward_std": 5.501119630935136e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4982974231243134, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 56.1875, "epoch": 2.567357512953368, "grad_norm": 2.739631087777935, "kl": 0.203125, "learning_rate": 7.435233160621761e-07, "loss": 0.0012, "reward": 2.499994993209839, "reward_std": 3.6262600815462065e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 51.3125, "epoch": 2.5699481865284977, "grad_norm": 4.967001902255783, "kl": 0.2373046875, "learning_rate": 7.432642487046632e-07, "loss": 0.0015, "reward": 1.9893280267715454, "reward_std": 0.00013621033531308058, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4893280565738678, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 181.5625, "epoch": 2.572538860103627, "grad_norm": 17.900996091287624, "kl": 0.3427734375, "learning_rate": 7.430051813471502e-07, "loss": 0.0013, "reward": 1.2730909585952759, "reward_std": 0.0006147078383946791, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7730909287929535, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 142.8125, "epoch": 2.5751295336787567, "grad_norm": 1.033754492387028, "kl": 0.685546875, "learning_rate": 7.427461139896373e-07, "loss": 0.0024, "reward": 1.9999675154685974, "reward_std": 5.680035656041582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999675452709198, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 167.0625, "epoch": 2.5777202072538863, "grad_norm": 0.20144272289860132, "kl": 0.36328125, "learning_rate": 7.424870466321244e-07, "loss": 0.0013, "reward": 2.4999979734420776, "reward_std": 1.462866350721015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 88.75, "epoch": 2.5803108808290154, "grad_norm": 0.45093562742884274, "kl": 0.48046875, "learning_rate": 7.422279792746114e-07, "loss": 0.0018, "reward": 2.499994993209839, "reward_std": 3.7291632679625764e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 45.3125, "epoch": 2.582901554404145, "grad_norm": 6.023124875685409, "kl": 0.5810546875, "learning_rate": 7.419689119170984e-07, "loss": 0.0028, "reward": 1.8036752939224243, "reward_std": 0.000546291637874674, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3036752939224243, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 193.5, "epoch": 2.5854922279792745, "grad_norm": 4.735721531319997, "kl": 0.521484375, "learning_rate": 7.417098445595854e-07, "loss": 0.0023, "reward": 2.499949336051941, "reward_std": 3.642402725745342e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999492764472961, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 246.25, "epoch": 2.588082901554404, "grad_norm": 0.3355792142965779, "kl": 0.46484375, "learning_rate": 7.414507772020725e-07, "loss": 0.0017, "reward": 2.499997615814209, "reward_std": 1.1579040801734664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 39.125, "epoch": 2.5906735751295336, "grad_norm": 0.7364718498592124, "kl": 0.4345703125, "learning_rate": 7.411917098445596e-07, "loss": 0.0029, "reward": 2.4999947547912598, "reward_std": 3.090572135988623e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 128.875, "epoch": 2.593264248704663, "grad_norm": 1.4080007315033172, "kl": 0.541015625, "learning_rate": 7.409326424870466e-07, "loss": 0.0003, "reward": 2.499993324279785, "reward_std": 5.6037465583358426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 49.5625, "epoch": 2.5958549222797926, "grad_norm": 0.9808436745919166, "kl": 0.55859375, "learning_rate": 7.406735751295337e-07, "loss": 0.0028, "reward": 2.4999938011169434, "reward_std": 3.050547888960864e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 47.125, "epoch": 2.598445595854922, "grad_norm": 0.3646312804268175, "kl": 0.74609375, "learning_rate": 7.404145077720207e-07, "loss": 0.0037, "reward": 2.4999972581863403, "reward_std": 1.649128250846843e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 61.25, "epoch": 2.6010362694300517, "grad_norm": 12.285086371569555, "kl": 0.53515625, "learning_rate": 7.401554404145077e-07, "loss": 0.002, "reward": 2.499593496322632, "reward_std": 0.0001344580450677313, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995936155319214, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 96.9375, "epoch": 2.6036269430051813, "grad_norm": 1.9874453432442083, "kl": 0.349609375, "learning_rate": 7.398963730569948e-07, "loss": 0.0019, "reward": 2.4999794960021973, "reward_std": 9.808404001887538e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999794960021973, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 156.0, "epoch": 2.606217616580311, "grad_norm": 130.16210823613784, "kl": 0.4453125, "learning_rate": 7.396373056994818e-07, "loss": 0.0019, "reward": 1.8914172649383545, "reward_std": 0.24591282738583686, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3914174139499664, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 139.375, "epoch": 2.6088082901554404, "grad_norm": 5.034697211886495, "kl": 0.615234375, "learning_rate": 7.393782383419689e-07, "loss": 0.0026, "reward": 2.4999823570251465, "reward_std": 1.1671747188302106e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 138.25, "epoch": 2.61139896373057, "grad_norm": 1.3056685843873292, "kl": 0.529296875, "learning_rate": 7.39119170984456e-07, "loss": 0.0024, "reward": 2.499987840652466, "reward_std": 1.034896058627055e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 58.3125, "epoch": 2.6139896373056994, "grad_norm": 15.471134858569846, "kl": 0.306640625, "learning_rate": 7.388601036269429e-07, "loss": 0.0015, "reward": 2.4999314546585083, "reward_std": 0.0001298900234019129, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999313950538635, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 67.5, "epoch": 2.616580310880829, "grad_norm": 1.327780629829245, "kl": 0.5419921875, "learning_rate": 7.3860103626943e-07, "loss": 0.0023, "reward": 2.4999128580093384, "reward_std": 1.5311324091271672e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999127388000488, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 39.875, "epoch": 2.6191709844559585, "grad_norm": 44.82114208935299, "kl": 0.248046875, "learning_rate": 7.38341968911917e-07, "loss": 0.0006, "reward": 2.408591866493225, "reward_std": 0.2584836309633829, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9085916876792908, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 49.75, "epoch": 2.621761658031088, "grad_norm": 71.43867037182878, "kl": 0.3359375, "learning_rate": 7.380829015544041e-07, "loss": 0.0014, "reward": 1.8336089849472046, "reward_std": 0.1902480730204843, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3336089849472046, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 55.125, "epoch": 2.6243523316062176, "grad_norm": 1.5182677244338434, "kl": 0.110107421875, "learning_rate": 7.378238341968912e-07, "loss": -0.0003, "reward": 2.4999879598617554, "reward_std": 7.340728188864887e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881386756897, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 43.5625, "epoch": 2.626943005181347, "grad_norm": 7.23031118387489, "kl": 0.2607421875, "learning_rate": 7.375647668393782e-07, "loss": 0.0012, "reward": 2.374991297721863, "reward_std": 0.3535586022121606, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749911785125732, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 2.6295336787564767, "grad_norm": 23.21657081552542, "kl": 0.14208984375, "learning_rate": 7.373056994818652e-07, "loss": 0.0003, "reward": 2.419443368911743, "reward_std": 0.227821338423837, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9194433093070984, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 40.125, "epoch": 2.6321243523316062, "grad_norm": 1.5175973804165943, "kl": 0.22998046875, "learning_rate": 7.370466321243522e-07, "loss": 0.0005, "reward": 2.4999619722366333, "reward_std": 1.5150643491779192e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999618530273438, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.634715025906736, "grad_norm": 19.38826630890325, "kl": 0.03643798828125, "learning_rate": 7.367875647668393e-07, "loss": 0.0, "reward": 2.430171489715576, "reward_std": 0.19747968364572444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9301713705062866, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 2.6373056994818653, "grad_norm": 34.33573971874138, "kl": 0.61767578125, "learning_rate": 7.365284974093264e-07, "loss": 0.0024, "reward": 2.426503539085388, "reward_std": 0.20768518514887546, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.926503598690033, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.639896373056995, "grad_norm": 0.14159952598560224, "kl": 0.0701904296875, "learning_rate": 7.362694300518134e-07, "loss": 0.0001, "reward": 2.4999964237213135, "reward_std": 2.538123453632579e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 39.3125, "epoch": 2.6424870466321244, "grad_norm": 1.3117707935190224, "kl": 0.15234375, "learning_rate": 7.360103626943005e-07, "loss": 0.0007, "reward": 2.4999629259109497, "reward_std": 1.1376158965958894e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999962866306305, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.645077720207254, "grad_norm": 4.321192344468887, "kl": 0.054931640625, "learning_rate": 7.357512953367874e-07, "loss": 0.0006, "reward": 2.4999568462371826, "reward_std": 2.5446793870287365e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999569654464722, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 2.6476683937823835, "grad_norm": 0.2925598934588983, "kl": 0.11279296875, "learning_rate": 7.354922279792745e-07, "loss": 0.0006, "reward": 2.4999924898147583, "reward_std": 4.172429157733859e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.650259067357513, "grad_norm": 0.5271870926498459, "kl": 0.1005859375, "learning_rate": 7.352331606217617e-07, "loss": 0.0008, "reward": 2.4999935626983643, "reward_std": 3.88533538853153e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.6528497409326426, "grad_norm": 68.07486583797518, "kl": 0.0644073486328125, "learning_rate": 7.349740932642487e-07, "loss": -0.0, "reward": 1.9981709122657776, "reward_std": 0.00027494916307091444, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981709420681, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.655440414507772, "grad_norm": 6.229073155096826, "kl": 0.084228515625, "learning_rate": 7.347150259067358e-07, "loss": 0.0003, "reward": 2.4998949766159058, "reward_std": 3.0161281188156863e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998949766159058, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 2.6580310880829017, "grad_norm": 0.2191640383136181, "kl": 0.1220703125, "learning_rate": 7.344559585492228e-07, "loss": 0.0006, "reward": 2.499994397163391, "reward_std": 3.1330532692663837e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.660621761658031, "grad_norm": 3.0848741798994324, "kl": 0.177978515625, "learning_rate": 7.341968911917098e-07, "loss": 0.0011, "reward": 2.4999797344207764, "reward_std": 2.4030719714573934e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999796152114868, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.6632124352331608, "grad_norm": 2.4537002197967426, "kl": 0.12548828125, "learning_rate": 7.339378238341969e-07, "loss": 0.0007, "reward": 1.9999439716339111, "reward_std": 8.941842679632828e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499943882226944, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.6658031088082903, "grad_norm": 10.457539193706586, "kl": 0.072509765625, "learning_rate": 7.336787564766839e-07, "loss": 0.0001, "reward": 2.499864101409912, "reward_std": 5.268064876418066e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998640418052673, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.66839378238342, "grad_norm": 1.969276188527305, "kl": 0.156494140625, "learning_rate": 7.33419689119171e-07, "loss": 0.0005, "reward": 2.49992573261261, "reward_std": 1.4278006801760057e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999257326126099, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.6709844559585494, "grad_norm": 0.6614771318823214, "kl": 0.035797119140625, "learning_rate": 7.331606217616581e-07, "loss": 0.0003, "reward": 2.4999916553497314, "reward_std": 5.722366040572524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 2.6735751295336785, "grad_norm": 9.250982285255036, "kl": 0.083740234375, "learning_rate": 7.329015544041451e-07, "loss": -0.0007, "reward": 1.9418030977249146, "reward_std": 0.026353379398642574, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4418033063411713, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.676165803108808, "grad_norm": 7.13614460442719, "kl": 0.1533203125, "learning_rate": 7.326424870466321e-07, "loss": 0.0013, "reward": 1.9997770190238953, "reward_std": 2.4907179749789066e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997769594192505, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.6787564766839376, "grad_norm": 0.2109501221571532, "kl": 0.138916015625, "learning_rate": 7.323834196891191e-07, "loss": 0.0007, "reward": 2.4999948740005493, "reward_std": 4.7437108605663525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 2.681347150259067, "grad_norm": 18.849485083660635, "kl": 0.0860595703125, "learning_rate": 7.321243523316062e-07, "loss": 0.0003, "reward": 1.9851142168045044, "reward_std": 0.00023543618112853437, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4851142168045044, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.6839378238341967, "grad_norm": 2.055886137768087, "kl": 0.0693359375, "learning_rate": 7.318652849740933e-07, "loss": 0.0007, "reward": 1.9888790845870972, "reward_std": 4.5920855654912884e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4888788759708405, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.686528497409326, "grad_norm": 1.5836413190343166, "kl": 0.080322265625, "learning_rate": 7.316062176165803e-07, "loss": 0.0003, "reward": 1.9970427751541138, "reward_std": 4.156149918799201e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970427453517914, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.6891191709844557, "grad_norm": 0.19613889601357332, "kl": 0.10400390625, "learning_rate": 7.313471502590674e-07, "loss": 0.0008, "reward": 2.49999737739563, "reward_std": 2.947950179077452e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.6917098445595853, "grad_norm": 16.722013357393276, "kl": 0.0987548828125, "learning_rate": 7.310880829015543e-07, "loss": 0.0002, "reward": 1.999333918094635, "reward_std": 0.0007027474280221213, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993338882923126, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.694300518134715, "grad_norm": 45.54708241850775, "kl": 0.11181640625, "learning_rate": 7.308290155440414e-07, "loss": 0.0013, "reward": 1.9994693994522095, "reward_std": 5.113296063541384e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994693994522095, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.6968911917098444, "grad_norm": 16.11110576877349, "kl": 0.0758056640625, "learning_rate": 7.305699481865285e-07, "loss": -0.0001, "reward": 1.8848603963851929, "reward_std": 0.0013421574876701925, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3848604559898376, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.699481865284974, "grad_norm": 10.21808335888086, "kl": 0.114501953125, "learning_rate": 7.303108808290155e-07, "loss": 0.0005, "reward": 2.499887228012085, "reward_std": 2.3803705516911577e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998871088027954, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7020725388601035, "grad_norm": 1.219825400985413, "kl": 0.128173828125, "learning_rate": 7.300518134715026e-07, "loss": 0.0004, "reward": 2.499994158744812, "reward_std": 3.3542859227964072e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.704663212435233, "grad_norm": 0.20517865696171975, "kl": 0.118896484375, "learning_rate": 7.297927461139896e-07, "loss": 0.0014, "reward": 2.4999985694885254, "reward_std": 1.8641724182089092e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7072538860103625, "grad_norm": 0.7543048876017406, "kl": 0.13623046875, "learning_rate": 7.295336787564766e-07, "loss": -0.0003, "reward": 1.9998985528945923, "reward_std": 9.980089657801727e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998984932899475, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 2.709844559585492, "grad_norm": 4.39416196220655, "kl": 0.100341796875, "learning_rate": 7.292746113989637e-07, "loss": 0.0013, "reward": 1.7546041011810303, "reward_std": 0.0002688942377062631, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2546040415763855, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7124352331606216, "grad_norm": 0.07013489109114594, "kl": 0.11376953125, "learning_rate": 7.290155440414507e-07, "loss": 0.0003, "reward": 2.4999940395355225, "reward_std": 1.7276465769100469e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.715025906735751, "grad_norm": 0.6013728295247106, "kl": 0.08740234375, "learning_rate": 7.287564766839378e-07, "loss": 0.0003, "reward": 2.4999935626983643, "reward_std": 4.070469401540322e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7176165803108807, "grad_norm": 2.9610718772419036, "kl": 0.10687255859375, "learning_rate": 7.284974093264248e-07, "loss": 0.0002, "reward": 1.9992656111717224, "reward_std": 3.550504692384493e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992656707763672, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 2.7202072538860103, "grad_norm": 14.283714305643459, "kl": 0.12890625, "learning_rate": 7.282383419689119e-07, "loss": 0.0006, "reward": 1.999831199645996, "reward_std": 2.8587964152393397e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499831110239029, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.72279792746114, "grad_norm": 7.0945124735362075, "kl": 0.0357666015625, "learning_rate": 7.279792746113989e-07, "loss": -0.0004, "reward": 2.49994957447052, "reward_std": 2.700474237826711e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999496936798096, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7253886010362693, "grad_norm": 2.268270623849028, "kl": 0.085205078125, "learning_rate": 7.277202072538859e-07, "loss": 0.0008, "reward": 2.499932646751404, "reward_std": 1.8547011222835863e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999324679374695, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.727979274611399, "grad_norm": 0.2646776621202804, "kl": 0.017669677734375, "learning_rate": 7.27461139896373e-07, "loss": -0.0001, "reward": 2.4999966621398926, "reward_std": 2.1824906184519932e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 37.8125, "epoch": 2.7305699481865284, "grad_norm": 12.252002722776979, "kl": 0.10595703125, "learning_rate": 7.272020725388601e-07, "loss": 0.0004, "reward": 1.9968502521514893, "reward_std": 0.00019809217519650701, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968502521514893, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.733160621761658, "grad_norm": 0.27093322828032407, "kl": 0.069580078125, "learning_rate": 7.269430051813471e-07, "loss": 0.0003, "reward": 2.499993085861206, "reward_std": 3.6280623589846073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7357512953367875, "grad_norm": 0.5667673280559835, "kl": 0.120849609375, "learning_rate": 7.266839378238342e-07, "loss": -0.0008, "reward": 2.499995470046997, "reward_std": 5.511264404844951e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 36.8125, "epoch": 2.738341968911917, "grad_norm": 14.613248784543186, "kl": 0.25537109375, "learning_rate": 7.264248704663211e-07, "loss": 0.0012, "reward": 1.8873506784439087, "reward_std": 0.07073427953764622, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3873507976531982, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.7409326424870466, "grad_norm": 98.28282262718358, "kl": 0.1068115234375, "learning_rate": 7.261658031088082e-07, "loss": 0.0008, "reward": 1.9999032616615295, "reward_std": 6.506397880912118e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49990314245224, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.743523316062176, "grad_norm": 7.366011598428563, "kl": 0.171875, "learning_rate": 7.259067357512953e-07, "loss": 0.0008, "reward": 2.499987244606018, "reward_std": 1.724209550957312e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871253967285, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.7461139896373057, "grad_norm": 1.2074248780971368, "kl": 0.0709228515625, "learning_rate": 7.256476683937823e-07, "loss": 0.0007, "reward": 1.9978476762771606, "reward_std": 5.851211813023838e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978476762771606, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7487046632124352, "grad_norm": 0.5930684530107416, "kl": 0.0626220703125, "learning_rate": 7.253886010362694e-07, "loss": 0.0012, "reward": 2.4999903440475464, "reward_std": 6.0903076928298105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.7512953367875648, "grad_norm": 0.2244633371890959, "kl": 0.0506591796875, "learning_rate": 7.251295336787564e-07, "loss": -0.0008, "reward": 2.499998092651367, "reward_std": 1.7892605228553293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7538860103626943, "grad_norm": 0.6552286577056265, "kl": 0.0318603515625, "learning_rate": 7.248704663212434e-07, "loss": 0.0007, "reward": 2.4999741315841675, "reward_std": 6.703279552766617e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999974012374878, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.756476683937824, "grad_norm": 0.7872457262096866, "kl": 0.2138671875, "learning_rate": 7.246113989637305e-07, "loss": 0.0003, "reward": 2.4995816946029663, "reward_std": 1.2888197943539126e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995818138122559, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 2.7590673575129534, "grad_norm": 40.086736725850606, "kl": 0.14013671875, "learning_rate": 7.243523316062175e-07, "loss": 0.0003, "reward": 2.4374738931655884, "reward_std": 0.17680212369123183, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374739527702332, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.761658031088083, "grad_norm": 31.224919008663623, "kl": 0.1534423828125, "learning_rate": 7.240932642487047e-07, "loss": 0.0005, "reward": 2.3749732971191406, "reward_std": 0.23149630275702293, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749732375144958, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7642487046632125, "grad_norm": 80.219253679017, "kl": 0.375, "learning_rate": 7.238341968911917e-07, "loss": 0.0014, "reward": 2.0020114183425903, "reward_std": 0.20123041486453985, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5020114183425903, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.766839378238342, "grad_norm": 1.9210130664505254, "kl": 0.051025390625, "learning_rate": 7.235751295336788e-07, "loss": -0.0002, "reward": 2.499989151954651, "reward_std": 6.962360998841177e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.7694300518134716, "grad_norm": 12.078434358176192, "kl": 0.1455078125, "learning_rate": 7.233160621761658e-07, "loss": 0.0006, "reward": 2.218725085258484, "reward_std": 0.4519480440785628, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.7499750852584839, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.772020725388601, "grad_norm": 0.4949794646161929, "kl": 0.197265625, "learning_rate": 7.230569948186528e-07, "loss": -0.0001, "reward": 2.4999905824661255, "reward_std": 3.851149870115478e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 2.7746113989637307, "grad_norm": 1.0620997698338368, "kl": 0.0914306640625, "learning_rate": 7.227979274611399e-07, "loss": 0.0015, "reward": 2.499996781349182, "reward_std": 1.297923517995514e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.77720207253886, "grad_norm": 5.5877891106408635, "kl": 0.056396484375, "learning_rate": 7.225388601036269e-07, "loss": -0.0002, "reward": 2.499979257583618, "reward_std": 4.0352184669245617e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979317188263, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 2.7797927461139897, "grad_norm": 19.796692429371053, "kl": 0.076416015625, "learning_rate": 7.22279792746114e-07, "loss": 0.0003, "reward": 2.246677339076996, "reward_std": 0.2706778674717043, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7466772198677063, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7823834196891193, "grad_norm": 2.6063088580666034, "kl": 0.0859375, "learning_rate": 7.220207253886011e-07, "loss": -0.0002, "reward": 2.4999797344207764, "reward_std": 1.2273186030142824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979853630066, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.784974093264249, "grad_norm": 13.069108566992332, "kl": 0.0869140625, "learning_rate": 7.21761658031088e-07, "loss": 0.0006, "reward": 2.437316417694092, "reward_std": 0.17728359372506475, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373162984848022, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.7875647668393784, "grad_norm": 4.755314074861053, "kl": 0.13037109375, "learning_rate": 7.215025906735751e-07, "loss": 0.0002, "reward": 1.9925823211669922, "reward_std": 5.7975628806161694e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4925822615623474, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.790155440414508, "grad_norm": 0.4946854997981964, "kl": 0.149169921875, "learning_rate": 7.212435233160622e-07, "loss": 0.0005, "reward": 2.4999823570251465, "reward_std": 3.7929671634628903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999824166297913, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.7927461139896375, "grad_norm": 8.981555591133127, "kl": 0.0394439697265625, "learning_rate": 7.209844559585492e-07, "loss": 0.001, "reward": 2.4999935626983643, "reward_std": 8.956594228948234e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.795336787564767, "grad_norm": 7.433920545563419, "kl": 0.09521484375, "learning_rate": 7.207253886010363e-07, "loss": 0.0007, "reward": 2.499902367591858, "reward_std": 5.014531780034304e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999022483825684, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.7979274611398965, "grad_norm": 3.956844248290702, "kl": 0.088623046875, "learning_rate": 7.204663212435233e-07, "loss": 0.0003, "reward": 1.999257206916809, "reward_std": 4.888301108962878e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992571771144867, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.800518134715026, "grad_norm": 2.428315916219824, "kl": 0.02392578125, "learning_rate": 7.202072538860103e-07, "loss": -0.0, "reward": 2.499980330467224, "reward_std": 1.1904242001037346e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980390071869, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.8031088082901556, "grad_norm": 0.14568564415111906, "kl": 0.073974609375, "learning_rate": 7.199481865284974e-07, "loss": 0.0016, "reward": 2.4999961853027344, "reward_std": 1.7538935139782552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.805699481865285, "grad_norm": 4.923154202614084, "kl": 0.059814453125, "learning_rate": 7.196891191709844e-07, "loss": 0.0011, "reward": 2.4999935626983643, "reward_std": 4.7752869249961805e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.8082901554404147, "grad_norm": 0.24342664138678616, "kl": 0.09228515625, "learning_rate": 7.194300518134715e-07, "loss": 0.0007, "reward": 2.499969482421875, "reward_std": 5.161863441571768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999696612358093, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.8108808290155443, "grad_norm": 0.23525517553244468, "kl": 0.109130859375, "learning_rate": 7.191709844559585e-07, "loss": 0.0014, "reward": 2.4999958276748657, "reward_std": 2.9200672315710108e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 2.813471502590674, "grad_norm": 0.10910958543718147, "kl": 0.08837890625, "learning_rate": 7.189119170984456e-07, "loss": 0.0001, "reward": 2.49999737739563, "reward_std": 1.6142615208991629e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8160621761658033, "grad_norm": 8.707789481764191, "kl": 0.080322265625, "learning_rate": 7.186528497409327e-07, "loss": 0.0001, "reward": 2.498985767364502, "reward_std": 7.098440448771726e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989857077598572, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.818652849740933, "grad_norm": 3.2743786748842085, "kl": 0.0496826171875, "learning_rate": 7.183937823834196e-07, "loss": 0.0005, "reward": 2.4999217987060547, "reward_std": 1.652415630815085e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999217987060547, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.8212435233160624, "grad_norm": 1.2110577185998637, "kl": 0.11962890625, "learning_rate": 7.181347150259067e-07, "loss": -0.0004, "reward": 2.4999951124191284, "reward_std": 3.3248207245151207e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.823834196891192, "grad_norm": 10.873866547427554, "kl": 0.127685546875, "learning_rate": 7.178756476683937e-07, "loss": 0.0006, "reward": 1.9986047744750977, "reward_std": 6.084624533286842e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986046850681305, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8264248704663215, "grad_norm": 3.9224318973043206, "kl": 0.047119140625, "learning_rate": 7.176165803108808e-07, "loss": 0.0011, "reward": 2.4999605417251587, "reward_std": 2.0427865820238367e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999603629112244, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8290155440414506, "grad_norm": 14.35356201162555, "kl": 0.127685546875, "learning_rate": 7.173575129533679e-07, "loss": 0.0011, "reward": 2.062433958053589, "reward_std": 0.1767889433590426, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624338388442993, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.83160621761658, "grad_norm": 23.881574781712416, "kl": 0.097900390625, "learning_rate": 7.170984455958548e-07, "loss": -0.0003, "reward": 2.4998987913131714, "reward_std": 5.793695368083718e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998989701271057, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.8341968911917097, "grad_norm": 3.285440237402109, "kl": 0.1322021484375, "learning_rate": 7.168393782383419e-07, "loss": 0.001, "reward": 2.4999254941940308, "reward_std": 1.3155834608369332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999925434589386, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8367875647668392, "grad_norm": 37.47873630443259, "kl": 0.0428466796875, "learning_rate": 7.165803108808289e-07, "loss": 0.0005, "reward": 2.3749401569366455, "reward_std": 0.23155948038015595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749401569366455, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.839378238341969, "grad_norm": 1.8826301432638974, "kl": 0.087646484375, "learning_rate": 7.16321243523316e-07, "loss": 0.0008, "reward": 2.4999784231185913, "reward_std": 8.522820678535936e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999783635139465, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 2.8419689119170983, "grad_norm": 3.886173789357406, "kl": 0.0693359375, "learning_rate": 7.160621761658031e-07, "loss": -0.0003, "reward": 2.4999921321868896, "reward_std": 1.2925234614158398e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.844559585492228, "grad_norm": 17.612798734871337, "kl": 0.5328369140625, "learning_rate": 7.158031088082901e-07, "loss": 0.0016, "reward": 1.8112998008728027, "reward_std": 0.0010947110818051442, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3112997114658356, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8471502590673574, "grad_norm": 2.0701117441853785, "kl": 0.0888671875, "learning_rate": 7.155440414507772e-07, "loss": -0.0001, "reward": 2.4999868869781494, "reward_std": 7.361585630860645e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868869781494, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.849740932642487, "grad_norm": 0.8399703029727895, "kl": 0.15625, "learning_rate": 7.152849740932642e-07, "loss": 0.0008, "reward": 1.4999985694885254, "reward_std": 1.5715099834778812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999985694885254, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.8523316062176165, "grad_norm": 393.0112823584629, "kl": 0.097412109375, "learning_rate": 7.150259067357512e-07, "loss": 0.0004, "reward": 1.8534001111984253, "reward_std": 0.17722581850830466, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3533999919891357, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 2.854922279792746, "grad_norm": 0.22681909035896552, "kl": 0.1025390625, "learning_rate": 7.147668393782383e-07, "loss": 0.0007, "reward": 2.499991774559021, "reward_std": 3.7977329157001805e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.8575129533678756, "grad_norm": 1.8079716761954987, "kl": 0.0980224609375, "learning_rate": 7.145077720207253e-07, "loss": 0.0001, "reward": 2.4999680519104004, "reward_std": 9.890277510749002e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999680519104004, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 36.75, "epoch": 2.860103626943005, "grad_norm": 61.80590614410551, "kl": 0.12744140625, "learning_rate": 7.142487046632124e-07, "loss": 0.0008, "reward": 2.436125159263611, "reward_std": 0.18065080092264907, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.936125099658966, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8626943005181347, "grad_norm": 4.947483923829126, "kl": 0.0557861328125, "learning_rate": 7.139896373056995e-07, "loss": 0.0005, "reward": 2.499988555908203, "reward_std": 9.842450737096442e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884366989136, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 36.625, "epoch": 2.865284974093264, "grad_norm": 0.10531010835480184, "kl": 0.10595703125, "learning_rate": 7.137305699481864e-07, "loss": -0.0002, "reward": 2.4999979734420776, "reward_std": 1.3254145017072005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8678756476683938, "grad_norm": 7.65941754821093, "kl": 0.1083984375, "learning_rate": 7.134715025906735e-07, "loss": 0.0003, "reward": 2.499955654144287, "reward_std": 4.449913922144333e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999558329582214, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8704663212435233, "grad_norm": 4.537239674472093, "kl": 0.1224365234375, "learning_rate": 7.132124352331605e-07, "loss": 0.0004, "reward": 2.4999425411224365, "reward_std": 1.7167034684462124e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999942421913147, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.873056994818653, "grad_norm": 7.395477812843716, "kl": 0.06640625, "learning_rate": 7.129533678756477e-07, "loss": -0.0004, "reward": 1.9899404644966125, "reward_std": 0.0002875989903259324, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.489940494298935, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 2.8756476683937824, "grad_norm": 1.9947429179408407, "kl": 0.099609375, "learning_rate": 7.126943005181348e-07, "loss": 0.0003, "reward": 2.499986410140991, "reward_std": 1.3293567462824285e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999862313270569, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.878238341968912, "grad_norm": 2.0896604629479065, "kl": 0.0360107421875, "learning_rate": 7.124352331606218e-07, "loss": 0.0004, "reward": 2.499975800514221, "reward_std": 1.1701359653670806e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999758005142212, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 2.8808290155440415, "grad_norm": 2.7023212850670397, "kl": 0.103515625, "learning_rate": 7.121761658031088e-07, "loss": 0.001, "reward": 2.499971866607666, "reward_std": 1.0769423965939495e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999716877937317, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.883419689119171, "grad_norm": 1.015715283594274, "kl": 0.080810546875, "learning_rate": 7.119170984455958e-07, "loss": 0.0, "reward": 2.499992847442627, "reward_std": 4.163142534707731e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8860103626943006, "grad_norm": 1.6015076756637872, "kl": 0.084228515625, "learning_rate": 7.116580310880829e-07, "loss": 0.0011, "reward": 1.9935968518257141, "reward_std": 4.3514764911378734e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4935966432094574, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.88860103626943, "grad_norm": 5.827115353340306, "kl": 0.080322265625, "learning_rate": 7.1139896373057e-07, "loss": 0.0006, "reward": 2.4999715089797974, "reward_std": 1.682233687461121e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715089797974, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 2.8911917098445596, "grad_norm": 0.818833821270414, "kl": 0.055908203125, "learning_rate": 7.11139896373057e-07, "loss": -0.0001, "reward": 2.49999463558197, "reward_std": 4.764110258292931e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 37.125, "epoch": 2.893782383419689, "grad_norm": 0.6186789778257421, "kl": 0.0771484375, "learning_rate": 7.108808290155441e-07, "loss": 0.0004, "reward": 2.4999711513519287, "reward_std": 5.8186195133203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999711513519287, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.8963730569948187, "grad_norm": 0.10584215002423136, "kl": 0.0518798828125, "learning_rate": 7.10621761658031e-07, "loss": -0.001, "reward": 2.499988555908203, "reward_std": 1.755187923890844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.8989637305699483, "grad_norm": 0.5522414956955325, "kl": 0.10400390625, "learning_rate": 7.103626943005181e-07, "loss": 0.0003, "reward": 2.4999914169311523, "reward_std": 5.139297570622148e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.901554404145078, "grad_norm": 2.622176048442459, "kl": 0.0711669921875, "learning_rate": 7.101036269430052e-07, "loss": 0.0009, "reward": 2.4999932050704956, "reward_std": 6.585877372344839e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 2.9041450777202074, "grad_norm": 0.36202591021173663, "kl": 0.0657958984375, "learning_rate": 7.098445595854922e-07, "loss": -0.0003, "reward": 2.499996066093445, "reward_std": 1.8297714063919557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 2.906735751295337, "grad_norm": 0.3989314323177718, "kl": 0.15869140625, "learning_rate": 7.095854922279793e-07, "loss": 0.0005, "reward": 2.4999897480010986, "reward_std": 9.90946682577487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.9093264248704664, "grad_norm": 3.98828984217289, "kl": 0.107177734375, "learning_rate": 7.093264248704664e-07, "loss": 0.0001, "reward": 1.8213641047477722, "reward_std": 0.00019163135178246193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3213641047477722, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.911917098445596, "grad_norm": 0.38372592988781673, "kl": 0.0947265625, "learning_rate": 7.090673575129533e-07, "loss": 0.0002, "reward": 2.4999934434890747, "reward_std": 6.14368312312763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 37.625, "epoch": 2.9145077720207255, "grad_norm": 2.185680743780335, "kl": 0.052490234375, "learning_rate": 7.088082901554404e-07, "loss": 0.0003, "reward": 1.9998529553413391, "reward_std": 2.4648191583764856e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998528957366943, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.917098445595855, "grad_norm": 0.4143511714457312, "kl": 0.07373046875, "learning_rate": 7.085492227979274e-07, "loss": -0.0007, "reward": 2.4999865293502808, "reward_std": 4.4882543761559646e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999865889549255, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 2.9196891191709846, "grad_norm": 2.119040985586719, "kl": 0.067138671875, "learning_rate": 7.082901554404145e-07, "loss": -0.0007, "reward": 1.9980251789093018, "reward_std": 4.27513819545311e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4980253875255585, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9222797927461137, "grad_norm": 3.2498574020963535, "kl": 0.098388671875, "learning_rate": 7.080310880829016e-07, "loss": 0.0001, "reward": 1.999819815158844, "reward_std": 1.0583228686300572e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499819815158844, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.9248704663212433, "grad_norm": 7.285872759741117, "kl": 0.07958984375, "learning_rate": 7.077720207253886e-07, "loss": 0.0003, "reward": 1.9943678379058838, "reward_std": 9.15516066015698e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4943679571151733, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 2.927461139896373, "grad_norm": 1.0516048493387564, "kl": 0.175048828125, "learning_rate": 7.075129533678756e-07, "loss": 0.0011, "reward": 2.4999934434890747, "reward_std": 5.073481815998093e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9300518134715023, "grad_norm": 3.314155223630398, "kl": 0.076904296875, "learning_rate": 7.072538860103626e-07, "loss": 0.001, "reward": 2.4999561309814453, "reward_std": 1.9229667486797553e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999558329582214, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.932642487046632, "grad_norm": 7.451044311709573, "kl": 0.108642578125, "learning_rate": 7.069948186528497e-07, "loss": -0.0002, "reward": 1.9986302852630615, "reward_std": 5.2891948143951595e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986303448677063, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9352331606217614, "grad_norm": 1.355923951411975, "kl": 0.128173828125, "learning_rate": 7.067357512953368e-07, "loss": -0.0001, "reward": 2.4999951124191284, "reward_std": 3.3475552640993556e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.937823834196891, "grad_norm": 1.4880376971698004, "kl": 0.093994140625, "learning_rate": 7.064766839378238e-07, "loss": -0.0001, "reward": 2.4999552965164185, "reward_std": 1.990783744076907e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999955415725708, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 2.9404145077720205, "grad_norm": 10.711080101650754, "kl": 0.144287109375, "learning_rate": 7.062176165803109e-07, "loss": 0.0011, "reward": 1.9870364665985107, "reward_std": 0.00018690419165068306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4870363473892212, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.94300518134715, "grad_norm": 7.999550640622812, "kl": 0.1640625, "learning_rate": 7.059585492227978e-07, "loss": 0.0008, "reward": 1.998693585395813, "reward_std": 0.00031646367085613747, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498693585395813, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.9455958549222796, "grad_norm": 30.720288921915085, "kl": 0.115234375, "learning_rate": 7.056994818652849e-07, "loss": 0.001, "reward": 2.4999574422836304, "reward_std": 1.8955228370032273e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999575018882751, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.948186528497409, "grad_norm": 13.278939540873479, "kl": 0.094482421875, "learning_rate": 7.05440414507772e-07, "loss": -0.0001, "reward": 2.499975562095642, "reward_std": 8.365519761355245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975562095642, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 2.9507772020725387, "grad_norm": 16.637544014751317, "kl": 0.154052734375, "learning_rate": 7.05181347150259e-07, "loss": 0.0012, "reward": 2.187487483024597, "reward_std": 0.5786307236259063, "rewards/format_reward_rec": 0.875, "rewards/point_reward": 1.7499874234199524, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9533678756476682, "grad_norm": 0.16456697144892732, "kl": 0.0428466796875, "learning_rate": 7.049222797927461e-07, "loss": -0.0003, "reward": 2.499994993209839, "reward_std": 3.0310380907394574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9559585492227978, "grad_norm": 0.11590364317646457, "kl": 0.0537109375, "learning_rate": 7.046632124352331e-07, "loss": -0.0005, "reward": 2.499987244606018, "reward_std": 3.5620584526441235e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873638153076, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.9585492227979273, "grad_norm": 1.5830693480236082, "kl": 0.2109375, "learning_rate": 7.044041450777201e-07, "loss": 0.0008, "reward": 1.9981689453125, "reward_std": 3.835231962057151e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981690645217896, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 36.6875, "epoch": 2.961139896373057, "grad_norm": 3.557067469499059, "kl": 0.134033203125, "learning_rate": 7.041450777202072e-07, "loss": 0.0008, "reward": 2.4373693466186523, "reward_std": 0.17683035418724558, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373692870140076, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9637305699481864, "grad_norm": 0.2430091628343801, "kl": 0.09521484375, "learning_rate": 7.038860103626942e-07, "loss": 0.0001, "reward": 2.4999964237213135, "reward_std": 3.0747958135179942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.966321243523316, "grad_norm": 2.66400394096119, "kl": 0.09228515625, "learning_rate": 7.036269430051813e-07, "loss": 0.0002, "reward": 1.999937891960144, "reward_std": 1.1545933375600725e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999380707740784, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 38.5625, "epoch": 2.9689119170984455, "grad_norm": 0.2939389772846676, "kl": 0.109130859375, "learning_rate": 7.033678756476683e-07, "loss": -0.0, "reward": 2.499996066093445, "reward_std": 2.869964930596325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.971502590673575, "grad_norm": 0.22551784824130083, "kl": 0.104736328125, "learning_rate": 7.031088082901554e-07, "loss": 0.0001, "reward": 1.999951183795929, "reward_std": 6.387927214746014e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499951183795929, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9740932642487046, "grad_norm": 5.683038747313902, "kl": 0.111328125, "learning_rate": 7.028497409326424e-07, "loss": 0.0007, "reward": 2.4999536275863647, "reward_std": 2.307615199015345e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999537467956543, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.976683937823834, "grad_norm": 45.20046738492338, "kl": 0.0643310546875, "learning_rate": 7.025906735751294e-07, "loss": -0.0, "reward": 2.3124247789382935, "reward_std": 0.25882632569573616, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124247193336487, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 37.875, "epoch": 2.9792746113989637, "grad_norm": 0.3992135079450244, "kl": 0.16357421875, "learning_rate": 7.023316062176165e-07, "loss": 0.0014, "reward": 2.4999853372573853, "reward_std": 3.1528557542515045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852180480957, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.981865284974093, "grad_norm": 1.7259120769206473, "kl": 0.012054443359375, "learning_rate": 7.020725388601037e-07, "loss": 0.0002, "reward": 2.0624573826789856, "reward_std": 0.17678039967643144, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624573826789856, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 2.9844559585492227, "grad_norm": 8.447848092326048, "kl": 0.04034423828125, "learning_rate": 7.018134715025907e-07, "loss": 0.0008, "reward": 1.9986690878868103, "reward_std": 8.416088712692726e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986690282821655, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.9870466321243523, "grad_norm": 1.2972822025214075, "kl": 0.09423828125, "learning_rate": 7.015544041450778e-07, "loss": 0.0008, "reward": 2.499926447868347, "reward_std": 1.1386057508389058e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999262690544128, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.989637305699482, "grad_norm": 2.29611919032345, "kl": 0.109130859375, "learning_rate": 7.012953367875647e-07, "loss": 0.0014, "reward": 1.9995307922363281, "reward_std": 1.694477805358474e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499530702829361, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 2.9922279792746114, "grad_norm": 0.11824373433337411, "kl": 0.0452880859375, "learning_rate": 7.010362694300518e-07, "loss": 0.0012, "reward": 2.4999964237213135, "reward_std": 2.057241715647251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 2.994818652849741, "grad_norm": 42.33963609536471, "kl": 0.1304931640625, "learning_rate": 7.007772020725389e-07, "loss": 0.0005, "reward": 1.7886215448379517, "reward_std": 0.003570433329059597, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2886216938495636, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 2.9974093264248705, "grad_norm": 1.7166453701474669, "kl": 0.07421875, "learning_rate": 7.005181347150259e-07, "loss": 0.0007, "reward": 2.4999858140945435, "reward_std": 1.2763360246026423e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858736991882, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0, "grad_norm": 2.719556150027397, "kl": 0.05584716796875, "learning_rate": 7.00259067357513e-07, "loss": -0.0004, "reward": 2.4999583959579468, "reward_std": 1.1173382517881691e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999584555625916, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0025906735751295, "grad_norm": 2.461447509058338, "kl": 0.105712890625, "learning_rate": 7e-07, "loss": 0.0006, "reward": 2.4999126195907593, "reward_std": 3.372174069227185e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999912679195404, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.005181347150259, "grad_norm": 0.7294775043861922, "kl": 0.127197265625, "learning_rate": 6.99740932642487e-07, "loss": 0.0018, "reward": 1.9999293088912964, "reward_std": 1.0251685353068751e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999292492866516, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0077720207253886, "grad_norm": 6.939097905563522, "kl": 1.5831298828125, "learning_rate": 6.994818652849741e-07, "loss": 0.0073, "reward": 2.499998450279236, "reward_std": 2.793906077158681e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 37.6875, "epoch": 3.010362694300518, "grad_norm": 3.3560175791018163, "kl": 0.109619140625, "learning_rate": 6.992227979274611e-07, "loss": -0.0009, "reward": 2.499979019165039, "reward_std": 1.4151022583064332e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791979789734, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0129533678756477, "grad_norm": 0.2946434730238508, "kl": 0.0775146484375, "learning_rate": 6.989637305699482e-07, "loss": 0.0005, "reward": 2.4999979734420776, "reward_std": 2.1617928496198147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0155440414507773, "grad_norm": 0.10396015666718733, "kl": 0.121337890625, "learning_rate": 6.987046632124352e-07, "loss": 0.0009, "reward": 2.49999737739563, "reward_std": 1.02774143329043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.018134715025907, "grad_norm": 8.466246391543336, "kl": 0.10302734375, "learning_rate": 6.984455958549223e-07, "loss": 0.0002, "reward": 2.4999542236328125, "reward_std": 3.13268728859839e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999954342842102, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0207253886010363, "grad_norm": 2.0671689748537996, "kl": 0.0640869140625, "learning_rate": 6.981865284974093e-07, "loss": 0.0005, "reward": 2.499988555908203, "reward_std": 1.2248337952769361e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884963035583, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.023316062176166, "grad_norm": 0.1466281160981326, "kl": 0.12939453125, "learning_rate": 6.979274611398963e-07, "loss": 0.0007, "reward": 2.4999983310699463, "reward_std": 1.1799395736034057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0259067357512954, "grad_norm": 0.48679038895125865, "kl": 0.10693359375, "learning_rate": 6.976683937823834e-07, "loss": -0.0004, "reward": 2.499994397163391, "reward_std": 3.5975109540231642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.028497409326425, "grad_norm": 13.28376293936007, "kl": 0.0621337890625, "learning_rate": 6.974093264248704e-07, "loss": 0.0013, "reward": 2.499867081642151, "reward_std": 5.790720570075791e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999867022037506, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0310880829015545, "grad_norm": 0.9175071597383333, "kl": 0.1328125, "learning_rate": 6.971502590673575e-07, "loss": -0.0, "reward": 2.4999834299087524, "reward_std": 9.043923000717768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999833703041077, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 40.5625, "epoch": 3.033678756476684, "grad_norm": 0.142921173334334, "kl": 0.110107421875, "learning_rate": 6.968911917098446e-07, "loss": 0.0004, "reward": 2.4999974966049194, "reward_std": 1.8433906348036544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 63.0, "epoch": 3.0362694300518136, "grad_norm": 6.578932104147027, "kl": 0.05902099609375, "learning_rate": 6.966321243523315e-07, "loss": -0.0003, "reward": 2.499991536140442, "reward_std": 1.2175861229479779e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.038860103626943, "grad_norm": 12.812888662519878, "kl": 0.191650390625, "learning_rate": 6.963730569948186e-07, "loss": 0.0007, "reward": 1.937165081501007, "reward_std": 0.17684286828080076, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4371649622917175, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0414507772020727, "grad_norm": 0.36017149309125857, "kl": 0.09619140625, "learning_rate": 6.961139896373057e-07, "loss": 0.0015, "reward": 2.499983310699463, "reward_std": 6.094512514209782e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999832510948181, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.0440414507772022, "grad_norm": 50.22883560752347, "kl": 0.061767578125, "learning_rate": 6.958549222797927e-07, "loss": 0.0001, "reward": 1.9565237760543823, "reward_std": 0.00029459824872901663, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4565239548683167, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0466321243523318, "grad_norm": 0.5827171756534197, "kl": 0.07305908203125, "learning_rate": 6.955958549222798e-07, "loss": -0.0003, "reward": 2.4999759197235107, "reward_std": 5.402351973771147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999760389328003, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 37.875, "epoch": 3.0492227979274613, "grad_norm": 0.060686410556682266, "kl": 0.12158203125, "learning_rate": 6.953367875647668e-07, "loss": 0.0005, "reward": 2.4999982118606567, "reward_std": 9.764374624410266e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.051813471502591, "grad_norm": 2.0030176106359803, "kl": 0.060791015625, "learning_rate": 6.950777202072538e-07, "loss": 0.0008, "reward": 2.4999942779541016, "reward_std": 4.463275558919122e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0544041450777204, "grad_norm": 0.07562834244955724, "kl": 0.120361328125, "learning_rate": 6.948186528497409e-07, "loss": 0.0008, "reward": 2.499996781349182, "reward_std": 1.477285877626855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.05699481865285, "grad_norm": 0.26056404492182367, "kl": 0.060546875, "learning_rate": 6.945595854922279e-07, "loss": -0.0, "reward": 2.499991774559021, "reward_std": 2.9540235573222162e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.0595854922279795, "grad_norm": 5.351097814490174, "kl": 0.08642578125, "learning_rate": 6.94300518134715e-07, "loss": 0.0008, "reward": 1.9945697784423828, "reward_std": 9.820161403695238e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945697784423828, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.062176165803109, "grad_norm": 19.61084523204792, "kl": 0.0947265625, "learning_rate": 6.94041450777202e-07, "loss": -0.0002, "reward": 1.9998689889907837, "reward_std": 6.372265306708869e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998691082000732, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.064766839378238, "grad_norm": 0.09839036954550555, "kl": 0.12255859375, "learning_rate": 6.937823834196891e-07, "loss": 0.0003, "reward": 2.499997854232788, "reward_std": 2.0497682839959452e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.0673575129533677, "grad_norm": 0.2827675125819973, "kl": 0.0667724609375, "learning_rate": 6.935233160621761e-07, "loss": 0.0001, "reward": 2.4999977350234985, "reward_std": 2.220439370148597e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.069948186528497, "grad_norm": 1.6399465495742187, "kl": 0.12841796875, "learning_rate": 6.932642487046631e-07, "loss": 0.0018, "reward": 2.499987840652466, "reward_std": 9.988759757106891e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.0725388601036268, "grad_norm": 48.967278697379086, "kl": 0.15576171875, "learning_rate": 6.930051813471502e-07, "loss": 0.0006, "reward": 2.373937487602234, "reward_std": 0.35653577744960785, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8739373683929443, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.0751295336787563, "grad_norm": 1.435609881582776, "kl": 0.0677490234375, "learning_rate": 6.927461139896372e-07, "loss": 0.0004, "reward": 2.499990701675415, "reward_std": 7.0516755386051955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 3.077720207253886, "grad_norm": 3.5101981361178316, "kl": 0.05426025390625, "learning_rate": 6.924870466321243e-07, "loss": 0.0013, "reward": 2.4999924898147583, "reward_std": 9.763617526914459e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.0803108808290154, "grad_norm": 1.6765971214628324, "kl": 0.096435546875, "learning_rate": 6.922279792746114e-07, "loss": 0.0005, "reward": 2.499983787536621, "reward_std": 1.0850691978703253e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999836087226868, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 3.082901554404145, "grad_norm": 5.590736851210506, "kl": 0.15478515625, "learning_rate": 6.919689119170983e-07, "loss": 0.0003, "reward": 2.499976396560669, "reward_std": 6.402083045031759e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999765157699585, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 3.0854922279792745, "grad_norm": 33.17775694408754, "kl": 0.226806640625, "learning_rate": 6.917098445595854e-07, "loss": 0.0013, "reward": 1.9959203600883484, "reward_std": 0.00019277128740213811, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4959202110767365, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.088082901554404, "grad_norm": 23.896802680616943, "kl": 0.1123046875, "learning_rate": 6.914507772020724e-07, "loss": 0.0003, "reward": 2.2496062517166138, "reward_std": 0.26767816981418946, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7496063113212585, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0906735751295336, "grad_norm": 9.660661317212083, "kl": 0.07305908203125, "learning_rate": 6.911917098445595e-07, "loss": -0.0001, "reward": 1.9984159469604492, "reward_std": 0.00019665755337427981, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984160661697388, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.093264248704663, "grad_norm": 20.022268497160617, "kl": 0.1494140625, "learning_rate": 6.909326424870467e-07, "loss": 0.0, "reward": 2.4374847412109375, "reward_std": 0.1768051714203409, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374848008155823, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.0958549222797926, "grad_norm": 36.23354749725851, "kl": 0.16064453125, "learning_rate": 6.906735751295337e-07, "loss": 0.0004, "reward": 2.124864637851715, "reward_std": 0.23152516983157057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6248646974563599, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.098445595854922, "grad_norm": 1.3195107073614705, "kl": 0.128173828125, "learning_rate": 6.904145077720207e-07, "loss": -0.0002, "reward": 2.4999265670776367, "reward_std": 1.4375779528563726e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999266862869263, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1010362694300517, "grad_norm": 0.21297203299170864, "kl": 0.066162109375, "learning_rate": 6.901554404145078e-07, "loss": -0.001, "reward": 2.499997854232788, "reward_std": 2.3950478293954802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1036269430051813, "grad_norm": 1.1951467709965204, "kl": 0.0443115234375, "learning_rate": 6.898963730569948e-07, "loss": 0.0, "reward": 2.499993085861206, "reward_std": 2.254762023312651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 37.0625, "epoch": 3.106217616580311, "grad_norm": 89.7117428287973, "kl": 0.17041015625, "learning_rate": 6.896373056994819e-07, "loss": -0.0, "reward": 2.200614869594574, "reward_std": 0.41318650986545435, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7006149291992188, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1088082901554404, "grad_norm": 2.9577556315338525, "kl": 0.0584716796875, "learning_rate": 6.893782383419689e-07, "loss": 0.0004, "reward": 1.8219398856163025, "reward_std": 0.00021504092509871953, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3219397366046906, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.11139896373057, "grad_norm": 1.8660163641473746, "kl": 0.08935546875, "learning_rate": 6.89119170984456e-07, "loss": 0.0005, "reward": 2.499987483024597, "reward_std": 9.607229856101185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987542629242, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1139896373056994, "grad_norm": 0.23309003298542247, "kl": 0.2867431640625, "learning_rate": 6.888601036269431e-07, "loss": 0.0008, "reward": 2.4999899864196777, "reward_std": 6.076309091440635e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.116580310880829, "grad_norm": 20.242096699779367, "kl": 0.1484375, "learning_rate": 6.8860103626943e-07, "loss": 0.0, "reward": 2.4366928339004517, "reward_std": 0.17895166123366835, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9366929531097412, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1191709844559585, "grad_norm": 0.41863435926551673, "kl": 0.076416015625, "learning_rate": 6.883419689119171e-07, "loss": 0.0007, "reward": 2.499996542930603, "reward_std": 3.980623432653374e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.121761658031088, "grad_norm": 0.057189913104332614, "kl": 0.19970703125, "learning_rate": 6.880829015544041e-07, "loss": 0.0006, "reward": 2.499997615814209, "reward_std": 1.7178826396957447e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 3.1243523316062176, "grad_norm": 2.7685707075740504, "kl": 0.08984375, "learning_rate": 6.878238341968912e-07, "loss": 0.0001, "reward": 2.499969720840454, "reward_std": 1.1449779435679375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999697804450989, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.126943005181347, "grad_norm": 2.7955752348066656, "kl": 0.0433349609375, "learning_rate": 6.875647668393783e-07, "loss": 0.0006, "reward": 2.499983787536621, "reward_std": 2.1352165845200943e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1295336787564767, "grad_norm": 3.904897597004643, "kl": 0.099853515625, "learning_rate": 6.873056994818652e-07, "loss": 0.0005, "reward": 1.9944968819618225, "reward_std": 9.394819312547043e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4944968819618225, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1321243523316062, "grad_norm": 0.993464348756233, "kl": 0.165283203125, "learning_rate": 6.870466321243523e-07, "loss": 0.0013, "reward": 2.499992251396179, "reward_std": 7.040971325977807e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.134715025906736, "grad_norm": 8.493647101872822, "kl": 0.059326171875, "learning_rate": 6.867875647668393e-07, "loss": 0.0001, "reward": 2.4999669790267944, "reward_std": 4.88402728251458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999668598175049, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1373056994818653, "grad_norm": 0.6252435900174612, "kl": 0.0283050537109375, "learning_rate": 6.865284974093264e-07, "loss": 0.0003, "reward": 2.4999947547912598, "reward_std": 5.760134399679373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 38.9375, "epoch": 3.139896373056995, "grad_norm": 317.3920017848008, "kl": 0.1669921875, "learning_rate": 6.862694300518135e-07, "loss": 0.0012, "reward": 1.8101842999458313, "reward_std": 0.1777368365546863, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3101842999458313, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1424870466321244, "grad_norm": 0.38957971342570114, "kl": 0.0938720703125, "learning_rate": 6.860103626943005e-07, "loss": 0.0005, "reward": 2.499996304512024, "reward_std": 3.5537441931410285e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.145077720207254, "grad_norm": 0.12349266871983197, "kl": 0.153076171875, "learning_rate": 6.857512953367876e-07, "loss": -0.0007, "reward": 2.499991774559021, "reward_std": 2.604897190394695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918937683105, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1476683937823835, "grad_norm": 32.33004040627252, "kl": 0.165283203125, "learning_rate": 6.854922279792745e-07, "loss": 0.0005, "reward": 1.9112027287483215, "reward_std": 0.000350074620655505, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4112027287483215, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 51.9375, "epoch": 3.150259067357513, "grad_norm": 113.55662888114172, "kl": 2.042724609375, "learning_rate": 6.852331606217616e-07, "loss": 0.0079, "reward": 2.0620912313461304, "reward_std": 0.17689373933910701, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.56209135055542, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1528497409326426, "grad_norm": 8.61897752246694, "kl": 0.17431640625, "learning_rate": 6.849740932642487e-07, "loss": 0.0008, "reward": 1.9794423580169678, "reward_std": 6.9976295435481e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4794422388076782, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.155440414507772, "grad_norm": 27.69755342708996, "kl": 0.0528564453125, "learning_rate": 6.847150259067357e-07, "loss": -0.0002, "reward": 2.499966859817505, "reward_std": 1.4045354475911154e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999668598175049, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.1580310880829017, "grad_norm": 6.307206061699156, "kl": 0.0758056640625, "learning_rate": 6.844559585492228e-07, "loss": 0.0005, "reward": 1.8879817724227905, "reward_std": 0.0005463095068307666, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3879818320274353, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 34.375, "epoch": 3.160621761658031, "grad_norm": 10.145145819493534, "kl": 0.0535888671875, "learning_rate": 6.841968911917099e-07, "loss": -0.0004, "reward": 2.437488317489624, "reward_std": 0.17678574352271426, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937488317489624, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1632124352331608, "grad_norm": 0.06347202174406436, "kl": 0.153076171875, "learning_rate": 6.839378238341968e-07, "loss": 0.0, "reward": 2.4999977350234985, "reward_std": 1.2728193041766644e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1658031088082903, "grad_norm": 25.926481283907055, "kl": 0.1492919921875, "learning_rate": 6.836787564766839e-07, "loss": 0.0006, "reward": 1.997806191444397, "reward_std": 0.00022658649299955869, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978063106536865, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.16839378238342, "grad_norm": 4.337730682658184, "kl": 0.16357421875, "learning_rate": 6.834196891191709e-07, "loss": 0.0005, "reward": 1.884786069393158, "reward_std": 0.0002273179107987744, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3847861886024475, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1709844559585494, "grad_norm": 1.2906471182596082, "kl": 0.099853515625, "learning_rate": 6.83160621761658e-07, "loss": 0.001, "reward": 2.4999938011169434, "reward_std": 2.929047468569479e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.173575129533679, "grad_norm": 12.228908208545143, "kl": 0.0640869140625, "learning_rate": 6.829015544041451e-07, "loss": 0.0003, "reward": 2.4374582767486572, "reward_std": 0.17681060468567011, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374581575393677, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1761658031088085, "grad_norm": 0.24394420433355046, "kl": 0.0789794921875, "learning_rate": 6.826424870466321e-07, "loss": 0.0003, "reward": 2.499994397163391, "reward_std": 2.9986239269419457e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 41.8125, "epoch": 3.178756476683938, "grad_norm": 34.13531818528824, "kl": 0.1904296875, "learning_rate": 6.823834196891191e-07, "loss": 0.0004, "reward": 1.9996867179870605, "reward_std": 0.00010410162849439075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49968683719635, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1813471502590676, "grad_norm": 2.9811588172750163, "kl": 0.0496826171875, "learning_rate": 6.821243523316061e-07, "loss": -0.0006, "reward": 2.499964952468872, "reward_std": 2.087541042783414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999651908874512, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.1839378238341967, "grad_norm": 2.2456746614650687, "kl": 0.1041259765625, "learning_rate": 6.818652849740932e-07, "loss": 0.0006, "reward": 1.676464319229126, "reward_std": 0.0003018202780822321, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.176464319229126, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.186528497409326, "grad_norm": 2.3095544817287856, "kl": 0.14892578125, "learning_rate": 6.816062176165803e-07, "loss": 0.0006, "reward": 2.4999759197235107, "reward_std": 1.3347123967832886e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975860118866, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 3.1891191709844557, "grad_norm": 162.07904755221753, "kl": 0.13055419921875, "learning_rate": 6.813471502590673e-07, "loss": -0.0004, "reward": 1.5624719858169556, "reward_std": 0.17670097788311523, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0624721124768257, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1917098445595853, "grad_norm": 9.30954659856793, "kl": 0.123779296875, "learning_rate": 6.810880829015544e-07, "loss": 0.0007, "reward": 1.4995166063308716, "reward_std": 0.00010432247881908552, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9995165765285492, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 38.625, "epoch": 3.194300518134715, "grad_norm": 1.7660116853260925, "kl": 0.06201171875, "learning_rate": 6.808290155440413e-07, "loss": 0.0004, "reward": 2.499979853630066, "reward_std": 8.384925592963555e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999799132347107, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.1968911917098444, "grad_norm": 2.372396535595751, "kl": 0.081298828125, "learning_rate": 6.805699481865284e-07, "loss": -0.0002, "reward": 2.4999783039093018, "reward_std": 9.258635657261038e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999783635139465, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.199481865284974, "grad_norm": 1.3915662872923094, "kl": 0.1708984375, "learning_rate": 6.803108808290155e-07, "loss": 0.0012, "reward": 2.499990463256836, "reward_std": 8.627333045296837e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2020725388601035, "grad_norm": 39.91250914085111, "kl": 0.17626953125, "learning_rate": 6.800518134715025e-07, "loss": 0.0014, "reward": 1.9900028705596924, "reward_std": 0.001552803657716595, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49000284075737, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 61.1875, "epoch": 3.204663212435233, "grad_norm": 8.5603241823098, "kl": 0.0899658203125, "learning_rate": 6.797927461139897e-07, "loss": -0.0003, "reward": 1.999298870563507, "reward_std": 2.433328745610197e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992989599704742, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 38.375, "epoch": 3.2072538860103625, "grad_norm": 9.999270429566982, "kl": 0.088623046875, "learning_rate": 6.795336787564767e-07, "loss": 0.0002, "reward": 2.4999170303344727, "reward_std": 3.644930689006287e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999170899391174, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.209844559585492, "grad_norm": 27.121171658755856, "kl": 0.173828125, "learning_rate": 6.792746113989637e-07, "loss": -0.0001, "reward": 1.989274024963379, "reward_std": 0.0007453930279552878, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4892742335796356, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2124352331606216, "grad_norm": 35.98969466684873, "kl": 0.096923828125, "learning_rate": 6.790155440414508e-07, "loss": 0.0004, "reward": 2.1873241662979126, "reward_std": 0.4441937953233719, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6873242259025574, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 3.215025906735751, "grad_norm": 122.08437159855893, "kl": 0.13720703125, "learning_rate": 6.787564766839378e-07, "loss": 0.0005, "reward": 1.4928288459777832, "reward_std": 0.40695127844810486, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.992828905582428, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 3.2176165803108807, "grad_norm": 1.3679342838333743, "kl": 0.0799560546875, "learning_rate": 6.784974093264249e-07, "loss": 0.0003, "reward": 2.4999719858169556, "reward_std": 1.0691671377571765e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999721050262451, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2202072538860103, "grad_norm": 0.4488314279506791, "kl": 0.08984375, "learning_rate": 6.782383419689119e-07, "loss": -0.0001, "reward": 2.4999940395355225, "reward_std": 2.5145229187728546e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.22279792746114, "grad_norm": 2.5291469167689846, "kl": 0.084716796875, "learning_rate": 6.77979274611399e-07, "loss": 0.0011, "reward": 2.499972105026245, "reward_std": 1.806678596949496e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999719858169556, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2253886010362693, "grad_norm": 17.233082999226742, "kl": 0.06494140625, "learning_rate": 6.77720207253886e-07, "loss": 0.0006, "reward": 2.062438428401947, "reward_std": 0.17678446534057457, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562438428401947, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 3.227979274611399, "grad_norm": 1.4884915339425842, "kl": 0.30322265625, "learning_rate": 6.77461139896373e-07, "loss": 0.0027, "reward": 2.499984622001648, "reward_std": 6.9239305275914376e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845027923584, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2305699481865284, "grad_norm": 12.581191310878998, "kl": 0.02294921875, "learning_rate": 6.772020725388601e-07, "loss": 0.0003, "reward": 2.4998375177383423, "reward_std": 5.313391034178494e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998376965522766, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.233160621761658, "grad_norm": 0.3068037875764142, "kl": 0.086181640625, "learning_rate": 6.769430051813472e-07, "loss": 0.0009, "reward": 2.4999945163726807, "reward_std": 2.68907388090156e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 41.25, "epoch": 3.2357512953367875, "grad_norm": 3.1821558286216294, "kl": 0.1376953125, "learning_rate": 6.766839378238342e-07, "loss": -0.0, "reward": 2.49997615814209, "reward_std": 1.5303936379496008e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999762773513794, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.238341968911917, "grad_norm": 3.397811895284669, "kl": 0.060791015625, "learning_rate": 6.764248704663213e-07, "loss": 0.0007, "reward": 2.499962568283081, "reward_std": 7.783529554217239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999623894691467, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 38.25, "epoch": 3.2409326424870466, "grad_norm": 11.173658309242521, "kl": 0.1494140625, "learning_rate": 6.761658031088082e-07, "loss": 0.0006, "reward": 1.9173847436904907, "reward_std": 0.0005844671488262065, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4173848628997803, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.243523316062176, "grad_norm": 5.7610519775208715, "kl": 0.06182861328125, "learning_rate": 6.759067357512953e-07, "loss": 0.0003, "reward": 1.9917590618133545, "reward_std": 0.00029140177321096417, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4917591214179993, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2461139896373057, "grad_norm": 0.4724315873394368, "kl": 0.0811767578125, "learning_rate": 6.756476683937824e-07, "loss": -0.0004, "reward": 2.499985456466675, "reward_std": 7.1935268124434515e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.2487046632124352, "grad_norm": 61.21206580796467, "kl": 0.0482177734375, "learning_rate": 6.753886010362694e-07, "loss": 0.0001, "reward": 2.499669313430786, "reward_std": 6.080886032577837e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999669075012207, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2512953367875648, "grad_norm": 2.008133429151827, "kl": 0.03057861328125, "learning_rate": 6.751295336787565e-07, "loss": 0.0009, "reward": 2.49998140335083, "reward_std": 1.1428465313656488e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812841415405, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2538860103626943, "grad_norm": 4.028875682841846, "kl": 0.103759765625, "learning_rate": 6.748704663212435e-07, "loss": 0.0016, "reward": 2.4999269247055054, "reward_std": 2.1010632735851686e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999268054962158, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.256476683937824, "grad_norm": 0.04712398895384099, "kl": 0.07666015625, "learning_rate": 6.746113989637305e-07, "loss": -0.0003, "reward": 2.4999988079071045, "reward_std": 1.1210138382011792e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.2590673575129534, "grad_norm": 1.7228698291486717, "kl": 0.12451171875, "learning_rate": 6.743523316062176e-07, "loss": 0.0016, "reward": 2.499993324279785, "reward_std": 6.99621966759878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.261658031088083, "grad_norm": 22.074941276410815, "kl": 0.19775390625, "learning_rate": 6.740932642487046e-07, "loss": 0.0002, "reward": 1.814564824104309, "reward_std": 0.0005942479255054423, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.314564824104309, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 39.3125, "epoch": 3.2642487046632125, "grad_norm": 7.331578703334851, "kl": 0.08349609375, "learning_rate": 6.738341968911917e-07, "loss": -0.0003, "reward": 2.4999489784240723, "reward_std": 2.8987172299821395e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999489784240723, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.266839378238342, "grad_norm": 20.423192947111342, "kl": 0.1845703125, "learning_rate": 6.735751295336787e-07, "loss": 0.0007, "reward": 2.0141189098358154, "reward_std": 0.19632750791606668, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5141189098358154, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.2694300518134716, "grad_norm": 54.64322304846053, "kl": 0.04833984375, "learning_rate": 6.733160621761658e-07, "loss": 0.0003, "reward": 2.374844551086426, "reward_std": 0.23173565308263733, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748445510864258, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.272020725388601, "grad_norm": 1.4942915810445108, "kl": 0.1011962890625, "learning_rate": 6.730569948186528e-07, "loss": 0.0007, "reward": 2.4999072551727295, "reward_std": 1.1966993270107196e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999071955680847, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 46.875, "epoch": 3.2746113989637307, "grad_norm": 2.318037748042974, "kl": 0.11376953125, "learning_rate": 6.727979274611398e-07, "loss": 0.0008, "reward": 2.4999760389328003, "reward_std": 1.2307446013437584e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999759197235107, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.27720207253886, "grad_norm": 6.0714539408583565, "kl": 0.087646484375, "learning_rate": 6.725388601036269e-07, "loss": 0.0003, "reward": 1.2900685667991638, "reward_std": 0.00036068645931663923, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7900685369968414, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2797927461139897, "grad_norm": 0.7398916415551148, "kl": 0.06201171875, "learning_rate": 6.722797927461139e-07, "loss": 0.0005, "reward": 2.499778151512146, "reward_std": 1.051360561632464e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999778151512146, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.2823834196891193, "grad_norm": 0.20887457172346072, "kl": 0.090087890625, "learning_rate": 6.72020725388601e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 1.9571496636672236e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.284974093264249, "grad_norm": 0.7956652247876509, "kl": 0.1220703125, "learning_rate": 6.717616580310881e-07, "loss": 0.0007, "reward": 2.499996066093445, "reward_std": 3.9155648323685455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2875647668393784, "grad_norm": 0.2600159459812739, "kl": 0.090087890625, "learning_rate": 6.71502590673575e-07, "loss": -0.0007, "reward": 2.499996066093445, "reward_std": 2.275126689710305e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.290155440414508, "grad_norm": 1.3801637562359388, "kl": 0.083984375, "learning_rate": 6.712435233160621e-07, "loss": 0.0003, "reward": 2.4999756813049316, "reward_std": 1.0136415312445024e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999756217002869, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.2927461139896375, "grad_norm": 0.21350459680199665, "kl": 0.119140625, "learning_rate": 6.709844559585492e-07, "loss": 0.0008, "reward": 2.4999964237213135, "reward_std": 1.875797465800133e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 42.5625, "epoch": 3.295336787564767, "grad_norm": 15.940153283180809, "kl": 0.3828125, "learning_rate": 6.707253886010362e-07, "loss": 0.001, "reward": 2.3122068643569946, "reward_std": 0.34770820060020924, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8122069835662842, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.2979274611398965, "grad_norm": 3.12265472909459, "kl": 0.2493896484375, "learning_rate": 6.704663212435233e-07, "loss": 0.0008, "reward": 2.4999197721481323, "reward_std": 2.9060335918984492e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999198913574219, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.300518134715026, "grad_norm": 10.684700343980614, "kl": 0.0550537109375, "learning_rate": 6.702072538860103e-07, "loss": 0.0005, "reward": 1.9941173791885376, "reward_std": 0.0001505392161789132, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4941173195838928, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 3.3031088082901556, "grad_norm": 1.6637752649456732, "kl": 0.055419921875, "learning_rate": 6.699481865284973e-07, "loss": 0.0017, "reward": 2.499996542930603, "reward_std": 3.372988203409477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.305699481865285, "grad_norm": 4.31007749526667, "kl": 0.0958251953125, "learning_rate": 6.696891191709844e-07, "loss": 0.0011, "reward": 2.4999359846115112, "reward_std": 2.1850171492587833e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999359846115112, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 37.625, "epoch": 3.3082901554404147, "grad_norm": 0.1893903207167545, "kl": 0.128662109375, "learning_rate": 6.694300518134714e-07, "loss": 0.0011, "reward": 2.499996066093445, "reward_std": 1.9226832250751613e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.3108808290155443, "grad_norm": 1.5598454758263005, "kl": 0.052734375, "learning_rate": 6.691709844559585e-07, "loss": 0.0013, "reward": 2.4999793767929077, "reward_std": 8.379100563615793e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791383743286, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 3.313471502590674, "grad_norm": 23.902880778175927, "kl": 0.100830078125, "learning_rate": 6.689119170984455e-07, "loss": 0.0003, "reward": 1.9973394870758057, "reward_std": 0.0018259540975122945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4973394274711609, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.3160621761658033, "grad_norm": 3.172447446657001, "kl": 0.0584716796875, "learning_rate": 6.686528497409327e-07, "loss": 0.0011, "reward": 2.4999806880950928, "reward_std": 6.82442430388619e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980628490448, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 3.318652849740933, "grad_norm": 65.7431643413431, "kl": 0.06884765625, "learning_rate": 6.683937823834197e-07, "loss": -0.0004, "reward": 1.9731935858726501, "reward_std": 0.004861528554783945, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4731935858726501, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.321243523316062, "grad_norm": 17.836462416607315, "kl": 0.05291748046875, "learning_rate": 6.681347150259067e-07, "loss": 0.0008, "reward": 2.4373984336853027, "reward_std": 0.17694964087786502, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9373984336853027, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.3238341968911915, "grad_norm": 4.904208401300367, "kl": 0.0616455078125, "learning_rate": 6.678756476683938e-07, "loss": -0.0004, "reward": 1.9981617331504822, "reward_std": 4.671673514167196e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498161792755127, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.326424870466321, "grad_norm": 43.73236976346806, "kl": 0.096923828125, "learning_rate": 6.676165803108808e-07, "loss": 0.0006, "reward": 1.9986584186553955, "reward_std": 0.000110683293769398, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986584782600403, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 3.3290155440414506, "grad_norm": 16.989284017364056, "kl": 0.05938720703125, "learning_rate": 6.673575129533679e-07, "loss": 0.0005, "reward": 1.9519646167755127, "reward_std": 0.015825567749971015, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4519644975662231, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.33160621761658, "grad_norm": 16.30015088937492, "kl": 0.1339111328125, "learning_rate": 6.67098445595855e-07, "loss": 0.0005, "reward": 1.4858134388923645, "reward_std": 0.00023081257631929475, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9858134984970093, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.3341968911917097, "grad_norm": 21.128982681534165, "kl": 0.03961181640625, "learning_rate": 6.668393782383419e-07, "loss": 0.0, "reward": 1.8781986832618713, "reward_std": 0.0004783739980211976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3781987130641937, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3367875647668392, "grad_norm": 1.4503175328360276, "kl": 0.07177734375, "learning_rate": 6.66580310880829e-07, "loss": -0.0004, "reward": 2.4999178647994995, "reward_std": 1.4236801575862046e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999917984008789, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.339378238341969, "grad_norm": 78.80329749028347, "kl": 0.121826171875, "learning_rate": 6.66321243523316e-07, "loss": 0.0002, "reward": 2.499940276145935, "reward_std": 1.946031170518836e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999403953552246, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 3.3419689119170983, "grad_norm": 157.3271683150045, "kl": 0.15478515625, "learning_rate": 6.660621761658031e-07, "loss": 0.0009, "reward": 2.0805219411849976, "reward_std": 0.34735794636026185, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5805218815803528, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.344559585492228, "grad_norm": 7.586122553563838, "kl": 0.080078125, "learning_rate": 6.658031088082902e-07, "loss": 0.0006, "reward": 2.0623974800109863, "reward_std": 0.17679739690629503, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562397539615631, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3471502590673574, "grad_norm": 0.31199710615956217, "kl": 0.109375, "learning_rate": 6.655440414507772e-07, "loss": -0.0005, "reward": 2.4999911785125732, "reward_std": 2.3840762821691897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.349740932642487, "grad_norm": 0.050336091549529546, "kl": 0.13232421875, "learning_rate": 6.652849740932642e-07, "loss": 0.0004, "reward": 1.499998927116394, "reward_std": 6.4432907720402e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999988675117493, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3523316062176165, "grad_norm": 28.62123541900629, "kl": 0.10986328125, "learning_rate": 6.650259067357513e-07, "loss": 0.0015, "reward": 1.9998066425323486, "reward_std": 6.801219285534899e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499806433916092, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.354922279792746, "grad_norm": 1.9701712109347833, "kl": 0.09814453125, "learning_rate": 6.647668393782383e-07, "loss": -0.0004, "reward": 2.4999836683273315, "reward_std": 8.612069905211683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983787536621, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3575129533678756, "grad_norm": 1.2098339924315067, "kl": 0.09375, "learning_rate": 6.645077720207254e-07, "loss": 0.0005, "reward": 2.4999914169311523, "reward_std": 7.053330136841396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 3.360103626943005, "grad_norm": 0.621152549433018, "kl": 0.2158203125, "learning_rate": 6.642487046632124e-07, "loss": 0.0004, "reward": 2.4999806880950928, "reward_std": 5.871805569768185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980628490448, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.3626943005181347, "grad_norm": 0.12449511016870848, "kl": 0.012115478515625, "learning_rate": 6.639896373056995e-07, "loss": 0.0001, "reward": 2.4999982118606567, "reward_std": 1.1489047153645515e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 38.3125, "epoch": 3.365284974093264, "grad_norm": 14.959340054097833, "kl": 0.204833984375, "learning_rate": 6.637305699481865e-07, "loss": 0.0008, "reward": 1.5599233508110046, "reward_std": 0.4072205275297165, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0599234104156494, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.3678756476683938, "grad_norm": 16.102922126163854, "kl": 0.08740234375, "learning_rate": 6.634715025906735e-07, "loss": 0.0002, "reward": 2.374980926513672, "reward_std": 0.23148168627972154, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749809861183167, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.3704663212435233, "grad_norm": 3.8439239213360556, "kl": 0.120849609375, "learning_rate": 6.632124352331606e-07, "loss": 0.0007, "reward": 2.4999793767929077, "reward_std": 1.647567387408344e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999793767929077, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.373056994818653, "grad_norm": 0.8088670641819842, "kl": 0.18115234375, "learning_rate": 6.629533678756476e-07, "loss": 0.0007, "reward": 2.4999886751174927, "reward_std": 6.531869985337835e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3756476683937824, "grad_norm": 12.16995096807882, "kl": 0.1015625, "learning_rate": 6.626943005181347e-07, "loss": 0.0006, "reward": 1.9998265504837036, "reward_std": 1.771404242845165e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998266100883484, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.378238341968912, "grad_norm": 0.4266573255322855, "kl": 0.0628662109375, "learning_rate": 6.624352331606218e-07, "loss": 0.0004, "reward": 2.499997615814209, "reward_std": 2.316846519079263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3808290155440415, "grad_norm": 5.712500809056355, "kl": 0.060302734375, "learning_rate": 6.621761658031087e-07, "loss": -0.0001, "reward": 2.4997068643569946, "reward_std": 4.266273646180707e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999707043170929, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.383419689119171, "grad_norm": 10.868164851703684, "kl": 0.077392578125, "learning_rate": 6.619170984455958e-07, "loss": 0.0006, "reward": 2.4374756813049316, "reward_std": 0.1768043436723019, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374757409095764, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.3860103626943006, "grad_norm": 9.432246491159606, "kl": 0.076904296875, "learning_rate": 6.616580310880828e-07, "loss": -0.0008, "reward": 2.4999754428863525, "reward_std": 1.1850605233121314e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999754428863525, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.38860103626943, "grad_norm": 0.21133365073487292, "kl": 0.151611328125, "learning_rate": 6.613989637305699e-07, "loss": 0.0003, "reward": 2.4999970197677612, "reward_std": 3.3243616144318366e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.3911917098445596, "grad_norm": 24.054526082222516, "kl": 0.14501953125, "learning_rate": 6.61139896373057e-07, "loss": 0.0006, "reward": 1.931704044342041, "reward_std": 0.1871442198753357, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4317041039466858, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 37.625, "epoch": 3.393782383419689, "grad_norm": 3.3759777040099497, "kl": 0.0775146484375, "learning_rate": 6.60880829015544e-07, "loss": 0.0012, "reward": 2.499921441078186, "reward_std": 2.3684171793547648e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999216198921204, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.3963730569948187, "grad_norm": 0.17472606320221273, "kl": 0.0771484375, "learning_rate": 6.60621761658031e-07, "loss": 0.0003, "reward": 2.4999951124191284, "reward_std": 3.3327240771541256e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.3989637305699483, "grad_norm": 9.32054821062495, "kl": 0.0533447265625, "learning_rate": 6.60362694300518e-07, "loss": 0.0018, "reward": 2.4999849796295166, "reward_std": 1.1492917337818653e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.401554404145078, "grad_norm": 0.628344296697333, "kl": 0.16064453125, "learning_rate": 6.601036269430051e-07, "loss": 0.0015, "reward": 2.499968647956848, "reward_std": 3.792652677248043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999686479568481, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.4041450777202074, "grad_norm": 0.14255676864836625, "kl": 0.0494384765625, "learning_rate": 6.598445595854922e-07, "loss": -0.0002, "reward": 2.4999953508377075, "reward_std": 2.5643305150424567e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 3.406735751295337, "grad_norm": 11.295736174446297, "kl": 0.097412109375, "learning_rate": 6.595854922279792e-07, "loss": -0.0, "reward": 1.9924728870391846, "reward_std": 5.47329968867416e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924729466438293, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.4093264248704664, "grad_norm": 0.2899929915176572, "kl": 0.060791015625, "learning_rate": 6.593264248704663e-07, "loss": 0.0017, "reward": 2.499995231628418, "reward_std": 2.67563785882885e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.411917098445596, "grad_norm": 0.15722221776304574, "kl": 0.1136474609375, "learning_rate": 6.590673575129534e-07, "loss": 0.0009, "reward": 2.499998092651367, "reward_std": 1.9036045273423952e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4145077720207255, "grad_norm": 27.013484280707754, "kl": 0.06396484375, "learning_rate": 6.588082901554403e-07, "loss": -0.0009, "reward": 2.499856948852539, "reward_std": 5.1802656059862784e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998570680618286, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.417098445595855, "grad_norm": 0.8854631233530994, "kl": 0.056640625, "learning_rate": 6.585492227979274e-07, "loss": 0.0004, "reward": 2.4999747276306152, "reward_std": 1.080063793779118e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99997478723526, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 3.4196891191709846, "grad_norm": 11.842182184099409, "kl": 0.1932373046875, "learning_rate": 6.582901554404144e-07, "loss": 0.001, "reward": 1.968591570854187, "reward_std": 0.006671816722700896, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4685916006565094, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.422279792746114, "grad_norm": 0.5684898572481198, "kl": 0.1484375, "learning_rate": 6.580310880829015e-07, "loss": 0.0013, "reward": 2.4999858140945435, "reward_std": 5.055517249274999e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999856352806091, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4248704663212437, "grad_norm": 0.8741033671541631, "kl": 0.035400390625, "learning_rate": 6.577720207253887e-07, "loss": -0.0004, "reward": 2.4999938011169434, "reward_std": 5.360691602618317e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4274611398963732, "grad_norm": 0.09088051622100037, "kl": 0.0679931640625, "learning_rate": 6.575129533678755e-07, "loss": 0.0004, "reward": 2.49999463558197, "reward_std": 1.4212792223133874e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4300518134715023, "grad_norm": 88.18927522118243, "kl": 0.12255859375, "learning_rate": 6.572538860103627e-07, "loss": 0.0001, "reward": 2.499954581260681, "reward_std": 1.4616265616496094e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999547004699707, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.432642487046632, "grad_norm": 20.95091438872185, "kl": 0.0755615234375, "learning_rate": 6.569948186528497e-07, "loss": -0.0004, "reward": 2.43748140335083, "reward_std": 0.17679336109040378, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.93748140335083, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4352331606217614, "grad_norm": 0.08554641384151447, "kl": 0.061279296875, "learning_rate": 6.567357512953368e-07, "loss": 0.0002, "reward": 2.499996542930603, "reward_std": 1.5779278044192324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.437823834196891, "grad_norm": 15.724386816815187, "kl": 0.075927734375, "learning_rate": 6.564766839378239e-07, "loss": -0.0005, "reward": 1.9812134504318237, "reward_std": 0.0001690258214921414, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4812135100364685, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.4404145077720205, "grad_norm": 0.2412479102981515, "kl": 0.057373046875, "learning_rate": 6.562176165803109e-07, "loss": -0.0012, "reward": 2.499994993209839, "reward_std": 2.5916364734257513e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.44300518134715, "grad_norm": 0.1372288832654031, "kl": 0.158203125, "learning_rate": 6.55958549222798e-07, "loss": 0.0, "reward": 2.4999895095825195, "reward_std": 1.8267392078996636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4455958549222796, "grad_norm": 6.479777720510055, "kl": 0.198974609375, "learning_rate": 6.556994818652849e-07, "loss": 0.0006, "reward": 1.804716944694519, "reward_std": 0.0005800696062578936, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3047169148921967, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.448186528497409, "grad_norm": 1.9678867874828987, "kl": 0.026458740234375, "learning_rate": 6.55440414507772e-07, "loss": 0.0001, "reward": 2.499990224838257, "reward_std": 1.0386611847934546e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.4507772020725387, "grad_norm": 5.201974811130854, "kl": 0.120849609375, "learning_rate": 6.551813471502591e-07, "loss": 0.0003, "reward": 1.999235451221466, "reward_std": 8.643494857096812e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992355108261108, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 3.4533678756476682, "grad_norm": 19.621498723289225, "kl": 0.13623046875, "learning_rate": 6.549222797927461e-07, "loss": 0.0005, "reward": 1.9854612350463867, "reward_std": 0.20790574957572971, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4854612350463867, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4559585492227978, "grad_norm": 2.0470128638190235, "kl": 0.125, "learning_rate": 6.546632124352332e-07, "loss": 0.0007, "reward": 2.499990940093994, "reward_std": 7.779780503369693e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4585492227979273, "grad_norm": 1.6392997775910472, "kl": 0.07958984375, "learning_rate": 6.544041450777201e-07, "loss": 0.001, "reward": 2.499949097633362, "reward_std": 1.1032195914140175e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999488592147827, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.461139896373057, "grad_norm": 1.8777779445871616, "kl": 0.0667724609375, "learning_rate": 6.541450777202072e-07, "loss": -0.0006, "reward": 2.4999806880950928, "reward_std": 9.163160939351656e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999808073043823, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.4637305699481864, "grad_norm": 0.13381814044751406, "kl": 0.110595703125, "learning_rate": 6.538860103626943e-07, "loss": -0.0003, "reward": 2.4999895095825195, "reward_std": 3.170826403220417e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.466321243523316, "grad_norm": 936.7902888793681, "kl": 0.0767822265625, "learning_rate": 6.536269430051813e-07, "loss": 0.0002, "reward": 1.9934600591659546, "reward_std": 0.003993490203356487, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4934599995613098, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4689119170984455, "grad_norm": 20.647253045583795, "kl": 0.108642578125, "learning_rate": 6.533678756476684e-07, "loss": 0.0006, "reward": 2.4988105297088623, "reward_std": 0.0001235436582192051, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9988104104995728, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.471502590673575, "grad_norm": 35.22651520092051, "kl": 0.0614013671875, "learning_rate": 6.531088082901555e-07, "loss": 0.0005, "reward": 1.999617338180542, "reward_std": 3.5019628285226645e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996172785758972, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4740932642487046, "grad_norm": 0.4369136949688955, "kl": 0.0556640625, "learning_rate": 6.528497409326425e-07, "loss": -0.0002, "reward": 2.4999958276748657, "reward_std": 3.6921732089467696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.476683937823834, "grad_norm": 8.776868774273215, "kl": 0.143798828125, "learning_rate": 6.525906735751295e-07, "loss": 0.0009, "reward": 2.4368677139282227, "reward_std": 0.17856070416701186, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9368676543235779, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 36.1875, "epoch": 3.4792746113989637, "grad_norm": 3.833796370548108, "kl": 0.20751953125, "learning_rate": 6.523316062176165e-07, "loss": 0.0006, "reward": 2.499929904937744, "reward_std": 2.5398303478141315e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999929964542389, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.481865284974093, "grad_norm": 5.392781955184533, "kl": 0.1142578125, "learning_rate": 6.520725388601036e-07, "loss": 0.0003, "reward": 2.4998639822006226, "reward_std": 6.730443169544742e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998641610145569, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.4844559585492227, "grad_norm": 0.7610112195811284, "kl": 0.122802734375, "learning_rate": 6.518134715025907e-07, "loss": 0.0012, "reward": 2.4999645948410034, "reward_std": 6.161727014841745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999645948410034, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.4870466321243523, "grad_norm": 0.0842702726023439, "kl": 0.0823974609375, "learning_rate": 6.515544041450777e-07, "loss": 0.0002, "reward": 2.499996304512024, "reward_std": 1.5945765312608273e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.489637305699482, "grad_norm": 2.2505849321534086, "kl": 0.127197265625, "learning_rate": 6.512953367875648e-07, "loss": -0.0004, "reward": 2.499969959259033, "reward_std": 1.367196182400221e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999698400497437, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4922279792746114, "grad_norm": 1.6636532598923264, "kl": 0.07275390625, "learning_rate": 6.510362694300517e-07, "loss": 0.0004, "reward": 2.4999873638153076, "reward_std": 7.710978479735786e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873638153076, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.494818652849741, "grad_norm": 0.3571506390553103, "kl": 0.0712890625, "learning_rate": 6.507772020725388e-07, "loss": -0.0007, "reward": 2.499982237815857, "reward_std": 4.9869615850184346e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.4974093264248705, "grad_norm": 0.24088259429386338, "kl": 0.13720703125, "learning_rate": 6.505181347150259e-07, "loss": 0.0003, "reward": 2.4999942779541016, "reward_std": 2.79588209650683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.5, "grad_norm": 1.537052628856158, "kl": 0.0604248046875, "learning_rate": 6.502590673575129e-07, "loss": 0.0007, "reward": 2.499988079071045, "reward_std": 4.063590722580557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5025906735751295, "grad_norm": 0.16246596931791904, "kl": 0.01953125, "learning_rate": 6.5e-07, "loss": -0.0001, "reward": 2.499993324279785, "reward_std": 3.6890651244902983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.505181347150259, "grad_norm": 4.884301071816906, "kl": 0.12060546875, "learning_rate": 6.49740932642487e-07, "loss": 0.0002, "reward": 1.9008011221885681, "reward_std": 0.00030973899993114173, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4008011519908905, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5077720207253886, "grad_norm": 2.3989105629342515, "kl": 0.0772705078125, "learning_rate": 6.49481865284974e-07, "loss": 0.0007, "reward": 1.9999024868011475, "reward_std": 1.3443967873172369e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999024868011475, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.510362694300518, "grad_norm": 0.08716403584693133, "kl": 0.1171875, "learning_rate": 6.492227979274611e-07, "loss": 0.0008, "reward": 2.499993324279785, "reward_std": 2.5999656827480067e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5129533678756477, "grad_norm": 1.3719280672515863, "kl": 0.073486328125, "learning_rate": 6.489637305699481e-07, "loss": 0.0003, "reward": 2.499994158744812, "reward_std": 4.417373361320642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5155440414507773, "grad_norm": 5.486186413154993, "kl": 0.0814208984375, "learning_rate": 6.487046632124352e-07, "loss": -0.0, "reward": 2.499882221221924, "reward_std": 4.369443740870338e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998822808265686, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 38.625, "epoch": 3.518134715025907, "grad_norm": 0.04886309299963757, "kl": 0.129638671875, "learning_rate": 6.484455958549222e-07, "loss": 0.0007, "reward": 2.499998092651367, "reward_std": 1.1115929225979926e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 38.125, "epoch": 3.5207253886010363, "grad_norm": 33.036188365258056, "kl": 0.1331787109375, "learning_rate": 6.481865284974093e-07, "loss": 0.0014, "reward": 2.4194105863571167, "reward_std": 0.22792745969275074, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9194104671478271, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.523316062176166, "grad_norm": 13.508458333375703, "kl": 0.076904296875, "learning_rate": 6.479274611398963e-07, "loss": 0.0006, "reward": 2.1874241828918457, "reward_std": 0.25881362627660565, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874240636825562, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 37.75, "epoch": 3.5259067357512954, "grad_norm": 2.384423771437139, "kl": 0.1627197265625, "learning_rate": 6.476683937823833e-07, "loss": 0.001, "reward": 2.4999488592147827, "reward_std": 1.8778174535327707e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999948799610138, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.528497409326425, "grad_norm": 0.29165096178079064, "kl": 0.0421142578125, "learning_rate": 6.474093264248704e-07, "loss": -0.0001, "reward": 2.4999932050704956, "reward_std": 2.4304994212798192e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5310880829015545, "grad_norm": 0.687794608290158, "kl": 0.1162109375, "learning_rate": 6.471502590673574e-07, "loss": 0.0018, "reward": 2.499951481819153, "reward_std": 1.0045186854767962e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999513626098633, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.533678756476684, "grad_norm": 11.83038878869347, "kl": 0.064208984375, "learning_rate": 6.468911917098445e-07, "loss": 0.0003, "reward": 2.499903678894043, "reward_std": 3.691922302095918e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999035596847534, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.5362694300518136, "grad_norm": 9.641466328253756, "kl": 0.110595703125, "learning_rate": 6.466321243523317e-07, "loss": 0.0019, "reward": 2.499993085861206, "reward_std": 1.0167843470298976e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.538860103626943, "grad_norm": 31.796727797747458, "kl": 0.0430908203125, "learning_rate": 6.463730569948185e-07, "loss": 0.0005, "reward": 2.187224805355072, "reward_std": 0.2589843902571829, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687224805355072, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.5414507772020727, "grad_norm": 6.770195985628733, "kl": 0.16998291015625, "learning_rate": 6.461139896373057e-07, "loss": 0.0001, "reward": 1.9995607137680054, "reward_std": 1.90273037787847e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995607733726501, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5440414507772022, "grad_norm": 0.09570513663860637, "kl": 0.1083984375, "learning_rate": 6.458549222797928e-07, "loss": 0.0003, "reward": 2.499997615814209, "reward_std": 1.8369141230323294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5466321243523318, "grad_norm": 2.0688750931797997, "kl": 0.07958984375, "learning_rate": 6.455958549222798e-07, "loss": 0.0001, "reward": 1.9999399185180664, "reward_std": 1.4084825579629978e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999399185180664, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.5492227979274613, "grad_norm": 0.1956092724269007, "kl": 0.074462890625, "learning_rate": 6.453367875647669e-07, "loss": 0.0003, "reward": 2.499996542930603, "reward_std": 2.3148721766119706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.551813471502591, "grad_norm": 1.2204397791933161, "kl": 0.0482177734375, "learning_rate": 6.450777202072539e-07, "loss": -0.0004, "reward": 2.4999887943267822, "reward_std": 8.773908575676614e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5544041450777204, "grad_norm": 0.042786623974987385, "kl": 0.04119873046875, "learning_rate": 6.448186528497409e-07, "loss": 0.0, "reward": 2.4999970197677612, "reward_std": 1.2239580655659665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.55699481865285, "grad_norm": 9.801799145982999, "kl": 0.111083984375, "learning_rate": 6.44559585492228e-07, "loss": 0.0009, "reward": 1.9999391436576843, "reward_std": 1.510055244580144e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999392330646515, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5595854922279795, "grad_norm": 0.9564844454749356, "kl": 0.0556640625, "learning_rate": 6.44300518134715e-07, "loss": 0.0012, "reward": 2.4999935626983643, "reward_std": 6.840491778348223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.562176165803109, "grad_norm": 0.08207911425817606, "kl": 0.066650390625, "learning_rate": 6.440414507772021e-07, "loss": -0.0003, "reward": 2.4999847412109375, "reward_std": 1.7326361785308109e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.5647668393782386, "grad_norm": 0.634151425587639, "kl": 0.11083984375, "learning_rate": 6.437823834196891e-07, "loss": 0.0001, "reward": 2.4999961853027344, "reward_std": 2.462986032014669e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.567357512953368, "grad_norm": 3.2257225761877146, "kl": 0.167724609375, "learning_rate": 6.435233160621762e-07, "loss": 0.0007, "reward": 1.492495596408844, "reward_std": 0.00014754161384189501, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9924955666065216, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 3.5699481865284977, "grad_norm": 2.002782565817255, "kl": 0.14013671875, "learning_rate": 6.432642487046632e-07, "loss": 0.0011, "reward": 2.499993085861206, "reward_std": 7.58046803639445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 3.572538860103627, "grad_norm": 0.6630837855530316, "kl": 0.03839111328125, "learning_rate": 6.430051813471502e-07, "loss": -0.0006, "reward": 2.499860167503357, "reward_std": 8.493611289850378e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999860405921936, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.5751295336787567, "grad_norm": 2.3177791716460807, "kl": 0.11590576171875, "learning_rate": 6.427461139896373e-07, "loss": 0.0002, "reward": 1.9996198415756226, "reward_std": 3.448325196586666e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996198415756226, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5777202072538863, "grad_norm": 6.142709358445661, "kl": 0.17138671875, "learning_rate": 6.424870466321243e-07, "loss": 0.0009, "reward": 1.9998284578323364, "reward_std": 2.3680084268562496e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998283088207245, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 3.5803108808290154, "grad_norm": 102.73437638597295, "kl": 0.0321044921875, "learning_rate": 6.422279792746114e-07, "loss": -0.0002, "reward": 2.3749892711639404, "reward_std": 0.353569598915783, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749892115592957, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.582901554404145, "grad_norm": 5.900914590186641, "kl": 0.328125, "learning_rate": 6.419689119170985e-07, "loss": 0.0005, "reward": 1.8437442779541016, "reward_std": 0.5499613261595186, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.3749943673610687, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5854922279792745, "grad_norm": 4.152174499610402, "kl": 0.107177734375, "learning_rate": 6.417098445595854e-07, "loss": 0.0013, "reward": 2.4998902082443237, "reward_std": 3.497060606605373e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999890148639679, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.588082901554404, "grad_norm": 0.0915231309630497, "kl": 0.04150390625, "learning_rate": 6.414507772020725e-07, "loss": -0.0004, "reward": 2.4999977350234985, "reward_std": 1.0129642191714083e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.5906735751295336, "grad_norm": 0.1538020752548344, "kl": 0.094970703125, "learning_rate": 6.411917098445595e-07, "loss": 0.0007, "reward": 2.499993324279785, "reward_std": 4.426532541401684e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.593264248704663, "grad_norm": 1.731495223151456, "kl": 0.07666015625, "learning_rate": 6.409326424870466e-07, "loss": 0.0007, "reward": 2.499967575073242, "reward_std": 1.0554545269769733e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999675750732422, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.5958549222797926, "grad_norm": 1.0285586211360336, "kl": 0.104248046875, "learning_rate": 6.406735751295337e-07, "loss": -0.0004, "reward": 1.9998092651367188, "reward_std": 1.2343170510575874e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499809443950653, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.598445595854922, "grad_norm": 29.928070988932145, "kl": 0.09912109375, "learning_rate": 6.404145077720207e-07, "loss": 0.0004, "reward": 1.9146441221237183, "reward_std": 0.07638257455630537, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.414644181728363, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.6010362694300517, "grad_norm": 0.7744931016392296, "kl": 0.077392578125, "learning_rate": 6.401554404145077e-07, "loss": 0.0005, "reward": 2.4999934434890747, "reward_std": 5.0651053697947646e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6036269430051813, "grad_norm": 15.867556184862849, "kl": 0.1455078125, "learning_rate": 6.398963730569948e-07, "loss": 0.0007, "reward": 1.9977235198020935, "reward_std": 6.542609679627276e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497723639011383, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.606217616580311, "grad_norm": 0.592858370697047, "kl": 0.0648193359375, "learning_rate": 6.396373056994818e-07, "loss": 0.0011, "reward": 2.4999911785125732, "reward_std": 7.817323876224691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.6088082901554404, "grad_norm": 48.216732190586505, "kl": 0.15283203125, "learning_rate": 6.393782383419689e-07, "loss": 0.0006, "reward": 1.6580806970596313, "reward_std": 0.42604246735572815, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1580806970596313, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.61139896373057, "grad_norm": 0.922680156598505, "kl": 0.072021484375, "learning_rate": 6.391191709844559e-07, "loss": 0.0007, "reward": 1.9995518326759338, "reward_std": 2.1345707921227586e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995518326759338, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 3.6139896373056994, "grad_norm": 2.597384218268352, "kl": 0.154296875, "learning_rate": 6.38860103626943e-07, "loss": 0.0008, "reward": 1.9921189546585083, "reward_std": 0.00010953972281413371, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.492118924856186, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.616580310880829, "grad_norm": 4.002011519964906, "kl": 0.0521240234375, "learning_rate": 6.3860103626943e-07, "loss": 0.0001, "reward": 1.9984354972839355, "reward_std": 5.356251585908467e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498435527086258, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6191709844559585, "grad_norm": 22.237915238704165, "kl": 0.025115966796875, "learning_rate": 6.38341968911917e-07, "loss": -0.0005, "reward": 2.3124756813049316, "reward_std": 0.2588063213181897, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124756217002869, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 3.621761658031088, "grad_norm": 9.719728532205218, "kl": 0.4072265625, "learning_rate": 6.380829015544041e-07, "loss": 0.0019, "reward": 2.343744993209839, "reward_std": 0.44194230279202884, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749948143959045, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6243523316062176, "grad_norm": 1.4982565492046702, "kl": 0.1103515625, "learning_rate": 6.378238341968911e-07, "loss": 0.0009, "reward": 2.499995470046997, "reward_std": 4.156313934799982e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 3.626943005181347, "grad_norm": 1.5379201064532138, "kl": 0.0740966796875, "learning_rate": 6.375647668393782e-07, "loss": 0.0008, "reward": 2.499988555908203, "reward_std": 7.125452953005151e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999882578849792, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 3.6295336787564767, "grad_norm": 0.20639443602653573, "kl": 0.18115234375, "learning_rate": 6.373056994818653e-07, "loss": 0.0006, "reward": 2.499995708465576, "reward_std": 2.159172140636656e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6321243523316062, "grad_norm": 0.09194698487599, "kl": 0.03240966796875, "learning_rate": 6.370466321243522e-07, "loss": -0.0002, "reward": 2.49999737739563, "reward_std": 1.4971998325563618e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.634715025906736, "grad_norm": 2.9595577725823152, "kl": 0.10400390625, "learning_rate": 6.367875647668393e-07, "loss": 0.0013, "reward": 1.9972585439682007, "reward_std": 7.509514551884422e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497258484363556, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.6373056994818653, "grad_norm": 27.729012271010642, "kl": 0.02850341796875, "learning_rate": 6.365284974093263e-07, "loss": -0.0002, "reward": 2.4999806880950928, "reward_std": 1.1388373422960285e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999807476997375, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.639896373056995, "grad_norm": 4.580947992444961, "kl": 0.14990234375, "learning_rate": 6.362694300518134e-07, "loss": 0.0012, "reward": 1.9982608556747437, "reward_std": 6.758928202543757e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498260736465454, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 3.6424870466321244, "grad_norm": 1.7019039901118083, "kl": 0.37109375, "learning_rate": 6.360103626943006e-07, "loss": 0.0015, "reward": 2.4999918937683105, "reward_std": 2.6950671667691495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.645077720207254, "grad_norm": 26.25476992407301, "kl": 0.075927734375, "learning_rate": 6.357512953367876e-07, "loss": -0.0002, "reward": 2.4999547004699707, "reward_std": 2.206633030255034e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999548196792603, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6476683937823835, "grad_norm": 0.2285771512332544, "kl": 0.086669921875, "learning_rate": 6.354922279792746e-07, "loss": 0.0006, "reward": 2.4999818801879883, "reward_std": 2.496620936653926e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818801879883, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 3.650259067357513, "grad_norm": 10.858232327595445, "kl": 0.12353515625, "learning_rate": 6.352331606217615e-07, "loss": 0.0009, "reward": 1.802313208580017, "reward_std": 0.0008293414975923952, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.302313208580017, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6528497409326426, "grad_norm": 0.20869201786593009, "kl": 0.0557861328125, "learning_rate": 6.349740932642487e-07, "loss": -0.0005, "reward": 2.499996304512024, "reward_std": 2.4590092380094575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.655440414507772, "grad_norm": 1.1609635142045087, "kl": 0.114013671875, "learning_rate": 6.347150259067358e-07, "loss": 0.0006, "reward": 2.499985098838806, "reward_std": 5.292229616316035e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852776527405, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6580310880829017, "grad_norm": 57.20844343720344, "kl": 0.28369140625, "learning_rate": 6.344559585492228e-07, "loss": 0.0016, "reward": 2.249961793422699, "reward_std": 0.26729934694503754, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499616742134094, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 3.660621761658031, "grad_norm": 0.12267601600877713, "kl": 0.12841796875, "learning_rate": 6.341968911917099e-07, "loss": 0.0008, "reward": 2.499998450279236, "reward_std": 1.4982163065724308e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6632124352331608, "grad_norm": 0.9094826749190009, "kl": 0.096435546875, "learning_rate": 6.339378238341969e-07, "loss": -0.0003, "reward": 2.4999935626983643, "reward_std": 7.375853328994708e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.6658031088082903, "grad_norm": 0.39852115044805114, "kl": 0.065185546875, "learning_rate": 6.336787564766839e-07, "loss": -0.0, "reward": 2.4999945163726807, "reward_std": 3.151433929815539e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.66839378238342, "grad_norm": 0.18089621270722947, "kl": 0.087158203125, "learning_rate": 6.33419689119171e-07, "loss": 0.0006, "reward": 2.499990940093994, "reward_std": 5.112204917168128e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.6709844559585494, "grad_norm": 9.963593632923372, "kl": 0.0513916015625, "learning_rate": 6.33160621761658e-07, "loss": -0.0003, "reward": 1.9926939010620117, "reward_std": 5.718518514186144e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926939606666565, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 3.6735751295336785, "grad_norm": 4.7307522210237, "kl": 0.23291015625, "learning_rate": 6.329015544041451e-07, "loss": 0.0015, "reward": 2.499953866004944, "reward_std": 1.4270152519202384e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999537467956543, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 39.0, "epoch": 3.676165803108808, "grad_norm": 2.970753992091834, "kl": 0.27880859375, "learning_rate": 6.326424870466322e-07, "loss": 0.0018, "reward": 1.9996845722198486, "reward_std": 2.2852327788314142e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996845126152039, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 3.6787564766839376, "grad_norm": 24.619561430077106, "kl": 0.1337890625, "learning_rate": 6.323834196891191e-07, "loss": -0.0004, "reward": 1.773663878440857, "reward_std": 0.0006489680408776621, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2736640572547913, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 3.681347150259067, "grad_norm": 23.072096035863627, "kl": 0.177734375, "learning_rate": 6.321243523316062e-07, "loss": 0.0006, "reward": 2.4999451637268066, "reward_std": 4.119383868328441e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999451637268066, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6839378238341967, "grad_norm": 2.0821958130165252, "kl": 0.139892578125, "learning_rate": 6.318652849740932e-07, "loss": 0.0015, "reward": 2.49995756149292, "reward_std": 1.542653672004235e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999573826789856, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 3.686528497409326, "grad_norm": 0.1564653787225123, "kl": 0.24072265625, "learning_rate": 6.316062176165803e-07, "loss": 0.001, "reward": 2.4999951124191284, "reward_std": 2.60988224454195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6891191709844557, "grad_norm": 16.6406722817717, "kl": 0.1024169921875, "learning_rate": 6.313471502590674e-07, "loss": 0.0007, "reward": 2.3745580911636353, "reward_std": 0.23150357842678204, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.87455815076828, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.6917098445595853, "grad_norm": 2.004090882861043, "kl": 0.121337890625, "learning_rate": 6.310880829015544e-07, "loss": 0.0002, "reward": 2.4999486207962036, "reward_std": 1.4200291843735613e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999486207962036, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 37.1875, "epoch": 3.694300518134715, "grad_norm": 0.6403312950485796, "kl": 0.02777099609375, "learning_rate": 6.308290155440414e-07, "loss": -0.0005, "reward": 2.4999945163726807, "reward_std": 3.3366154639224987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 3.6968911917098444, "grad_norm": 2.9205639466666216, "kl": 0.26416015625, "learning_rate": 6.305699481865284e-07, "loss": 0.0007, "reward": 2.499971032142639, "reward_std": 1.3693184882868081e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999709129333496, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 3.699481865284974, "grad_norm": 15.446863286807062, "kl": 0.236328125, "learning_rate": 6.303108808290155e-07, "loss": 0.0015, "reward": 2.4999682903289795, "reward_std": 1.2488001516430813e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999682307243347, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 3.7020725388601035, "grad_norm": 463.5197166974264, "kl": 0.32568359375, "learning_rate": 6.300518134715026e-07, "loss": 0.0017, "reward": 1.886283040046692, "reward_std": 0.0015620035305801139, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3862829506397247, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 3.704663212435233, "grad_norm": 9.058402794711746, "kl": 0.2177734375, "learning_rate": 6.297927461139896e-07, "loss": 0.001, "reward": 1.9995219707489014, "reward_std": 0.00014197003883964499, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995220303535461, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 3.7072538860103625, "grad_norm": 6.903927491865712, "kl": 0.3115234375, "learning_rate": 6.295336787564767e-07, "loss": 0.0002, "reward": 1.9875686764717102, "reward_std": 0.00022618269491658793, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4875686764717102, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.709844559585492, "grad_norm": 12.484884334585084, "kl": 0.085693359375, "learning_rate": 6.292746113989636e-07, "loss": 0.0002, "reward": 1.951571524143219, "reward_std": 0.00031776617959167197, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4515715837478638, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.7124352331606216, "grad_norm": 0.3476356277318895, "kl": 0.1123046875, "learning_rate": 6.290155440414507e-07, "loss": -0.0001, "reward": 2.4999953508377075, "reward_std": 4.329747127940209e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 37.9375, "epoch": 3.715025906735751, "grad_norm": 4.4383373794875425, "kl": 0.1513671875, "learning_rate": 6.287564766839378e-07, "loss": 0.0002, "reward": 2.4999500513076782, "reward_std": 1.634702380215458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999950110912323, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 3.7176165803108807, "grad_norm": 132.41384826968695, "kl": 0.333984375, "learning_rate": 6.284974093264248e-07, "loss": 0.0013, "reward": 1.6860251426696777, "reward_std": 0.2595846206677379, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1860252022743225, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 3.7202072538860103, "grad_norm": 7.152894042537265, "kl": 0.17431640625, "learning_rate": 6.282383419689119e-07, "loss": 0.0007, "reward": 1.3684092164039612, "reward_std": 0.0008181643206626177, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8684092164039612, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 39.625, "epoch": 3.72279792746114, "grad_norm": 1.9817040373736259, "kl": 0.25732421875, "learning_rate": 6.27979274611399e-07, "loss": 0.0007, "reward": 2.4999974966049194, "reward_std": 2.305078282915929e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 3.7253886010362693, "grad_norm": 1.766734935263979, "kl": 0.22509765625, "learning_rate": 6.277202072538859e-07, "loss": 0.0005, "reward": 1.9937055706977844, "reward_std": 4.73358773263044e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4937056005001068, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 38.5625, "epoch": 3.727979274611399, "grad_norm": 0.1141355469537074, "kl": 0.5810546875, "learning_rate": 6.27461139896373e-07, "loss": 0.0029, "reward": 2.4999990463256836, "reward_std": 8.357976355455321e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 3.7305699481865284, "grad_norm": 0.10434125774158462, "kl": 0.2255859375, "learning_rate": 6.2720207253886e-07, "loss": 0.0019, "reward": 2.4999983310699463, "reward_std": 1.1934515669054235e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 40.9375, "epoch": 3.733160621761658, "grad_norm": 0.9799096359733886, "kl": 0.34765625, "learning_rate": 6.269430051813471e-07, "loss": 0.0014, "reward": 2.499993324279785, "reward_std": 4.789996694398724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 40.5, "epoch": 3.7357512953367875, "grad_norm": 5.954220937402469, "kl": 0.708984375, "learning_rate": 6.266839378238342e-07, "loss": 0.0027, "reward": 1.992884337902069, "reward_std": 0.0001379923523927573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4928842782974243, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 41.3125, "epoch": 3.738341968911917, "grad_norm": 1.1318460729934623, "kl": 0.666015625, "learning_rate": 6.264248704663212e-07, "loss": 0.0039, "reward": 2.4999923706054688, "reward_std": 5.94410153098579e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 40.8125, "epoch": 3.7409326424870466, "grad_norm": 18.12893911097727, "kl": 0.9013671875, "learning_rate": 6.261658031088083e-07, "loss": 0.0037, "reward": 1.7200507819652557, "reward_std": 0.0006644495169894071, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2200506627559662, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 40.375, "epoch": 3.743523316062176, "grad_norm": 0.18701260774233333, "kl": 1.04296875, "learning_rate": 6.259067357512952e-07, "loss": 0.0034, "reward": 2.499998092651367, "reward_std": 1.8562637649210956e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 77.9375, "epoch": 3.7461139896373057, "grad_norm": 1.0751335434105949, "kl": 1.451171875, "learning_rate": 6.256476683937823e-07, "loss": 0.0068, "reward": 2.499949812889099, "reward_std": 7.904093649813149e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999949872493744, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 70.0, "epoch": 3.7487046632124352, "grad_norm": 1.716833788318873, "kl": 0.8203125, "learning_rate": 6.253886010362694e-07, "loss": 0.0031, "reward": 2.4999661445617676, "reward_std": 8.809275755083945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999966323375702, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 94.0, "epoch": 3.7512953367875648, "grad_norm": 10.07558271715465, "kl": 0.62109375, "learning_rate": 6.251295336787564e-07, "loss": 0.0028, "reward": 1.9993083477020264, "reward_std": 0.0002407157758170797, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993082880973816, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 59.5625, "epoch": 3.7538860103626943, "grad_norm": 0.8677486357109474, "kl": 0.984375, "learning_rate": 6.248704663212436e-07, "loss": 0.0047, "reward": 2.499996066093445, "reward_std": 4.674857109421282e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 84.3125, "epoch": 3.756476683937824, "grad_norm": 0.2824385889796441, "kl": 0.654296875, "learning_rate": 6.246113989637304e-07, "loss": 0.0026, "reward": 2.499992251396179, "reward_std": 2.972786660393467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 120.125, "epoch": 3.7590673575129534, "grad_norm": 0.4941509984846888, "kl": 0.68359375, "learning_rate": 6.243523316062176e-07, "loss": 0.0027, "reward": 2.4999935626983643, "reward_std": 2.8710551305266563e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 70.4375, "epoch": 3.761658031088083, "grad_norm": 13.854696272013184, "kl": 0.48046875, "learning_rate": 6.240932642487047e-07, "loss": 0.0025, "reward": 1.9981037378311157, "reward_std": 6.392458408299717e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981036186218262, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 150.125, "epoch": 3.7642487046632125, "grad_norm": 0.3009483321137214, "kl": 0.6611328125, "learning_rate": 6.238341968911917e-07, "loss": 0.0024, "reward": 2.4999921321868896, "reward_std": 4.073889613209758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 64.9375, "epoch": 3.766839378238342, "grad_norm": 14.157154907881363, "kl": 0.75390625, "learning_rate": 6.235751295336788e-07, "loss": 0.0025, "reward": 1.9996938705444336, "reward_std": 6.909230296514579e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996941089630127, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 105.8125, "epoch": 3.7694300518134716, "grad_norm": 33.689926696100336, "kl": 0.376953125, "learning_rate": 6.233160621761658e-07, "loss": 0.0015, "reward": 1.9743483066558838, "reward_std": 0.01166716232910403, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4743481874465942, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 61.125, "epoch": 3.772020725388601, "grad_norm": 3.229081384481035, "kl": 1.125, "learning_rate": 6.230569948186529e-07, "loss": 0.0045, "reward": 2.4999780654907227, "reward_std": 3.5143618788424646e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781250953674, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 108.1875, "epoch": 3.7746113989637307, "grad_norm": 0.47853361161886987, "kl": 0.4794921875, "learning_rate": 6.227979274611399e-07, "loss": 0.0017, "reward": 1.9998518228530884, "reward_std": 8.310325142701913e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998517334461212, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 61.625, "epoch": 3.77720207253886, "grad_norm": 2.285716281999186, "kl": 0.890625, "learning_rate": 6.225388601036269e-07, "loss": 0.0034, "reward": 2.4999836683273315, "reward_std": 9.813125188884442e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999836683273315, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 73.125, "epoch": 3.7797927461139897, "grad_norm": 9.024346679469728, "kl": 1.03515625, "learning_rate": 6.22279792746114e-07, "loss": 0.0047, "reward": 2.4999150037765503, "reward_std": 4.231919552921681e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999147057533264, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 55.5625, "epoch": 3.7823834196891193, "grad_norm": 0.5899310847730816, "kl": 0.724609375, "learning_rate": 6.22020725388601e-07, "loss": 0.0017, "reward": 2.4999972581863403, "reward_std": 1.3974001831229543e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 84.1875, "epoch": 3.784974093264249, "grad_norm": 32.00919419799135, "kl": 0.814453125, "learning_rate": 6.217616580310881e-07, "loss": 0.0034, "reward": 1.9992035627365112, "reward_std": 8.725093562134134e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992035031318665, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 55.25, "epoch": 3.7875647668393784, "grad_norm": 0.18407889287867105, "kl": 0.806640625, "learning_rate": 6.215025906735752e-07, "loss": 0.0041, "reward": 2.499992251396179, "reward_std": 4.0907706306825276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 54.375, "epoch": 3.790155440414508, "grad_norm": 0.6562043621246557, "kl": 0.66796875, "learning_rate": 6.212435233160621e-07, "loss": 0.0016, "reward": 2.499985694885254, "reward_std": 3.916427260719502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858736991882, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 42.0625, "epoch": 3.7927461139896375, "grad_norm": 0.17764864475122358, "kl": 0.7236328125, "learning_rate": 6.209844559585492e-07, "loss": 0.0029, "reward": 2.4999979734420776, "reward_std": 2.6351077053732297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 66.625, "epoch": 3.795336787564767, "grad_norm": 2.8898494809438984, "kl": 0.626953125, "learning_rate": 6.207253886010363e-07, "loss": 0.0029, "reward": 1.9983633160591125, "reward_std": 5.549153206629853e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983634054660797, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 66.875, "epoch": 3.7979274611398965, "grad_norm": 4.633969522990657, "kl": 0.693359375, "learning_rate": 6.204663212435233e-07, "loss": 0.0036, "reward": 2.49995756149292, "reward_std": 4.5567202505480964e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999574422836304, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 48.125, "epoch": 3.800518134715026, "grad_norm": 0.19547863285249395, "kl": 0.33984375, "learning_rate": 6.202072538860104e-07, "loss": 0.0014, "reward": 0.9998528361320496, "reward_std": 3.5379691780690337e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.4998527765274048, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 51.75, "epoch": 3.8031088082901556, "grad_norm": 0.9795643348661174, "kl": 0.373046875, "learning_rate": 6.199481865284974e-07, "loss": 0.0011, "reward": 2.4999929666519165, "reward_std": 5.263982984615723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 44.375, "epoch": 3.805699481865285, "grad_norm": 0.4754978042313422, "kl": 0.3701171875, "learning_rate": 6.196891191709844e-07, "loss": 0.0022, "reward": 2.4999914169311523, "reward_std": 6.005264140185318e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 39.375, "epoch": 3.8082901554404147, "grad_norm": 0.5144800245672949, "kl": 0.1090087890625, "learning_rate": 6.194300518134715e-07, "loss": 0.0006, "reward": 2.4999231100082397, "reward_std": 9.189096090267412e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999231100082397, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 42.125, "epoch": 3.8108808290155443, "grad_norm": 0.4560077735322792, "kl": 0.142822265625, "learning_rate": 6.191709844559585e-07, "loss": 0.0015, "reward": 2.499967336654663, "reward_std": 5.862255136435124e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999670386314392, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.813471502590674, "grad_norm": 49.41466699436052, "kl": 0.049072265625, "learning_rate": 6.189119170984456e-07, "loss": -0.0001, "reward": 2.499945878982544, "reward_std": 4.246328614954109e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999459981918335, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 3.8160621761658033, "grad_norm": 1.3809840963766469, "kl": 0.139892578125, "learning_rate": 6.186528497409326e-07, "loss": 0.0006, "reward": 2.499993920326233, "reward_std": 4.2472178733987676e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 3.818652849740933, "grad_norm": 0.37197075865265644, "kl": 0.09930419921875, "learning_rate": 6.183937823834197e-07, "loss": -0.0002, "reward": 1.9998546838760376, "reward_std": 9.605147170077544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998548924922943, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 38.875, "epoch": 3.8212435233160624, "grad_norm": 17.46554141511608, "kl": 0.16748046875, "learning_rate": 6.181347150259067e-07, "loss": 0.0007, "reward": 2.3118717670440674, "reward_std": 0.4098711311817169, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8118716478347778, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 3.823834196891192, "grad_norm": 13.832986110571346, "kl": 0.367431640625, "learning_rate": 6.178756476683937e-07, "loss": 0.0011, "reward": 2.499951958656311, "reward_std": 0.00011580804778077436, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999520182609558, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 3.8264248704663215, "grad_norm": 0.15626784662529, "kl": 0.1424560546875, "learning_rate": 6.176165803108808e-07, "loss": 0.0014, "reward": 2.4999951124191284, "reward_std": 2.029710998385781e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 3.8290155440414506, "grad_norm": 0.6904260654016462, "kl": 0.1005859375, "learning_rate": 6.173575129533678e-07, "loss": -0.0003, "reward": 2.4999685287475586, "reward_std": 6.046272119419882e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999684691429138, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 3.83160621761658, "grad_norm": 1.1816070078883867, "kl": 0.13916015625, "learning_rate": 6.170984455958549e-07, "loss": 0.0015, "reward": 1.9991791248321533, "reward_std": 3.0139472301016212e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499178946018219, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 3.8341968911917097, "grad_norm": 0.7107998092785732, "kl": 0.114013671875, "learning_rate": 6.16839378238342e-07, "loss": 0.0, "reward": 2.4999890327453613, "reward_std": 4.616178784999647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989092350006, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8367875647668392, "grad_norm": 0.047679444385930934, "kl": 0.1220703125, "learning_rate": 6.165803108808289e-07, "loss": -0.0005, "reward": 2.499991536140442, "reward_std": 1.5257598420248542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.839378238341969, "grad_norm": 0.15190050677486275, "kl": 0.0980072021484375, "learning_rate": 6.16321243523316e-07, "loss": -0.0008, "reward": 2.4999955892562866, "reward_std": 2.4593052785348846e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8419689119170983, "grad_norm": 1.8095366787743785, "kl": 0.04498291015625, "learning_rate": 6.16062176165803e-07, "loss": -0.0006, "reward": 2.499949097633362, "reward_std": 1.2512091188909835e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999490976333618, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.844559585492228, "grad_norm": 3.050845107592284, "kl": 0.135498046875, "learning_rate": 6.158031088082901e-07, "loss": 0.0013, "reward": 2.4999804496765137, "reward_std": 1.1739518924969161e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980390071869, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8471502590673574, "grad_norm": 3.9106821283709814, "kl": 0.21875, "learning_rate": 6.155440414507772e-07, "loss": 0.0014, "reward": 2.4999276399612427, "reward_std": 1.6660619166941615e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999274611473083, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.849740932642487, "grad_norm": 1.478544684566244, "kl": 0.0648193359375, "learning_rate": 6.152849740932642e-07, "loss": 0.0008, "reward": 2.499980330467224, "reward_std": 8.342951218764938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999804496765137, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8523316062176165, "grad_norm": 0.14218270222521923, "kl": 0.094970703125, "learning_rate": 6.150259067357512e-07, "loss": 0.0001, "reward": 2.4999966621398926, "reward_std": 1.1975077143233648e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.854922279792746, "grad_norm": 0.9914738916691916, "kl": 0.05126953125, "learning_rate": 6.147668393782383e-07, "loss": 0.0002, "reward": 2.499989628791809, "reward_std": 7.236959504552942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.8575129533678756, "grad_norm": 6.025132410877975, "kl": 0.106689453125, "learning_rate": 6.145077720207253e-07, "loss": 0.0, "reward": 2.4999608993530273, "reward_std": 1.592787248227978e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999610781669617, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.860103626943005, "grad_norm": 0.9531103089990565, "kl": 0.179443359375, "learning_rate": 6.142487046632124e-07, "loss": 0.0004, "reward": 2.4999879598617554, "reward_std": 5.469433801863488e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 34.1875, "epoch": 3.8626943005181347, "grad_norm": 74.41752048232165, "kl": 0.111328125, "learning_rate": 6.139896373056994e-07, "loss": 0.0008, "reward": 1.9575918316841125, "reward_std": 0.0731734535893338, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4575918018817902, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.865284974093264, "grad_norm": 0.24236503129513232, "kl": 0.13525390625, "learning_rate": 6.137305699481866e-07, "loss": -0.0001, "reward": 2.4999717473983765, "reward_std": 3.366131920756743e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999718070030212, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8678756476683938, "grad_norm": 0.8550638933239846, "kl": 0.01934814453125, "learning_rate": 6.134715025906736e-07, "loss": 0.0014, "reward": 2.4999886751174927, "reward_std": 4.1406764808016305e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8704663212435233, "grad_norm": 0.7776768517865141, "kl": 0.073974609375, "learning_rate": 6.132124352331606e-07, "loss": 0.0008, "reward": 2.4999849796295166, "reward_std": 7.899149238710379e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.873056994818653, "grad_norm": 0.06771922441915233, "kl": 0.0428466796875, "learning_rate": 6.129533678756477e-07, "loss": 0.0, "reward": 2.499994993209839, "reward_std": 1.6687939705661847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8756476683937824, "grad_norm": 7.851391387126275, "kl": 0.072265625, "learning_rate": 6.126943005181347e-07, "loss": -0.0001, "reward": 2.249977469444275, "reward_std": 0.26727293102896965, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749977469444275, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.878238341968912, "grad_norm": 87.89655568024126, "kl": 0.21484375, "learning_rate": 6.124352331606218e-07, "loss": 0.0007, "reward": 1.9994741678237915, "reward_std": 0.00034601625452523876, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499474287033081, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.8808290155440415, "grad_norm": 0.6663699074252637, "kl": 0.097900390625, "learning_rate": 6.121761658031089e-07, "loss": 0.0015, "reward": 2.499893069267273, "reward_std": 9.794798188522691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998931288719177, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.883419689119171, "grad_norm": 0.25014862568425383, "kl": 0.082763671875, "learning_rate": 6.119170984455958e-07, "loss": -0.0003, "reward": 2.4999961853027344, "reward_std": 1.2891987921648251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.8860103626943006, "grad_norm": 10.665526953684378, "kl": 0.099365234375, "learning_rate": 6.116580310880829e-07, "loss": 0.0009, "reward": 1.9997857809066772, "reward_std": 7.206776979273855e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997857809066772, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.88860103626943, "grad_norm": 0.04980586190898546, "kl": 0.12060546875, "learning_rate": 6.113989637305699e-07, "loss": -0.0001, "reward": 2.4999990463256836, "reward_std": 1.079484377441986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.8911917098445596, "grad_norm": 18.820186876883366, "kl": 0.066162109375, "learning_rate": 6.11139896373057e-07, "loss": 0.0003, "reward": 2.4374711513519287, "reward_std": 0.17684629114512518, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937471091747284, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.893782383419689, "grad_norm": 3.44404116041616, "kl": 0.03900146484375, "learning_rate": 6.108808290155441e-07, "loss": -0.0008, "reward": 2.499988555908203, "reward_std": 1.3486970829035272e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.8963730569948187, "grad_norm": 1.62107763982155, "kl": 0.0655517578125, "learning_rate": 6.106217616580311e-07, "loss": 0.0001, "reward": 1.9987400770187378, "reward_std": 2.216845021507652e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987401366233826, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.8989637305699483, "grad_norm": 124.29165863467259, "kl": 0.137939453125, "learning_rate": 6.103626943005181e-07, "loss": 0.0003, "reward": 1.972155511379242, "reward_std": 0.011117634104266472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4721554517745972, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.901554404145078, "grad_norm": 0.08261802332516702, "kl": 0.1021728515625, "learning_rate": 6.101036269430051e-07, "loss": -0.0006, "reward": 2.4999983310699463, "reward_std": 1.373515516434054e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9041450777202074, "grad_norm": 12.020803892650783, "kl": 0.18310546875, "learning_rate": 6.098445595854922e-07, "loss": -0.0, "reward": 1.892823576927185, "reward_std": 0.0007788562370478758, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3928236365318298, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.906735751295337, "grad_norm": 2.675325021055988, "kl": 0.111083984375, "learning_rate": 6.095854922279793e-07, "loss": 0.0007, "reward": 2.4999688863754272, "reward_std": 1.3228021089162212e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999687671661377, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.9093264248704664, "grad_norm": 0.24045941419279215, "kl": 0.0582275390625, "learning_rate": 6.093264248704663e-07, "loss": 0.0012, "reward": 2.4999953508377075, "reward_std": 2.206568026963396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.911917098445596, "grad_norm": 2.138825695550966, "kl": 0.23095703125, "learning_rate": 6.090673575129534e-07, "loss": 0.0013, "reward": 1.9985513091087341, "reward_std": 2.5303734673798317e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498551368713379, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9145077720207255, "grad_norm": 0.27551762876822766, "kl": 0.05364990234375, "learning_rate": 6.088082901554404e-07, "loss": -0.0002, "reward": 2.499990224838257, "reward_std": 3.2414905035693664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.917098445595855, "grad_norm": 0.19544538688642218, "kl": 0.07275390625, "learning_rate": 6.085492227979274e-07, "loss": -0.0, "reward": 2.499996781349182, "reward_std": 3.208890007044829e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9196891191709846, "grad_norm": 12.414224781692207, "kl": 0.08935546875, "learning_rate": 6.082901554404145e-07, "loss": 0.0005, "reward": 2.4999618530273438, "reward_std": 2.7265602511761244e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999618530273438, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9222797927461137, "grad_norm": 3.4897342789029775, "kl": 0.138427734375, "learning_rate": 6.080310880829015e-07, "loss": 0.0022, "reward": 2.499973773956299, "reward_std": 1.4360656422240936e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999737739562988, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9248704663212433, "grad_norm": 0.08180259092393129, "kl": 0.0465087890625, "learning_rate": 6.077720207253886e-07, "loss": -0.0005, "reward": 2.499997138977051, "reward_std": 1.5240450750297896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.927461139896373, "grad_norm": 0.13154426265580085, "kl": 0.0740966796875, "learning_rate": 6.075129533678757e-07, "loss": 0.0, "reward": 2.4999908208847046, "reward_std": 2.662878785031353e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9300518134715023, "grad_norm": 0.1315121423199463, "kl": 0.0531005859375, "learning_rate": 6.072538860103626e-07, "loss": -0.0009, "reward": 2.4999947547912598, "reward_std": 2.8946423071829486e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.932642487046632, "grad_norm": 58.0455571939986, "kl": 0.08984375, "learning_rate": 6.069948186528497e-07, "loss": 0.0007, "reward": 1.997781753540039, "reward_std": 5.9722893638536334e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4977816343307495, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9352331606217614, "grad_norm": 1.461864711452078, "kl": 0.123291015625, "learning_rate": 6.067357512953367e-07, "loss": -0.0006, "reward": 2.499990940093994, "reward_std": 7.963669219179792e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911189079285, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.937823834196891, "grad_norm": 0.6282782327030264, "kl": 0.0810546875, "learning_rate": 6.064766839378238e-07, "loss": 0.0, "reward": 2.4999932050704956, "reward_std": 3.544434548530262e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.9404145077720205, "grad_norm": 2.5276860922958995, "kl": 0.04736328125, "learning_rate": 6.062176165803109e-07, "loss": 0.0007, "reward": 2.4999648332595825, "reward_std": 1.1204292661659565e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999647736549377, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.94300518134715, "grad_norm": 2.275415320612054, "kl": 0.092529296875, "learning_rate": 6.059585492227979e-07, "loss": -0.0004, "reward": 2.4999932050704956, "reward_std": 3.785589910876297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 3.9455958549222796, "grad_norm": 13.146166797259028, "kl": 0.114013671875, "learning_rate": 6.056994818652849e-07, "loss": 0.0003, "reward": 1.9897688031196594, "reward_std": 8.064197891144431e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4897687435150146, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.948186528497409, "grad_norm": 24.718529045802832, "kl": 0.12109375, "learning_rate": 6.054404145077719e-07, "loss": 0.0005, "reward": 2.4374316930770874, "reward_std": 0.1769547753712004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937431514263153, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.9507772020725387, "grad_norm": 21.061746102109815, "kl": 0.31640625, "learning_rate": 6.05181347150259e-07, "loss": 0.0012, "reward": 1.7074499130249023, "reward_std": 0.17726099950959906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2074499130249023, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9533678756476682, "grad_norm": 5.512914461709353, "kl": 0.07568359375, "learning_rate": 6.049222797927461e-07, "loss": 0.0003, "reward": 2.499977469444275, "reward_std": 8.490087566315196e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999775290489197, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9559585492227978, "grad_norm": 0.19557940310647692, "kl": 0.10791015625, "learning_rate": 6.046632124352331e-07, "loss": -0.0, "reward": 2.4999953508377075, "reward_std": 1.4924983133823844e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9585492227979273, "grad_norm": 0.9418662495466913, "kl": 0.0841064453125, "learning_rate": 6.044041450777202e-07, "loss": 0.0003, "reward": 2.4999905824661255, "reward_std": 8.288041271953261e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.961139896373057, "grad_norm": 0.15702479261526286, "kl": 0.078125, "learning_rate": 6.041450777202071e-07, "loss": 0.0002, "reward": 2.4999929666519165, "reward_std": 2.2389545506484865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 3.9637305699481864, "grad_norm": 0.6240138107668911, "kl": 0.142333984375, "learning_rate": 6.038860103626942e-07, "loss": 0.0004, "reward": 2.4999821186065674, "reward_std": 5.765621722275682e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821186065674, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.966321243523316, "grad_norm": 41.97970705755977, "kl": 0.14404296875, "learning_rate": 6.036269430051813e-07, "loss": -0.0, "reward": 1.9241811037063599, "reward_std": 0.011888031393937126, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4241811633110046, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9689119170984455, "grad_norm": 1.3772361336422627, "kl": 0.040283203125, "learning_rate": 6.033678756476683e-07, "loss": 0.0002, "reward": 2.4999881982803345, "reward_std": 7.161868381899694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.971502590673575, "grad_norm": 12.373095287259442, "kl": 0.054443359375, "learning_rate": 6.031088082901554e-07, "loss": -0.0005, "reward": 1.9985239505767822, "reward_std": 9.379618177263183e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985239803791046, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 3.9740932642487046, "grad_norm": 0.3609381581539641, "kl": 0.04931640625, "learning_rate": 6.028497409326426e-07, "loss": 0.0003, "reward": 2.49999463558197, "reward_std": 4.70987015432911e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.976683937823834, "grad_norm": 1.6352814710464176, "kl": 0.13818359375, "learning_rate": 6.025906735751294e-07, "loss": 0.0006, "reward": 2.4999889135360718, "reward_std": 7.197763125077472e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999890327453613, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9792746113989637, "grad_norm": 10.726885310772955, "kl": 0.1240234375, "learning_rate": 6.023316062176166e-07, "loss": 0.0007, "reward": 1.4987960457801819, "reward_std": 9.59022254392039e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9987959861755371, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.981865284974093, "grad_norm": 1.8859934173984643, "kl": 0.11767578125, "learning_rate": 6.020725388601036e-07, "loss": -0.0008, "reward": 2.499971866607666, "reward_std": 6.885548714308243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999718070030212, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.9844559585492227, "grad_norm": 13.781964335609478, "kl": 0.148681640625, "learning_rate": 6.018134715025907e-07, "loss": 0.0005, "reward": 1.4967296719551086, "reward_std": 0.0005246834916761145, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9967296719551086, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 3.9870466321243523, "grad_norm": 2.158598587715824, "kl": 0.114013671875, "learning_rate": 6.015544041450778e-07, "loss": 0.0014, "reward": 1.9998290538787842, "reward_std": 2.493898955435725e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998289942741394, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 3.989637305699482, "grad_norm": 10.058535586911876, "kl": 0.0584716796875, "learning_rate": 6.012953367875648e-07, "loss": 0.0011, "reward": 2.499976873397827, "reward_std": 6.330054816316988e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976634979248, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.9922279792746114, "grad_norm": 2.392574936951284, "kl": 0.105712890625, "learning_rate": 6.010362694300518e-07, "loss": 0.0008, "reward": 2.4999676942825317, "reward_std": 1.3341488056539674e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967634677887, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 3.994818652849741, "grad_norm": 2.1139371378907774, "kl": 0.08349609375, "learning_rate": 6.007772020725388e-07, "loss": 0.0005, "reward": 1.9998367428779602, "reward_std": 1.0321002037017024e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998367428779602, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 3.9974093264248705, "grad_norm": 0.9856443104513883, "kl": 0.061279296875, "learning_rate": 6.005181347150259e-07, "loss": 0.0006, "reward": 2.4999947547912598, "reward_std": 6.6028815126628615e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.0, "grad_norm": 0.31081216513036164, "kl": 0.0810546875, "learning_rate": 6.00259067357513e-07, "loss": -0.0001, "reward": 2.4999886751174927, "reward_std": 3.388945344795502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.0025906735751295, "grad_norm": 4.416994857899252, "kl": 0.0670166015625, "learning_rate": 6e-07, "loss": 0.0017, "reward": 2.499978542327881, "reward_std": 1.6664245094943908e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999784231185913, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.005181347150259, "grad_norm": 27.809646398420966, "kl": 0.1326904296875, "learning_rate": 5.997409326424871e-07, "loss": 0.0003, "reward": 1.4839889407157898, "reward_std": 0.2056162540538935, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9839890897274017, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.007772020725389, "grad_norm": 7.818237071912511, "kl": 0.146484375, "learning_rate": 5.99481865284974e-07, "loss": 0.0007, "reward": 1.9899332523345947, "reward_std": 0.00018400712815491715, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4899333119392395, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.010362694300518, "grad_norm": 5.585671080027885, "kl": 0.10693359375, "learning_rate": 5.992227979274611e-07, "loss": -0.0004, "reward": 2.0624399185180664, "reward_std": 0.17678657060935166, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562440037727356, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.012953367875648, "grad_norm": 6.586635257763988, "kl": 0.09375, "learning_rate": 5.989637305699482e-07, "loss": 0.0009, "reward": 2.4999741315841675, "reward_std": 1.631400783708159e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999741911888123, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.015544041450777, "grad_norm": 4.147550109509986, "kl": 0.097412109375, "learning_rate": 5.987046632124352e-07, "loss": 0.0014, "reward": 2.4998841285705566, "reward_std": 1.604829245138717e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999884009361267, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.018134715025907, "grad_norm": 0.26039902860809827, "kl": 0.087646484375, "learning_rate": 5.984455958549223e-07, "loss": -0.0004, "reward": 2.4999969005584717, "reward_std": 1.6589945062150946e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.020725388601036, "grad_norm": 0.8508742168846786, "kl": 0.1435546875, "learning_rate": 5.981865284974093e-07, "loss": 0.0013, "reward": 2.499996304512024, "reward_std": 4.854603218973352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 4.023316062176166, "grad_norm": 92.86900396070175, "kl": 0.110107421875, "learning_rate": 5.979274611398963e-07, "loss": 0.0013, "reward": 1.9981833100318909, "reward_std": 0.0023745253711240366, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981831908226013, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.025906735751295, "grad_norm": 8.214861237744474, "kl": 0.326171875, "learning_rate": 5.976683937823834e-07, "loss": 0.0014, "reward": 1.9978615641593933, "reward_std": 2.7920706997974776e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978615045547485, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.028497409326425, "grad_norm": 0.03544922385781608, "kl": 0.0819091796875, "learning_rate": 5.974093264248704e-07, "loss": 0.0, "reward": 2.499995470046997, "reward_std": 7.366109571194102e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.0310880829015545, "grad_norm": 1.176865512507275, "kl": 0.107421875, "learning_rate": 5.971502590673575e-07, "loss": -0.0005, "reward": 2.4999771118164062, "reward_std": 1.0998891184499371e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.033678756476684, "grad_norm": 22.13563981067276, "kl": 0.11962890625, "learning_rate": 5.968911917098445e-07, "loss": 0.0001, "reward": 1.9987602233886719, "reward_std": 0.0002699352146464662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987602233886719, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.036269430051814, "grad_norm": 0.8636131421491717, "kl": 0.04815673828125, "learning_rate": 5.966321243523316e-07, "loss": 0.0008, "reward": 2.499994993209839, "reward_std": 5.407045648553321e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 4.038860103626943, "grad_norm": 0.18692970436475073, "kl": 0.1220703125, "learning_rate": 5.963730569948186e-07, "loss": 0.0001, "reward": 2.4999942779541016, "reward_std": 3.4796528325387044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.041450777202073, "grad_norm": 7.547067500247267, "kl": 0.15185546875, "learning_rate": 5.961139896373056e-07, "loss": 0.0017, "reward": 1.9933630228042603, "reward_std": 0.00011093804221218306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4933629035949707, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.044041450777202, "grad_norm": 0.13983521107809369, "kl": 0.068359375, "learning_rate": 5.958549222797927e-07, "loss": 0.0001, "reward": 2.499997615814209, "reward_std": 1.551813397782098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.046632124352332, "grad_norm": 0.5365448327665252, "kl": 0.13134765625, "learning_rate": 5.955958549222798e-07, "loss": -0.0005, "reward": 2.4999958276748657, "reward_std": 2.9013790481258184e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.049222797927461, "grad_norm": 0.8592941042650075, "kl": 0.20751953125, "learning_rate": 5.953367875647668e-07, "loss": 0.0002, "reward": 1.999908447265625, "reward_std": 1.400190649292199e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999085068702698, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.051813471502591, "grad_norm": 25.5094200872479, "kl": 0.108642578125, "learning_rate": 5.950777202072539e-07, "loss": 0.0004, "reward": 2.3749287128448486, "reward_std": 0.23156403409984705, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749286532402039, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.05440414507772, "grad_norm": 0.2480863977760045, "kl": 0.08544921875, "learning_rate": 5.948186528497408e-07, "loss": 0.0, "reward": 2.499994993209839, "reward_std": 3.331746995627327e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.05699481865285, "grad_norm": 38.638483764038256, "kl": 0.14111328125, "learning_rate": 5.945595854922279e-07, "loss": 0.0013, "reward": 1.9990499019622803, "reward_std": 5.9976065131195355e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990499019622803, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.0595854922279795, "grad_norm": 0.6763009839069763, "kl": 0.06640625, "learning_rate": 5.94300518134715e-07, "loss": 0.001, "reward": 2.4999979734420776, "reward_std": 1.7670318470663915e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.062176165803109, "grad_norm": 0.04540838025088783, "kl": 0.0709228515625, "learning_rate": 5.94041450777202e-07, "loss": -0.0003, "reward": 2.499998688697815, "reward_std": 9.241435634521622e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.064766839378239, "grad_norm": 0.09630885029282686, "kl": 0.1148681640625, "learning_rate": 5.937823834196891e-07, "loss": 0.0, "reward": 2.499995470046997, "reward_std": 1.7548972408576446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.067357512953368, "grad_norm": 3.8997047573922203, "kl": 0.168701171875, "learning_rate": 5.935233160621761e-07, "loss": 0.0001, "reward": 2.4998762607574463, "reward_std": 1.6851620330271544e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998763799667358, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.069948186528498, "grad_norm": 1.160117974467065, "kl": 0.0579833984375, "learning_rate": 5.932642487046632e-07, "loss": -0.0003, "reward": 2.499985456466675, "reward_std": 7.409892504028903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999853372573853, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.072538860103627, "grad_norm": 25.26173320582219, "kl": 0.10009765625, "learning_rate": 5.930051813471502e-07, "loss": -0.0006, "reward": 2.499919891357422, "reward_std": 2.8892596560581296e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999201893806458, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.075129533678757, "grad_norm": 0.4202578113393349, "kl": 0.04266357421875, "learning_rate": 5.927461139896372e-07, "loss": 0.0006, "reward": 2.4999955892562866, "reward_std": 2.6960880177284707e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.077720207253886, "grad_norm": 2.7566474096963645, "kl": 0.1005859375, "learning_rate": 5.924870466321243e-07, "loss": -0.0, "reward": 2.4999741315841675, "reward_std": 8.443552246717445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999741315841675, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.080310880829016, "grad_norm": 0.9526217628260819, "kl": 0.0552978515625, "learning_rate": 5.922279792746113e-07, "loss": 0.0005, "reward": 1.9999492168426514, "reward_std": 7.816958259354578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999490976333618, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.082901554404145, "grad_norm": 4.60583281730857, "kl": 0.166015625, "learning_rate": 5.919689119170984e-07, "loss": 0.0007, "reward": 1.9523777961730957, "reward_std": 0.00019758677677828018, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4523777961730957, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.085492227979275, "grad_norm": 9.037126448420663, "kl": 0.074951171875, "learning_rate": 5.917098445595856e-07, "loss": 0.0, "reward": 2.312481999397278, "reward_std": 0.2587793828600411, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124821186065674, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.0880829015544045, "grad_norm": 6.917998303646123, "kl": 0.116943359375, "learning_rate": 5.914507772020724e-07, "loss": 0.0006, "reward": 2.0624269247055054, "reward_std": 0.17679864926412847, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624268651008606, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.090673575129534, "grad_norm": 0.46547614924629965, "kl": 0.03802490234375, "learning_rate": 5.911917098445596e-07, "loss": 0.0, "reward": 1.999932050704956, "reward_std": 4.348954121269344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999321103096008, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.0932642487046635, "grad_norm": 9.223621795378826, "kl": 0.32275390625, "learning_rate": 5.909326424870466e-07, "loss": 0.0013, "reward": 2.3584206104278564, "reward_std": 0.39992353320121765, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8584206104278564, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.095854922279793, "grad_norm": 0.9026910738428424, "kl": 0.0321044921875, "learning_rate": 5.906735751295337e-07, "loss": -0.0011, "reward": 2.4999924898147583, "reward_std": 4.905882633465808e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.098445595854923, "grad_norm": 0.17413724835275787, "kl": 0.060211181640625, "learning_rate": 5.904145077720208e-07, "loss": 0.0004, "reward": 2.4999938011169434, "reward_std": 1.9574302427827206e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.101036269430052, "grad_norm": 0.07000372007439741, "kl": 0.0592041015625, "learning_rate": 5.901554404145078e-07, "loss": 0.0017, "reward": 2.4999983310699463, "reward_std": 1.3967380709800636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.103626943005182, "grad_norm": 8.76569314070839, "kl": 0.1220703125, "learning_rate": 5.898963730569948e-07, "loss": 0.0006, "reward": 1.978507161140442, "reward_std": 0.00020144380607689527, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4785070717334747, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.106217616580311, "grad_norm": 3.4526155202041475, "kl": 0.0562744140625, "learning_rate": 5.896373056994819e-07, "loss": -0.0002, "reward": 1.9999414682388306, "reward_std": 1.137816707341699e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999414682388306, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.108808290155441, "grad_norm": 1.2058907545604165, "kl": 0.0775146484375, "learning_rate": 5.893782383419689e-07, "loss": -0.0007, "reward": 2.499985098838806, "reward_std": 8.535613687854493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985158443451, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.11139896373057, "grad_norm": 1.0745653224141987, "kl": 0.056640625, "learning_rate": 5.89119170984456e-07, "loss": -0.0003, "reward": 2.4999966621398926, "reward_std": 3.5814974808090483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.1139896373057, "grad_norm": 3.6557707207549868, "kl": 0.09423828125, "learning_rate": 5.88860103626943e-07, "loss": 0.0013, "reward": 1.9947112798690796, "reward_std": 8.674551190779312e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4947111904621124, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 4.116580310880829, "grad_norm": 0.5735154793907004, "kl": 0.068603515625, "learning_rate": 5.886010362694301e-07, "loss": -0.0004, "reward": 2.4999921321868896, "reward_std": 3.5775509275026707e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.119170984455959, "grad_norm": 0.448701984904896, "kl": 0.0687713623046875, "learning_rate": 5.883419689119171e-07, "loss": 0.0007, "reward": 2.4999974966049194, "reward_std": 3.076568759752263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.1217616580310885, "grad_norm": 0.5454345824131598, "kl": 0.14794921875, "learning_rate": 5.880829015544041e-07, "loss": 0.0013, "reward": 2.4999910593032837, "reward_std": 4.442941815341328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.124352331606218, "grad_norm": 0.5327503865261822, "kl": 0.066162109375, "learning_rate": 5.878238341968912e-07, "loss": -0.0006, "reward": 2.499996304512024, "reward_std": 3.081041313635069e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.126943005181348, "grad_norm": 3.4598297305021304, "kl": 0.123779296875, "learning_rate": 5.875647668393782e-07, "loss": 0.0005, "reward": 1.9593265652656555, "reward_std": 0.00029503035966627067, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.459326446056366, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.129533678756476, "grad_norm": 1.342845432818344, "kl": 0.122802734375, "learning_rate": 5.873056994818653e-07, "loss": 0.0003, "reward": 2.4999804496765137, "reward_std": 8.85338187117668e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999804496765137, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.132124352331606, "grad_norm": 0.3342285856716859, "kl": 0.03778076171875, "learning_rate": 5.870466321243524e-07, "loss": 0.0013, "reward": 2.4999923706054688, "reward_std": 3.5883319924323587e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.134715025906735, "grad_norm": 0.32819218678633233, "kl": 0.08050537109375, "learning_rate": 5.867875647668393e-07, "loss": -0.0005, "reward": 1.9998695850372314, "reward_std": 6.5313302002323326e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998696744441986, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.137305699481865, "grad_norm": 0.975674366386174, "kl": 0.03753662109375, "learning_rate": 5.865284974093264e-07, "loss": -0.0003, "reward": 2.499990463256836, "reward_std": 6.31794296168664e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905228614807, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.139896373056994, "grad_norm": 2.206039368778935, "kl": 0.069580078125, "learning_rate": 5.862694300518134e-07, "loss": -0.0004, "reward": 2.4999513626098633, "reward_std": 2.4791568193904823e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999513626098633, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.142487046632124, "grad_norm": 0.06605473478520514, "kl": 0.056396484375, "learning_rate": 5.860103626943005e-07, "loss": -0.0001, "reward": 2.499997854232788, "reward_std": 1.7147524147276272e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.1450777202072535, "grad_norm": 10.284093898136161, "kl": 0.0762939453125, "learning_rate": 5.857512953367876e-07, "loss": 0.0011, "reward": 2.499969244003296, "reward_std": 1.5804248960193945e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999969244003296, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.147668393782383, "grad_norm": 1.2757577869505718, "kl": 0.077392578125, "learning_rate": 5.854922279792746e-07, "loss": -0.0003, "reward": 2.499992609024048, "reward_std": 5.06816235201768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.150259067357513, "grad_norm": 2.3679157856969923, "kl": 0.05615234375, "learning_rate": 5.852331606217616e-07, "loss": 0.0006, "reward": 2.4999738931655884, "reward_std": 9.621571734896861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973714351654, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.152849740932642, "grad_norm": 2.4563619146675624, "kl": 0.20751953125, "learning_rate": 5.849740932642486e-07, "loss": 0.0001, "reward": 1.9793579578399658, "reward_std": 5.1970230515507865e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4793579876422882, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.155440414507772, "grad_norm": 6.967935707116754, "kl": 0.099853515625, "learning_rate": 5.847150259067357e-07, "loss": 0.0011, "reward": 2.4999468326568604, "reward_std": 2.864020916604204e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999467134475708, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.158031088082901, "grad_norm": 0.6872351586519199, "kl": 0.0625, "learning_rate": 5.844559585492228e-07, "loss": 0.0011, "reward": 2.4999942779541016, "reward_std": 4.553016879071947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.160621761658031, "grad_norm": 0.34463471614119895, "kl": 0.1339111328125, "learning_rate": 5.841968911917098e-07, "loss": 0.001, "reward": 2.4999847412109375, "reward_std": 4.717058914138761e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984622001648, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.16321243523316, "grad_norm": 91.61056384872836, "kl": 0.1658935546875, "learning_rate": 5.839378238341969e-07, "loss": 0.0015, "reward": 1.8187761306762695, "reward_std": 0.003314054640895847, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.318775862455368, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.16580310880829, "grad_norm": 2.6667578967866614, "kl": 0.150390625, "learning_rate": 5.836787564766839e-07, "loss": 0.0001, "reward": 1.6774897575378418, "reward_std": 0.00026905265212917584, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1774897575378418, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.168393782383419, "grad_norm": 0.26074399657417957, "kl": 0.1455078125, "learning_rate": 5.834196891191709e-07, "loss": -0.0004, "reward": 2.499993324279785, "reward_std": 4.172011131231557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.170984455958549, "grad_norm": 1.3050137036227154, "kl": 0.080322265625, "learning_rate": 5.83160621761658e-07, "loss": -0.0004, "reward": 2.4999701976776123, "reward_std": 9.359884984405653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999701380729675, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.1735751295336785, "grad_norm": 9.592629986383619, "kl": 0.19873046875, "learning_rate": 5.82901554404145e-07, "loss": 0.0006, "reward": 1.8040056228637695, "reward_std": 0.0007750826889605378, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3040056824684143, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.176165803108808, "grad_norm": 1.8077326956934356, "kl": 0.032562255859375, "learning_rate": 5.826424870466321e-07, "loss": 0.0004, "reward": 2.4999879598617554, "reward_std": 6.898350022765953e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879598617554, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.178756476683938, "grad_norm": 0.4217824998854284, "kl": 0.11962890625, "learning_rate": 5.823834196891192e-07, "loss": -0.0007, "reward": 2.4999979734420776, "reward_std": 1.6396128614815098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.181347150259067, "grad_norm": 5.843529025065718, "kl": 0.18505859375, "learning_rate": 5.821243523316061e-07, "loss": 0.0006, "reward": 2.1872295141220093, "reward_std": 0.25879292379286767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6872295141220093, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.183937823834197, "grad_norm": 0.11885772684141718, "kl": 0.14208984375, "learning_rate": 5.818652849740932e-07, "loss": -0.0, "reward": 2.499978542327881, "reward_std": 1.7583438989277056e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999785423278809, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.186528497409326, "grad_norm": 31.155695422064454, "kl": 0.0616455078125, "learning_rate": 5.816062176165802e-07, "loss": 0.0004, "reward": 1.8776912689208984, "reward_std": 0.0009670431447830197, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3776913285255432, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.189119170984456, "grad_norm": 7.954052111218322, "kl": 0.1370849609375, "learning_rate": 5.813471502590673e-07, "loss": 0.0004, "reward": 1.9713550209999084, "reward_std": 0.00029388689586085093, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4713551104068756, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.191709844559585, "grad_norm": 2.341547440190843, "kl": 0.07275390625, "learning_rate": 5.810880829015544e-07, "loss": -0.0003, "reward": 2.499981164932251, "reward_std": 7.422139901791525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999810457229614, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.194300518134715, "grad_norm": 0.7659936902044249, "kl": 0.14453125, "learning_rate": 5.808290155440414e-07, "loss": 0.0009, "reward": 2.499971389770508, "reward_std": 9.378344088872836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999713897705078, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.196891191709844, "grad_norm": 20.51146761135554, "kl": 0.2236328125, "learning_rate": 5.805699481865284e-07, "loss": 0.0012, "reward": 1.995547890663147, "reward_std": 0.00017465949485995225, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4955478310585022, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.199481865284974, "grad_norm": 10.469760481460995, "kl": 0.123779296875, "learning_rate": 5.803108808290154e-07, "loss": 0.001, "reward": 2.499975800514221, "reward_std": 6.026807170655957e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999756813049316, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.2020725388601035, "grad_norm": 3.5205897577474934, "kl": 0.14697265625, "learning_rate": 5.800518134715026e-07, "loss": 0.0005, "reward": 1.8850517272949219, "reward_std": 9.285745943543589e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3850517272949219, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.204663212435233, "grad_norm": 9.309249096427962, "kl": 0.076171875, "learning_rate": 5.797927461139897e-07, "loss": 0.0006, "reward": 2.4374871253967285, "reward_std": 0.17679068280077104, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374873042106628, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.2072538860103625, "grad_norm": 43.70409482839093, "kl": 0.137451171875, "learning_rate": 5.795336787564767e-07, "loss": 0.0006, "reward": 1.990125060081482, "reward_std": 0.0005508052640834649, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.490125060081482, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.209844559585492, "grad_norm": 17.308240886568655, "kl": 0.15087890625, "learning_rate": 5.792746113989638e-07, "loss": 0.0007, "reward": 2.249952495098114, "reward_std": 0.2672914825616317, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499526143074036, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.212435233160622, "grad_norm": 2.479283648758274, "kl": 0.098388671875, "learning_rate": 5.790155440414507e-07, "loss": 0.0005, "reward": 2.4999747276306152, "reward_std": 1.5346635336754844e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999746680259705, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.215025906735751, "grad_norm": 0.7766965270088614, "kl": 0.138427734375, "learning_rate": 5.787564766839378e-07, "loss": -0.0005, "reward": 2.4999606609344482, "reward_std": 1.2539077715700842e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999608397483826, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.217616580310881, "grad_norm": 2.660088851994437, "kl": 0.0633544921875, "learning_rate": 5.784974093264249e-07, "loss": 0.0003, "reward": 1.9996073246002197, "reward_std": 2.0539871457003755e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996073842048645, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.22020725388601, "grad_norm": 4.827196641300278, "kl": 0.1435546875, "learning_rate": 5.782383419689119e-07, "loss": 0.0007, "reward": 2.1874724626541138, "reward_std": 0.2587815834192497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874723434448242, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.22279792746114, "grad_norm": 2.5232709500050703, "kl": 0.16552734375, "learning_rate": 5.77979274611399e-07, "loss": 0.0005, "reward": 1.9996851682662964, "reward_std": 2.2433102458307985e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996852278709412, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.225388601036269, "grad_norm": 0.35144749238925405, "kl": 0.1083984375, "learning_rate": 5.777202072538861e-07, "loss": 0.0014, "reward": 2.499971389770508, "reward_std": 5.214265343056468e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999971330165863, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.227979274611399, "grad_norm": 3.317477626526753, "kl": 0.11572265625, "learning_rate": 5.77461139896373e-07, "loss": 0.0002, "reward": 2.499928116798401, "reward_std": 1.4514650899855042e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999281764030457, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.230569948186528, "grad_norm": 5.4803308991111015, "kl": 0.107666015625, "learning_rate": 5.772020725388601e-07, "loss": 0.0005, "reward": 2.499990463256836, "reward_std": 1.717954683044809e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.233160621761658, "grad_norm": 11.144472998625615, "kl": 0.14404296875, "learning_rate": 5.769430051813471e-07, "loss": 0.0009, "reward": 2.4999709129333496, "reward_std": 1.3828344890498556e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999709129333496, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.2357512953367875, "grad_norm": 1.3764847322182971, "kl": 0.1708984375, "learning_rate": 5.766839378238342e-07, "loss": -0.0001, "reward": 2.4999645948410034, "reward_std": 8.748760137677891e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999646544456482, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.238341968911917, "grad_norm": 2.0545668901315954, "kl": 0.07177734375, "learning_rate": 5.764248704663213e-07, "loss": 0.0011, "reward": 2.4999762773513794, "reward_std": 1.1046366580558242e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976098537445, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.240932642487047, "grad_norm": 0.33295871541623595, "kl": 0.07666015625, "learning_rate": 5.761658031088083e-07, "loss": -0.0001, "reward": 2.499983310699463, "reward_std": 2.8792962893930962e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999833703041077, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.243523316062176, "grad_norm": 0.1594739117810296, "kl": 0.074462890625, "learning_rate": 5.759067357512953e-07, "loss": 0.002, "reward": 2.49999737739563, "reward_std": 2.8450117497413885e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.246113989637306, "grad_norm": 0.7947963589123364, "kl": 0.137939453125, "learning_rate": 5.756476683937823e-07, "loss": 0.0014, "reward": 2.4999921321868896, "reward_std": 6.424090543077909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.248704663212435, "grad_norm": 0.05333305068135456, "kl": 0.0377197265625, "learning_rate": 5.753886010362694e-07, "loss": 0.0007, "reward": 2.4999979734420776, "reward_std": 1.1159213784139865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.251295336787565, "grad_norm": 0.22430505547001248, "kl": 0.089599609375, "learning_rate": 5.751295336787565e-07, "loss": 0.0007, "reward": 2.499995231628418, "reward_std": 3.761320954254188e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.253886010362694, "grad_norm": 0.10671235522834856, "kl": 0.112060546875, "learning_rate": 5.748704663212435e-07, "loss": 0.0012, "reward": 2.49999737739563, "reward_std": 2.6441645104569034e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.256476683937824, "grad_norm": 0.07172136716811715, "kl": 0.072265625, "learning_rate": 5.746113989637306e-07, "loss": -0.001, "reward": 2.499998688697815, "reward_std": 1.0090398916418053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.259067357512953, "grad_norm": 39.51998085908117, "kl": 0.07958984375, "learning_rate": 5.743523316062175e-07, "loss": 0.0013, "reward": 1.965232253074646, "reward_std": 0.014019200414736588, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4652320742607117, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.261658031088083, "grad_norm": 0.35216612170084327, "kl": 0.0677490234375, "learning_rate": 5.740932642487046e-07, "loss": -0.0012, "reward": 2.499987483024597, "reward_std": 5.831992211824399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876022338867, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.2642487046632125, "grad_norm": 0.35681831415677384, "kl": 0.1038818359375, "learning_rate": 5.738341968911917e-07, "loss": 0.0014, "reward": 1.9999709129333496, "reward_std": 5.617907589794413e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999707639217377, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.266839378238342, "grad_norm": 4.694867300286028, "kl": 0.109130859375, "learning_rate": 5.735751295336787e-07, "loss": -0.0002, "reward": 2.4997060298919678, "reward_std": 4.285841896489728e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997060894966125, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.269430051813472, "grad_norm": 6.744128030216046, "kl": 0.0557861328125, "learning_rate": 5.733160621761658e-07, "loss": 0.0008, "reward": 2.4999911785125732, "reward_std": 8.113884405247518e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.272020725388601, "grad_norm": 1.9999573719327954, "kl": 0.09765625, "learning_rate": 5.730569948186528e-07, "loss": 0.0014, "reward": 2.4999910593032837, "reward_std": 6.217932138952165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.274611398963731, "grad_norm": 0.4288660620903904, "kl": 0.1357421875, "learning_rate": 5.727979274611398e-07, "loss": 0.0005, "reward": 2.4999947547912598, "reward_std": 2.7621567824098747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.27720207253886, "grad_norm": 0.1853425641084272, "kl": 0.15185546875, "learning_rate": 5.725388601036269e-07, "loss": 0.0008, "reward": 2.499994993209839, "reward_std": 4.175905246484035e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999950528144836, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.27979274611399, "grad_norm": 6.105633266796804, "kl": 0.13330078125, "learning_rate": 5.722797927461139e-07, "loss": 0.0007, "reward": 2.4998953342437744, "reward_std": 4.475373282275541e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999895453453064, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.282383419689119, "grad_norm": 6.399311567115233, "kl": 0.106201171875, "learning_rate": 5.72020725388601e-07, "loss": -0.0, "reward": 1.720837116241455, "reward_std": 0.000405950486083384, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2208371758460999, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.284974093264249, "grad_norm": 0.7050338628178441, "kl": 0.0782470703125, "learning_rate": 5.717616580310881e-07, "loss": 0.0008, "reward": 2.499989867210388, "reward_std": 5.765255764345056e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.287564766839378, "grad_norm": 1.6388946383431446, "kl": 0.1217041015625, "learning_rate": 5.715025906735751e-07, "loss": -0.0005, "reward": 2.499995470046997, "reward_std": 4.27052509621717e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 33.5625, "epoch": 4.290155440414508, "grad_norm": 1350.3492618723762, "kl": 0.13232421875, "learning_rate": 5.712435233160621e-07, "loss": -0.0004, "reward": 2.4347686767578125, "reward_std": 0.18449809011832485, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9347687363624573, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.2927461139896375, "grad_norm": 1.8772716626489356, "kl": 0.121337890625, "learning_rate": 5.709844559585491e-07, "loss": 0.0003, "reward": 2.4999595880508423, "reward_std": 1.4377440834323352e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999597668647766, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.295336787564767, "grad_norm": 0.7265465720578551, "kl": 0.0733642578125, "learning_rate": 5.707253886010362e-07, "loss": 0.0009, "reward": 2.4999961853027344, "reward_std": 3.3018006888596574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 4.2979274611398965, "grad_norm": 0.13537141286032547, "kl": 0.06365966796875, "learning_rate": 5.704663212435233e-07, "loss": 0.0016, "reward": 2.499996304512024, "reward_std": 2.452327521496045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.300518134715026, "grad_norm": 0.16293186208052443, "kl": 0.16015625, "learning_rate": 5.702072538860103e-07, "loss": 0.0014, "reward": 2.499996781349182, "reward_std": 3.5358443426503072e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.303108808290156, "grad_norm": 3.2712378822545487, "kl": 0.0635986328125, "learning_rate": 5.699481865284974e-07, "loss": 0.001, "reward": 2.4999812841415405, "reward_std": 1.4047294826013967e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812245368958, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.305699481865285, "grad_norm": 1.4783695522043987, "kl": 0.0579833984375, "learning_rate": 5.696891191709843e-07, "loss": 0.0015, "reward": 2.499969244003296, "reward_std": 7.840116268198472e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999969244003296, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.308290155440415, "grad_norm": 8.911939478469888, "kl": 0.091796875, "learning_rate": 5.694300518134714e-07, "loss": -0.0001, "reward": 1.9984776377677917, "reward_std": 0.00014535136051563313, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984778463840485, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.310880829015544, "grad_norm": 66.86633687724522, "kl": 0.121826171875, "learning_rate": 5.691709844559586e-07, "loss": 0.0005, "reward": 1.328099548816681, "reward_std": 0.002699408984881302, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8280995786190033, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.313471502590674, "grad_norm": 0.3071995792457166, "kl": 0.074462890625, "learning_rate": 5.689119170984456e-07, "loss": 0.0006, "reward": 2.4999983310699463, "reward_std": 1.2217776657053037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.316062176165803, "grad_norm": 0.26425745698842634, "kl": 0.058349609375, "learning_rate": 5.686528497409327e-07, "loss": 0.0017, "reward": 2.4999985694885254, "reward_std": 1.6321747580150259e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 4.318652849740933, "grad_norm": 55.27313495909315, "kl": 0.104248046875, "learning_rate": 5.683937823834197e-07, "loss": 0.0004, "reward": 2.186655044555664, "reward_std": 0.259467652848798, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6866552233695984, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.321243523316062, "grad_norm": 2.1075119889377203, "kl": 0.09857177734375, "learning_rate": 5.681347150259067e-07, "loss": 0.0007, "reward": 2.49995219707489, "reward_std": 1.2156680668340414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999952256679535, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.323834196891192, "grad_norm": 17.017277666589393, "kl": 0.07568359375, "learning_rate": 5.678756476683938e-07, "loss": 0.0001, "reward": 2.4374330043792725, "reward_std": 0.17682045467972785, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374327659606934, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 4.3264248704663215, "grad_norm": 4.34776966293989, "kl": 0.194091796875, "learning_rate": 5.676165803108808e-07, "loss": 0.0008, "reward": 1.9974610805511475, "reward_std": 7.275603820744436e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4974610209465027, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.329015544041451, "grad_norm": 1.7935625393089787, "kl": 0.08203125, "learning_rate": 5.673575129533679e-07, "loss": 0.0003, "reward": 2.499973773956299, "reward_std": 1.5526404695265228e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999738335609436, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.331606217616581, "grad_norm": 0.1589387687491885, "kl": 0.1483154296875, "learning_rate": 5.670984455958549e-07, "loss": 0.0019, "reward": 2.49999737739563, "reward_std": 1.8638985466168378e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.33419689119171, "grad_norm": 6.0483060424583, "kl": 0.1376953125, "learning_rate": 5.66839378238342e-07, "loss": 0.0001, "reward": 2.4999327659606934, "reward_std": 2.656341735018941e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999327063560486, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.33678756476684, "grad_norm": 104.87761967629571, "kl": 0.05828857421875, "learning_rate": 5.66580310880829e-07, "loss": -0.0004, "reward": 1.9709819555282593, "reward_std": 0.010413831605660562, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4709819853305817, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.339378238341969, "grad_norm": 4.0079432532628, "kl": 0.071929931640625, "learning_rate": 5.66321243523316e-07, "loss": 0.0003, "reward": 2.4999886751174927, "reward_std": 6.66723258291313e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.341968911917099, "grad_norm": 3.624018531195769, "kl": 0.122802734375, "learning_rate": 5.660621761658031e-07, "loss": 0.0006, "reward": 1.9980930089950562, "reward_std": 4.907955940325337e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498093068599701, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.344559585492228, "grad_norm": 5.078483454740987, "kl": 0.0625, "learning_rate": 5.658031088082901e-07, "loss": 0.0003, "reward": 2.499971628189087, "reward_std": 2.3246760861184157e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999971628189087, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.347150259067358, "grad_norm": 14.030459045656572, "kl": 0.1376953125, "learning_rate": 5.655440414507772e-07, "loss": 0.0015, "reward": 2.499985456466675, "reward_std": 1.0048452622868354e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999853372573853, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.349740932642487, "grad_norm": 2.3457785822766435, "kl": 0.08740234375, "learning_rate": 5.652849740932643e-07, "loss": 0.0012, "reward": 2.499953508377075, "reward_std": 2.1980561541568022e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999533295631409, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.352331606217617, "grad_norm": 0.7997081006452124, "kl": 0.1173095703125, "learning_rate": 5.650259067357512e-07, "loss": 0.0013, "reward": 2.4999847412109375, "reward_std": 8.641696695121937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999848008155823, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.3549222797927465, "grad_norm": 4.03442756705084, "kl": 0.30810546875, "learning_rate": 5.647668393782383e-07, "loss": 0.0015, "reward": 1.9937379360198975, "reward_std": 8.287364062198321e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4937377572059631, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.357512953367876, "grad_norm": 0.6380754845600469, "kl": 0.0714111328125, "learning_rate": 5.645077720207254e-07, "loss": 0.0004, "reward": 2.499994158744812, "reward_std": 2.530499386921292e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.360103626943006, "grad_norm": 0.8457500822384119, "kl": 0.079833984375, "learning_rate": 5.642487046632124e-07, "loss": -0.0004, "reward": 2.4999916553497314, "reward_std": 5.998976689625124e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.362694300518135, "grad_norm": 0.3529816341740026, "kl": 0.0849609375, "learning_rate": 5.639896373056995e-07, "loss": 0.0005, "reward": 2.499996542930603, "reward_std": 3.0697000283907983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 33.0, "epoch": 4.365284974093265, "grad_norm": 1.158518374683692, "kl": 0.0728759765625, "learning_rate": 5.637305699481865e-07, "loss": 0.0003, "reward": 2.499997854232788, "reward_std": 1.4521563684866123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.367875647668393, "grad_norm": 0.3591072319522074, "kl": 0.13916015625, "learning_rate": 5.634715025906735e-07, "loss": -0.0003, "reward": 2.499972701072693, "reward_std": 2.5953077056328766e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999728202819824, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.370466321243523, "grad_norm": 3.7760805161238484, "kl": 0.13623046875, "learning_rate": 5.632124352331606e-07, "loss": 0.0005, "reward": 1.9998499155044556, "reward_std": 3.568363376871275e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998499751091003, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.373056994818652, "grad_norm": 1.1462186690346785, "kl": 0.185791015625, "learning_rate": 5.629533678756476e-07, "loss": 0.0003, "reward": 2.4999940395355225, "reward_std": 4.879303219240683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.375647668393782, "grad_norm": 0.5031808372050066, "kl": 0.10498046875, "learning_rate": 5.626943005181347e-07, "loss": 0.001, "reward": 2.499981164932251, "reward_std": 7.235457815113477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812245368958, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.3782383419689115, "grad_norm": 2.6709955823662885, "kl": 0.09228515625, "learning_rate": 5.624352331606217e-07, "loss": 0.0005, "reward": 1.9993645548820496, "reward_std": 3.636389965322451e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993645548820496, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.380829015544041, "grad_norm": 5.223258851660594, "kl": 0.070556640625, "learning_rate": 5.621761658031088e-07, "loss": 0.0008, "reward": 1.7910540103912354, "reward_std": 0.00038991638120933203, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2910539507865906, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.383419689119171, "grad_norm": 11.48828302017316, "kl": 0.1318359375, "learning_rate": 5.619170984455959e-07, "loss": 0.0008, "reward": 2.059893310070038, "reward_std": 0.17790578648964583, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5598931908607483, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.3860103626943, "grad_norm": 4.044784153342431, "kl": 0.097900390625, "learning_rate": 5.616580310880828e-07, "loss": 0.0004, "reward": 2.4999879598617554, "reward_std": 1.4329050429751078e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.38860103626943, "grad_norm": 59.147150064031706, "kl": 0.14794921875, "learning_rate": 5.613989637305699e-07, "loss": 0.0007, "reward": 2.0000797510147095, "reward_std": 0.41397102706878286, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.50007963180542, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.391191709844559, "grad_norm": 0.13230641410232852, "kl": 0.0547637939453125, "learning_rate": 5.611398963730569e-07, "loss": -0.0001, "reward": 2.499997854232788, "reward_std": 1.8281344296156021e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.393782383419689, "grad_norm": 0.7078995609936762, "kl": 0.058197021484375, "learning_rate": 5.60880829015544e-07, "loss": 0.0009, "reward": 2.499995708465576, "reward_std": 4.362423453585507e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.396373056994818, "grad_norm": 0.04911042800504467, "kl": 0.0479736328125, "learning_rate": 5.606217616580311e-07, "loss": 0.0008, "reward": 2.499998927116394, "reward_std": 1.2129205799737974e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.398963730569948, "grad_norm": 4.154814346568021, "kl": 0.221435546875, "learning_rate": 5.60362694300518e-07, "loss": 0.0006, "reward": 1.9961839318275452, "reward_std": 7.138645219129103e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4961840510368347, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.401554404145077, "grad_norm": 0.9624370825251787, "kl": 0.1119384765625, "learning_rate": 5.601036269430051e-07, "loss": 0.0007, "reward": 2.4999773502349854, "reward_std": 6.098519520492118e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.404145077720207, "grad_norm": 0.25643874556879054, "kl": 0.1737060546875, "learning_rate": 5.598445595854921e-07, "loss": 0.0014, "reward": 2.4999911785125732, "reward_std": 4.062527125370252e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.4067357512953365, "grad_norm": 0.6283293540103942, "kl": 0.06787109375, "learning_rate": 5.595854922279792e-07, "loss": 0.0006, "reward": 2.4999942779541016, "reward_std": 3.5235444784120773e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.409326424870466, "grad_norm": 3.840606502892612, "kl": 0.065185546875, "learning_rate": 5.593264248704663e-07, "loss": 0.0008, "reward": 1.9998810291290283, "reward_std": 3.656877743196674e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998809695243835, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.4119170984455955, "grad_norm": 0.46864875560041785, "kl": 0.10498046875, "learning_rate": 5.590673575129533e-07, "loss": 0.0004, "reward": 2.4999979734420776, "reward_std": 1.638020250993577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.414507772020725, "grad_norm": 4.529201394053517, "kl": 0.086669921875, "learning_rate": 5.588082901554404e-07, "loss": 0.0011, "reward": 2.499974489212036, "reward_std": 1.2205046800772834e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999744892120361, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.417098445595855, "grad_norm": 24.52666576882307, "kl": 0.09857177734375, "learning_rate": 5.585492227979274e-07, "loss": 0.0004, "reward": 2.4998916387557983, "reward_std": 0.00010237504315568913, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998915791511536, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.419689119170984, "grad_norm": 5.3011261825787885, "kl": 0.1708984375, "learning_rate": 5.582901554404144e-07, "loss": 0.0007, "reward": 2.4999430179595947, "reward_std": 2.5337361478250386e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999431371688843, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.422279792746114, "grad_norm": 0.47976953579162007, "kl": 0.086669921875, "learning_rate": 5.580310880829016e-07, "loss": 0.0008, "reward": 2.4999910593032837, "reward_std": 3.6727846008943743e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.424870466321243, "grad_norm": 4.4050511109149015, "kl": 0.0736083984375, "learning_rate": 5.577720207253886e-07, "loss": -0.0, "reward": 2.4999529123306274, "reward_std": 2.021394442408564e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999529123306274, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.427461139896373, "grad_norm": 0.7492906569364401, "kl": 0.050018310546875, "learning_rate": 5.575129533678757e-07, "loss": 0.0005, "reward": 2.4999948740005493, "reward_std": 5.412219252320938e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.430051813471502, "grad_norm": 6.600957887693868, "kl": 0.0947265625, "learning_rate": 5.572538860103628e-07, "loss": 0.0002, "reward": 2.4999550580978394, "reward_std": 2.978387965413276e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999549984931946, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.432642487046632, "grad_norm": 1.4863056551898715, "kl": 0.032684326171875, "learning_rate": 5.569948186528497e-07, "loss": -0.0, "reward": 2.4999955892562866, "reward_std": 3.5129194202454528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.435233160621761, "grad_norm": 0.3018152184523658, "kl": 0.07470703125, "learning_rate": 5.567357512953368e-07, "loss": 0.0003, "reward": 2.499995231628418, "reward_std": 4.410703013491002e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.437823834196891, "grad_norm": 14.125489467811978, "kl": 0.109619140625, "learning_rate": 5.564766839378238e-07, "loss": 0.0004, "reward": 2.4374903440475464, "reward_std": 0.1767951499477931, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374903440475464, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.4404145077720205, "grad_norm": 4.314532829435389, "kl": 0.186279296875, "learning_rate": 5.562176165803109e-07, "loss": 0.0014, "reward": 2.4999916553497314, "reward_std": 1.18362083867396e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.44300518134715, "grad_norm": 67.13593042781774, "kl": 0.12451171875, "learning_rate": 5.55958549222798e-07, "loss": -0.0002, "reward": 2.062413215637207, "reward_std": 0.17680161040880193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624132752418518, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.44559585492228, "grad_norm": 6.316466242667441, "kl": 0.0654296875, "learning_rate": 5.55699481865285e-07, "loss": 0.0005, "reward": 1.9998271465301514, "reward_std": 2.2908495338924695e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998270273208618, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.448186528497409, "grad_norm": 73.18929649772944, "kl": 0.0538330078125, "learning_rate": 5.55440414507772e-07, "loss": 0.001, "reward": 2.124595046043396, "reward_std": 0.23170430760046656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6245948672294617, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.450777202072539, "grad_norm": 9.060460988652666, "kl": 2.470458984375, "learning_rate": 5.55181347150259e-07, "loss": 0.0107, "reward": 1.9999070167541504, "reward_std": 1.519389070381294e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999069571495056, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.453367875647668, "grad_norm": 0.40007900210739733, "kl": 0.046875, "learning_rate": 5.549222797927461e-07, "loss": 0.0009, "reward": 2.4999955892562866, "reward_std": 3.0979892926552566e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.455958549222798, "grad_norm": 4.9909218232330534, "kl": 0.092529296875, "learning_rate": 5.546632124352332e-07, "loss": -0.0001, "reward": 2.4999775886535645, "reward_std": 9.459797297495243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999775886535645, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 4.458549222797927, "grad_norm": 4.247828911291178, "kl": 0.150634765625, "learning_rate": 5.544041450777202e-07, "loss": 0.001, "reward": 1.9988590478897095, "reward_std": 5.002601136538942e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49885892868042, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.461139896373057, "grad_norm": 0.4149888892291473, "kl": 0.063232421875, "learning_rate": 5.541450777202073e-07, "loss": -0.0, "reward": 2.499990463256836, "reward_std": 4.090190032002283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.463730569948186, "grad_norm": 0.6074406266025194, "kl": 0.134765625, "learning_rate": 5.538860103626942e-07, "loss": -0.0004, "reward": 2.4999852180480957, "reward_std": 5.665189860337705e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852776527405, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.466321243523316, "grad_norm": 2.9292397019683807, "kl": 0.0535888671875, "learning_rate": 5.536269430051813e-07, "loss": -0.0002, "reward": 2.4999821186065674, "reward_std": 1.273383224997815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821186065674, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.4689119170984455, "grad_norm": 0.21694732216567852, "kl": 0.096435546875, "learning_rate": 5.533678756476684e-07, "loss": -0.0002, "reward": 2.499993085861206, "reward_std": 4.882911525783129e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.471502590673575, "grad_norm": 0.6946692384181016, "kl": 0.065673828125, "learning_rate": 5.531088082901554e-07, "loss": -0.0002, "reward": 1.9991164207458496, "reward_std": 1.3185736406740034e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991165101528168, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.474093264248705, "grad_norm": 0.30270239547969036, "kl": 0.115234375, "learning_rate": 5.528497409326425e-07, "loss": -0.0007, "reward": 2.4999929666519165, "reward_std": 2.55716349784052e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.476683937823834, "grad_norm": 1.4196766797275246, "kl": 0.114013671875, "learning_rate": 5.525906735751296e-07, "loss": 0.0003, "reward": 2.4999927282333374, "reward_std": 5.371910845042294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.479274611398964, "grad_norm": 0.4853911491627105, "kl": 0.107177734375, "learning_rate": 5.523316062176165e-07, "loss": 0.0002, "reward": 2.4999895095825195, "reward_std": 6.176342139951885e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999894499778748, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.481865284974093, "grad_norm": 0.36009955780918074, "kl": 0.1220703125, "learning_rate": 5.520725388601036e-07, "loss": 0.0018, "reward": 2.4999899864196777, "reward_std": 5.2746761411981424e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898672103882, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.484455958549223, "grad_norm": 8.971639198802148, "kl": 0.1143798828125, "learning_rate": 5.518134715025906e-07, "loss": 0.0003, "reward": 1.865371286869049, "reward_std": 0.0008516380319747441, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.365371197462082, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.487046632124352, "grad_norm": 1.8322105910803868, "kl": 0.082763671875, "learning_rate": 5.515544041450777e-07, "loss": -0.0001, "reward": 1.9987614750862122, "reward_std": 3.63909566658549e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498761534690857, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.489637305699482, "grad_norm": 0.16708755295284186, "kl": 0.05419921875, "learning_rate": 5.512953367875648e-07, "loss": 0.0011, "reward": 2.4999920129776, "reward_std": 3.906192205249681e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.492227979274611, "grad_norm": 0.5255056644256783, "kl": 0.0484619140625, "learning_rate": 5.510362694300518e-07, "loss": 0.0011, "reward": 2.4999929666519165, "reward_std": 5.205781008044141e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.494818652849741, "grad_norm": 22.47454299777541, "kl": 0.059326171875, "learning_rate": 5.507772020725388e-07, "loss": -0.0002, "reward": 1.9862151145935059, "reward_std": 0.0003306973626422405, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4862149953842163, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 4.4974093264248705, "grad_norm": 7.5925048290151045, "kl": 0.0758056640625, "learning_rate": 5.505181347150258e-07, "loss": 0.0008, "reward": 2.499931812286377, "reward_std": 1.8098790064868808e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999316930770874, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.5, "grad_norm": 13.840509146825339, "kl": 0.1650390625, "learning_rate": 5.502590673575129e-07, "loss": 0.0004, "reward": 2.3123154640197754, "reward_std": 0.2588353480641672, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8123154640197754, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.5025906735751295, "grad_norm": 1.4649583984022538, "kl": 0.0894775390625, "learning_rate": 5.5e-07, "loss": -0.0006, "reward": 1.9937479496002197, "reward_std": 5.249552850727923e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493748128414154, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.505181347150259, "grad_norm": 0.16107007715496194, "kl": 0.099365234375, "learning_rate": 5.49740932642487e-07, "loss": -0.0008, "reward": 2.499993681907654, "reward_std": 2.730508754211769e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.507772020725389, "grad_norm": 0.36454592636768024, "kl": 0.0196533203125, "learning_rate": 5.494818652849741e-07, "loss": 0.0007, "reward": 2.499996542930603, "reward_std": 2.138415709396213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.510362694300518, "grad_norm": 1.2598692413619204, "kl": 0.17138671875, "learning_rate": 5.49222797927461e-07, "loss": 0.002, "reward": 2.4999940395355225, "reward_std": 3.844752654913464e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.512953367875648, "grad_norm": 14.141894308075937, "kl": 0.0623779296875, "learning_rate": 5.489637305699481e-07, "loss": 0.0005, "reward": 1.9988062381744385, "reward_std": 0.0001535951200821728, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988062381744385, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.515544041450777, "grad_norm": 1.8988375217965705, "kl": 0.07421875, "learning_rate": 5.487046632124352e-07, "loss": -0.0004, "reward": 2.4999899864196777, "reward_std": 6.828418918303214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999900460243225, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.518134715025907, "grad_norm": 0.4834270230295165, "kl": 0.076904296875, "learning_rate": 5.484455958549222e-07, "loss": -0.0001, "reward": 2.4999924898147583, "reward_std": 3.5415099546298734e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.520725388601036, "grad_norm": 5.24318124876931, "kl": 0.08544921875, "learning_rate": 5.481865284974093e-07, "loss": 0.0004, "reward": 2.499962091445923, "reward_std": 2.1505103632080136e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999619126319885, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.523316062176166, "grad_norm": 2.2138667424204512, "kl": 0.06494140625, "learning_rate": 5.479274611398963e-07, "loss": -0.0005, "reward": 2.4999648332595825, "reward_std": 1.820845932343218e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999648332595825, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.525906735751295, "grad_norm": 0.12014376424380266, "kl": 0.048828125, "learning_rate": 5.476683937823833e-07, "loss": 0.0003, "reward": 2.4999959468841553, "reward_std": 2.5282397473347373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.528497409326425, "grad_norm": 6.810906331642244, "kl": 0.1552734375, "learning_rate": 5.474093264248704e-07, "loss": 0.001, "reward": 1.3919540643692017, "reward_std": 0.0007581018317068811, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8919539451599121, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.5310880829015545, "grad_norm": 0.29126585076495237, "kl": 0.0574951171875, "learning_rate": 5.471502590673574e-07, "loss": 0.001, "reward": 2.499996781349182, "reward_std": 2.286420908603759e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.533678756476684, "grad_norm": 4.537578229563026, "kl": 0.05755615234375, "learning_rate": 5.468911917098446e-07, "loss": 0.0003, "reward": 1.9018195867538452, "reward_std": 0.0004079671218732983, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4018195569515228, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.536269430051814, "grad_norm": 5.1500290494193735, "kl": 0.0601806640625, "learning_rate": 5.466321243523317e-07, "loss": -0.0004, "reward": 2.499959111213684, "reward_std": 1.8426877204547054e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999591708183289, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.538860103626943, "grad_norm": 0.08562253712896178, "kl": 0.030517578125, "learning_rate": 5.463730569948187e-07, "loss": 0.0003, "reward": 2.499999165534973, "reward_std": 8.481734141696506e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.541450777202073, "grad_norm": 0.02890220585255892, "kl": 0.03179931640625, "learning_rate": 5.461139896373057e-07, "loss": 0.0004, "reward": 2.499998927116394, "reward_std": 8.564228721752443e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 4.544041450777202, "grad_norm": 167.28998596440692, "kl": 0.123291015625, "learning_rate": 5.458549222797927e-07, "loss": 0.0011, "reward": 2.418424606323242, "reward_std": 0.23071279702719494, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9184245467185974, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.546632124352332, "grad_norm": 5.125905915002972, "kl": 0.066162109375, "learning_rate": 5.455958549222798e-07, "loss": -0.0005, "reward": 1.998132586479187, "reward_std": 8.477292431052774e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498132586479187, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.549222797927461, "grad_norm": 10.56630539633022, "kl": 0.0919189453125, "learning_rate": 5.453367875647669e-07, "loss": 0.0007, "reward": 1.434851348400116, "reward_std": 0.002121197898304672, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9348513185977936, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.551813471502591, "grad_norm": 3.658157885612403, "kl": 0.081298828125, "learning_rate": 5.450777202072539e-07, "loss": 0.0005, "reward": 1.999853253364563, "reward_std": 3.052235570066841e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998532831668854, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.55440414507772, "grad_norm": 7.859945952096368, "kl": 0.06103515625, "learning_rate": 5.44818652849741e-07, "loss": -0.0004, "reward": 2.49995219707489, "reward_std": 2.3798113033990376e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999523162841797, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.55699481865285, "grad_norm": 0.11183179421683982, "kl": 0.0673828125, "learning_rate": 5.445595854922279e-07, "loss": -0.0009, "reward": 2.499996781349182, "reward_std": 1.2882640021416591e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.5595854922279795, "grad_norm": 20.541195540764917, "kl": 0.0535888671875, "learning_rate": 5.44300518134715e-07, "loss": 0.0004, "reward": 2.312475085258484, "reward_std": 0.25880386325729887, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124750852584839, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.562176165803109, "grad_norm": 0.12243874222594237, "kl": 0.0281982421875, "learning_rate": 5.440414507772021e-07, "loss": 0.0004, "reward": 2.4999738931655884, "reward_std": 3.870881073453347e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999737739562988, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.564766839378239, "grad_norm": 34.42849905423427, "kl": 0.11865234375, "learning_rate": 5.437823834196891e-07, "loss": 0.0005, "reward": 1.7491916418075562, "reward_std": 0.26747630536556244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2491916418075562, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.567357512953368, "grad_norm": 8.108120910606115, "kl": 0.2490234375, "learning_rate": 5.435233160621762e-07, "loss": 0.0013, "reward": 2.4999791383743286, "reward_std": 1.2070741831848864e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999789595603943, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.569948186528498, "grad_norm": 0.14111984329156935, "kl": 0.138671875, "learning_rate": 5.432642487046632e-07, "loss": 0.0011, "reward": 2.4999953508377075, "reward_std": 2.533350709654769e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.572538860103627, "grad_norm": 7.5066810759913905, "kl": 0.0966796875, "learning_rate": 5.430051813471502e-07, "loss": -0.0005, "reward": 2.4999719858169556, "reward_std": 1.4020912033174682e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999722838401794, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.575129533678757, "grad_norm": 4.832016422327479, "kl": 0.100830078125, "learning_rate": 5.427461139896373e-07, "loss": 0.0002, "reward": 1.9979740381240845, "reward_std": 9.22031576919835e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979739785194397, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.577720207253886, "grad_norm": 0.13894154070130946, "kl": 0.0484619140625, "learning_rate": 5.424870466321243e-07, "loss": 0.0005, "reward": 2.4999951124191284, "reward_std": 2.3747810473651043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.580310880829016, "grad_norm": 0.1289924359195169, "kl": 0.0517425537109375, "learning_rate": 5.422279792746114e-07, "loss": 0.0018, "reward": 2.4999966621398926, "reward_std": 1.781306991688325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.582901554404145, "grad_norm": 0.19024624761305317, "kl": 0.103271484375, "learning_rate": 5.419689119170984e-07, "loss": -0.0006, "reward": 2.499997615814209, "reward_std": 1.2299852301111969e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 4.585492227979275, "grad_norm": 18.33407250057705, "kl": 0.092041015625, "learning_rate": 5.417098445595855e-07, "loss": 0.0004, "reward": 1.2603832483291626, "reward_std": 0.07101157457509544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7603832483291626, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.5880829015544045, "grad_norm": 0.5425474179411571, "kl": 0.07952880859375, "learning_rate": 5.414507772020725e-07, "loss": 0.0008, "reward": 2.4999901056289673, "reward_std": 3.722099677361257e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999899864196777, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.590673575129534, "grad_norm": 36.48962831061876, "kl": 0.121826171875, "learning_rate": 5.411917098445595e-07, "loss": 0.0004, "reward": 1.9373120069503784, "reward_std": 0.17688471153087448, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4373118579387665, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.5932642487046635, "grad_norm": 11.063815349056819, "kl": 0.100830078125, "learning_rate": 5.409326424870466e-07, "loss": -0.0003, "reward": 1.9933598041534424, "reward_std": 0.00027473537465994013, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493359923362732, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.595854922279793, "grad_norm": 7.094578066447553, "kl": 0.08154296875, "learning_rate": 5.406735751295336e-07, "loss": 0.001, "reward": 2.499984860420227, "reward_std": 7.220182624223526e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.598445595854923, "grad_norm": 3.9654955076468124, "kl": 0.0843505859375, "learning_rate": 5.404145077720207e-07, "loss": 0.0003, "reward": 2.4998425245285034, "reward_std": 4.4948640720576805e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999842643737793, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.601036269430052, "grad_norm": 1.9395867045463069, "kl": 0.032196044921875, "learning_rate": 5.401554404145078e-07, "loss": -0.0, "reward": 2.4999918937683105, "reward_std": 4.97321639159054e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.603626943005182, "grad_norm": 1.2584354315446733, "kl": 0.1568603515625, "learning_rate": 5.398963730569947e-07, "loss": 0.0011, "reward": 2.499984622001648, "reward_std": 8.536511131751467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845623970032, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.606217616580311, "grad_norm": 1.8763950392703619, "kl": 0.147705078125, "learning_rate": 5.396373056994818e-07, "loss": 0.0016, "reward": 1.9881176352500916, "reward_std": 8.624646557109372e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.488117516040802, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.608808290155441, "grad_norm": 0.36894895891787144, "kl": 0.076904296875, "learning_rate": 5.393782383419689e-07, "loss": 0.0013, "reward": 2.4999769926071167, "reward_std": 2.9433512054310995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999770522117615, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.61139896373057, "grad_norm": 0.20530584982003117, "kl": 0.070068359375, "learning_rate": 5.391191709844559e-07, "loss": 0.0009, "reward": 2.499998092651367, "reward_std": 2.516909376026888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.6139896373057, "grad_norm": 0.06945023570074033, "kl": 0.134765625, "learning_rate": 5.38860103626943e-07, "loss": 0.0008, "reward": 2.4999932050704956, "reward_std": 1.9671365976137167e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 4.616580310880829, "grad_norm": 11.372654795580685, "kl": 0.07958984375, "learning_rate": 5.3860103626943e-07, "loss": 0.0004, "reward": 1.9558073282241821, "reward_std": 0.015141666277486365, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.455807387828827, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.619170984455959, "grad_norm": 0.10606060028254785, "kl": 0.0546875, "learning_rate": 5.38341968911917e-07, "loss": -0.0004, "reward": 2.4999961853027344, "reward_std": 2.217536007265153e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.6217616580310885, "grad_norm": 14.701085393080664, "kl": 0.09814453125, "learning_rate": 5.380829015544041e-07, "loss": 0.0004, "reward": 1.9998481273651123, "reward_std": 7.417497454298427e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998482167720795, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.624352331606218, "grad_norm": 22.170606401595872, "kl": 0.07421875, "learning_rate": 5.378238341968911e-07, "loss": 0.0007, "reward": 2.4999823570251465, "reward_std": 1.3415211014944362e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982237815857, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 4.626943005181348, "grad_norm": 24.268573075221003, "kl": 0.122314453125, "learning_rate": 5.375647668393782e-07, "loss": 0.0003, "reward": 1.9668034315109253, "reward_std": 0.0006330931146294461, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4668034315109253, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.629533678756477, "grad_norm": 0.8726368002420039, "kl": 0.21533203125, "learning_rate": 5.373056994818652e-07, "loss": 0.0017, "reward": 1.9987282156944275, "reward_std": 1.0433882380311843e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987280368804932, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.632124352331607, "grad_norm": 0.7629187252374979, "kl": 0.0712890625, "learning_rate": 5.370466321243523e-07, "loss": 0.0004, "reward": 2.4999932050704956, "reward_std": 5.462136982714583e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.634715025906736, "grad_norm": 68.52194084942666, "kl": 0.07470703125, "learning_rate": 5.367875647668393e-07, "loss": 0.0002, "reward": 2.437477707862854, "reward_std": 0.17678443504109964, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374777674674988, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.637305699481866, "grad_norm": 1.7461200527244418, "kl": 0.15087890625, "learning_rate": 5.365284974093263e-07, "loss": -0.0006, "reward": 2.49998140335083, "reward_std": 1.2022102737319074e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999816417694092, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.639896373056995, "grad_norm": 4.36089702912699, "kl": 0.1373291015625, "learning_rate": 5.362694300518134e-07, "loss": 0.0008, "reward": 1.7711026072502136, "reward_std": 0.00040005836910950165, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2711024582386017, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.642487046632124, "grad_norm": 26.98053141937611, "kl": 0.07763671875, "learning_rate": 5.360103626943004e-07, "loss": 0.0004, "reward": 1.9360750913619995, "reward_std": 0.17681539360455645, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4360751509666443, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.6450777202072535, "grad_norm": 20.422691609126474, "kl": 0.0587158203125, "learning_rate": 5.357512953367876e-07, "loss": 0.0001, "reward": 2.4999927282333374, "reward_std": 8.652078577142674e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.647668393782383, "grad_norm": 7.415234207725857, "kl": 0.099365234375, "learning_rate": 5.354922279792747e-07, "loss": 0.0, "reward": 1.9930787086486816, "reward_std": 0.00013342655711312545, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493078738451004, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.650259067357513, "grad_norm": 8.303612443562999, "kl": 0.127685546875, "learning_rate": 5.352331606217616e-07, "loss": 0.0013, "reward": 1.8097798824310303, "reward_std": 0.0004643035781555227, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.309779793024063, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.652849740932642, "grad_norm": 1.3467392605388342, "kl": 0.0413818359375, "learning_rate": 5.349740932642487e-07, "loss": -0.0006, "reward": 2.499995231628418, "reward_std": 6.925025836324039e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 34.9375, "epoch": 4.655440414507772, "grad_norm": 24.029626617899478, "kl": 0.1014404296875, "learning_rate": 5.347150259067357e-07, "loss": 0.0005, "reward": 1.999317705631256, "reward_std": 2.9128019832569407e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993175864219666, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.658031088082901, "grad_norm": 0.20055234101517333, "kl": 0.14990234375, "learning_rate": 5.344559585492228e-07, "loss": -0.0009, "reward": 2.499994158744812, "reward_std": 1.4809868531528991e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.660621761658031, "grad_norm": 1.0674348163457286, "kl": 0.0577392578125, "learning_rate": 5.341968911917099e-07, "loss": -0.0005, "reward": 2.499982237815857, "reward_std": 8.514090040989686e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.66321243523316, "grad_norm": 1.986250448784548, "kl": 0.150634765625, "learning_rate": 5.339378238341969e-07, "loss": 0.0011, "reward": 1.9992393255233765, "reward_std": 1.7084558294300223e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992395043373108, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.66580310880829, "grad_norm": 33.69789533218986, "kl": 0.12890625, "learning_rate": 5.336787564766839e-07, "loss": 0.0008, "reward": 2.0622769594192505, "reward_std": 0.17685361541799693, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5622769594192505, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.668393782383419, "grad_norm": 0.36644256535473047, "kl": 0.074951171875, "learning_rate": 5.33419689119171e-07, "loss": 0.0002, "reward": 2.4999964237213135, "reward_std": 4.972720716978074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.670984455958549, "grad_norm": 3.8110981551169343, "kl": 0.082275390625, "learning_rate": 5.33160621761658e-07, "loss": 0.0007, "reward": 2.499957799911499, "reward_std": 1.5159917438722914e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999577403068542, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.6735751295336785, "grad_norm": 1.712219155872681, "kl": 0.0694580078125, "learning_rate": 5.329015544041451e-07, "loss": -0.0008, "reward": 2.4999804496765137, "reward_std": 8.377825679417583e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999980628490448, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.676165803108808, "grad_norm": 1.066708334341106, "kl": 0.059814453125, "learning_rate": 5.326424870466321e-07, "loss": -0.0004, "reward": 2.49998140335083, "reward_std": 7.267447017511586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815225601196, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.678756476683938, "grad_norm": 3.5160800299355524, "kl": 0.053955078125, "learning_rate": 5.323834196891192e-07, "loss": -0.0, "reward": 2.4999616146087646, "reward_std": 2.382047114224406e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999616742134094, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.681347150259067, "grad_norm": 1.1050010737781242, "kl": 0.09619140625, "learning_rate": 5.321243523316063e-07, "loss": 0.0003, "reward": 2.4999842643737793, "reward_std": 5.307365086082427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999842643737793, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.683937823834197, "grad_norm": 1.5522023553498705, "kl": 0.099853515625, "learning_rate": 5.318652849740932e-07, "loss": 0.0003, "reward": 2.499974846839905, "reward_std": 9.484943234383536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999748468399048, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.686528497409326, "grad_norm": 29.36205931954457, "kl": 0.13232421875, "learning_rate": 5.316062176165803e-07, "loss": -0.0001, "reward": 2.2498242259025574, "reward_std": 0.26744518303001996, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749824583530426, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.689119170984456, "grad_norm": 0.691534199317008, "kl": 0.13037109375, "learning_rate": 5.313471502590673e-07, "loss": -0.0005, "reward": 2.499990701675415, "reward_std": 3.964879851992009e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.691709844559585, "grad_norm": 5.8385624087917645, "kl": 0.0548095703125, "learning_rate": 5.310880829015544e-07, "loss": -0.0, "reward": 1.9983850717544556, "reward_std": 3.324689132000458e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983851313591003, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.694300518134715, "grad_norm": 0.9466333117457287, "kl": 0.100830078125, "learning_rate": 5.308290155440415e-07, "loss": 0.0007, "reward": 2.4999927282333374, "reward_std": 4.231103531537883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.696891191709844, "grad_norm": 0.6149548943526877, "kl": 0.0616455078125, "learning_rate": 5.305699481865284e-07, "loss": -0.0006, "reward": 2.4999901056289673, "reward_std": 4.871121859650884e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.699481865284974, "grad_norm": 0.12083629559390983, "kl": 0.0457763671875, "learning_rate": 5.303108808290155e-07, "loss": 0.0008, "reward": 2.4999938011169434, "reward_std": 2.4925450361479307e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.7020725388601035, "grad_norm": 0.5859365264217756, "kl": 0.094970703125, "learning_rate": 5.300518134715025e-07, "loss": 0.0001, "reward": 2.499985456466675, "reward_std": 8.133426035783486e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.704663212435233, "grad_norm": 0.253599097742006, "kl": 0.0882568359375, "learning_rate": 5.297927461139896e-07, "loss": 0.0002, "reward": 2.4999914169311523, "reward_std": 2.3123197934182826e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.7072538860103625, "grad_norm": 3.401992148236074, "kl": 0.3310546875, "learning_rate": 5.295336787564767e-07, "loss": 0.0015, "reward": 2.4999812841415405, "reward_std": 1.4841831671219552e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999813437461853, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.709844559585492, "grad_norm": 3.9139466962302074, "kl": 0.0572509765625, "learning_rate": 5.292746113989637e-07, "loss": 0.0003, "reward": 2.499984383583069, "reward_std": 1.4513848668684659e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845623970032, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.712435233160622, "grad_norm": 0.24020930568660093, "kl": 0.0712890625, "learning_rate": 5.290155440414508e-07, "loss": 0.0007, "reward": 2.499996304512024, "reward_std": 2.3560410511436203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.715025906735751, "grad_norm": 0.17373819824374603, "kl": 0.040771484375, "learning_rate": 5.287564766839377e-07, "loss": 0.001, "reward": 2.49999737739563, "reward_std": 1.7050833207576943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.717616580310881, "grad_norm": 58.57864462348363, "kl": 0.0560302734375, "learning_rate": 5.284974093264248e-07, "loss": 0.0014, "reward": 2.4374406337738037, "reward_std": 0.1769376028134957, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374406933784485, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.72020725388601, "grad_norm": 51.79221132426645, "kl": 0.0792236328125, "learning_rate": 5.282383419689119e-07, "loss": -0.0005, "reward": 2.0620144605636597, "reward_std": 0.17697328804911194, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562014639377594, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.72279792746114, "grad_norm": 0.1836937612426288, "kl": 0.085205078125, "learning_rate": 5.279792746113989e-07, "loss": 0.0008, "reward": 2.49999737739563, "reward_std": 2.5852199883047433e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.725388601036269, "grad_norm": 0.03149477892204813, "kl": 0.111328125, "learning_rate": 5.27720207253886e-07, "loss": 0.0, "reward": 2.499997138977051, "reward_std": 1.2922189398523187e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.727979274611399, "grad_norm": 0.4050448808059232, "kl": 0.159912109375, "learning_rate": 5.274611398963731e-07, "loss": 0.0006, "reward": 2.499990701675415, "reward_std": 3.7766282048323774e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.730569948186528, "grad_norm": 3.2141309837044956, "kl": 0.1337890625, "learning_rate": 5.2720207253886e-07, "loss": 0.0003, "reward": 1.9567396640777588, "reward_std": 0.00012566078618192478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4567397236824036, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.733160621761658, "grad_norm": 5.931264905795551, "kl": 0.12451171875, "learning_rate": 5.269430051813471e-07, "loss": 0.0005, "reward": 2.4999704360961914, "reward_std": 1.8420762216919684e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.7357512953367875, "grad_norm": 0.367435595426355, "kl": 0.0325927734375, "learning_rate": 5.266839378238341e-07, "loss": 0.0009, "reward": 2.4999847412109375, "reward_std": 3.6717295870403177e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.738341968911917, "grad_norm": 2.905226315117502, "kl": 0.04443359375, "learning_rate": 5.264248704663212e-07, "loss": 0.0013, "reward": 2.4999871253967285, "reward_std": 9.591080356585735e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999870657920837, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.740932642487047, "grad_norm": 1.8725370061030004, "kl": 0.096435546875, "learning_rate": 5.261658031088083e-07, "loss": 0.0018, "reward": 2.4999914169311523, "reward_std": 7.273683195307967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.743523316062176, "grad_norm": 4.74556642234223, "kl": 0.0640869140625, "learning_rate": 5.259067357512953e-07, "loss": -0.0002, "reward": 1.9997684359550476, "reward_std": 3.6370711768540787e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997685849666595, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.746113989637306, "grad_norm": 2.624071202412009, "kl": 0.08935546875, "learning_rate": 5.256476683937823e-07, "loss": 0.0005, "reward": 1.9999428987503052, "reward_std": 1.1232971928620827e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999428689479828, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.748704663212435, "grad_norm": 0.10067785357560483, "kl": 0.11669921875, "learning_rate": 5.253886010362693e-07, "loss": -0.0006, "reward": 2.499997854232788, "reward_std": 1.7851560869530658e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.751295336787565, "grad_norm": 0.5685156572450072, "kl": 0.0787353515625, "learning_rate": 5.251295336787564e-07, "loss": 0.0, "reward": 2.499995708465576, "reward_std": 3.383515092991729e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.753886010362694, "grad_norm": 22.055601802081455, "kl": 0.06103515625, "learning_rate": 5.248704663212436e-07, "loss": 0.0006, "reward": 2.498571515083313, "reward_std": 0.00041903622934569285, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9985713958740234, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.756476683937824, "grad_norm": 0.11304170160299004, "kl": 0.05389404296875, "learning_rate": 5.246113989637306e-07, "loss": 0.0007, "reward": 2.499995708465576, "reward_std": 2.2937875883144443e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.759067357512953, "grad_norm": 17.162041380903503, "kl": 0.0535888671875, "learning_rate": 5.243523316062177e-07, "loss": 0.0001, "reward": 1.999945044517517, "reward_std": 1.7562929429004726e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999449849128723, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.761658031088083, "grad_norm": 0.6496636428290078, "kl": 0.0985107421875, "learning_rate": 5.240932642487046e-07, "loss": 0.0013, "reward": 2.49997079372406, "reward_std": 6.4162059061345644e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999706745147705, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.7642487046632125, "grad_norm": 1.2643806912023512, "kl": 0.0301513671875, "learning_rate": 5.238341968911917e-07, "loss": -0.0003, "reward": 2.49998676776886, "reward_std": 6.764195290998032e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986708164215, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.766839378238342, "grad_norm": 3.778437168131262, "kl": 0.1171875, "learning_rate": 5.235751295336788e-07, "loss": 0.0008, "reward": 2.499949097633362, "reward_std": 1.8067170913127484e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999489784240723, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.769430051813472, "grad_norm": 0.09986816331982766, "kl": 0.081787109375, "learning_rate": 5.233160621761658e-07, "loss": 0.0, "reward": 2.4999982118606567, "reward_std": 1.2238861017976888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.772020725388601, "grad_norm": 0.34276201039635323, "kl": 0.1181640625, "learning_rate": 5.230569948186529e-07, "loss": 0.0002, "reward": 2.4999916553497314, "reward_std": 6.410554789226808e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.774611398963731, "grad_norm": 0.5513242145975972, "kl": 0.157958984375, "learning_rate": 5.227979274611399e-07, "loss": -0.0005, "reward": 2.4999717473983765, "reward_std": 8.922115739551373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999717473983765, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.77720207253886, "grad_norm": 0.03448824198857638, "kl": 0.12939453125, "learning_rate": 5.225388601036269e-07, "loss": 0.0008, "reward": 2.4999990463256836, "reward_std": 9.123150874756902e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.77979274611399, "grad_norm": 2.769963639384208, "kl": 0.174072265625, "learning_rate": 5.22279792746114e-07, "loss": -0.0005, "reward": 2.499972701072693, "reward_std": 1.4247115586840664e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999725818634033, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.782383419689119, "grad_norm": 0.05199545443846805, "kl": 0.23876953125, "learning_rate": 5.22020725388601e-07, "loss": 0.0015, "reward": 2.4999969005584717, "reward_std": 1.973777500552387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.784974093264249, "grad_norm": 2.4698048411331173, "kl": 0.1064453125, "learning_rate": 5.217616580310881e-07, "loss": 0.0007, "reward": 2.4999899864196777, "reward_std": 3.111517912657291e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.787564766839378, "grad_norm": 0.34886363317653557, "kl": 0.09228515625, "learning_rate": 5.215025906735752e-07, "loss": 0.0008, "reward": 2.499997138977051, "reward_std": 2.007587653451992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.790155440414508, "grad_norm": 0.39412284484886145, "kl": 0.122314453125, "learning_rate": 5.212435233160622e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 3.830643777291698e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.7927461139896375, "grad_norm": 7.650442625564227, "kl": 0.091796875, "learning_rate": 5.209844559585492e-07, "loss": 0.0004, "reward": 2.4998810291290283, "reward_std": 4.5035868879494956e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999880850315094, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.795336787564767, "grad_norm": 1.8050325659468176, "kl": 0.1019287109375, "learning_rate": 5.207253886010362e-07, "loss": 0.0009, "reward": 1.766606092453003, "reward_std": 8.435635584191914e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2666060030460358, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.7979274611398965, "grad_norm": 0.4503805417563229, "kl": 0.05999755859375, "learning_rate": 5.204663212435233e-07, "loss": 0.0005, "reward": 2.4999972581863403, "reward_std": 1.6079814599834208e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 4.800518134715026, "grad_norm": 56.72936279770925, "kl": 0.083984375, "learning_rate": 5.202072538860104e-07, "loss": -0.0002, "reward": 2.2687954902648926, "reward_std": 0.3193020560623836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.768795669078827, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.803108808290156, "grad_norm": 47.102965658594776, "kl": 0.1884765625, "learning_rate": 5.199481865284974e-07, "loss": 0.0008, "reward": 1.3129711747169495, "reward_std": 0.0007770535521558486, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8129712343215942, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.805699481865285, "grad_norm": 0.6877169201935954, "kl": 0.100341796875, "learning_rate": 5.196891191709845e-07, "loss": -0.0009, "reward": 1.9996490478515625, "reward_std": 1.1218860549888632e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996492266654968, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 4.808290155440415, "grad_norm": 4.678440059659677, "kl": 0.2841796875, "learning_rate": 5.194300518134714e-07, "loss": 0.0012, "reward": 1.9203997254371643, "reward_std": 0.00028477661862780224, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.420399785041809, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.810880829015544, "grad_norm": 0.22482692099684018, "kl": 0.1630859375, "learning_rate": 5.191709844559585e-07, "loss": 0.0004, "reward": 2.4999958276748657, "reward_std": 2.6246650008943107e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.813471502590674, "grad_norm": 1.0978514727987898, "kl": 0.0931396484375, "learning_rate": 5.189119170984456e-07, "loss": -0.0005, "reward": 2.4999632835388184, "reward_std": 9.000008958537364e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999963402748108, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.816062176165803, "grad_norm": 11.797871666877167, "kl": 0.12451171875, "learning_rate": 5.186528497409326e-07, "loss": 0.0002, "reward": 2.499993920326233, "reward_std": 5.024935035180533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.818652849740933, "grad_norm": 0.230666330174166, "kl": 0.091064453125, "learning_rate": 5.183937823834197e-07, "loss": 0.0005, "reward": 2.499972701072693, "reward_std": 4.069174337928416e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999725818634033, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.821243523316062, "grad_norm": 0.38430755100609587, "kl": 0.0584716796875, "learning_rate": 5.181347150259067e-07, "loss": 0.0001, "reward": 2.4998698234558105, "reward_std": 5.049533456258359e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998698830604553, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.823834196891192, "grad_norm": 0.5541326335897286, "kl": 0.0986328125, "learning_rate": 5.178756476683937e-07, "loss": 0.0007, "reward": 2.499991774559021, "reward_std": 4.678573873206915e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.8264248704663215, "grad_norm": 6.9086749576832505, "kl": 0.3203125, "learning_rate": 5.176165803108808e-07, "loss": 0.0018, "reward": 2.499990701675415, "reward_std": 4.889002411800902e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.829015544041451, "grad_norm": 7.184699324738151, "kl": 0.17431640625, "learning_rate": 5.173575129533678e-07, "loss": 0.0013, "reward": 2.4999845027923584, "reward_std": 1.556348786380113e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999846816062927, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.831606217616581, "grad_norm": 0.9799698518692863, "kl": 0.091552734375, "learning_rate": 5.170984455958549e-07, "loss": 0.0001, "reward": 2.499952554702759, "reward_std": 8.286165439130855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999526739120483, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.83419689119171, "grad_norm": 2.3505405683011777, "kl": 0.116455078125, "learning_rate": 5.168393782383419e-07, "loss": -0.0007, "reward": 2.4999868869781494, "reward_std": 6.806133001191483e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.83678756476684, "grad_norm": 0.5260308124218211, "kl": 0.072509765625, "learning_rate": 5.16580310880829e-07, "loss": -0.0001, "reward": 2.4999754428863525, "reward_std": 4.411289864947321e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999754428863525, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.839378238341969, "grad_norm": 0.1044197079833509, "kl": 0.1180419921875, "learning_rate": 5.16321243523316e-07, "loss": -0.0007, "reward": 2.4999961853027344, "reward_std": 1.3546987815971079e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.841968911917099, "grad_norm": 3.111927738784777, "kl": 0.2012939453125, "learning_rate": 5.16062176165803e-07, "loss": -0.0003, "reward": 2.4999771118164062, "reward_std": 1.9735252863029018e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999772310256958, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.844559585492228, "grad_norm": 20.987801431165522, "kl": 0.14013671875, "learning_rate": 5.158031088082901e-07, "loss": 0.0008, "reward": 1.4888710379600525, "reward_std": 0.00011967114187427796, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9888710379600525, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.847150259067358, "grad_norm": 0.06463117590614705, "kl": 0.0570068359375, "learning_rate": 5.155440414507772e-07, "loss": -0.0007, "reward": 2.4999985694885254, "reward_std": 9.881680398393655e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.849740932642487, "grad_norm": 18.163589971569913, "kl": 0.142578125, "learning_rate": 5.152849740932642e-07, "loss": 0.0007, "reward": 1.9954760074615479, "reward_std": 9.445267687624437e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.495475947856903, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.852331606217617, "grad_norm": 28.115312912705644, "kl": 0.25634765625, "learning_rate": 5.150259067357513e-07, "loss": 0.001, "reward": 1.8121753334999084, "reward_std": 0.004402739882380047, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3121753931045532, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.8549222797927465, "grad_norm": 0.3101319924114648, "kl": 0.0640869140625, "learning_rate": 5.147668393782382e-07, "loss": -0.0, "reward": 2.4999916553497314, "reward_std": 3.4617439723660937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991536140442, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.857512953367876, "grad_norm": 1.4734351130331582, "kl": 0.129638671875, "learning_rate": 5.145077720207253e-07, "loss": 0.0007, "reward": 2.4999780654907227, "reward_std": 6.190106091708003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781847000122, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.860103626943005, "grad_norm": 0.14410996023761452, "kl": 0.123046875, "learning_rate": 5.142487046632125e-07, "loss": 0.0006, "reward": 1.4999991655349731, "reward_std": 6.103502414589457e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999991059303284, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.862694300518134, "grad_norm": 1.7626653707073157, "kl": 0.062744140625, "learning_rate": 5.139896373056995e-07, "loss": 0.0004, "reward": 2.499977946281433, "reward_std": 1.350019238088862e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999778270721436, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.865284974093264, "grad_norm": 2.371317823653764, "kl": 0.0692138671875, "learning_rate": 5.137305699481866e-07, "loss": -0.0008, "reward": 2.4999873638153076, "reward_std": 1.2011415492452215e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987542629242, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 4.867875647668393, "grad_norm": 11.728809755339237, "kl": 2.760009765625, "learning_rate": 5.134715025906736e-07, "loss": 0.0103, "reward": 2.4999825954437256, "reward_std": 1.4914779967512004e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982476234436, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.870466321243523, "grad_norm": 12.600405365570017, "kl": 0.3310546875, "learning_rate": 5.132124352331606e-07, "loss": 0.0008, "reward": 2.499926447868347, "reward_std": 2.373492202423222e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999265670776367, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.873056994818652, "grad_norm": 0.12480825861074117, "kl": 0.0606689453125, "learning_rate": 5.129533678756477e-07, "loss": -0.0004, "reward": 2.49999737739563, "reward_std": 1.476501580555123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.875647668393782, "grad_norm": 1.2335308195361294, "kl": 0.0772705078125, "learning_rate": 5.126943005181347e-07, "loss": 0.0009, "reward": 2.4999918937683105, "reward_std": 5.133752893016208e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.8782383419689115, "grad_norm": 4.470837879112766, "kl": 0.12109375, "learning_rate": 5.124352331606218e-07, "loss": -0.0001, "reward": 1.952039897441864, "reward_std": 0.0001541801144639976, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.452039897441864, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.880829015544041, "grad_norm": 0.2192460917455969, "kl": 0.111328125, "learning_rate": 5.121761658031088e-07, "loss": 0.0002, "reward": 2.4999964237213135, "reward_std": 2.9395907858997816e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.883419689119171, "grad_norm": 0.2060648916047213, "kl": 0.132568359375, "learning_rate": 5.119170984455959e-07, "loss": 0.0011, "reward": 2.4999972581863403, "reward_std": 2.488722628868345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.8860103626943, "grad_norm": 13.417260221427565, "kl": 0.111328125, "learning_rate": 5.116580310880829e-07, "loss": 0.0006, "reward": 2.4327460527420044, "reward_std": 0.19003782174820572, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9327460527420044, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.88860103626943, "grad_norm": 0.2758144858920631, "kl": 0.068115234375, "learning_rate": 5.113989637305699e-07, "loss": 0.0006, "reward": 2.499990940093994, "reward_std": 3.942555622415966e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.891191709844559, "grad_norm": 1.0902854750363893, "kl": 0.16162109375, "learning_rate": 5.11139896373057e-07, "loss": -0.0001, "reward": 2.4999799728393555, "reward_std": 9.5089649221336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802112579346, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.893782383419689, "grad_norm": 7.081301732949311, "kl": 0.098876953125, "learning_rate": 5.10880829015544e-07, "loss": -0.0002, "reward": 2.499961733818054, "reward_std": 1.060829458765511e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999617338180542, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.896373056994818, "grad_norm": 0.8058802828493359, "kl": 0.099609375, "learning_rate": 5.106217616580311e-07, "loss": 0.0004, "reward": 2.4999855756759644, "reward_std": 9.334485412182403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985694885254, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 42.375, "epoch": 4.898963730569948, "grad_norm": 2.9193035463298367, "kl": 0.19000244140625, "learning_rate": 5.103626943005182e-07, "loss": 0.0003, "reward": 2.4999828338623047, "reward_std": 1.1689225630107103e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983012676239, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.901554404145077, "grad_norm": 0.4450776872512842, "kl": 0.090576171875, "learning_rate": 5.101036269430051e-07, "loss": -0.0001, "reward": 2.4999948740005493, "reward_std": 4.056156342358008e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.904145077720207, "grad_norm": 0.7283449454955582, "kl": 0.0753173828125, "learning_rate": 5.098445595854922e-07, "loss": 0.0, "reward": 2.499991536140442, "reward_std": 5.588245130638825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.9067357512953365, "grad_norm": 1.6922256611313642, "kl": 0.081787109375, "learning_rate": 5.095854922279792e-07, "loss": 0.0019, "reward": 1.9999473094940186, "reward_std": 8.983988664112985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999471008777618, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.909326424870466, "grad_norm": 2.365943348241899, "kl": 0.1650390625, "learning_rate": 5.093264248704663e-07, "loss": -0.0004, "reward": 1.9998682737350464, "reward_std": 1.407740614922659e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499868392944336, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.9119170984455955, "grad_norm": 1.6964616877330223, "kl": 0.14599609375, "learning_rate": 5.090673575129534e-07, "loss": 0.0006, "reward": 1.9998960494995117, "reward_std": 1.790315883454241e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998961091041565, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 4.914507772020725, "grad_norm": 1.876102256484594, "kl": 0.16259765625, "learning_rate": 5.088082901554404e-07, "loss": 0.001, "reward": 2.499979257583618, "reward_std": 1.3367809742703685e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791383743286, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.917098445595855, "grad_norm": 0.8532308454817059, "kl": 0.0631103515625, "learning_rate": 5.085492227979274e-07, "loss": 0.0005, "reward": 2.4999985694885254, "reward_std": 1.962784381248639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.919689119170984, "grad_norm": 0.41841053315785764, "kl": 0.0869140625, "learning_rate": 5.082901554404145e-07, "loss": 0.0012, "reward": 2.4999877214431763, "reward_std": 4.248923801242199e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874830245972, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.922279792746114, "grad_norm": 0.1571909754973784, "kl": 0.0350341796875, "learning_rate": 5.080310880829015e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 1.649446545570754e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.924870466321243, "grad_norm": 4.891992603760889, "kl": 0.205078125, "learning_rate": 5.077720207253886e-07, "loss": 0.0013, "reward": 1.9450291395187378, "reward_std": 0.0001299916957577807, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4450291991233826, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.927461139896373, "grad_norm": 4.292213593668363, "kl": 0.1796875, "learning_rate": 5.075129533678756e-07, "loss": 0.0011, "reward": 1.498780369758606, "reward_std": 5.568056076299399e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9987803399562836, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.930051813471502, "grad_norm": 74.52292657169967, "kl": 0.08203125, "learning_rate": 5.072538860103627e-07, "loss": 0.0003, "reward": 1.9999048709869385, "reward_std": 0.35365331172943115, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999048709869385, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.932642487046632, "grad_norm": 0.22755161973627527, "kl": 0.03369140625, "learning_rate": 5.069948186528497e-07, "loss": -0.0002, "reward": 2.499995708465576, "reward_std": 2.83990709704085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.935233160621761, "grad_norm": 0.4152910432227915, "kl": 0.12109375, "learning_rate": 5.067357512953367e-07, "loss": 0.0003, "reward": 1.4999972581863403, "reward_std": 1.0552354297033162e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999972581863403, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.937823834196891, "grad_norm": 23.668966026949754, "kl": 0.068359375, "learning_rate": 5.064766839378238e-07, "loss": 0.0006, "reward": 1.9828269481658936, "reward_std": 0.00024920167282971306, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4828268885612488, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.9404145077720205, "grad_norm": 1.7903009861545844, "kl": 0.0582275390625, "learning_rate": 5.062176165803108e-07, "loss": -0.0002, "reward": 2.499991297721863, "reward_std": 7.357782124017831e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.94300518134715, "grad_norm": 0.6270608662338873, "kl": 0.101806640625, "learning_rate": 5.059585492227979e-07, "loss": -0.0, "reward": 2.499978184700012, "reward_std": 6.267599019338377e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999978482723236, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.94559585492228, "grad_norm": 0.4218790272789594, "kl": 0.090087890625, "learning_rate": 5.05699481865285e-07, "loss": 0.0007, "reward": 2.499988079071045, "reward_std": 5.398533403422334e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999880194664001, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.948186528497409, "grad_norm": 0.6821500540734067, "kl": 0.14794921875, "learning_rate": 5.054404145077719e-07, "loss": 0.0013, "reward": 2.499984860420227, "reward_std": 8.115260925478651e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.950777202072539, "grad_norm": 10.23500927637154, "kl": 0.125, "learning_rate": 5.05181347150259e-07, "loss": 0.0004, "reward": 1.9769858121871948, "reward_std": 0.000344835293617507, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4769859313964844, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.953367875647668, "grad_norm": 4.247667775016941, "kl": 0.112548828125, "learning_rate": 5.04922279792746e-07, "loss": -0.0002, "reward": 1.9998317956924438, "reward_std": 1.7797691725718323e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998319149017334, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.955958549222798, "grad_norm": 9.79988218794273, "kl": 0.0618896484375, "learning_rate": 5.046632124352331e-07, "loss": 0.0, "reward": 2.49992036819458, "reward_std": 1.0493651757315092e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999204874038696, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.958549222797927, "grad_norm": 0.047799849922210126, "kl": 0.068603515625, "learning_rate": 5.044041450777202e-07, "loss": -0.0, "reward": 2.4999985694885254, "reward_std": 8.212363411530532e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.961139896373057, "grad_norm": 33.88340165507215, "kl": 0.062744140625, "learning_rate": 5.041450777202072e-07, "loss": 0.0002, "reward": 2.499943256378174, "reward_std": 3.3268408060393995e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999943196773529, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 4.963730569948186, "grad_norm": 0.4766602441522811, "kl": 0.11474609375, "learning_rate": 5.038860103626942e-07, "loss": 0.0005, "reward": 2.499996304512024, "reward_std": 3.1861559932622185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.966321243523316, "grad_norm": 1.970085671487226, "kl": 0.099365234375, "learning_rate": 5.036269430051812e-07, "loss": 0.0006, "reward": 2.4999852180480957, "reward_std": 7.76936246893456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850988388062, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.9689119170984455, "grad_norm": 0.614317125994361, "kl": 0.097900390625, "learning_rate": 5.033678756476683e-07, "loss": 0.0003, "reward": 2.499976634979248, "reward_std": 6.318117357295705e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976634979248, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 4.971502590673575, "grad_norm": 2.5982732967414153, "kl": 0.1248779296875, "learning_rate": 5.031088082901555e-07, "loss": -0.0, "reward": 1.9950045347213745, "reward_std": 4.371173338313383e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4950045347213745, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.974093264248705, "grad_norm": 0.556943115477112, "kl": 0.06201171875, "learning_rate": 5.028497409326425e-07, "loss": -0.0002, "reward": 2.4999951124191284, "reward_std": 3.4579869634399074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.976683937823834, "grad_norm": 14.170939958264592, "kl": 0.126708984375, "learning_rate": 5.025906735751296e-07, "loss": 0.0006, "reward": 1.730444073677063, "reward_std": 0.25908429973060265, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.230444073677063, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 4.979274611398964, "grad_norm": 112.50593354207143, "kl": 0.09674072265625, "learning_rate": 5.023316062176167e-07, "loss": 0.0001, "reward": 1.999139666557312, "reward_std": 0.0007437472534093104, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991395473480225, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.981865284974093, "grad_norm": 0.12978793612667916, "kl": 0.1240234375, "learning_rate": 5.020725388601036e-07, "loss": -0.0001, "reward": 2.4999988079071045, "reward_std": 1.4282953202382487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 4.984455958549223, "grad_norm": 1.661070434926724, "kl": 0.0877685546875, "learning_rate": 5.018134715025907e-07, "loss": 0.0002, "reward": 2.499964714050293, "reward_std": 1.1544149856490549e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999645948410034, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 4.987046632124352, "grad_norm": 0.24215066159783818, "kl": 0.0906982421875, "learning_rate": 5.015544041450777e-07, "loss": 0.0007, "reward": 2.499996781349182, "reward_std": 2.105047371969704e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.989637305699482, "grad_norm": 0.23202070007577813, "kl": 0.112060546875, "learning_rate": 5.012953367875648e-07, "loss": 0.0012, "reward": 2.499994993209839, "reward_std": 3.002770938564936e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.992227979274611, "grad_norm": 1.8770420819716949, "kl": 0.072265625, "learning_rate": 5.010362694300519e-07, "loss": 0.0002, "reward": 2.499958872795105, "reward_std": 1.1185346778574967e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999589920043945, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 4.994818652849741, "grad_norm": 0.3579381392829675, "kl": 0.068603515625, "learning_rate": 5.007772020725388e-07, "loss": 0.0001, "reward": 2.499995231628418, "reward_std": 3.055555509945407e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 4.9974093264248705, "grad_norm": 2.3002497398650443, "kl": 0.071044921875, "learning_rate": 5.005181347150259e-07, "loss": -0.0002, "reward": 2.4999839067459106, "reward_std": 6.179596425681666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839067459106, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.0, "grad_norm": 0.05155568553213749, "kl": 0.140869140625, "learning_rate": 5.002590673575129e-07, "loss": 0.0003, "reward": 2.499998927116394, "reward_std": 1.260425307236801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.0025906735751295, "grad_norm": 0.44041786757354445, "kl": 0.10205078125, "learning_rate": 5e-07, "loss": 0.0002, "reward": 1.999955415725708, "reward_std": 5.4261488457996165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999555349349976, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.005181347150259, "grad_norm": 0.1486081708723704, "kl": 0.0985107421875, "learning_rate": 4.99740932642487e-07, "loss": 0.0008, "reward": 2.4999964237213135, "reward_std": 2.7909155733141233e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.007772020725389, "grad_norm": 4.376789480644765, "kl": 0.0809326171875, "learning_rate": 4.994818652849741e-07, "loss": 0.0004, "reward": 1.9833866357803345, "reward_std": 0.00012132878259762947, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4833866655826569, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 5.010362694300518, "grad_norm": 37.461291605979255, "kl": 0.13037109375, "learning_rate": 4.992227979274612e-07, "loss": 0.0001, "reward": 2.042196273803711, "reward_std": 0.18491899121443112, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.542196273803711, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.012953367875648, "grad_norm": 20.96403180296301, "kl": 0.228515625, "learning_rate": 4.989637305699482e-07, "loss": 0.0006, "reward": 1.9577412605285645, "reward_std": 0.3382488763127185, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.4889912605285645, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.015544041450777, "grad_norm": 1.312300023220779, "kl": 0.41943359375, "learning_rate": 4.987046632124352e-07, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 2.6252888858380174e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.018134715025907, "grad_norm": 0.6403185857183649, "kl": 0.0987548828125, "learning_rate": 4.984455958549223e-07, "loss": -0.0004, "reward": 2.4999475479125977, "reward_std": 1.0124155778612476e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999476671218872, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.020725388601036, "grad_norm": 5.99578723498661, "kl": 0.208740234375, "learning_rate": 4.981865284974093e-07, "loss": 0.0011, "reward": 1.4945184588432312, "reward_std": 8.185960177797824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9945183992385864, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.023316062176166, "grad_norm": 0.22837904988975566, "kl": 0.1942138671875, "learning_rate": 4.979274611398964e-07, "loss": 0.0006, "reward": 2.499977946281433, "reward_std": 3.2434911645395914e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999780058860779, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.025906735751295, "grad_norm": 0.08623790960688131, "kl": 0.070068359375, "learning_rate": 4.976683937823834e-07, "loss": 0.0007, "reward": 2.4999974966049194, "reward_std": 1.2107051929888257e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.028497409326425, "grad_norm": 26.47870798522681, "kl": 0.052978515625, "learning_rate": 4.974093264248704e-07, "loss": 0.0009, "reward": 2.4374852180480957, "reward_std": 0.17678938515939535, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374852776527405, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.0310880829015545, "grad_norm": 0.13478059522112307, "kl": 0.0628662109375, "learning_rate": 4.971502590673575e-07, "loss": -0.0003, "reward": 2.4999985694885254, "reward_std": 1.69297129559709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.033678756476684, "grad_norm": 0.7684616959797207, "kl": 0.095458984375, "learning_rate": 4.968911917098446e-07, "loss": 0.0014, "reward": 2.4999715089797974, "reward_std": 8.310182920467923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999712109565735, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.036269430051814, "grad_norm": 0.10321480340389022, "kl": 0.1136474609375, "learning_rate": 4.966321243523316e-07, "loss": 0.0021, "reward": 2.4999982118606567, "reward_std": 2.2039363329895423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.038860103626943, "grad_norm": 0.6898264128668515, "kl": 0.067138671875, "learning_rate": 4.963730569948186e-07, "loss": -0.0004, "reward": 2.4999780654907227, "reward_std": 5.790218324364105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781847000122, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.041450777202073, "grad_norm": 1.7317733427190027, "kl": 0.09033203125, "learning_rate": 4.961139896373057e-07, "loss": 0.0007, "reward": 1.9998986721038818, "reward_std": 1.638752860344539e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998986721038818, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.044041450777202, "grad_norm": 0.4015096749394421, "kl": 0.123779296875, "learning_rate": 4.958549222797927e-07, "loss": 0.0005, "reward": 2.4999942779541016, "reward_std": 3.050363659440336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.046632124352332, "grad_norm": 2.7323083176474774, "kl": 0.09716796875, "learning_rate": 4.955958549222798e-07, "loss": 0.0004, "reward": 2.4999263286590576, "reward_std": 7.443785307259532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999263882637024, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.049222797927461, "grad_norm": 0.3004464259468006, "kl": 0.03460693359375, "learning_rate": 4.953367875647668e-07, "loss": -0.001, "reward": 2.4999972581863403, "reward_std": 1.21737434710667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.051813471502591, "grad_norm": 1.3898342975653213, "kl": 0.095947265625, "learning_rate": 4.950777202072538e-07, "loss": 0.0011, "reward": 2.499994397163391, "reward_std": 6.26772907708073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.05440414507772, "grad_norm": 34.974158819577966, "kl": 0.1640625, "learning_rate": 4.948186528497409e-07, "loss": 0.0006, "reward": 1.7851468324661255, "reward_std": 0.001801242060992081, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2851468622684479, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 33.9375, "epoch": 5.05699481865285, "grad_norm": 14.850254812264591, "kl": 0.132080078125, "learning_rate": 4.94559585492228e-07, "loss": 0.0004, "reward": 2.4093295335769653, "reward_std": 0.25610035105046336, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.90932959318161, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.0595854922279795, "grad_norm": 0.2725250116596573, "kl": 0.072021484375, "learning_rate": 4.94300518134715e-07, "loss": 0.001, "reward": 1.9984136819839478, "reward_std": 1.957666086127574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984136521816254, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.062176165803109, "grad_norm": 4.2366202198119005, "kl": 1.357666015625, "learning_rate": 4.94041450777202e-07, "loss": 0.0055, "reward": 2.4999951124191284, "reward_std": 2.5406496320101724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.064766839378239, "grad_norm": 0.09164591729355538, "kl": 0.0726318359375, "learning_rate": 4.937823834196891e-07, "loss": -0.0006, "reward": 2.4999969005584717, "reward_std": 1.8165039250561676e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.067357512953368, "grad_norm": 6.143896185530837, "kl": 0.112060546875, "learning_rate": 4.935233160621761e-07, "loss": -0.0002, "reward": 2.4999847412109375, "reward_std": 1.4170248050504597e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.069948186528498, "grad_norm": 5.1633188568781865, "kl": 0.10546875, "learning_rate": 4.932642487046632e-07, "loss": 0.0004, "reward": 1.8822984099388123, "reward_std": 0.0005674214853570447, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.382298469543457, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.072538860103627, "grad_norm": 2.7016956793571762, "kl": 0.56591796875, "learning_rate": 4.930051813471502e-07, "loss": 0.0024, "reward": 2.49996817111969, "reward_std": 8.273359298982541e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99996817111969, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.075129533678757, "grad_norm": 0.18273164322617722, "kl": 0.082275390625, "learning_rate": 4.927461139896372e-07, "loss": 0.0017, "reward": 2.4999966621398926, "reward_std": 1.6594606222497532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.077720207253886, "grad_norm": 470.9819024895138, "kl": 0.2314453125, "learning_rate": 4.924870466321243e-07, "loss": 0.0011, "reward": 2.0591955184936523, "reward_std": 0.2720373572897188, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5591954588890076, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.080310880829016, "grad_norm": 0.2455210476874772, "kl": 0.072265625, "learning_rate": 4.922279792746113e-07, "loss": 0.0006, "reward": 2.499997854232788, "reward_std": 2.1795440829919244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.082901554404145, "grad_norm": 0.9417616465071259, "kl": 0.066650390625, "learning_rate": 4.919689119170985e-07, "loss": -0.0007, "reward": 2.49999737739563, "reward_std": 2.0860297098579395e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.085492227979275, "grad_norm": 0.09888019132050514, "kl": 0.097412109375, "learning_rate": 4.917098445595855e-07, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 1.4439532378673903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.0880829015544045, "grad_norm": 10.522833406645212, "kl": 0.183349609375, "learning_rate": 4.914507772020726e-07, "loss": 0.0014, "reward": 1.979519248008728, "reward_std": 0.00010333130740036722, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.479519248008728, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 5.090673575129534, "grad_norm": 51.94584164315255, "kl": 0.0927734375, "learning_rate": 4.911917098445596e-07, "loss": 0.0007, "reward": 1.974902868270874, "reward_std": 0.004067035675689112, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4749028086662292, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.0932642487046635, "grad_norm": 0.23037371086719313, "kl": 0.06524658203125, "learning_rate": 4.909326424870467e-07, "loss": -0.0, "reward": 2.499997615814209, "reward_std": 1.8141939506222116e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.095854922279793, "grad_norm": 3.817142065490866, "kl": 0.0908203125, "learning_rate": 4.906735751295337e-07, "loss": 0.0001, "reward": 2.4998281002044678, "reward_std": 2.6074141601384326e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998281002044678, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.098445595854923, "grad_norm": 0.5455854524411821, "kl": 0.06341552734375, "learning_rate": 4.904145077720207e-07, "loss": -0.0005, "reward": 1.999927043914795, "reward_std": 1.087022673118554e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999271631240845, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.101036269430052, "grad_norm": 9.168445126343135, "kl": 0.065673828125, "learning_rate": 4.901554404145078e-07, "loss": -0.0001, "reward": 2.4999436140060425, "reward_std": 1.7112050642253962e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999436736106873, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.103626943005182, "grad_norm": 2.4630563043569795, "kl": 0.074462890625, "learning_rate": 4.898963730569948e-07, "loss": 0.0002, "reward": 1.9799813032150269, "reward_std": 0.00014127314898360055, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.479981243610382, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.106217616580311, "grad_norm": 0.7258132398064928, "kl": 0.1097412109375, "learning_rate": 4.896373056994819e-07, "loss": 0.0005, "reward": 2.499991297721863, "reward_std": 6.1494474721257575e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.108808290155441, "grad_norm": 4.262925906197277, "kl": 0.049560546875, "learning_rate": 4.893782383419689e-07, "loss": 0.0002, "reward": 2.4999629259109497, "reward_std": 2.0119208784308285e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999630451202393, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.11139896373057, "grad_norm": 1.558017733032952, "kl": 0.080078125, "learning_rate": 4.89119170984456e-07, "loss": -0.0003, "reward": 2.4999959468841553, "reward_std": 3.334208713567932e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.1139896373057, "grad_norm": 0.14393618073282874, "kl": 0.02935791015625, "learning_rate": 4.88860103626943e-07, "loss": -0.0, "reward": 2.499997138977051, "reward_std": 2.0730961978188134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 5.116580310880829, "grad_norm": 0.584835562493568, "kl": 0.040771484375, "learning_rate": 4.886010362694301e-07, "loss": -0.0007, "reward": 2.4999927282333374, "reward_std": 4.800249371328391e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.119170984455959, "grad_norm": 0.8626024819700417, "kl": 0.31884765625, "learning_rate": 4.883419689119171e-07, "loss": 0.0021, "reward": 2.4999958276748657, "reward_std": 5.013073405280011e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.1217616580310885, "grad_norm": 9.332108422683755, "kl": 0.0966796875, "learning_rate": 4.880829015544041e-07, "loss": 0.0011, "reward": 1.995583713054657, "reward_std": 8.042127274165978e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.495583564043045, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.124352331606218, "grad_norm": 0.49280543122272225, "kl": 0.1004638671875, "learning_rate": 4.878238341968912e-07, "loss": 0.001, "reward": 2.499996542930603, "reward_std": 3.03728120343294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.126943005181348, "grad_norm": 1.9234761056989427, "kl": 0.0667724609375, "learning_rate": 4.875647668393782e-07, "loss": 0.0002, "reward": 2.499993324279785, "reward_std": 7.244569019349001e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.129533678756476, "grad_norm": 0.09006516242284873, "kl": 0.071533203125, "learning_rate": 4.873056994818653e-07, "loss": -0.0002, "reward": 2.4999955892562866, "reward_std": 2.1774088452275464e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.132124352331606, "grad_norm": 9.037745453688038, "kl": 0.191162109375, "learning_rate": 4.870466321243523e-07, "loss": 0.001, "reward": 1.8227461576461792, "reward_std": 0.0014014614974939832, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3227460980415344, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.134715025906735, "grad_norm": 0.9375389288558058, "kl": 0.1241455078125, "learning_rate": 4.867875647668394e-07, "loss": -0.0006, "reward": 2.4999932050704956, "reward_std": 6.188667498463474e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.137305699481865, "grad_norm": 0.26498455994168174, "kl": 0.098388671875, "learning_rate": 4.865284974093264e-07, "loss": 0.0001, "reward": 2.499995470046997, "reward_std": 4.564170694720815e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.139896373056994, "grad_norm": 1.4312647050672191, "kl": 0.11602783203125, "learning_rate": 4.862694300518134e-07, "loss": 0.0011, "reward": 2.499983310699463, "reward_std": 8.439705197815783e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 5.142487046632124, "grad_norm": 1.1114484686755408, "kl": 0.133056640625, "learning_rate": 4.860103626943005e-07, "loss": 0.0003, "reward": 2.499979257583618, "reward_std": 7.881527835706947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999792575836182, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.1450777202072535, "grad_norm": 0.13703469199686946, "kl": 0.06201171875, "learning_rate": 4.857512953367875e-07, "loss": 0.0004, "reward": 2.4999988079071045, "reward_std": 1.3316914646566147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.147668393782383, "grad_norm": 6.870160815810715, "kl": 0.144287109375, "learning_rate": 4.854922279792746e-07, "loss": 0.0005, "reward": 1.999910831451416, "reward_std": 3.4607599957325874e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999107718467712, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.150259067357513, "grad_norm": 4.704891290151144, "kl": 0.07763671875, "learning_rate": 4.852331606217616e-07, "loss": 0.0013, "reward": 2.4999663829803467, "reward_std": 1.880599211290246e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999664425849915, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.152849740932642, "grad_norm": 3.9830847568788244, "kl": 0.0584716796875, "learning_rate": 4.849740932642487e-07, "loss": -0.0005, "reward": 2.499983787536621, "reward_std": 1.3517110119209974e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999840259552002, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.155440414507772, "grad_norm": 0.6231674188728465, "kl": 0.056640625, "learning_rate": 4.847150259067357e-07, "loss": 0.0001, "reward": 2.499987840652466, "reward_std": 6.364800810843008e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.158031088082901, "grad_norm": 28.02749306094768, "kl": 0.09814453125, "learning_rate": 4.844559585492228e-07, "loss": 0.0004, "reward": 1.8516458868980408, "reward_std": 0.1900110165006481, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3516458570957184, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.160621761658031, "grad_norm": 0.18978471173080563, "kl": 0.0386962890625, "learning_rate": 4.841968911917098e-07, "loss": -0.0006, "reward": 2.4999979734420776, "reward_std": 1.7876092215374229e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.16321243523316, "grad_norm": 2.095479182630184, "kl": 0.045166015625, "learning_rate": 4.839378238341968e-07, "loss": -0.0006, "reward": 2.498996138572693, "reward_std": 4.0461362061705586e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9989961981773376, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.16580310880829, "grad_norm": 0.07316495224379722, "kl": 0.103759765625, "learning_rate": 4.836787564766839e-07, "loss": 0.0004, "reward": 2.4999977350234985, "reward_std": 1.9874941017405945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.168393782383419, "grad_norm": 1.3292786854194039, "kl": 0.05615234375, "learning_rate": 4.834196891191709e-07, "loss": 0.0019, "reward": 2.4999654293060303, "reward_std": 6.915546236996306e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999651908874512, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.170984455958549, "grad_norm": 0.8869836113656446, "kl": 0.0479736328125, "learning_rate": 4.83160621761658e-07, "loss": 0.0011, "reward": 2.4999903440475464, "reward_std": 8.109043790227588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905228614807, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.1735751295336785, "grad_norm": 10.947481721947607, "kl": 0.27978515625, "learning_rate": 4.82901554404145e-07, "loss": 0.0008, "reward": 2.1249611377716064, "reward_std": 0.2314744981044896, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249611377716064, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.176165803108808, "grad_norm": 2.669004823020978, "kl": 0.093017578125, "learning_rate": 4.826424870466321e-07, "loss": 0.0005, "reward": 2.499934196472168, "reward_std": 1.316819543717429e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999340772628784, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.178756476683938, "grad_norm": 0.5352163084793194, "kl": 0.097900390625, "learning_rate": 4.823834196891191e-07, "loss": -0.0007, "reward": 2.499977707862854, "reward_std": 5.183891062188195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999775886535645, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.181347150259067, "grad_norm": 0.9272355918295676, "kl": 0.15576171875, "learning_rate": 4.821243523316062e-07, "loss": 0.0006, "reward": 2.4999847412109375, "reward_std": 9.09536879589723e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999848008155823, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.183937823834197, "grad_norm": 12.226005661886214, "kl": 0.073486328125, "learning_rate": 4.818652849740932e-07, "loss": 0.0002, "reward": 1.9373315572738647, "reward_std": 0.1768157596416131, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4373316764831543, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 5.186528497409326, "grad_norm": 0.24489628481099135, "kl": 0.075927734375, "learning_rate": 4.816062176165802e-07, "loss": -0.0004, "reward": 2.4999786615371704, "reward_std": 4.631241154129384e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999786615371704, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 5.189119170984456, "grad_norm": 38.43430854479419, "kl": 0.0875244140625, "learning_rate": 4.813471502590673e-07, "loss": 0.0009, "reward": 1.8253574967384338, "reward_std": 0.10489688286997989, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3253573775291443, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.191709844559585, "grad_norm": 73.1767457782988, "kl": 0.0704345703125, "learning_rate": 4.810880829015543e-07, "loss": 0.0007, "reward": 2.312434434890747, "reward_std": 0.25882142814953113, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.812434434890747, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.194300518134715, "grad_norm": 0.9140457657461409, "kl": 0.123291015625, "learning_rate": 4.808290155440415e-07, "loss": 0.001, "reward": 2.499956965446472, "reward_std": 1.1952409522564267e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999569058418274, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.196891191709844, "grad_norm": 18.606498102894474, "kl": 0.18408203125, "learning_rate": 4.805699481865285e-07, "loss": 0.0008, "reward": 1.7579326629638672, "reward_std": 0.17730119306361303, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2579325437545776, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.199481865284974, "grad_norm": 0.7857258017267698, "kl": 0.078369140625, "learning_rate": 4.803108808290155e-07, "loss": 0.0007, "reward": 2.4999752044677734, "reward_std": 9.016391743443819e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999752044677734, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.2020725388601035, "grad_norm": 0.5625062693993619, "kl": 0.067626953125, "learning_rate": 4.800518134715026e-07, "loss": -0.0007, "reward": 2.4999927282333374, "reward_std": 3.3889252222252253e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.204663212435233, "grad_norm": 0.12358922776903711, "kl": 0.07574462890625, "learning_rate": 4.797927461139897e-07, "loss": 0.0012, "reward": 2.499997138977051, "reward_std": 2.235927468063892e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.2072538860103625, "grad_norm": 2.0375427942254056, "kl": 0.0631103515625, "learning_rate": 4.795336787564767e-07, "loss": 0.0002, "reward": 2.499986410140991, "reward_std": 1.0443450207731075e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999864101409912, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.209844559585492, "grad_norm": 0.16868327954000012, "kl": 0.11669921875, "learning_rate": 4.792746113989637e-07, "loss": -0.001, "reward": 2.4999938011169434, "reward_std": 3.768546434912423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.212435233160622, "grad_norm": 0.3307661741865748, "kl": 0.159912109375, "learning_rate": 4.790155440414508e-07, "loss": 0.0018, "reward": 1.999861717224121, "reward_std": 7.297677683482107e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998615086078644, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.215025906735751, "grad_norm": 3.0777413562930023, "kl": 0.03985595703125, "learning_rate": 4.787564766839378e-07, "loss": 0.001, "reward": 1.8221864104270935, "reward_std": 0.00032432956504635513, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.322186291217804, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.217616580310881, "grad_norm": 0.23269579553536118, "kl": 0.149658203125, "learning_rate": 4.784974093264249e-07, "loss": 0.0006, "reward": 2.499996066093445, "reward_std": 3.5594301834862563e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.22020725388601, "grad_norm": 1.3045431916202663, "kl": 0.06170654296875, "learning_rate": 4.782383419689119e-07, "loss": 0.0012, "reward": 2.4999818801879883, "reward_std": 9.364475999973365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999819993972778, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.22279792746114, "grad_norm": 13.47631285618988, "kl": 0.12646484375, "learning_rate": 4.779792746113989e-07, "loss": 0.0005, "reward": 2.437469244003296, "reward_std": 0.17679856166296304, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374691843986511, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.225388601036269, "grad_norm": 19.257090874550265, "kl": 0.08135986328125, "learning_rate": 4.77720207253886e-07, "loss": -0.0003, "reward": 2.3749598264694214, "reward_std": 0.23149637901906317, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749597668647766, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.227979274611399, "grad_norm": 2.206561954833908, "kl": 0.15771484375, "learning_rate": 4.774611398963731e-07, "loss": 0.0006, "reward": 1.679569959640503, "reward_std": 0.00018433171248943836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1795699745416641, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.230569948186528, "grad_norm": 0.9730524641993075, "kl": 0.083984375, "learning_rate": 4.772020725388601e-07, "loss": 0.0006, "reward": 1.9998382329940796, "reward_std": 1.108695028051443e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998381435871124, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.233160621761658, "grad_norm": 0.7803634350721955, "kl": 0.11572265625, "learning_rate": 4.769430051813471e-07, "loss": 0.0006, "reward": 2.4999921321868896, "reward_std": 6.346814558355618e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.2357512953367875, "grad_norm": 4.491683672925936, "kl": 0.0859375, "learning_rate": 4.7668393782383414e-07, "loss": 0.0003, "reward": 2.499886393547058, "reward_std": 3.21405750582926e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998863339424133, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.238341968911917, "grad_norm": 0.1495562754015289, "kl": 0.0795440673828125, "learning_rate": 4.7642487046632124e-07, "loss": 0.0008, "reward": 2.4999966621398926, "reward_std": 1.8294344954483677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.240932642487047, "grad_norm": 2.331964953034922, "kl": 0.10107421875, "learning_rate": 4.761658031088083e-07, "loss": -0.0006, "reward": 2.499977469444275, "reward_std": 1.2538307601062115e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977707862854, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 5.243523316062176, "grad_norm": 35.18168684649078, "kl": 0.157470703125, "learning_rate": 4.759067357512953e-07, "loss": 0.0007, "reward": 1.9348769187927246, "reward_std": 0.18204218066239264, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4348769187927246, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.246113989637306, "grad_norm": 6.064152304671751, "kl": 0.0927734375, "learning_rate": 4.7564766839378235e-07, "loss": 0.001, "reward": 1.9447259306907654, "reward_std": 0.0001724832382024033, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4447258710861206, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.248704663212435, "grad_norm": 0.1557743861833267, "kl": 0.074951171875, "learning_rate": 4.7538860103626945e-07, "loss": -0.0005, "reward": 2.4999911785125732, "reward_std": 3.6043558395704167e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999913573265076, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.251295336787565, "grad_norm": 0.21249021004467808, "kl": 0.118896484375, "learning_rate": 4.7512953367875645e-07, "loss": 0.001, "reward": 2.4999886751174927, "reward_std": 2.9805546546413098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884963035583, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.253886010362694, "grad_norm": 0.13388185961559354, "kl": 0.091796875, "learning_rate": 4.748704663212435e-07, "loss": 0.0012, "reward": 2.4999974966049194, "reward_std": 2.211841092503164e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.256476683937824, "grad_norm": 0.10384611493487242, "kl": 0.048828125, "learning_rate": 4.7461139896373056e-07, "loss": 0.0013, "reward": 2.499996066093445, "reward_std": 2.168377875477745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.259067357512953, "grad_norm": 0.41286146074346136, "kl": 0.047119140625, "learning_rate": 4.7435233160621756e-07, "loss": -0.0002, "reward": 2.4999895095825195, "reward_std": 4.048265054734657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.261658031088083, "grad_norm": 11.093720648281675, "kl": 0.15673828125, "learning_rate": 4.7409326424870466e-07, "loss": 0.0008, "reward": 2.4374760389328003, "reward_std": 0.17682615059948148, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374760389328003, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.2642487046632125, "grad_norm": 0.04485272139727189, "kl": 0.100830078125, "learning_rate": 4.738341968911917e-07, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 1.171377107311855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.266839378238342, "grad_norm": 1.305999910431039, "kl": 0.1181640625, "learning_rate": 4.735751295336787e-07, "loss": 0.0008, "reward": 2.499970316886902, "reward_std": 1.1428786933720403e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999703168869019, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.269430051813472, "grad_norm": 0.7582947177992131, "kl": 0.05078125, "learning_rate": 4.7331606217616577e-07, "loss": 0.0005, "reward": 1.9998403787612915, "reward_std": 1.263207673218858e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499840408563614, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.272020725388601, "grad_norm": 0.1503034817943373, "kl": 0.0736083984375, "learning_rate": 4.730569948186529e-07, "loss": -0.0006, "reward": 2.499990701675415, "reward_std": 2.7719107720258762e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.274611398963731, "grad_norm": 0.06364041847068415, "kl": 0.096923828125, "learning_rate": 4.7279792746113987e-07, "loss": 0.0002, "reward": 2.499995470046997, "reward_std": 1.3829541956056346e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.27720207253886, "grad_norm": 2.8009695872641536, "kl": 0.18115234375, "learning_rate": 4.725388601036269e-07, "loss": 0.0013, "reward": 2.4999924898147583, "reward_std": 5.7567809790270985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.27979274611399, "grad_norm": 1.1372837058413379, "kl": 0.072021484375, "learning_rate": 4.72279792746114e-07, "loss": -0.0004, "reward": 1.999854564666748, "reward_std": 1.872285395165818e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998546838760376, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.282383419689119, "grad_norm": 2.378991347336132, "kl": 0.127685546875, "learning_rate": 4.72020725388601e-07, "loss": 0.0006, "reward": 2.499992847442627, "reward_std": 8.28299613431227e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.284974093264249, "grad_norm": 0.562924260445607, "kl": 0.1142578125, "learning_rate": 4.717616580310881e-07, "loss": -0.0005, "reward": 2.4999905824661255, "reward_std": 4.405285949360405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.287564766839378, "grad_norm": 0.5181950000187854, "kl": 0.1455078125, "learning_rate": 4.7150259067357514e-07, "loss": -0.0002, "reward": 1.9998031854629517, "reward_std": 1.1115156326013675e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998033046722412, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 5.290155440414508, "grad_norm": 0.7796277982813166, "kl": 0.07257080078125, "learning_rate": 4.7124352331606214e-07, "loss": 0.0015, "reward": 2.499987244606018, "reward_std": 2.3257948100763315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.2927461139896375, "grad_norm": 1441.9090813129255, "kl": 0.15576171875, "learning_rate": 4.709844559585492e-07, "loss": 0.0001, "reward": 1.9705055952072144, "reward_std": 0.001294660042731266, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4705053865909576, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.295336787564767, "grad_norm": 27.763422929052062, "kl": 0.112548828125, "learning_rate": 4.7072538860103624e-07, "loss": -0.0003, "reward": 2.4987707138061523, "reward_std": 2.335650242457632e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.998770833015442, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.2979274611398965, "grad_norm": 0.03659922843868473, "kl": 0.0985107421875, "learning_rate": 4.704663212435233e-07, "loss": -0.0002, "reward": 2.4999982118606567, "reward_std": 1.6554770354559878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.300518134715026, "grad_norm": 43.25512056085125, "kl": 0.1611328125, "learning_rate": 4.7020725388601035e-07, "loss": 0.0007, "reward": 1.4522658586502075, "reward_std": 0.0009199381747748703, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9522657990455627, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.303108808290156, "grad_norm": 0.4428490575050021, "kl": 0.09796142578125, "learning_rate": 4.699481865284974e-07, "loss": -0.0005, "reward": 2.499997615814209, "reward_std": 1.3435702612696332e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.305699481865285, "grad_norm": 0.16740936216765842, "kl": 0.0474853515625, "learning_rate": 4.696891191709844e-07, "loss": 0.0006, "reward": 2.499979853630066, "reward_std": 4.220144774080836e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999797344207764, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.308290155440415, "grad_norm": 7.065617004590854, "kl": 0.0830078125, "learning_rate": 4.694300518134715e-07, "loss": -0.0001, "reward": 2.437489867210388, "reward_std": 0.17679200713064347, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374898672103882, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.310880829015544, "grad_norm": 22.297933049597205, "kl": 0.048583984375, "learning_rate": 4.6917098445595856e-07, "loss": 0.0004, "reward": 2.499940514564514, "reward_std": 2.7524716870175325e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999406337738037, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.313471502590674, "grad_norm": 3.250714341474138, "kl": 0.0985107421875, "learning_rate": 4.6891191709844556e-07, "loss": 0.0012, "reward": 2.0624446272850037, "reward_std": 0.17678561293860184, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624443888664246, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.316062176165803, "grad_norm": 18.763013860413036, "kl": 0.068603515625, "learning_rate": 4.686528497409326e-07, "loss": -0.0005, "reward": 2.4374810457229614, "reward_std": 0.17680527231124188, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374812245368958, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.318652849740933, "grad_norm": 0.37487403564785005, "kl": 0.1455078125, "learning_rate": 4.6839378238341966e-07, "loss": 0.0006, "reward": 2.499966621398926, "reward_std": 6.136103820608696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999666810035706, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.321243523316062, "grad_norm": 0.08383875803303492, "kl": 0.0526123046875, "learning_rate": 4.681347150259067e-07, "loss": 0.0018, "reward": 2.4999969005584717, "reward_std": 1.298973870689224e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.323834196891192, "grad_norm": 4.800539919076755, "kl": 0.07568359375, "learning_rate": 4.6787564766839377e-07, "loss": 0.0006, "reward": 1.9934781789779663, "reward_std": 9.310352805869115e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4934781789779663, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.3264248704663215, "grad_norm": 3.089813281554356, "kl": 0.18017578125, "learning_rate": 4.676165803108808e-07, "loss": 0.001, "reward": 1.9996094703674316, "reward_std": 4.1392646153326496e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996094405651093, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.329015544041451, "grad_norm": 3.4798608006591842, "kl": 0.060302734375, "learning_rate": 4.673575129533678e-07, "loss": 0.0001, "reward": 2.499972701072693, "reward_std": 1.2096402770112036e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999972641468048, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.331606217616581, "grad_norm": 1.4020570072343297, "kl": 0.07763671875, "learning_rate": 4.670984455958549e-07, "loss": 0.0008, "reward": 1.9996461868286133, "reward_std": 7.083769673954521e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996461868286133, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.33419689119171, "grad_norm": 18.770998896832978, "kl": 0.10400390625, "learning_rate": 4.66839378238342e-07, "loss": 0.0008, "reward": 1.986478328704834, "reward_std": 0.00022058276499592466, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4864783883094788, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.33678756476684, "grad_norm": 1.5290833770478713, "kl": 0.1123046875, "learning_rate": 4.66580310880829e-07, "loss": 0.0011, "reward": 2.4999831914901733, "reward_std": 9.041373857598956e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999830722808838, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.339378238341969, "grad_norm": 3.365025108209568, "kl": 0.17138671875, "learning_rate": 4.6632124352331603e-07, "loss": 0.0008, "reward": 1.4974809288978577, "reward_std": 8.409101064899005e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9974808692932129, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.341968911917099, "grad_norm": 0.516722815518361, "kl": 0.07373046875, "learning_rate": 4.660621761658031e-07, "loss": 0.0013, "reward": 2.49996817111969, "reward_std": 6.534461249430024e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999680519104004, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.344559585492228, "grad_norm": 42.9464911958448, "kl": 0.11328125, "learning_rate": 4.6580310880829014e-07, "loss": 0.0001, "reward": 1.8297033905982971, "reward_std": 0.0010806132086145226, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.329703450202942, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 5.347150259067358, "grad_norm": 3.571548149362702, "kl": 0.088134765625, "learning_rate": 4.655440414507772e-07, "loss": 0.0007, "reward": 2.499848246574402, "reward_std": 1.895615514513338e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999848186969757, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.349740932642487, "grad_norm": 1.5485089860855472, "kl": 0.10986328125, "learning_rate": 4.6528497409326424e-07, "loss": 0.0, "reward": 2.49999463558197, "reward_std": 6.795088211219991e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.352331606217617, "grad_norm": 0.46202290980096195, "kl": 0.1143798828125, "learning_rate": 4.6502590673575124e-07, "loss": 0.0004, "reward": 2.4999818801879883, "reward_std": 5.970556117063097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818801879883, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.3549222797927465, "grad_norm": 0.44432918152238815, "kl": 0.03643798828125, "learning_rate": 4.647668393782383e-07, "loss": 0.0007, "reward": 2.499997615814209, "reward_std": 2.428942707410897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.357512953367876, "grad_norm": 2.773157396815389, "kl": 0.388916015625, "learning_rate": 4.645077720207254e-07, "loss": 0.001, "reward": 2.499998688697815, "reward_std": 1.1089787221862935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.360103626943006, "grad_norm": 9.300575965577616, "kl": 0.08203125, "learning_rate": 4.642487046632124e-07, "loss": 0.0005, "reward": 2.3749853372573853, "reward_std": 0.23146944576802753, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.87498539686203, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.362694300518135, "grad_norm": 1.1781889635855334, "kl": 0.106689453125, "learning_rate": 4.6398963730569945e-07, "loss": -0.0001, "reward": 1.9999173879623413, "reward_std": 1.2754080671584234e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999174177646637, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.365284974093265, "grad_norm": 1.3031170850880578, "kl": 0.082763671875, "learning_rate": 4.637305699481865e-07, "loss": -0.0, "reward": 2.4999780654907227, "reward_std": 1.0340319136048493e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781250953674, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.367875647668393, "grad_norm": 1.978468819335199, "kl": 0.150634765625, "learning_rate": 4.6347150259067356e-07, "loss": -0.0001, "reward": 2.499780058860779, "reward_std": 2.750314820332278e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9997801184654236, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.370466321243523, "grad_norm": 32.61666273121922, "kl": 0.1259765625, "learning_rate": 4.632124352331606e-07, "loss": -0.0, "reward": 1.9938130378723145, "reward_std": 9.415466047357768e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938131272792816, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.373056994818652, "grad_norm": 0.4298273314831697, "kl": 0.1123046875, "learning_rate": 4.6295336787564766e-07, "loss": 0.001, "reward": 2.499993681907654, "reward_std": 3.947814434468455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993622303009, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.375647668393782, "grad_norm": 12.019764169042013, "kl": 0.11474609375, "learning_rate": 4.6269430051813466e-07, "loss": 0.0003, "reward": 1.996951937675476, "reward_std": 8.88830535359375e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4969519674777985, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.3782383419689115, "grad_norm": 7.571780400970682, "kl": 0.70361328125, "learning_rate": 4.624352331606217e-07, "loss": 0.0023, "reward": 2.4999905824661255, "reward_std": 7.733488473604666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.380829015544041, "grad_norm": 2.9010578104469884, "kl": 0.093505859375, "learning_rate": 4.621761658031088e-07, "loss": -0.0005, "reward": 2.499996066093445, "reward_std": 3.613636465615855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.383419689119171, "grad_norm": 0.20628868597865285, "kl": 0.045166015625, "learning_rate": 4.619170984455958e-07, "loss": 0.0008, "reward": 2.499986171722412, "reward_std": 3.1702651881460042e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986171722412, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.3860103626943, "grad_norm": 0.06924294406913326, "kl": 0.093017578125, "learning_rate": 4.616580310880829e-07, "loss": 0.0007, "reward": 2.499999165534973, "reward_std": 1.1082049127253413e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.38860103626943, "grad_norm": 0.3183844852046282, "kl": 0.07275390625, "learning_rate": 4.6139896373056993e-07, "loss": -0.0002, "reward": 2.4998854398727417, "reward_std": 6.824276283623476e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998854398727417, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.391191709844559, "grad_norm": 0.35241318307090086, "kl": 0.083984375, "learning_rate": 4.611398963730569e-07, "loss": 0.0014, "reward": 2.4999953508377075, "reward_std": 3.296984459666419e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.393782383419689, "grad_norm": 1.2599272912305945, "kl": 0.107177734375, "learning_rate": 4.6088082901554403e-07, "loss": 0.0005, "reward": 1.9987984895706177, "reward_std": 2.61850881315695e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498798429965973, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.396373056994818, "grad_norm": 0.7021500807060977, "kl": 0.067626953125, "learning_rate": 4.606217616580311e-07, "loss": 0.0006, "reward": 2.499995470046997, "reward_std": 3.372900181375371e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.398963730569948, "grad_norm": 0.20631606785383624, "kl": 0.12646484375, "learning_rate": 4.603626943005181e-07, "loss": -0.0002, "reward": 2.4999841451644897, "reward_std": 3.011648686879198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999843835830688, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 5.401554404145077, "grad_norm": 109.22476920514463, "kl": 0.11474609375, "learning_rate": 4.6010362694300514e-07, "loss": 0.0002, "reward": 1.935310959815979, "reward_std": 0.08227558277485514, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4353110194206238, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.404145077720207, "grad_norm": 0.0870789919329384, "kl": 0.080078125, "learning_rate": 4.5984455958549224e-07, "loss": 0.0015, "reward": 2.49997341632843, "reward_std": 2.1756844148512755e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973177909851, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.4067357512953365, "grad_norm": 3.654903948211217, "kl": 0.14111328125, "learning_rate": 4.5958549222797924e-07, "loss": 0.0002, "reward": 2.499973773956299, "reward_std": 1.4916741918113985e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999738931655884, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.409326424870466, "grad_norm": 0.7304761168384192, "kl": 0.113525390625, "learning_rate": 4.593264248704663e-07, "loss": 0.0004, "reward": 1.9996466040611267, "reward_std": 1.172649513137003e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996465146541595, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.4119170984455955, "grad_norm": 6.626392757495933, "kl": 0.092041015625, "learning_rate": 4.5906735751295335e-07, "loss": 0.0006, "reward": 2.4987927675247192, "reward_std": 9.908435117722547e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9987927079200745, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.414507772020725, "grad_norm": 0.12737609264142025, "kl": 0.131103515625, "learning_rate": 4.5880829015544035e-07, "loss": 0.0005, "reward": 2.499995470046997, "reward_std": 2.684502192096261e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.417098445595855, "grad_norm": 10.440643239896733, "kl": 0.1435546875, "learning_rate": 4.5854922279792745e-07, "loss": 0.0002, "reward": 1.9852285385131836, "reward_std": 0.00026295995030523045, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4852285981178284, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.419689119170984, "grad_norm": 0.14474505058066395, "kl": 0.04144287109375, "learning_rate": 4.582901554404145e-07, "loss": 0.0006, "reward": 2.4999972581863403, "reward_std": 1.935968896304985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.422279792746114, "grad_norm": 0.4666911490847812, "kl": 0.04620361328125, "learning_rate": 4.580310880829015e-07, "loss": 0.0002, "reward": 2.4999961853027344, "reward_std": 2.240518426788185e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.424870466321243, "grad_norm": 0.23685128998960037, "kl": 0.115478515625, "learning_rate": 4.5777202072538856e-07, "loss": 0.0013, "reward": 2.499997615814209, "reward_std": 1.6561365896450297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.427461139896373, "grad_norm": 0.1170719647192125, "kl": 0.10888671875, "learning_rate": 4.5751295336787566e-07, "loss": 0.0003, "reward": 2.4999972581863403, "reward_std": 1.4930296288184763e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.430051813471502, "grad_norm": 25.397776203074454, "kl": 0.115966796875, "learning_rate": 4.5725388601036266e-07, "loss": -0.0001, "reward": 2.3748977184295654, "reward_std": 0.2316294201746132, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8748977184295654, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.432642487046632, "grad_norm": 3.5070219488633594, "kl": 0.1563720703125, "learning_rate": 4.569948186528497e-07, "loss": 0.0006, "reward": 1.8793118000030518, "reward_std": 0.00023071511327543703, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3793119192123413, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 5.435233160621761, "grad_norm": 257.7513122291959, "kl": 0.1572265625, "learning_rate": 4.5673575129533677e-07, "loss": 0.0007, "reward": 1.390673577785492, "reward_std": 0.36016987028415315, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8906736075878143, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 36.125, "epoch": 5.437823834196891, "grad_norm": 39.058394246433764, "kl": 0.20263671875, "learning_rate": 4.5647668393782377e-07, "loss": 0.0005, "reward": 1.9938093423843384, "reward_std": 0.00020790389680769295, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938094019889832, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.4404145077720205, "grad_norm": 1.6093715633645567, "kl": 0.0562744140625, "learning_rate": 4.562176165803109e-07, "loss": 0.0008, "reward": 1.9986222982406616, "reward_std": 3.4460435131222766e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986222684383392, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.44300518134715, "grad_norm": 0.13423746272452125, "kl": 0.15283203125, "learning_rate": 4.5595854922279793e-07, "loss": 0.0006, "reward": 2.4999983310699463, "reward_std": 1.0358177462421736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.44559585492228, "grad_norm": 1.0632176547632588, "kl": 0.070068359375, "learning_rate": 4.5569948186528493e-07, "loss": -0.0001, "reward": 2.4999743700027466, "reward_std": 7.153669457693468e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999744296073914, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.448186528497409, "grad_norm": 0.8483378456759624, "kl": 0.07275390625, "learning_rate": 4.55440414507772e-07, "loss": 0.0005, "reward": 2.499990224838257, "reward_std": 8.1300312899657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.450777202072539, "grad_norm": 17.511566349120763, "kl": 0.09375, "learning_rate": 4.5518134715025903e-07, "loss": 0.0001, "reward": 2.4999918937683105, "reward_std": 6.739405705502577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.453367875647668, "grad_norm": 0.7993308030609892, "kl": 0.130126953125, "learning_rate": 4.549222797927461e-07, "loss": 0.0006, "reward": 2.4999923706054688, "reward_std": 4.377476869876773e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.455958549222798, "grad_norm": 1.6610785064069333, "kl": 0.0888671875, "learning_rate": 4.5466321243523314e-07, "loss": 0.0004, "reward": 2.49998140335083, "reward_std": 9.380372830491979e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999814629554749, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.458549222797927, "grad_norm": 0.07887880199120834, "kl": 0.1123046875, "learning_rate": 4.544041450777202e-07, "loss": 0.0, "reward": 2.499997138977051, "reward_std": 2.263416718051303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.461139896373057, "grad_norm": 5.114609539002331, "kl": 0.0760498046875, "learning_rate": 4.541450777202072e-07, "loss": 0.0003, "reward": 1.9998487830162048, "reward_std": 2.6893015046880464e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998488128185272, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.463730569948186, "grad_norm": 3.3914473645730507, "kl": 0.182861328125, "learning_rate": 4.538860103626943e-07, "loss": 0.0008, "reward": 1.9211264848709106, "reward_std": 0.00027640461863143173, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4211264252662659, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.466321243523316, "grad_norm": 0.7431498933025664, "kl": 0.19921875, "learning_rate": 4.5362694300518135e-07, "loss": 0.0008, "reward": 2.4999760389328003, "reward_std": 6.2817168782203225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999760389328003, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.4689119170984455, "grad_norm": 1.7548006123777358, "kl": 0.1070556640625, "learning_rate": 4.5336787564766835e-07, "loss": -0.0005, "reward": 1.9984501004219055, "reward_std": 3.3719107818797056e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498450219631195, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.471502590673575, "grad_norm": 0.17352945210498388, "kl": 0.0966796875, "learning_rate": 4.531088082901554e-07, "loss": 0.0007, "reward": 2.499997854232788, "reward_std": 1.8830323256224801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.474093264248705, "grad_norm": 0.8317889210757883, "kl": 0.0552978515625, "learning_rate": 4.5284974093264245e-07, "loss": 0.0001, "reward": 2.4999839067459106, "reward_std": 9.960845773093752e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839067459106, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.476683937823834, "grad_norm": 4.660650933004875, "kl": 0.12451171875, "learning_rate": 4.5259067357512956e-07, "loss": 0.0, "reward": 2.499933958053589, "reward_std": 1.3370431361181545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999341368675232, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.479274611398964, "grad_norm": 3.8197825762919737, "kl": 0.099365234375, "learning_rate": 4.5233160621761656e-07, "loss": 0.0001, "reward": 1.993431806564331, "reward_std": 0.0001893609992293932, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.493431806564331, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.481865284974093, "grad_norm": 0.17468407121409668, "kl": 0.05078125, "learning_rate": 4.520725388601036e-07, "loss": 0.0005, "reward": 2.4999953508377075, "reward_std": 2.761796849881648e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.484455958549223, "grad_norm": 11.985009066598911, "kl": 0.159423828125, "learning_rate": 4.5181347150259066e-07, "loss": -0.0, "reward": 2.4999886751174927, "reward_std": 1.3316227523318958e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.487046632124352, "grad_norm": 6.028526915400043, "kl": 0.1678466796875, "learning_rate": 4.515544041450777e-07, "loss": 0.0004, "reward": 1.9817107319831848, "reward_std": 0.004229917497241331, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.48171067237854, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.489637305699482, "grad_norm": 0.35100847570938376, "kl": 0.072998046875, "learning_rate": 4.5129533678756477e-07, "loss": -0.001, "reward": 2.499992847442627, "reward_std": 4.280905727682693e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.492227979274611, "grad_norm": 4.4446664368211, "kl": 0.4342041015625, "learning_rate": 4.510362694300518e-07, "loss": 0.0016, "reward": 2.4374722242355347, "reward_std": 0.17680404841189556, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374722838401794, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.494818652849741, "grad_norm": 4.078536641069688, "kl": 0.136474609375, "learning_rate": 4.507772020725388e-07, "loss": 0.0015, "reward": 2.4999375343322754, "reward_std": 4.1198954022547696e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999374151229858, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.4974093264248705, "grad_norm": 1.3657432669730982, "kl": 0.117431640625, "learning_rate": 4.505181347150259e-07, "loss": 0.0001, "reward": 2.4999449253082275, "reward_std": 1.496618210694578e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999448657035828, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.5, "grad_norm": 2.373812144516341, "kl": 0.080322265625, "learning_rate": 4.50259067357513e-07, "loss": 0.0001, "reward": 2.4999889135360718, "reward_std": 6.5248079863522435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.5025906735751295, "grad_norm": 0.14415498418273912, "kl": 0.095458984375, "learning_rate": 4.5e-07, "loss": 0.0014, "reward": 2.4999982118606567, "reward_std": 1.468996970288572e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.505181347150259, "grad_norm": 3.393357852437887, "kl": 0.108642578125, "learning_rate": 4.4974093264248703e-07, "loss": 0.0015, "reward": 2.499962568283081, "reward_std": 2.582822708063759e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999624490737915, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.507772020725389, "grad_norm": 6.333846649146768, "kl": 0.077392578125, "learning_rate": 4.494818652849741e-07, "loss": 0.0009, "reward": 1.9896284937858582, "reward_std": 0.00012527380079063732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4896283745765686, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 5.510362694300518, "grad_norm": 0.27204167394002254, "kl": 0.05023193359375, "learning_rate": 4.492227979274611e-07, "loss": -0.0004, "reward": 2.4999969005584717, "reward_std": 2.763033876362897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.512953367875648, "grad_norm": 0.22154252629966525, "kl": 0.09124755859375, "learning_rate": 4.489637305699482e-07, "loss": -0.0008, "reward": 2.4999849796295166, "reward_std": 3.4542750881882966e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850988388062, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.515544041450777, "grad_norm": 0.7903013348352401, "kl": 0.053955078125, "learning_rate": 4.4870466321243524e-07, "loss": 0.0009, "reward": 2.4999918937683105, "reward_std": 4.760370188705565e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.518134715025907, "grad_norm": 0.12352061277718034, "kl": 0.10546875, "learning_rate": 4.4844559585492224e-07, "loss": 0.001, "reward": 2.4999953508377075, "reward_std": 2.1942207695246907e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.520725388601036, "grad_norm": 39.76802694769826, "kl": 0.142822265625, "learning_rate": 4.481865284974093e-07, "loss": 0.0005, "reward": 1.87482488155365, "reward_std": 0.23157009798887884, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.37482488155365, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.523316062176166, "grad_norm": 2.2846823115254242, "kl": 0.171630859375, "learning_rate": 4.479274611398964e-07, "loss": 0.0008, "reward": 1.9938727021217346, "reward_std": 5.393054368596495e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938727915287018, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.525906735751295, "grad_norm": 16.41539461371132, "kl": 0.0927734375, "learning_rate": 4.476683937823834e-07, "loss": 0.0008, "reward": 2.4374775886535645, "reward_std": 0.17680242723702122, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937477469444275, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.528497409326425, "grad_norm": 4.728810302572888, "kl": 0.13134765625, "learning_rate": 4.4740932642487045e-07, "loss": 0.0004, "reward": 1.9984426498413086, "reward_std": 7.29897587348205e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498442828655243, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.5310880829015545, "grad_norm": 100.55426032067854, "kl": 0.162109375, "learning_rate": 4.471502590673575e-07, "loss": 0.0006, "reward": 2.1247791051864624, "reward_std": 0.2315799526804767, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6247789859771729, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.533678756476684, "grad_norm": 0.5371193730694697, "kl": 0.06689453125, "learning_rate": 4.468911917098445e-07, "loss": -0.0002, "reward": 2.499989151954651, "reward_std": 3.8064266902892996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.536269430051814, "grad_norm": 2.225212087178764, "kl": 0.0631103515625, "learning_rate": 4.466321243523316e-07, "loss": -0.0006, "reward": 2.499995708465576, "reward_std": 6.409569550669403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.538860103626943, "grad_norm": 1.1301952180355956, "kl": 0.106689453125, "learning_rate": 4.4637305699481866e-07, "loss": 0.0013, "reward": 2.4999881982803345, "reward_std": 7.659103403057088e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.541450777202073, "grad_norm": 0.6437848025521882, "kl": 0.050537109375, "learning_rate": 4.4611398963730566e-07, "loss": 0.0012, "reward": 2.4999924898147583, "reward_std": 4.330290323650843e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.544041450777202, "grad_norm": 1.942622423904635, "kl": 0.119873046875, "learning_rate": 4.458549222797927e-07, "loss": 0.0007, "reward": 2.4999877214431763, "reward_std": 1.4025659197614004e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999876618385315, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.546632124352332, "grad_norm": 9.867954327090045, "kl": 0.157470703125, "learning_rate": 4.4559585492227977e-07, "loss": 0.0007, "reward": 2.499904155731201, "reward_std": 2.4772121832938865e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999040961265564, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.549222797927461, "grad_norm": 3.6160959552145524, "kl": 0.1307373046875, "learning_rate": 4.453367875647668e-07, "loss": 0.0013, "reward": 1.9033666849136353, "reward_std": 0.0002599953592152815, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4033666551113129, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.551813471502591, "grad_norm": 0.09775863213702475, "kl": 0.1177978515625, "learning_rate": 4.450777202072539e-07, "loss": 0.0008, "reward": 2.4999953508377075, "reward_std": 1.7423860185772355e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.55440414507772, "grad_norm": 2.243589807058799, "kl": 0.16162109375, "learning_rate": 4.4481865284974093e-07, "loss": -0.0006, "reward": 1.9998326301574707, "reward_std": 1.964680905075511e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998327493667603, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.55699481865285, "grad_norm": 0.7656459826492177, "kl": 0.044921875, "learning_rate": 4.4455958549222793e-07, "loss": -0.0006, "reward": 2.4999676942825317, "reward_std": 8.186028026102576e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999676942825317, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.5595854922279795, "grad_norm": 30.338972631246072, "kl": 0.080810546875, "learning_rate": 4.4430051813471503e-07, "loss": -0.0004, "reward": 2.499507427215576, "reward_std": 6.0072918131481856e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9995075464248657, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.562176165803109, "grad_norm": 31.716876967332094, "kl": 0.13232421875, "learning_rate": 4.440414507772021e-07, "loss": 0.0008, "reward": 2.1871031522750854, "reward_std": 0.259071770790797, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6871030926704407, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.564766839378239, "grad_norm": 0.1507314942886478, "kl": 0.13623046875, "learning_rate": 4.437823834196891e-07, "loss": -0.0, "reward": 2.499996781349182, "reward_std": 2.495411820291338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 5.567357512953368, "grad_norm": 1.8812029066111369, "kl": 0.2052001953125, "learning_rate": 4.4352331606217614e-07, "loss": 0.0015, "reward": 2.499962091445923, "reward_std": 1.1206122508156113e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999619722366333, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.569948186528498, "grad_norm": 1.4599234945564576, "kl": 0.14697265625, "learning_rate": 4.432642487046632e-07, "loss": 0.0013, "reward": 2.49997341632843, "reward_std": 1.0891386864386732e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973475933075, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.572538860103627, "grad_norm": 0.16933054186855132, "kl": 0.0618896484375, "learning_rate": 4.4300518134715024e-07, "loss": 0.0014, "reward": 2.4999969005584717, "reward_std": 2.5736694340139366e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.575129533678757, "grad_norm": 1.5100692245043172, "kl": 0.18115234375, "learning_rate": 4.427461139896373e-07, "loss": -0.0004, "reward": 2.4999566078186035, "reward_std": 1.4240009363675199e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999567866325378, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.577720207253886, "grad_norm": 0.27624674247293, "kl": 0.107666015625, "learning_rate": 4.4248704663212435e-07, "loss": -0.0001, "reward": 2.4999914169311523, "reward_std": 3.9234147379829665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.580310880829016, "grad_norm": 0.23845592410913444, "kl": 0.08056640625, "learning_rate": 4.4222797927461135e-07, "loss": 0.0006, "reward": 2.4999979734420776, "reward_std": 3.0633586334261054e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.582901554404145, "grad_norm": 2.7055092519607125, "kl": 0.118408203125, "learning_rate": 4.4196891191709845e-07, "loss": 0.0002, "reward": 2.4999910593032837, "reward_std": 7.3566588412177225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.585492227979275, "grad_norm": 3.495411818117292, "kl": 0.0306396484375, "learning_rate": 4.417098445595855e-07, "loss": 0.0007, "reward": 2.499976396560669, "reward_std": 3.115767276540282e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976396560669, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.5880829015544045, "grad_norm": 1.8742573811680148, "kl": 0.151611328125, "learning_rate": 4.414507772020725e-07, "loss": 0.0014, "reward": 2.499990940093994, "reward_std": 6.770880531803414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.590673575129534, "grad_norm": 0.8542966163020366, "kl": 0.077880859375, "learning_rate": 4.4119170984455956e-07, "loss": 0.0007, "reward": 2.499993324279785, "reward_std": 5.478145794768352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 5.5932642487046635, "grad_norm": 24.939876868597928, "kl": 0.173828125, "learning_rate": 4.409326424870466e-07, "loss": -0.0, "reward": 1.9968894720077515, "reward_std": 0.0008183199162203891, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496889442205429, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.595854922279793, "grad_norm": 0.1297434177766517, "kl": 0.0562744140625, "learning_rate": 4.4067357512953366e-07, "loss": 0.0008, "reward": 2.499990463256836, "reward_std": 3.38557981649501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.598445595854923, "grad_norm": 5.402683801644662, "kl": 0.0650634765625, "learning_rate": 4.404145077720207e-07, "loss": 0.0001, "reward": 2.499993085861206, "reward_std": 5.6865879969336675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.601036269430052, "grad_norm": 1.466992224835068, "kl": 0.071533203125, "learning_rate": 4.4015544041450777e-07, "loss": -0.0008, "reward": 2.4999821186065674, "reward_std": 6.864327815492288e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821782112122, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.603626943005182, "grad_norm": 11.425689537387154, "kl": 0.14208984375, "learning_rate": 4.3989637305699477e-07, "loss": 0.0004, "reward": 1.9941102862358093, "reward_std": 0.000209305703492646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.494110345840454, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.606217616580311, "grad_norm": 239.6615541810546, "kl": 0.1412353515625, "learning_rate": 4.396373056994818e-07, "loss": 0.0003, "reward": 1.9959449172019958, "reward_std": 0.0002583137900842303, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4959449172019958, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.608808290155441, "grad_norm": 1.2062004956838557, "kl": 0.0689697265625, "learning_rate": 4.3937823834196893e-07, "loss": 0.0003, "reward": 2.499992609024048, "reward_std": 6.114897701081645e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.61139896373057, "grad_norm": 1.3166438132769336, "kl": 0.03790283203125, "learning_rate": 4.3911917098445593e-07, "loss": -0.0001, "reward": 2.499997854232788, "reward_std": 2.2870597149449168e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.6139896373057, "grad_norm": 1.1466087835920824, "kl": 0.1708984375, "learning_rate": 4.38860103626943e-07, "loss": -0.0001, "reward": 2.4999749660491943, "reward_std": 6.668615469607175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999750256538391, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.616580310880829, "grad_norm": 7.221542041941564, "kl": 0.101806640625, "learning_rate": 4.3860103626943003e-07, "loss": 0.001, "reward": 1.9987772107124329, "reward_std": 6.658512620560941e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987771809101105, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.619170984455959, "grad_norm": 1.3339165040548684, "kl": 0.10986328125, "learning_rate": 4.383419689119171e-07, "loss": 0.0, "reward": 1.999162197113037, "reward_std": 1.8814079226103786e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991623163223267, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.6217616580310885, "grad_norm": 4.970004104080629, "kl": 0.22265625, "learning_rate": 4.3808290155440414e-07, "loss": 0.0012, "reward": 2.499962329864502, "reward_std": 1.4454813026532065e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999962329864502, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.624352331606218, "grad_norm": 39.739675352619244, "kl": 0.1358642578125, "learning_rate": 4.378238341968912e-07, "loss": 0.0012, "reward": 2.4346296787261963, "reward_std": 0.18485839097229473, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9346295595169067, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 5.626943005181348, "grad_norm": 0.08676030760441354, "kl": 0.20556640625, "learning_rate": 4.375647668393782e-07, "loss": 0.0002, "reward": 2.49999737739563, "reward_std": 2.0557626214667835e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.629533678756477, "grad_norm": 2.032892072792795, "kl": 0.07177734375, "learning_rate": 4.3730569948186524e-07, "loss": 0.001, "reward": 2.499976396560669, "reward_std": 7.094038096511213e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999763369560242, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.632124352331607, "grad_norm": 14.289511792638626, "kl": 0.208984375, "learning_rate": 4.3704663212435235e-07, "loss": 0.0008, "reward": 1.3456003665924072, "reward_std": 0.0008903330308385193, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8456003963947296, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.634715025906736, "grad_norm": 1.1563340439937397, "kl": 0.092041015625, "learning_rate": 4.3678756476683935e-07, "loss": -0.0002, "reward": 2.499992609024048, "reward_std": 3.141753779800638e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.637305699481866, "grad_norm": 4.4233065360139365, "kl": 0.11376953125, "learning_rate": 4.365284974093264e-07, "loss": 0.0017, "reward": 2.499991297721863, "reward_std": 5.199670795263955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.639896373056995, "grad_norm": 2.064123143725936, "kl": 0.186767578125, "learning_rate": 4.3626943005181345e-07, "loss": 0.0008, "reward": 2.4999680519104004, "reward_std": 1.670425763222738e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999679327011108, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.642487046632124, "grad_norm": 14.289675381810858, "kl": 0.093994140625, "learning_rate": 4.3601036269430045e-07, "loss": 0.0005, "reward": 1.771733582019806, "reward_std": 0.2589497046137694, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2717334926128387, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.6450777202072535, "grad_norm": 4.710799672773662, "kl": 0.095703125, "learning_rate": 4.3575129533678756e-07, "loss": 0.0006, "reward": 2.4999676942825317, "reward_std": 2.3994149273676157e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967634677887, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.647668393782383, "grad_norm": 3.0153243824339873, "kl": 0.099853515625, "learning_rate": 4.354922279792746e-07, "loss": -0.0002, "reward": 1.99897038936615, "reward_std": 2.9745113579338067e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989705383777618, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.650259067357513, "grad_norm": 1.6771078238030759, "kl": 0.19775390625, "learning_rate": 4.352331606217616e-07, "loss": 0.0003, "reward": 1.9992891550064087, "reward_std": 3.6577896935341414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992891550064087, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.652849740932642, "grad_norm": 7.4613682335426255, "kl": 0.15087890625, "learning_rate": 4.3497409326424866e-07, "loss": 0.0, "reward": 1.9997640252113342, "reward_std": 2.9621826797665562e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997640252113342, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.655440414507772, "grad_norm": 5.427593215024822, "kl": 0.0863037109375, "learning_rate": 4.3471502590673577e-07, "loss": 0.0002, "reward": 2.499963641166687, "reward_std": 1.2640531849683612e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999963641166687, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.658031088082901, "grad_norm": 0.16297731230208135, "kl": 0.04840087890625, "learning_rate": 4.3445595854922277e-07, "loss": 0.0002, "reward": 2.4999955892562866, "reward_std": 2.29190663958434e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.660621761658031, "grad_norm": 0.3463993011605354, "kl": 0.0606689453125, "learning_rate": 4.341968911917098e-07, "loss": 0.0002, "reward": 2.499994993209839, "reward_std": 3.956152738737728e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.66321243523316, "grad_norm": 1.8063613958281464, "kl": 0.0745849609375, "learning_rate": 4.339378238341969e-07, "loss": 0.0005, "reward": 2.499988079071045, "reward_std": 9.938718818602865e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.66580310880829, "grad_norm": 0.6451298478322072, "kl": 0.158447265625, "learning_rate": 4.336787564766839e-07, "loss": 0.0005, "reward": 2.499992609024048, "reward_std": 5.973681254545227e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.668393782383419, "grad_norm": 89.7946271326145, "kl": 0.062255859375, "learning_rate": 4.33419689119171e-07, "loss": -0.0001, "reward": 1.9983490705490112, "reward_std": 6.310990966085228e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498349130153656, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 37.375, "epoch": 5.670984455958549, "grad_norm": 0.6024938866547718, "kl": 0.37255859375, "learning_rate": 4.3316062176165803e-07, "loss": 0.0015, "reward": 2.4999897480010986, "reward_std": 5.091042680760438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898076057434, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.6735751295336785, "grad_norm": 1.9971883633147045, "kl": 0.039306640625, "learning_rate": 4.3290155440414503e-07, "loss": 0.0005, "reward": 2.4999808073043823, "reward_std": 1.1984910997853149e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999806880950928, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.676165803108808, "grad_norm": 0.027292002140266926, "kl": 0.123046875, "learning_rate": 4.326424870466321e-07, "loss": -0.0001, "reward": 2.4999983310699463, "reward_std": 1.0307929301234253e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.678756476683938, "grad_norm": 2.986754924318519, "kl": 0.11767578125, "learning_rate": 4.323834196891192e-07, "loss": 0.0004, "reward": 2.4999624490737915, "reward_std": 1.8082914607475686e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999624490737915, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.681347150259067, "grad_norm": 1.2617264996412625, "kl": 0.102783203125, "learning_rate": 4.321243523316062e-07, "loss": 0.0003, "reward": 1.9998027086257935, "reward_std": 1.0114108818015666e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998028874397278, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.683937823834197, "grad_norm": 16.842615074525348, "kl": 0.135986328125, "learning_rate": 4.3186528497409324e-07, "loss": 0.0012, "reward": 2.4999572038650513, "reward_std": 3.409323107916862e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999571442604065, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.686528497409326, "grad_norm": 0.10547165433096149, "kl": 0.0308837890625, "learning_rate": 4.316062176165803e-07, "loss": 0.0003, "reward": 2.499996781349182, "reward_std": 2.33273021876812e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.689119170984456, "grad_norm": 0.11876973621279549, "kl": 0.04388427734375, "learning_rate": 4.313471502590673e-07, "loss": -0.0001, "reward": 2.49999737739563, "reward_std": 2.318077463314694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.691709844559585, "grad_norm": 4.157460450264975, "kl": 0.10272216796875, "learning_rate": 4.310880829015544e-07, "loss": 0.001, "reward": 2.4999924898147583, "reward_std": 7.367076932496275e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.694300518134715, "grad_norm": 2.0254568886724633, "kl": 0.092529296875, "learning_rate": 4.3082901554404145e-07, "loss": -0.0007, "reward": 1.9981070756912231, "reward_std": 5.517061708815163e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981070756912231, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.696891191709844, "grad_norm": 1.0029385563081883, "kl": 0.090087890625, "learning_rate": 4.3056994818652845e-07, "loss": 0.0008, "reward": 2.499942898750305, "reward_std": 7.17654359050357e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999428987503052, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.699481865284974, "grad_norm": 12.033391575075846, "kl": 0.13916015625, "learning_rate": 4.303108808290155e-07, "loss": 0.001, "reward": 1.9995281100273132, "reward_std": 4.938871052218019e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995281100273132, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.7020725388601035, "grad_norm": 3.2803528889727325, "kl": 0.117919921875, "learning_rate": 4.3005181347150256e-07, "loss": -0.0002, "reward": 2.4999881982803345, "reward_std": 9.067382279681624e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988317489624, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.704663212435233, "grad_norm": 0.19412235106860593, "kl": 0.0777587890625, "learning_rate": 4.297927461139896e-07, "loss": 0.0, "reward": 2.499997138977051, "reward_std": 2.6640018973012047e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.7072538860103625, "grad_norm": 1.605406201031802, "kl": 0.1142578125, "learning_rate": 4.2953367875647666e-07, "loss": -0.0001, "reward": 2.499987006187439, "reward_std": 5.200048690312542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.709844559585492, "grad_norm": 6.311186207393324, "kl": 0.14990234375, "learning_rate": 4.292746113989637e-07, "loss": 0.0004, "reward": 1.868963897228241, "reward_std": 0.00039028138451158156, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.368963897228241, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.712435233160622, "grad_norm": 0.5214205528297102, "kl": 0.0902099609375, "learning_rate": 4.290155440414507e-07, "loss": -0.0005, "reward": 2.4999823570251465, "reward_std": 6.897247658343986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982476234436, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.715025906735751, "grad_norm": 0.0781231611815751, "kl": 0.097412109375, "learning_rate": 4.287564766839378e-07, "loss": 0.001, "reward": 2.4999947547912598, "reward_std": 1.2366082131620715e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.717616580310881, "grad_norm": 10.094539178504037, "kl": 0.1123046875, "learning_rate": 4.284974093264249e-07, "loss": 0.0008, "reward": 2.4998950958251953, "reward_std": 4.9064010909205535e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998952150344849, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.72020725388601, "grad_norm": 3.582528177144623, "kl": 0.074462890625, "learning_rate": 4.282383419689119e-07, "loss": -0.0007, "reward": 2.4999806880950928, "reward_std": 9.421323397873493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999808073043823, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.72279792746114, "grad_norm": 0.3510864351194618, "kl": 0.040283203125, "learning_rate": 4.2797927461139893e-07, "loss": -0.0001, "reward": 2.499995708465576, "reward_std": 3.179863938385097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.725388601036269, "grad_norm": 1.277277386059591, "kl": 0.089111328125, "learning_rate": 4.27720207253886e-07, "loss": 0.0004, "reward": 2.499984622001648, "reward_std": 8.090762321444345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845623970032, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.727979274611399, "grad_norm": 0.15247013370314996, "kl": 0.0655517578125, "learning_rate": 4.2746113989637303e-07, "loss": 0.0003, "reward": 2.4999969005584717, "reward_std": 1.8476799255040532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.730569948186528, "grad_norm": 1.1020447186868272, "kl": 0.048583984375, "learning_rate": 4.272020725388601e-07, "loss": -0.0006, "reward": 2.499983787536621, "reward_std": 9.898189318846562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839067459106, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.733160621761658, "grad_norm": 0.15327195532187002, "kl": 0.1552734375, "learning_rate": 4.2694300518134714e-07, "loss": 0.0009, "reward": 2.4999947547912598, "reward_std": 5.267787059892726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.7357512953367875, "grad_norm": 0.5682223176142653, "kl": 0.10302734375, "learning_rate": 4.2668393782383414e-07, "loss": 0.001, "reward": 2.499992847442627, "reward_std": 5.841190215960523e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.738341968911917, "grad_norm": 0.09345128084580233, "kl": 0.08514404296875, "learning_rate": 4.2642487046632124e-07, "loss": -0.0002, "reward": 2.499997615814209, "reward_std": 2.6362197331764037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.740932642487047, "grad_norm": 1.2229681281213352, "kl": 0.063720703125, "learning_rate": 4.261658031088083e-07, "loss": -0.0004, "reward": 2.4999890327453613, "reward_std": 1.2146653489253367e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999890327453613, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.743523316062176, "grad_norm": 0.4383863720346704, "kl": 0.103759765625, "learning_rate": 4.259067357512953e-07, "loss": 0.001, "reward": 2.4999791383743286, "reward_std": 7.784605941196787e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999790787696838, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.746113989637306, "grad_norm": 2.984746817418707, "kl": 0.145263671875, "learning_rate": 4.2564766839378235e-07, "loss": 0.0007, "reward": 1.8090960383415222, "reward_std": 0.00031346285140898544, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3090960681438446, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 5.748704663212435, "grad_norm": 545.6949707212282, "kl": 0.277587890625, "learning_rate": 4.253886010362694e-07, "loss": 0.0011, "reward": 1.2964731454849243, "reward_std": 0.09693628415698186, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7964731454849243, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.751295336787565, "grad_norm": 1.897796598245334, "kl": 0.037841796875, "learning_rate": 4.2512953367875645e-07, "loss": 0.0, "reward": 2.499986171722412, "reward_std": 9.532710123494326e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986171722412, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.753886010362694, "grad_norm": 1.812933322130175, "kl": 0.123779296875, "learning_rate": 4.248704663212435e-07, "loss": 0.0007, "reward": 2.4999877214431763, "reward_std": 1.2107628720059438e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999878406524658, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.756476683937824, "grad_norm": 0.07720415276560684, "kl": 0.063232421875, "learning_rate": 4.2461139896373056e-07, "loss": 0.0007, "reward": 2.499998927116394, "reward_std": 9.553977236009814e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.759067357512953, "grad_norm": 2.158549640568623, "kl": 0.0694580078125, "learning_rate": 4.2435233160621756e-07, "loss": 0.0005, "reward": 2.499944567680359, "reward_std": 1.4589605143555673e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999445676803589, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.761658031088083, "grad_norm": 45.97381184687504, "kl": 0.1533203125, "learning_rate": 4.240932642487046e-07, "loss": 0.0006, "reward": 1.9166672229766846, "reward_std": 0.17812484328169376, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4166671633720398, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.7642487046632125, "grad_norm": 0.08539629948295398, "kl": 0.0830078125, "learning_rate": 4.238341968911917e-07, "loss": -0.0004, "reward": 2.4999974966049194, "reward_std": 2.157066035124444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.766839378238342, "grad_norm": 0.544541626422927, "kl": 0.1572265625, "learning_rate": 4.235751295336787e-07, "loss": 0.0001, "reward": 2.499993085861206, "reward_std": 3.6420831293071387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.769430051813472, "grad_norm": 1.8255674847350292, "kl": 0.099365234375, "learning_rate": 4.2331606217616577e-07, "loss": -0.0007, "reward": 2.499986410140991, "reward_std": 1.2053063187522639e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999865889549255, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.772020725388601, "grad_norm": 3.4899082103546353, "kl": 0.15283203125, "learning_rate": 4.230569948186528e-07, "loss": 0.0009, "reward": 1.8856375217437744, "reward_std": 0.00019633736792457057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3856375217437744, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.774611398963731, "grad_norm": 0.5562292713872523, "kl": 0.09326171875, "learning_rate": 4.2279792746113993e-07, "loss": 0.0009, "reward": 2.4999953508377075, "reward_std": 4.360593266028445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.77720207253886, "grad_norm": 32.21773563182162, "kl": 0.0943603515625, "learning_rate": 4.2253886010362693e-07, "loss": 0.0002, "reward": 2.4999661445617676, "reward_std": 1.1921830491701257e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999661445617676, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.77979274611399, "grad_norm": 4.591035329567472, "kl": 0.1094970703125, "learning_rate": 4.22279792746114e-07, "loss": 0.0015, "reward": 2.4999637603759766, "reward_std": 9.263614174415125e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999635219573975, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.782383419689119, "grad_norm": 1.5780693898666545, "kl": 0.1279296875, "learning_rate": 4.22020725388601e-07, "loss": 0.001, "reward": 2.499992609024048, "reward_std": 8.420148901677749e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924898147583, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.784974093264249, "grad_norm": 0.898988183455102, "kl": 0.13623046875, "learning_rate": 4.2176165803108803e-07, "loss": -0.0003, "reward": 2.4999775886535645, "reward_std": 5.201562998990994e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999777674674988, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.787564766839378, "grad_norm": 0.14780538723510805, "kl": 0.093994140625, "learning_rate": 4.2150259067357514e-07, "loss": 0.0014, "reward": 2.4999959468841553, "reward_std": 2.723076590882556e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.790155440414508, "grad_norm": 0.4833607605083104, "kl": 0.083740234375, "learning_rate": 4.212435233160622e-07, "loss": 0.0009, "reward": 2.4999765157699585, "reward_std": 5.81231142859906e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976634979248, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.7927461139896375, "grad_norm": 0.6182751287784096, "kl": 0.1510009765625, "learning_rate": 4.209844559585492e-07, "loss": 0.0001, "reward": 2.4999940395355225, "reward_std": 5.23471095448258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.795336787564767, "grad_norm": 8.251118998147856, "kl": 0.09051513671875, "learning_rate": 4.2072538860103624e-07, "loss": 0.0002, "reward": 1.9537217617034912, "reward_std": 0.0004934084658998472, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4537217020988464, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.7979274611398965, "grad_norm": 0.24647324225388167, "kl": 0.16015625, "learning_rate": 4.2046632124352324e-07, "loss": 0.0011, "reward": 2.4999977350234985, "reward_std": 1.7614371472518542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.800518134715026, "grad_norm": 2.1113569291233176, "kl": 0.0869140625, "learning_rate": 4.2020725388601035e-07, "loss": 0.0015, "reward": 2.4999892711639404, "reward_std": 7.539571015513502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999890327453613, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.803108808290156, "grad_norm": 0.9203982096512624, "kl": 0.05743408203125, "learning_rate": 4.199481865284974e-07, "loss": 0.0011, "reward": 2.499996781349182, "reward_std": 2.2391941172372753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 5.805699481865285, "grad_norm": 1.4809847645762941, "kl": 0.080322265625, "learning_rate": 4.1968911917098445e-07, "loss": 0.0013, "reward": 2.4999871253967285, "reward_std": 4.650490609492408e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 5.808290155440415, "grad_norm": 0.12401939803459215, "kl": 0.135498046875, "learning_rate": 4.1943005181347145e-07, "loss": 0.0008, "reward": 2.4999990463256836, "reward_std": 9.293904383866902e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.810880829015544, "grad_norm": 0.07209419887980668, "kl": 0.0638427734375, "learning_rate": 4.1917098445595856e-07, "loss": 0.0007, "reward": 2.4999972581863403, "reward_std": 1.411313760968369e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 5.813471502590674, "grad_norm": 25.501845866337284, "kl": 0.1220703125, "learning_rate": 4.189119170984456e-07, "loss": 0.0005, "reward": 2.0199482440948486, "reward_std": 0.19396816765572567, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5199483633041382, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.816062176165803, "grad_norm": 0.6430320063363314, "kl": 0.07391357421875, "learning_rate": 4.186528497409326e-07, "loss": 0.0005, "reward": 2.4999845027923584, "reward_std": 5.088538159725431e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999845027923584, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 5.818652849740933, "grad_norm": 0.5210581394426964, "kl": 0.106201171875, "learning_rate": 4.1839378238341967e-07, "loss": -0.0005, "reward": 2.49999737739563, "reward_std": 1.0632743681071588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.821243523316062, "grad_norm": 0.4816873869851018, "kl": 0.144287109375, "learning_rate": 4.181347150259067e-07, "loss": 0.0014, "reward": 2.499993324279785, "reward_std": 3.965686005358293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.823834196891192, "grad_norm": 0.3129965153917525, "kl": 0.16015625, "learning_rate": 4.1787564766839377e-07, "loss": 0.0005, "reward": 2.499985694885254, "reward_std": 3.4691705650402582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985694885254, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.8264248704663215, "grad_norm": 0.10789318782576139, "kl": 0.07763671875, "learning_rate": 4.176165803108808e-07, "loss": 0.0009, "reward": 2.4999983310699463, "reward_std": 1.131927888309292e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.829015544041451, "grad_norm": 0.21243175156405217, "kl": 0.062255859375, "learning_rate": 4.173575129533679e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.3531694946777861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.831606217616581, "grad_norm": 0.18002468027140417, "kl": 0.084716796875, "learning_rate": 4.170984455958549e-07, "loss": 0.0015, "reward": 2.499990940093994, "reward_std": 2.5257869538108935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.83419689119171, "grad_norm": 1.3388184311853821, "kl": 0.096435546875, "learning_rate": 4.16839378238342e-07, "loss": 0.0015, "reward": 2.499991297721863, "reward_std": 1.028898668664624e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.83678756476684, "grad_norm": 1.1999952049082878, "kl": 0.0960693359375, "learning_rate": 4.1658031088082903e-07, "loss": -0.0, "reward": 2.499968409538269, "reward_std": 1.1583674222492846e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999682903289795, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.839378238341969, "grad_norm": 39.17763780741709, "kl": 0.125, "learning_rate": 4.1632124352331603e-07, "loss": 0.0009, "reward": 1.9999246001243591, "reward_std": 1.668057370807219e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999245703220367, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.841968911917099, "grad_norm": 0.468784929670208, "kl": 0.088134765625, "learning_rate": 4.160621761658031e-07, "loss": 0.0005, "reward": 2.499991536140442, "reward_std": 3.6417177966541203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.844559585492228, "grad_norm": 1.7126957627828425, "kl": 0.049072265625, "learning_rate": 4.1580310880829014e-07, "loss": -0.0001, "reward": 2.499963164329529, "reward_std": 7.067627279866429e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999632239341736, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.847150259067358, "grad_norm": 0.8308825496417839, "kl": 0.054931640625, "learning_rate": 4.155440414507772e-07, "loss": -0.0008, "reward": 2.49999463558197, "reward_std": 5.599392125077429e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.849740932642487, "grad_norm": 0.11325478470594708, "kl": 0.0869140625, "learning_rate": 4.1528497409326424e-07, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 1.4786303097480413e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.852331606217617, "grad_norm": 2.4059508683216317, "kl": 0.0400390625, "learning_rate": 4.150259067357513e-07, "loss": -0.0001, "reward": 2.49998939037323, "reward_std": 1.198946165459347e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 5.8549222797927465, "grad_norm": 64.81923064237134, "kl": 0.10791015625, "learning_rate": 4.147668393782383e-07, "loss": 0.0, "reward": 1.9406970143318176, "reward_std": 0.0027064363960107585, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4406971037387848, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 5.857512953367876, "grad_norm": 18.61717993551045, "kl": 0.077392578125, "learning_rate": 4.1450777202072535e-07, "loss": -0.0005, "reward": 2.4374488592147827, "reward_std": 0.176842640918494, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374489188194275, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.860103626943005, "grad_norm": 0.3839268155528407, "kl": 0.115966796875, "learning_rate": 4.1424870466321246e-07, "loss": 0.0012, "reward": 2.49999737739563, "reward_std": 1.5238573496390018e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.862694300518134, "grad_norm": 0.9451527751307454, "kl": 0.146484375, "learning_rate": 4.1398963730569945e-07, "loss": 0.0005, "reward": 2.499990463256836, "reward_std": 9.460845376452198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.865284974093264, "grad_norm": 0.2375805343856624, "kl": 0.093017578125, "learning_rate": 4.137305699481865e-07, "loss": 0.0012, "reward": 2.499996781349182, "reward_std": 1.9475403121305135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.867875647668393, "grad_norm": 2.259284429718665, "kl": 0.0482177734375, "learning_rate": 4.1347150259067356e-07, "loss": 0.0005, "reward": 2.499988555908203, "reward_std": 9.087720968636859e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.870466321243523, "grad_norm": 13.354644445230344, "kl": 0.197265625, "learning_rate": 4.132124352331606e-07, "loss": -0.0009, "reward": 2.4999935626983643, "reward_std": 5.236945298747742e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.873056994818652, "grad_norm": 0.29850912234813987, "kl": 0.078857421875, "learning_rate": 4.1295336787564767e-07, "loss": 0.0003, "reward": 2.4999918937683105, "reward_std": 3.7276245166140143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.875647668393782, "grad_norm": 0.22231653539422577, "kl": 0.08349609375, "learning_rate": 4.126943005181347e-07, "loss": -0.0007, "reward": 2.4999966621398926, "reward_std": 3.2675197871867567e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.8782383419689115, "grad_norm": 1.0605581151537387, "kl": 0.0435791015625, "learning_rate": 4.124352331606217e-07, "loss": -0.0008, "reward": 2.499995231628418, "reward_std": 6.7132351659893175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.880829015544041, "grad_norm": 0.8931260941604258, "kl": 0.10205078125, "learning_rate": 4.1217616580310877e-07, "loss": 0.0004, "reward": 2.499995708465576, "reward_std": 3.7294799994924688e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.883419689119171, "grad_norm": 1.9131796728206538, "kl": 0.0816650390625, "learning_rate": 4.119170984455959e-07, "loss": 0.0002, "reward": 2.4999866485595703, "reward_std": 5.00178100537596e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986708164215, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.8860103626943, "grad_norm": 0.3482697047420862, "kl": 0.097900390625, "learning_rate": 4.116580310880829e-07, "loss": -0.0013, "reward": 2.4999961853027344, "reward_std": 3.3883881087604095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.88860103626943, "grad_norm": 0.0913272102319602, "kl": 0.0654296875, "learning_rate": 4.1139896373056993e-07, "loss": -0.0002, "reward": 2.49999737739563, "reward_std": 1.5297123070467933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.891191709844559, "grad_norm": 10.6582253283067, "kl": 0.22900390625, "learning_rate": 4.11139896373057e-07, "loss": 0.0008, "reward": 2.4999969005584717, "reward_std": 3.437034706621489e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.893782383419689, "grad_norm": 7.713193642665512, "kl": 0.0203857421875, "learning_rate": 4.1088082901554403e-07, "loss": 0.0002, "reward": 2.499983787536621, "reward_std": 6.7634414335771e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999983787536621, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.896373056994818, "grad_norm": 0.06318554239502905, "kl": 0.09375, "learning_rate": 4.106217616580311e-07, "loss": -0.0007, "reward": 2.49999737739563, "reward_std": 1.5709822491771774e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 5.898963730569948, "grad_norm": 0.7832830448124017, "kl": 0.08428955078125, "learning_rate": 4.1036269430051814e-07, "loss": 0.0002, "reward": 2.499984622001648, "reward_std": 5.6210509455922875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984622001648, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.901554404145077, "grad_norm": 5.082952120669084, "kl": 0.1495361328125, "learning_rate": 4.1010362694300514e-07, "loss": 0.0016, "reward": 1.9785445928573608, "reward_std": 0.00015241097628404532, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4785443544387817, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.904145077720207, "grad_norm": 3.2575195808339767, "kl": 0.08544921875, "learning_rate": 4.098445595854922e-07, "loss": 0.0009, "reward": 2.499987006187439, "reward_std": 1.7935386040335288e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868273735046, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.9067357512953365, "grad_norm": 0.2516948122473572, "kl": 0.11328125, "learning_rate": 4.095854922279793e-07, "loss": 0.0002, "reward": 2.4999977350234985, "reward_std": 1.964046646207862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 5.909326424870466, "grad_norm": 4.390730158597342, "kl": 0.1083984375, "learning_rate": 4.093264248704663e-07, "loss": -0.0009, "reward": 2.499969244003296, "reward_std": 2.5009020191646414e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999694228172302, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.9119170984455955, "grad_norm": 0.0976951117432104, "kl": 0.04888916015625, "learning_rate": 4.0906735751295335e-07, "loss": 0.0003, "reward": 2.499997854232788, "reward_std": 1.458956290889546e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.914507772020725, "grad_norm": 3.9081106543482758, "kl": 0.102783203125, "learning_rate": 4.088082901554404e-07, "loss": 0.0015, "reward": 2.49997615814209, "reward_std": 1.367864120993545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999760389328003, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.917098445595855, "grad_norm": 0.15136605964031002, "kl": 0.08642578125, "learning_rate": 4.085492227979274e-07, "loss": -0.0001, "reward": 2.4999895095825195, "reward_std": 1.8938719108518853e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895691871643, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.919689119170984, "grad_norm": 0.38461717284815056, "kl": 0.080810546875, "learning_rate": 4.082901554404145e-07, "loss": 0.0003, "reward": 2.4999969005584717, "reward_std": 2.6105044526048005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.922279792746114, "grad_norm": 0.9744700946403966, "kl": 0.05615234375, "learning_rate": 4.0803108808290156e-07, "loss": 0.0011, "reward": 2.499990463256836, "reward_std": 6.561147756656283e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905228614807, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.924870466321243, "grad_norm": 10.03995885014358, "kl": 0.212890625, "learning_rate": 4.0777202072538856e-07, "loss": 0.0014, "reward": 2.2499719858169556, "reward_std": 0.26727062811175983, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.74997216463089, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.927461139896373, "grad_norm": 1.3152257354057781, "kl": 0.12646484375, "learning_rate": 4.075129533678756e-07, "loss": 0.0022, "reward": 2.499967098236084, "reward_std": 7.947738765778922e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999967098236084, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 5.930051813471502, "grad_norm": 2.213297686456236, "kl": 0.07470703125, "learning_rate": 4.072538860103627e-07, "loss": 0.0008, "reward": 2.4999608993530273, "reward_std": 6.7524352971304324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999608397483826, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.932642487046632, "grad_norm": 0.3785804313791765, "kl": 0.093505859375, "learning_rate": 4.069948186528497e-07, "loss": 0.0002, "reward": 2.4999969005584717, "reward_std": 2.443111895900074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.935233160621761, "grad_norm": 0.8576274897768291, "kl": 0.17724609375, "learning_rate": 4.0673575129533677e-07, "loss": 0.0006, "reward": 2.4999879598617554, "reward_std": 3.983369765592215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999880194664001, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.937823834196891, "grad_norm": 0.16636253356570307, "kl": 0.067626953125, "learning_rate": 4.064766839378238e-07, "loss": -0.0, "reward": 2.4999966621398926, "reward_std": 3.2034688501880737e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.9404145077720205, "grad_norm": 0.2857911595868281, "kl": 0.123046875, "learning_rate": 4.062176165803108e-07, "loss": 0.0006, "reward": 2.499984383583069, "reward_std": 3.349826101839426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.94300518134715, "grad_norm": 2.9405861084546485, "kl": 0.0599365234375, "learning_rate": 4.0595854922279793e-07, "loss": -0.0006, "reward": 1.791157841682434, "reward_std": 0.00025315592938568443, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2911579608917236, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.94559585492228, "grad_norm": 1.8151398737084097, "kl": 0.2303466796875, "learning_rate": 4.05699481865285e-07, "loss": 0.0024, "reward": 2.499994993209839, "reward_std": 3.7977763724938995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.948186528497409, "grad_norm": 0.2462071206252658, "kl": 0.163330078125, "learning_rate": 4.05440414507772e-07, "loss": 0.0011, "reward": 2.4999958276748657, "reward_std": 2.7629565693132463e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.950777202072539, "grad_norm": 0.265764427615182, "kl": 0.193359375, "learning_rate": 4.0518134715025903e-07, "loss": 0.0014, "reward": 2.4999979734420776, "reward_std": 1.1679443900902697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.953367875647668, "grad_norm": 3.182257190997831, "kl": 0.073974609375, "learning_rate": 4.049222797927461e-07, "loss": 0.0, "reward": 2.499982237815857, "reward_std": 1.1860133668051276e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.955958549222798, "grad_norm": 1.140015160951158, "kl": 0.0635986328125, "learning_rate": 4.0466321243523314e-07, "loss": -0.0009, "reward": 1.9999282360076904, "reward_std": 9.737196592141117e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999284446239471, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.958549222797927, "grad_norm": 4.912176966295773, "kl": 0.12744140625, "learning_rate": 4.044041450777202e-07, "loss": -0.0, "reward": 1.9980394840240479, "reward_std": 7.877700500102947e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498039573431015, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.961139896373057, "grad_norm": 1.4535986993672234, "kl": 0.142822265625, "learning_rate": 4.0414507772020724e-07, "loss": 0.0012, "reward": 1.9970449209213257, "reward_std": 2.6568860562292684e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970448017120361, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.963730569948186, "grad_norm": 0.9690604957954418, "kl": 0.02960205078125, "learning_rate": 4.0388601036269424e-07, "loss": 0.0004, "reward": 2.499995708465576, "reward_std": 5.5196461516970885e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.966321243523316, "grad_norm": 0.2586744370585928, "kl": 0.0372314453125, "learning_rate": 4.0362694300518135e-07, "loss": 0.0004, "reward": 2.499997854232788, "reward_std": 1.919745244549631e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.9689119170984455, "grad_norm": 3.213623711635333, "kl": 0.10986328125, "learning_rate": 4.033678756476684e-07, "loss": 0.0004, "reward": 2.4999728202819824, "reward_std": 1.3485929230228066e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999728798866272, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 5.971502590673575, "grad_norm": 0.14360237534431436, "kl": 0.087646484375, "learning_rate": 4.031088082901554e-07, "loss": -0.0007, "reward": 2.4999977350234985, "reward_std": 1.5765271541567927e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.974093264248705, "grad_norm": 0.32422052069237134, "kl": 0.083251953125, "learning_rate": 4.0284974093264246e-07, "loss": 0.0001, "reward": 2.499976634979248, "reward_std": 4.600564125212259e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976634979248, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 5.976683937823834, "grad_norm": 1.2502654641318598, "kl": 0.144287109375, "learning_rate": 4.025906735751295e-07, "loss": 0.0003, "reward": 0.9997566938400269, "reward_std": 1.2651558790821582e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.49975669384002686, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 5.979274611398964, "grad_norm": 2.36640930693435, "kl": 0.0880126953125, "learning_rate": 4.0233160621761656e-07, "loss": 0.0005, "reward": 1.992644727230072, "reward_std": 8.817717161946348e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4926447868347168, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.981865284974093, "grad_norm": 207.6614615362066, "kl": 0.1142578125, "learning_rate": 4.020725388601036e-07, "loss": 0.0007, "reward": 1.8027977347373962, "reward_std": 0.0013666564709637896, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3027976751327515, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.984455958549223, "grad_norm": 0.5044073531371609, "kl": 0.2529296875, "learning_rate": 4.0181347150259067e-07, "loss": 0.0016, "reward": 1.9999481439590454, "reward_std": 8.086421985353809e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999480247497559, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.987046632124352, "grad_norm": 1.722295266207358, "kl": 0.06842041015625, "learning_rate": 4.0155440414507767e-07, "loss": 0.0004, "reward": 1.9998886585235596, "reward_std": 1.4046738670003833e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998886287212372, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 5.989637305699482, "grad_norm": 48.613036036560196, "kl": 0.1282958984375, "learning_rate": 4.0129533678756477e-07, "loss": 0.0007, "reward": 1.9983825087547302, "reward_std": 0.0025517589630794646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983823895454407, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.992227979274611, "grad_norm": 0.07756872166757646, "kl": 0.05615234375, "learning_rate": 4.010362694300518e-07, "loss": 0.0008, "reward": 2.4999988079071045, "reward_std": 9.127613225246023e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 5.994818652849741, "grad_norm": 10.617348882922455, "kl": 0.130126953125, "learning_rate": 4.007772020725388e-07, "loss": 0.0008, "reward": 1.9964189529418945, "reward_std": 0.004077342422419861, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4964188933372498, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 5.9974093264248705, "grad_norm": 8.826440221321544, "kl": 0.140380859375, "learning_rate": 4.005181347150259e-07, "loss": 0.0003, "reward": 1.4999750852584839, "reward_std": 1.787094515748322e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999750852584839, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 6.0, "grad_norm": 618.0269634301646, "kl": 0.094482421875, "learning_rate": 4.0025906735751293e-07, "loss": 0.0003, "reward": 1.9447551369667053, "reward_std": 0.011802842628526378, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4447550773620605, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.0025906735751295, "grad_norm": 10.654795789281325, "kl": 0.134765625, "learning_rate": 4e-07, "loss": 0.0004, "reward": 1.9747610688209534, "reward_std": 0.0006670133816442103, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4747611284255981, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.005181347150259, "grad_norm": 1.0559056475824102, "kl": 0.057861328125, "learning_rate": 3.9974093264248703e-07, "loss": 0.0003, "reward": 1.9999327659606934, "reward_std": 1.10519104055129e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999327063560486, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.007772020725389, "grad_norm": 8.760096470912242, "kl": 0.20947265625, "learning_rate": 3.994818652849741e-07, "loss": 0.001, "reward": 2.3123667240142822, "reward_std": 0.25893301262658497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8123666644096375, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.010362694300518, "grad_norm": 0.3458068214964244, "kl": 0.0950927734375, "learning_rate": 3.992227979274611e-07, "loss": 0.0011, "reward": 2.4999972581863403, "reward_std": 1.749113096138899e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 6.012953367875648, "grad_norm": 0.4965606278819686, "kl": 0.0626220703125, "learning_rate": 3.9896373056994814e-07, "loss": -0.0003, "reward": 2.4999942779541016, "reward_std": 5.895196181882056e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.015544041450777, "grad_norm": 0.8356507835178411, "kl": 0.0273284912109375, "learning_rate": 3.9870466321243525e-07, "loss": 0.0001, "reward": 2.499987840652466, "reward_std": 7.145436939026695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987781047821, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.018134715025907, "grad_norm": 4.985023765354507, "kl": 0.07708740234375, "learning_rate": 3.9844559585492225e-07, "loss": -0.0004, "reward": 1.885767936706543, "reward_std": 0.00013869718259229558, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3857680559158325, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.020725388601036, "grad_norm": 15.54581644938938, "kl": 0.224365234375, "learning_rate": 3.981865284974093e-07, "loss": 0.0009, "reward": 1.8862226009368896, "reward_std": 0.0003212960599512371, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3862226009368896, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.023316062176166, "grad_norm": 0.10624159047556597, "kl": 0.1002197265625, "learning_rate": 3.9792746113989635e-07, "loss": 0.0008, "reward": 2.4999964237213135, "reward_std": 2.0552841419885226e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.025906735751295, "grad_norm": 3.5611348229942386, "kl": 0.092041015625, "learning_rate": 3.976683937823834e-07, "loss": -0.0001, "reward": 1.9867271184921265, "reward_std": 0.00010750261378689174, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4867271184921265, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.028497409326425, "grad_norm": 0.1755119947228922, "kl": 0.078857421875, "learning_rate": 3.9740932642487046e-07, "loss": 0.0003, "reward": 2.499992609024048, "reward_std": 2.1198463535654355e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.0310880829015545, "grad_norm": 4.3307376001626245, "kl": 0.05517578125, "learning_rate": 3.971502590673575e-07, "loss": -0.0004, "reward": 2.499932289123535, "reward_std": 2.8998786547163036e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999324679374695, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.033678756476684, "grad_norm": 5.295231778550412, "kl": 0.078369140625, "learning_rate": 3.968911917098445e-07, "loss": -0.0001, "reward": 2.499976634979248, "reward_std": 5.20184062224871e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976933002472, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.036269430051814, "grad_norm": 0.6789033840119268, "kl": 0.150390625, "learning_rate": 3.9663212435233156e-07, "loss": 0.0005, "reward": 2.499988079071045, "reward_std": 9.061225227924297e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.038860103626943, "grad_norm": 0.11302243359273849, "kl": 0.080078125, "learning_rate": 3.9637305699481867e-07, "loss": 0.0003, "reward": 2.4999940395355225, "reward_std": 2.437762191220827e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.041450777202073, "grad_norm": 9.723115227050537, "kl": 0.03411865234375, "learning_rate": 3.9611398963730567e-07, "loss": 0.0001, "reward": 2.062425136566162, "reward_std": 0.17679953361931666, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562425136566162, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.044041450777202, "grad_norm": 8.45268253530421, "kl": 0.0794677734375, "learning_rate": 3.958549222797927e-07, "loss": 0.001, "reward": 1.999755084514618, "reward_std": 5.181346568861045e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499755084514618, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.046632124352332, "grad_norm": 0.4397343116828608, "kl": 0.069580078125, "learning_rate": 3.9559585492227977e-07, "loss": -0.0008, "reward": 2.4999847412109375, "reward_std": 4.293986080483592e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999846816062927, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.049222797927461, "grad_norm": 0.1385904049681267, "kl": 0.03155517578125, "learning_rate": 3.9533678756476677e-07, "loss": -0.0003, "reward": 2.4999935626983643, "reward_std": 2.5664792246971047e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.051813471502591, "grad_norm": 0.10960552888821755, "kl": 0.120361328125, "learning_rate": 3.950777202072539e-07, "loss": 0.0018, "reward": 2.499997854232788, "reward_std": 1.786342750165204e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 6.05440414507772, "grad_norm": 4.950872204943883, "kl": 0.0396728515625, "learning_rate": 3.9481865284974093e-07, "loss": -0.0005, "reward": 2.437424659729004, "reward_std": 0.17678508458675424, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374247193336487, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.05699481865285, "grad_norm": 0.7291686825902829, "kl": 0.0693359375, "learning_rate": 3.9455958549222793e-07, "loss": -0.0002, "reward": 1.9998353719711304, "reward_std": 1.0741312280515558e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49983549118042, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.0595854922279795, "grad_norm": 0.36851781306139475, "kl": 0.068115234375, "learning_rate": 3.94300518134715e-07, "loss": -0.0, "reward": 2.4999977350234985, "reward_std": 1.7464092820773658e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.062176165803109, "grad_norm": 2.2244372405694235, "kl": 0.155029296875, "learning_rate": 3.940414507772021e-07, "loss": -0.0008, "reward": 2.4999849796295166, "reward_std": 8.654305815980479e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852180480957, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.064766839378239, "grad_norm": 1.2045213300003665, "kl": 0.081787109375, "learning_rate": 3.937823834196891e-07, "loss": -0.0003, "reward": 2.499994397163391, "reward_std": 3.8123297372294473e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 34.4375, "epoch": 6.067357512953368, "grad_norm": 0.46707770838607143, "kl": 0.0535888671875, "learning_rate": 3.9352331606217614e-07, "loss": 0.0011, "reward": 2.4999784231185913, "reward_std": 3.493684573641076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781847000122, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.069948186528498, "grad_norm": 0.35306793860459146, "kl": 0.126953125, "learning_rate": 3.932642487046632e-07, "loss": 0.0018, "reward": 2.4999961853027344, "reward_std": 2.3344006194747635e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.072538860103627, "grad_norm": 15.222216378318482, "kl": 0.4169921875, "learning_rate": 3.930051813471502e-07, "loss": 0.002, "reward": 1.9559217691421509, "reward_std": 0.00022551812344318023, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.455921620130539, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.075129533678757, "grad_norm": 15.289137682587517, "kl": 0.12353515625, "learning_rate": 3.927461139896373e-07, "loss": 0.0005, "reward": 2.124322831630707, "reward_std": 0.2316927479822084, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6243227124214172, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.077720207253886, "grad_norm": 1.9474240143237185, "kl": 0.15673828125, "learning_rate": 3.9248704663212435e-07, "loss": -0.0004, "reward": 2.4999842643737793, "reward_std": 9.061502169060986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999842643737793, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.080310880829016, "grad_norm": 6.800640322470947, "kl": 0.056640625, "learning_rate": 3.9222797927461135e-07, "loss": -0.0005, "reward": 2.499996542930603, "reward_std": 5.2583898764169135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.082901554404145, "grad_norm": 4.318061329351835, "kl": 0.0677490234375, "learning_rate": 3.919689119170984e-07, "loss": 0.0004, "reward": 2.4998890161514282, "reward_std": 3.4543336823844584e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998890161514282, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.085492227979275, "grad_norm": 0.8762614128269265, "kl": 0.096923828125, "learning_rate": 3.917098445595855e-07, "loss": 0.0008, "reward": 2.499986171722412, "reward_std": 6.506035333586624e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.0880829015544045, "grad_norm": 1.330985053445535, "kl": 0.11767578125, "learning_rate": 3.9145077720207256e-07, "loss": 0.0, "reward": 2.4999892711639404, "reward_std": 5.188443111592278e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999893307685852, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.090673575129534, "grad_norm": 0.511445006321932, "kl": 0.236328125, "learning_rate": 3.9119170984455956e-07, "loss": 0.0011, "reward": 2.499994993209839, "reward_std": 3.287688855380111e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.0932642487046635, "grad_norm": 0.24903680533846317, "kl": 0.09375, "learning_rate": 3.909326424870466e-07, "loss": 0.001, "reward": 2.499991297721863, "reward_std": 3.5320894085089094e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.095854922279793, "grad_norm": 0.23272059628696898, "kl": 0.075439453125, "learning_rate": 3.906735751295336e-07, "loss": 0.0006, "reward": 2.499997138977051, "reward_std": 1.9101866541859636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.098445595854923, "grad_norm": 3.9302737178453357, "kl": 0.1279296875, "learning_rate": 3.904145077720207e-07, "loss": 0.0012, "reward": 2.499966025352478, "reward_std": 1.6063233488239348e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999658465385437, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.101036269430052, "grad_norm": 0.8034716883525865, "kl": 0.08642578125, "learning_rate": 3.9015544041450777e-07, "loss": 0.0012, "reward": 2.499990701675415, "reward_std": 8.419689038419165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.103626943005182, "grad_norm": 1.135556782867008, "kl": 0.0830078125, "learning_rate": 3.898963730569948e-07, "loss": -0.0004, "reward": 2.4999920129776, "reward_std": 3.968956320932193e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.106217616580311, "grad_norm": 11.357492140788507, "kl": 0.165283203125, "learning_rate": 3.896373056994818e-07, "loss": 0.0016, "reward": 1.9946427941322327, "reward_std": 0.00024436906829805594, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4946427941322327, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 6.108808290155441, "grad_norm": 16.039766478649344, "kl": 0.0693359375, "learning_rate": 3.893782383419689e-07, "loss": 0.0001, "reward": 2.0578293800354004, "reward_std": 0.17866243754724564, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5578293204307556, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.11139896373057, "grad_norm": 0.9359369537647699, "kl": 0.23046875, "learning_rate": 3.89119170984456e-07, "loss": 0.0001, "reward": 2.4999769926071167, "reward_std": 1.186994541058084e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999769926071167, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.1139896373057, "grad_norm": 1.4206029868885972, "kl": 0.0797119140625, "learning_rate": 3.88860103626943e-07, "loss": 0.0002, "reward": 2.4999903440475464, "reward_std": 4.670081082736033e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.116580310880829, "grad_norm": 0.10276242879487937, "kl": 0.0360107421875, "learning_rate": 3.8860103626943004e-07, "loss": -0.0004, "reward": 2.499998688697815, "reward_std": 1.355124410906683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.119170984455959, "grad_norm": 1.6755711852549475, "kl": 0.10498046875, "learning_rate": 3.883419689119171e-07, "loss": 0.0001, "reward": 1.9995163083076477, "reward_std": 2.0942025912518147e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995163977146149, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.1217616580310885, "grad_norm": 0.17905306664353854, "kl": 0.0947265625, "learning_rate": 3.8808290155440414e-07, "loss": 0.0008, "reward": 2.499996542930603, "reward_std": 2.2216927391127683e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.124352331606218, "grad_norm": 2.366258023022168, "kl": 0.1270751953125, "learning_rate": 3.878238341968912e-07, "loss": 0.0009, "reward": 2.4999701976776123, "reward_std": 1.4471586666786607e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999701976776123, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.126943005181348, "grad_norm": 0.11137391593984437, "kl": 0.05615234375, "learning_rate": 3.8756476683937825e-07, "loss": 0.0009, "reward": 2.4999961853027344, "reward_std": 2.0130917732785747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.129533678756476, "grad_norm": 0.23382651128442436, "kl": 0.082275390625, "learning_rate": 3.8730569948186525e-07, "loss": 0.0006, "reward": 2.499998450279236, "reward_std": 1.8829093164640653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.132124352331606, "grad_norm": 0.43113087082188417, "kl": 0.118408203125, "learning_rate": 3.870466321243523e-07, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 4.101755166630028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.134715025906735, "grad_norm": 7.311207451974099, "kl": 0.158203125, "learning_rate": 3.867875647668394e-07, "loss": 0.0008, "reward": 1.8704302310943604, "reward_std": 0.0007156892678494842, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.370430201292038, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.137305699481865, "grad_norm": 4.355334700177657, "kl": 0.066009521484375, "learning_rate": 3.865284974093264e-07, "loss": 0.0001, "reward": 2.499995231628418, "reward_std": 3.986485751283908e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.139896373056994, "grad_norm": 2.7361862959037926, "kl": 0.2919921875, "learning_rate": 3.8626943005181346e-07, "loss": 0.0012, "reward": 2.499988555908203, "reward_std": 1.8688807813305175e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.142487046632124, "grad_norm": 0.3804187901716751, "kl": 0.0870361328125, "learning_rate": 3.860103626943005e-07, "loss": 0.0009, "reward": 2.4999749660491943, "reward_std": 4.024402869617916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999750256538391, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.1450777202072535, "grad_norm": 1.4386344215519704, "kl": 0.106689453125, "learning_rate": 3.8575129533678756e-07, "loss": 0.0, "reward": 2.4999932050704956, "reward_std": 5.657253211666102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.147668393782383, "grad_norm": 0.422438328427756, "kl": 0.0399169921875, "learning_rate": 3.854922279792746e-07, "loss": 0.0, "reward": 2.4999982118606567, "reward_std": 1.5368167396445642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.150259067357513, "grad_norm": 0.0672043231013455, "kl": 0.0595703125, "learning_rate": 3.8523316062176167e-07, "loss": -0.0001, "reward": 2.499997615814209, "reward_std": 1.1967109116994834e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.152849740932642, "grad_norm": 24.710676548637306, "kl": 0.078857421875, "learning_rate": 3.8497409326424867e-07, "loss": -0.0001, "reward": 2.12496554851532, "reward_std": 0.23146837626444494, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249657273292542, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.155440414507772, "grad_norm": 0.1475410465621916, "kl": 0.0604248046875, "learning_rate": 3.847150259067357e-07, "loss": -0.0003, "reward": 2.499997615814209, "reward_std": 1.4631035014645022e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.158031088082901, "grad_norm": 4.839287582198097, "kl": 0.0743408203125, "learning_rate": 3.844559585492228e-07, "loss": 0.0003, "reward": 1.9983254671096802, "reward_std": 8.763026744418312e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983254373073578, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.160621761658031, "grad_norm": 0.7249268727229805, "kl": 0.08544921875, "learning_rate": 3.841968911917098e-07, "loss": 0.001, "reward": 2.4999899864196777, "reward_std": 6.038415222064941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999899864196777, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.16321243523316, "grad_norm": 17.197988421710985, "kl": 0.0782470703125, "learning_rate": 3.839378238341969e-07, "loss": 0.0006, "reward": 2.1870521306991577, "reward_std": 0.2590118725811408, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6870522499084473, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.16580310880829, "grad_norm": 2.3385333135440547, "kl": 0.05126953125, "learning_rate": 3.8367875647668393e-07, "loss": 0.0006, "reward": 2.4999810457229614, "reward_std": 1.3160841263015755e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999809265136719, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.168393782383419, "grad_norm": 0.2032526934099008, "kl": 0.0830078125, "learning_rate": 3.8341968911917093e-07, "loss": 0.0006, "reward": 2.499993681907654, "reward_std": 2.9676965596081573e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.170984455958549, "grad_norm": 0.2744820953494261, "kl": 0.141357421875, "learning_rate": 3.8316062176165804e-07, "loss": 0.0009, "reward": 1.9998565316200256, "reward_std": 7.248964379868994e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998565316200256, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.1735751295336785, "grad_norm": 7.786644955397623, "kl": 0.098388671875, "learning_rate": 3.829015544041451e-07, "loss": 0.0013, "reward": 2.499973773956299, "reward_std": 2.4796958967954197e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999736547470093, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.176165803108808, "grad_norm": 0.3896390050694339, "kl": 0.05902099609375, "learning_rate": 3.826424870466321e-07, "loss": -0.0001, "reward": 2.499996066093445, "reward_std": 4.187430249658064e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.178756476683938, "grad_norm": 8.01139434095062, "kl": 0.1357421875, "learning_rate": 3.8238341968911914e-07, "loss": 0.0008, "reward": 2.4998979568481445, "reward_std": 6.750745450290196e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998978972434998, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.181347150259067, "grad_norm": 1.3089080578138994, "kl": 0.081298828125, "learning_rate": 3.8212435233160625e-07, "loss": 0.001, "reward": 2.499968647956848, "reward_std": 5.661656530264736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999687671661377, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.183937823834197, "grad_norm": 0.8460749804933171, "kl": 0.0560302734375, "learning_rate": 3.8186528497409325e-07, "loss": -0.0002, "reward": 2.499961256980896, "reward_std": 5.853392508470279e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999961495399475, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.186528497409326, "grad_norm": 413.435949378909, "kl": 28.94873046875, "learning_rate": 3.816062176165803e-07, "loss": 0.1159, "reward": 1.9697168469429016, "reward_std": 0.005213464294797632, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4697167873382568, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.189119170984456, "grad_norm": 1.7810366446013317, "kl": 0.05810546875, "learning_rate": 3.8134715025906735e-07, "loss": -0.0001, "reward": 1.9998552203178406, "reward_std": 1.3732143401057328e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998551905155182, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.191709844559585, "grad_norm": 0.23300915999837846, "kl": 0.17578125, "learning_rate": 3.8108808290155435e-07, "loss": -0.0002, "reward": 1.9999127388000488, "reward_std": 5.1483686434039555e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999129474163055, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.194300518134715, "grad_norm": 1.8944082449054434, "kl": 0.0712890625, "learning_rate": 3.8082901554404146e-07, "loss": 0.0001, "reward": 2.4999886751174927, "reward_std": 7.113919338053165e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.196891191709844, "grad_norm": 3.3598542923491004, "kl": 0.068603515625, "learning_rate": 3.805699481865285e-07, "loss": -0.0005, "reward": 1.995642900466919, "reward_std": 4.7626547484469484e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4956431090831757, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.199481865284974, "grad_norm": 4.920157378325246, "kl": 0.16455078125, "learning_rate": 3.803108808290155e-07, "loss": 0.0005, "reward": 1.9938519597053528, "reward_std": 6.516680218737747e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4938518404960632, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.2020725388601035, "grad_norm": 0.8087563064292018, "kl": 0.1531982421875, "learning_rate": 3.8005181347150256e-07, "loss": 0.0005, "reward": 2.4999749660491943, "reward_std": 5.67804681850248e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999749660491943, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 33.125, "epoch": 6.204663212435233, "grad_norm": 34.140869391999026, "kl": 0.07080078125, "learning_rate": 3.797927461139896e-07, "loss": 0.0007, "reward": 2.437418222427368, "reward_std": 0.17700132422399406, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374181032180786, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.2072538860103625, "grad_norm": 0.1946493762075473, "kl": 0.0528564453125, "learning_rate": 3.7953367875647667e-07, "loss": 0.0005, "reward": 2.499996066093445, "reward_std": 2.4461089083160914e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 6.209844559585492, "grad_norm": 15.771830647704563, "kl": 0.17236328125, "learning_rate": 3.792746113989637e-07, "loss": -0.0004, "reward": 2.4374654293060303, "reward_std": 0.17682881184941834, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374655485153198, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.212435233160622, "grad_norm": 0.06170751764246492, "kl": 0.114013671875, "learning_rate": 3.7901554404145077e-07, "loss": 0.0006, "reward": 2.499998927116394, "reward_std": 1.6137463489940274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.215025906735751, "grad_norm": 8.549851188408928, "kl": 0.0947265625, "learning_rate": 3.7875647668393777e-07, "loss": -0.0007, "reward": 2.4999760389328003, "reward_std": 1.4691032561131578e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976098537445, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.217616580310881, "grad_norm": 2.1660914798541238, "kl": 0.04266357421875, "learning_rate": 3.784974093264249e-07, "loss": 0.0011, "reward": 1.9037814736366272, "reward_std": 0.00017656411864663824, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4037814140319824, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 36.5625, "epoch": 6.22020725388601, "grad_norm": 0.6186428060767203, "kl": 0.222900390625, "learning_rate": 3.7823834196891193e-07, "loss": 0.0009, "reward": 2.499996066093445, "reward_std": 1.0282419395934994e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.22279792746114, "grad_norm": 4.970433436994633, "kl": 0.421875, "learning_rate": 3.7797927461139893e-07, "loss": 0.0016, "reward": 2.4999921321868896, "reward_std": 7.805063319210603e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.225388601036269, "grad_norm": 52.44956473278859, "kl": 0.208984375, "learning_rate": 3.77720207253886e-07, "loss": 0.0017, "reward": 1.9954881072044373, "reward_std": 0.0034883645831769172, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4954880475997925, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.227979274611399, "grad_norm": 0.4716565998080233, "kl": 0.1103515625, "learning_rate": 3.7746113989637304e-07, "loss": 0.0001, "reward": 2.499995470046997, "reward_std": 3.080721768355943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.230569948186528, "grad_norm": 0.3172445475524281, "kl": 0.112945556640625, "learning_rate": 3.772020725388601e-07, "loss": 0.0006, "reward": 2.4999889135360718, "reward_std": 3.829602064797655e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988853931427, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.233160621761658, "grad_norm": 0.2661678027079654, "kl": 0.0830078125, "learning_rate": 3.7694300518134714e-07, "loss": -0.0007, "reward": 2.4999966621398926, "reward_std": 2.497837044757034e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.2357512953367875, "grad_norm": 0.30059252457158375, "kl": 0.064208984375, "learning_rate": 3.766839378238342e-07, "loss": -0.0, "reward": 2.4999938011169434, "reward_std": 3.6139878147878335e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.238341968911917, "grad_norm": 0.16956674398434016, "kl": 0.0830078125, "learning_rate": 3.764248704663212e-07, "loss": 0.0016, "reward": 2.4999899864196777, "reward_std": 2.0162952409918944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.240932642487047, "grad_norm": 0.12519663363735328, "kl": 0.0648193359375, "learning_rate": 3.761658031088083e-07, "loss": 0.0009, "reward": 2.499998688697815, "reward_std": 1.0215590577900002e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.243523316062176, "grad_norm": 1.7237133933044158, "kl": 0.075439453125, "learning_rate": 3.7590673575129535e-07, "loss": -0.0001, "reward": 2.499948263168335, "reward_std": 1.0183679023612058e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999948263168335, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.246113989637306, "grad_norm": 1.961533630093995, "kl": 0.06494140625, "learning_rate": 3.7564766839378235e-07, "loss": -0.0004, "reward": 2.4999901056289673, "reward_std": 6.170386313897325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.248704663212435, "grad_norm": 0.360889730102968, "kl": 0.0986328125, "learning_rate": 3.753886010362694e-07, "loss": -0.0015, "reward": 2.499983072280884, "reward_std": 3.5920837717640097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999832510948181, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.251295336787565, "grad_norm": 10.450852956119045, "kl": 0.107666015625, "learning_rate": 3.7512953367875646e-07, "loss": 0.0017, "reward": 2.499994993209839, "reward_std": 2.394451257714536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.253886010362694, "grad_norm": 2.526678788309054, "kl": 0.1138916015625, "learning_rate": 3.748704663212435e-07, "loss": 0.0011, "reward": 1.9975848197937012, "reward_std": 4.312438233000648e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975847899913788, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.256476683937824, "grad_norm": 0.4066458451787601, "kl": 0.0849609375, "learning_rate": 3.7461139896373056e-07, "loss": 0.001, "reward": 2.49999463558197, "reward_std": 4.263770051693427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.259067357512953, "grad_norm": 1.3257820718628153, "kl": 0.0426025390625, "learning_rate": 3.743523316062176e-07, "loss": 0.0011, "reward": 2.499985933303833, "reward_std": 6.4517505506955786e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999858140945435, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.261658031088083, "grad_norm": 0.11876501143050537, "kl": 0.0478515625, "learning_rate": 3.740932642487046e-07, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 1.4190826505000587e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.2642487046632125, "grad_norm": 0.1581385295322136, "kl": 0.125, "learning_rate": 3.7383419689119167e-07, "loss": 0.0007, "reward": 2.4999938011169434, "reward_std": 1.5311854895116994e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.266839378238342, "grad_norm": 31.227476040820765, "kl": 0.1396484375, "learning_rate": 3.7357512953367877e-07, "loss": 0.0006, "reward": 2.0551145672798157, "reward_std": 0.17975615236883868, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5551146268844604, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.269430051813472, "grad_norm": 3.9667463256526245, "kl": 0.087890625, "learning_rate": 3.7331606217616577e-07, "loss": 0.0015, "reward": 2.4999780654907227, "reward_std": 1.2078240274604468e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977946281433, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.272020725388601, "grad_norm": 0.23347583948390563, "kl": 0.0792236328125, "learning_rate": 3.730569948186528e-07, "loss": 0.0002, "reward": 2.499984860420227, "reward_std": 2.497193008821341e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 6.274611398963731, "grad_norm": 120.61255256389367, "kl": 0.15771484375, "learning_rate": 3.727979274611399e-07, "loss": 0.0008, "reward": 1.2116374969482422, "reward_std": 0.035305225504998816, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7116374969482422, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.27720207253886, "grad_norm": 2.437600293856105, "kl": 0.111328125, "learning_rate": 3.7253886010362693e-07, "loss": 0.0007, "reward": 2.4999897480010986, "reward_std": 6.014159225742333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.27979274611399, "grad_norm": 0.4290821560071071, "kl": 0.0833740234375, "learning_rate": 3.72279792746114e-07, "loss": -0.0014, "reward": 2.4999876022338867, "reward_std": 5.146757075635833e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.282383419689119, "grad_norm": 0.1600028867259204, "kl": 0.09637451171875, "learning_rate": 3.7202072538860104e-07, "loss": 0.0004, "reward": 2.4999983310699463, "reward_std": 2.7850650781147124e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.284974093264249, "grad_norm": 22.826085820946783, "kl": 0.18798828125, "learning_rate": 3.7176165803108804e-07, "loss": 0.001, "reward": 1.9742602109909058, "reward_std": 0.00019707344745256705, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.474260151386261, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.287564766839378, "grad_norm": 0.22442671006552858, "kl": 0.134033203125, "learning_rate": 3.715025906735751e-07, "loss": 0.0001, "reward": 1.4999973773956299, "reward_std": 1.2898391332782921e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999974966049194, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.290155440414508, "grad_norm": 1.3144386377961415, "kl": 0.063720703125, "learning_rate": 3.712435233160622e-07, "loss": -0.0006, "reward": 2.499995470046997, "reward_std": 3.925233215795743e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.2927461139896375, "grad_norm": 0.7125709381731127, "kl": 0.18603515625, "learning_rate": 3.709844559585492e-07, "loss": 0.0019, "reward": 2.4999938011169434, "reward_std": 6.47274009679677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.295336787564767, "grad_norm": 2.1572894455514775, "kl": 0.11083984375, "learning_rate": 3.7072538860103625e-07, "loss": 0.0004, "reward": 2.4999969005584717, "reward_std": 2.5959349159165868e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.2979274611398965, "grad_norm": 0.33131636709164686, "kl": 0.133056640625, "learning_rate": 3.704663212435233e-07, "loss": 0.0011, "reward": 2.499998688697815, "reward_std": 1.119217273526374e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.300518134715026, "grad_norm": 0.31916247505055684, "kl": 0.05419921875, "learning_rate": 3.7020725388601035e-07, "loss": -0.0007, "reward": 2.4999982118606567, "reward_std": 1.874835731996427e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.303108808290156, "grad_norm": 0.7371616213570794, "kl": 0.14599609375, "learning_rate": 3.699481865284974e-07, "loss": 0.0012, "reward": 2.499993324279785, "reward_std": 3.091249539011187e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.305699481865285, "grad_norm": 7.84420019990756, "kl": 0.0811767578125, "learning_rate": 3.6968911917098446e-07, "loss": 0.0004, "reward": 2.499984860420227, "reward_std": 4.193630019244665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984860420227, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.308290155440415, "grad_norm": 10.086134480363164, "kl": 0.071044921875, "learning_rate": 3.6943005181347146e-07, "loss": -0.0005, "reward": 1.8229429721832275, "reward_std": 0.00032223474909187644, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3229431211948395, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.310880829015544, "grad_norm": 2.147529720812094, "kl": 0.15478515625, "learning_rate": 3.691709844559585e-07, "loss": 0.001, "reward": 2.4999783039093018, "reward_std": 9.526685516902944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999978244304657, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.313471502590674, "grad_norm": 0.3526405909633918, "kl": 0.0701904296875, "learning_rate": 3.689119170984456e-07, "loss": 0.0001, "reward": 2.4999940395355225, "reward_std": 4.587311536852212e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.316062176165803, "grad_norm": 2.506128111942887, "kl": 0.0531005859375, "learning_rate": 3.686528497409326e-07, "loss": -0.0001, "reward": 2.4999901056289673, "reward_std": 1.081242282907624e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.318652849740933, "grad_norm": 12.715547924923909, "kl": 0.25390625, "learning_rate": 3.6839378238341967e-07, "loss": 0.0012, "reward": 1.9230295419692993, "reward_std": 0.0002926438269241771, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4230294525623322, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.321243523316062, "grad_norm": 0.13738235770219653, "kl": 0.063232421875, "learning_rate": 3.681347150259067e-07, "loss": 0.0008, "reward": 2.4999979734420776, "reward_std": 1.6053691069828346e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.323834196891192, "grad_norm": 0.17818110736491416, "kl": 0.068115234375, "learning_rate": 3.678756476683937e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 1.9105841886357666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.3264248704663215, "grad_norm": 0.5909619139462969, "kl": 0.07861328125, "learning_rate": 3.676165803108808e-07, "loss": 0.0005, "reward": 2.4999940395355225, "reward_std": 3.80162748570001e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.329015544041451, "grad_norm": 0.8296793834573871, "kl": 0.0791015625, "learning_rate": 3.673575129533679e-07, "loss": 0.0006, "reward": 2.4999938011169434, "reward_std": 3.2873880400074995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.331606217616581, "grad_norm": 7.329533403850742, "kl": 0.0517578125, "learning_rate": 3.670984455958549e-07, "loss": -0.0005, "reward": 1.791582703590393, "reward_std": 0.0005982934924304573, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2915826737880707, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.33419689119171, "grad_norm": 2.633197294463325, "kl": 0.136962890625, "learning_rate": 3.6683937823834193e-07, "loss": 0.0015, "reward": 2.499969005584717, "reward_std": 1.4033648312761215e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999691247940063, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.33678756476684, "grad_norm": 0.08326390705019363, "kl": 0.02740478515625, "learning_rate": 3.6658031088082904e-07, "loss": 0.0003, "reward": 2.499998092651367, "reward_std": 1.101278087389801e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.339378238341969, "grad_norm": 15.838106354699958, "kl": 0.0924072265625, "learning_rate": 3.6632124352331604e-07, "loss": 0.0005, "reward": 1.9271252155303955, "reward_std": 0.0009331158395866623, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4271252751350403, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.341968911917099, "grad_norm": 23.853398046024505, "kl": 0.0771484375, "learning_rate": 3.660621761658031e-07, "loss": 0.0011, "reward": 2.187396287918091, "reward_std": 0.2588264785908905, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6873961091041565, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.344559585492228, "grad_norm": 8.159731537392343, "kl": 0.11962890625, "learning_rate": 3.6580310880829014e-07, "loss": 0.0007, "reward": 1.9991435408592224, "reward_std": 7.163561326706258e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991436004638672, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.347150259067358, "grad_norm": 0.4735492137615518, "kl": 0.0771484375, "learning_rate": 3.6554404145077714e-07, "loss": 0.0005, "reward": 2.4990309476852417, "reward_std": 1.1790068015216093e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990310668945312, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.349740932642487, "grad_norm": 1.0193662662405758, "kl": 0.1568603515625, "learning_rate": 3.6528497409326425e-07, "loss": 0.0011, "reward": 2.4999947547912598, "reward_std": 5.042333100391261e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.352331606217617, "grad_norm": 2.666664856836925, "kl": 0.1982421875, "learning_rate": 3.650259067357513e-07, "loss": 0.0013, "reward": 1.9991455078125, "reward_std": 3.205962684660335e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991456270217896, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.3549222797927465, "grad_norm": 0.03463561128019328, "kl": 0.11865234375, "learning_rate": 3.647668393782383e-07, "loss": -0.0006, "reward": 2.4999985694885254, "reward_std": 1.1470037861727178e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.357512953367876, "grad_norm": 0.46676428969552564, "kl": 0.032470703125, "learning_rate": 3.6450777202072535e-07, "loss": 0.0014, "reward": 2.4999947547912598, "reward_std": 3.23801043577987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.360103626943006, "grad_norm": 1.8227157687944535, "kl": 0.107177734375, "learning_rate": 3.642487046632124e-07, "loss": 0.0007, "reward": 2.4999886751174927, "reward_std": 1.1016374628525227e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.362694300518135, "grad_norm": 2.5570426477307215, "kl": 0.051513671875, "learning_rate": 3.6398963730569946e-07, "loss": -0.0002, "reward": 2.499990224838257, "reward_std": 7.5415306355353096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.365284974093265, "grad_norm": 0.18748823802567555, "kl": 0.146728515625, "learning_rate": 3.637305699481865e-07, "loss": 0.0, "reward": 2.4999970197677612, "reward_std": 3.3792812246247195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.367875647668393, "grad_norm": 0.6499323497772118, "kl": 0.08154296875, "learning_rate": 3.6347150259067356e-07, "loss": 0.0018, "reward": 2.499994993209839, "reward_std": 4.347981644059473e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.370466321243523, "grad_norm": 3.597028279956446, "kl": 0.093505859375, "learning_rate": 3.6321243523316056e-07, "loss": 0.001, "reward": 2.4999821186065674, "reward_std": 9.831991519604344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999820590019226, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.373056994818652, "grad_norm": 5.566670097994189, "kl": 0.204345703125, "learning_rate": 3.6295336787564767e-07, "loss": 0.0015, "reward": 1.979424774646759, "reward_std": 5.3479671919376415e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.479424774646759, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.375647668393782, "grad_norm": 10.283264284399745, "kl": 0.142822265625, "learning_rate": 3.626943005181347e-07, "loss": -0.0003, "reward": 1.8927792310714722, "reward_std": 0.0007888623107419335, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3927792310714722, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.3782383419689115, "grad_norm": 1.234001027261854, "kl": 0.124267578125, "learning_rate": 3.624352331606217e-07, "loss": -0.0004, "reward": 2.499982237815857, "reward_std": 4.899201940133935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.380829015544041, "grad_norm": 0.10500287054248686, "kl": 0.088623046875, "learning_rate": 3.6217616580310877e-07, "loss": 0.0004, "reward": 2.4999994039535522, "reward_std": 6.879108980228921e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999463558197, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.383419689119171, "grad_norm": 0.15258111595970084, "kl": 0.0953369140625, "learning_rate": 3.619170984455958e-07, "loss": 0.0003, "reward": 2.499996781349182, "reward_std": 2.691415545541531e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.3860103626943, "grad_norm": 1124.8147276016118, "kl": 0.15478515625, "learning_rate": 3.616580310880829e-07, "loss": 0.0006, "reward": 1.4555084705352783, "reward_std": 0.0015149621322052553, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9555085301399231, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.38860103626943, "grad_norm": 9.759358771214643, "kl": 0.146484375, "learning_rate": 3.6139896373056993e-07, "loss": 0.0005, "reward": 2.499912142753601, "reward_std": 3.632959021615534e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999912142753601, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.391191709844559, "grad_norm": 16.94473998607458, "kl": 0.115478515625, "learning_rate": 3.61139896373057e-07, "loss": 0.0013, "reward": 2.1247339248657227, "reward_std": 0.23157209117198363, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6247336268424988, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.393782383419689, "grad_norm": 2.187412014394854, "kl": 0.129364013671875, "learning_rate": 3.60880829015544e-07, "loss": 0.0011, "reward": 1.9939700365066528, "reward_std": 6.716111795412871e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4939699172973633, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.396373056994818, "grad_norm": 14.410662138216031, "kl": 0.17724609375, "learning_rate": 3.606217616580311e-07, "loss": 0.0005, "reward": 1.49062180519104, "reward_std": 0.00018573750276118517, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9906218647956848, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.398963730569948, "grad_norm": 5.023354767778658, "kl": 0.15966796875, "learning_rate": 3.6036269430051814e-07, "loss": 0.0007, "reward": 1.9221270084381104, "reward_std": 0.0004075120028232959, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4221270680427551, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.401554404145077, "grad_norm": 0.18416621253352136, "kl": 0.061767578125, "learning_rate": 3.6010362694300514e-07, "loss": -0.0003, "reward": 2.4999927282333374, "reward_std": 3.200827734417544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 6.404145077720207, "grad_norm": 0.24448431737808432, "kl": 0.16650390625, "learning_rate": 3.598445595854922e-07, "loss": -0.0002, "reward": 2.499997615814209, "reward_std": 1.3951118376098748e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.4067357512953365, "grad_norm": 0.4065492776650983, "kl": 0.0543212890625, "learning_rate": 3.5958549222797925e-07, "loss": 0.0008, "reward": 2.4999881982803345, "reward_std": 3.962604182561336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.409326424870466, "grad_norm": 0.5924591078475367, "kl": 0.08251953125, "learning_rate": 3.5932642487046635e-07, "loss": 0.0003, "reward": 2.499992609024048, "reward_std": 3.624367423071817e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.4119170984455955, "grad_norm": 0.17777448753462252, "kl": 0.0545654296875, "learning_rate": 3.5906735751295335e-07, "loss": 0.0002, "reward": 2.499995470046997, "reward_std": 2.0276543750696874e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.414507772020725, "grad_norm": 4.241399127895043, "kl": 0.102783203125, "learning_rate": 3.588082901554404e-07, "loss": 0.001, "reward": 1.9991992712020874, "reward_std": 2.7507052550390654e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991993308067322, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.417098445595855, "grad_norm": 5.043212407548967, "kl": 0.14697265625, "learning_rate": 3.585492227979274e-07, "loss": 0.001, "reward": 1.945802927017212, "reward_std": 0.00028971461244964303, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4458030462265015, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.419689119170984, "grad_norm": 0.10136268824768102, "kl": 0.052520751953125, "learning_rate": 3.5829015544041446e-07, "loss": 0.0006, "reward": 2.4999966621398926, "reward_std": 2.0284311403884203e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.422279792746114, "grad_norm": 0.9288212187305855, "kl": 0.08917236328125, "learning_rate": 3.5803108808290156e-07, "loss": 0.0007, "reward": 1.9948766827583313, "reward_std": 4.2529829471504854e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4948766231536865, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.424870466321243, "grad_norm": 0.8232738179695006, "kl": 0.13232421875, "learning_rate": 3.577720207253886e-07, "loss": -0.0001, "reward": 2.499993681907654, "reward_std": 5.1286891675772495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.427461139896373, "grad_norm": 0.2988021908278537, "kl": 0.08984375, "learning_rate": 3.575129533678756e-07, "loss": 0.0008, "reward": 2.4999938011169434, "reward_std": 3.1458180274057668e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.430051813471502, "grad_norm": 5.66954608155545, "kl": 0.07861328125, "learning_rate": 3.5725388601036267e-07, "loss": 0.0008, "reward": 2.4999682903289795, "reward_std": 1.5323442141834676e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99996817111969, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.432642487046632, "grad_norm": 1.209233448016897, "kl": 0.23095703125, "learning_rate": 3.569948186528498e-07, "loss": 0.0005, "reward": 2.4999887943267822, "reward_std": 4.82934797219059e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999886751174927, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.435233160621761, "grad_norm": 6.516685874216108, "kl": 0.1953125, "learning_rate": 3.5673575129533677e-07, "loss": 0.0012, "reward": 2.4999921321868896, "reward_std": 8.983871452983294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 6.437823834196891, "grad_norm": 5.117095363995409, "kl": 0.124267578125, "learning_rate": 3.564766839378238e-07, "loss": 0.0007, "reward": 1.9897173047065735, "reward_std": 0.00017291619121806434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.489717185497284, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.4404145077720205, "grad_norm": 0.21611763766553907, "kl": 0.1279296875, "learning_rate": 3.562176165803109e-07, "loss": -0.0002, "reward": 2.4999966621398926, "reward_std": 2.982501371207036e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.44300518134715, "grad_norm": 0.4154861495119402, "kl": 0.1083984375, "learning_rate": 3.559585492227979e-07, "loss": 0.0007, "reward": 2.4999972581863403, "reward_std": 2.285566210957768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.44559585492228, "grad_norm": 0.06999640727161804, "kl": 0.0211181640625, "learning_rate": 3.55699481865285e-07, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 1.401394683853141e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.448186528497409, "grad_norm": 7.025781840204633, "kl": 0.3515625, "learning_rate": 3.5544041450777204e-07, "loss": 0.002, "reward": 2.437487006187439, "reward_std": 0.17680287108987613, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374868869781494, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.450777202072539, "grad_norm": 0.30873417114579993, "kl": 0.17138671875, "learning_rate": 3.5518134715025904e-07, "loss": 0.0001, "reward": 2.499990701675415, "reward_std": 3.035928870076532e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.453367875647668, "grad_norm": 0.04727541108296201, "kl": 0.14892578125, "learning_rate": 3.549222797927461e-07, "loss": 0.0003, "reward": 2.4999985694885254, "reward_std": 1.1884452817412239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.455958549222798, "grad_norm": 0.7723420878791865, "kl": 0.1102294921875, "learning_rate": 3.546632124352332e-07, "loss": 0.0006, "reward": 1.9998581409454346, "reward_std": 1.1336409784235002e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998580515384674, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.458549222797927, "grad_norm": 0.3431969364012201, "kl": 0.03411865234375, "learning_rate": 3.544041450777202e-07, "loss": 0.0002, "reward": 2.499978542327881, "reward_std": 3.934985301157212e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999978482723236, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.461139896373057, "grad_norm": 8.362519966517624, "kl": 0.19232177734375, "learning_rate": 3.5414507772020725e-07, "loss": -0.0003, "reward": 1.8156248331069946, "reward_std": 0.0006790688453293114, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3156249821186066, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.463730569948186, "grad_norm": 1.0179848336918629, "kl": 0.076171875, "learning_rate": 3.538860103626943e-07, "loss": 0.0001, "reward": 2.499988079071045, "reward_std": 7.239053502416937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.466321243523316, "grad_norm": 0.10190849027749872, "kl": 0.107177734375, "learning_rate": 3.536269430051813e-07, "loss": -0.0, "reward": 2.4999970197677612, "reward_std": 1.4506032925964973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.4689119170984455, "grad_norm": 1.921338769853351, "kl": 0.126220703125, "learning_rate": 3.533678756476684e-07, "loss": 0.0014, "reward": 1.9989900588989258, "reward_std": 3.6439418408917845e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989900588989258, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.471502590673575, "grad_norm": 5.43500575648351, "kl": 0.0938720703125, "learning_rate": 3.5310880829015546e-07, "loss": 0.0008, "reward": 2.4999852180480957, "reward_std": 6.389600457623601e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999850392341614, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.474093264248705, "grad_norm": 0.456136647459687, "kl": 0.131591796875, "learning_rate": 3.5284974093264246e-07, "loss": 0.0014, "reward": 2.4999964237213135, "reward_std": 1.9534279260824405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.476683937823834, "grad_norm": 0.6653165844467438, "kl": 0.0982666015625, "learning_rate": 3.525906735751295e-07, "loss": -0.0005, "reward": 2.4999977350234985, "reward_std": 1.1656431979645276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.479274611398964, "grad_norm": 4.676079110721336, "kl": 0.173828125, "learning_rate": 3.5233160621761656e-07, "loss": 0.0006, "reward": 1.8215047717094421, "reward_std": 0.0002594917013993836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3215046525001526, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.481865284974093, "grad_norm": 0.1904071271254002, "kl": 0.06256103515625, "learning_rate": 3.520725388601036e-07, "loss": 0.0008, "reward": 2.499995231628418, "reward_std": 2.1115497474966105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.484455958549223, "grad_norm": 0.9582368079046227, "kl": 0.0849609375, "learning_rate": 3.5181347150259067e-07, "loss": 0.0011, "reward": 1.99985933303833, "reward_std": 7.085975425979996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998592138290405, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.487046632124352, "grad_norm": 1.4032192165095838, "kl": 0.1337890625, "learning_rate": 3.515544041450777e-07, "loss": 0.0014, "reward": 2.4999920129776, "reward_std": 7.278455768755521e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.489637305699482, "grad_norm": 5.436980265188576, "kl": 0.1051025390625, "learning_rate": 3.512953367875647e-07, "loss": 0.0, "reward": 1.9927936792373657, "reward_std": 0.00010940871715092726, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4927937984466553, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.492227979274611, "grad_norm": 6.830120845765552, "kl": 0.123291015625, "learning_rate": 3.510362694300518e-07, "loss": 0.0006, "reward": 2.4999560117721558, "reward_std": 2.4696539639990078e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999560117721558, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.494818652849741, "grad_norm": 0.6074762408356196, "kl": 0.04150390625, "learning_rate": 3.507772020725389e-07, "loss": -0.0003, "reward": 2.499990701675415, "reward_std": 3.3898651281560888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.4974093264248705, "grad_norm": 16.485288993691785, "kl": 0.3707275390625, "learning_rate": 3.505181347150259e-07, "loss": 0.0012, "reward": 1.7229715585708618, "reward_std": 0.0013687875010646167, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2229715287685394, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.5, "grad_norm": 1.2223625477946523, "kl": 0.134033203125, "learning_rate": 3.5025906735751293e-07, "loss": 0.0006, "reward": 2.499992847442627, "reward_std": 2.7371163184852776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.5025906735751295, "grad_norm": 1.1046335186564094, "kl": 0.1435546875, "learning_rate": 3.5e-07, "loss": -0.0003, "reward": 2.4999951124191284, "reward_std": 2.900464892263699e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.505181347150259, "grad_norm": 0.14623453489234367, "kl": 0.04571533203125, "learning_rate": 3.4974093264248704e-07, "loss": 0.0, "reward": 2.499994993209839, "reward_std": 3.6770769042959728e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.507772020725389, "grad_norm": 0.5481370736079011, "kl": 0.0771484375, "learning_rate": 3.494818652849741e-07, "loss": 0.0, "reward": 2.4999842643737793, "reward_std": 3.924493796603201e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999844431877136, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.510362694300518, "grad_norm": 101.93761773473722, "kl": 0.07391357421875, "learning_rate": 3.4922279792746114e-07, "loss": -0.0001, "reward": 1.999802052974701, "reward_std": 8.644338197427714e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499802052974701, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.512953367875648, "grad_norm": 2.545884411476245, "kl": 0.0694580078125, "learning_rate": 3.4896373056994814e-07, "loss": 0.0004, "reward": 1.9975576400756836, "reward_std": 5.358162525226362e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975575804710388, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.515544041450777, "grad_norm": 8.78644266455707, "kl": 0.091064453125, "learning_rate": 3.487046632124352e-07, "loss": 0.0002, "reward": 1.9999077320098877, "reward_std": 3.9093000793855026e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999077320098877, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.518134715025907, "grad_norm": 1.5901307709542127, "kl": 0.111328125, "learning_rate": 3.484455958549223e-07, "loss": 0.0007, "reward": 2.4999910593032837, "reward_std": 7.532677500421414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911189079285, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.520725388601036, "grad_norm": 0.8223064164311363, "kl": 0.114013671875, "learning_rate": 3.481865284974093e-07, "loss": 0.0007, "reward": 1.9999272227287292, "reward_std": 1.2687341040873434e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999272525310516, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.523316062176166, "grad_norm": 19.107232142727003, "kl": 0.13525390625, "learning_rate": 3.4792746113989635e-07, "loss": -0.0002, "reward": 2.1249073147773743, "reward_std": 0.23146861664690732, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249073147773743, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.525906735751295, "grad_norm": 1.2222527845680367, "kl": 0.05718994140625, "learning_rate": 3.476683937823834e-07, "loss": -0.0, "reward": 2.4999825954437256, "reward_std": 8.119004633044824e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998277425766, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.528497409326425, "grad_norm": 0.3239109550155984, "kl": 0.2890625, "learning_rate": 3.4740932642487046e-07, "loss": 0.0013, "reward": 2.499995470046997, "reward_std": 3.252491694638593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.5310880829015545, "grad_norm": 1.4294652002689472, "kl": 0.13427734375, "learning_rate": 3.471502590673575e-07, "loss": -0.0005, "reward": 2.4999959468841553, "reward_std": 2.497928875300204e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.533678756476684, "grad_norm": 0.8170950027030504, "kl": 0.113525390625, "learning_rate": 3.4689119170984456e-07, "loss": 0.0006, "reward": 2.4999778270721436, "reward_std": 7.645261462130293e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999775886535645, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.536269430051814, "grad_norm": 0.9580082169122023, "kl": 0.0557861328125, "learning_rate": 3.4663212435233156e-07, "loss": -0.0005, "reward": 2.499991297721863, "reward_std": 3.0062224141147453e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911189079285, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.538860103626943, "grad_norm": 0.08720168262521778, "kl": 0.040313720703125, "learning_rate": 3.463730569948186e-07, "loss": -0.0001, "reward": 2.4999953508377075, "reward_std": 1.8311926623937325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.541450777202073, "grad_norm": 0.7274433384858028, "kl": 0.12744140625, "learning_rate": 3.461139896373057e-07, "loss": 0.0001, "reward": 2.4999914169311523, "reward_std": 4.852705046687333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 6.544041450777202, "grad_norm": 1.5010685730062852, "kl": 0.094482421875, "learning_rate": 3.458549222797927e-07, "loss": 0.0013, "reward": 1.9996663331985474, "reward_std": 2.3781557274560328e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996660649776459, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.546632124352332, "grad_norm": 1.1055764122934053, "kl": 0.120849609375, "learning_rate": 3.455958549222798e-07, "loss": 0.0009, "reward": 2.499983072280884, "reward_std": 8.954774557423661e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.549222797927461, "grad_norm": 0.865542578841052, "kl": 0.09759521484375, "learning_rate": 3.4533678756476683e-07, "loss": 0.0009, "reward": 2.499947190284729, "reward_std": 1.0347350809070122e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999471306800842, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.551813471502591, "grad_norm": 0.977959619526184, "kl": 0.0465087890625, "learning_rate": 3.450777202072539e-07, "loss": -0.0007, "reward": 2.499993324279785, "reward_std": 3.94907112877263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.55440414507772, "grad_norm": 0.6678437443647307, "kl": 0.127197265625, "learning_rate": 3.4481865284974093e-07, "loss": -0.0004, "reward": 2.499995708465576, "reward_std": 2.1067400481911136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.55699481865285, "grad_norm": 3.6456110181342396, "kl": 0.064697265625, "learning_rate": 3.44559585492228e-07, "loss": 0.0003, "reward": 2.499939203262329, "reward_std": 2.6638364033715334e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999390244483948, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.5595854922279795, "grad_norm": 0.3527599401622835, "kl": 0.10107421875, "learning_rate": 3.44300518134715e-07, "loss": 0.0006, "reward": 2.4999788999557495, "reward_std": 3.5101359685540956e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999788999557495, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.562176165803109, "grad_norm": 6.090586393845502, "kl": 0.10400390625, "learning_rate": 3.4404145077720204e-07, "loss": 0.0001, "reward": 1.99794602394104, "reward_std": 8.972414013896923e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4979462921619415, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.564766839378239, "grad_norm": 1.3184491507538538, "kl": 0.13671875, "learning_rate": 3.4378238341968914e-07, "loss": 0.0004, "reward": 2.4999947547912598, "reward_std": 4.746955141854414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.567357512953368, "grad_norm": 0.5655989079483129, "kl": 0.0626220703125, "learning_rate": 3.4352331606217614e-07, "loss": 0.0007, "reward": 2.4999818801879883, "reward_std": 5.632232671359816e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821186065674, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.569948186528498, "grad_norm": 3.0175480439418374, "kl": 0.0521240234375, "learning_rate": 3.432642487046632e-07, "loss": -0.0008, "reward": 2.4996767044067383, "reward_std": 3.12128609039064e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996768236160278, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.572538860103627, "grad_norm": 0.9810304665101179, "kl": 0.123046875, "learning_rate": 3.4300518134715025e-07, "loss": 0.0019, "reward": 2.4999947547912598, "reward_std": 7.406094823636522e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945759773254, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.575129533678757, "grad_norm": 0.5111089931830906, "kl": 0.166015625, "learning_rate": 3.4274611398963725e-07, "loss": 0.0007, "reward": 2.499996781349182, "reward_std": 5.223699588441377e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.577720207253886, "grad_norm": 5.024948400004436, "kl": 0.0771484375, "learning_rate": 3.4248704663212435e-07, "loss": 0.0004, "reward": 2.4999821186065674, "reward_std": 2.0641108676500153e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999820590019226, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.580310880829016, "grad_norm": 0.5011336252554351, "kl": 0.0384521484375, "learning_rate": 3.422279792746114e-07, "loss": 0.001, "reward": 2.4999841451644897, "reward_std": 3.6930124451828306e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999841451644897, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.582901554404145, "grad_norm": 0.32258080489855623, "kl": 0.08056640625, "learning_rate": 3.419689119170984e-07, "loss": 0.0008, "reward": 2.4999977350234985, "reward_std": 2.5091250677178323e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 6.585492227979275, "grad_norm": 3.500541262290584, "kl": 0.08349609375, "learning_rate": 3.4170984455958546e-07, "loss": -0.0, "reward": 2.4999858140945435, "reward_std": 1.0171859941010553e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999985933303833, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.5880829015544045, "grad_norm": 0.025574555781819382, "kl": 0.0804443359375, "learning_rate": 3.4145077720207256e-07, "loss": 0.0, "reward": 2.4999988079071045, "reward_std": 8.707054064416297e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.590673575129534, "grad_norm": 8.658231050972505, "kl": 0.109375, "learning_rate": 3.4119170984455956e-07, "loss": -0.0003, "reward": 1.9913583397865295, "reward_std": 6.762993143638596e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4913585186004639, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.5932642487046635, "grad_norm": 0.4019373659612134, "kl": 0.1199951171875, "learning_rate": 3.409326424870466e-07, "loss": 0.0005, "reward": 2.4999972581863403, "reward_std": 1.6889496805561066e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.595854922279793, "grad_norm": 15.970817610139505, "kl": 0.14599609375, "learning_rate": 3.4067357512953367e-07, "loss": 0.0009, "reward": 1.7710894346237183, "reward_std": 0.004858827194368587, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2710894346237183, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.598445595854923, "grad_norm": 0.5030692796067407, "kl": 0.09765625, "learning_rate": 3.4041450777202067e-07, "loss": 0.0004, "reward": 2.499997138977051, "reward_std": 2.6677773803385207e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 34.25, "epoch": 6.601036269430052, "grad_norm": 0.2122647258348281, "kl": 0.13818359375, "learning_rate": 3.401554404145078e-07, "loss": 0.0013, "reward": 2.499997615814209, "reward_std": 2.4651604064729327e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.603626943005182, "grad_norm": 0.1707471944684384, "kl": 0.102783203125, "learning_rate": 3.3989637305699483e-07, "loss": 0.001, "reward": 2.499997854232788, "reward_std": 1.980917318178399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.606217616580311, "grad_norm": 16.78280135512644, "kl": 0.0711669921875, "learning_rate": 3.3963730569948183e-07, "loss": 0.0001, "reward": 1.9838308095932007, "reward_std": 0.0002846343312086219, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4838309288024902, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.608808290155441, "grad_norm": 0.6444172842181465, "kl": 0.177001953125, "learning_rate": 3.393782383419689e-07, "loss": 0.0011, "reward": 2.4999654293060303, "reward_std": 5.451105607789941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999653697013855, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.61139896373057, "grad_norm": 2.5282296933824373, "kl": 0.03839111328125, "learning_rate": 3.3911917098445593e-07, "loss": -0.0001, "reward": 2.4999762773513794, "reward_std": 2.1663740881194826e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999976396560669, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.6139896373057, "grad_norm": 2.4770855001431684, "kl": 0.1201171875, "learning_rate": 3.38860103626943e-07, "loss": 0.0011, "reward": 2.499989628791809, "reward_std": 9.335331469628727e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999896883964539, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.616580310880829, "grad_norm": 0.044213214230401206, "kl": 0.0555419921875, "learning_rate": 3.3860103626943004e-07, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 1.042365596504169e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.619170984455959, "grad_norm": 1.7987260942067969, "kl": 0.02813720703125, "learning_rate": 3.383419689119171e-07, "loss": 0.0016, "reward": 2.4999959468841553, "reward_std": 3.3024931553882197e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 6.6217616580310885, "grad_norm": 0.152239583443508, "kl": 0.096435546875, "learning_rate": 3.380829015544041e-07, "loss": 0.001, "reward": 1.499999761581421, "reward_std": 2.6272448394593084e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999997615814209, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.624352331606218, "grad_norm": 0.1273675226901063, "kl": 0.087158203125, "learning_rate": 3.378238341968912e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 1.720830056228806e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.626943005181348, "grad_norm": 0.043721086557300434, "kl": 0.06103515625, "learning_rate": 3.3756476683937825e-07, "loss": 0.0004, "reward": 2.499998927116394, "reward_std": 1.226148668820315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.629533678756477, "grad_norm": 0.2996362017119053, "kl": 0.133056640625, "learning_rate": 3.3730569948186525e-07, "loss": 0.0005, "reward": 2.499995470046997, "reward_std": 2.144779955415288e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.632124352331607, "grad_norm": 0.6158580946389499, "kl": 0.13427734375, "learning_rate": 3.370466321243523e-07, "loss": 0.0004, "reward": 2.499993324279785, "reward_std": 5.970400707155932e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.634715025906736, "grad_norm": 0.15388333980600602, "kl": 0.112060546875, "learning_rate": 3.3678756476683935e-07, "loss": -0.0009, "reward": 2.4999908208847046, "reward_std": 2.836057319655083e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.637305699481866, "grad_norm": 0.10990972930863668, "kl": 0.0772705078125, "learning_rate": 3.365284974093264e-07, "loss": 0.0007, "reward": 2.4999988079071045, "reward_std": 1.0265251830787747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.639896373056995, "grad_norm": 1.3542892525788086, "kl": 0.1151123046875, "learning_rate": 3.3626943005181346e-07, "loss": -0.0008, "reward": 2.499992847442627, "reward_std": 5.880863227503141e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.642487046632124, "grad_norm": 0.18121276195124655, "kl": 0.0982666015625, "learning_rate": 3.360103626943005e-07, "loss": -0.0009, "reward": 2.49999737739563, "reward_std": 3.0438080500516662e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.6450777202072535, "grad_norm": 2.7826533831249978, "kl": 0.1494140625, "learning_rate": 3.357512953367875e-07, "loss": 0.0015, "reward": 1.9944143295288086, "reward_std": 0.00010370887298449816, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4944142997264862, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.647668393782383, "grad_norm": 0.24931137862369113, "kl": 0.072998046875, "learning_rate": 3.354922279792746e-07, "loss": -0.0006, "reward": 2.499996066093445, "reward_std": 2.4226490040746285e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.650259067357513, "grad_norm": 0.08998975069236005, "kl": 0.126708984375, "learning_rate": 3.3523316062176167e-07, "loss": 0.0009, "reward": 2.499996304512024, "reward_std": 2.631036409184162e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.652849740932642, "grad_norm": 54.74399876562132, "kl": 0.16552734375, "learning_rate": 3.3497409326424867e-07, "loss": 0.0006, "reward": 1.4062859416007996, "reward_std": 0.0006608423718716949, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9062860012054443, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.655440414507772, "grad_norm": 5.888225080680928, "kl": 0.07958984375, "learning_rate": 3.347150259067357e-07, "loss": 0.0008, "reward": 1.997098982334137, "reward_std": 8.813368509663633e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4970990121364594, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.658031088082901, "grad_norm": 0.06937523364798394, "kl": 0.07666015625, "learning_rate": 3.344559585492228e-07, "loss": 0.0006, "reward": 2.4999964237213135, "reward_std": 2.1866613906240673e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.660621761658031, "grad_norm": 0.06299063375986161, "kl": 0.05572509765625, "learning_rate": 3.3419689119170983e-07, "loss": 0.0001, "reward": 2.4999990463256836, "reward_std": 9.253759998273381e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.66321243523316, "grad_norm": 0.3266392956997573, "kl": 0.077880859375, "learning_rate": 3.339378238341969e-07, "loss": 0.0007, "reward": 2.499995470046997, "reward_std": 4.508622623689007e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.66580310880829, "grad_norm": 1.0246611668257095, "kl": 0.05322265625, "learning_rate": 3.3367875647668393e-07, "loss": 0.0011, "reward": 2.4999932050704956, "reward_std": 4.374227955850074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.668393782383419, "grad_norm": 1.5635892045251365, "kl": 0.04510498046875, "learning_rate": 3.3341968911917093e-07, "loss": -0.0009, "reward": 2.4999852180480957, "reward_std": 8.368838507522014e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.670984455958549, "grad_norm": 0.12284749032219659, "kl": 0.060302734375, "learning_rate": 3.33160621761658e-07, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 8.914735758480674e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.6735751295336785, "grad_norm": 0.2167900567170482, "kl": 0.0321044921875, "learning_rate": 3.329015544041451e-07, "loss": 0.001, "reward": 2.499997138977051, "reward_std": 1.994280410144711e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.676165803108808, "grad_norm": 0.40959551545454664, "kl": 0.106201171875, "learning_rate": 3.326424870466321e-07, "loss": 0.0009, "reward": 2.4999942779541016, "reward_std": 4.334254072091426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.678756476683938, "grad_norm": 0.04152460592243156, "kl": 0.02154541015625, "learning_rate": 3.3238341968911914e-07, "loss": 0.001, "reward": 2.499999523162842, "reward_std": 6.770477511963691e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999995231628418, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.681347150259067, "grad_norm": 0.29413180286570545, "kl": 0.126220703125, "learning_rate": 3.321243523316062e-07, "loss": -0.0004, "reward": 2.4999905824661255, "reward_std": 3.9297768807955435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.683937823834197, "grad_norm": 0.8858348914551994, "kl": 0.11767578125, "learning_rate": 3.3186528497409325e-07, "loss": 0.0008, "reward": 2.499980092048645, "reward_std": 5.515488737728447e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999802112579346, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.686528497409326, "grad_norm": 0.30480880480466105, "kl": 0.065185546875, "learning_rate": 3.316062176165803e-07, "loss": 0.0008, "reward": 2.4999889135360718, "reward_std": 4.7448878035538655e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988615512848, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.689119170984456, "grad_norm": 0.14469637750250963, "kl": 0.0732421875, "learning_rate": 3.3134715025906735e-07, "loss": 0.0012, "reward": 2.499997854232788, "reward_std": 1.5480442812076944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.691709844559585, "grad_norm": 0.4863310397988113, "kl": 0.04205322265625, "learning_rate": 3.3108808290155435e-07, "loss": -0.0001, "reward": 2.499995708465576, "reward_std": 2.421361855908799e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.694300518134715, "grad_norm": 2.0665532768566486, "kl": 0.10693359375, "learning_rate": 3.308290155440414e-07, "loss": 0.0003, "reward": 2.4999419450759888, "reward_std": 1.3593727999250405e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999419450759888, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.696891191709844, "grad_norm": 0.22176447194368717, "kl": 0.0517578125, "learning_rate": 3.305699481865285e-07, "loss": 0.0002, "reward": 2.499997854232788, "reward_std": 1.0115137172306277e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.699481865284974, "grad_norm": 3.1241740446208515, "kl": 0.074951171875, "learning_rate": 3.303108808290155e-07, "loss": -0.0004, "reward": 2.499956727027893, "reward_std": 1.3763042602477071e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999567866325378, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.7020725388601035, "grad_norm": 46.915002143867554, "kl": 0.14404296875, "learning_rate": 3.3005181347150256e-07, "loss": 0.0006, "reward": 2.062428116798401, "reward_std": 0.4082653373479843, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.5624281167984009, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.704663212435233, "grad_norm": 27.51949064118649, "kl": 0.17138671875, "learning_rate": 3.297927461139896e-07, "loss": 0.0001, "reward": 2.4999958276748657, "reward_std": 2.1905597122895415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.7072538860103625, "grad_norm": 1.1729738096444962, "kl": 0.0908203125, "learning_rate": 3.295336787564767e-07, "loss": -0.0005, "reward": 2.499990940093994, "reward_std": 6.02474869992875e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.709844559585492, "grad_norm": 5.367834367783082, "kl": 0.14453125, "learning_rate": 3.292746113989637e-07, "loss": 0.0012, "reward": 1.958688735961914, "reward_std": 0.00023893316739531656, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4586885571479797, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.712435233160622, "grad_norm": 0.1611130610032333, "kl": 0.0506591796875, "learning_rate": 3.290155440414508e-07, "loss": 0.0012, "reward": 2.4999784231185913, "reward_std": 2.699215713164449e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999784231185913, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.715025906735751, "grad_norm": 2.9858569906842156, "kl": 0.11181640625, "learning_rate": 3.287564766839378e-07, "loss": -0.0001, "reward": 2.499988555908203, "reward_std": 6.575066322511702e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.717616580310881, "grad_norm": 0.5733795975332774, "kl": 0.06622314453125, "learning_rate": 3.2849740932642483e-07, "loss": 0.0014, "reward": 2.499981641769409, "reward_std": 4.8018978304753546e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999814629554749, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.72020725388601, "grad_norm": 4.523967176226092, "kl": 0.07470703125, "learning_rate": 3.2823834196891193e-07, "loss": -0.0003, "reward": 2.499959111213684, "reward_std": 2.8898726668558083e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999591708183289, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.72279792746114, "grad_norm": 0.2156022940407872, "kl": 0.103515625, "learning_rate": 3.27979274611399e-07, "loss": -0.0002, "reward": 2.499997138977051, "reward_std": 4.089306912646862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.725388601036269, "grad_norm": 5.832168770665035, "kl": 0.1201171875, "learning_rate": 3.27720207253886e-07, "loss": -0.0006, "reward": 2.49998939037323, "reward_std": 1.0515403801036882e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.727979274611399, "grad_norm": 2.4478152866866343, "kl": 0.0650634765625, "learning_rate": 3.2746113989637304e-07, "loss": 0.0007, "reward": 1.9986910820007324, "reward_std": 2.5327576963718457e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986909627914429, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.730569948186528, "grad_norm": 0.04532129039756897, "kl": 0.166015625, "learning_rate": 3.2720207253886004e-07, "loss": 0.0012, "reward": 2.4999974966049194, "reward_std": 2.7054037445850554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.733160621761658, "grad_norm": 10.973102361316224, "kl": 0.15673828125, "learning_rate": 3.2694300518134714e-07, "loss": 0.0007, "reward": 1.4764549732208252, "reward_std": 0.0002484492943040095, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9764549136161804, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.7357512953367875, "grad_norm": 0.2264396397533005, "kl": 0.1065673828125, "learning_rate": 3.266839378238342e-07, "loss": 0.001, "reward": 2.4999961853027344, "reward_std": 2.894307726819534e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.738341968911917, "grad_norm": 0.781237039712985, "kl": 0.1259765625, "learning_rate": 3.2642487046632125e-07, "loss": 0.0006, "reward": 1.9998623728752136, "reward_std": 1.5107755643839482e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998623132705688, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.740932642487047, "grad_norm": 34.845933669790405, "kl": 0.21875, "learning_rate": 3.2616580310880825e-07, "loss": 0.0008, "reward": 1.3091520071029663, "reward_std": 0.0005192816606722772, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8091520667076111, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 6.743523316062176, "grad_norm": 0.08520314699728021, "kl": 0.184814453125, "learning_rate": 3.2590673575129535e-07, "loss": 0.0013, "reward": 2.499998092651367, "reward_std": 1.6865800489540561e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.746113989637306, "grad_norm": 0.3741604703786764, "kl": 0.091552734375, "learning_rate": 3.256476683937824e-07, "loss": -0.0005, "reward": 2.4999966621398926, "reward_std": 2.834721101407922e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.748704663212435, "grad_norm": 0.1902905735375944, "kl": 0.0758056640625, "learning_rate": 3.253886010362694e-07, "loss": -0.0001, "reward": 2.499998688697815, "reward_std": 1.013394211213381e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.751295336787565, "grad_norm": 0.1304177983406284, "kl": 0.0758056640625, "learning_rate": 3.2512953367875646e-07, "loss": 0.0012, "reward": 2.499995470046997, "reward_std": 1.469007244736531e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.753886010362694, "grad_norm": 0.8783267999037194, "kl": 0.122802734375, "learning_rate": 3.248704663212435e-07, "loss": -0.0007, "reward": 2.4999829530715942, "reward_std": 4.364902679299121e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.756476683937824, "grad_norm": 2.90818863991794, "kl": 0.099365234375, "learning_rate": 3.2461139896373056e-07, "loss": 0.0008, "reward": 2.4999921321868896, "reward_std": 5.05780303683423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.759067357512953, "grad_norm": 0.20439311313186867, "kl": 0.116455078125, "learning_rate": 3.243523316062176e-07, "loss": 0.0005, "reward": 2.499997138977051, "reward_std": 2.5065187543305e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.761658031088083, "grad_norm": 0.4276889929631774, "kl": 0.0665283203125, "learning_rate": 3.2409326424870467e-07, "loss": 0.001, "reward": 2.499543309211731, "reward_std": 7.075920109400613e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999543309211731, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 35.75, "epoch": 6.7642487046632125, "grad_norm": 4.988801345871367, "kl": 0.05108642578125, "learning_rate": 3.2383419689119167e-07, "loss": -0.0002, "reward": 1.9499186277389526, "reward_std": 0.011643597628108182, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4499186873435974, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.766839378238342, "grad_norm": 1.8455174535086045, "kl": 0.103759765625, "learning_rate": 3.235751295336787e-07, "loss": -0.0012, "reward": 2.499996304512024, "reward_std": 2.8104585680921446e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.769430051813472, "grad_norm": 0.061923812131794265, "kl": 0.047698974609375, "learning_rate": 3.2331606217616583e-07, "loss": 0.0002, "reward": 2.4999982118606567, "reward_std": 1.0370044094543118e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.772020725388601, "grad_norm": 0.1100203967726687, "kl": 0.0789794921875, "learning_rate": 3.2305699481865283e-07, "loss": 0.0008, "reward": 2.4999974966049194, "reward_std": 2.333548877686553e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.774611398963731, "grad_norm": 68.5712464189081, "kl": 0.1494140625, "learning_rate": 3.227979274611399e-07, "loss": 0.0013, "reward": 2.499662399291992, "reward_std": 0.00015250110163833597, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996622204780579, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.77720207253886, "grad_norm": 0.611371759608612, "kl": 0.10888671875, "learning_rate": 3.2253886010362693e-07, "loss": -0.0004, "reward": 2.4999964237213135, "reward_std": 3.337859652674524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.77979274611399, "grad_norm": 0.2662854166121708, "kl": 0.047119140625, "learning_rate": 3.22279792746114e-07, "loss": 0.0, "reward": 2.4999959468841553, "reward_std": 2.4996460865622794e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.782383419689119, "grad_norm": 0.2118951697335687, "kl": 0.097412109375, "learning_rate": 3.2202072538860104e-07, "loss": 0.0021, "reward": 2.4999985694885254, "reward_std": 1.8294264236828894e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.784974093264249, "grad_norm": 4.618919710401948, "kl": 0.117919921875, "learning_rate": 3.217616580310881e-07, "loss": 0.0002, "reward": 1.8100414276123047, "reward_std": 0.0006752505371991901, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3100415766239166, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.787564766839378, "grad_norm": 0.652593847585454, "kl": 0.0992431640625, "learning_rate": 3.215025906735751e-07, "loss": 0.0005, "reward": 2.4999969005584717, "reward_std": 4.2823880903597455e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.790155440414508, "grad_norm": 0.4895500447608739, "kl": 0.088134765625, "learning_rate": 3.2124352331606214e-07, "loss": 0.0006, "reward": 2.499996781349182, "reward_std": 1.3708473858287107e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.7927461139896375, "grad_norm": 0.2610555544379071, "kl": 0.177734375, "learning_rate": 3.2098445595854925e-07, "loss": 0.001, "reward": 2.4999972581863403, "reward_std": 3.101004210748215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.795336787564767, "grad_norm": 2.183783771907094, "kl": 0.08935546875, "learning_rate": 3.2072538860103625e-07, "loss": -0.0004, "reward": 2.4999841451644897, "reward_std": 8.437664376970133e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.7979274611398965, "grad_norm": 0.8010101354810155, "kl": 0.06585693359375, "learning_rate": 3.204663212435233e-07, "loss": -0.0003, "reward": 2.4999754428863525, "reward_std": 7.238565785883111e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999753832817078, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.800518134715026, "grad_norm": 0.32472030281790437, "kl": 0.084716796875, "learning_rate": 3.2020725388601035e-07, "loss": -0.0001, "reward": 2.499993324279785, "reward_std": 4.868169071414741e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.803108808290156, "grad_norm": 0.1472680932200785, "kl": 0.17333984375, "learning_rate": 3.199481865284974e-07, "loss": 0.0001, "reward": 2.499996542930603, "reward_std": 2.135551767423749e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.805699481865285, "grad_norm": 115.40396773481149, "kl": 0.1104736328125, "learning_rate": 3.1968911917098446e-07, "loss": 0.0012, "reward": 1.9436487555503845, "reward_std": 0.09334890798777451, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4436488151550293, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.808290155440415, "grad_norm": 0.2383380515966591, "kl": 0.066162109375, "learning_rate": 3.194300518134715e-07, "loss": -0.0004, "reward": 2.499975085258484, "reward_std": 3.6756970303031267e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999752044677734, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.810880829015544, "grad_norm": 0.9329502207555056, "kl": 0.0625, "learning_rate": 3.191709844559585e-07, "loss": -0.0005, "reward": 2.499995470046997, "reward_std": 5.685469091076811e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.813471502590674, "grad_norm": 0.3690295133439186, "kl": 0.08209228515625, "learning_rate": 3.1891191709844556e-07, "loss": 0.0015, "reward": 2.499992609024048, "reward_std": 4.3634497615130385e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.816062176165803, "grad_norm": 0.5946824987419771, "kl": 0.24755859375, "learning_rate": 3.1865284974093267e-07, "loss": 0.0014, "reward": 2.4999654293060303, "reward_std": 6.409323987099924e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999654293060303, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.818652849740933, "grad_norm": 0.731038373227442, "kl": 0.116943359375, "learning_rate": 3.1839378238341967e-07, "loss": 0.0002, "reward": 2.4999780654907227, "reward_std": 7.960647053550929e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999781847000122, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.821243523316062, "grad_norm": 25.188331497738304, "kl": 0.098876953125, "learning_rate": 3.181347150259067e-07, "loss": 0.001, "reward": 2.3124375343322754, "reward_std": 0.2588223617028689, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124375343322754, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.823834196891192, "grad_norm": 0.18043945675785886, "kl": 0.116455078125, "learning_rate": 3.178756476683938e-07, "loss": -0.0004, "reward": 2.4999990463256836, "reward_std": 7.49549172951447e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.8264248704663215, "grad_norm": 1.022727259914892, "kl": 0.11572265625, "learning_rate": 3.176165803108808e-07, "loss": 0.0006, "reward": 2.4999947547912598, "reward_std": 3.2792924002933432e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.829015544041451, "grad_norm": 0.11858756300010036, "kl": 0.0614013671875, "learning_rate": 3.173575129533679e-07, "loss": -0.0, "reward": 2.499998092651367, "reward_std": 1.5976047791355086e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.831606217616581, "grad_norm": 0.8406515143486687, "kl": 0.0477294921875, "learning_rate": 3.1709844559585493e-07, "loss": 0.0004, "reward": 1.999927043914795, "reward_std": 9.489662829764711e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999271035194397, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.83419689119171, "grad_norm": 9.551721300417965, "kl": 0.3466796875, "learning_rate": 3.1683937823834193e-07, "loss": 0.0023, "reward": 1.9975675344467163, "reward_std": 0.00047034373449150735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975674152374268, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.83678756476684, "grad_norm": 0.08414711455513105, "kl": 0.02960205078125, "learning_rate": 3.16580310880829e-07, "loss": -0.0, "reward": 2.4999982118606567, "reward_std": 1.4587311625291477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 6.839378238341969, "grad_norm": 0.042768534414627316, "kl": 0.072265625, "learning_rate": 3.163212435233161e-07, "loss": -0.0003, "reward": 2.4999992847442627, "reward_std": 1.0190064756443462e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.841968911917099, "grad_norm": 0.05676457056419735, "kl": 0.07806396484375, "learning_rate": 3.160621761658031e-07, "loss": -0.0007, "reward": 2.499994158744812, "reward_std": 1.5781853335283813e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.844559585492228, "grad_norm": 0.129515423086182, "kl": 0.0552978515625, "learning_rate": 3.1580310880829014e-07, "loss": 0.0015, "reward": 2.49999737739563, "reward_std": 1.4731791964095464e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.847150259067358, "grad_norm": 0.5711241261433098, "kl": 0.11572265625, "learning_rate": 3.155440414507772e-07, "loss": 0.0005, "reward": 2.4999923706054688, "reward_std": 3.3822458647136955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999923706054688, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.849740932642487, "grad_norm": 33.511374255696, "kl": 0.16064453125, "learning_rate": 3.152849740932642e-07, "loss": 0.0006, "reward": 2.3124029636383057, "reward_std": 0.25889289929045844, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8124029636383057, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.852331606217617, "grad_norm": 0.783777532982119, "kl": 0.0760498046875, "learning_rate": 3.150259067357513e-07, "loss": 0.001, "reward": 2.499992609024048, "reward_std": 4.621174866770161e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.8549222797927465, "grad_norm": 0.10734646715621524, "kl": 0.13671875, "learning_rate": 3.1476683937823835e-07, "loss": 0.0017, "reward": 2.49999737739563, "reward_std": 1.8049130403596791e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.857512953367876, "grad_norm": 0.5221572295646971, "kl": 0.081298828125, "learning_rate": 3.1450777202072535e-07, "loss": 0.0002, "reward": 2.4999958276748657, "reward_std": 3.1969009341992205e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.860103626943005, "grad_norm": 3.254478770384534, "kl": 0.092041015625, "learning_rate": 3.142487046632124e-07, "loss": 0.0011, "reward": 2.4999899864196777, "reward_std": 7.808321470292867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999898672103882, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.862694300518134, "grad_norm": 0.2786716513463218, "kl": 0.06884765625, "learning_rate": 3.139896373056995e-07, "loss": -0.0005, "reward": 2.4999934434890747, "reward_std": 2.3536198909823725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.865284974093264, "grad_norm": 0.6035286577444154, "kl": 0.0621337890625, "learning_rate": 3.137305699481865e-07, "loss": 0.0008, "reward": 2.4999942779541016, "reward_std": 4.760535887271544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.867875647668393, "grad_norm": 0.353326948384391, "kl": 0.07366943359375, "learning_rate": 3.1347150259067356e-07, "loss": 0.0022, "reward": 2.499995231628418, "reward_std": 2.312569620244176e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.870466321243523, "grad_norm": 0.050495615388355466, "kl": 0.07275390625, "learning_rate": 3.132124352331606e-07, "loss": -0.001, "reward": 2.4999985694885254, "reward_std": 9.676915055933932e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.873056994818652, "grad_norm": 16.279411889064498, "kl": 0.10498046875, "learning_rate": 3.129533678756476e-07, "loss": 0.0013, "reward": 2.499994993209839, "reward_std": 4.3705274492822355e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.875647668393782, "grad_norm": 1.349154768096174, "kl": 0.0841064453125, "learning_rate": 3.126943005181347e-07, "loss": 0.001, "reward": 2.499995470046997, "reward_std": 5.87809245189419e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.8782383419689115, "grad_norm": 0.24559773068145477, "kl": 0.13720703125, "learning_rate": 3.124352331606218e-07, "loss": 0.0005, "reward": 2.4999899864196777, "reward_std": 2.947581776879815e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989926815033, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.880829015544041, "grad_norm": 1.1636827874950308, "kl": 0.111083984375, "learning_rate": 3.121761658031088e-07, "loss": -0.0007, "reward": 2.4999771118164062, "reward_std": 7.452856266354502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.883419689119171, "grad_norm": 0.3537371367868099, "kl": 0.080810546875, "learning_rate": 3.1191709844559583e-07, "loss": 0.0002, "reward": 2.4999964237213135, "reward_std": 2.1272351773404807e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.8860103626943, "grad_norm": 2.461943748115084, "kl": 0.08837890625, "learning_rate": 3.116580310880829e-07, "loss": 0.0014, "reward": 1.9984063506126404, "reward_std": 2.0161690827080747e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984063506126404, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.88860103626943, "grad_norm": 7.638852823792572, "kl": 0.11181640625, "learning_rate": 3.1139896373056993e-07, "loss": 0.0003, "reward": 1.999787151813507, "reward_std": 1.5002544600406509e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997871816158295, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.891191709844559, "grad_norm": 20.81141839824204, "kl": 0.123046875, "learning_rate": 3.11139896373057e-07, "loss": -0.0005, "reward": 2.499915838241577, "reward_std": 1.6071899835878867e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999159574508667, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.893782383419689, "grad_norm": 0.025103119686933323, "kl": 0.147705078125, "learning_rate": 3.1088082901554404e-07, "loss": 0.0012, "reward": 2.4999990463256836, "reward_std": 1.04565532410561e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.896373056994818, "grad_norm": 16.731027577564163, "kl": 0.066314697265625, "learning_rate": 3.1062176165803104e-07, "loss": -0.0007, "reward": 1.9989325404167175, "reward_std": 0.00012794942733762582, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498932808637619, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.898963730569948, "grad_norm": 0.12302329768003824, "kl": 0.090576171875, "learning_rate": 3.1036269430051814e-07, "loss": 0.0008, "reward": 2.49999737739563, "reward_std": 2.3493118419537495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.901554404145077, "grad_norm": 9.644437676921989, "kl": 0.140380859375, "learning_rate": 3.101036269430052e-07, "loss": -0.0003, "reward": 1.9942524433135986, "reward_std": 0.00011496849128889153, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4942525029182434, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.904145077720207, "grad_norm": 3.85087150639492, "kl": 0.1103515625, "learning_rate": 3.098445595854922e-07, "loss": 0.0004, "reward": 2.4999680519104004, "reward_std": 8.630609954707325e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99996817111969, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.9067357512953365, "grad_norm": 0.17982488842231498, "kl": 0.136474609375, "learning_rate": 3.0958549222797925e-07, "loss": 0.0004, "reward": 2.499998092651367, "reward_std": 1.4715184306623996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.909326424870466, "grad_norm": 3.184643650047152, "kl": 0.16064453125, "learning_rate": 3.093264248704663e-07, "loss": 0.0015, "reward": 1.9993248581886292, "reward_std": 2.929641897253532e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993248283863068, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.9119170984455955, "grad_norm": 3.749083443359404, "kl": 0.17041015625, "learning_rate": 3.0906735751295335e-07, "loss": 0.0012, "reward": 1.9960463047027588, "reward_std": 6.586990798496117e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4960463047027588, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.914507772020725, "grad_norm": 28.25249429623852, "kl": 0.182373046875, "learning_rate": 3.088082901554404e-07, "loss": 0.0005, "reward": 1.8841991424560547, "reward_std": 0.0017215177658727043, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3841991424560547, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 6.917098445595855, "grad_norm": 0.1671544557216127, "kl": 0.109130859375, "learning_rate": 3.0854922279792746e-07, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 1.5015943972684909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.919689119170984, "grad_norm": 0.6707549083372544, "kl": 0.055419921875, "learning_rate": 3.0829015544041446e-07, "loss": -0.0002, "reward": 2.4999911785125732, "reward_std": 4.955255462846253e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.922279792746114, "grad_norm": 0.14356887574468113, "kl": 0.02410888671875, "learning_rate": 3.080310880829015e-07, "loss": 0.0001, "reward": 2.499998092651367, "reward_std": 1.5409833906687709e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.924870466321243, "grad_norm": 0.20441933320945194, "kl": 0.0369873046875, "learning_rate": 3.077720207253886e-07, "loss": -0.0004, "reward": 2.49999737739563, "reward_std": 2.084319817186042e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 6.927461139896373, "grad_norm": 0.6683067182430974, "kl": 0.041259765625, "learning_rate": 3.075129533678756e-07, "loss": 0.0003, "reward": 2.4999892711639404, "reward_std": 6.496505193354096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.930051813471502, "grad_norm": 0.7210554501054945, "kl": 0.091064453125, "learning_rate": 3.0725388601036267e-07, "loss": -0.0009, "reward": 2.4999945163726807, "reward_std": 3.858439754367282e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.932642487046632, "grad_norm": 0.17552386108248752, "kl": 0.079071044921875, "learning_rate": 3.069948186528497e-07, "loss": 0.0005, "reward": 2.4999982118606567, "reward_std": 1.9856996118505776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.935233160621761, "grad_norm": 0.41605490710069404, "kl": 0.0755615234375, "learning_rate": 3.067357512953368e-07, "loss": 0.001, "reward": 2.4999942779541016, "reward_std": 3.8180478441063315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.937823834196891, "grad_norm": 0.06875053222427291, "kl": 0.083740234375, "learning_rate": 3.0647668393782383e-07, "loss": 0.0002, "reward": 2.499997854232788, "reward_std": 1.1512554465298308e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.9404145077720205, "grad_norm": 0.6915553577948074, "kl": 0.0787353515625, "learning_rate": 3.062176165803109e-07, "loss": -0.0005, "reward": 2.4999877214431763, "reward_std": 5.598813686447102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.94300518134715, "grad_norm": 0.025111769794718897, "kl": 0.1043701171875, "learning_rate": 3.059585492227979e-07, "loss": 0.001, "reward": 2.4999988079071045, "reward_std": 6.237595471247914e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.94559585492228, "grad_norm": 0.17851908374925143, "kl": 0.0947265625, "learning_rate": 3.0569948186528493e-07, "loss": 0.0005, "reward": 2.499997854232788, "reward_std": 2.293884051596251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.948186528497409, "grad_norm": 19.291777146679912, "kl": 0.0992431640625, "learning_rate": 3.0544041450777204e-07, "loss": 0.0001, "reward": 2.49980092048645, "reward_std": 6.13935790170217e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998009204864502, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.950777202072539, "grad_norm": 0.10200411389718572, "kl": 0.26318359375, "learning_rate": 3.0518134715025904e-07, "loss": 0.0008, "reward": 2.4999979734420776, "reward_std": 1.4975389603932854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.953367875647668, "grad_norm": 0.2851126572805129, "kl": 0.02642822265625, "learning_rate": 3.049222797927461e-07, "loss": 0.0007, "reward": 2.499997138977051, "reward_std": 2.274980147376482e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 6.955958549222798, "grad_norm": 154.70547890032248, "kl": 0.17724609375, "learning_rate": 3.0466321243523314e-07, "loss": 0.0003, "reward": 1.9351143836975098, "reward_std": 0.03371609011310284, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4351144433021545, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.958549222797927, "grad_norm": 0.3745528835900668, "kl": 0.120849609375, "learning_rate": 3.044041450777202e-07, "loss": 0.0011, "reward": 2.49999463558197, "reward_std": 2.4771332505224564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.961139896373057, "grad_norm": 0.14974097947057258, "kl": 0.153564453125, "learning_rate": 3.0414507772020725e-07, "loss": 0.0, "reward": 2.499995708465576, "reward_std": 2.8539324148368905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.963730569948186, "grad_norm": 0.049723135099217594, "kl": 0.0645751953125, "learning_rate": 3.038860103626943e-07, "loss": -0.0006, "reward": 2.499998092651367, "reward_std": 1.7072521814043284e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 6.966321243523316, "grad_norm": 327.7835062572855, "kl": 0.106689453125, "learning_rate": 3.036269430051813e-07, "loss": 0.0003, "reward": 2.1122639179229736, "reward_std": 0.2393150636217456, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6122637391090393, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.9689119170984455, "grad_norm": 4.090484424047936, "kl": 0.0791015625, "learning_rate": 3.0336787564766835e-07, "loss": 0.0005, "reward": 2.4999462366104126, "reward_std": 2.6773177523864433e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999462366104126, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 6.971502590673575, "grad_norm": 6.65203856546593, "kl": 0.098876953125, "learning_rate": 3.0310880829015546e-07, "loss": 0.0009, "reward": 1.9991925358772278, "reward_std": 4.446475134045613e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991923868656158, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.974093264248705, "grad_norm": 9.91009941138378, "kl": 0.060791015625, "learning_rate": 3.0284974093264246e-07, "loss": 0.0009, "reward": 1.9985364079475403, "reward_std": 9.513014697404287e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985363483428955, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 6.976683937823834, "grad_norm": 0.8474915033213317, "kl": 0.094970703125, "learning_rate": 3.025906735751295e-07, "loss": 0.0005, "reward": 2.4999932050704956, "reward_std": 7.302572157641407e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.979274611398964, "grad_norm": 0.6968910664154169, "kl": 0.0989990234375, "learning_rate": 3.0233160621761657e-07, "loss": -0.0002, "reward": 2.499959707260132, "reward_std": 7.119760539353592e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999597668647766, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 6.981865284974093, "grad_norm": 59.2766435770167, "kl": 0.126953125, "learning_rate": 3.0207253886010356e-07, "loss": 0.0003, "reward": 1.9794762134552002, "reward_std": 0.0003094483907943868, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4794762134552002, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 6.984455958549223, "grad_norm": 14.079648594312262, "kl": 0.091796875, "learning_rate": 3.0181347150259067e-07, "loss": 0.001, "reward": 1.9983248710632324, "reward_std": 4.997884479962522e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4983248114585876, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.987046632124352, "grad_norm": 15.917772612262366, "kl": 0.06982421875, "learning_rate": 3.015544041450777e-07, "loss": 0.0009, "reward": 2.2499858140945435, "reward_std": 0.2672700790443514, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499855756759644, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.989637305699482, "grad_norm": 0.6641314836799243, "kl": 0.184814453125, "learning_rate": 3.012953367875647e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 4.912595159112243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 6.992227979274611, "grad_norm": 130.1776848173821, "kl": 0.153076171875, "learning_rate": 3.010362694300518e-07, "loss": 0.0009, "reward": 1.8121325969696045, "reward_std": 0.0006616853706873371, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3121325969696045, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.994818652849741, "grad_norm": 15.337846712055693, "kl": 0.0677490234375, "learning_rate": 3.007772020725389e-07, "loss": 0.0005, "reward": 2.4999635219573975, "reward_std": 9.49482387113676e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999635219573975, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 6.9974093264248705, "grad_norm": 0.5169906681313127, "kl": 0.138916015625, "learning_rate": 3.005181347150259e-07, "loss": 0.0013, "reward": 2.499998092651367, "reward_std": 1.2903511219519714e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.0, "grad_norm": 34.47839291440497, "kl": 0.13916015625, "learning_rate": 3.0025906735751293e-07, "loss": 0.0, "reward": 1.9861319661140442, "reward_std": 0.0010465293445349744, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4861319661140442, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.0025906735751295, "grad_norm": 1.277131039149812, "kl": 0.0606689453125, "learning_rate": 3e-07, "loss": 0.0005, "reward": 2.499986171722412, "reward_std": 8.472314448226825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986171722412, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.005181347150259, "grad_norm": 1.2340353345381787, "kl": 0.3319091796875, "learning_rate": 2.99740932642487e-07, "loss": 0.0, "reward": 2.499998092651367, "reward_std": 1.7651769894655445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.007772020725389, "grad_norm": 4.877864512255514, "kl": 0.0762939453125, "learning_rate": 2.994818652849741e-07, "loss": 0.0009, "reward": 1.992798089981079, "reward_std": 7.922134147975157e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4927980303764343, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.010362694300518, "grad_norm": 0.27964349637474945, "kl": 0.07421875, "learning_rate": 2.9922279792746114e-07, "loss": 0.0006, "reward": 2.49999737739563, "reward_std": 2.3559888404633966e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.012953367875648, "grad_norm": 0.5124919284345575, "kl": 0.0625, "learning_rate": 2.9896373056994814e-07, "loss": 0.0005, "reward": 2.499992609024048, "reward_std": 3.1695838629275386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924302101135, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.015544041450777, "grad_norm": 3.7156472708440442, "kl": 0.17138671875, "learning_rate": 2.987046632124352e-07, "loss": 0.0009, "reward": 1.9946449995040894, "reward_std": 5.9174433090447565e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4946449398994446, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.018134715025907, "grad_norm": 0.08315801026027068, "kl": 0.043182373046875, "learning_rate": 2.9844559585492225e-07, "loss": 0.0003, "reward": 2.4999983310699463, "reward_std": 1.1314137111639866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.020725388601036, "grad_norm": 0.32078519601146804, "kl": 0.068359375, "learning_rate": 2.981865284974093e-07, "loss": -0.0005, "reward": 2.499997615814209, "reward_std": 1.475461999689287e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.023316062176166, "grad_norm": 0.19433534024513685, "kl": 0.0665283203125, "learning_rate": 2.9792746113989635e-07, "loss": 0.0003, "reward": 2.499998688697815, "reward_std": 1.1276677156502046e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.025906735751295, "grad_norm": 0.4825264724252194, "kl": 0.104248046875, "learning_rate": 2.976683937823834e-07, "loss": -0.0003, "reward": 2.4999959468841553, "reward_std": 3.408521195069625e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.028497409326425, "grad_norm": 0.06405915603598082, "kl": 0.10205078125, "learning_rate": 2.974093264248704e-07, "loss": -0.0005, "reward": 2.49999737739563, "reward_std": 1.5113402014321764e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.0310880829015545, "grad_norm": 5.577678789698276, "kl": 0.0933837890625, "learning_rate": 2.971502590673575e-07, "loss": 0.0002, "reward": 2.4999349117279053, "reward_std": 3.971588563445039e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999350905418396, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.033678756476684, "grad_norm": 0.11001574450722675, "kl": 0.1019287109375, "learning_rate": 2.9689119170984457e-07, "loss": 0.0007, "reward": 2.4999990463256836, "reward_std": 1.1054847561808856e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.036269430051814, "grad_norm": 71.70600192367583, "kl": 0.1875, "learning_rate": 2.966321243523316e-07, "loss": 0.001, "reward": 1.967581868171692, "reward_std": 0.0009742489457948977, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.467581868171692, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.038860103626943, "grad_norm": 0.20403869466989022, "kl": 0.097412109375, "learning_rate": 2.963730569948186e-07, "loss": -0.0, "reward": 2.499983787536621, "reward_std": 5.1294148306624265e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999836683273315, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.041450777202073, "grad_norm": 1.217487505050398, "kl": 0.077880859375, "learning_rate": 2.9611398963730567e-07, "loss": 0.0016, "reward": 2.4999985694885254, "reward_std": 1.4931237615201098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.044041450777202, "grad_norm": 0.0693020862081694, "kl": 0.084930419921875, "learning_rate": 2.958549222797928e-07, "loss": 0.0014, "reward": 2.4999982118606567, "reward_std": 1.022311977294521e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.046632124352332, "grad_norm": 13.674053274344969, "kl": 0.1298828125, "learning_rate": 2.955958549222798e-07, "loss": 0.0007, "reward": 2.4374618530273438, "reward_std": 0.1768199337666374, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374619126319885, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.049222797927461, "grad_norm": 0.4625309994534999, "kl": 0.136474609375, "learning_rate": 2.9533678756476683e-07, "loss": 0.0014, "reward": 2.4999799728393555, "reward_std": 3.261184474467882e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999799132347107, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.051813471502591, "grad_norm": 0.1372337190525816, "kl": 0.07861328125, "learning_rate": 2.950777202072539e-07, "loss": 0.0, "reward": 2.4999979734420776, "reward_std": 1.3631250226353586e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.05440414507772, "grad_norm": 25.18686999134842, "kl": 0.2718505859375, "learning_rate": 2.9481865284974093e-07, "loss": 0.0018, "reward": 2.437284469604492, "reward_std": 0.1773731542743917, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9372843503952026, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.05699481865285, "grad_norm": 0.3556508216997095, "kl": 0.0194091796875, "learning_rate": 2.94559585492228e-07, "loss": 0.0011, "reward": 2.499992609024048, "reward_std": 2.989542565501324e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.0595854922279795, "grad_norm": 0.4025261057990681, "kl": 0.111328125, "learning_rate": 2.9430051813471504e-07, "loss": 0.0007, "reward": 2.4999940395355225, "reward_std": 2.723148497807415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.062176165803109, "grad_norm": 0.31274000025680493, "kl": 0.11669921875, "learning_rate": 2.9404145077720204e-07, "loss": 0.0016, "reward": 2.499998092651367, "reward_std": 1.863392071754788e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.064766839378239, "grad_norm": 0.4397637648749812, "kl": 0.0701904296875, "learning_rate": 2.937823834196891e-07, "loss": 0.0003, "reward": 2.4999921321868896, "reward_std": 4.154978114456753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.067357512953368, "grad_norm": 19.276821566765786, "kl": 0.31298828125, "learning_rate": 2.935233160621762e-07, "loss": 0.0015, "reward": 1.9954688549041748, "reward_std": 0.006783579270944529, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4954689741134644, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.069948186528498, "grad_norm": 3.6106080650429635, "kl": 0.12939453125, "learning_rate": 2.932642487046632e-07, "loss": 0.0002, "reward": 1.4974743127822876, "reward_std": 8.686909495736472e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9974744319915771, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.072538860103627, "grad_norm": 0.8302622536573409, "kl": 0.095458984375, "learning_rate": 2.9300518134715025e-07, "loss": 0.0015, "reward": 1.9991748332977295, "reward_std": 1.608480630466147e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499174565076828, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.075129533678757, "grad_norm": 0.0748136620697045, "kl": 0.133544921875, "learning_rate": 2.927461139896373e-07, "loss": -0.0001, "reward": 2.499996304512024, "reward_std": 1.7537465737405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.077720207253886, "grad_norm": 3.9856052055470923, "kl": 0.0654296875, "learning_rate": 2.924870466321243e-07, "loss": 0.0001, "reward": 1.999738335609436, "reward_std": 2.8957919056438186e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499738484621048, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.080310880829016, "grad_norm": 1.9355060975773766, "kl": 0.087646484375, "learning_rate": 2.922279792746114e-07, "loss": 0.0008, "reward": 2.49998140335083, "reward_std": 8.472973490825098e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998140335083, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.082901554404145, "grad_norm": 0.5181264329567411, "kl": 0.09130859375, "learning_rate": 2.9196891191709846e-07, "loss": 0.0009, "reward": 2.4999959468841553, "reward_std": 3.9163684846243996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.085492227979275, "grad_norm": 0.07651231618356177, "kl": 0.109619140625, "learning_rate": 2.9170984455958546e-07, "loss": 0.0028, "reward": 2.4999983310699463, "reward_std": 1.3678148320650507e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.0880829015544045, "grad_norm": 0.4407890458861952, "kl": 0.04296875, "learning_rate": 2.914507772020725e-07, "loss": 0.0, "reward": 2.4999935626983643, "reward_std": 3.190141228515131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.090673575129534, "grad_norm": 0.04223191398393086, "kl": 0.08203125, "learning_rate": 2.911917098445596e-07, "loss": -0.0009, "reward": 2.4999979734420776, "reward_std": 1.1574470590858255e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.0932642487046635, "grad_norm": 0.5977924135306688, "kl": 0.1138916015625, "learning_rate": 2.909326424870466e-07, "loss": 0.0002, "reward": 2.4999958276748657, "reward_std": 2.531654786253057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.095854922279793, "grad_norm": 1.1187856816444965, "kl": 0.087615966796875, "learning_rate": 2.9067357512953367e-07, "loss": -0.0004, "reward": 2.4999934434890747, "reward_std": 5.441550456453115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.098445595854923, "grad_norm": 0.181339794909064, "kl": 0.07666015625, "learning_rate": 2.904145077720207e-07, "loss": 0.0009, "reward": 2.499997138977051, "reward_std": 2.8207938385094167e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.101036269430052, "grad_norm": 1.8952011768885928, "kl": 0.098388671875, "learning_rate": 2.901554404145077e-07, "loss": 0.0014, "reward": 1.9879651069641113, "reward_std": 7.876610010271179e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.487965077161789, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.103626943005182, "grad_norm": 0.13989088415350082, "kl": 0.0423583984375, "learning_rate": 2.8989637305699483e-07, "loss": 0.0, "reward": 2.4999983310699463, "reward_std": 1.060065471847338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.106217616580311, "grad_norm": 0.0847527845110646, "kl": 0.094879150390625, "learning_rate": 2.896373056994819e-07, "loss": 0.0008, "reward": 2.499994158744812, "reward_std": 1.615656515241426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.108808290155441, "grad_norm": 1.0443295628308287, "kl": 0.0889892578125, "learning_rate": 2.893782383419689e-07, "loss": 0.0015, "reward": 2.4999905824661255, "reward_std": 5.229881651303003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999905824661255, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.11139896373057, "grad_norm": 0.19017654597433994, "kl": 0.0601806640625, "learning_rate": 2.8911917098445593e-07, "loss": -0.0001, "reward": 2.4999947547912598, "reward_std": 3.2905971920627053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.1139896373057, "grad_norm": 7.252029356117923, "kl": 0.3916015625, "learning_rate": 2.8886010362694304e-07, "loss": 0.0014, "reward": 0.9966006278991699, "reward_std": 4.635082223103382e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.4966006875038147, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.116580310880829, "grad_norm": 1.0658280584179984, "kl": 0.12939453125, "learning_rate": 2.8860103626943004e-07, "loss": 0.0005, "reward": 2.499993920326233, "reward_std": 7.56164649828861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.119170984455959, "grad_norm": 0.06060123665867208, "kl": 0.0599365234375, "learning_rate": 2.883419689119171e-07, "loss": -0.0001, "reward": 2.4999985694885254, "reward_std": 1.1397911805488548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.1217616580310885, "grad_norm": 1.7808468213637838, "kl": 0.0894775390625, "learning_rate": 2.8808290155440414e-07, "loss": -0.0002, "reward": 2.4999860525131226, "reward_std": 1.2167330396550824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.124352331606218, "grad_norm": 0.2592779569913539, "kl": 0.02996826171875, "learning_rate": 2.8782383419689114e-07, "loss": 0.0006, "reward": 2.4999935626983643, "reward_std": 3.3797180662986648e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.126943005181348, "grad_norm": 0.9574018356140368, "kl": 0.13037109375, "learning_rate": 2.8756476683937825e-07, "loss": 0.0012, "reward": 1.9998550415039062, "reward_std": 8.921896551328246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998550713062286, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.129533678756476, "grad_norm": 0.8898146314958704, "kl": 0.0654296875, "learning_rate": 2.873056994818653e-07, "loss": -0.0006, "reward": 2.4999972581863403, "reward_std": 2.577872464826214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.132124352331606, "grad_norm": 0.35463335869827056, "kl": 0.0654296875, "learning_rate": 2.870466321243523e-07, "loss": 0.0001, "reward": 2.499986171722412, "reward_std": 2.66814964788864e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.134715025906735, "grad_norm": 5.288680850824816, "kl": 0.0601806640625, "learning_rate": 2.8678756476683936e-07, "loss": 0.0012, "reward": 2.4999794960021973, "reward_std": 9.578609251548187e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999979555606842, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.137305699481865, "grad_norm": 0.07022497342829802, "kl": 0.07275390625, "learning_rate": 2.865284974093264e-07, "loss": 0.0009, "reward": 2.4999988079071045, "reward_std": 1.129997087900847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.139896373056994, "grad_norm": 1.3765696080231349, "kl": 0.080322265625, "learning_rate": 2.8626943005181346e-07, "loss": 0.0009, "reward": 2.499979019165039, "reward_std": 7.035282578726765e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999790787696838, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.142487046632124, "grad_norm": 2.29781823152281, "kl": 0.12158203125, "learning_rate": 2.860103626943005e-07, "loss": 0.0011, "reward": 2.499990940093994, "reward_std": 8.408339567722578e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.1450777202072535, "grad_norm": 0.20674407442124407, "kl": 0.0584716796875, "learning_rate": 2.8575129533678757e-07, "loss": 0.0002, "reward": 2.4999942779541016, "reward_std": 3.489466394057672e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.147668393782383, "grad_norm": 0.7257799207809277, "kl": 0.11083984375, "learning_rate": 2.8549222797927457e-07, "loss": 0.0015, "reward": 2.4999923706054688, "reward_std": 3.956153989292943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.150259067357513, "grad_norm": 29.63300940171338, "kl": 0.177978515625, "learning_rate": 2.8523316062176167e-07, "loss": 0.0016, "reward": 1.8276203870773315, "reward_std": 0.0012136940254094952, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3276202380657196, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.152849740932642, "grad_norm": 0.19096093063476446, "kl": 0.04608154296875, "learning_rate": 2.849740932642487e-07, "loss": 0.0003, "reward": 2.4999955892562866, "reward_std": 2.302110999607976e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.155440414507772, "grad_norm": 0.5649633649540509, "kl": 0.20733642578125, "learning_rate": 2.847150259067357e-07, "loss": 0.0014, "reward": 2.499995708465576, "reward_std": 2.163669307719829e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.158031088082901, "grad_norm": 0.22806776661027514, "kl": 0.085205078125, "learning_rate": 2.844559585492228e-07, "loss": 0.0016, "reward": 2.4999929666519165, "reward_std": 2.473032054695068e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.160621761658031, "grad_norm": 0.09213882372241625, "kl": 0.0931396484375, "learning_rate": 2.8419689119170983e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.9166524793945428e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.16321243523316, "grad_norm": 2.1349187459979397, "kl": 0.0631103515625, "learning_rate": 2.839378238341969e-07, "loss": 0.0001, "reward": 2.4999959468841553, "reward_std": 3.810806560977653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.16580310880829, "grad_norm": 2.107267908411277, "kl": 0.028533935546875, "learning_rate": 2.8367875647668393e-07, "loss": -0.0003, "reward": 2.499994158744812, "reward_std": 8.372982563287223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.168393782383419, "grad_norm": 0.5832772870712557, "kl": 0.3056640625, "learning_rate": 2.83419689119171e-07, "loss": 0.0015, "reward": 1.9997992515563965, "reward_std": 5.329875023107888e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997993111610413, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.170984455958549, "grad_norm": 0.14402685031763152, "kl": 0.0499267578125, "learning_rate": 2.83160621761658e-07, "loss": -0.0011, "reward": 2.499997615814209, "reward_std": 1.049370325745258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.1735751295336785, "grad_norm": 0.10125029651250043, "kl": 0.059814453125, "learning_rate": 2.8290155440414504e-07, "loss": 0.0008, "reward": 2.4999985694885254, "reward_std": 1.4915083283995045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.176165803108808, "grad_norm": 2.214707444395702, "kl": 0.064697265625, "learning_rate": 2.8264248704663215e-07, "loss": 0.0007, "reward": 2.4999823570251465, "reward_std": 7.293636826943839e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.178756476683938, "grad_norm": 4.312535342757309, "kl": 0.13623046875, "learning_rate": 2.8238341968911915e-07, "loss": 0.0012, "reward": 2.4999715089797974, "reward_std": 2.0916557105010725e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999715089797974, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.181347150259067, "grad_norm": 0.11622333618380938, "kl": 0.119873046875, "learning_rate": 2.821243523316062e-07, "loss": 0.0009, "reward": 2.499998450279236, "reward_std": 1.5286501593436697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.183937823834197, "grad_norm": 4.389352263912077, "kl": 0.02313232421875, "learning_rate": 2.8186528497409325e-07, "loss": 0.0014, "reward": 2.499983787536621, "reward_std": 2.306203077750979e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999837279319763, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.186528497409326, "grad_norm": 1.9015968840341313, "kl": 0.128662109375, "learning_rate": 2.816062176165803e-07, "loss": -0.0002, "reward": 2.4999916553497314, "reward_std": 9.00349857602123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.189119170984456, "grad_norm": 0.38769995654805667, "kl": 0.16650390625, "learning_rate": 2.8134715025906736e-07, "loss": 0.0004, "reward": 2.4999953508377075, "reward_std": 1.5121324423716942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.191709844559585, "grad_norm": 0.24253696915121573, "kl": 0.060546875, "learning_rate": 2.810880829015544e-07, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 2.2189463493305084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.194300518134715, "grad_norm": 0.9144746431960473, "kl": 0.05029296875, "learning_rate": 2.808290155440414e-07, "loss": -0.0005, "reward": 2.4999969005584717, "reward_std": 2.0379158058858593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.196891191709844, "grad_norm": 1.1453546550528448, "kl": 0.0640869140625, "learning_rate": 2.8056994818652846e-07, "loss": -0.0, "reward": 2.4999929666519165, "reward_std": 6.042962752417225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.199481865284974, "grad_norm": 0.2956146693268651, "kl": 0.0657958984375, "learning_rate": 2.8031088082901557e-07, "loss": 0.0009, "reward": 2.499996066093445, "reward_std": 2.339157163078198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.2020725388601035, "grad_norm": 1.7690700844613938, "kl": 0.060791015625, "learning_rate": 2.8005181347150257e-07, "loss": 0.0, "reward": 2.4999927282333374, "reward_std": 6.084984988774522e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.204663212435233, "grad_norm": 2.73677534225513, "kl": 0.0924072265625, "learning_rate": 2.797927461139896e-07, "loss": -0.0007, "reward": 2.4999855756759644, "reward_std": 1.0224089237453882e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999855756759644, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.2072538860103625, "grad_norm": 1.1981998138571988, "kl": 0.094482421875, "learning_rate": 2.7953367875647667e-07, "loss": 0.0001, "reward": 2.4999942779541016, "reward_std": 4.210624297229515e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.209844559585492, "grad_norm": 5.968806271937427, "kl": 0.056396484375, "learning_rate": 2.792746113989637e-07, "loss": -0.0005, "reward": 2.4999773502349854, "reward_std": 1.6791575944807846e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999977469444275, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.212435233160622, "grad_norm": 8.778902647680013, "kl": 0.1219482421875, "learning_rate": 2.790155440414508e-07, "loss": 0.0, "reward": 2.4999791383743286, "reward_std": 5.396075772523545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791383743286, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 7.215025906735751, "grad_norm": 24.861887796320637, "kl": 0.121826171875, "learning_rate": 2.7875647668393783e-07, "loss": -0.0003, "reward": 1.9924234747886658, "reward_std": 8.40917636537597e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924237132072449, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.217616580310881, "grad_norm": 4.481088771903548, "kl": 0.082763671875, "learning_rate": 2.7849740932642483e-07, "loss": 0.0003, "reward": 2.499975562095642, "reward_std": 2.1934815777058247e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999975562095642, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.22020725388601, "grad_norm": 0.1936703331833292, "kl": 0.115234375, "learning_rate": 2.782383419689119e-07, "loss": 0.0009, "reward": 2.4999983310699463, "reward_std": 1.1752094621897413e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.22279792746114, "grad_norm": 2.8082239298362524, "kl": 0.0716552734375, "learning_rate": 2.77979274611399e-07, "loss": -0.0004, "reward": 2.4999840259552, "reward_std": 9.378438960538915e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999841451644897, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.225388601036269, "grad_norm": 0.09971364772098981, "kl": 0.0672607421875, "learning_rate": 2.77720207253886e-07, "loss": -0.0, "reward": 2.499996066093445, "reward_std": 1.9368695802768343e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.227979274611399, "grad_norm": 0.3689860131457773, "kl": 0.08251953125, "learning_rate": 2.7746113989637304e-07, "loss": 0.0016, "reward": 2.4999940395355225, "reward_std": 3.666489533316053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.230569948186528, "grad_norm": 0.647121482479705, "kl": 0.212158203125, "learning_rate": 2.772020725388601e-07, "loss": 0.0023, "reward": 2.499993681907654, "reward_std": 5.7994617463918985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.233160621761658, "grad_norm": 0.673668075156263, "kl": 0.100830078125, "learning_rate": 2.769430051813471e-07, "loss": -0.0003, "reward": 2.4999879598617554, "reward_std": 3.9994324652070645e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881386756897, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.2357512953367875, "grad_norm": 0.6408799456541552, "kl": 0.1552734375, "learning_rate": 2.766839378238342e-07, "loss": 0.0008, "reward": 2.4999784231185913, "reward_std": 8.780219104664866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999784231185913, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.238341968911917, "grad_norm": 0.21763030173056042, "kl": 0.1025390625, "learning_rate": 2.7642487046632125e-07, "loss": 0.0006, "reward": 2.499996066093445, "reward_std": 2.3939119273563847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.240932642487047, "grad_norm": 285.6986164805541, "kl": 0.096923828125, "learning_rate": 2.7616580310880825e-07, "loss": 0.0001, "reward": 1.8122249841690063, "reward_std": 0.0006545914177422674, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3122249841690063, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.243523316062176, "grad_norm": 1.1424234195270353, "kl": 0.154052734375, "learning_rate": 2.759067357512953e-07, "loss": 0.0016, "reward": 2.4999942779541016, "reward_std": 5.0994269713555695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.246113989637306, "grad_norm": 0.21947202862188261, "kl": 0.070068359375, "learning_rate": 2.756476683937824e-07, "loss": 0.0001, "reward": 2.499995708465576, "reward_std": 3.2211644338531187e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.248704663212435, "grad_norm": 0.09555870706533301, "kl": 0.050048828125, "learning_rate": 2.753886010362694e-07, "loss": -0.0008, "reward": 2.499997854232788, "reward_std": 1.7976130379793176e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.251295336787565, "grad_norm": 0.0807983102705631, "kl": 0.12646484375, "learning_rate": 2.7512953367875646e-07, "loss": 0.0001, "reward": 2.499994993209839, "reward_std": 2.084456241391308e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994933605194, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.253886010362694, "grad_norm": 0.3113020915404871, "kl": 0.0484619140625, "learning_rate": 2.748704663212435e-07, "loss": -0.0002, "reward": 2.499987483024597, "reward_std": 4.419826723278675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874830245972, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.256476683937824, "grad_norm": 1.045442689806256, "kl": 0.10205078125, "learning_rate": 2.746113989637305e-07, "loss": -0.0006, "reward": 2.4999945163726807, "reward_std": 4.731215085485019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.259067357512953, "grad_norm": 0.8406275106284683, "kl": 0.1376953125, "learning_rate": 2.743523316062176e-07, "loss": 0.0005, "reward": 2.499984383583069, "reward_std": 3.909989288786164e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.261658031088083, "grad_norm": 0.4249658280498966, "kl": 0.076904296875, "learning_rate": 2.7409326424870467e-07, "loss": 0.0009, "reward": 2.4999955892562866, "reward_std": 2.975777761093923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.2642487046632125, "grad_norm": 0.14715451922718406, "kl": 0.12939453125, "learning_rate": 2.7383419689119167e-07, "loss": -0.0002, "reward": 2.499997615814209, "reward_std": 9.016723936383642e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.266839378238342, "grad_norm": 2.1630849775720002, "kl": 0.17138671875, "learning_rate": 2.735751295336787e-07, "loss": 0.0001, "reward": 1.9972917437553406, "reward_std": 3.55871564465815e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972918629646301, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.269430051813472, "grad_norm": 0.11947457944026632, "kl": 0.099853515625, "learning_rate": 2.7331606217616583e-07, "loss": 0.0007, "reward": 2.499997854232788, "reward_std": 1.527620099750493e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.272020725388601, "grad_norm": 69.84226701683895, "kl": 1.21722412109375, "learning_rate": 2.7305699481865283e-07, "loss": 0.0037, "reward": 1.946344256401062, "reward_std": 0.002567103309502272, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4463444352149963, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.274611398963731, "grad_norm": 0.30733205815695913, "kl": 0.0859375, "learning_rate": 2.727979274611399e-07, "loss": 0.0001, "reward": 2.499997615814209, "reward_std": 3.0245171274145832e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.27720207253886, "grad_norm": 0.12606911107675, "kl": 0.112548828125, "learning_rate": 2.7253886010362694e-07, "loss": 0.0, "reward": 2.499998092651367, "reward_std": 1.3479630638357776e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.27979274611399, "grad_norm": 0.4592249794749426, "kl": 0.098388671875, "learning_rate": 2.7227979274611393e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 3.842052819891251e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.282383419689119, "grad_norm": 9.682477477219402, "kl": 0.137451171875, "learning_rate": 2.7202072538860104e-07, "loss": -0.0002, "reward": 1.8723435401916504, "reward_std": 0.0008437733761752497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3723435997962952, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.284974093264249, "grad_norm": 0.17003074781660135, "kl": 0.09912109375, "learning_rate": 2.717616580310881e-07, "loss": 0.001, "reward": 2.499994158744812, "reward_std": 1.7374125889091374e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.287564766839378, "grad_norm": 0.12318931287536186, "kl": 0.0751953125, "learning_rate": 2.715025906735751e-07, "loss": 0.001, "reward": 2.4999974966049194, "reward_std": 2.005502267365955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.290155440414508, "grad_norm": 0.08174513535293738, "kl": 0.09326171875, "learning_rate": 2.7124352331606215e-07, "loss": 0.0018, "reward": 2.4999969005584717, "reward_std": 1.9077306774306635e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.2927461139896375, "grad_norm": 0.04045268804357678, "kl": 0.1337890625, "learning_rate": 2.709844559585492e-07, "loss": -0.0001, "reward": 2.4999942779541016, "reward_std": 1.423873243311391e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.295336787564767, "grad_norm": 0.3050327924472231, "kl": 0.13232421875, "learning_rate": 2.7072538860103625e-07, "loss": -0.0002, "reward": 2.4999977350234985, "reward_std": 1.9395337176320027e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.2979274611398965, "grad_norm": 0.12169573122756865, "kl": 0.101287841796875, "learning_rate": 2.704663212435233e-07, "loss": 0.0005, "reward": 2.4999970197677612, "reward_std": 2.3166951450548368e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.300518134715026, "grad_norm": 53.71103215649146, "kl": 0.08984375, "learning_rate": 2.7020725388601036e-07, "loss": 0.0005, "reward": 2.4999101161956787, "reward_std": 2.539685272040515e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999101758003235, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.303108808290156, "grad_norm": 9.489807047133581, "kl": 0.2138671875, "learning_rate": 2.6994818652849736e-07, "loss": 0.0009, "reward": 1.8930606842041016, "reward_std": 0.000643064509404212, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3930606842041016, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.305699481865285, "grad_norm": 0.11036868030547739, "kl": 0.127685546875, "learning_rate": 2.6968911917098446e-07, "loss": -0.0002, "reward": 2.4999959468841553, "reward_std": 2.034824944985303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.308290155440415, "grad_norm": 0.0978229157023234, "kl": 0.02239990234375, "learning_rate": 2.694300518134715e-07, "loss": -0.0003, "reward": 2.4999887943267822, "reward_std": 1.3666916061083612e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 7.310880829015544, "grad_norm": 37.09775342321396, "kl": 0.0986328125, "learning_rate": 2.691709844559585e-07, "loss": -0.0, "reward": 1.96336430311203, "reward_std": 0.011848652355411105, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4633644223213196, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.313471502590674, "grad_norm": 0.536238910666692, "kl": 0.05078125, "learning_rate": 2.6891191709844557e-07, "loss": -0.0007, "reward": 2.499995470046997, "reward_std": 3.5237948168287403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.316062176165803, "grad_norm": 5.597527111946226, "kl": 1.6572265625, "learning_rate": 2.686528497409326e-07, "loss": 0.0074, "reward": 2.49999737739563, "reward_std": 4.774386638928263e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.318652849740933, "grad_norm": 0.062350314297028864, "kl": 0.04925537109375, "learning_rate": 2.6839378238341967e-07, "loss": 0.0005, "reward": 2.4999988079071045, "reward_std": 8.074643460531661e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.321243523316062, "grad_norm": 1.5731215355287473, "kl": 0.1396484375, "learning_rate": 2.681347150259067e-07, "loss": 0.0009, "reward": 1.9988993406295776, "reward_std": 5.0001223939943884e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988993704319, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.323834196891192, "grad_norm": 0.020898047196269424, "kl": 0.080810546875, "learning_rate": 2.678756476683938e-07, "loss": -0.0001, "reward": 2.49999737739563, "reward_std": 4.913568147912883e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3264248704663215, "grad_norm": 6.411696719267693, "kl": 0.1953125, "learning_rate": 2.676165803108808e-07, "loss": 0.0003, "reward": 1.9919129610061646, "reward_std": 0.00013535398539943344, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4919130206108093, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.329015544041451, "grad_norm": 0.1719146435688225, "kl": 0.10693359375, "learning_rate": 2.6735751295336783e-07, "loss": 0.0017, "reward": 2.4999977350234985, "reward_std": 1.578277363023517e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.331606217616581, "grad_norm": 0.09190455201668839, "kl": 0.027435302734375, "learning_rate": 2.6709844559585494e-07, "loss": -0.0011, "reward": 2.4999961853027344, "reward_std": 1.9547299530131568e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.33419689119171, "grad_norm": 19.49882902581193, "kl": 0.2447509765625, "learning_rate": 2.6683937823834194e-07, "loss": 0.0008, "reward": 1.8961536884307861, "reward_std": 0.000317055290679491, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3961536884307861, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.33678756476684, "grad_norm": 0.11290942689365722, "kl": 0.1258544921875, "learning_rate": 2.66580310880829e-07, "loss": 0.0011, "reward": 2.499996781349182, "reward_std": 1.9968296669503616e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.339378238341969, "grad_norm": 7.711760924911983, "kl": 2.45703125, "learning_rate": 2.6632124352331604e-07, "loss": 0.0093, "reward": 1.9983811378479004, "reward_std": 2.9145689211418357e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49838125705719, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.341968911917099, "grad_norm": 0.07971531553980153, "kl": 0.04437255859375, "learning_rate": 2.6606217616580315e-07, "loss": -0.0001, "reward": 2.499996781349182, "reward_std": 1.6447785355921951e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.344559585492228, "grad_norm": 1.256762118337873, "kl": 0.0859375, "learning_rate": 2.6580310880829015e-07, "loss": 0.0005, "reward": 2.4999905824661255, "reward_std": 5.3548238270195725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.347150259067358, "grad_norm": 2.4568200693367257, "kl": 0.1611328125, "learning_rate": 2.655440414507772e-07, "loss": 0.0011, "reward": 1.9940448999404907, "reward_std": 4.381606322567677e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4940448701381683, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.349740932642487, "grad_norm": 5.807578965526504, "kl": 0.1845703125, "learning_rate": 2.652849740932642e-07, "loss": 0.0007, "reward": 1.974083662033081, "reward_std": 0.00013516989685058434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4740836024284363, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.352331606217617, "grad_norm": 0.28473482585784043, "kl": 0.09698486328125, "learning_rate": 2.6502590673575125e-07, "loss": 0.0005, "reward": 2.4999853372573853, "reward_std": 4.394527081785782e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999852180480957, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3549222797927465, "grad_norm": 0.8149931408037281, "kl": 0.05291748046875, "learning_rate": 2.6476683937823836e-07, "loss": -0.0002, "reward": 2.499993324279785, "reward_std": 2.9201254392319242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 7.357512953367876, "grad_norm": 0.23416833104159884, "kl": 0.093017578125, "learning_rate": 2.645077720207254e-07, "loss": 0.0014, "reward": 2.4999958276748657, "reward_std": 2.33332400512154e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.360103626943006, "grad_norm": 0.9815129000860943, "kl": 0.0684814453125, "learning_rate": 2.642487046632124e-07, "loss": 0.0007, "reward": 2.4999871253967285, "reward_std": 7.650276756976382e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871850013733, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.362694300518135, "grad_norm": 0.13989188310038264, "kl": 0.17578125, "learning_rate": 2.6398963730569946e-07, "loss": 0.0001, "reward": 2.4999905824661255, "reward_std": 4.645750323106768e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990701675415, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.365284974093265, "grad_norm": 3.318404663999125, "kl": 0.125244140625, "learning_rate": 2.6373056994818657e-07, "loss": 0.0012, "reward": 2.249942898750305, "reward_std": 0.26726726118313593, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499428391456604, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.367875647668393, "grad_norm": 0.15279163201083765, "kl": 0.091552734375, "learning_rate": 2.6347150259067357e-07, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 1.2477871109695116e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.370466321243523, "grad_norm": 0.10827347688617023, "kl": 0.126708984375, "learning_rate": 2.632124352331606e-07, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.55017215774933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.373056994818652, "grad_norm": 0.35876915240341717, "kl": 0.112060546875, "learning_rate": 2.6295336787564767e-07, "loss": 0.001, "reward": 2.4999982118606567, "reward_std": 1.2198702279420104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.375647668393782, "grad_norm": 0.4763062837390196, "kl": 0.17822265625, "learning_rate": 2.6269430051813467e-07, "loss": 0.0015, "reward": 2.499993920326233, "reward_std": 4.087596380486502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.3782383419689115, "grad_norm": 0.5201992278869353, "kl": 0.04827880859375, "learning_rate": 2.624352331606218e-07, "loss": -0.0009, "reward": 2.499990940093994, "reward_std": 3.921487831348713e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.380829015544041, "grad_norm": 0.2649265145299683, "kl": 0.069091796875, "learning_rate": 2.6217616580310883e-07, "loss": -0.0, "reward": 2.499995231628418, "reward_std": 2.3153948518483958e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.383419689119171, "grad_norm": 288.0289965221196, "kl": 0.1019287109375, "learning_rate": 2.6191709844559583e-07, "loss": 0.0007, "reward": 1.9900956749916077, "reward_std": 0.002839759652260909, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4900956749916077, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.3860103626943, "grad_norm": 0.1173526664802589, "kl": 0.0849609375, "learning_rate": 2.616580310880829e-07, "loss": -0.0004, "reward": 2.499998450279236, "reward_std": 1.1589862651817384e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.38860103626943, "grad_norm": 1.9513600650137177, "kl": 0.08984375, "learning_rate": 2.6139896373056994e-07, "loss": -0.0008, "reward": 2.499987006187439, "reward_std": 7.780848591210088e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999869465827942, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.391191709844559, "grad_norm": 1.44185550229638, "kl": 0.09326171875, "learning_rate": 2.61139896373057e-07, "loss": 0.0013, "reward": 2.499987483024597, "reward_std": 7.910099952823657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999874234199524, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.393782383419689, "grad_norm": 0.06556698368029117, "kl": 0.0498046875, "learning_rate": 2.6088082901554404e-07, "loss": 0.0001, "reward": 2.4999953508377075, "reward_std": 1.8432132549150992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.396373056994818, "grad_norm": 2.621511905400639, "kl": 0.0667724609375, "learning_rate": 2.606217616580311e-07, "loss": -0.0012, "reward": 2.4999759197235107, "reward_std": 1.0496055665498716e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999761581420898, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.398963730569948, "grad_norm": 0.8053155909446992, "kl": 0.0458984375, "learning_rate": 2.603626943005181e-07, "loss": 0.0012, "reward": 2.499986171722412, "reward_std": 7.358378184108005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.401554404145077, "grad_norm": 0.06987777668149876, "kl": 0.142578125, "learning_rate": 2.601036269430052e-07, "loss": 0.0003, "reward": 2.49999737739563, "reward_std": 1.764308791507574e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.404145077720207, "grad_norm": 37.506404446012226, "kl": 0.5703125, "learning_rate": 2.5984455958549225e-07, "loss": 0.0023, "reward": 1.3604300022125244, "reward_std": 0.0005023884441470727, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8604300618171692, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.4067357512953365, "grad_norm": 0.1109195552672259, "kl": 0.134521484375, "learning_rate": 2.5958549222797925e-07, "loss": 0.0002, "reward": 2.4999979734420776, "reward_std": 1.3841492432220548e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.409326424870466, "grad_norm": 0.15158453087956889, "kl": 0.04876708984375, "learning_rate": 2.593264248704663e-07, "loss": -0.0007, "reward": 2.49999463558197, "reward_std": 1.3965968150841945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.4119170984455955, "grad_norm": 5.299770011902868, "kl": 0.130615234375, "learning_rate": 2.5906735751295336e-07, "loss": 0.0005, "reward": 1.2727726697921753, "reward_std": 0.0013119927025400102, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7727727293968201, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.414507772020725, "grad_norm": 1.4925445805339939, "kl": 0.0908203125, "learning_rate": 2.588082901554404e-07, "loss": 0.0011, "reward": 2.4999911785125732, "reward_std": 8.532417041351437e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.417098445595855, "grad_norm": 8.510300410315063, "kl": 0.13916015625, "learning_rate": 2.5854922279792746e-07, "loss": 0.0007, "reward": 1.9913722276687622, "reward_std": 0.00010243407575671881, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4913722276687622, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.419689119170984, "grad_norm": 19.04575181563735, "kl": 0.0484619140625, "learning_rate": 2.582901554404145e-07, "loss": 0.0008, "reward": 1.8201225996017456, "reward_std": 0.0006248899346701364, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3201225996017456, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.422279792746114, "grad_norm": 0.20395293465420408, "kl": 0.093505859375, "learning_rate": 2.580310880829015e-07, "loss": -0.0008, "reward": 2.4999972581863403, "reward_std": 3.359025754434697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.424870466321243, "grad_norm": 0.5647746461380362, "kl": 0.0791015625, "learning_rate": 2.577720207253886e-07, "loss": 0.0006, "reward": 1.9998557567596436, "reward_std": 1.1571712093427777e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998558163642883, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.427461139896373, "grad_norm": 0.21336476858258724, "kl": 0.1552734375, "learning_rate": 2.5751295336787567e-07, "loss": -0.0004, "reward": 2.49999737739563, "reward_std": 2.7400626549933804e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.430051813471502, "grad_norm": 0.2979459148216057, "kl": 0.06512451171875, "learning_rate": 2.5725388601036267e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 3.4764248653118557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.432642487046632, "grad_norm": 0.8884117150896411, "kl": 0.11572265625, "learning_rate": 2.569948186528497e-07, "loss": -0.0002, "reward": 2.4999927282333374, "reward_std": 5.68606296269536e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.435233160621761, "grad_norm": 0.2898326676854063, "kl": 0.0966796875, "learning_rate": 2.567357512953368e-07, "loss": -0.0, "reward": 2.4999918937683105, "reward_std": 3.7975955962110675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.437823834196891, "grad_norm": 0.07210369819765641, "kl": 0.052001953125, "learning_rate": 2.5647668393782383e-07, "loss": -0.0001, "reward": 2.4999966621398926, "reward_std": 2.071714789053658e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.4404145077720205, "grad_norm": 0.013627292162816393, "kl": 0.044921875, "learning_rate": 2.562176165803109e-07, "loss": 0.0, "reward": 2.499998927116394, "reward_std": 8.017928365688931e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.44300518134715, "grad_norm": 0.6524422920561326, "kl": 0.04150390625, "learning_rate": 2.5595854922279794e-07, "loss": -0.0001, "reward": 2.499988317489624, "reward_std": 4.971472264969634e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999884366989136, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.44559585492228, "grad_norm": 1.4979663538093417, "kl": 0.125, "learning_rate": 2.5569948186528494e-07, "loss": 0.0012, "reward": 2.4999948740005493, "reward_std": 3.2993613103826647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.448186528497409, "grad_norm": 4.017957786491282, "kl": 0.1334228515625, "learning_rate": 2.55440414507772e-07, "loss": 0.0002, "reward": 1.9960945844650269, "reward_std": 9.592810960157294e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496094524860382, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.450777202072539, "grad_norm": 0.1080066074399531, "kl": 0.12646484375, "learning_rate": 2.551813471502591e-07, "loss": 0.0015, "reward": 2.499992609024048, "reward_std": 1.7547014294905239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.453367875647668, "grad_norm": 0.8093916810949638, "kl": 0.0733642578125, "learning_rate": 2.549222797927461e-07, "loss": -0.0013, "reward": 2.4999940395355225, "reward_std": 3.627881937973143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.455958549222798, "grad_norm": 2.1230621929526965, "kl": 0.54541015625, "learning_rate": 2.5466321243523315e-07, "loss": 0.0027, "reward": 2.499990224838257, "reward_std": 1.284029417547572e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.458549222797927, "grad_norm": 4.1443684538405865, "kl": 0.114013671875, "learning_rate": 2.544041450777202e-07, "loss": -0.0005, "reward": 1.9984803795814514, "reward_std": 4.798860277333006e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984804391860962, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.461139896373057, "grad_norm": 1.70317119220248, "kl": 0.14111328125, "learning_rate": 2.5414507772020725e-07, "loss": 0.001, "reward": 1.6802762150764465, "reward_std": 0.0002713516473704658, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1802761852741241, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.463730569948186, "grad_norm": 0.30216547334881655, "kl": 0.0543212890625, "learning_rate": 2.538860103626943e-07, "loss": 0.0012, "reward": 2.499996542930603, "reward_std": 3.7160941133151937e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 7.466321243523316, "grad_norm": 25.731217502639, "kl": 0.1552734375, "learning_rate": 2.5362694300518136e-07, "loss": 0.0006, "reward": 1.9172991514205933, "reward_std": 0.20589225432195235, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4172993302345276, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.4689119170984455, "grad_norm": 0.2907952205794895, "kl": 0.15673828125, "learning_rate": 2.5336787564766836e-07, "loss": 0.0014, "reward": 2.499997615814209, "reward_std": 1.9038781999825005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.471502590673575, "grad_norm": 0.8748229999303786, "kl": 0.21337890625, "learning_rate": 2.531088082901554e-07, "loss": 0.0011, "reward": 2.499997854232788, "reward_std": 1.8593351569506922e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.474093264248705, "grad_norm": 0.3785352925742591, "kl": 0.10205078125, "learning_rate": 2.528497409326425e-07, "loss": -0.0005, "reward": 2.4999947547912598, "reward_std": 3.2770421398709004e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.476683937823834, "grad_norm": 0.09845310238529756, "kl": 0.095062255859375, "learning_rate": 2.525906735751295e-07, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.4673870225578867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.479274611398964, "grad_norm": 14.307335584948014, "kl": 0.0794677734375, "learning_rate": 2.5233160621761657e-07, "loss": 0.0008, "reward": 1.9043012857437134, "reward_std": 0.00037970663970554597, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.404301255941391, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.481865284974093, "grad_norm": 0.2184688879426897, "kl": 0.056640625, "learning_rate": 2.520725388601036e-07, "loss": 0.0011, "reward": 2.499990224838257, "reward_std": 3.5880742643712438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999901056289673, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.484455958549223, "grad_norm": 0.42497084977107596, "kl": 0.091796875, "learning_rate": 2.518134715025906e-07, "loss": -0.0005, "reward": 2.4999884366989136, "reward_std": 4.58022714155959e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999885559082031, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.487046632124352, "grad_norm": 7.952990113739974, "kl": 0.0823974609375, "learning_rate": 2.515544041450777e-07, "loss": -0.0001, "reward": 1.9968880414962769, "reward_std": 0.0001731456113702734, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4968880712985992, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.489637305699482, "grad_norm": 1.0640255431697039, "kl": 0.052978515625, "learning_rate": 2.512953367875648e-07, "loss": -0.0008, "reward": 2.4999929666519165, "reward_std": 7.728891546321393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.492227979274611, "grad_norm": 0.8704568599676987, "kl": 0.105224609375, "learning_rate": 2.510362694300518e-07, "loss": 0.0014, "reward": 2.499979019165039, "reward_std": 1.1728319805115461e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999787211418152, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.494818652849741, "grad_norm": 0.13026421165973223, "kl": 0.06805419921875, "learning_rate": 2.5077720207253883e-07, "loss": 0.001, "reward": 2.499996781349182, "reward_std": 2.5490197117505886e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.4974093264248705, "grad_norm": 0.9377082327832156, "kl": 0.15380859375, "learning_rate": 2.5051813471502594e-07, "loss": -0.0002, "reward": 2.499985933303833, "reward_std": 6.524010359498789e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.5, "grad_norm": 0.13471864888755444, "kl": 0.0858154296875, "learning_rate": 2.5025906735751294e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 2.9680402349185897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 34.3125, "epoch": 7.5025906735751295, "grad_norm": 1.8846261151917099, "kl": 0.3349609375, "learning_rate": 2.5e-07, "loss": 0.0027, "reward": 2.4999828338623047, "reward_std": 1.1658462881314335e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999825954437256, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.505181347150259, "grad_norm": 0.2708193347710579, "kl": 0.061279296875, "learning_rate": 2.4974093264248704e-07, "loss": 0.0001, "reward": 2.4999938011169434, "reward_std": 2.8499486006694497e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.507772020725389, "grad_norm": 0.04815339326362499, "kl": 0.140869140625, "learning_rate": 2.494818652849741e-07, "loss": 0.0014, "reward": 2.499998450279236, "reward_std": 1.033983778597758e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.510362694300518, "grad_norm": 2.7722791148945602, "kl": 0.1064453125, "learning_rate": 2.4922279792746115e-07, "loss": 0.0009, "reward": 1.9997743368148804, "reward_std": 2.119692885571567e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997743368148804, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.512953367875648, "grad_norm": 0.5106796919303661, "kl": 0.0389404296875, "learning_rate": 2.489637305699482e-07, "loss": 0.0002, "reward": 2.4999722242355347, "reward_std": 6.893227919135825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999720454216003, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.515544041450777, "grad_norm": 1.1406057113302723, "kl": 0.0628662109375, "learning_rate": 2.487046632124352e-07, "loss": -0.0001, "reward": 1.997680902481079, "reward_std": 5.234319459646031e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976809322834015, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.518134715025907, "grad_norm": 0.09218057556949211, "kl": 0.138427734375, "learning_rate": 2.484455958549223e-07, "loss": -0.0, "reward": 2.4999964237213135, "reward_std": 1.6774215509940404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.520725388601036, "grad_norm": 0.6248003374584781, "kl": 0.091064453125, "learning_rate": 2.481865284974093e-07, "loss": 0.001, "reward": 2.4999860525131226, "reward_std": 5.481745461111132e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.523316062176166, "grad_norm": 0.4463045347032181, "kl": 0.1474609375, "learning_rate": 2.4792746113989636e-07, "loss": 0.0, "reward": 2.499995470046997, "reward_std": 3.0378723749890924e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.525906735751295, "grad_norm": 28.84757306523627, "kl": 0.15673828125, "learning_rate": 2.476683937823834e-07, "loss": 0.0006, "reward": 2.249874472618103, "reward_std": 0.2673055271868634, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.749874472618103, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.528497409326425, "grad_norm": 0.15072331791402271, "kl": 0.04351806640625, "learning_rate": 2.4740932642487046e-07, "loss": 0.0013, "reward": 2.4999953508377075, "reward_std": 2.764710757219291e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.5310880829015545, "grad_norm": 26.758720319038975, "kl": 0.0947265625, "learning_rate": 2.471502590673575e-07, "loss": 0.0014, "reward": 1.887969434261322, "reward_std": 0.0015671426558583335, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3879693150520325, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.533678756476684, "grad_norm": 1.9170584304445994, "kl": 0.082275390625, "learning_rate": 2.4689119170984457e-07, "loss": -0.0005, "reward": 2.499989867210388, "reward_std": 4.5294372057469445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999900460243225, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.536269430051814, "grad_norm": 0.2996648625795484, "kl": 0.05059814453125, "learning_rate": 2.466321243523316e-07, "loss": -0.0004, "reward": 2.4999942779541016, "reward_std": 2.558760286319739e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.538860103626943, "grad_norm": 5.79745374157687, "kl": 0.17822265625, "learning_rate": 2.463730569948186e-07, "loss": 0.0009, "reward": 2.499985456466675, "reward_std": 6.584182983715436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.541450777202073, "grad_norm": 0.5010945560815133, "kl": 0.079345703125, "learning_rate": 2.4611398963730567e-07, "loss": -0.0003, "reward": 2.499997138977051, "reward_std": 2.179568355131778e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.544041450777202, "grad_norm": 13.564812207755466, "kl": 0.099853515625, "learning_rate": 2.458549222797927e-07, "loss": 0.0006, "reward": 1.994426965713501, "reward_std": 0.00014078659461347343, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4944270253181458, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.546632124352332, "grad_norm": 37.44207645858213, "kl": 0.169921875, "learning_rate": 2.455958549222798e-07, "loss": 0.0005, "reward": 1.996111810207367, "reward_std": 6.63758300589734e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.496111810207367, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.549222797927461, "grad_norm": 6.510331738359985, "kl": 0.084228515625, "learning_rate": 2.4533678756476683e-07, "loss": 0.0001, "reward": 2.4999890327453613, "reward_std": 6.922352440597024e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999889135360718, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.551813471502591, "grad_norm": 0.422331861546378, "kl": 0.05718994140625, "learning_rate": 2.450777202072539e-07, "loss": 0.0011, "reward": 2.49998140335083, "reward_std": 5.3374250228444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815225601196, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.55440414507772, "grad_norm": 5.557996168557265, "kl": 0.154541015625, "learning_rate": 2.4481865284974094e-07, "loss": 0.0003, "reward": 1.7923298478126526, "reward_std": 0.0005259371980628202, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2923299670219421, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.55699481865285, "grad_norm": 24.26915595716572, "kl": 0.0728759765625, "learning_rate": 2.44559585492228e-07, "loss": 0.0013, "reward": 2.1874327659606934, "reward_std": 0.25881336485167594, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.687432587146759, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.5595854922279795, "grad_norm": 1.1142526531240982, "kl": 0.155517578125, "learning_rate": 2.4430051813471504e-07, "loss": 0.0009, "reward": 2.4999958276748657, "reward_std": 5.5810438652770245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.562176165803109, "grad_norm": 0.771435129953307, "kl": 0.0540771484375, "learning_rate": 2.4404145077720204e-07, "loss": 0.0006, "reward": 2.4999858140945435, "reward_std": 6.237439947653911e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999860525131226, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.564766839378239, "grad_norm": 1.5585308077868703, "kl": 0.08203125, "learning_rate": 2.437823834196891e-07, "loss": 0.0012, "reward": 2.499992609024048, "reward_std": 7.950610552143189e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.567357512953368, "grad_norm": 0.550049483203788, "kl": 0.08050537109375, "learning_rate": 2.4352331606217615e-07, "loss": -0.0011, "reward": 2.499992847442627, "reward_std": 5.1874315545319405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927878379822, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.569948186528498, "grad_norm": 6.266307374449151, "kl": 0.11712646484375, "learning_rate": 2.432642487046632e-07, "loss": 0.0006, "reward": 1.998780608177185, "reward_std": 3.144343020267115e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987806379795074, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.572538860103627, "grad_norm": 10.941306914803079, "kl": 0.15478515625, "learning_rate": 2.4300518134715025e-07, "loss": 0.0006, "reward": 1.9986087679862976, "reward_std": 0.00010592287355848384, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49860879778862, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.575129533678757, "grad_norm": 0.6572794132883955, "kl": 0.03253173828125, "learning_rate": 2.427461139896373e-07, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 1.7473318507654767e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.577720207253886, "grad_norm": 0.18651234912245118, "kl": 0.083984375, "learning_rate": 2.4248704663212436e-07, "loss": 0.0002, "reward": 2.4999860525131226, "reward_std": 2.688864469746477e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999986171722412, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.580310880829016, "grad_norm": 0.12622445689324088, "kl": 0.0965576171875, "learning_rate": 2.422279792746114e-07, "loss": 0.0014, "reward": 2.4999990463256836, "reward_std": 8.986923774045863e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999989867210388, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.582901554404145, "grad_norm": 0.15322209983474763, "kl": 0.083984375, "learning_rate": 2.419689119170984e-07, "loss": 0.0012, "reward": 2.499995708465576, "reward_std": 2.3700629299128195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.585492227979275, "grad_norm": 1.4186418613317566, "kl": 0.0859375, "learning_rate": 2.4170984455958546e-07, "loss": 0.001, "reward": 2.4999895095825195, "reward_std": 7.445257551808027e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999892711639404, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.5880829015544045, "grad_norm": 0.08167462650580458, "kl": 0.09521484375, "learning_rate": 2.414507772020725e-07, "loss": -0.0009, "reward": 2.4999983310699463, "reward_std": 8.38005377090667e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.590673575129534, "grad_norm": 0.09231319213669713, "kl": 0.12158203125, "learning_rate": 2.4119170984455957e-07, "loss": -0.0001, "reward": 2.4999979734420776, "reward_std": 1.8866881532630941e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.5932642487046635, "grad_norm": 0.043819879162593965, "kl": 0.069580078125, "learning_rate": 2.409326424870466e-07, "loss": -0.0002, "reward": 2.4999979734420776, "reward_std": 1.5304378280234232e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.595854922279793, "grad_norm": 0.6299510461029266, "kl": 0.0718994140625, "learning_rate": 2.4067357512953367e-07, "loss": 0.001, "reward": 2.4999934434890747, "reward_std": 6.542053256453073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.598445595854923, "grad_norm": 0.3244237620881964, "kl": 0.08203125, "learning_rate": 2.404145077720207e-07, "loss": 0.0009, "reward": 2.499996542930603, "reward_std": 2.8834252816523076e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.601036269430052, "grad_norm": 0.3657266534319599, "kl": 0.069580078125, "learning_rate": 2.401554404145077e-07, "loss": 0.0006, "reward": 2.499991774559021, "reward_std": 2.7079482265435217e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991774559021, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 7.603626943005182, "grad_norm": 5.334883455254238, "kl": 0.1826171875, "learning_rate": 2.3989637305699483e-07, "loss": 0.0007, "reward": 1.477147400379181, "reward_std": 0.0001751736162987072, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9771474301815033, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.606217616580311, "grad_norm": 0.11719036022112504, "kl": 0.0330810546875, "learning_rate": 2.3963730569948183e-07, "loss": 0.0006, "reward": 2.4999985694885254, "reward_std": 1.4707190700846695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.608808290155441, "grad_norm": 20.06380237042844, "kl": 0.126708984375, "learning_rate": 2.393782383419689e-07, "loss": 0.001, "reward": 1.956397533416748, "reward_std": 0.00039138655671422384, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4563975036144257, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.61139896373057, "grad_norm": 0.7280260370050832, "kl": 0.07806396484375, "learning_rate": 2.3911917098445594e-07, "loss": 0.0006, "reward": 2.499986171722412, "reward_std": 8.108246220217552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999859929084778, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.6139896373057, "grad_norm": 0.13651773338593337, "kl": 0.10498046875, "learning_rate": 2.38860103626943e-07, "loss": 0.0014, "reward": 2.49999737739563, "reward_std": 2.0275131191738183e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 7.616580310880829, "grad_norm": 4.7458727693058345, "kl": 0.189697265625, "learning_rate": 2.3860103626943004e-07, "loss": 0.0012, "reward": 1.9937777519226074, "reward_std": 5.8767218433786184e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4937776029109955, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.619170984455959, "grad_norm": 2.520524567806283, "kl": 0.14532470703125, "learning_rate": 2.3834196891191707e-07, "loss": 0.0006, "reward": 2.499991536140442, "reward_std": 8.95783091436897e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999915957450867, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.6217616580310885, "grad_norm": 0.38115085090060286, "kl": 0.07666015625, "learning_rate": 2.3808290155440415e-07, "loss": -0.0001, "reward": 2.4999923706054688, "reward_std": 3.84506120099104e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.624352331606218, "grad_norm": 0.1694456372319709, "kl": 0.114990234375, "learning_rate": 2.3782383419689117e-07, "loss": 0.0011, "reward": 2.4999982118606567, "reward_std": 1.1256768459588784e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.626943005181348, "grad_norm": 0.11068370268093096, "kl": 0.20703125, "learning_rate": 2.3756476683937823e-07, "loss": 0.0014, "reward": 2.499998092651367, "reward_std": 2.1066209114906087e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.629533678756477, "grad_norm": 0.49938166988170235, "kl": 0.082763671875, "learning_rate": 2.3730569948186528e-07, "loss": 0.0006, "reward": 2.4999959468841553, "reward_std": 3.007643186947462e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.632124352331607, "grad_norm": 2.452864325596358, "kl": 0.0775146484375, "learning_rate": 2.3704663212435233e-07, "loss": 0.0006, "reward": 2.499997854232788, "reward_std": 1.7866414623313176e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.634715025906736, "grad_norm": 8.601088281170098, "kl": 0.25927734375, "learning_rate": 2.3678756476683936e-07, "loss": 0.0019, "reward": 1.995637059211731, "reward_std": 0.00017986779039347311, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4956370294094086, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.637305699481866, "grad_norm": 0.243646272165451, "kl": 0.095703125, "learning_rate": 2.3652849740932644e-07, "loss": 0.0004, "reward": 2.4999974966049194, "reward_std": 1.397219818954909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.639896373056995, "grad_norm": 4.145585553461657, "kl": 0.195068359375, "learning_rate": 2.3626943005181346e-07, "loss": 0.0007, "reward": 0.9837982654571533, "reward_std": 0.0001389335229760036, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.4837982654571533, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.642487046632124, "grad_norm": 43.298590077828365, "kl": 0.1026611328125, "learning_rate": 2.360103626943005e-07, "loss": 0.0008, "reward": 2.499996304512024, "reward_std": 2.4233220301539404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.6450777202072535, "grad_norm": 195.98909328407052, "kl": 0.077178955078125, "learning_rate": 2.3575129533678757e-07, "loss": -0.0006, "reward": 1.9590587615966797, "reward_std": 0.00023839250070523121, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4590588808059692, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.647668393782383, "grad_norm": 3.611542925439923, "kl": 0.105712890625, "learning_rate": 2.354922279792746e-07, "loss": -0.0001, "reward": 2.499990940093994, "reward_std": 4.250309189046675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999909400939941, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.650259067357513, "grad_norm": 1.1790297547408761, "kl": 0.080322265625, "learning_rate": 2.3523316062176165e-07, "loss": 0.0005, "reward": 2.4999793767929077, "reward_std": 6.002086593070999e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999791979789734, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.652849740932642, "grad_norm": 0.6367357057118201, "kl": 0.19482421875, "learning_rate": 2.349740932642487e-07, "loss": 0.0004, "reward": 2.4999942779541016, "reward_std": 4.539891961030662e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.655440414507772, "grad_norm": 0.1469044631302598, "kl": 0.07470703125, "learning_rate": 2.3471502590673575e-07, "loss": 0.0016, "reward": 2.499998450279236, "reward_std": 1.8066745610667567e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.658031088082901, "grad_norm": 17.18381012780644, "kl": 0.07177734375, "learning_rate": 2.3445595854922278e-07, "loss": -0.0003, "reward": 2.3745037317276, "reward_std": 0.23148750290306452, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8745037913322449, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.660621761658031, "grad_norm": 0.13598828339221825, "kl": 0.15673828125, "learning_rate": 2.3419689119170983e-07, "loss": 0.0005, "reward": 2.4999979734420776, "reward_std": 1.6425097442152037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.66321243523316, "grad_norm": 0.5607035626085307, "kl": 0.094482421875, "learning_rate": 2.3393782383419688e-07, "loss": -0.0001, "reward": 2.4999938011169434, "reward_std": 3.2749887282079726e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.66580310880829, "grad_norm": 0.22616883688488423, "kl": 0.0823974609375, "learning_rate": 2.336787564766839e-07, "loss": 0.0006, "reward": 2.4999940395355225, "reward_std": 2.4444340738227766e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.668393782383419, "grad_norm": 0.04940128090796161, "kl": 0.07275390625, "learning_rate": 2.33419689119171e-07, "loss": 0.0009, "reward": 2.499998688697815, "reward_std": 7.59293158125729e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.670984455958549, "grad_norm": 0.2590110179005871, "kl": 0.098388671875, "learning_rate": 2.3316062176165802e-07, "loss": 0.001, "reward": 2.4999932050704956, "reward_std": 5.325743927642179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.6735751295336785, "grad_norm": 6.061628499129426, "kl": 0.30859375, "learning_rate": 2.3290155440414507e-07, "loss": 0.0012, "reward": 1.4023630023002625, "reward_std": 0.00030712188163306564, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9023629426956177, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.676165803108808, "grad_norm": 2.047964944312291, "kl": 0.1474609375, "learning_rate": 2.3264248704663212e-07, "loss": 0.0006, "reward": 2.4999921321868896, "reward_std": 6.337079412332969e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921917915344, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.678756476683938, "grad_norm": 0.3081661714282979, "kl": 0.0531005859375, "learning_rate": 2.3238341968911915e-07, "loss": 0.0008, "reward": 2.499997138977051, "reward_std": 3.591041831896291e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.681347150259067, "grad_norm": 0.22279129819831683, "kl": 0.0390625, "learning_rate": 2.321243523316062e-07, "loss": 0.0001, "reward": 2.4999966621398926, "reward_std": 2.9706216082558967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.683937823834197, "grad_norm": 0.057244872882196304, "kl": 0.0802001953125, "learning_rate": 2.3186528497409325e-07, "loss": 0.0008, "reward": 1.9984210133552551, "reward_std": 9.29600219023996e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984210431575775, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.686528497409326, "grad_norm": 0.6704997509583145, "kl": 0.1007080078125, "learning_rate": 2.316062176165803e-07, "loss": 0.0004, "reward": 2.499019742012024, "reward_std": 3.267810623697187e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990198612213135, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 7.689119170984456, "grad_norm": 1.2754302774991817, "kl": 0.0908203125, "learning_rate": 2.3134715025906733e-07, "loss": -0.0001, "reward": 2.4999780654907227, "reward_std": 9.924450296239229e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999978244304657, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.691709844559585, "grad_norm": 1.4850388037850724, "kl": 0.12109375, "learning_rate": 2.310880829015544e-07, "loss": 0.0001, "reward": 2.4999935626983643, "reward_std": 5.476312480823253e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.694300518134715, "grad_norm": 0.16404457944293435, "kl": 0.0599365234375, "learning_rate": 2.3082901554404144e-07, "loss": 0.0015, "reward": 2.4999983310699463, "reward_std": 2.295492720350012e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.696891191709844, "grad_norm": 0.0965994030064735, "kl": 0.0533447265625, "learning_rate": 2.3056994818652846e-07, "loss": 0.0001, "reward": 2.499997615814209, "reward_std": 1.1713038361449435e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.699481865284974, "grad_norm": 0.3055841607363679, "kl": 0.0931396484375, "learning_rate": 2.3031088082901554e-07, "loss": 0.001, "reward": 2.499997615814209, "reward_std": 1.9341566712682834e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.7020725388601035, "grad_norm": 0.6607677650557111, "kl": 0.0716552734375, "learning_rate": 2.3005181347150257e-07, "loss": 0.0003, "reward": 2.4999901056289673, "reward_std": 6.174843520057038e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902248382568, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.704663212435233, "grad_norm": 0.14703651007227939, "kl": 0.033447265625, "learning_rate": 2.2979274611398962e-07, "loss": -0.0007, "reward": 2.4999977350234985, "reward_std": 1.6590072391409194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.7072538860103625, "grad_norm": 0.6836893906731341, "kl": 0.098876953125, "learning_rate": 2.2953367875647667e-07, "loss": 0.0008, "reward": 2.4999966621398926, "reward_std": 6.666901185781171e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.709844559585492, "grad_norm": 4.513052508156367, "kl": 0.1060791015625, "learning_rate": 2.2927461139896373e-07, "loss": 0.0003, "reward": 1.9882862567901611, "reward_std": 0.00016485343704175648, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4882861971855164, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.712435233160622, "grad_norm": 0.060720704936753946, "kl": 0.09765625, "learning_rate": 2.2901554404145075e-07, "loss": 0.0011, "reward": 2.4999994039535522, "reward_std": 6.702452424178773e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.715025906735751, "grad_norm": 0.6004266718491046, "kl": 0.24609375, "learning_rate": 2.2875647668393783e-07, "loss": 0.0008, "reward": 2.4999974966049194, "reward_std": 3.527352106402759e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.717616580310881, "grad_norm": 0.01812897126848882, "kl": 0.0642242431640625, "learning_rate": 2.2849740932642486e-07, "loss": 0.0007, "reward": 2.499999523162842, "reward_std": 5.310442077188782e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999997019767761, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.72020725388601, "grad_norm": 0.048094224914001525, "kl": 0.05621337890625, "learning_rate": 2.2823834196891188e-07, "loss": -0.0011, "reward": 2.4999990463256836, "reward_std": 6.808981254380342e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.72279792746114, "grad_norm": 0.232807928441412, "kl": 0.05938720703125, "learning_rate": 2.2797927461139896e-07, "loss": -0.0, "reward": 2.499997138977051, "reward_std": 2.2767966925130168e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.725388601036269, "grad_norm": 0.11633539879680475, "kl": 0.1195068359375, "learning_rate": 2.27720207253886e-07, "loss": -0.0001, "reward": 1.953113853931427, "reward_std": 8.382568097431431e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4531138837337494, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.727979274611399, "grad_norm": 0.03658213069447342, "kl": 0.05908203125, "learning_rate": 2.2746113989637304e-07, "loss": 0.0014, "reward": 2.4999983310699463, "reward_std": 1.4331392890198913e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.730569948186528, "grad_norm": 2.5249971790989147, "kl": 0.092529296875, "learning_rate": 2.272020725388601e-07, "loss": 0.0005, "reward": 2.499990940093994, "reward_std": 6.312982350209495e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999907612800598, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.733160621761658, "grad_norm": 2.2048208802356037, "kl": 0.1181640625, "learning_rate": 2.2694300518134715e-07, "loss": 0.0014, "reward": 1.8863747119903564, "reward_std": 0.00018447764458073834, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.386374592781067, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.7357512953367875, "grad_norm": 0.7929573325592978, "kl": 0.10009765625, "learning_rate": 2.2668393782383417e-07, "loss": 0.0004, "reward": 1.9995321035385132, "reward_std": 1.4098423093855672e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995321333408356, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.738341968911917, "grad_norm": 0.3306776670372298, "kl": 0.063232421875, "learning_rate": 2.2642487046632123e-07, "loss": -0.0003, "reward": 2.499997854232788, "reward_std": 2.5592077008695924e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.740932642487047, "grad_norm": 5.439705292379303, "kl": 0.166748046875, "learning_rate": 2.2616580310880828e-07, "loss": 0.001, "reward": 1.919031023979187, "reward_std": 0.00045640194349516605, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4190311133861542, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.743523316062176, "grad_norm": 0.17917557863645966, "kl": 0.0482177734375, "learning_rate": 2.2590673575129533e-07, "loss": 0.0005, "reward": 2.4999982118606567, "reward_std": 1.1641488981695147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.746113989637306, "grad_norm": 14.472835750069892, "kl": 0.093017578125, "learning_rate": 2.2564766839378238e-07, "loss": 0.0006, "reward": 1.9872830510139465, "reward_std": 0.0003825480937393877, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4872829914093018, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.748704663212435, "grad_norm": 4.4554873041098535, "kl": 0.20068359375, "learning_rate": 2.253886010362694e-07, "loss": 0.0013, "reward": 1.7720575332641602, "reward_std": 0.0003944761579077749, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2720574736595154, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 7.751295336787565, "grad_norm": 3.304391616781944, "kl": 0.197906494140625, "learning_rate": 2.251295336787565e-07, "loss": 0.0003, "reward": 1.9799120426177979, "reward_std": 0.00010527110788416394, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4799121618270874, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.753886010362694, "grad_norm": 0.10017255035828425, "kl": 0.1455078125, "learning_rate": 2.2487046632124352e-07, "loss": -0.0009, "reward": 2.499997854232788, "reward_std": 1.9528069401530956e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.756476683937824, "grad_norm": 1.775040479956177, "kl": 0.1041259765625, "learning_rate": 2.2461139896373054e-07, "loss": -0.0008, "reward": 2.4999914169311523, "reward_std": 7.702289167355048e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 7.759067357512953, "grad_norm": 8.498881243725833, "kl": 0.144775390625, "learning_rate": 2.2435233160621762e-07, "loss": 0.0012, "reward": 1.9859219789505005, "reward_std": 0.0010527848915558025, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4859220683574677, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.761658031088083, "grad_norm": 9.330293986048604, "kl": 0.11083984375, "learning_rate": 2.2409326424870465e-07, "loss": 0.0007, "reward": 2.437412738800049, "reward_std": 0.17701938969071307, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374128580093384, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.7642487046632125, "grad_norm": 0.06290617920521173, "kl": 0.054779052734375, "learning_rate": 2.238341968911917e-07, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 1.6040725654420385e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.766839378238342, "grad_norm": 0.4512121547133678, "kl": 0.12353515625, "learning_rate": 2.2357512953367875e-07, "loss": 0.0008, "reward": 1.9997767210006714, "reward_std": 9.132969353231601e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997766017913818, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.769430051813472, "grad_norm": 17.139758720842824, "kl": 0.1806640625, "learning_rate": 2.233160621761658e-07, "loss": 0.0012, "reward": 1.9556805491447449, "reward_std": 0.0003215945748706872, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4556803703308105, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.772020725388601, "grad_norm": 0.5188629484731856, "kl": 0.0849609375, "learning_rate": 2.2305699481865283e-07, "loss": 0.0012, "reward": 2.499979019165039, "reward_std": 7.1322385792882415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999788999557495, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.774611398963731, "grad_norm": 0.06310146586443106, "kl": 0.12939453125, "learning_rate": 2.2279792746113988e-07, "loss": 0.0002, "reward": 2.4999988079071045, "reward_std": 1.619227816718194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.77720207253886, "grad_norm": 7.039274589696187, "kl": 0.09375, "learning_rate": 2.2253886010362694e-07, "loss": 0.0012, "reward": 2.4997451305389404, "reward_std": 5.299113354340079e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999744951725006, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.77979274611399, "grad_norm": 3.715489128935168, "kl": 0.085693359375, "learning_rate": 2.2227979274611396e-07, "loss": 0.0012, "reward": 1.822838544845581, "reward_std": 0.00023595143682086928, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3228385746479034, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.782383419689119, "grad_norm": 35.80697681801513, "kl": 0.11663818359375, "learning_rate": 2.2202072538860104e-07, "loss": -0.0004, "reward": 2.432840347290039, "reward_std": 0.18995128316512933, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9328404664993286, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.784974093264249, "grad_norm": 0.10969858925322229, "kl": 0.061370849609375, "learning_rate": 2.2176165803108807e-07, "loss": -0.0003, "reward": 2.4999982118606567, "reward_std": 1.3260830939998414e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.787564766839378, "grad_norm": 0.16203893010107592, "kl": 0.1087646484375, "learning_rate": 2.2150259067357512e-07, "loss": 0.0009, "reward": 2.4999974966049194, "reward_std": 3.211570628991467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.790155440414508, "grad_norm": 5.183751642412059, "kl": 0.205078125, "learning_rate": 2.2124352331606217e-07, "loss": 0.0003, "reward": 2.4998831748962402, "reward_std": 2.3188613681668357e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998834133148193, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.7927461139896375, "grad_norm": 0.6309927054160139, "kl": 0.093994140625, "learning_rate": 2.2098445595854923e-07, "loss": 0.001, "reward": 2.499986410140991, "reward_std": 3.516961101013294e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999862909317017, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.795336787564767, "grad_norm": 0.29405330002990454, "kl": 0.159912109375, "learning_rate": 2.2072538860103625e-07, "loss": 0.0009, "reward": 2.4999964237213135, "reward_std": 2.5809340513660572e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.7979274611398965, "grad_norm": 0.5081163984497482, "kl": 0.072998046875, "learning_rate": 2.204663212435233e-07, "loss": 0.0004, "reward": 2.4999911785125732, "reward_std": 5.08629477735667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.800518134715026, "grad_norm": 5.611138051750514, "kl": 0.093505859375, "learning_rate": 2.2020725388601036e-07, "loss": 0.0004, "reward": 1.9998716115951538, "reward_std": 0.0001064846621829929, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998716115951538, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.803108808290156, "grad_norm": 7.267496427067268, "kl": 0.050048828125, "learning_rate": 2.1994818652849738e-07, "loss": -0.0, "reward": 1.9986516237258911, "reward_std": 0.00013987819954763836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986517131328583, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.805699481865285, "grad_norm": 0.15318169809999016, "kl": 0.11328125, "learning_rate": 2.1968911917098446e-07, "loss": -0.0, "reward": 2.4999839067459106, "reward_std": 3.0926872938152883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999840259552002, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.808290155440415, "grad_norm": 0.16374576222824022, "kl": 0.113037109375, "learning_rate": 2.194300518134715e-07, "loss": 0.0003, "reward": 2.4999983310699463, "reward_std": 1.3417750039934617e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.810880829015544, "grad_norm": 1.3150495833201583, "kl": 0.0657958984375, "learning_rate": 2.1917098445595854e-07, "loss": 0.0005, "reward": 2.4999908208847046, "reward_std": 6.791120540583506e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908208847046, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.813471502590674, "grad_norm": 2.479092858002278, "kl": 0.128173828125, "learning_rate": 2.189119170984456e-07, "loss": 0.0009, "reward": 1.9447592496871948, "reward_std": 0.00013159715604160738, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4447591304779053, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.816062176165803, "grad_norm": 0.07084762494339814, "kl": 0.04296875, "learning_rate": 2.1865284974093262e-07, "loss": 0.0011, "reward": 2.4999990463256836, "reward_std": 9.151464155365829e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.818652849740933, "grad_norm": 3.846169558405578, "kl": 0.06298828125, "learning_rate": 2.1839378238341967e-07, "loss": -0.0002, "reward": 1.979655385017395, "reward_std": 0.000159777092449076, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4796555042266846, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.821243523316062, "grad_norm": 1.960973022993101, "kl": 0.0634765625, "learning_rate": 2.1813471502590673e-07, "loss": -0.0, "reward": 2.499996304512024, "reward_std": 4.883814824552246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.823834196891192, "grad_norm": 0.9377522933956081, "kl": 0.34228515625, "learning_rate": 2.1787564766839378e-07, "loss": 0.0011, "reward": 2.4999818801879883, "reward_std": 8.19701540422102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8264248704663215, "grad_norm": 1.1907806486277712, "kl": 0.0693359375, "learning_rate": 2.176165803108808e-07, "loss": -0.0006, "reward": 2.499992847442627, "reward_std": 5.587518216998433e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.829015544041451, "grad_norm": 1.7518464325649516, "kl": 0.128173828125, "learning_rate": 2.1735751295336789e-07, "loss": -0.0002, "reward": 2.49997341632843, "reward_std": 6.6879167661682e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973475933075, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.831606217616581, "grad_norm": 0.41234468921795614, "kl": 0.24462890625, "learning_rate": 2.170984455958549e-07, "loss": 0.002, "reward": 2.499993324279785, "reward_std": 2.963234692288097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932646751404, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.83419689119171, "grad_norm": 1.5262717657511273, "kl": 0.1123046875, "learning_rate": 2.1683937823834194e-07, "loss": 0.0002, "reward": 2.499981999397278, "reward_std": 8.631451549945268e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999820590019226, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.83678756476684, "grad_norm": 0.06133610351617526, "kl": 0.06689453125, "learning_rate": 2.1658031088082902e-07, "loss": -0.0011, "reward": 2.4999983310699463, "reward_std": 8.810840768092021e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.839378238341969, "grad_norm": 0.41373936099062025, "kl": 0.069091796875, "learning_rate": 2.1632124352331604e-07, "loss": 0.0015, "reward": 2.499998688697815, "reward_std": 1.190975950748907e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.841968911917099, "grad_norm": 4.5704246137062, "kl": 0.04931640625, "learning_rate": 2.160621761658031e-07, "loss": -0.0002, "reward": 1.9998024106025696, "reward_std": 3.9728092929181e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998025596141815, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.844559585492228, "grad_norm": 1.506214091610807, "kl": 0.144775390625, "learning_rate": 2.1580310880829015e-07, "loss": -0.0, "reward": 1.9998642206192017, "reward_std": 1.6321398561558453e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998642802238464, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.847150259067358, "grad_norm": 2.840281677666068, "kl": 0.174072265625, "learning_rate": 2.155440414507772e-07, "loss": 0.0008, "reward": 1.821478247642517, "reward_std": 0.00034726609214885684, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3214781284332275, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.849740932642487, "grad_norm": 0.7188994295197779, "kl": 0.064453125, "learning_rate": 2.1528497409326423e-07, "loss": -0.0002, "reward": 2.4999929666519165, "reward_std": 4.578446805680869e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.852331606217617, "grad_norm": 1.370770481823312, "kl": 0.108154296875, "learning_rate": 2.1502590673575128e-07, "loss": 0.0007, "reward": 2.49998939037323, "reward_std": 6.488994245046342e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999893307685852, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.8549222797927465, "grad_norm": 2.117732787684715, "kl": 0.052001953125, "learning_rate": 2.1476683937823833e-07, "loss": 0.0011, "reward": 2.4998621940612793, "reward_std": 2.08686252562984e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998621940612793, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.857512953367876, "grad_norm": 0.3700367947794776, "kl": 0.07379150390625, "learning_rate": 2.1450777202072536e-07, "loss": 0.0009, "reward": 2.499994158744812, "reward_std": 2.8616719589535933e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.860103626943005, "grad_norm": 1.6195198913674689, "kl": 0.1171875, "learning_rate": 2.1424870466321244e-07, "loss": -0.0003, "reward": 2.499969482421875, "reward_std": 9.028870351812657e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999695420265198, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.862694300518134, "grad_norm": 0.4882771074124476, "kl": 0.0601806640625, "learning_rate": 2.1398963730569946e-07, "loss": -0.0005, "reward": 2.4999966621398926, "reward_std": 3.512521175252914e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.865284974093264, "grad_norm": 0.2621269532102978, "kl": 0.236572265625, "learning_rate": 2.1373056994818652e-07, "loss": -0.0002, "reward": 2.4999958276748657, "reward_std": 4.095677013538079e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 7.867875647668393, "grad_norm": 0.6894657108170018, "kl": 0.2275390625, "learning_rate": 2.1347150259067357e-07, "loss": -0.0001, "reward": 2.4999892711639404, "reward_std": 4.097757937415736e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999893307685852, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.870466321243523, "grad_norm": 0.32286157224931034, "kl": 0.08935546875, "learning_rate": 2.1321243523316062e-07, "loss": 0.0024, "reward": 2.4999969005584717, "reward_std": 2.5663498490757775e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.873056994818652, "grad_norm": 1.192669783119955, "kl": 0.1072998046875, "learning_rate": 2.1295336787564765e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 4.818610932488809e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.875647668393782, "grad_norm": 0.14040132101163627, "kl": 0.020477294921875, "learning_rate": 2.126943005181347e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 1.620960517811909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8782383419689115, "grad_norm": 0.28557804175723084, "kl": 0.13818359375, "learning_rate": 2.1243523316062175e-07, "loss": 0.0011, "reward": 2.499998092651367, "reward_std": 1.621204319235403e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.880829015544041, "grad_norm": 32.47139845341851, "kl": 3.42333984375, "learning_rate": 2.1217616580310878e-07, "loss": 0.014, "reward": 2.433580756187439, "reward_std": 0.18785653862721574, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.933580756187439, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.883419689119171, "grad_norm": 0.29161598149629786, "kl": 0.133544921875, "learning_rate": 2.1191709844559586e-07, "loss": -0.0, "reward": 2.4999969005584717, "reward_std": 1.7866369717012276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.8860103626943, "grad_norm": 0.9610300212534616, "kl": 0.1171875, "learning_rate": 2.1165803108808289e-07, "loss": 0.0006, "reward": 2.4999794960021973, "reward_std": 5.429509201348992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999793767929077, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.88860103626943, "grad_norm": 4.219844876982593, "kl": 0.156982421875, "learning_rate": 2.1139896373056996e-07, "loss": 0.0008, "reward": 1.9997655749320984, "reward_std": 2.8689085411315318e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4997656345367432, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.891191709844559, "grad_norm": 0.6850453688738123, "kl": 0.171142578125, "learning_rate": 2.11139896373057e-07, "loss": -0.0007, "reward": 2.4999877214431763, "reward_std": 5.104976366965275e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999877214431763, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.893782383419689, "grad_norm": 0.3771100775471277, "kl": 0.08203125, "learning_rate": 2.1088082901554402e-07, "loss": -0.0007, "reward": 2.499996304512024, "reward_std": 4.827522730010969e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.896373056994818, "grad_norm": 0.655211727901186, "kl": 0.060760498046875, "learning_rate": 2.106217616580311e-07, "loss": 0.001, "reward": 2.4999940395355225, "reward_std": 5.3634421135484445e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993920326233, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.898963730569948, "grad_norm": 30.260415676334414, "kl": 0.074951171875, "learning_rate": 2.1036269430051812e-07, "loss": 0.0004, "reward": 1.9996639490127563, "reward_std": 2.3242853558258503e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996639490127563, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.901554404145077, "grad_norm": 2.7657426971767904, "kl": 0.050537109375, "learning_rate": 2.1010362694300517e-07, "loss": 0.0004, "reward": 1.9999040365219116, "reward_std": 1.4257862460453907e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499904066324234, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.904145077720207, "grad_norm": 0.1574594416802252, "kl": 0.04388427734375, "learning_rate": 2.0984455958549223e-07, "loss": 0.0005, "reward": 2.499995231628418, "reward_std": 1.847226315021544e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.9067357512953365, "grad_norm": 0.12804410972138083, "kl": 0.1669921875, "learning_rate": 2.0958549222797928e-07, "loss": 0.001, "reward": 2.4999955892562866, "reward_std": 1.8536298966864706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.909326424870466, "grad_norm": 3.0548242980664795, "kl": 0.15380859375, "learning_rate": 2.093264248704663e-07, "loss": 0.0001, "reward": 1.99904066324234, "reward_std": 5.545262655459737e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49904066324234, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.9119170984455955, "grad_norm": 6.256809713309946, "kl": 0.131103515625, "learning_rate": 2.0906735751295336e-07, "loss": 0.0004, "reward": 1.9948559999465942, "reward_std": 0.00011882455555678462, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4948559701442719, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.914507772020725, "grad_norm": 0.10216406313540978, "kl": 0.083251953125, "learning_rate": 2.088082901554404e-07, "loss": 0.0006, "reward": 2.499998092651367, "reward_std": 2.050543855602882e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.917098445595855, "grad_norm": 0.2851786198113071, "kl": 0.0911865234375, "learning_rate": 2.0854922279792744e-07, "loss": -0.0001, "reward": 2.4999958276748657, "reward_std": 2.097099411457748e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.919689119170984, "grad_norm": 3.5818919836221954, "kl": 0.18359375, "learning_rate": 2.0829015544041452e-07, "loss": 0.0003, "reward": 2.499968409538269, "reward_std": 5.582526000580401e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999683499336243, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.922279792746114, "grad_norm": 0.17737390143464368, "kl": 0.113037109375, "learning_rate": 2.0803108808290154e-07, "loss": -0.0003, "reward": 2.4999964237213135, "reward_std": 1.8129984482584405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.924870466321243, "grad_norm": 2.0247490285339027, "kl": 0.23681640625, "learning_rate": 2.077720207253886e-07, "loss": 0.0014, "reward": 1.9987263083457947, "reward_std": 6.473113160154753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4987261295318604, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.927461139896373, "grad_norm": 1.1506908007150278, "kl": 0.087890625, "learning_rate": 2.0751295336787565e-07, "loss": 0.0016, "reward": 2.4999799728393555, "reward_std": 8.695652923051966e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999797344207764, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.930051813471502, "grad_norm": 5.83986424841263, "kl": 0.143798828125, "learning_rate": 2.0725388601036267e-07, "loss": 0.0007, "reward": 1.8943517804145813, "reward_std": 0.0006579617829061135, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3943516314029694, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.932642487046632, "grad_norm": 0.10170327880293734, "kl": 0.12255859375, "learning_rate": 2.0699481865284973e-07, "loss": -0.0004, "reward": 2.4999977350234985, "reward_std": 2.202723635491566e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.935233160621761, "grad_norm": 1.5760768946373525, "kl": 0.0771484375, "learning_rate": 2.0673575129533678e-07, "loss": -0.0006, "reward": 1.9999316930770874, "reward_std": 1.3120281323608651e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999319314956665, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 7.937823834196891, "grad_norm": 0.03566346779275998, "kl": 0.0628662109375, "learning_rate": 2.0647668393782383e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 1.0423624132727127e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.9404145077720205, "grad_norm": 0.16240862840114378, "kl": 0.056396484375, "learning_rate": 2.0621761658031086e-07, "loss": 0.0011, "reward": 2.4999959468841553, "reward_std": 2.3705815124230867e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.94300518134715, "grad_norm": 0.277360373152522, "kl": 0.03338623046875, "learning_rate": 2.0595854922279794e-07, "loss": 0.0003, "reward": 2.4999942779541016, "reward_std": 2.7561879960558144e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.94559585492228, "grad_norm": 1.0893218613989033, "kl": 0.0775146484375, "learning_rate": 2.0569948186528496e-07, "loss": -0.0006, "reward": 2.499997138977051, "reward_std": 3.194425858055183e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.948186528497409, "grad_norm": 0.16023194017835296, "kl": 0.0628662109375, "learning_rate": 2.0544041450777202e-07, "loss": -0.0, "reward": 2.4999982118606567, "reward_std": 1.4650185846676322e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.950777202072539, "grad_norm": 3.879061204044804, "kl": 0.149169921875, "learning_rate": 2.0518134715025907e-07, "loss": 0.0014, "reward": 1.9994211196899414, "reward_std": 4.467247589445833e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4994210004806519, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.953367875647668, "grad_norm": 18.56001308259635, "kl": 0.142578125, "learning_rate": 2.049222797927461e-07, "loss": 0.0009, "reward": 2.4374929666519165, "reward_std": 0.17678898698068224, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374930262565613, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.955958549222798, "grad_norm": 0.05355617242731596, "kl": 0.0408935546875, "learning_rate": 2.0466321243523315e-07, "loss": 0.0, "reward": 2.4999974966049194, "reward_std": 1.1455659887360525e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.958549222797927, "grad_norm": 0.587690700379261, "kl": 0.070556640625, "learning_rate": 2.044041450777202e-07, "loss": -0.0008, "reward": 2.499994397163391, "reward_std": 6.273919552768348e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.961139896373057, "grad_norm": 0.7380677137303501, "kl": 0.14501953125, "learning_rate": 2.0414507772020725e-07, "loss": 0.0001, "reward": 2.4999955892562866, "reward_std": 2.104386055634677e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.963730569948186, "grad_norm": 0.2588980691836974, "kl": 0.112548828125, "learning_rate": 2.0388601036269428e-07, "loss": 0.0008, "reward": 2.4999825954437256, "reward_std": 5.611572589714342e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999982476234436, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 7.966321243523316, "grad_norm": 2.572605515516186, "kl": 0.158935546875, "learning_rate": 2.0362694300518136e-07, "loss": 0.0011, "reward": 1.9984492659568787, "reward_std": 4.848407400004362e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984492659568787, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 7.9689119170984455, "grad_norm": 2.665702570226113, "kl": 0.0643310546875, "learning_rate": 2.0336787564766839e-07, "loss": 0.0007, "reward": 2.49998140335083, "reward_std": 7.662841227329409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999815821647644, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.971502590673575, "grad_norm": 18.970656294062735, "kl": 0.203369140625, "learning_rate": 2.031088082901554e-07, "loss": 0.0011, "reward": 1.9487608671188354, "reward_std": 0.0007219227768473502, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4487608969211578, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 7.974093264248705, "grad_norm": 0.29214332981303415, "kl": 0.099365234375, "learning_rate": 2.028497409326425e-07, "loss": 0.001, "reward": 2.4999948740005493, "reward_std": 2.9205231157902745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.976683937823834, "grad_norm": 0.06903466222955096, "kl": 0.07177734375, "learning_rate": 2.0259067357512952e-07, "loss": 0.0006, "reward": 2.4999988079071045, "reward_std": 9.816939154916327e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.979274611398964, "grad_norm": 8.612252967738119, "kl": 0.0958251953125, "learning_rate": 2.0233160621761657e-07, "loss": 0.0005, "reward": 1.7707089185714722, "reward_std": 0.0003922329296983662, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.270709067583084, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.981865284974093, "grad_norm": 0.06965903585808235, "kl": 0.09033203125, "learning_rate": 2.0207253886010362e-07, "loss": 0.0005, "reward": 2.499998092651367, "reward_std": 1.221670515860751e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 7.984455958549223, "grad_norm": 61.8232118057449, "kl": 0.123382568359375, "learning_rate": 2.0181347150259068e-07, "loss": -0.0004, "reward": 2.1868947744369507, "reward_std": 0.2592725643578433, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6868947744369507, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.987046632124352, "grad_norm": 2.1478023117879985, "kl": 0.189697265625, "learning_rate": 2.015544041450777e-07, "loss": 0.0016, "reward": 2.4999862909317017, "reward_std": 1.615830603896029e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999861121177673, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.989637305699482, "grad_norm": 0.20232547123761707, "kl": 0.061279296875, "learning_rate": 2.0129533678756475e-07, "loss": -0.0001, "reward": 2.499997615814209, "reward_std": 2.117779388299823e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.992227979274611, "grad_norm": 4.503614777407574, "kl": 0.093017578125, "learning_rate": 2.010362694300518e-07, "loss": 0.0004, "reward": 1.9998580813407898, "reward_std": 4.70124298317387e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998580813407898, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 7.994818652849741, "grad_norm": 0.20226327912536518, "kl": 0.0780029296875, "learning_rate": 2.0077720207253883e-07, "loss": -0.0006, "reward": 2.4999945163726807, "reward_std": 2.061266286546015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999944567680359, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 7.9974093264248705, "grad_norm": 0.17327887352374288, "kl": 0.093017578125, "learning_rate": 2.005181347150259e-07, "loss": -0.0009, "reward": 2.4999959468841553, "reward_std": 3.4762794598464097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.0, "grad_norm": 95.68303039745064, "kl": 0.083984375, "learning_rate": 2.0025906735751294e-07, "loss": 0.0004, "reward": 2.3749685287475586, "reward_std": 0.2673270872874127, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8749684691429138, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.00259067357513, "grad_norm": 0.27654166941512687, "kl": 0.1065673828125, "learning_rate": 2e-07, "loss": 0.0019, "reward": 2.4999955892562866, "reward_std": 1.8276711273301771e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.005181347150259, "grad_norm": 0.8058061970490968, "kl": 0.16650390625, "learning_rate": 1.9974093264248704e-07, "loss": 0.001, "reward": 1.9984174370765686, "reward_std": 3.3430058010708308e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498417317867279, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.007772020725389, "grad_norm": 4.038848217616547, "kl": 0.0692138671875, "learning_rate": 1.9948186528497407e-07, "loss": 0.0013, "reward": 2.4999942779541016, "reward_std": 8.745244258534512e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943375587463, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.010362694300518, "grad_norm": 1.3382015064909978, "kl": 0.04241943359375, "learning_rate": 1.9922279792746112e-07, "loss": 0.0002, "reward": 2.4999924898147583, "reward_std": 6.398024424925097e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924898147583, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.012953367875648, "grad_norm": 0.08791842291493944, "kl": 0.087158203125, "learning_rate": 1.9896373056994818e-07, "loss": 0.0004, "reward": 2.499995708465576, "reward_std": 2.1252000124150072e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.015544041450777, "grad_norm": 4.562210568370996, "kl": 0.19482421875, "learning_rate": 1.9870466321243523e-07, "loss": 0.0007, "reward": 1.9321624040603638, "reward_std": 0.17684750015177997, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4321625232696533, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.018134715025907, "grad_norm": 0.040378968450249915, "kl": 0.080078125, "learning_rate": 1.9844559585492225e-07, "loss": 0.0011, "reward": 2.499996304512024, "reward_std": 1.162184503300523e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.020725388601036, "grad_norm": 0.13822927361758697, "kl": 0.1259765625, "learning_rate": 1.9818652849740933e-07, "loss": 0.0001, "reward": 2.4999974966049194, "reward_std": 1.4583371807930234e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.023316062176166, "grad_norm": 0.0967403499014125, "kl": 0.09765625, "learning_rate": 1.9792746113989636e-07, "loss": 0.0005, "reward": 2.4999970197677612, "reward_std": 1.7078815801596647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.025906735751295, "grad_norm": 0.3835122558516601, "kl": 0.03228759765625, "learning_rate": 1.9766839378238339e-07, "loss": 0.0004, "reward": 2.4999773502349854, "reward_std": 5.233016963757109e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999772906303406, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.028497409326425, "grad_norm": 0.09802395547562452, "kl": 0.109130859375, "learning_rate": 1.9740932642487046e-07, "loss": 0.0001, "reward": 2.4999983310699463, "reward_std": 7.796735701504076e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.031088082901555, "grad_norm": 0.6203831105531014, "kl": 0.081787109375, "learning_rate": 1.971502590673575e-07, "loss": 0.0013, "reward": 2.4999953508377075, "reward_std": 4.10243467285909e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.033678756476684, "grad_norm": 0.5969753783399765, "kl": 0.114013671875, "learning_rate": 1.9689119170984454e-07, "loss": -0.0005, "reward": 2.4999924898147583, "reward_std": 7.148053214223182e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926090240479, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.036269430051814, "grad_norm": 0.1087624186499036, "kl": 0.052490234375, "learning_rate": 1.966321243523316e-07, "loss": 0.0002, "reward": 2.499998092651367, "reward_std": 1.8846645275516494e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.038860103626943, "grad_norm": 0.15494918170888425, "kl": 0.102783203125, "learning_rate": 1.9637305699481865e-07, "loss": 0.0011, "reward": 2.499995708465576, "reward_std": 1.6875885648914846e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.041450777202073, "grad_norm": 7.457069789882459, "kl": 0.12451171875, "learning_rate": 1.9611398963730568e-07, "loss": -0.0007, "reward": 1.89161217212677, "reward_std": 0.0005261765377895244, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3916123509407043, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.044041450777202, "grad_norm": 0.19427065900490728, "kl": 0.0604248046875, "learning_rate": 1.9585492227979275e-07, "loss": -0.0003, "reward": 2.4999983310699463, "reward_std": 1.6826643332024105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.046632124352332, "grad_norm": 8.275001637240742, "kl": 0.064697265625, "learning_rate": 1.9559585492227978e-07, "loss": 0.0006, "reward": 2.4999817609786987, "reward_std": 1.6396952958075417e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999818205833435, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.049222797927461, "grad_norm": 0.5196029343535595, "kl": 0.100341796875, "learning_rate": 1.953367875647668e-07, "loss": -0.0, "reward": 2.499995708465576, "reward_std": 4.982454356650123e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.05181347150259, "grad_norm": 0.2783073834819255, "kl": 0.07763671875, "learning_rate": 1.9507772020725389e-07, "loss": 0.0003, "reward": 2.4999959468841553, "reward_std": 1.8774072714222712e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.05440414507772, "grad_norm": 0.25248659524932354, "kl": 0.094482421875, "learning_rate": 1.948186528497409e-07, "loss": 0.0008, "reward": 2.4999901056289673, "reward_std": 3.0526043701684102e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990165233612, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.05699481865285, "grad_norm": 2.246502080804671, "kl": 0.0938720703125, "learning_rate": 1.94559585492228e-07, "loss": 0.0001, "reward": 1.984150767326355, "reward_std": 0.0001390464283304027, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4841507077217102, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.05958549222798, "grad_norm": 0.18038625345606876, "kl": 0.07568359375, "learning_rate": 1.9430051813471502e-07, "loss": 0.0004, "reward": 2.499997138977051, "reward_std": 1.8937988102152303e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.062176165803109, "grad_norm": 0.3286993763777389, "kl": 0.0477294921875, "learning_rate": 1.9404145077720207e-07, "loss": 0.0002, "reward": 2.4999966621398926, "reward_std": 3.4049578516714973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967217445374, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.064766839378239, "grad_norm": 6.257669181126466, "kl": 0.137939453125, "learning_rate": 1.9378238341968912e-07, "loss": 0.0003, "reward": 1.955582857131958, "reward_std": 0.00019773694336322478, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.45558300614357, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.067357512953368, "grad_norm": 0.08663451372792426, "kl": 0.0616455078125, "learning_rate": 1.9352331606217615e-07, "loss": 0.0007, "reward": 2.4999966621398926, "reward_std": 1.774986714053739e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.069948186528498, "grad_norm": 0.2981492449917144, "kl": 0.16748046875, "learning_rate": 1.932642487046632e-07, "loss": 0.0004, "reward": 2.4999914169311523, "reward_std": 4.9929566330320085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 8.072538860103627, "grad_norm": 146.84593466136192, "kl": 0.1099853515625, "learning_rate": 1.9300518134715025e-07, "loss": 0.0013, "reward": 1.9553672671318054, "reward_std": 0.012412821104760496, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4553673267364502, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.075129533678757, "grad_norm": 0.22145177154849138, "kl": 0.124267578125, "learning_rate": 1.927461139896373e-07, "loss": 0.0008, "reward": 2.4999983310699463, "reward_std": 1.2121317922719754e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.077720207253886, "grad_norm": 0.4041656044683656, "kl": 0.102783203125, "learning_rate": 1.9248704663212433e-07, "loss": -0.0003, "reward": 2.499995708465576, "reward_std": 2.398298306616198e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.080310880829016, "grad_norm": 3.1182064659979662, "kl": 0.083740234375, "learning_rate": 1.922279792746114e-07, "loss": 0.0003, "reward": 2.4999865293502808, "reward_std": 6.549222916873987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999866485595703, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.082901554404145, "grad_norm": 14.633934730479668, "kl": 0.11962890625, "learning_rate": 1.9196891191709844e-07, "loss": -0.0003, "reward": 1.9491404294967651, "reward_std": 0.0006368438778565633, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4491405487060547, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.085492227979275, "grad_norm": 15.298543720282403, "kl": 0.137939453125, "learning_rate": 1.9170984455958546e-07, "loss": -0.0003, "reward": 2.4374752044677734, "reward_std": 0.1767812859033029, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937475323677063, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.088082901554404, "grad_norm": 2.9459160388900956, "kl": 0.4010009765625, "learning_rate": 1.9145077720207254e-07, "loss": 0.0016, "reward": 1.9927970170974731, "reward_std": 8.757973955653142e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4927969872951508, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 33.875, "epoch": 8.090673575129534, "grad_norm": 0.10951777395781458, "kl": 0.0723876953125, "learning_rate": 1.9119170984455957e-07, "loss": 0.0004, "reward": 2.4999977350234985, "reward_std": 9.914180054693134e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.093264248704664, "grad_norm": 0.5977959768713302, "kl": 0.094970703125, "learning_rate": 1.9093264248704662e-07, "loss": 0.0007, "reward": 2.499989867210388, "reward_std": 3.2869019719328207e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 3124 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.095854922279793, "grad_norm": 0.33441473571998426, "kl": 0.08740234375, "learning_rate": 1.9067357512953368e-07, "loss": 0.0011, "reward": 2.499997854232788, "reward_std": 1.7746012304087344e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3125 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.098445595854923, "grad_norm": 0.2208797906705792, "kl": 0.0545654296875, "learning_rate": 1.9041450777202073e-07, "loss": -0.0012, "reward": 2.4999959468841553, "reward_std": 2.518478424917703e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3126 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.101036269430052, "grad_norm": 0.12088424964711521, "kl": 0.131591796875, "learning_rate": 1.9015544041450775e-07, "loss": 0.0014, "reward": 2.4999979734420776, "reward_std": 2.1100069034218905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3127 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.103626943005182, "grad_norm": 0.14451249603973737, "kl": 0.1025390625, "learning_rate": 1.898963730569948e-07, "loss": -0.0003, "reward": 2.4999948740005493, "reward_std": 2.015277232203516e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3128 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.106217616580311, "grad_norm": 0.3020402215721515, "kl": 0.15673828125, "learning_rate": 1.8963730569948186e-07, "loss": 0.0013, "reward": 2.499970555305481, "reward_std": 4.650092932934058e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 3129 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.10880829015544, "grad_norm": 1.3645610531940051, "kl": 0.292236328125, "learning_rate": 1.8937823834196889e-07, "loss": 0.0011, "reward": 2.499994397163391, "reward_std": 5.65704476684914e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999943971633911, "step": 3130 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.11139896373057, "grad_norm": 1.1187198055290284, "kl": 0.0947265625, "learning_rate": 1.8911917098445597e-07, "loss": 0.0003, "reward": 2.4998831748962402, "reward_std": 1.1185610333086515e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998833537101746, "step": 3131 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.1139896373057, "grad_norm": 11.053215892568899, "kl": 0.0592041015625, "learning_rate": 1.88860103626943e-07, "loss": 0.0003, "reward": 1.9865660667419434, "reward_std": 0.00017148313304460316, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.486566036939621, "step": 3132 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.11658031088083, "grad_norm": 1.0770346841812293, "kl": 0.126220703125, "learning_rate": 1.8860103626943004e-07, "loss": 0.0007, "reward": 2.4999959468841553, "reward_std": 3.7938548871352396e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3133 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.119170984455959, "grad_norm": 0.06260182223385052, "kl": 0.063232421875, "learning_rate": 1.883419689119171e-07, "loss": -0.0006, "reward": 2.4999985694885254, "reward_std": 1.2781838734099438e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3134 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.121761658031089, "grad_norm": 0.9148089307791176, "kl": 0.07958984375, "learning_rate": 1.8808290155440415e-07, "loss": 0.0012, "reward": 2.4999934434890747, "reward_std": 3.838800864741643e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999934434890747, "step": 3135 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.124352331606218, "grad_norm": 0.06794014032604263, "kl": 0.15087890625, "learning_rate": 1.8782383419689118e-07, "loss": 0.0008, "reward": 2.4999977350234985, "reward_std": 1.860860294300437e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3136 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.126943005181348, "grad_norm": 1.715661609603444, "kl": 0.15966796875, "learning_rate": 1.8756476683937823e-07, "loss": 0.002, "reward": 2.499993681907654, "reward_std": 7.149828434194205e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3137 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.129533678756477, "grad_norm": 0.29388496699105326, "kl": 0.1199951171875, "learning_rate": 1.8730569948186528e-07, "loss": 0.0009, "reward": 2.4999974966049194, "reward_std": 1.6710372392481077e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3138 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.132124352331607, "grad_norm": 8.956326765142041, "kl": 0.1494140625, "learning_rate": 1.870466321243523e-07, "loss": 0.0013, "reward": 1.893113374710083, "reward_std": 0.0005703785835180497, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.393113374710083, "step": 3139 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.134715025906736, "grad_norm": 1.8511737222706086, "kl": 0.0673828125, "learning_rate": 1.8678756476683939e-07, "loss": 0.0012, "reward": 2.499923348426819, "reward_std": 1.6954223610810004e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999232292175293, "step": 3140 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.137305699481866, "grad_norm": 0.5563064592937442, "kl": 0.09765625, "learning_rate": 1.865284974093264e-07, "loss": 0.0005, "reward": 2.4999970197677612, "reward_std": 3.406056521271239e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3141 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.139896373056995, "grad_norm": 0.16434538043521746, "kl": 0.06640625, "learning_rate": 1.8626943005181347e-07, "loss": 0.0003, "reward": 2.499997615814209, "reward_std": 1.5512517848037533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3142 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.142487046632125, "grad_norm": 0.45062434986212524, "kl": 0.1502685546875, "learning_rate": 1.8601036269430052e-07, "loss": 0.0006, "reward": 2.499995470046997, "reward_std": 3.1197882321976067e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3143 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.145077720207254, "grad_norm": 2.94622335806529, "kl": 0.08447265625, "learning_rate": 1.8575129533678754e-07, "loss": -0.001, "reward": 2.499998092651367, "reward_std": 1.9509256503624783e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3144 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.147668393782384, "grad_norm": 0.7310549388314892, "kl": 0.08251953125, "learning_rate": 1.854922279792746e-07, "loss": 0.0006, "reward": 2.4999958276748657, "reward_std": 3.7773199892399134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 3145 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.150259067357513, "grad_norm": 5.416096981349559, "kl": 0.1904296875, "learning_rate": 1.8523316062176165e-07, "loss": -0.0004, "reward": 1.9998480081558228, "reward_std": 2.187202250070186e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998481273651123, "step": 3146 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.152849740932643, "grad_norm": 0.22841564104835865, "kl": 0.15966796875, "learning_rate": 1.849740932642487e-07, "loss": 0.0012, "reward": 2.499998688697815, "reward_std": 1.6061570136116643e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3147 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.155440414507773, "grad_norm": 0.026950303971103606, "kl": 0.02691650390625, "learning_rate": 1.8471502590673573e-07, "loss": -0.0006, "reward": 2.499998927116394, "reward_std": 8.433520122252958e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3148 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.158031088082902, "grad_norm": 24.674175850704486, "kl": 0.0599365234375, "learning_rate": 1.844559585492228e-07, "loss": 0.0005, "reward": 2.4374873638153076, "reward_std": 0.1767964861218161, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.937487244606018, "step": 3149 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.160621761658032, "grad_norm": 0.11261020228082523, "kl": 0.056884765625, "learning_rate": 1.8419689119170983e-07, "loss": 0.0005, "reward": 2.4999985694885254, "reward_std": 1.6432234133390011e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3150 }, { "clip_ratio": 0.0, "completion_length": 33.5625, "epoch": 8.163212435233161, "grad_norm": 0.47404691759261275, "kl": 0.1177978515625, "learning_rate": 1.8393782383419686e-07, "loss": 0.0006, "reward": 2.4999961853027344, "reward_std": 2.001675397877989e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3151 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 8.16580310880829, "grad_norm": 0.6225218120001401, "kl": 0.1572265625, "learning_rate": 1.8367875647668394e-07, "loss": 0.0015, "reward": 2.4999966621398926, "reward_std": 3.235610051888216e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3152 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.16839378238342, "grad_norm": 14.124217296588778, "kl": 0.0859375, "learning_rate": 1.8341968911917097e-07, "loss": -0.0003, "reward": 2.4998735189437866, "reward_std": 5.106800938392553e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998736381530762, "step": 3153 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.17098445595855, "grad_norm": 2.6514014136130823, "kl": 0.07763671875, "learning_rate": 1.8316062176165802e-07, "loss": 0.0012, "reward": 2.499994158744812, "reward_std": 3.814357114606537e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3154 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.17357512953368, "grad_norm": 5.63601193786011, "kl": 0.157958984375, "learning_rate": 1.8290155440414507e-07, "loss": 0.0005, "reward": 2.49994158744812, "reward_std": 1.3565708542273569e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999415278434753, "step": 3155 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.176165803108809, "grad_norm": 0.40740725090935975, "kl": 0.26708984375, "learning_rate": 1.8264248704663212e-07, "loss": 0.0009, "reward": 2.4999970197677612, "reward_std": 2.193285922658106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3156 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.178756476683938, "grad_norm": 1.606842154843521, "kl": 0.0782470703125, "learning_rate": 1.8238341968911915e-07, "loss": 0.0002, "reward": 2.4998984336853027, "reward_std": 1.4305656236501818e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998984932899475, "step": 3157 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.181347150259068, "grad_norm": 0.028530366933778497, "kl": 0.103240966796875, "learning_rate": 1.821243523316062e-07, "loss": 0.0001, "reward": 2.4999992847442627, "reward_std": 5.331021810661696e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999463558197, "step": 3158 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.183937823834198, "grad_norm": 0.6194798767092272, "kl": 0.0675048828125, "learning_rate": 1.8186528497409325e-07, "loss": -0.0002, "reward": 2.499990940093994, "reward_std": 4.0072045806027745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 3159 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.186528497409327, "grad_norm": 0.08955910761199547, "kl": 0.05010986328125, "learning_rate": 1.8160621761658028e-07, "loss": 0.0003, "reward": 2.4999974966049194, "reward_std": 1.7598734416424122e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3160 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.189119170984457, "grad_norm": 19.591692519743233, "kl": 0.2783203125, "learning_rate": 1.8134715025906736e-07, "loss": 0.001, "reward": 1.895760178565979, "reward_std": 0.1790335430414416, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3957601189613342, "step": 3161 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.191709844559586, "grad_norm": 9.84818316453554, "kl": 0.093505859375, "learning_rate": 1.8108808290155439e-07, "loss": -0.0004, "reward": 2.437490224838257, "reward_std": 0.17679470868677072, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374903440475464, "step": 3162 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.194300518134716, "grad_norm": 5.757630431670943, "kl": 0.15234375, "learning_rate": 1.8082901554404144e-07, "loss": 0.0002, "reward": 1.99784916639328, "reward_std": 5.932166192224031e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4978492856025696, "step": 3163 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.196891191709845, "grad_norm": 0.7914224891039795, "kl": 0.031280517578125, "learning_rate": 1.805699481865285e-07, "loss": 0.0011, "reward": 2.499995470046997, "reward_std": 3.156705020046502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3164 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.199481865284975, "grad_norm": 1.2763777362126756, "kl": 0.132568359375, "learning_rate": 1.8031088082901554e-07, "loss": 0.0004, "reward": 2.4999934434890747, "reward_std": 7.576135601539136e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 3165 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.202072538860104, "grad_norm": 5.328442605531321, "kl": 0.071533203125, "learning_rate": 1.8005181347150257e-07, "loss": -0.0008, "reward": 2.4999911785125732, "reward_std": 1.9563993191695772e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3166 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.204663212435234, "grad_norm": 0.10827715775895738, "kl": 0.138427734375, "learning_rate": 1.7979274611398962e-07, "loss": 0.0004, "reward": 2.4999985694885254, "reward_std": 1.2430288052200922e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3167 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.207253886010363, "grad_norm": 112.12876758785984, "kl": 0.13623046875, "learning_rate": 1.7953367875647668e-07, "loss": 0.0007, "reward": 1.9093554615974426, "reward_std": 0.0021399448464762827, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4093554019927979, "step": 3168 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.209844559585493, "grad_norm": 3.164246502147246, "kl": 0.127685546875, "learning_rate": 1.792746113989637e-07, "loss": -0.0001, "reward": 2.4999929666519165, "reward_std": 8.70200028657564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 3169 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.212435233160623, "grad_norm": 0.08608571929338761, "kl": 0.08941650390625, "learning_rate": 1.7901554404145078e-07, "loss": -0.0007, "reward": 2.499998450279236, "reward_std": 1.0259028613290866e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3170 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.215025906735752, "grad_norm": 0.9030541234213424, "kl": 0.148193359375, "learning_rate": 1.787564766839378e-07, "loss": 0.0013, "reward": 2.4999970197677612, "reward_std": 3.1579944561599405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3171 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.217616580310882, "grad_norm": 4.401430294703537, "kl": 0.232421875, "learning_rate": 1.784974093264249e-07, "loss": 0.0015, "reward": 1.8292223811149597, "reward_std": 0.0004880217588834057, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3292223811149597, "step": 3172 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 8.220207253886011, "grad_norm": 78.62242857220241, "kl": 0.103515625, "learning_rate": 1.782383419689119e-07, "loss": 0.0009, "reward": 2.418749451637268, "reward_std": 0.22980716611471053, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9187493920326233, "step": 3173 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.22279792746114, "grad_norm": 0.1921341405034419, "kl": 0.13525390625, "learning_rate": 1.7797927461139894e-07, "loss": -0.0005, "reward": 2.499997138977051, "reward_std": 2.932723759840883e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3174 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.22538860103627, "grad_norm": 4.5646627228989765, "kl": 0.14697265625, "learning_rate": 1.7772020725388602e-07, "loss": 0.0012, "reward": 1.8232365846633911, "reward_std": 0.00029939485773411434, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3232364058494568, "step": 3175 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.2279792746114, "grad_norm": 0.3390435865079533, "kl": 0.156494140625, "learning_rate": 1.7746113989637304e-07, "loss": 0.0012, "reward": 2.499997615814209, "reward_std": 1.4843980693513004e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3176 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.23056994818653, "grad_norm": 0.4156584054894673, "kl": 0.03814697265625, "learning_rate": 1.772020725388601e-07, "loss": 0.0009, "reward": 2.4999972581863403, "reward_std": 2.199124139679043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3177 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.233160621761659, "grad_norm": 2.75713145280005, "kl": 0.08990478515625, "learning_rate": 1.7694300518134715e-07, "loss": 0.0005, "reward": 1.9898239374160767, "reward_std": 0.00011030767427655519, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4898239374160767, "step": 3178 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.235751295336788, "grad_norm": 0.5804819653866367, "kl": 0.0433349609375, "learning_rate": 1.766839378238342e-07, "loss": -0.0004, "reward": 2.4999924898147583, "reward_std": 2.4749996825335074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999924898147583, "step": 3179 }, { "clip_ratio": 0.0, "completion_length": 35.375, "epoch": 8.238341968911918, "grad_norm": 61.26426281086702, "kl": 0.124755859375, "learning_rate": 1.7642487046632123e-07, "loss": -0.0002, "reward": 1.9530822038650513, "reward_std": 0.3375635552838503, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4530822932720184, "step": 3180 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.240932642487047, "grad_norm": 0.706148499646634, "kl": 0.0645751953125, "learning_rate": 1.7616580310880828e-07, "loss": 0.0004, "reward": 2.499987006187439, "reward_std": 3.5811082170766895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 3181 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.243523316062177, "grad_norm": 0.4031269658075301, "kl": 0.0467529296875, "learning_rate": 1.7590673575129533e-07, "loss": 0.0011, "reward": 2.499997615814209, "reward_std": 1.925787387335731e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3182 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.246113989637305, "grad_norm": 0.10684306024162532, "kl": 0.125, "learning_rate": 1.7564766839378236e-07, "loss": 0.0005, "reward": 2.499998450279236, "reward_std": 1.6925793318023352e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3183 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.248704663212436, "grad_norm": 0.1248106103613605, "kl": 0.0657958984375, "learning_rate": 1.7538860103626944e-07, "loss": -0.0005, "reward": 2.4999966621398926, "reward_std": 2.004274335831724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3184 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.251295336787564, "grad_norm": 0.17905178420023277, "kl": 0.140625, "learning_rate": 1.7512953367875647e-07, "loss": 0.0007, "reward": 2.49999737739563, "reward_std": 2.679004865058232e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3185 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.253886010362695, "grad_norm": 0.0770172521660824, "kl": 0.0577392578125, "learning_rate": 1.7487046632124352e-07, "loss": 0.0004, "reward": 2.4999990463256836, "reward_std": 7.139431517089179e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3186 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.256476683937823, "grad_norm": 0.27198644668703004, "kl": 0.117919921875, "learning_rate": 1.7461139896373057e-07, "loss": 0.0019, "reward": 2.499997854232788, "reward_std": 3.172386755068146e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3187 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.259067357512953, "grad_norm": 0.6302493528208652, "kl": 0.09375, "learning_rate": 1.743523316062176e-07, "loss": -0.0001, "reward": 1.9999113082885742, "reward_std": 1.0783291259031103e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999111890792847, "step": 3188 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.261658031088082, "grad_norm": 0.15868275659507172, "kl": 0.093994140625, "learning_rate": 1.7409326424870465e-07, "loss": 0.0015, "reward": 2.499998450279236, "reward_std": 1.6560301787649223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3189 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.264248704663212, "grad_norm": 0.5528136867642289, "kl": 0.26123046875, "learning_rate": 1.738341968911917e-07, "loss": 0.0022, "reward": 2.499996066093445, "reward_std": 3.767788598452171e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3190 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.266839378238341, "grad_norm": 0.12798082023375656, "kl": 0.06103515625, "learning_rate": 1.7357512953367876e-07, "loss": 0.0002, "reward": 2.499996781349182, "reward_std": 2.8758530561390216e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3191 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.26943005181347, "grad_norm": 41.452763438953845, "kl": 0.081787109375, "learning_rate": 1.7331606217616578e-07, "loss": 0.0003, "reward": 1.9954372644424438, "reward_std": 0.00017896358451707783, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4954373240470886, "step": 3192 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.2720207253886, "grad_norm": 0.6773730054031986, "kl": 0.09912109375, "learning_rate": 1.7305699481865286e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 3.818687446255353e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3193 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.27461139896373, "grad_norm": 0.3930135528304384, "kl": 0.161865234375, "learning_rate": 1.727979274611399e-07, "loss": 0.0015, "reward": 2.499970555305481, "reward_std": 5.137363700669084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999704360961914, "step": 3194 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.27720207253886, "grad_norm": 0.09960299696722485, "kl": 0.076171875, "learning_rate": 1.7253886010362694e-07, "loss": 0.0002, "reward": 2.499995708465576, "reward_std": 1.4667808159174456e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3195 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.279792746113989, "grad_norm": 0.1589247901078338, "kl": 0.0526123046875, "learning_rate": 1.72279792746114e-07, "loss": -0.0004, "reward": 2.4999983310699463, "reward_std": 2.04998809749668e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3196 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.282383419689118, "grad_norm": 55.14298222425625, "kl": 0.22607421875, "learning_rate": 1.7202072538860102e-07, "loss": 0.0012, "reward": 1.814697265625, "reward_std": 0.0668470896450799, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3146972060203552, "step": 3197 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.284974093264248, "grad_norm": 0.3711200936635606, "kl": 0.0582275390625, "learning_rate": 1.7176165803108807e-07, "loss": -0.0003, "reward": 2.4999959468841553, "reward_std": 3.7261788747855462e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3198 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.287564766839377, "grad_norm": 0.17464923663992982, "kl": 0.10992431640625, "learning_rate": 1.7150259067357512e-07, "loss": 0.0015, "reward": 2.4999969005584717, "reward_std": 2.3411973870679503e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3199 }, { "clip_ratio": 0.0, "completion_length": 33.875, "epoch": 8.290155440414507, "grad_norm": 5.1900507839793235, "kl": 0.111328125, "learning_rate": 1.7124352331606218e-07, "loss": 0.0005, "reward": 1.9924770593643188, "reward_std": 0.00012739573179487707, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4924769699573517, "step": 3200 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.292746113989637, "grad_norm": 1.196564466134288, "kl": 0.079345703125, "learning_rate": 1.709844559585492e-07, "loss": 0.0007, "reward": 2.499959707260132, "reward_std": 1.264332422579173e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999597668647766, "step": 3201 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.295336787564766, "grad_norm": 0.5509604968231264, "kl": 0.058837890625, "learning_rate": 1.7072538860103628e-07, "loss": -0.0002, "reward": 2.4999970197677612, "reward_std": 3.7912963648523146e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3202 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.297927461139896, "grad_norm": 0.18974085068456928, "kl": 0.09326171875, "learning_rate": 1.704663212435233e-07, "loss": 0.0002, "reward": 2.499997854232788, "reward_std": 1.5254385630214529e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3203 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.300518134715025, "grad_norm": 2.6871452717421347, "kl": 0.0576171875, "learning_rate": 1.7020725388601033e-07, "loss": 0.0004, "reward": 2.4999730587005615, "reward_std": 2.546031603856136e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999972939491272, "step": 3204 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.303108808290155, "grad_norm": 0.46416535826530864, "kl": 0.114013671875, "learning_rate": 1.6994818652849741e-07, "loss": 0.0007, "reward": 2.4999958276748657, "reward_std": 4.411420036376512e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3205 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.305699481865284, "grad_norm": 5.507740085804502, "kl": 0.068115234375, "learning_rate": 1.6968911917098444e-07, "loss": 0.0003, "reward": 2.4999477863311768, "reward_std": 1.1819582141470164e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999478459358215, "step": 3206 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.308290155440414, "grad_norm": 1.9837395328087248, "kl": 0.0574951171875, "learning_rate": 1.694300518134715e-07, "loss": 0.0011, "reward": 1.999694585800171, "reward_std": 1.8257314195579966e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4996943473815918, "step": 3207 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.310880829015543, "grad_norm": 0.04771270350003987, "kl": 0.07830810546875, "learning_rate": 1.6917098445595854e-07, "loss": 0.0011, "reward": 2.499998927116394, "reward_std": 7.389864435936033e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3208 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.313471502590673, "grad_norm": 0.25175274540640064, "kl": 0.0579833984375, "learning_rate": 1.689119170984456e-07, "loss": 0.0004, "reward": 2.499994397163391, "reward_std": 2.7393241737172502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 3209 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.316062176165802, "grad_norm": 0.2873621850839714, "kl": 0.0936279296875, "learning_rate": 1.6865284974093262e-07, "loss": 0.0005, "reward": 2.4999983310699463, "reward_std": 1.3285900308801502e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3210 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.318652849740932, "grad_norm": 0.4044842859906627, "kl": 0.22021484375, "learning_rate": 1.6839378238341968e-07, "loss": 0.0003, "reward": 2.4999961853027344, "reward_std": 5.079967422716436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3211 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.321243523316062, "grad_norm": 0.08779252572221974, "kl": 0.10302734375, "learning_rate": 1.6813471502590673e-07, "loss": 0.0004, "reward": 2.4999970197677612, "reward_std": 1.6046812447712e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3212 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.323834196891191, "grad_norm": 2.3270036307642306, "kl": 0.15380859375, "learning_rate": 1.6787564766839376e-07, "loss": -0.0002, "reward": 2.4999914169311523, "reward_std": 5.598668735729007e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914765357971, "step": 3213 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.32642487046632, "grad_norm": 0.38672817708323914, "kl": 0.10302734375, "learning_rate": 1.6761658031088083e-07, "loss": 0.0004, "reward": 2.4999840259552, "reward_std": 4.8193414841080084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999840259552002, "step": 3214 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.32901554404145, "grad_norm": 3.6283125586118645, "kl": 0.089111328125, "learning_rate": 1.6735751295336786e-07, "loss": -0.0005, "reward": 2.4999771118164062, "reward_std": 1.2718164953184896e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999772906303406, "step": 3215 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.33160621761658, "grad_norm": 3.5250134204386807, "kl": 0.0859375, "learning_rate": 1.6709844559585491e-07, "loss": -0.0001, "reward": 1.9801177978515625, "reward_std": 0.00020493308011282352, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4801177680492401, "step": 3216 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.33419689119171, "grad_norm": 0.38752575444168663, "kl": 0.060546875, "learning_rate": 1.6683937823834197e-07, "loss": -0.0002, "reward": 2.4999988079071045, "reward_std": 1.037070063603096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3217 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.336787564766839, "grad_norm": 0.7300713622872141, "kl": 0.07666015625, "learning_rate": 1.66580310880829e-07, "loss": 0.0, "reward": 2.4999929666519165, "reward_std": 4.953808229402057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 3218 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.339378238341968, "grad_norm": 10.606544763115886, "kl": 0.2266845703125, "learning_rate": 1.6632124352331605e-07, "loss": 0.0018, "reward": 1.9905086755752563, "reward_std": 0.00011018002430773777, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4905085563659668, "step": 3219 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.341968911917098, "grad_norm": 0.5209999027051346, "kl": 0.120849609375, "learning_rate": 1.660621761658031e-07, "loss": 0.0008, "reward": 2.4999935626983643, "reward_std": 3.124012891930761e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 3220 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.344559585492227, "grad_norm": 0.1573626252593982, "kl": 0.044921875, "learning_rate": 1.6580310880829015e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.0584985236382636e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3221 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.347150259067357, "grad_norm": 5.004434905306625, "kl": 0.1611328125, "learning_rate": 1.6554404145077718e-07, "loss": -0.0003, "reward": 1.9988044500350952, "reward_std": 6.160921429909649e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988045692443848, "step": 3222 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.349740932642487, "grad_norm": 0.49659734021737595, "kl": 0.031829833984375, "learning_rate": 1.6528497409326426e-07, "loss": -0.0002, "reward": 2.499996781349182, "reward_std": 2.486671633050719e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3223 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.352331606217616, "grad_norm": 0.5465098105777704, "kl": 0.15576171875, "learning_rate": 1.6502590673575128e-07, "loss": -0.0011, "reward": 2.4999961853027344, "reward_std": 2.6978894993590075e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3224 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.354922279792746, "grad_norm": 4.365680888853067, "kl": 0.08984375, "learning_rate": 1.6476683937823836e-07, "loss": 0.0005, "reward": 2.499997615814209, "reward_std": 2.0819348378608993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3225 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.357512953367875, "grad_norm": 0.9122588700547437, "kl": 0.079833984375, "learning_rate": 1.645077720207254e-07, "loss": -0.0001, "reward": 1.9988069534301758, "reward_std": 2.5916017477811693e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498807042837143, "step": 3226 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.360103626943005, "grad_norm": 2.840972529513588, "kl": 0.13037109375, "learning_rate": 1.6424870466321241e-07, "loss": 0.0011, "reward": 1.8726292848587036, "reward_std": 0.0002964056578775853, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3726292252540588, "step": 3227 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.362694300518134, "grad_norm": 0.1604427777396762, "kl": 0.0936279296875, "learning_rate": 1.639896373056995e-07, "loss": -0.0005, "reward": 2.4999935626983643, "reward_std": 2.405010377515282e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3228 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.365284974093264, "grad_norm": 0.08633581676479131, "kl": 0.089111328125, "learning_rate": 1.6373056994818652e-07, "loss": -0.0001, "reward": 2.499998450279236, "reward_std": 1.1789842062626121e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3229 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 8.367875647668393, "grad_norm": 1.4393329729158233, "kl": 0.568359375, "learning_rate": 1.6347150259067357e-07, "loss": 0.0021, "reward": 2.4999966621398926, "reward_std": 4.821636252927419e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3230 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.370466321243523, "grad_norm": 0.41139015268834717, "kl": 0.0634765625, "learning_rate": 1.6321243523316062e-07, "loss": 0.0005, "reward": 2.4999823570251465, "reward_std": 6.151422212496982e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999822974205017, "step": 3231 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.373056994818652, "grad_norm": 0.025479270067198755, "kl": 0.0599365234375, "learning_rate": 1.6295336787564768e-07, "loss": 0.0001, "reward": 2.499999165534973, "reward_std": 7.754772184398462e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 3232 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.375647668393782, "grad_norm": 1.4660702928844889, "kl": 0.0849609375, "learning_rate": 1.626943005181347e-07, "loss": 0.0001, "reward": 1.9989567995071411, "reward_std": 4.1266665164130245e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989569187164307, "step": 3233 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.378238341968911, "grad_norm": 3.3307177770463445, "kl": 0.169921875, "learning_rate": 1.6243523316062176e-07, "loss": 0.0002, "reward": 1.998131275177002, "reward_std": 7.804911570019613e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4981312453746796, "step": 3234 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.380829015544041, "grad_norm": 0.08126447326729487, "kl": 0.061767578125, "learning_rate": 1.621761658031088e-07, "loss": 0.0003, "reward": 2.4999961853027344, "reward_std": 2.5046920768545533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3235 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.38341968911917, "grad_norm": 0.05105797867997188, "kl": 0.08917236328125, "learning_rate": 1.6191709844559583e-07, "loss": -0.0001, "reward": 2.4999983310699463, "reward_std": 8.156730189057271e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3236 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.3860103626943, "grad_norm": 0.33430201719343583, "kl": 0.0751953125, "learning_rate": 1.6165803108808291e-07, "loss": 0.0002, "reward": 2.499993920326233, "reward_std": 4.37150174548151e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 3237 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.38860103626943, "grad_norm": 10.517502015421686, "kl": 0.127197265625, "learning_rate": 1.6139896373056994e-07, "loss": 0.0004, "reward": 1.9886282682418823, "reward_std": 0.0002502906878021349, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.48862823843956, "step": 3238 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.39119170984456, "grad_norm": 4.753382783369989, "kl": 0.104736328125, "learning_rate": 1.61139896373057e-07, "loss": 0.0009, "reward": 2.4996172189712524, "reward_std": 3.332807591505116e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996171593666077, "step": 3239 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.393782383419689, "grad_norm": 0.1077857479588344, "kl": 0.091552734375, "learning_rate": 1.6088082901554405e-07, "loss": -0.0011, "reward": 2.4999972581863403, "reward_std": 1.978998000140564e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3240 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.396373056994818, "grad_norm": 0.17387818343761, "kl": 0.089599609375, "learning_rate": 1.6062176165803107e-07, "loss": -0.0001, "reward": 2.4999972581863403, "reward_std": 2.1541280261772044e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3241 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.398963730569948, "grad_norm": 0.07814533365428449, "kl": 0.0535888671875, "learning_rate": 1.6036269430051812e-07, "loss": 0.0001, "reward": 2.4999990463256836, "reward_std": 1.040197076918048e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3242 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.401554404145077, "grad_norm": 0.37308276998310835, "kl": 0.117919921875, "learning_rate": 1.6010362694300518e-07, "loss": 0.0015, "reward": 2.4999940395355225, "reward_std": 2.4687550705948524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3243 }, { "clip_ratio": 0.0, "completion_length": 34.125, "epoch": 8.404145077720207, "grad_norm": 0.34881767606382474, "kl": 0.146484375, "learning_rate": 1.5984455958549223e-07, "loss": 0.0003, "reward": 2.49999463558197, "reward_std": 3.860333890770562e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3244 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.406735751295336, "grad_norm": 1.3354813148493538, "kl": 0.083251953125, "learning_rate": 1.5958549222797926e-07, "loss": 0.0002, "reward": 2.499993681907654, "reward_std": 4.841173563363554e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999936819076538, "step": 3245 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.409326424870466, "grad_norm": 0.08592330904696557, "kl": 0.10400390625, "learning_rate": 1.5932642487046634e-07, "loss": 0.0006, "reward": 2.4999985694885254, "reward_std": 1.1443437415437074e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3246 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.411917098445596, "grad_norm": 2.3490496448675167, "kl": 0.100830078125, "learning_rate": 1.5906735751295336e-07, "loss": 0.0, "reward": 1.9984028339385986, "reward_std": 2.9188460757723078e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984028935432434, "step": 3247 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.414507772020725, "grad_norm": 0.06966504060667966, "kl": 0.13427734375, "learning_rate": 1.588082901554404e-07, "loss": 0.0011, "reward": 2.499997138977051, "reward_std": 1.6413511332302733e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3248 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.417098445595855, "grad_norm": 0.17927949218509098, "kl": 0.08056640625, "learning_rate": 1.5854922279792747e-07, "loss": 0.0013, "reward": 2.4999983310699463, "reward_std": 2.2327928377308126e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3249 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.419689119170984, "grad_norm": 0.037824058687407194, "kl": 0.16748046875, "learning_rate": 1.582901554404145e-07, "loss": 0.001, "reward": 2.499998450279236, "reward_std": 7.957415704140658e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3250 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.422279792746114, "grad_norm": 12.865394574033488, "kl": 0.1265869140625, "learning_rate": 1.5803108808290155e-07, "loss": 0.0001, "reward": 1.9134883284568787, "reward_std": 0.0003624329820013372, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4134882986545563, "step": 3251 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.424870466321243, "grad_norm": 0.4403927991964329, "kl": 0.16259765625, "learning_rate": 1.577720207253886e-07, "loss": 0.0002, "reward": 1.9976167678833008, "reward_std": 1.7445513776692678e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4976167678833008, "step": 3252 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.427461139896373, "grad_norm": 0.14228288015250842, "kl": 0.1326904296875, "learning_rate": 1.5751295336787565e-07, "loss": 0.0007, "reward": 2.4999982118606567, "reward_std": 1.1551248348951049e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3253 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.430051813471502, "grad_norm": 2.0148836197609494, "kl": 0.1904296875, "learning_rate": 1.5725388601036268e-07, "loss": 0.0, "reward": 2.499996304512024, "reward_std": 3.282113539171405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3254 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.432642487046632, "grad_norm": 3.57542976314061, "kl": 0.125244140625, "learning_rate": 1.5699481865284976e-07, "loss": -0.001, "reward": 2.499990224838257, "reward_std": 7.6941531688135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990463256836, "step": 3255 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.435233160621761, "grad_norm": 0.7637507544783885, "kl": 0.1806640625, "learning_rate": 1.5673575129533678e-07, "loss": 0.0006, "reward": 2.4999953508377075, "reward_std": 6.560696192536852e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3256 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.437823834196891, "grad_norm": 0.21408403832204953, "kl": 0.0789794921875, "learning_rate": 1.564766839378238e-07, "loss": -0.0001, "reward": 2.4999988079071045, "reward_std": 7.541374600350537e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3257 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.44041450777202, "grad_norm": 0.3785837802152817, "kl": 0.06884765625, "learning_rate": 1.562176165803109e-07, "loss": 0.0002, "reward": 2.499995231628418, "reward_std": 3.249091946599947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3258 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.44300518134715, "grad_norm": 249.56412273715605, "kl": 0.412109375, "learning_rate": 1.5595854922279791e-07, "loss": 0.0019, "reward": 1.9868806600570679, "reward_std": 0.000564672772085828, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4868807792663574, "step": 3259 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.44559585492228, "grad_norm": 0.15525033110824002, "kl": 0.08642578125, "learning_rate": 1.5569948186528497e-07, "loss": 0.0007, "reward": 2.4999985694885254, "reward_std": 1.7415273134702147e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3260 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 8.44818652849741, "grad_norm": 0.1675682962837026, "kl": 0.0406494140625, "learning_rate": 1.5544041450777202e-07, "loss": 0.0002, "reward": 2.4999979734420776, "reward_std": 1.4315731391434383e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3261 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.450777202072539, "grad_norm": 0.28081459854541224, "kl": 0.0960693359375, "learning_rate": 1.5518134715025907e-07, "loss": 0.001, "reward": 2.4999982118606567, "reward_std": 1.058569125689246e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3262 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.453367875647668, "grad_norm": 0.5980743679822971, "kl": 0.048095703125, "learning_rate": 1.549222797927461e-07, "loss": 0.0004, "reward": 2.4999959468841553, "reward_std": 3.200311198270356e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3263 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.455958549222798, "grad_norm": 1.140386029914927, "kl": 0.1435546875, "learning_rate": 1.5466321243523315e-07, "loss": 0.0004, "reward": 2.4999970197677612, "reward_std": 3.2830278087203624e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.458549222797927, "grad_norm": 1.673615121006343, "kl": 0.084716796875, "learning_rate": 1.544041450777202e-07, "loss": 0.0001, "reward": 2.499990224838257, "reward_std": 6.327432458874682e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 3265 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.461139896373057, "grad_norm": 0.2208932094048126, "kl": 0.145751953125, "learning_rate": 1.5414507772020723e-07, "loss": 0.0002, "reward": 2.499972105026245, "reward_std": 4.598367240760126e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999722242355347, "step": 3266 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.463730569948186, "grad_norm": 0.1965320442956369, "kl": 0.09619140625, "learning_rate": 1.538860103626943e-07, "loss": 0.0002, "reward": 2.499996066093445, "reward_std": 2.536736644742632e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3267 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 8.466321243523316, "grad_norm": 18.037481291621404, "kl": 0.210693359375, "learning_rate": 1.5362694300518134e-07, "loss": 0.0011, "reward": 1.9484704732894897, "reward_std": 0.0016775362241787661, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4484704732894897, "step": 3268 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.468911917098445, "grad_norm": 0.25522864478434343, "kl": 0.072265625, "learning_rate": 1.533678756476684e-07, "loss": 0.0009, "reward": 2.499997138977051, "reward_std": 1.963451666142646e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3269 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.471502590673575, "grad_norm": 0.09003623791447991, "kl": 0.02203369140625, "learning_rate": 1.5310880829015544e-07, "loss": 0.0001, "reward": 2.4999969005584717, "reward_std": 1.6059943277468847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3270 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.474093264248705, "grad_norm": 0.20795154777892821, "kl": 0.0360107421875, "learning_rate": 1.5284974093264247e-07, "loss": -0.0007, "reward": 2.499996781349182, "reward_std": 3.3171672839671373e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3271 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.476683937823834, "grad_norm": 0.21255969204162925, "kl": 0.08123779296875, "learning_rate": 1.5259067357512952e-07, "loss": 0.0012, "reward": 2.4999959468841553, "reward_std": 2.565349518590665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3272 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.479274611398964, "grad_norm": 0.05256515725884933, "kl": 0.081298828125, "learning_rate": 1.5233160621761657e-07, "loss": -0.0001, "reward": 2.499995470046997, "reward_std": 1.0015135103458306e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 3273 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.481865284974093, "grad_norm": 0.422305608252991, "kl": 0.063232421875, "learning_rate": 1.5207253886010362e-07, "loss": 0.0004, "reward": 2.4999953508377075, "reward_std": 2.56870990256175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3274 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.484455958549223, "grad_norm": 0.11164640534135978, "kl": 0.204345703125, "learning_rate": 1.5181347150259065e-07, "loss": 0.0018, "reward": 2.4999979734420776, "reward_std": 1.6274793779302854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3275 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.487046632124352, "grad_norm": 1.7973177731477983, "kl": 0.06671142578125, "learning_rate": 1.5155440414507773e-07, "loss": 0.0003, "reward": 2.4999920129776, "reward_std": 5.526389429633127e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 3276 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.489637305699482, "grad_norm": 1.7222743390429385, "kl": 0.07763671875, "learning_rate": 1.5129533678756476e-07, "loss": 0.0005, "reward": 2.4999918937683105, "reward_std": 7.834495420411258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999918341636658, "step": 3277 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 8.492227979274611, "grad_norm": 0.3201813158632273, "kl": 0.1708984375, "learning_rate": 1.5103626943005178e-07, "loss": 0.0004, "reward": 2.499995231628418, "reward_std": 3.447210815465951e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3278 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 8.494818652849741, "grad_norm": 13.389776839579126, "kl": 0.1268310546875, "learning_rate": 1.5077720207253886e-07, "loss": 0.001, "reward": 1.8866075277328491, "reward_std": 0.00038939237373369906, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.386607587337494, "step": 3279 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.49740932642487, "grad_norm": 888.290757360333, "kl": 0.09588623046875, "learning_rate": 1.505181347150259e-07, "loss": 0.0004, "reward": 1.983170986175537, "reward_std": 0.0002128577453959224, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4831709265708923, "step": 3280 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.5, "grad_norm": 8.183365356282343, "kl": 0.070556640625, "learning_rate": 1.5025906735751294e-07, "loss": 0.0009, "reward": 2.499973773956299, "reward_std": 2.2136076040624175e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999973714351654, "step": 3281 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.50259067357513, "grad_norm": 0.7334981416727272, "kl": 0.049072265625, "learning_rate": 1.5e-07, "loss": 0.0, "reward": 2.499978542327881, "reward_std": 4.261434241925599e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999786615371704, "step": 3282 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.505181347150259, "grad_norm": 0.20178492205725573, "kl": 0.0592041015625, "learning_rate": 1.4974093264248705e-07, "loss": 0.0005, "reward": 2.499996304512024, "reward_std": 2.4263547402370023e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3283 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 8.507772020725389, "grad_norm": 40.50431362763718, "kl": 0.089111328125, "learning_rate": 1.4948186528497407e-07, "loss": 0.0011, "reward": 2.1100460290908813, "reward_std": 0.24016640169040215, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6100459098815918, "step": 3284 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.510362694300518, "grad_norm": 0.4338287372722202, "kl": 0.0908203125, "learning_rate": 1.4922279792746112e-07, "loss": 0.0014, "reward": 2.4999974966049194, "reward_std": 2.4449706188534037e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3285 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.512953367875648, "grad_norm": 0.04874827423322555, "kl": 0.014404296875, "learning_rate": 1.4896373056994818e-07, "loss": 0.0005, "reward": 2.4999982118606567, "reward_std": 1.0233412126581243e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3286 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.515544041450777, "grad_norm": 0.47325646519770215, "kl": 0.0994873046875, "learning_rate": 1.487046632124352e-07, "loss": 0.0011, "reward": 2.4999959468841553, "reward_std": 4.1753178265935276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3287 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.518134715025907, "grad_norm": 2.6325875621177306, "kl": 0.140380859375, "learning_rate": 1.4844559585492228e-07, "loss": 0.0006, "reward": 2.4999921321868896, "reward_std": 5.6654648687981535e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920725822449, "step": 3288 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.520725388601036, "grad_norm": 0.048611474378057934, "kl": 0.060546875, "learning_rate": 1.481865284974093e-07, "loss": -0.0005, "reward": 2.4999988079071045, "reward_std": 7.428212995819194e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3289 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.523316062176166, "grad_norm": 32.34324384763752, "kl": 0.103271484375, "learning_rate": 1.479274611398964e-07, "loss": 0.0001, "reward": 1.8813066482543945, "reward_std": 0.0010774543043226004, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3813066482543945, "step": 3290 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.525906735751295, "grad_norm": 0.05185809583555954, "kl": 0.041259765625, "learning_rate": 1.4766839378238341e-07, "loss": 0.0012, "reward": 2.4999969005584717, "reward_std": 1.1581141450278665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3291 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.528497409326425, "grad_norm": 0.05662225260365626, "kl": 0.06640625, "learning_rate": 1.4740932642487047e-07, "loss": 0.0006, "reward": 2.4999982118606567, "reward_std": 1.2409597047735588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3292 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.531088082901555, "grad_norm": 0.3393649641395822, "kl": 0.084625244140625, "learning_rate": 1.4715025906735752e-07, "loss": -0.0004, "reward": 2.4999873638153076, "reward_std": 3.285905791017285e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999873638153076, "step": 3293 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.533678756476684, "grad_norm": 0.36928055302912044, "kl": 0.162109375, "learning_rate": 1.4689119170984455e-07, "loss": -0.0001, "reward": 2.499995708465576, "reward_std": 3.145457355913095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3294 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.536269430051814, "grad_norm": 0.23091887539162143, "kl": 0.091796875, "learning_rate": 1.466321243523316e-07, "loss": 0.0009, "reward": 2.499981999397278, "reward_std": 3.843254546609387e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999820590019226, "step": 3295 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.538860103626943, "grad_norm": 0.33369829537695067, "kl": 0.0446929931640625, "learning_rate": 1.4637305699481865e-07, "loss": 0.0005, "reward": 2.4990296363830566, "reward_std": 1.2910204077343224e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990296363830566, "step": 3296 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.541450777202073, "grad_norm": 2.099774127720425, "kl": 0.1669921875, "learning_rate": 1.461139896373057e-07, "loss": 0.0006, "reward": 1.9991334080696106, "reward_std": 4.378776657176786e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991332590579987, "step": 3297 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.544041450777202, "grad_norm": 0.38143977304861826, "kl": 0.0496826171875, "learning_rate": 1.4585492227979273e-07, "loss": 0.0016, "reward": 2.4999948740005493, "reward_std": 3.284486552956878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3298 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.546632124352332, "grad_norm": 3.0523892841509848, "kl": 0.06005859375, "learning_rate": 1.455958549222798e-07, "loss": 0.0006, "reward": 1.7927755117416382, "reward_std": 0.0002894496790872836, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.2927754819393158, "step": 3299 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.549222797927461, "grad_norm": 0.30116251332339555, "kl": 0.0726318359375, "learning_rate": 1.4533678756476684e-07, "loss": 0.0001, "reward": 2.4999985694885254, "reward_std": 1.3621807113395334e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3300 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.55181347150259, "grad_norm": 1.0737550503837174, "kl": 0.087646484375, "learning_rate": 1.4507772020725386e-07, "loss": -0.0001, "reward": 2.499998688697815, "reward_std": 1.0256756866056094e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3301 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.55440414507772, "grad_norm": 0.3029410961554839, "kl": 0.1630859375, "learning_rate": 1.4481865284974094e-07, "loss": 0.0005, "reward": 2.4999921321868896, "reward_std": 3.6748403999808943e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 3302 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.55699481865285, "grad_norm": 7.290144631469494, "kl": 0.1336669921875, "learning_rate": 1.4455958549222797e-07, "loss": 0.0004, "reward": 1.8220319151878357, "reward_std": 0.00028328357723239606, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3220319747924805, "step": 3303 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.55958549222798, "grad_norm": 0.2332791276731152, "kl": 0.092529296875, "learning_rate": 1.4430051813471502e-07, "loss": 0.0015, "reward": 2.4999961853027344, "reward_std": 2.628421157169214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3304 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.562176165803109, "grad_norm": 0.18279067045772782, "kl": 0.047119140625, "learning_rate": 1.4404145077720207e-07, "loss": 0.0008, "reward": 2.4999961853027344, "reward_std": 2.2781329107601778e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3305 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.564766839378239, "grad_norm": 0.15022808522873954, "kl": 0.18212890625, "learning_rate": 1.4378238341968913e-07, "loss": 0.0013, "reward": 2.4999979734420776, "reward_std": 1.3701589693937422e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3306 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.567357512953368, "grad_norm": 0.3644989581211705, "kl": 0.094482421875, "learning_rate": 1.4352331606217615e-07, "loss": 0.0004, "reward": 2.4999947547912598, "reward_std": 3.124775275864522e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3307 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.569948186528498, "grad_norm": 0.13362739170870355, "kl": 0.161376953125, "learning_rate": 1.432642487046632e-07, "loss": 0.0012, "reward": 2.4999983310699463, "reward_std": 1.5128913446460501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3308 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.572538860103627, "grad_norm": 0.026391820226991686, "kl": 0.12060546875, "learning_rate": 1.4300518134715026e-07, "loss": -0.0001, "reward": 2.4999983310699463, "reward_std": 9.363924675653834e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3309 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.575129533678757, "grad_norm": 2.239090648428185, "kl": 0.075439453125, "learning_rate": 1.4274611398963728e-07, "loss": -0.0011, "reward": 2.4999871253967285, "reward_std": 7.743637638668588e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999871850013733, "step": 3310 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.577720207253886, "grad_norm": 0.09145996380471086, "kl": 0.0572509765625, "learning_rate": 1.4248704663212436e-07, "loss": 0.0007, "reward": 2.4999990463256836, "reward_std": 1.0969515926717577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988675117493, "step": 3311 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.580310880829016, "grad_norm": 0.15629243860708478, "kl": 0.12060546875, "learning_rate": 1.422279792746114e-07, "loss": 0.0012, "reward": 2.499998092651367, "reward_std": 1.8979206402036652e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 8.582901554404145, "grad_norm": 6.242838938005496, "kl": 0.16650390625, "learning_rate": 1.4196891191709844e-07, "loss": 0.0008, "reward": 1.9489158987998962, "reward_std": 0.028667759289874084, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4489157497882843, "step": 3313 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.585492227979275, "grad_norm": 0.18985441770229575, "kl": 0.13818359375, "learning_rate": 1.417098445595855e-07, "loss": -0.0002, "reward": 2.499998450279236, "reward_std": 1.1788168876591953e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3314 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.588082901554404, "grad_norm": 0.23791420129562021, "kl": 0.04461669921875, "learning_rate": 1.4145077720207252e-07, "loss": 0.0006, "reward": 2.4999972581863403, "reward_std": 2.777960872890617e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3315 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.590673575129534, "grad_norm": 0.09437937366899564, "kl": 0.062255859375, "learning_rate": 1.4119170984455957e-07, "loss": 0.0005, "reward": 2.4999959468841553, "reward_std": 2.295070885338646e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3316 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.593264248704664, "grad_norm": 0.16615883619676566, "kl": 0.047210693359375, "learning_rate": 1.4093264248704663e-07, "loss": 0.0018, "reward": 2.499997854232788, "reward_std": 1.4581210621145146e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3317 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.595854922279793, "grad_norm": 3.537813508311042, "kl": 0.13720703125, "learning_rate": 1.4067357512953368e-07, "loss": -0.0002, "reward": 2.4998974800109863, "reward_std": 2.1676318624486157e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998974204063416, "step": 3318 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.598445595854923, "grad_norm": 0.2470497325019201, "kl": 0.069580078125, "learning_rate": 1.404145077720207e-07, "loss": 0.0016, "reward": 2.4999940395355225, "reward_std": 2.088128894683905e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 3319 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.601036269430052, "grad_norm": 0.26909185339872677, "kl": 0.13134765625, "learning_rate": 1.4015544041450778e-07, "loss": 0.0011, "reward": 2.499991774559021, "reward_std": 6.6673069341049995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999917149543762, "step": 3320 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.603626943005182, "grad_norm": 2.146610413069479, "kl": 0.1396484375, "learning_rate": 1.398963730569948e-07, "loss": 0.0006, "reward": 1.9963128566741943, "reward_std": 5.926458806015944e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4963128864765167, "step": 3321 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.606217616580311, "grad_norm": 0.12705175401992627, "kl": 0.084716796875, "learning_rate": 1.3963730569948186e-07, "loss": 0.0005, "reward": 2.4999988079071045, "reward_std": 1.0816191320373036e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3322 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.60880829015544, "grad_norm": 9.672743152946575, "kl": 0.09765625, "learning_rate": 1.3937823834196891e-07, "loss": 0.001, "reward": 1.957108497619629, "reward_std": 0.00017531489743305428, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4571084380149841, "step": 3323 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.61139896373057, "grad_norm": 0.052663728324344836, "kl": 0.08056640625, "learning_rate": 1.3911917098445594e-07, "loss": 0.0012, "reward": 2.4999958276748657, "reward_std": 1.8165787878388073e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3324 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.6139896373057, "grad_norm": 5.056037938743275, "kl": 0.130615234375, "learning_rate": 1.38860103626943e-07, "loss": 0.0001, "reward": 1.9528579711914062, "reward_std": 0.00015353200802792344, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4528579711914062, "step": 3325 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.61658031088083, "grad_norm": 0.12023500340132035, "kl": 0.07177734375, "learning_rate": 1.3860103626943005e-07, "loss": -0.0003, "reward": 2.4999983310699463, "reward_std": 1.720802060845017e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3326 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.619170984455959, "grad_norm": 0.32352513103022656, "kl": 0.01611328125, "learning_rate": 1.383419689119171e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 3.2249220112134935e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3327 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.621761658031089, "grad_norm": 87.49888650333517, "kl": 0.139404296875, "learning_rate": 1.3808290155440413e-07, "loss": 0.0016, "reward": 2.187451958656311, "reward_std": 0.25881180025001527, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6874518990516663, "step": 3328 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.624352331606218, "grad_norm": 0.1192222284628396, "kl": 0.0692138671875, "learning_rate": 1.378238341968912e-07, "loss": -0.0012, "reward": 2.499995231628418, "reward_std": 2.032030351983849e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3329 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.626943005181348, "grad_norm": 0.2373709354580264, "kl": 0.0755615234375, "learning_rate": 1.3756476683937823e-07, "loss": -0.0008, "reward": 1.9998705387115479, "reward_std": 6.1315019479479815e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998705387115479, "step": 3330 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.629533678756477, "grad_norm": 0.10628699498629816, "kl": 0.1171875, "learning_rate": 1.3730569948186526e-07, "loss": -0.0006, "reward": 2.499998092651367, "reward_std": 1.4112170845237415e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3331 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 8.632124352331607, "grad_norm": 8.231748297252688, "kl": 0.2783203125, "learning_rate": 1.3704663212435234e-07, "loss": 0.0019, "reward": 1.9945073127746582, "reward_std": 9.54220321318644e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4945071935653687, "step": 3332 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.634715025906736, "grad_norm": 0.1999669731604068, "kl": 0.0706787109375, "learning_rate": 1.3678756476683936e-07, "loss": 0.0011, "reward": 2.4999959468841553, "reward_std": 2.630793062508019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995768070221, "step": 3333 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.637305699481866, "grad_norm": 1.6101286665730745, "kl": 0.097412109375, "learning_rate": 1.3652849740932641e-07, "loss": 0.0012, "reward": 1.9999347925186157, "reward_std": 1.3890817399442312e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999346137046814, "step": 3334 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.639896373056995, "grad_norm": 2.141494039933273, "kl": 0.265625, "learning_rate": 1.3626943005181347e-07, "loss": 0.001, "reward": 1.999935269355774, "reward_std": 1.1870372759403836e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999352097511292, "step": 3335 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.642487046632125, "grad_norm": 0.4323880814192606, "kl": 0.04364013671875, "learning_rate": 1.3601036269430052e-07, "loss": -0.0001, "reward": 2.4999862909317017, "reward_std": 5.350068477127934e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999864101409912, "step": 3336 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.645077720207254, "grad_norm": 0.3379876749555578, "kl": 0.095458984375, "learning_rate": 1.3575129533678755e-07, "loss": 0.0001, "reward": 2.4999951124191284, "reward_std": 3.665665417429409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3337 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.647668393782384, "grad_norm": 0.17462334661822623, "kl": 0.0657958984375, "learning_rate": 1.354922279792746e-07, "loss": -0.0002, "reward": 2.499997615814209, "reward_std": 2.1958989009362995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3338 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.650259067357513, "grad_norm": 2.7550804531142985, "kl": 0.16357421875, "learning_rate": 1.3523316062176165e-07, "loss": -0.0005, "reward": 2.4999966621398926, "reward_std": 2.5040316131708096e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3339 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.652849740932643, "grad_norm": 0.222883162863987, "kl": 0.0333251953125, "learning_rate": 1.3497409326424868e-07, "loss": -0.0, "reward": 2.4999974966049194, "reward_std": 2.5459948460593296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3340 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.655440414507773, "grad_norm": 1.918787228508702, "kl": 0.1190185546875, "learning_rate": 1.3471502590673576e-07, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 2.8247826548977173e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3341 }, { "clip_ratio": 0.0, "completion_length": 35.625, "epoch": 8.658031088082902, "grad_norm": 2.349144252092865, "kl": 0.14666748046875, "learning_rate": 1.3445595854922278e-07, "loss": 0.0001, "reward": 1.9939061403274536, "reward_std": 6.075625594803569e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4939061105251312, "step": 3342 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.660621761658032, "grad_norm": 2.5255275075531296, "kl": 0.113037109375, "learning_rate": 1.3419689119170984e-07, "loss": 0.0003, "reward": 1.9957151412963867, "reward_std": 8.075975711108185e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4957151412963867, "step": 3343 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.663212435233161, "grad_norm": 4.800438367423024, "kl": 0.07861328125, "learning_rate": 1.339378238341969e-07, "loss": 0.0004, "reward": 1.9972680807113647, "reward_std": 0.00010621248952702445, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4972682297229767, "step": 3344 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.66580310880829, "grad_norm": 1.2786170110597592, "kl": 0.09130859375, "learning_rate": 1.3367875647668391e-07, "loss": 0.0004, "reward": 2.4999678134918213, "reward_std": 6.944649157958338e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999680519104004, "step": 3345 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 8.66839378238342, "grad_norm": 0.11792085335870053, "kl": 0.08251953125, "learning_rate": 1.3341968911917097e-07, "loss": 0.0006, "reward": 2.499996781349182, "reward_std": 2.474585585332534e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3346 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.67098445595855, "grad_norm": 1.3058351917696434, "kl": 0.30322265625, "learning_rate": 1.3316062176165802e-07, "loss": 0.0015, "reward": 1.4999983310699463, "reward_std": 2.2568951862922404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999983310699463, "step": 3347 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.67357512953368, "grad_norm": 1.1579950551518363, "kl": 0.720703125, "learning_rate": 1.3290155440414507e-07, "loss": 0.0033, "reward": 2.4999979734420776, "reward_std": 1.8917681359198468e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3348 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.676165803108809, "grad_norm": 0.17728445270097165, "kl": 0.1552734375, "learning_rate": 1.326424870466321e-07, "loss": 0.0021, "reward": 2.4999985694885254, "reward_std": 1.0873091582652705e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3349 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.678756476683938, "grad_norm": 0.052987725168563635, "kl": 0.0985107421875, "learning_rate": 1.3238341968911918e-07, "loss": 0.0007, "reward": 2.4999979734420776, "reward_std": 1.2178700217191363e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3350 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.681347150259068, "grad_norm": 0.6457747909234716, "kl": 0.085205078125, "learning_rate": 1.321243523316062e-07, "loss": 0.0011, "reward": 2.499997854232788, "reward_std": 1.68019590773838e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3351 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.683937823834198, "grad_norm": 0.2160131321836147, "kl": 0.072509765625, "learning_rate": 1.3186528497409328e-07, "loss": 0.0006, "reward": 2.499992251396179, "reward_std": 3.113205821136944e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 3352 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.686528497409327, "grad_norm": 0.05455523936309753, "kl": 0.0693359375, "learning_rate": 1.316062176165803e-07, "loss": 0.0007, "reward": 2.4999992847442627, "reward_std": 7.580394765227538e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999994039535522, "step": 3353 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.689119170984457, "grad_norm": 0.41548623545963126, "kl": 0.102020263671875, "learning_rate": 1.3134715025906734e-07, "loss": -0.0008, "reward": 2.499995708465576, "reward_std": 2.1361069002523436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3354 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.691709844559586, "grad_norm": 0.44876892370022803, "kl": 0.14111328125, "learning_rate": 1.3108808290155442e-07, "loss": 0.001, "reward": 2.4999840259552, "reward_std": 3.816244543486391e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999839663505554, "step": 3355 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 8.694300518134716, "grad_norm": 0.04852392060888601, "kl": 0.054046630859375, "learning_rate": 1.3082901554404144e-07, "loss": -0.0011, "reward": 2.499998688697815, "reward_std": 8.175171331004094e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3356 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.696891191709845, "grad_norm": 0.06473490215329131, "kl": 0.14453125, "learning_rate": 1.305699481865285e-07, "loss": 0.0003, "reward": 2.4999985694885254, "reward_std": 9.483069618454465e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3357 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.699481865284975, "grad_norm": 0.2484608496586633, "kl": 0.087890625, "learning_rate": 1.3031088082901555e-07, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 3.37998426402919e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3358 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.702072538860104, "grad_norm": 0.09615785421311752, "kl": 0.11181640625, "learning_rate": 1.300518134715026e-07, "loss": -0.0006, "reward": 2.49999737739563, "reward_std": 1.6123695445457997e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3359 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.704663212435234, "grad_norm": 0.09051185820935538, "kl": 0.0614013671875, "learning_rate": 1.2979274611398963e-07, "loss": 0.0013, "reward": 2.499997854232788, "reward_std": 1.7361154505124432e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3360 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 8.707253886010363, "grad_norm": 2.484006206452572, "kl": 0.578125, "learning_rate": 1.2953367875647668e-07, "loss": 0.0022, "reward": 2.499996066093445, "reward_std": 2.655085609148955e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3361 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.709844559585493, "grad_norm": 0.2882526084433037, "kl": 0.09716796875, "learning_rate": 1.2927461139896373e-07, "loss": -0.0003, "reward": 2.499996304512024, "reward_std": 2.4396557023464993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3362 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.712435233160623, "grad_norm": 6.732846598057207, "kl": 0.138427734375, "learning_rate": 1.2901554404145076e-07, "loss": 0.0005, "reward": 1.4908652901649475, "reward_std": 9.542973202769645e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9908652901649475, "step": 3363 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.715025906735752, "grad_norm": 0.10592116197476391, "kl": 0.120361328125, "learning_rate": 1.2875647668393784e-07, "loss": 0.0006, "reward": 2.4999977350234985, "reward_std": 1.4710834079778579e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3364 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.717616580310882, "grad_norm": 0.04345875062291035, "kl": 0.096923828125, "learning_rate": 1.2849740932642486e-07, "loss": 0.0008, "reward": 2.4999988079071045, "reward_std": 9.374131195727387e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3365 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.720207253886011, "grad_norm": 0.19711894869061544, "kl": 0.048828125, "learning_rate": 1.2823834196891192e-07, "loss": 0.0003, "reward": 2.4999979734420776, "reward_std": 1.7973872559196025e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3366 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.72279792746114, "grad_norm": 1.7044919418024835, "kl": 0.10009765625, "learning_rate": 1.2797927461139897e-07, "loss": 0.0004, "reward": 2.4999847412109375, "reward_std": 3.951132839574711e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999846816062927, "step": 3367 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.72538860103627, "grad_norm": 0.5077745611054267, "kl": 0.06103515625, "learning_rate": 1.27720207253886e-07, "loss": 0.0007, "reward": 2.499977231025696, "reward_std": 4.563202509189068e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999773502349854, "step": 3368 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.7279792746114, "grad_norm": 0.19792402413868987, "kl": 0.085693359375, "learning_rate": 1.2746113989637305e-07, "loss": 0.0011, "reward": 2.499993085861206, "reward_std": 2.66647657554131e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 3369 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.73056994818653, "grad_norm": 0.19891273474068036, "kl": 0.11865234375, "learning_rate": 1.272020725388601e-07, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 2.8615055498448783e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3370 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.733160621761659, "grad_norm": 0.546508278415925, "kl": 0.069091796875, "learning_rate": 1.2694300518134715e-07, "loss": 0.0, "reward": 2.4999932050704956, "reward_std": 3.0885859132467886e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999993085861206, "step": 3371 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.735751295336787, "grad_norm": 2.475547396579339, "kl": 0.28125, "learning_rate": 1.2668393782383418e-07, "loss": 0.0017, "reward": 2.4999942779541016, "reward_std": 6.81601545693411e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 3372 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.738341968911918, "grad_norm": 0.11417566788695352, "kl": 0.104736328125, "learning_rate": 1.2642487046632126e-07, "loss": 0.0001, "reward": 2.4999983310699463, "reward_std": 1.6251477745754528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3373 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.740932642487046, "grad_norm": 0.11320119569271186, "kl": 0.038330078125, "learning_rate": 1.2616580310880828e-07, "loss": 0.0006, "reward": 2.499997615814209, "reward_std": 1.8581488347990671e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3374 }, { "clip_ratio": 0.0, "completion_length": 35.1875, "epoch": 8.743523316062177, "grad_norm": 2.389833879077716, "kl": 0.112060546875, "learning_rate": 1.259067357512953e-07, "loss": 0.0004, "reward": 0.9998639822006226, "reward_std": 1.3021407539781649e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.49986398220062256, "step": 3375 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.746113989637305, "grad_norm": 13.207390194210728, "kl": 0.072021484375, "learning_rate": 1.256476683937824e-07, "loss": 0.0012, "reward": 2.4999855756759644, "reward_std": 6.485303856607061e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998539686203, "step": 3376 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.748704663212436, "grad_norm": 0.11587323707575481, "kl": 0.070068359375, "learning_rate": 1.2538860103626942e-07, "loss": 0.0012, "reward": 2.4999961853027344, "reward_std": 1.9090615950290157e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3377 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.751295336787564, "grad_norm": 0.5112303400714473, "kl": 0.129638671875, "learning_rate": 1.2512953367875647e-07, "loss": 0.0007, "reward": 2.499993324279785, "reward_std": 4.512670443546085e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 3378 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.753886010362695, "grad_norm": 1.24867209094399, "kl": 0.1376953125, "learning_rate": 1.2487046632124352e-07, "loss": 0.001, "reward": 2.499967575073242, "reward_std": 9.27197379496647e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999675154685974, "step": 3379 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.756476683937823, "grad_norm": 0.21552721080437914, "kl": 0.1318359375, "learning_rate": 1.2461139896373057e-07, "loss": -0.0001, "reward": 2.4999966621398926, "reward_std": 2.618967869238986e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3380 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.759067357512954, "grad_norm": 0.0809784140780734, "kl": 0.0799560546875, "learning_rate": 1.243523316062176e-07, "loss": 0.0, "reward": 2.4999983310699463, "reward_std": 9.920318433387365e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3381 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.761658031088082, "grad_norm": 0.07781043681765573, "kl": 0.0799560546875, "learning_rate": 1.2409326424870465e-07, "loss": 0.0, "reward": 2.499997615814209, "reward_std": 1.3446606033085118e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3382 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.764248704663213, "grad_norm": 8.14002278469235, "kl": 0.931640625, "learning_rate": 1.238341968911917e-07, "loss": 0.0059, "reward": 2.49999737739563, "reward_std": 3.793397951312727e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3383 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.766839378238341, "grad_norm": 0.19466277075521876, "kl": 0.0657958984375, "learning_rate": 1.2357512953367876e-07, "loss": -0.0011, "reward": 2.4999983310699463, "reward_std": 1.0766609648271697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3384 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.76943005181347, "grad_norm": 0.04759774304791635, "kl": 0.13623046875, "learning_rate": 1.233160621761658e-07, "loss": 0.0004, "reward": 2.4999988079071045, "reward_std": 1.1763380598495132e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3385 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.7720207253886, "grad_norm": 0.02926639369798685, "kl": 0.0863037109375, "learning_rate": 1.2305699481865284e-07, "loss": 0.0005, "reward": 2.4999985694885254, "reward_std": 8.522496557361592e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3386 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.77461139896373, "grad_norm": 0.12333520307934347, "kl": 0.080078125, "learning_rate": 1.227979274611399e-07, "loss": 0.0006, "reward": 2.499995708465576, "reward_std": 2.22814196604304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 3387 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.77720207253886, "grad_norm": 0.6794672691516138, "kl": 0.140380859375, "learning_rate": 1.2253886010362694e-07, "loss": 0.0011, "reward": 2.499993920326233, "reward_std": 4.174207731466595e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3388 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.779792746113989, "grad_norm": 0.10605964928966936, "kl": 0.0911865234375, "learning_rate": 1.22279792746114e-07, "loss": 0.0015, "reward": 2.499997854232788, "reward_std": 2.2739724272469175e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3389 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.782383419689118, "grad_norm": 0.3280459363386293, "kl": 0.0562744140625, "learning_rate": 1.2202072538860102e-07, "loss": 0.0002, "reward": 2.4999983310699463, "reward_std": 2.0961283553333487e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3390 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.784974093264248, "grad_norm": 0.8735789536020454, "kl": 0.201171875, "learning_rate": 1.2176165803108807e-07, "loss": 0.0017, "reward": 2.499995470046997, "reward_std": 4.059769423747639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3391 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.787564766839377, "grad_norm": 0.30267270359878357, "kl": 0.0721435546875, "learning_rate": 1.2150259067357513e-07, "loss": -0.0009, "reward": 2.4999920129776, "reward_std": 5.083951919004903e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999919533729553, "step": 3392 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.790155440414507, "grad_norm": 0.1246194083405312, "kl": 0.11529541015625, "learning_rate": 1.2124352331606218e-07, "loss": 0.0005, "reward": 2.4999961853027344, "reward_std": 1.7325169210380409e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999962449073792, "step": 3393 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.792746113989637, "grad_norm": 1.2250946380890442, "kl": 0.1865234375, "learning_rate": 1.209844559585492e-07, "loss": 0.0, "reward": 2.4999911785125732, "reward_std": 5.700353085558163e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999911785125732, "step": 3394 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.795336787564766, "grad_norm": 0.9204697416740735, "kl": 0.094482421875, "learning_rate": 1.2072538860103626e-07, "loss": 0.0004, "reward": 2.4999964237213135, "reward_std": 2.5394680278623127e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3395 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.797927461139896, "grad_norm": 0.20703529222158454, "kl": 0.130615234375, "learning_rate": 1.204663212435233e-07, "loss": 0.002, "reward": 2.4999985694885254, "reward_std": 2.137543049229862e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3396 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.800518134715025, "grad_norm": 0.3575262502211687, "kl": 0.092529296875, "learning_rate": 1.2020725388601036e-07, "loss": 0.0001, "reward": 2.499993324279785, "reward_std": 4.768395683640847e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 3397 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.803108808290155, "grad_norm": 0.05870856918951048, "kl": 0.0718994140625, "learning_rate": 1.1994818652849742e-07, "loss": 0.0004, "reward": 2.4999983310699463, "reward_std": 1.7038536270774784e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3398 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.805699481865284, "grad_norm": 2.0867867410200565, "kl": 0.172119140625, "learning_rate": 1.1968911917098444e-07, "loss": 0.0004, "reward": 1.9940532445907593, "reward_std": 7.350150809770639e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4940531849861145, "step": 3399 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.808290155440414, "grad_norm": 0.8632239035603295, "kl": 0.120849609375, "learning_rate": 1.194300518134715e-07, "loss": 0.0001, "reward": 2.4999959468841553, "reward_std": 4.580572635859426e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3400 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.810880829015543, "grad_norm": 0.0729098832607847, "kl": 0.08251953125, "learning_rate": 1.1917098445595853e-07, "loss": -0.0006, "reward": 2.499996542930603, "reward_std": 1.7641883118812984e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3401 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.813471502590673, "grad_norm": 0.23990813180418225, "kl": 0.080322265625, "learning_rate": 1.1891191709844559e-07, "loss": -0.0007, "reward": 2.4999945163726807, "reward_std": 2.2739394012205594e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999946355819702, "step": 3402 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.816062176165802, "grad_norm": 2.030009971004229, "kl": 0.093994140625, "learning_rate": 1.1865284974093264e-07, "loss": 0.0003, "reward": 2.4999921321868896, "reward_std": 6.937393322914431e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 3403 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.818652849740932, "grad_norm": 0.21628166150201608, "kl": 0.0340576171875, "learning_rate": 1.1839378238341968e-07, "loss": -0.0002, "reward": 2.49998140335083, "reward_std": 3.5874165860150242e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812245368958, "step": 3404 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.821243523316062, "grad_norm": 1.4789656100594275, "kl": 0.0611572265625, "learning_rate": 1.1813471502590673e-07, "loss": 0.0004, "reward": 1.9987174272537231, "reward_std": 2.506039834315743e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498717337846756, "step": 3405 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.823834196891191, "grad_norm": 0.6584813261616805, "kl": 0.15283203125, "learning_rate": 1.1787564766839378e-07, "loss": 0.0006, "reward": 2.4999940395355225, "reward_std": 6.349759019030898e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942183494568, "step": 3406 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.82642487046632, "grad_norm": 0.903754639628972, "kl": 0.08319091796875, "learning_rate": 1.1761658031088082e-07, "loss": 0.0001, "reward": 2.499992251396179, "reward_std": 6.14923578723392e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992311000824, "step": 3407 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.82901554404145, "grad_norm": 0.8044773421614164, "kl": 0.0947265625, "learning_rate": 1.1735751295336788e-07, "loss": 0.0018, "reward": 2.4999932050704956, "reward_std": 2.198234057004811e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 3408 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.83160621761658, "grad_norm": 3.5446106159507984, "kl": 0.082122802734375, "learning_rate": 1.1709844559585492e-07, "loss": 0.0001, "reward": 1.9997522830963135, "reward_std": 3.200868434305448e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499752402305603, "step": 3409 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.83419689119171, "grad_norm": 37.09705470061804, "kl": 0.16943359375, "learning_rate": 1.1683937823834196e-07, "loss": 0.0008, "reward": 1.4967734217643738, "reward_std": 0.0031877163237368222, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9967733323574066, "step": 3410 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.836787564766839, "grad_norm": 1.994434347956502, "kl": 0.096435546875, "learning_rate": 1.1658031088082901e-07, "loss": -0.0001, "reward": 1.9998699426651, "reward_std": 1.3150222002877854e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998700320720673, "step": 3411 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.839378238341968, "grad_norm": 20.869051909831594, "kl": 0.12841796875, "learning_rate": 1.1632124352331606e-07, "loss": -0.0002, "reward": 2.4999842643737793, "reward_std": 8.628997193227406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984323978424, "step": 3412 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.841968911917098, "grad_norm": 0.09278710863050392, "kl": 0.0345458984375, "learning_rate": 1.160621761658031e-07, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 2.149674003248947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3413 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.844559585492227, "grad_norm": 0.04247926486749104, "kl": 0.072509765625, "learning_rate": 1.1580310880829015e-07, "loss": -0.0004, "reward": 2.4999992847442627, "reward_std": 8.513656837294548e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 3414 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.847150259067357, "grad_norm": 0.4014779174639785, "kl": 0.04632568359375, "learning_rate": 1.155440414507772e-07, "loss": -0.0001, "reward": 2.499996304512024, "reward_std": 3.181224201398436e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3415 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.849740932642487, "grad_norm": 15.656839243266678, "kl": 0.068359375, "learning_rate": 1.1528497409326423e-07, "loss": 0.0, "reward": 2.4998550415039062, "reward_std": 2.529534822315327e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998552203178406, "step": 3416 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.852331606217616, "grad_norm": 0.35280788167109844, "kl": 0.0791015625, "learning_rate": 1.1502590673575128e-07, "loss": 0.001, "reward": 2.499988079071045, "reward_std": 4.463617074179638e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999988079071045, "step": 3417 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.854922279792746, "grad_norm": 17.988626912078182, "kl": 0.3271484375, "learning_rate": 1.1476683937823834e-07, "loss": 0.0013, "reward": 1.8640047311782837, "reward_std": 0.00017136085398306022, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3640047311782837, "step": 3418 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.857512953367875, "grad_norm": 58.52317483085468, "kl": 0.201171875, "learning_rate": 1.1450777202072538e-07, "loss": 0.0002, "reward": 1.9781255722045898, "reward_std": 0.00012090897871530615, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4781257510185242, "step": 3419 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.860103626943005, "grad_norm": 0.03440881905499051, "kl": 0.0496826171875, "learning_rate": 1.1424870466321243e-07, "loss": 0.0006, "reward": 2.4999990463256836, "reward_std": 9.499318878170016e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991655349731, "step": 3420 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.862694300518134, "grad_norm": 0.44400735209983677, "kl": 0.119140625, "learning_rate": 1.1398963730569948e-07, "loss": 0.0005, "reward": 2.4999955892562866, "reward_std": 5.478663979374687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3421 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.865284974093264, "grad_norm": 0.4163958862861426, "kl": 0.05712890625, "learning_rate": 1.1373056994818652e-07, "loss": -0.0009, "reward": 2.499996304512024, "reward_std": 2.6456235673322226e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3422 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.867875647668393, "grad_norm": 0.32583217337005665, "kl": 0.05718994140625, "learning_rate": 1.1347150259067357e-07, "loss": -0.0015, "reward": 2.4999964237213135, "reward_std": 2.366817142274158e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3423 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.870466321243523, "grad_norm": 0.10256511655733214, "kl": 0.080078125, "learning_rate": 1.1321243523316061e-07, "loss": 0.0011, "reward": 2.4999966621398926, "reward_std": 1.6849464117285606e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3424 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.873056994818652, "grad_norm": 0.12580832656158075, "kl": 0.100341796875, "learning_rate": 1.1295336787564767e-07, "loss": 0.0004, "reward": 2.4999982118606567, "reward_std": 1.3801606542074296e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3425 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.875647668393782, "grad_norm": 0.05120077976122874, "kl": 0.070556640625, "learning_rate": 1.126943005181347e-07, "loss": -0.0008, "reward": 2.499999523162842, "reward_std": 3.466466154122827e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999998211860657, "step": 3426 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.878238341968911, "grad_norm": 0.048049701245307976, "kl": 0.1005859375, "learning_rate": 1.1243523316062176e-07, "loss": -0.0, "reward": 2.4999985694885254, "reward_std": 1.0201388533914724e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3427 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.880829015544041, "grad_norm": 0.29481804680087365, "kl": 0.0462646484375, "learning_rate": 1.1217616580310881e-07, "loss": 0.0002, "reward": 2.49999737739563, "reward_std": 2.4502935502823675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3428 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.88341968911917, "grad_norm": 5.797691448128938, "kl": 0.093505859375, "learning_rate": 1.1191709844559585e-07, "loss": 0.0005, "reward": 2.4999908208847046, "reward_std": 9.91011756923399e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999908804893494, "step": 3429 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.8860103626943, "grad_norm": 5.16368005322399, "kl": 0.134765625, "learning_rate": 1.116580310880829e-07, "loss": 0.0001, "reward": 1.9451864361763, "reward_std": 0.00021720198492403142, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4451864361763, "step": 3430 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.88860103626943, "grad_norm": 0.5404804688142195, "kl": 0.095458984375, "learning_rate": 1.1139896373056994e-07, "loss": 0.0002, "reward": 2.499993324279785, "reward_std": 5.418317641670001e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 3431 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.89119170984456, "grad_norm": 0.1579308146987876, "kl": 0.0567626953125, "learning_rate": 1.1113989637305698e-07, "loss": 0.0011, "reward": 2.4999969005584717, "reward_std": 1.4467167090970179e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3432 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.893782383419689, "grad_norm": 0.47639674865865966, "kl": 0.094970703125, "learning_rate": 1.1088082901554403e-07, "loss": 0.0002, "reward": 2.4999961853027344, "reward_std": 3.112467084065429e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3433 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.896373056994818, "grad_norm": 2.811170479359201, "kl": 0.13720703125, "learning_rate": 1.1062176165803109e-07, "loss": 0.0003, "reward": 1.9984723925590515, "reward_std": 4.5855091002522386e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984723627567291, "step": 3434 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.898963730569948, "grad_norm": 4.107889865313289, "kl": 0.080078125, "learning_rate": 1.1036269430051813e-07, "loss": -0.0008, "reward": 2.499996542930603, "reward_std": 2.4765067223597725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3435 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.901554404145077, "grad_norm": 4.3220549238242665, "kl": 0.12841796875, "learning_rate": 1.1010362694300518e-07, "loss": -0.0005, "reward": 1.9741721153259277, "reward_std": 0.00016704053552984988, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.474172204732895, "step": 3436 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.904145077720207, "grad_norm": 0.16821721461119385, "kl": 0.056396484375, "learning_rate": 1.0984455958549223e-07, "loss": 0.0007, "reward": 2.4999847412109375, "reward_std": 2.912349188477492e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 3437 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.906735751295336, "grad_norm": 0.04496954180309151, "kl": 0.0660400390625, "learning_rate": 1.0958549222797927e-07, "loss": 0.0004, "reward": 2.4999972581863403, "reward_std": 1.2920062886223604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3438 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.909326424870466, "grad_norm": 53.79083783966578, "kl": 0.13232421875, "learning_rate": 1.0932642487046631e-07, "loss": -0.0005, "reward": 1.9989625811576843, "reward_std": 0.00023589314776018, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989627003669739, "step": 3439 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.911917098445596, "grad_norm": 1.6221286608116776, "kl": 0.17919921875, "learning_rate": 1.0906735751295336e-07, "loss": -0.0004, "reward": 1.9985601902008057, "reward_std": 2.8334295166132506e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985601902008057, "step": 3440 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.914507772020725, "grad_norm": 0.07045610481541544, "kl": 0.133056640625, "learning_rate": 1.088082901554404e-07, "loss": 0.0016, "reward": 2.4999982118606567, "reward_std": 1.3275659398459538e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3441 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.917098445595855, "grad_norm": 0.11411398842343769, "kl": 0.145263671875, "learning_rate": 1.0854922279792746e-07, "loss": 0.0005, "reward": 2.4998862743377686, "reward_std": 4.5016962531008176e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999886155128479, "step": 3442 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.919689119170984, "grad_norm": 0.08906425637105181, "kl": 0.0830078125, "learning_rate": 1.0829015544041451e-07, "loss": 0.0005, "reward": 2.4999994039535522, "reward_std": 4.821480814598544e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 3443 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 8.922279792746114, "grad_norm": 0.18617213821420595, "kl": 0.04833984375, "learning_rate": 1.0803108808290155e-07, "loss": 0.0001, "reward": 2.4999961853027344, "reward_std": 1.417240852674695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3444 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.924870466321243, "grad_norm": 0.1431428173596342, "kl": 0.05224609375, "learning_rate": 1.077720207253886e-07, "loss": -0.0007, "reward": 2.4999969005584717, "reward_std": 2.547374265304825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3445 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.927461139896373, "grad_norm": 1.1937464296417282, "kl": 0.084228515625, "learning_rate": 1.0751295336787564e-07, "loss": 0.0007, "reward": 2.4999951124191284, "reward_std": 5.58721046672872e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3446 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.930051813471502, "grad_norm": 4.780715840024769, "kl": 0.0665283203125, "learning_rate": 1.0725388601036268e-07, "loss": -0.0001, "reward": 1.9979389905929565, "reward_std": 5.5960548252187436e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497939020395279, "step": 3447 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.932642487046632, "grad_norm": 1.0849708617331706, "kl": 0.1328125, "learning_rate": 1.0699481865284973e-07, "loss": -0.0001, "reward": 1.9995614290237427, "reward_std": 1.5484875859783642e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995616972446442, "step": 3448 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.935233160621761, "grad_norm": 0.17426210578945972, "kl": 0.085693359375, "learning_rate": 1.0673575129533678e-07, "loss": -0.0004, "reward": 2.4999953508377075, "reward_std": 2.6784234705701238e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3449 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.937823834196891, "grad_norm": 4.112937615738767, "kl": 0.09521484375, "learning_rate": 1.0647668393782382e-07, "loss": 0.0006, "reward": 1.9779855012893677, "reward_std": 0.00013884341632319774, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4779855608940125, "step": 3450 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.94041450777202, "grad_norm": 1.2829217034006948, "kl": 0.096435546875, "learning_rate": 1.0621761658031088e-07, "loss": -0.0005, "reward": 1.999855101108551, "reward_std": 1.3056008356215898e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998551905155182, "step": 3451 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.94300518134715, "grad_norm": 0.43185681731887043, "kl": 0.05645751953125, "learning_rate": 1.0595854922279793e-07, "loss": 0.001, "reward": 1.999801516532898, "reward_std": 9.453267409753607e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998016357421875, "step": 3452 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.94559585492228, "grad_norm": 0.09068131547420637, "kl": 0.0489501953125, "learning_rate": 1.0569948186528498e-07, "loss": -0.0007, "reward": 2.499997615814209, "reward_std": 1.5699802986546274e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3453 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.94818652849741, "grad_norm": 0.20139149054237102, "kl": 0.1064453125, "learning_rate": 1.0544041450777201e-07, "loss": 0.0012, "reward": 2.4999974966049194, "reward_std": 1.6089898622340115e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3454 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.950777202072539, "grad_norm": 1.6649257056288238, "kl": 0.08099365234375, "learning_rate": 1.0518134715025906e-07, "loss": 0.0009, "reward": 2.4999940395355225, "reward_std": 5.416392070856091e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999939799308777, "step": 3455 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.953367875647668, "grad_norm": 0.06201081254614163, "kl": 0.0523681640625, "learning_rate": 1.0492227979274611e-07, "loss": -0.0001, "reward": 2.4999990463256836, "reward_std": 8.919098775095335e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3456 }, { "clip_ratio": 0.0, "completion_length": 35.125, "epoch": 8.955958549222798, "grad_norm": 0.06398299704463684, "kl": 0.04156494140625, "learning_rate": 1.0466321243523315e-07, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 1.4224654023564653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3457 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.958549222797927, "grad_norm": 14.559187055911945, "kl": 0.14111328125, "learning_rate": 1.044041450777202e-07, "loss": 0.0006, "reward": 1.2252216935157776, "reward_std": 0.28386136662447825, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.7252216935157776, "step": 3458 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.961139896373057, "grad_norm": 0.050963070702148984, "kl": 0.016204833984375, "learning_rate": 1.0414507772020726e-07, "loss": -0.0016, "reward": 2.499992847442627, "reward_std": 1.4169637552186032e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929666519165, "step": 3459 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.963730569948186, "grad_norm": 139.76525629377764, "kl": 0.0703125, "learning_rate": 1.038860103626943e-07, "loss": -0.0003, "reward": 1.8044943809509277, "reward_std": 0.00312106445653626, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3044945001602173, "step": 3460 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.966321243523316, "grad_norm": 0.19717132028313283, "kl": 0.08935546875, "learning_rate": 1.0362694300518134e-07, "loss": 0.0012, "reward": 2.499994397163391, "reward_std": 3.114770720458182e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999942779541016, "step": 3461 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.968911917098445, "grad_norm": 1.1157861237311544, "kl": 0.0848388671875, "learning_rate": 1.0336787564766839e-07, "loss": 0.0001, "reward": 2.4999642372131348, "reward_std": 1.1197418757546984e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999642372131348, "step": 3462 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 8.971502590673575, "grad_norm": 0.09207391835601894, "kl": 0.07281494140625, "learning_rate": 1.0310880829015543e-07, "loss": 0.0012, "reward": 2.4999983310699463, "reward_std": 1.3727566283705528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3463 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 8.974093264248705, "grad_norm": 5.98109888321172, "kl": 0.1348876953125, "learning_rate": 1.0284974093264248e-07, "loss": 0.001, "reward": 1.895218312740326, "reward_std": 0.0007560910520396646, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3952181339263916, "step": 3464 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 8.976683937823834, "grad_norm": 0.20376526679965523, "kl": 0.16748046875, "learning_rate": 1.0259067357512953e-07, "loss": 0.002, "reward": 2.4999977350234985, "reward_std": 1.7385539194947341e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3465 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.979274611398964, "grad_norm": 0.06407895937600917, "kl": 0.1123046875, "learning_rate": 1.0233160621761657e-07, "loss": 0.0, "reward": 2.4999985694885254, "reward_std": 9.888409522318398e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3466 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.981865284974093, "grad_norm": 1.3565622858874267, "kl": 0.1240234375, "learning_rate": 1.0207253886010363e-07, "loss": 0.0015, "reward": 2.499993085861206, "reward_std": 5.913242773658567e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 3467 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.984455958549223, "grad_norm": 2.273181171456491, "kl": 0.127685546875, "learning_rate": 1.0181347150259068e-07, "loss": 0.0002, "reward": 1.9992265701293945, "reward_std": 4.288488003112434e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992266595363617, "step": 3468 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.987046632124352, "grad_norm": 2.78575176240552, "kl": 0.1275634765625, "learning_rate": 1.015544041450777e-07, "loss": 0.0001, "reward": 1.999359905719757, "reward_std": 2.7430876500034174e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4993600249290466, "step": 3469 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.989637305699482, "grad_norm": 1.5601966406933825, "kl": 0.235107421875, "learning_rate": 1.0129533678756476e-07, "loss": 0.0017, "reward": 2.499997854232788, "reward_std": 2.147631334992184e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3470 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.992227979274611, "grad_norm": 0.5308453818860998, "kl": 0.046630859375, "learning_rate": 1.0103626943005181e-07, "loss": 0.0005, "reward": 2.499993085861206, "reward_std": 4.354723614596878e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999930262565613, "step": 3471 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 8.994818652849741, "grad_norm": 0.38284448902362284, "kl": 0.0379638671875, "learning_rate": 1.0077720207253885e-07, "loss": -0.0011, "reward": 2.4999964237213135, "reward_std": 2.533268229854002e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3472 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 8.99740932642487, "grad_norm": 2.677295087261124, "kl": 0.11669921875, "learning_rate": 1.005181347150259e-07, "loss": 0.0004, "reward": 1.9998379945755005, "reward_std": 2.7647049819279346e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998379349708557, "step": 3473 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.0, "grad_norm": 3.001279584051759, "kl": 0.159423828125, "learning_rate": 1.0025906735751296e-07, "loss": 0.0003, "reward": 1.9966130256652832, "reward_std": 5.19800078109256e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4966131150722504, "step": 3474 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.00259067357513, "grad_norm": 0.6593848763127426, "kl": 0.348876953125, "learning_rate": 1e-07, "loss": 0.0014, "reward": 2.499997615814209, "reward_std": 2.3731427774009717e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3475 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.005181347150259, "grad_norm": 0.1514879877640662, "kl": 0.08349609375, "learning_rate": 9.974093264248703e-08, "loss": -0.0005, "reward": 2.499997615814209, "reward_std": 2.2573627802557894e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3476 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.007772020725389, "grad_norm": 0.9536487934199707, "kl": 0.101318359375, "learning_rate": 9.948186528497409e-08, "loss": 0.0002, "reward": 2.4999868869781494, "reward_std": 7.404875304928282e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999868869781494, "step": 3477 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.010362694300518, "grad_norm": 13.645935854741467, "kl": 0.107177734375, "learning_rate": 9.922279792746113e-08, "loss": -0.0007, "reward": 2.4999918937683105, "reward_std": 1.0237132300972007e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 3478 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.012953367875648, "grad_norm": 0.44118924229686024, "kl": 0.03759765625, "learning_rate": 9.896373056994818e-08, "loss": 0.0002, "reward": 2.4999955892562866, "reward_std": 2.61203285845113e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3479 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.015544041450777, "grad_norm": 0.10497714823184288, "kl": 0.0947265625, "learning_rate": 9.870466321243523e-08, "loss": 0.0004, "reward": 2.4999955892562866, "reward_std": 1.8635282401646691e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3480 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.018134715025907, "grad_norm": 0.10328823100313811, "kl": 0.119873046875, "learning_rate": 9.844559585492227e-08, "loss": 0.0, "reward": 2.499997615814209, "reward_std": 2.059570306300884e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3481 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.020725388601036, "grad_norm": 5.61715145313828, "kl": 0.14111328125, "learning_rate": 9.818652849740932e-08, "loss": 0.0008, "reward": 1.999222993850708, "reward_std": 7.085562697284331e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4992229342460632, "step": 3482 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.023316062176166, "grad_norm": 0.07036952884612498, "kl": 0.0518798828125, "learning_rate": 9.792746113989638e-08, "loss": 0.0008, "reward": 2.4999988079071045, "reward_std": 1.11159255311577e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3483 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.025906735751295, "grad_norm": 0.4380365066412515, "kl": 0.064697265625, "learning_rate": 9.76683937823834e-08, "loss": -0.0003, "reward": 2.499990224838257, "reward_std": 4.9006062852186005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 3484 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.028497409326425, "grad_norm": 2.495643749107821, "kl": 0.17626953125, "learning_rate": 9.740932642487046e-08, "loss": 0.0009, "reward": 1.4108569025993347, "reward_std": 0.00014331345437312848, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9108568429946899, "step": 3485 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.031088082901555, "grad_norm": 0.3345279109883954, "kl": 0.1015625, "learning_rate": 9.715025906735751e-08, "loss": 0.001, "reward": 1.998820960521698, "reward_std": 1.0031901013007882e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4988210499286652, "step": 3486 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.033678756476684, "grad_norm": 1.2693213301170225, "kl": 0.147216796875, "learning_rate": 9.689119170984456e-08, "loss": -0.0001, "reward": 2.499998092651367, "reward_std": 1.9309435970171762e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3487 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.036269430051814, "grad_norm": 1.1066697222495046, "kl": 0.17333984375, "learning_rate": 9.66321243523316e-08, "loss": -0.001, "reward": 2.4999959468841553, "reward_std": 3.144507900287863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3488 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.038860103626943, "grad_norm": 6.743277964024613, "kl": 0.1875, "learning_rate": 9.637305699481865e-08, "loss": 0.0008, "reward": 1.9139612913131714, "reward_std": 0.0004165237122037979, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4139613509178162, "step": 3489 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.041450777202073, "grad_norm": 0.42999072864916266, "kl": 0.1298828125, "learning_rate": 9.61139896373057e-08, "loss": 0.0007, "reward": 2.4999910593032837, "reward_std": 2.8959943847439718e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999910593032837, "step": 3490 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.044041450777202, "grad_norm": 0.2901025006281915, "kl": 0.0859375, "learning_rate": 9.585492227979273e-08, "loss": 0.0012, "reward": 2.4999964237213135, "reward_std": 2.591831446352444e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3491 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.046632124352332, "grad_norm": 0.14821971298915504, "kl": 0.106689453125, "learning_rate": 9.559585492227979e-08, "loss": 0.0001, "reward": 2.4999940395355225, "reward_std": 2.6334391804994084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940991401672, "step": 3492 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.049222797927461, "grad_norm": 36.833069799240164, "kl": 0.105712890625, "learning_rate": 9.533678756476684e-08, "loss": 0.0006, "reward": 1.9517080783843994, "reward_std": 0.000696948525614971, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4517080783843994, "step": 3493 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05181347150259, "grad_norm": 7.408022671647803, "kl": 0.150390625, "learning_rate": 9.507772020725388e-08, "loss": 0.0007, "reward": 1.9843990802764893, "reward_std": 0.00019531736870703753, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4843991994857788, "step": 3494 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05440414507772, "grad_norm": 0.23193896606616435, "kl": 0.08624267578125, "learning_rate": 9.481865284974093e-08, "loss": 0.0003, "reward": 2.499997138977051, "reward_std": 2.67501661710412e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3495 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05699481865285, "grad_norm": 1.5772107997975955, "kl": 0.0438232421875, "learning_rate": 9.455958549222798e-08, "loss": 0.0001, "reward": 2.499995470046997, "reward_std": 3.7123882918876916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3496 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.05958549222798, "grad_norm": 0.882971446673172, "kl": 0.0677490234375, "learning_rate": 9.430051813471502e-08, "loss": -0.0, "reward": 2.499984383583069, "reward_std": 6.3757099724170985e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999843835830688, "step": 3497 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.062176165803109, "grad_norm": 0.08620103946244546, "kl": 0.080078125, "learning_rate": 9.404145077720207e-08, "loss": 0.0003, "reward": 2.499996066093445, "reward_std": 2.149706432419407e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3498 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.064766839378239, "grad_norm": 0.11164462883929452, "kl": 0.094482421875, "learning_rate": 9.378238341968911e-08, "loss": 0.0002, "reward": 2.499997615814209, "reward_std": 1.7218991388290306e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3499 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.067357512953368, "grad_norm": 0.041141774413814156, "kl": 0.072021484375, "learning_rate": 9.352331606217615e-08, "loss": -0.0003, "reward": 2.499998927116394, "reward_std": 1.111408181486695e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3500 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.069948186528498, "grad_norm": 0.14079751258954698, "kl": 0.083984375, "learning_rate": 9.32642487046632e-08, "loss": 0.0008, "reward": 2.4999992847442627, "reward_std": 8.097625254777086e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999993443489075, "step": 3501 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.072538860103627, "grad_norm": 0.5327683758851042, "kl": 0.0828857421875, "learning_rate": 9.300518134715026e-08, "loss": 0.0018, "reward": 2.499993920326233, "reward_std": 2.314488796173464e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 3502 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.075129533678757, "grad_norm": 0.19671401761087878, "kl": 0.0712890625, "learning_rate": 9.27461139896373e-08, "loss": 0.0003, "reward": 2.499994993209839, "reward_std": 2.544074845900468e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951720237732, "step": 3503 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.077720207253886, "grad_norm": 0.6099216761443084, "kl": 0.132568359375, "learning_rate": 9.248704663212435e-08, "loss": 0.0012, "reward": 2.499996304512024, "reward_std": 3.42810790243675e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3504 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.080310880829016, "grad_norm": 0.03981533333921242, "kl": 0.05352783203125, "learning_rate": 9.22279792746114e-08, "loss": 0.0005, "reward": 2.499997138977051, "reward_std": 1.4315186831481697e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3505 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.082901554404145, "grad_norm": 0.06132848935988859, "kl": 0.14892578125, "learning_rate": 9.196891191709843e-08, "loss": 0.0005, "reward": 2.4999905824661255, "reward_std": 2.222905777671258e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 3506 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.085492227979275, "grad_norm": 0.16083386118952564, "kl": 0.086181640625, "learning_rate": 9.170984455958548e-08, "loss": -0.0001, "reward": 2.4999974966049194, "reward_std": 1.9145832084177528e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3507 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.088082901554404, "grad_norm": 0.33732688001838157, "kl": 0.076416015625, "learning_rate": 9.145077720207254e-08, "loss": -0.0016, "reward": 2.4999964237213135, "reward_std": 3.306870041797083e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3508 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.090673575129534, "grad_norm": 0.06172871315433009, "kl": 0.058837890625, "learning_rate": 9.119170984455957e-08, "loss": -0.0004, "reward": 2.4999985694885254, "reward_std": 1.1362897396338667e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3509 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.093264248704664, "grad_norm": 0.1221763012444763, "kl": 0.0654296875, "learning_rate": 9.093264248704663e-08, "loss": 0.0006, "reward": 2.4999964237213135, "reward_std": 1.9944716882491775e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3510 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.095854922279793, "grad_norm": 0.10759559405656546, "kl": 0.1484375, "learning_rate": 9.067357512953368e-08, "loss": 0.0009, "reward": 2.4999983310699463, "reward_std": 1.3774486546935805e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3511 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.098445595854923, "grad_norm": 4.517936598874727, "kl": 0.114990234375, "learning_rate": 9.041450777202072e-08, "loss": -0.0006, "reward": 1.9975407123565674, "reward_std": 7.587642562612018e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4975408017635345, "step": 3512 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.101036269430052, "grad_norm": 3.4552481314677723, "kl": 0.0921630859375, "learning_rate": 9.015544041450777e-08, "loss": -0.0002, "reward": 1.998643696308136, "reward_std": 5.254331631476816e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4986439645290375, "step": 3513 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.103626943005182, "grad_norm": 0.1564918929933876, "kl": 0.1358642578125, "learning_rate": 8.989637305699481e-08, "loss": 0.0003, "reward": 2.4999961853027344, "reward_std": 3.0495034479827154e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3514 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.106217616580311, "grad_norm": 0.09146741348489058, "kl": 0.0478515625, "learning_rate": 8.963730569948185e-08, "loss": -0.001, "reward": 2.499997138977051, "reward_std": 1.5287637040728441e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3515 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.10880829015544, "grad_norm": 0.09953748251622452, "kl": 0.086669921875, "learning_rate": 8.93782383419689e-08, "loss": 0.0003, "reward": 2.49999737739563, "reward_std": 1.5666132640035357e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3516 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.11139896373057, "grad_norm": 0.44628701048263403, "kl": 0.0845947265625, "learning_rate": 8.911917098445596e-08, "loss": 0.0005, "reward": 2.499992609024048, "reward_std": 1.0490711019883747e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 3517 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.1139896373057, "grad_norm": 4.290890938745377, "kl": 0.1033935546875, "learning_rate": 8.886010362694301e-08, "loss": 0.0015, "reward": 2.49998939037323, "reward_std": 6.804335072274625e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99998939037323, "step": 3518 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.11658031088083, "grad_norm": 0.08609135237311949, "kl": 0.14453125, "learning_rate": 8.860103626943005e-08, "loss": 0.0004, "reward": 2.499998092651367, "reward_std": 9.932541900070646e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3519 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.119170984455959, "grad_norm": 0.08522076308170079, "kl": 0.060546875, "learning_rate": 8.83419689119171e-08, "loss": -0.0015, "reward": 2.4999970197677612, "reward_std": 1.883630062593511e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3520 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.121761658031089, "grad_norm": 0.13492581941467, "kl": 0.1041259765625, "learning_rate": 8.808290155440414e-08, "loss": -0.0002, "reward": 2.4999977350234985, "reward_std": 1.5803756809873448e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3521 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.124352331606218, "grad_norm": 0.2602465144856269, "kl": 0.10205078125, "learning_rate": 8.782383419689118e-08, "loss": 0.0008, "reward": 2.499997854232788, "reward_std": 1.2697054216914694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3522 }, { "clip_ratio": 0.0, "completion_length": 35.4375, "epoch": 9.126943005181348, "grad_norm": 22.897018034269614, "kl": 0.1806640625, "learning_rate": 8.756476683937823e-08, "loss": 0.0008, "reward": 1.917019248008728, "reward_std": 0.17721241320396075, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4170193076133728, "step": 3523 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.129533678756477, "grad_norm": 1.0593552148541703, "kl": 0.05712890625, "learning_rate": 8.730569948186529e-08, "loss": -0.0006, "reward": 2.4999853372573853, "reward_std": 7.5435145845403895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999854564666748, "step": 3524 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.132124352331607, "grad_norm": 3.8027444160685127, "kl": 0.079345703125, "learning_rate": 8.704663212435232e-08, "loss": 0.0011, "reward": 1.999875783920288, "reward_std": 1.8557726207291125e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998756647109985, "step": 3525 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.134715025906736, "grad_norm": 0.2178348476107066, "kl": 0.0589599609375, "learning_rate": 8.678756476683938e-08, "loss": -0.0006, "reward": 2.4999955892562866, "reward_std": 1.9775399096033652e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3526 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.137305699481866, "grad_norm": 0.2545155255759983, "kl": 0.0849609375, "learning_rate": 8.652849740932643e-08, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 1.0359680970850604e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3527 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.139896373056995, "grad_norm": 382.3894202326106, "kl": 0.160400390625, "learning_rate": 8.626943005181347e-08, "loss": 0.0006, "reward": 1.5881276726722717, "reward_std": 0.2603294017026201, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.0881277322769165, "step": 3528 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.142487046632125, "grad_norm": 0.1452822484063602, "kl": 0.103271484375, "learning_rate": 8.601036269430051e-08, "loss": 0.0007, "reward": 2.49999737739563, "reward_std": 1.812907555631682e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3529 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.145077720207254, "grad_norm": 0.1718717387852575, "kl": 0.121337890625, "learning_rate": 8.575129533678756e-08, "loss": -0.0005, "reward": 2.499996304512024, "reward_std": 2.6523757696850225e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3530 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.147668393782384, "grad_norm": 3.427584256839214, "kl": 0.091552734375, "learning_rate": 8.54922279792746e-08, "loss": 0.0004, "reward": 1.9929132461547852, "reward_std": 0.00010289954838071935, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49291330575943, "step": 3531 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.150259067357513, "grad_norm": 0.17352339974584874, "kl": 0.19677734375, "learning_rate": 8.523316062176165e-08, "loss": -0.0002, "reward": 2.4999953508377075, "reward_std": 2.938101118843406e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3532 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 9.152849740932643, "grad_norm": 29.424021399266465, "kl": 0.148193359375, "learning_rate": 8.497409326424871e-08, "loss": 0.0003, "reward": 1.905815839767456, "reward_std": 0.0246942967598045, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4058158993721008, "step": 3533 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.155440414507773, "grad_norm": 0.02242225058304249, "kl": 0.15625, "learning_rate": 8.471502590673575e-08, "loss": -0.0001, "reward": 2.4999990463256836, "reward_std": 6.671853896023094e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 3534 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.158031088082902, "grad_norm": 0.3866209538219895, "kl": 0.072509765625, "learning_rate": 8.44559585492228e-08, "loss": 0.001, "reward": 2.499995231628418, "reward_std": 3.4890496181105846e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995231628418, "step": 3535 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.160621761658032, "grad_norm": 1.8572890997567275, "kl": 0.0772705078125, "learning_rate": 8.419689119170984e-08, "loss": 0.0014, "reward": 2.4999942779541016, "reward_std": 9.465374660067027e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994158744812, "step": 3536 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.163212435233161, "grad_norm": 0.24919291666141827, "kl": 0.044189453125, "learning_rate": 8.393782383419688e-08, "loss": -0.001, "reward": 2.499993681907654, "reward_std": 4.164619667790248e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999937415122986, "step": 3537 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.16580310880829, "grad_norm": 0.08768595879822642, "kl": 0.0816650390625, "learning_rate": 8.367875647668393e-08, "loss": 0.0, "reward": 2.4999974966049194, "reward_std": 1.355705023797782e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3538 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.16839378238342, "grad_norm": 40.88632913457326, "kl": 0.1669921875, "learning_rate": 8.341968911917098e-08, "loss": 0.0005, "reward": 1.9617998600006104, "reward_std": 0.005931414394581225, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4617998600006104, "step": 3539 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.17098445595855, "grad_norm": 0.19199105593260407, "kl": 0.1142578125, "learning_rate": 8.316062176165802e-08, "loss": 0.0015, "reward": 2.4999982118606567, "reward_std": 1.6946941627793422e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3540 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.17357512953368, "grad_norm": 0.23844546466452848, "kl": 0.066162109375, "learning_rate": 8.290155440414508e-08, "loss": -0.0, "reward": 2.499996066093445, "reward_std": 2.2397210841518245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3541 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.176165803108809, "grad_norm": 44.01359906022211, "kl": 0.13525390625, "learning_rate": 8.264248704663213e-08, "loss": 0.0004, "reward": 2.2498685717582703, "reward_std": 0.2673964417733714, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7498685121536255, "step": 3542 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.178756476683938, "grad_norm": 0.3352071062343994, "kl": 0.131591796875, "learning_rate": 8.238341968911918e-08, "loss": -0.0009, "reward": 2.4999728202819824, "reward_std": 5.211686584516428e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999730587005615, "step": 3543 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.181347150259068, "grad_norm": 10.411243708551389, "kl": 0.09765625, "learning_rate": 8.212435233160621e-08, "loss": 0.0011, "reward": 1.9981681108474731, "reward_std": 0.00020398637303742362, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498168170452118, "step": 3544 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.183937823834198, "grad_norm": 0.12909051668991595, "kl": 0.115234375, "learning_rate": 8.186528497409326e-08, "loss": 0.0016, "reward": 2.499998092651367, "reward_std": 1.796446099433524e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3545 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.186528497409327, "grad_norm": 0.6215682147978437, "kl": 0.126953125, "learning_rate": 8.160621761658031e-08, "loss": -0.0006, "reward": 2.49998140335083, "reward_std": 5.958425390417688e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999812841415405, "step": 3546 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.189119170984457, "grad_norm": 0.07262286035581746, "kl": 0.0701904296875, "learning_rate": 8.134715025906735e-08, "loss": 0.0001, "reward": 2.4999985694885254, "reward_std": 1.2613791682269948e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3547 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.191709844559586, "grad_norm": 0.05769615047668933, "kl": 0.146240234375, "learning_rate": 8.10880829015544e-08, "loss": 0.0001, "reward": 2.499993085861206, "reward_std": 1.6173233916561003e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999931454658508, "step": 3548 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.194300518134716, "grad_norm": 4.731128879029373, "kl": 0.129638671875, "learning_rate": 8.082901554404146e-08, "loss": 0.0002, "reward": 1.9995192289352417, "reward_std": 4.111810568474539e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499519169330597, "step": 3549 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.196891191709845, "grad_norm": 51.568961297176244, "kl": 0.1708984375, "learning_rate": 8.05699481865285e-08, "loss": 0.001, "reward": 1.9793486595153809, "reward_std": 0.002978515777613211, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.479348599910736, "step": 3550 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.199481865284975, "grad_norm": 0.39112334347163097, "kl": 0.07745361328125, "learning_rate": 8.031088082901554e-08, "loss": 0.0002, "reward": 2.4999947547912598, "reward_std": 3.431943355280964e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 3551 }, { "clip_ratio": 0.0, "completion_length": 34.625, "epoch": 9.202072538860104, "grad_norm": 59.83366888962206, "kl": 0.205078125, "learning_rate": 8.005181347150259e-08, "loss": 0.0004, "reward": 1.999030590057373, "reward_std": 0.0003342139985988979, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990307092666626, "step": 3552 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.204663212435234, "grad_norm": 22.18644158980627, "kl": 0.171875, "learning_rate": 7.979274611398963e-08, "loss": 0.0009, "reward": 1.6872472763061523, "reward_std": 0.258824537369037, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1872472763061523, "step": 3553 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.207253886010363, "grad_norm": 0.30171082233586743, "kl": 0.0665283203125, "learning_rate": 7.953367875647668e-08, "loss": -0.0005, "reward": 2.499996542930603, "reward_std": 2.6652656686110276e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3554 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.209844559585493, "grad_norm": 0.1899271042364944, "kl": 0.0611572265625, "learning_rate": 7.927461139896373e-08, "loss": 0.0008, "reward": 2.4999970197677612, "reward_std": 2.2224454028219043e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999970197677612, "step": 3555 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.212435233160623, "grad_norm": 0.19070259585616608, "kl": 0.0970458984375, "learning_rate": 7.901554404145077e-08, "loss": 0.0002, "reward": 2.49997341632843, "reward_std": 3.711537260642217e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999734163284302, "step": 3556 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.215025906735752, "grad_norm": 66.18177249721631, "kl": 0.144287109375, "learning_rate": 7.875647668393783e-08, "loss": 0.0006, "reward": 1.9833200573921204, "reward_std": 0.018461486029764274, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4833199977874756, "step": 3557 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.217616580310882, "grad_norm": 0.11933941238438604, "kl": 0.030670166015625, "learning_rate": 7.849740932642488e-08, "loss": -0.0, "reward": 2.499998092651367, "reward_std": 9.453159748318285e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3558 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.220207253886011, "grad_norm": 0.049305228015503536, "kl": 0.064453125, "learning_rate": 7.82383419689119e-08, "loss": 0.0001, "reward": 2.4999990463256836, "reward_std": 7.960260575146094e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3559 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.22279792746114, "grad_norm": 4.478637212503904, "kl": 0.1455078125, "learning_rate": 7.797927461139896e-08, "loss": 0.0008, "reward": 1.9800052642822266, "reward_std": 0.00016660140261137713, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.480005145072937, "step": 3560 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.22538860103627, "grad_norm": 0.05543515393198338, "kl": 0.12353515625, "learning_rate": 7.772020725388601e-08, "loss": 0.0, "reward": 2.499998450279236, "reward_std": 1.0431233761210024e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3561 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.2279792746114, "grad_norm": 0.11266086440535066, "kl": 0.0458984375, "learning_rate": 7.746113989637305e-08, "loss": 0.0006, "reward": 2.499998927116394, "reward_std": 1.146177766031542e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999990463256836, "step": 3562 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.23056994818653, "grad_norm": 0.08830150656926208, "kl": 0.0635986328125, "learning_rate": 7.72020725388601e-08, "loss": -0.0006, "reward": 2.499998092651367, "reward_std": 1.956743631126301e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3563 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.233160621761659, "grad_norm": 0.30706877443173847, "kl": 0.1435546875, "learning_rate": 7.694300518134715e-08, "loss": -0.0002, "reward": 2.4999972581863403, "reward_std": 2.4336883370779105e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3564 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.235751295336788, "grad_norm": 2.1614918213791796, "kl": 0.0849609375, "learning_rate": 7.66839378238342e-08, "loss": -0.0001, "reward": 1.9962731003761292, "reward_std": 5.611177903119824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4962729811668396, "step": 3565 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.238341968911918, "grad_norm": 0.18784275754313717, "kl": 0.129638671875, "learning_rate": 7.642487046632123e-08, "loss": 0.0002, "reward": 2.499996304512024, "reward_std": 2.130454390680825e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3566 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.240932642487047, "grad_norm": 6.149302593866085, "kl": 0.121337890625, "learning_rate": 7.616580310880829e-08, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 2.6487966806598706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3567 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.243523316062177, "grad_norm": 9.463312319003382, "kl": 0.195068359375, "learning_rate": 7.590673575129533e-08, "loss": 0.0015, "reward": 1.8640063405036926, "reward_std": 0.00027915232055875094, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3640061616897583, "step": 3568 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.246113989637305, "grad_norm": 0.18809941974911043, "kl": 0.0909423828125, "learning_rate": 7.564766839378238e-08, "loss": -0.0003, "reward": 2.4999616146087646, "reward_std": 4.8008005251176655e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999616146087646, "step": 3569 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.248704663212436, "grad_norm": 0.0638839898521669, "kl": 0.082275390625, "learning_rate": 7.538860103626943e-08, "loss": 0.0005, "reward": 2.499997854232788, "reward_std": 1.3220947323588916e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3570 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.251295336787564, "grad_norm": 0.08531829099653306, "kl": 0.0477294921875, "learning_rate": 7.512953367875647e-08, "loss": -0.0008, "reward": 2.4999977350234985, "reward_std": 1.5818905580999854e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3571 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 9.253886010362695, "grad_norm": 7.348409319816751, "kl": 0.102783203125, "learning_rate": 7.487046632124352e-08, "loss": 0.0005, "reward": 2.343742251396179, "reward_std": 0.44194799704609977, "rewards/format_reward_rec": 0.9375, "rewards/point_reward": 1.8749921917915344, "step": 3572 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.256476683937823, "grad_norm": 0.060739984690772277, "kl": 0.07080078125, "learning_rate": 7.461139896373056e-08, "loss": -0.0003, "reward": 2.499997615814209, "reward_std": 1.0749919425734333e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3573 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.259067357512953, "grad_norm": 5.698604906448317, "kl": 0.1328125, "learning_rate": 7.43523316062176e-08, "loss": -0.0002, "reward": 2.4374738931655884, "reward_std": 0.1767980893737331, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9374739527702332, "step": 3574 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.261658031088082, "grad_norm": 85.61837482286279, "kl": 0.096435546875, "learning_rate": 7.409326424870465e-08, "loss": 0.0011, "reward": 1.9946054220199585, "reward_std": 9.352893789582595e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4946053624153137, "step": 3575 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.264248704663212, "grad_norm": 5.105707741972881, "kl": 0.0498046875, "learning_rate": 7.383419689119171e-08, "loss": 0.0003, "reward": 1.9958272576332092, "reward_std": 7.92200760599826e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4958274364471436, "step": 3576 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.266839378238341, "grad_norm": 0.054476045106411054, "kl": 0.03497314453125, "learning_rate": 7.357512953367876e-08, "loss": -0.0015, "reward": 2.4999988079071045, "reward_std": 9.290054094890365e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3577 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.26943005181347, "grad_norm": 11.530677403527715, "kl": 0.1611328125, "learning_rate": 7.33160621761658e-08, "loss": 0.001, "reward": 1.9456124305725098, "reward_std": 0.00033581948025585007, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.445612370967865, "step": 3578 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.2720207253886, "grad_norm": 0.3580396667956142, "kl": 0.07958984375, "learning_rate": 7.305699481865285e-08, "loss": 0.0018, "reward": 2.499995708465576, "reward_std": 2.5989182859120774e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3579 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.27461139896373, "grad_norm": 0.18230858504597722, "kl": 0.073974609375, "learning_rate": 7.27979274611399e-08, "loss": 0.0007, "reward": 2.4999969005584717, "reward_std": 1.9945116491726367e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969005584717, "step": 3580 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.27720207253886, "grad_norm": 0.7284622978894505, "kl": 0.067138671875, "learning_rate": 7.253886010362693e-08, "loss": 0.001, "reward": 2.4999784231185913, "reward_std": 4.856955342802394e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999783635139465, "step": 3581 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.279792746113989, "grad_norm": 0.06370671993094809, "kl": 0.106689453125, "learning_rate": 7.227979274611398e-08, "loss": -0.0011, "reward": 2.499998688697815, "reward_std": 8.49075803444066e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3582 }, { "clip_ratio": 0.0, "completion_length": 35.6875, "epoch": 9.282383419689118, "grad_norm": 0.08272396935298938, "kl": 0.13690185546875, "learning_rate": 7.202072538860104e-08, "loss": 0.0008, "reward": 2.4999977350234985, "reward_std": 1.4120806781647843e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3583 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.284974093264248, "grad_norm": 3.356436285295582, "kl": 0.119873046875, "learning_rate": 7.176165803108808e-08, "loss": -0.0001, "reward": 1.9991903901100159, "reward_std": 1.2754125123137783e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4991905093193054, "step": 3584 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.287564766839377, "grad_norm": 0.10836975461592056, "kl": 0.119140625, "learning_rate": 7.150259067357513e-08, "loss": 0.0, "reward": 2.49999737739563, "reward_std": 2.6355995146332134e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3585 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.290155440414507, "grad_norm": 4.814612523173493, "kl": 0.127685546875, "learning_rate": 7.124352331606218e-08, "loss": 0.0015, "reward": 1.6810356974601746, "reward_std": 0.0003449342570149838, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.1810354590415955, "step": 3586 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.292746113989637, "grad_norm": 0.12166511666571837, "kl": 0.0565185546875, "learning_rate": 7.098445595854922e-08, "loss": -0.0005, "reward": 2.4999990463256836, "reward_std": 9.061700723123067e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3587 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.295336787564766, "grad_norm": 15.828515390982098, "kl": 0.1455078125, "learning_rate": 7.072538860103626e-08, "loss": 0.0003, "reward": 1.874607801437378, "reward_std": 0.0008432363763404283, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3746077418327332, "step": 3588 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.297927461139896, "grad_norm": 35.63865253352593, "kl": 0.08935546875, "learning_rate": 7.046632124352331e-08, "loss": 0.0005, "reward": 1.3149770498275757, "reward_std": 0.0009969968605219037, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8149770498275757, "step": 3589 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.300518134715025, "grad_norm": 0.12180622721924536, "kl": 0.081298828125, "learning_rate": 7.020725388601035e-08, "loss": -0.0007, "reward": 2.4999974966049194, "reward_std": 1.7089608377318655e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3590 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.303108808290155, "grad_norm": 0.2387823283182586, "kl": 0.094482421875, "learning_rate": 6.99481865284974e-08, "loss": -0.0001, "reward": 2.499997615814209, "reward_std": 1.6343295783372014e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3591 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.305699481865284, "grad_norm": 5.121165870513497, "kl": 0.412353515625, "learning_rate": 6.968911917098446e-08, "loss": 0.0022, "reward": 1.989859938621521, "reward_std": 0.00019694055595209647, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4898599088191986, "step": 3592 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.308290155440414, "grad_norm": 0.09566503437215755, "kl": 0.057373046875, "learning_rate": 6.94300518134715e-08, "loss": 0.0005, "reward": 2.4999961853027344, "reward_std": 1.6610765385394188e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3593 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.310880829015543, "grad_norm": 8.127659945758918, "kl": 0.174560546875, "learning_rate": 6.917098445595855e-08, "loss": 0.0008, "reward": 1.8960446119308472, "reward_std": 0.0008942767196344903, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3960447311401367, "step": 3594 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.313471502590673, "grad_norm": 2.029263531863521, "kl": 0.28759765625, "learning_rate": 6.89119170984456e-08, "loss": 0.0004, "reward": 1.9993433952331543, "reward_std": 3.0199084676496568e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499343454837799, "step": 3595 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.316062176165802, "grad_norm": 0.5439808564060447, "kl": 0.1435546875, "learning_rate": 6.865284974093263e-08, "loss": 0.0007, "reward": 2.499991297721863, "reward_std": 4.223207042741706e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999912977218628, "step": 3596 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.318652849740932, "grad_norm": 0.28356501550044233, "kl": 0.0523681640625, "learning_rate": 6.839378238341968e-08, "loss": 0.0003, "reward": 2.4999924898147583, "reward_std": 4.477263473745552e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999922513961792, "step": 3597 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.321243523316062, "grad_norm": 1.0108189392601912, "kl": 0.13818359375, "learning_rate": 6.813471502590673e-08, "loss": -0.0, "reward": 2.499977946281433, "reward_std": 6.1509495026257355e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999780654907227, "step": 3598 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.323834196891191, "grad_norm": 0.13533786568396786, "kl": 0.08099365234375, "learning_rate": 6.787564766839377e-08, "loss": 0.0003, "reward": 2.49999737739563, "reward_std": 2.015069412664161e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3599 }, { "clip_ratio": 0.0, "completion_length": 34.8125, "epoch": 9.32642487046632, "grad_norm": 32.67606220041922, "kl": 0.0662841796875, "learning_rate": 6.761658031088083e-08, "loss": 0.001, "reward": 1.9711965322494507, "reward_std": 0.011381687509015137, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.471196472644806, "step": 3600 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.32901554404145, "grad_norm": 0.5901533549375167, "kl": 0.1220703125, "learning_rate": 6.735751295336788e-08, "loss": 0.0012, "reward": 2.4999947547912598, "reward_std": 3.092614804245386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3601 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.33160621761658, "grad_norm": 4.801469099867553, "kl": 0.1083984375, "learning_rate": 6.709844559585492e-08, "loss": 0.0006, "reward": 1.957410216331482, "reward_std": 0.00013167196721042274, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.457410216331482, "step": 3602 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.33419689119171, "grad_norm": 14.110597411739683, "kl": 0.135986328125, "learning_rate": 6.683937823834196e-08, "loss": 0.0013, "reward": 1.9545118808746338, "reward_std": 0.0002989301599427563, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.454511821269989, "step": 3603 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.336787564766839, "grad_norm": 0.295404537875881, "kl": 0.0694580078125, "learning_rate": 6.658031088082901e-08, "loss": 0.0, "reward": 2.4998905658721924, "reward_std": 5.8229151136401924e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999890685081482, "step": 3604 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.339378238341968, "grad_norm": 0.2724704061190899, "kl": 0.104248046875, "learning_rate": 6.632124352331605e-08, "loss": 0.0007, "reward": 2.499996542930603, "reward_std": 2.671846004886902e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3605 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.341968911917098, "grad_norm": 0.46305444844303134, "kl": 0.050537109375, "learning_rate": 6.60621761658031e-08, "loss": 0.0001, "reward": 2.49999463558197, "reward_std": 3.4339132071181666e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3606 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.344559585492227, "grad_norm": 0.09952458202139255, "kl": 0.1064453125, "learning_rate": 6.580310880829015e-08, "loss": -0.0003, "reward": 2.4999983310699463, "reward_std": 1.0012257547487025e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3607 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.347150259067357, "grad_norm": 2.4675747010684383, "kl": 0.133056640625, "learning_rate": 6.554404145077721e-08, "loss": 0.0014, "reward": 1.9998574256896973, "reward_std": 2.1428109960197617e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998573064804077, "step": 3608 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.349740932642487, "grad_norm": 0.24816116122884752, "kl": 0.1107177734375, "learning_rate": 6.528497409326425e-08, "loss": 0.0002, "reward": 2.4999969005584717, "reward_std": 2.031754263498442e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3609 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.352331606217616, "grad_norm": 0.11201659391185052, "kl": 0.0660400390625, "learning_rate": 6.50259067357513e-08, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 1.7613856471143663e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3610 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.354922279792746, "grad_norm": 0.5843346425668478, "kl": 0.36279296875, "learning_rate": 6.476683937823834e-08, "loss": 0.002, "reward": 2.4999959468841553, "reward_std": 3.3874634937092196e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3611 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.357512953367875, "grad_norm": 0.5104489376957605, "kl": 0.0595703125, "learning_rate": 6.450777202072538e-08, "loss": 0.001, "reward": 2.499995470046997, "reward_std": 2.5525324360842205e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3612 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.360103626943005, "grad_norm": 35.61516971864077, "kl": 0.0833740234375, "learning_rate": 6.424870466321243e-08, "loss": 0.0, "reward": 2.1873852014541626, "reward_std": 0.25886790680993954, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6873852610588074, "step": 3613 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.362694300518134, "grad_norm": 0.2584191548961745, "kl": 0.07379150390625, "learning_rate": 6.398963730569948e-08, "loss": 0.0003, "reward": 2.499998450279236, "reward_std": 2.033572002346773e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3614 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.365284974093264, "grad_norm": 0.3458065985445571, "kl": 0.10986328125, "learning_rate": 6.373056994818652e-08, "loss": 0.0011, "reward": 2.499996066093445, "reward_std": 3.680582778997632e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3615 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.367875647668393, "grad_norm": 0.4211593887114756, "kl": 0.1151123046875, "learning_rate": 6.347150259067358e-08, "loss": -0.0003, "reward": 2.499997854232788, "reward_std": 2.4314286974913557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3616 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.370466321243523, "grad_norm": 0.03900944177296487, "kl": 0.104736328125, "learning_rate": 6.321243523316063e-08, "loss": -0.0, "reward": 2.499998927116394, "reward_std": 8.31658951483405e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3617 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.373056994818652, "grad_norm": 0.2761578371508049, "kl": 0.078125, "learning_rate": 6.295336787564765e-08, "loss": -0.0007, "reward": 2.4999964237213135, "reward_std": 2.360036205573124e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3618 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.375647668393782, "grad_norm": 0.3030193111357959, "kl": 0.0391845703125, "learning_rate": 6.269430051813471e-08, "loss": 0.001, "reward": 2.4999979734420776, "reward_std": 1.9500905068525753e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3619 }, { "clip_ratio": 0.0, "completion_length": 35.3125, "epoch": 9.378238341968911, "grad_norm": 0.08436944271601664, "kl": 0.19384765625, "learning_rate": 6.243523316062176e-08, "loss": 0.0021, "reward": 2.4999964237213135, "reward_std": 1.2308389045756485e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961256980896, "step": 3620 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.380829015544041, "grad_norm": 7.081680726591933, "kl": 0.0772705078125, "learning_rate": 6.21761658031088e-08, "loss": 0.0, "reward": 2.124963700771332, "reward_std": 0.23146752042606522, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.6249637007713318, "step": 3621 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.38341968911917, "grad_norm": 0.3266231682797563, "kl": 0.1112060546875, "learning_rate": 6.191709844559585e-08, "loss": 0.0, "reward": 2.499989151954651, "reward_std": 2.3708434468971973e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999891519546509, "step": 3622 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.3860103626943, "grad_norm": 0.39307842836447787, "kl": 0.072509765625, "learning_rate": 6.16580310880829e-08, "loss": -0.0003, "reward": 2.499995708465576, "reward_std": 3.032876861652767e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3623 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.38860103626943, "grad_norm": 0.10557029022008539, "kl": 0.084625244140625, "learning_rate": 6.139896373056994e-08, "loss": 0.0009, "reward": 2.4999935626983643, "reward_std": 1.7066317070657533e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935626983643, "step": 3624 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.39119170984456, "grad_norm": 0.15421166946179368, "kl": 0.074462890625, "learning_rate": 6.1139896373057e-08, "loss": 0.0012, "reward": 2.499983072280884, "reward_std": 2.8139048708908376e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831318855286, "step": 3625 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.393782383419689, "grad_norm": 0.10513957015954604, "kl": 0.06451416015625, "learning_rate": 6.088082901554404e-08, "loss": -0.0006, "reward": 2.4999945163726807, "reward_std": 2.189960468967911e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999994695186615, "step": 3626 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.396373056994818, "grad_norm": 1.417496060470017, "kl": 0.103515625, "learning_rate": 6.062176165803109e-08, "loss": 0.0009, "reward": 2.499958038330078, "reward_std": 1.1851030990328582e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999580383300781, "step": 3627 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.398963730569948, "grad_norm": 0.1918987255820987, "kl": 0.103515625, "learning_rate": 6.036269430051813e-08, "loss": -0.0, "reward": 2.49999737739563, "reward_std": 2.124066099895572e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3628 }, { "clip_ratio": 0.0, "completion_length": 33.0, "epoch": 9.401554404145077, "grad_norm": 4.440452023652074, "kl": 0.054443359375, "learning_rate": 6.010362694300518e-08, "loss": 0.0008, "reward": 2.49999737739563, "reward_std": 2.110358764184639e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3629 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.404145077720207, "grad_norm": 1.450613671842467, "kl": 0.341796875, "learning_rate": 5.984455958549222e-08, "loss": 0.0011, "reward": 2.4999969005584717, "reward_std": 3.0125004286674084e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3630 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.406735751295336, "grad_norm": 0.058651296796458065, "kl": 0.0384521484375, "learning_rate": 5.958549222797927e-08, "loss": -0.0012, "reward": 2.4999979734420776, "reward_std": 1.5050289050577703e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3631 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.409326424870466, "grad_norm": 3.0731405616430374, "kl": 0.236572265625, "learning_rate": 5.932642487046632e-08, "loss": 0.0007, "reward": 2.499989628791809, "reward_std": 1.2769949989888119e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999895095825195, "step": 3632 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.411917098445596, "grad_norm": 13.293527059216853, "kl": 0.09375, "learning_rate": 5.9067357512953366e-08, "loss": 0.0, "reward": 2.499670386314392, "reward_std": 3.277077087204816e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9996705055236816, "step": 3633 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.414507772020725, "grad_norm": 0.19345136768039403, "kl": 0.067138671875, "learning_rate": 5.880829015544041e-08, "loss": -0.0003, "reward": 2.49999737739563, "reward_std": 1.4691054843751772e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999973773956299, "step": 3634 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.417098445595855, "grad_norm": 8.747969626891217, "kl": 0.21630859375, "learning_rate": 5.854922279792746e-08, "loss": 0.0013, "reward": 1.9231637716293335, "reward_std": 0.00021430487868201453, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4231637716293335, "step": 3635 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.419689119170984, "grad_norm": 0.47868159716290337, "kl": 0.128662109375, "learning_rate": 5.8290155440414504e-08, "loss": 0.0007, "reward": 2.4999938011169434, "reward_std": 3.062269684050989e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3636 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.422279792746114, "grad_norm": 0.09541574476558122, "kl": 0.0611572265625, "learning_rate": 5.803108808290155e-08, "loss": 0.0008, "reward": 2.499998092651367, "reward_std": 1.047329249104223e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3637 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.424870466321243, "grad_norm": 0.11362960260906027, "kl": 0.0628662109375, "learning_rate": 5.77720207253886e-08, "loss": 0.001, "reward": 2.499998092651367, "reward_std": 2.2992157937551383e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3638 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.427461139896373, "grad_norm": 6.134415838458829, "kl": 0.08642578125, "learning_rate": 5.751295336787564e-08, "loss": 0.0003, "reward": 2.499982714653015, "reward_std": 8.890999652066967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999826550483704, "step": 3639 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.430051813471502, "grad_norm": 0.49040575881683257, "kl": 0.09130859375, "learning_rate": 5.725388601036269e-08, "loss": 0.0008, "reward": 2.4999970197677612, "reward_std": 2.80291072840555e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3640 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.432642487046632, "grad_norm": 3.5243030754447786, "kl": 0.2208251953125, "learning_rate": 5.699481865284974e-08, "loss": 0.0008, "reward": 2.4999918937683105, "reward_std": 1.4580912818473735e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999920129776, "step": 3641 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.435233160621761, "grad_norm": 0.5889110884267945, "kl": 0.0775146484375, "learning_rate": 5.673575129533679e-08, "loss": 0.0006, "reward": 2.4999964237213135, "reward_std": 3.330167032800091e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3642 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.437823834196891, "grad_norm": 1.240213488667874, "kl": 0.0423583984375, "learning_rate": 5.647668393782383e-08, "loss": 0.0006, "reward": 2.4999924898147583, "reward_std": 5.2077742793699144e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992549419403, "step": 3643 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.44041450777202, "grad_norm": 1.467725232439466, "kl": 0.23291015625, "learning_rate": 5.621761658031088e-08, "loss": 0.0014, "reward": 1.8303985595703125, "reward_std": 0.0004249216890457319, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3303985595703125, "step": 3644 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.44300518134715, "grad_norm": 3.26458823444125, "kl": 0.154296875, "learning_rate": 5.5958549222797925e-08, "loss": 0.0007, "reward": 1.4976417422294617, "reward_std": 4.918265858577797e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9976416826248169, "step": 3645 }, { "clip_ratio": 0.0, "completion_length": 35.8125, "epoch": 9.44559585492228, "grad_norm": 3.8004642679733864, "kl": 0.164794921875, "learning_rate": 5.569948186528497e-08, "loss": 0.0007, "reward": 1.9941259622573853, "reward_std": 8.015541732220299e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.49412602186203, "step": 3646 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.44818652849741, "grad_norm": 0.13173752653409068, "kl": 0.080810546875, "learning_rate": 5.544041450777202e-08, "loss": 0.0013, "reward": 2.499998092651367, "reward_std": 1.6107695159917057e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3647 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.450777202072539, "grad_norm": 0.17450077222535135, "kl": 0.069580078125, "learning_rate": 5.518134715025906e-08, "loss": 0.001, "reward": 2.4999953508377075, "reward_std": 2.054399175221988e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999952912330627, "step": 3648 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.453367875647668, "grad_norm": 14.506631067124342, "kl": 0.0673828125, "learning_rate": 5.4922279792746116e-08, "loss": 0.0012, "reward": 1.9867181181907654, "reward_std": 0.0002071681576865103, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4867179989814758, "step": 3649 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.455958549222798, "grad_norm": 4.7276289690884425, "kl": 0.0369873046875, "learning_rate": 5.4663212435233155e-08, "loss": 0.0006, "reward": 2.499996542930603, "reward_std": 3.926990416402987e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3650 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.458549222797927, "grad_norm": 3.1064158327449993, "kl": 0.17919921875, "learning_rate": 5.44041450777202e-08, "loss": -0.0, "reward": 1.9979050159454346, "reward_std": 3.69726496955991e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.497905194759369, "step": 3651 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.461139896373057, "grad_norm": 0.13947908017912858, "kl": 0.09521484375, "learning_rate": 5.4145077720207254e-08, "loss": 0.0002, "reward": 2.4999985694885254, "reward_std": 1.6200368690988398e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999987483024597, "step": 3652 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.463730569948186, "grad_norm": 19.222377050730202, "kl": 0.0992431640625, "learning_rate": 5.38860103626943e-08, "loss": 0.0009, "reward": 1.9998762607574463, "reward_std": 3.6580888263415545e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499876320362091, "step": 3653 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.466321243523316, "grad_norm": 1.9369239234206173, "kl": 0.09619140625, "learning_rate": 5.362694300518134e-08, "loss": 0.0009, "reward": 1.9996901154518127, "reward_std": 1.6046079622356046e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499690055847168, "step": 3654 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.468911917098445, "grad_norm": 0.429748900711671, "kl": 0.0440673828125, "learning_rate": 5.336787564766839e-08, "loss": -0.0006, "reward": 2.4999961853027344, "reward_std": 4.332306730248092e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3655 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.471502590673575, "grad_norm": 0.36464921725739535, "kl": 0.0703125, "learning_rate": 5.310880829015544e-08, "loss": 0.0003, "reward": 2.499996781349182, "reward_std": 2.34987766134509e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3656 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.474093264248705, "grad_norm": 2.207658454892464, "kl": 0.100341796875, "learning_rate": 5.284974093264249e-08, "loss": 0.0001, "reward": 2.499990940093994, "reward_std": 7.557295589322166e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999990999698639, "step": 3657 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.476683937823834, "grad_norm": 0.10123703854278315, "kl": 0.0601806640625, "learning_rate": 5.259067357512953e-08, "loss": -0.0004, "reward": 2.4999974966049194, "reward_std": 1.8297976112080505e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3658 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.479274611398964, "grad_norm": 0.4829703801768173, "kl": 0.0499267578125, "learning_rate": 5.2331606217616577e-08, "loss": 0.0009, "reward": 2.4999966621398926, "reward_std": 1.916402482038393e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3659 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.481865284974093, "grad_norm": 0.10825806037919034, "kl": 0.075439453125, "learning_rate": 5.207253886010363e-08, "loss": 0.001, "reward": 2.4999990463256836, "reward_std": 1.1330558180588923e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999999225139618, "step": 3660 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.484455958549223, "grad_norm": 0.8525804929619037, "kl": 0.131103515625, "learning_rate": 5.181347150259067e-08, "loss": 0.001, "reward": 1.9998544454574585, "reward_std": 1.0202433102790565e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998544752597809, "step": 3661 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.487046632124352, "grad_norm": 0.13608136650897315, "kl": 0.072265625, "learning_rate": 5.1554404145077715e-08, "loss": 0.0009, "reward": 2.4999974966049194, "reward_std": 1.3136293972593194e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3662 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.489637305699482, "grad_norm": 0.0538713501862259, "kl": 0.0609130859375, "learning_rate": 5.129533678756477e-08, "loss": 0.0002, "reward": 2.4999990463256836, "reward_std": 1.1685926608606678e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3663 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.492227979274611, "grad_norm": 2.899910100498706, "kl": 0.2216796875, "learning_rate": 5.1036269430051813e-08, "loss": 0.0011, "reward": 1.9963432550430298, "reward_std": 4.696190148933965e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4963432550430298, "step": 3664 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.494818652849741, "grad_norm": 0.1981563280646292, "kl": 0.09619140625, "learning_rate": 5.077720207253885e-08, "loss": -0.0011, "reward": 2.499997615814209, "reward_std": 1.6911328089008748e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3665 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.49740932642487, "grad_norm": 0.13234890318077616, "kl": 0.16943359375, "learning_rate": 5.0518134715025906e-08, "loss": 0.0007, "reward": 2.499997138977051, "reward_std": 3.024118313987856e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3666 }, { "clip_ratio": 0.0, "completion_length": 35.5625, "epoch": 9.5, "grad_norm": 0.11170018025077244, "kl": 0.059814453125, "learning_rate": 5.025906735751295e-08, "loss": 0.0016, "reward": 2.4999983310699463, "reward_std": 1.312384256380028e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3667 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.50259067357513, "grad_norm": 0.21624890008862552, "kl": 0.094970703125, "learning_rate": 5e-08, "loss": -0.0001, "reward": 2.4999983310699463, "reward_std": 1.6742280308790214e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3668 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.505181347150259, "grad_norm": 0.1278872385181617, "kl": 0.05206298828125, "learning_rate": 4.9740932642487044e-08, "loss": 0.0006, "reward": 2.4999979734420776, "reward_std": 1.7489659853708872e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3669 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.507772020725389, "grad_norm": 1.0912035333022394, "kl": 0.07861328125, "learning_rate": 4.948186528497409e-08, "loss": -0.0013, "reward": 2.499992609024048, "reward_std": 4.081167844560696e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999929070472717, "step": 3670 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.510362694300518, "grad_norm": 0.3137507044354098, "kl": 0.053985595703125, "learning_rate": 4.9222797927461136e-08, "loss": 0.0001, "reward": 2.4999972581863403, "reward_std": 2.11338402777983e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3671 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.512953367875648, "grad_norm": 0.041073730948850966, "kl": 0.07342529296875, "learning_rate": 4.896373056994819e-08, "loss": -0.0002, "reward": 2.4999988079071045, "reward_std": 8.696125348706119e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999988079071045, "step": 3672 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.515544041450777, "grad_norm": 30.72660172007966, "kl": 0.1531982421875, "learning_rate": 4.870466321243523e-08, "loss": 0.0001, "reward": 1.9995241165161133, "reward_std": 9.257665772111068e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995240569114685, "step": 3673 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.518134715025907, "grad_norm": 0.2642352667876703, "kl": 0.13818359375, "learning_rate": 4.844559585492228e-08, "loss": 0.0003, "reward": 2.4999903440475464, "reward_std": 3.7532283272412315e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999902844429016, "step": 3674 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.520725388601036, "grad_norm": 0.16631709231888067, "kl": 0.14208984375, "learning_rate": 4.818652849740933e-08, "loss": 0.0013, "reward": 2.4999961853027344, "reward_std": 1.4295671917352593e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960660934448, "step": 3675 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.523316062176166, "grad_norm": 2.08541732953423, "kl": 0.084228515625, "learning_rate": 4.7927461139896366e-08, "loss": 0.0005, "reward": 1.999800682067871, "reward_std": 1.675287882108023e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.499800682067871, "step": 3676 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.525906735751295, "grad_norm": 0.05609196010699713, "kl": 0.07025146484375, "learning_rate": 4.766839378238342e-08, "loss": 0.0004, "reward": 2.499998450279236, "reward_std": 1.1969843285442039e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3677 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.528497409326425, "grad_norm": 1.1533177048416927, "kl": 0.04803466796875, "learning_rate": 4.7409326424870465e-08, "loss": 0.0003, "reward": 2.4999929666519165, "reward_std": 5.892880835745018e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999926686286926, "step": 3678 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.531088082901555, "grad_norm": 0.46260536400298147, "kl": 0.05987548828125, "learning_rate": 4.715025906735751e-08, "loss": 0.0001, "reward": 2.499995231628418, "reward_std": 3.915376282748184e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999953508377075, "step": 3679 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.533678756476684, "grad_norm": 0.29252599112808325, "kl": 0.140380859375, "learning_rate": 4.689119170984456e-08, "loss": 0.0012, "reward": 1.9999163150787354, "reward_std": 5.6195223692157015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4999162256717682, "step": 3680 }, { "clip_ratio": 0.0, "completion_length": 34.5625, "epoch": 9.536269430051814, "grad_norm": 44.67028084156893, "kl": 0.08837890625, "learning_rate": 4.66321243523316e-08, "loss": 0.0003, "reward": 1.3175342679023743, "reward_std": 0.2011123927659355, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.8175342977046967, "step": 3681 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.538860103626943, "grad_norm": 0.7972440528709072, "kl": 0.1826171875, "learning_rate": 4.637305699481865e-08, "loss": 0.001, "reward": 2.4999953508377075, "reward_std": 5.055637643636146e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3682 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.541450777202073, "grad_norm": 0.08480518440858621, "kl": 0.045562744140625, "learning_rate": 4.61139896373057e-08, "loss": 0.0, "reward": 2.499998092651367, "reward_std": 1.068411563664995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3683 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.544041450777202, "grad_norm": 2.9252045969231006, "kl": 0.15283203125, "learning_rate": 4.585492227979274e-08, "loss": 0.001, "reward": 1.8866026401519775, "reward_std": 0.00025362599205891456, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3866024613380432, "step": 3684 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.546632124352332, "grad_norm": 2.4330944849459915, "kl": 0.17138671875, "learning_rate": 4.559585492227979e-08, "loss": 0.0008, "reward": 2.499996304512024, "reward_std": 5.3129729167267215e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3685 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.549222797927461, "grad_norm": 0.30175427060245463, "kl": 0.092529296875, "learning_rate": 4.533678756476684e-08, "loss": 0.0007, "reward": 2.499994993209839, "reward_std": 3.3093471074607805e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 3686 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.55181347150259, "grad_norm": 1.6565226912264372, "kl": 0.080078125, "learning_rate": 4.5077720207253886e-08, "loss": -0.0004, "reward": 2.4999821186065674, "reward_std": 8.794115728960605e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999821186065674, "step": 3687 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.55440414507772, "grad_norm": 0.43844046473212195, "kl": 0.09033203125, "learning_rate": 4.4818652849740926e-08, "loss": 0.0007, "reward": 2.4999982118606567, "reward_std": 2.3450434696314915e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998152256012, "step": 3688 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.55699481865285, "grad_norm": 0.2765550217688546, "kl": 0.069580078125, "learning_rate": 4.455958549222798e-08, "loss": 0.0001, "reward": 2.4999977350234985, "reward_std": 2.760797855216879e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3689 }, { "clip_ratio": 0.0, "completion_length": 34.6875, "epoch": 9.55958549222798, "grad_norm": 0.2689603541677444, "kl": 0.375, "learning_rate": 4.4300518134715024e-08, "loss": 0.0015, "reward": 1.4999990463256836, "reward_std": 1.0095153584188665e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999991059303284, "step": 3690 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.562176165803109, "grad_norm": 0.10081432188371749, "kl": 0.048828125, "learning_rate": 4.404145077720207e-08, "loss": -0.0002, "reward": 2.49999737739563, "reward_std": 2.1196274246904068e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3691 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.564766839378239, "grad_norm": 0.09022589428825767, "kl": 0.095947265625, "learning_rate": 4.3782383419689116e-08, "loss": 0.001, "reward": 2.499998450279236, "reward_std": 9.024412861435849e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3692 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.567357512953368, "grad_norm": 0.06521363436271893, "kl": 0.10595703125, "learning_rate": 4.352331606217616e-08, "loss": 0.001, "reward": 2.499998688697815, "reward_std": 1.3152069300303992e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999986290931702, "step": 3693 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.569948186528498, "grad_norm": 0.4996267812739185, "kl": 0.17529296875, "learning_rate": 4.3264248704663215e-08, "loss": 0.0007, "reward": 2.499996304512024, "reward_std": 3.1918979175316053e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3694 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.572538860103627, "grad_norm": 0.03102964250792375, "kl": 0.062255859375, "learning_rate": 4.3005181347150255e-08, "loss": 0.0002, "reward": 2.499998688697815, "reward_std": 8.088461527222535e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3695 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.575129533678757, "grad_norm": 0.5968775988121061, "kl": 0.0966796875, "learning_rate": 4.27461139896373e-08, "loss": 0.0004, "reward": 2.499991774559021, "reward_std": 7.86671148489404e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 3696 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.577720207253886, "grad_norm": 4.124912297052042, "kl": 0.2158203125, "learning_rate": 4.2487046632124353e-08, "loss": 0.0002, "reward": 1.9745756387710571, "reward_std": 0.0001279445177715388, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4745756387710571, "step": 3697 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.580310880829016, "grad_norm": 0.11706880833138766, "kl": 0.0556640625, "learning_rate": 4.22279792746114e-08, "loss": -0.0008, "reward": 2.4999895095825195, "reward_std": 1.6584326658630744e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999989628791809, "step": 3698 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.582901554404145, "grad_norm": 0.04730237244760757, "kl": 0.10577392578125, "learning_rate": 4.196891191709844e-08, "loss": 0.0001, "reward": 2.4999983310699463, "reward_std": 1.1811566480446345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3699 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.585492227979275, "grad_norm": 0.024967484034547332, "kl": 0.107147216796875, "learning_rate": 4.170984455958549e-08, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 9.510072516150103e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3700 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.588082901554404, "grad_norm": 121.17959767493448, "kl": 0.14208984375, "learning_rate": 4.145077720207254e-08, "loss": 0.0015, "reward": 1.9833029508590698, "reward_std": 0.00023415576535512628, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4833029210567474, "step": 3701 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.590673575129534, "grad_norm": 0.1787512941554565, "kl": 0.070068359375, "learning_rate": 4.119170984455959e-08, "loss": -0.0, "reward": 2.4999985694885254, "reward_std": 1.3462990580137557e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998688697815, "step": 3702 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.593264248704664, "grad_norm": 0.11947837538790254, "kl": 0.0731201171875, "learning_rate": 4.093264248704663e-08, "loss": 0.0006, "reward": 2.4999983310699463, "reward_std": 1.4272621342570346e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3703 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.595854922279793, "grad_norm": 2.0841880998719566, "kl": 0.1092529296875, "learning_rate": 4.0673575129533676e-08, "loss": 0.0004, "reward": 2.499931573867798, "reward_std": 1.6917330569299338e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999931812286377, "step": 3704 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.598445595854923, "grad_norm": 0.4801868486753808, "kl": 0.18701171875, "learning_rate": 4.041450777202073e-08, "loss": 0.0009, "reward": 2.4999905824661255, "reward_std": 1.6072878850081906e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999906420707703, "step": 3705 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.601036269430052, "grad_norm": 0.09784307702882991, "kl": 0.0555419921875, "learning_rate": 4.015544041450777e-08, "loss": 0.0005, "reward": 2.499998927116394, "reward_std": 5.41505286832944e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998927116394, "step": 3706 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.603626943005182, "grad_norm": 0.2432695154534563, "kl": 0.077880859375, "learning_rate": 3.9896373056994814e-08, "loss": 0.0002, "reward": 2.4999951124191284, "reward_std": 3.6479295886238106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999951124191284, "step": 3707 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.606217616580311, "grad_norm": 0.2059775111423504, "kl": 0.1259765625, "learning_rate": 3.9637305699481867e-08, "loss": 0.0005, "reward": 2.4999964237213135, "reward_std": 1.7242742842427106e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3708 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.60880829015544, "grad_norm": 0.4729351482880418, "kl": 0.067626953125, "learning_rate": 3.937823834196891e-08, "loss": -0.0005, "reward": 2.499995470046997, "reward_std": 1.924216746829188e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3709 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.61139896373057, "grad_norm": 0.06302451880046785, "kl": 0.01739501953125, "learning_rate": 3.911917098445595e-08, "loss": -0.0004, "reward": 2.499998092651367, "reward_std": 1.1708349632044701e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3710 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.6139896373057, "grad_norm": 1.2908286275947682, "kl": 0.109375, "learning_rate": 3.8860103626943005e-08, "loss": 0.0012, "reward": 2.4999916553497314, "reward_std": 5.72933799958264e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999914169311523, "step": 3711 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.61658031088083, "grad_norm": 0.16426915608836806, "kl": 0.05474853515625, "learning_rate": 3.860103626943005e-08, "loss": -0.0002, "reward": 2.499995470046997, "reward_std": 1.9299259577110206e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3712 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.619170984455959, "grad_norm": 0.3283778988085709, "kl": 0.04254150390625, "learning_rate": 3.83419689119171e-08, "loss": 0.0006, "reward": 2.499997138977051, "reward_std": 2.7839587346534245e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3713 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.621761658031089, "grad_norm": 0.6653877930392398, "kl": 0.083251953125, "learning_rate": 3.808290155440414e-08, "loss": -0.0002, "reward": 2.499993324279785, "reward_std": 4.290542960916355e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999935030937195, "step": 3714 }, { "clip_ratio": 0.0, "completion_length": 35.0625, "epoch": 9.624352331606218, "grad_norm": 3.837787270262384, "kl": 0.15380859375, "learning_rate": 3.782383419689119e-08, "loss": 0.0011, "reward": 2.499161958694458, "reward_std": 1.9703396219483693e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9991618990898132, "step": 3715 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.626943005181348, "grad_norm": 2.7144190963613366, "kl": 0.06256103515625, "learning_rate": 3.7564766839378235e-08, "loss": -0.0002, "reward": 2.4999953508377075, "reward_std": 8.641868362246896e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955296516418, "step": 3716 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.629533678756477, "grad_norm": 10.60515196088003, "kl": 0.11834716796875, "learning_rate": 3.730569948186528e-08, "loss": 0.0014, "reward": 1.9977213144302368, "reward_std": 9.293627499573631e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4977212250232697, "step": 3717 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.632124352331607, "grad_norm": 5.168370874686422, "kl": 0.120697021484375, "learning_rate": 3.704663212435233e-08, "loss": 0.0002, "reward": 1.822607696056366, "reward_std": 0.00047588109543994506, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3226077854633331, "step": 3718 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.634715025906736, "grad_norm": 1.1372807076535765, "kl": 0.0670166015625, "learning_rate": 3.678756476683938e-08, "loss": 0.0004, "reward": 2.499993920326233, "reward_std": 5.527530674953596e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3719 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.637305699481866, "grad_norm": 0.8630472185584769, "kl": 0.10107421875, "learning_rate": 3.6528497409326426e-08, "loss": 0.0004, "reward": 2.4999948740005493, "reward_std": 6.338167395369965e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999947547912598, "step": 3720 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.639896373056995, "grad_norm": 0.14228762130486428, "kl": 0.080322265625, "learning_rate": 3.6269430051813465e-08, "loss": 0.0009, "reward": 2.4999951124191284, "reward_std": 3.8539049569408235e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999949932098389, "step": 3721 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.642487046632125, "grad_norm": 0.7850317170906873, "kl": 0.079833984375, "learning_rate": 3.601036269430052e-08, "loss": 0.0009, "reward": 2.4999932050704956, "reward_std": 4.227991894367733e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 3722 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.645077720207254, "grad_norm": 1.776056240170037, "kl": 0.101806640625, "learning_rate": 3.5751295336787564e-08, "loss": 0.0009, "reward": 1.9998681545257568, "reward_std": 8.81521782503114e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4998682141304016, "step": 3723 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.647668393782384, "grad_norm": 0.06614170299848908, "kl": 0.068359375, "learning_rate": 3.549222797927461e-08, "loss": 0.0004, "reward": 2.4999380111694336, "reward_std": 2.5189199277519947e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999937891960144, "step": 3724 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.650259067357513, "grad_norm": 62.65769396126327, "kl": 0.107421875, "learning_rate": 3.5233160621761656e-08, "loss": 0.0006, "reward": 1.8926363587379456, "reward_std": 0.00041003531885053235, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3926363289356232, "step": 3725 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.652849740932643, "grad_norm": 1.902947433846418, "kl": 0.130615234375, "learning_rate": 3.49740932642487e-08, "loss": 0.0002, "reward": 1.8233218789100647, "reward_std": 0.00016650663928885479, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.3233221173286438, "step": 3726 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.655440414507773, "grad_norm": 0.3671349059924395, "kl": 0.091552734375, "learning_rate": 3.471502590673575e-08, "loss": 0.0012, "reward": 2.499998092651367, "reward_std": 1.5305606950732908e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3727 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.658031088082902, "grad_norm": 0.2572672720997696, "kl": 0.13037109375, "learning_rate": 3.44559585492228e-08, "loss": 0.0007, "reward": 2.4999889135360718, "reward_std": 2.941994694083405e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887347221375, "step": 3728 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.660621761658032, "grad_norm": 0.35747154345498017, "kl": 0.181640625, "learning_rate": 3.419689119170984e-08, "loss": 0.0017, "reward": 2.4999948740005493, "reward_std": 3.3435303521400783e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3729 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.663212435233161, "grad_norm": 0.11123236866937876, "kl": 0.05322265625, "learning_rate": 3.3937823834196887e-08, "loss": 0.0005, "reward": 2.499996066093445, "reward_std": 1.737238562782295e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958872795105, "step": 3730 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.66580310880829, "grad_norm": 1.717227705973132, "kl": 0.0797119140625, "learning_rate": 3.367875647668394e-08, "loss": 0.0002, "reward": 2.499983072280884, "reward_std": 1.2239142506587086e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999831914901733, "step": 3731 }, { "clip_ratio": 0.0, "completion_length": 33.75, "epoch": 9.66839378238342, "grad_norm": 0.10325904713031216, "kl": 0.074951171875, "learning_rate": 3.341968911917098e-08, "loss": -0.0006, "reward": 2.4999977350234985, "reward_std": 1.1165362252540945e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979138374329, "step": 3732 }, { "clip_ratio": 0.0, "completion_length": 34.875, "epoch": 9.67098445595855, "grad_norm": 140.2845593175123, "kl": 0.1151123046875, "learning_rate": 3.3160621761658025e-08, "loss": 0.0, "reward": 2.353145122528076, "reward_std": 0.2719174511029223, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.8531451225280762, "step": 3733 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.67357512953368, "grad_norm": 0.06890593482295215, "kl": 0.172607421875, "learning_rate": 3.290155440414508e-08, "loss": 0.001, "reward": 2.4999979734420776, "reward_std": 1.6682928389855078e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3734 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.676165803108809, "grad_norm": 0.06258540429877894, "kl": 0.04083251953125, "learning_rate": 3.2642487046632124e-08, "loss": -0.0001, "reward": 2.4999982118606567, "reward_std": 1.3081356655675336e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3735 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.678756476683938, "grad_norm": 0.06107534896524303, "kl": 0.043670654296875, "learning_rate": 3.238341968911917e-08, "loss": 0.0009, "reward": 2.4999979734420776, "reward_std": 8.629858569975113e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3736 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.681347150259068, "grad_norm": 0.9024815467901885, "kl": 0.04522705078125, "learning_rate": 3.2124352331606216e-08, "loss": 0.0, "reward": 2.499995231628418, "reward_std": 4.063416781718843e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3737 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.683937823834198, "grad_norm": 0.2986592789776921, "kl": 0.0631103515625, "learning_rate": 3.186528497409326e-08, "loss": 0.0005, "reward": 2.499997854232788, "reward_std": 1.7263938616451924e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3738 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.686528497409327, "grad_norm": 58.65350870185054, "kl": 0.100830078125, "learning_rate": 3.1606217616580314e-08, "loss": -0.0001, "reward": 2.4999840259552, "reward_std": 2.5987753360823262e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999984085559845, "step": 3739 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.689119170984457, "grad_norm": 0.14544432949484318, "kl": 0.075439453125, "learning_rate": 3.1347150259067354e-08, "loss": 0.0004, "reward": 2.4999977350234985, "reward_std": 2.395315050307545e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3740 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.691709844559586, "grad_norm": 0.22318860519853667, "kl": 0.049072265625, "learning_rate": 3.10880829015544e-08, "loss": 0.0005, "reward": 2.4999990463256836, "reward_std": 8.763437051584333e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 3741 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.694300518134716, "grad_norm": 0.05357131573647331, "kl": 0.063720703125, "learning_rate": 3.082901554404145e-08, "loss": 0.0007, "reward": 2.499997615814209, "reward_std": 1.2371273498956725e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3742 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.696891191709845, "grad_norm": 0.11461086108174581, "kl": 0.20068359375, "learning_rate": 3.05699481865285e-08, "loss": 0.0014, "reward": 2.4999977350234985, "reward_std": 1.5782950981702015e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3743 }, { "clip_ratio": 0.0, "completion_length": 33.5, "epoch": 9.699481865284975, "grad_norm": 0.07200265168897, "kl": 0.1234130859375, "learning_rate": 3.0310880829015545e-08, "loss": 0.0005, "reward": 2.4999983310699463, "reward_std": 1.5007875617811806e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985694885254, "step": 3744 }, { "clip_ratio": 0.0, "completion_length": 34.0625, "epoch": 9.702072538860104, "grad_norm": 29.351599642170918, "kl": 0.24072265625, "learning_rate": 3.005181347150259e-08, "loss": 0.0001, "reward": 1.9989938139915466, "reward_std": 0.002668613646619633, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4989937841892242, "step": 3745 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.704663212435234, "grad_norm": 14.557297987329388, "kl": 0.098876953125, "learning_rate": 2.9792746113989634e-08, "loss": -0.0002, "reward": 2.249961197376251, "reward_std": 0.2672802810629378, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.7499613761901855, "step": 3746 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.707253886010363, "grad_norm": 0.724214303768482, "kl": 0.13836669921875, "learning_rate": 2.9533678756476683e-08, "loss": 0.0015, "reward": 2.499993324279785, "reward_std": 5.992733804305317e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999933242797852, "step": 3747 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.709844559585493, "grad_norm": 0.07465356914804408, "kl": 0.103271484375, "learning_rate": 2.927461139896373e-08, "loss": 0.0007, "reward": 2.49999737739563, "reward_std": 1.1126558661089803e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3748 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.712435233160623, "grad_norm": 0.212835517991201, "kl": 0.075439453125, "learning_rate": 2.9015544041450775e-08, "loss": 0.0012, "reward": 2.4999966621398926, "reward_std": 1.6554947137592535e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3749 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.715025906735752, "grad_norm": 0.2615156912937327, "kl": 0.1568603515625, "learning_rate": 2.875647668393782e-08, "loss": 0.0012, "reward": 2.499997854232788, "reward_std": 2.2649705329058634e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3750 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.717616580310882, "grad_norm": 1.6987034779112902, "kl": 0.05908203125, "learning_rate": 2.849740932642487e-08, "loss": -0.0002, "reward": 2.4999903440475464, "reward_std": 6.948768202619249e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999903440475464, "step": 3751 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.720207253886011, "grad_norm": 0.23577031258330824, "kl": 0.072998046875, "learning_rate": 2.8238341968911916e-08, "loss": -0.001, "reward": 2.4999966621398926, "reward_std": 3.0685955607623328e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999967813491821, "step": 3752 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.72279792746114, "grad_norm": 0.2249468532602767, "kl": 0.04443359375, "learning_rate": 2.7979274611398963e-08, "loss": 0.0002, "reward": 2.499997138977051, "reward_std": 3.64227867066802e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3753 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.72538860103627, "grad_norm": 0.28003767041851363, "kl": 0.101806640625, "learning_rate": 2.772020725388601e-08, "loss": 0.0001, "reward": 2.4999959468841553, "reward_std": 1.938026485959199e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3754 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.7279792746114, "grad_norm": 0.1199662666180605, "kl": 0.047607421875, "learning_rate": 2.7461139896373058e-08, "loss": 0.0002, "reward": 2.4999979734420776, "reward_std": 1.6259044741673279e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980330467224, "step": 3755 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.73056994818653, "grad_norm": 0.44208760584098933, "kl": 0.111328125, "learning_rate": 2.72020725388601e-08, "loss": -0.0, "reward": 2.4999959468841553, "reward_std": 2.95609214617798e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999960064888, "step": 3756 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.733160621761659, "grad_norm": 4.426274242345284, "kl": 0.13623046875, "learning_rate": 2.694300518134715e-08, "loss": 0.0001, "reward": 1.9985175132751465, "reward_std": 5.957219400443137e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4985175430774689, "step": 3757 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.735751295336787, "grad_norm": 0.05787084388679763, "kl": 0.041748046875, "learning_rate": 2.6683937823834196e-08, "loss": -0.0001, "reward": 2.499998450279236, "reward_std": 7.203493765928215e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999984502792358, "step": 3758 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.738341968911918, "grad_norm": 1.9230994470593699, "kl": 0.061065673828125, "learning_rate": 2.6424870466321246e-08, "loss": -0.0001, "reward": 2.4990181922912598, "reward_std": 2.2464121911980328e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9990183115005493, "step": 3759 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.740932642487046, "grad_norm": 2.2353399304193444, "kl": 0.064208984375, "learning_rate": 2.6165803108808288e-08, "loss": 0.0001, "reward": 2.4999868869781494, "reward_std": 1.0103552313012187e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987006187439, "step": 3760 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.743523316062177, "grad_norm": 1.3670086812315756, "kl": 0.0726318359375, "learning_rate": 2.5906735751295334e-08, "loss": -0.0009, "reward": 2.49999737739563, "reward_std": 2.848078565875767e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999976754188538, "step": 3761 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.746113989637305, "grad_norm": 0.11133539059427926, "kl": 0.086669921875, "learning_rate": 2.5647668393782384e-08, "loss": 0.0006, "reward": 2.4999887943267822, "reward_std": 2.2894830919995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999887943267822, "step": 3762 }, { "clip_ratio": 0.0, "completion_length": 36.5, "epoch": 9.748704663212436, "grad_norm": 0.31928769101512533, "kl": 0.0579833984375, "learning_rate": 2.5388601036269426e-08, "loss": 0.0012, "reward": 2.4999940395355225, "reward_std": 3.0099064360911143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999940395355225, "step": 3763 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.751295336787564, "grad_norm": 0.19435776073540492, "kl": 0.077880859375, "learning_rate": 2.5129533678756476e-08, "loss": -0.001, "reward": 2.4999982118606567, "reward_std": 1.1160927329001424e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3764 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.753886010362695, "grad_norm": 2.655197153259587, "kl": 0.51904296875, "learning_rate": 2.4870466321243522e-08, "loss": 0.0022, "reward": 2.499995708465576, "reward_std": 3.656208150459861e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3765 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.756476683937823, "grad_norm": 0.03164583537337646, "kl": 0.07550048828125, "learning_rate": 2.4611398963730568e-08, "loss": -0.0006, "reward": 2.499998092651367, "reward_std": 9.895479706756305e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3766 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.759067357512954, "grad_norm": 0.11328718472620572, "kl": 0.04443359375, "learning_rate": 2.4352331606217614e-08, "loss": -0.0011, "reward": 2.4999977350234985, "reward_std": 1.6928198078858259e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977946281433, "step": 3767 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.761658031088082, "grad_norm": 0.17890890954896987, "kl": 0.1036376953125, "learning_rate": 2.4093264248704663e-08, "loss": 0.0005, "reward": 2.4999966621398926, "reward_std": 3.90333059385739e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966025352478, "step": 3768 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.764248704663213, "grad_norm": 0.3315109014319285, "kl": 0.018463134765625, "learning_rate": 2.383419689119171e-08, "loss": -0.0011, "reward": 2.4999982118606567, "reward_std": 1.782329604793631e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3769 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.766839378238341, "grad_norm": 0.7381150860547785, "kl": 0.10205078125, "learning_rate": 2.3575129533678756e-08, "loss": 0.0003, "reward": 2.499989867210388, "reward_std": 6.525874823637423e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999897480010986, "step": 3770 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.76943005181347, "grad_norm": 0.7626784689468261, "kl": 0.15625, "learning_rate": 2.33160621761658e-08, "loss": 0.0002, "reward": 2.4999961853027344, "reward_std": 4.870618653285419e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999963641166687, "step": 3771 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.7720207253886, "grad_norm": 0.272732177240815, "kl": 0.099609375, "learning_rate": 2.305699481865285e-08, "loss": -0.0002, "reward": 2.499997138977051, "reward_std": 2.0385086827445775e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971985816956, "step": 3772 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.77461139896373, "grad_norm": 0.08370503369204937, "kl": 0.046722412109375, "learning_rate": 2.2797927461139894e-08, "loss": 0.0006, "reward": 2.499997138977051, "reward_std": 1.516330769391061e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3773 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.77720207253886, "grad_norm": 0.1132083772557839, "kl": 0.03887939453125, "learning_rate": 2.2538860103626943e-08, "loss": 0.0003, "reward": 2.499998092651367, "reward_std": 1.2136746931901143e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3774 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.779792746113989, "grad_norm": 0.863955062801784, "kl": 0.0833740234375, "learning_rate": 2.227979274611399e-08, "loss": 0.0016, "reward": 2.499988079071045, "reward_std": 4.65600027155233e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999987781047821, "step": 3775 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.782383419689118, "grad_norm": 0.16093304143443202, "kl": 0.05908203125, "learning_rate": 2.2020725388601035e-08, "loss": 0.0016, "reward": 2.4999970197677612, "reward_std": 2.4928035031734908e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999969601631165, "step": 3776 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.784974093264248, "grad_norm": 0.02913282490116848, "kl": 0.077392578125, "learning_rate": 2.176165803108808e-08, "loss": 0.0008, "reward": 2.499998092651367, "reward_std": 5.691192512813359e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3777 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.787564766839377, "grad_norm": 0.1849690693220191, "kl": 0.0546875, "learning_rate": 2.1502590673575127e-08, "loss": -0.0001, "reward": 2.4999972581863403, "reward_std": 3.0798167927059694e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3778 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.790155440414507, "grad_norm": 0.5128423876435827, "kl": 0.09375, "learning_rate": 2.1243523316062177e-08, "loss": 0.0011, "reward": 2.499995708465576, "reward_std": 4.422236997925211e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999957084655762, "step": 3779 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.792746113989637, "grad_norm": 0.763581929274989, "kl": 0.08154296875, "learning_rate": 2.098445595854922e-08, "loss": -0.0001, "reward": 2.499995231628418, "reward_std": 4.636290782400465e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3780 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.795336787564766, "grad_norm": 0.11112680421646717, "kl": 0.058349609375, "learning_rate": 2.072538860103627e-08, "loss": -0.0001, "reward": 2.499988079071045, "reward_std": 3.1279002996598138e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999881982803345, "step": 3781 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.797927461139896, "grad_norm": 0.13329195261908325, "kl": 0.1043701171875, "learning_rate": 2.0466321243523315e-08, "loss": 0.0007, "reward": 2.4999974966049194, "reward_std": 1.7682224324744311e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3782 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.800518134715025, "grad_norm": 0.20427653188969785, "kl": 0.083740234375, "learning_rate": 2.0207253886010364e-08, "loss": 0.0008, "reward": 2.4999983310699463, "reward_std": 1.6481406532875553e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982714653015, "step": 3783 }, { "clip_ratio": 0.0, "completion_length": 36.3125, "epoch": 9.803108808290155, "grad_norm": 4.727734870010785, "kl": 0.1689453125, "learning_rate": 1.9948186528497407e-08, "loss": 0.0008, "reward": 1.9518914222717285, "reward_std": 0.019570964314993944, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4518914818763733, "step": 3784 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.805699481865284, "grad_norm": 0.17130678263956142, "kl": 0.0391845703125, "learning_rate": 1.9689119170984456e-08, "loss": 0.0003, "reward": 2.499996304512024, "reward_std": 1.8100616898664157e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964237213135, "step": 3785 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.808290155440414, "grad_norm": 7.425114394693029, "kl": 0.0794677734375, "learning_rate": 1.9430051813471502e-08, "loss": 0.0005, "reward": 2.499907612800598, "reward_std": 2.0567517594827223e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999075531959534, "step": 3786 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.810880829015543, "grad_norm": 0.79838899036328, "kl": 0.1065673828125, "learning_rate": 1.917098445595855e-08, "loss": -0.0, "reward": 1.9995591640472412, "reward_std": 1.4305381114354532e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4995591640472412, "step": 3787 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.813471502590673, "grad_norm": 0.19777259723981988, "kl": 0.15185546875, "learning_rate": 1.8911917098445595e-08, "loss": -0.0006, "reward": 2.4999972581863403, "reward_std": 1.697441803116817e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974370002747, "step": 3788 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.816062176165802, "grad_norm": 0.13136457951880087, "kl": 0.061767578125, "learning_rate": 1.865284974093264e-08, "loss": 0.0007, "reward": 2.4999970197677612, "reward_std": 1.8212828649666335e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3789 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.818652849740932, "grad_norm": 0.8746056997016923, "kl": 0.11474609375, "learning_rate": 1.839378238341969e-08, "loss": -0.0004, "reward": 2.4999914169311523, "reward_std": 7.2712532528385054e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999916553497314, "step": 3790 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.821243523316062, "grad_norm": 0.08716659230327739, "kl": 0.12255859375, "learning_rate": 1.8134715025906733e-08, "loss": 0.0008, "reward": 2.499998450279236, "reward_std": 1.1145666576339863e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999985098838806, "step": 3791 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.823834196891191, "grad_norm": 0.15748695723144712, "kl": 0.091796875, "learning_rate": 1.7875647668393782e-08, "loss": 0.0012, "reward": 2.499886155128479, "reward_std": 6.795312401663978e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9998859763145447, "step": 3792 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.82642487046632, "grad_norm": 0.3217983693923802, "kl": 0.08447265625, "learning_rate": 1.7616580310880828e-08, "loss": 0.0002, "reward": 2.499997854232788, "reward_std": 2.048841679425095e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3793 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.82901554404145, "grad_norm": 0.35092534128675484, "kl": 0.096435546875, "learning_rate": 1.7357512953367874e-08, "loss": 0.0007, "reward": 2.4999568462371826, "reward_std": 4.996159248094045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999956727027893, "step": 3794 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.83160621761658, "grad_norm": 0.12016096421217412, "kl": 0.04119873046875, "learning_rate": 1.709844559585492e-08, "loss": -0.0001, "reward": 2.499995708465576, "reward_std": 2.6271003434885642e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3795 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.83419689119171, "grad_norm": 24.764060376249354, "kl": 0.144775390625, "learning_rate": 1.683937823834197e-08, "loss": 0.0002, "reward": 1.9948484897613525, "reward_std": 0.0002595675226189087, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4948484897613525, "step": 3796 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.836787564766839, "grad_norm": 0.14882883155472126, "kl": 0.1468505859375, "learning_rate": 1.6580310880829012e-08, "loss": -0.0009, "reward": 2.4999970197677612, "reward_std": 1.866126240201993e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3797 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.839378238341968, "grad_norm": 0.20597652662593988, "kl": 0.0849609375, "learning_rate": 1.6321243523316062e-08, "loss": -0.0004, "reward": 2.4999947547912598, "reward_std": 2.076449561627669e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948143959045, "step": 3798 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.841968911917098, "grad_norm": 5.752662767418689, "kl": 0.052001953125, "learning_rate": 1.6062176165803108e-08, "loss": -0.0001, "reward": 2.4999842643737793, "reward_std": 1.34437277665711e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999842047691345, "step": 3799 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.844559585492227, "grad_norm": 0.06997755952859776, "kl": 0.13623046875, "learning_rate": 1.5803108808290157e-08, "loss": 0.001, "reward": 2.4999974966049194, "reward_std": 1.7268565102313005e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3800 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.847150259067357, "grad_norm": 0.11732727315727917, "kl": 0.029541015625, "learning_rate": 1.55440414507772e-08, "loss": 0.0001, "reward": 2.4999974966049194, "reward_std": 2.2852736378808913e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3801 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.849740932642487, "grad_norm": 0.16314470026953798, "kl": 0.0380859375, "learning_rate": 1.528497409326425e-08, "loss": -0.0004, "reward": 2.4999979734420776, "reward_std": 1.5315859513975738e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3802 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.852331606217616, "grad_norm": 0.7560856415784727, "kl": 0.12841796875, "learning_rate": 1.5025906735751295e-08, "loss": 0.0012, "reward": 2.4999938011169434, "reward_std": 4.063801270604017e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938607215881, "step": 3803 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.854922279792746, "grad_norm": 0.30494854393259924, "kl": 0.103759765625, "learning_rate": 1.4766839378238341e-08, "loss": 0.0011, "reward": 2.499996304512024, "reward_std": 2.9327789548005967e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996304512024, "step": 3804 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.857512953367875, "grad_norm": 4.3085895928548865, "kl": 0.14697265625, "learning_rate": 1.4507772020725387e-08, "loss": 0.0005, "reward": 1.4902091026306152, "reward_std": 0.00010176981845688715, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9902092516422272, "step": 3805 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.860103626943005, "grad_norm": 0.8968894549671418, "kl": 0.36865234375, "learning_rate": 1.4248704663212435e-08, "loss": 0.0018, "reward": 2.4999927282333374, "reward_std": 3.3152416563098086e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999927282333374, "step": 3806 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.862694300518134, "grad_norm": 0.129771885776785, "kl": 0.113037109375, "learning_rate": 1.3989637305699481e-08, "loss": 0.001, "reward": 2.4999953508377075, "reward_std": 1.5411479239446635e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999954104423523, "step": 3807 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.865284974093264, "grad_norm": 0.9655226563755117, "kl": 0.092041015625, "learning_rate": 1.3730569948186529e-08, "loss": -0.0, "reward": 2.4999953508377075, "reward_std": 4.0481413634552155e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999995470046997, "step": 3808 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.867875647668393, "grad_norm": 0.06693227961757948, "kl": 0.07958984375, "learning_rate": 1.3471502590673575e-08, "loss": -0.0003, "reward": 2.4999974966049194, "reward_std": 1.5637396586498653e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999975562095642, "step": 3809 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.870466321243523, "grad_norm": 0.7752756350370011, "kl": 0.03509521484375, "learning_rate": 1.3212435233160623e-08, "loss": -0.0004, "reward": 2.4999945163726807, "reward_std": 5.215747933107195e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999945163726807, "step": 3810 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.873056994818652, "grad_norm": 0.46323596757288926, "kl": 0.085205078125, "learning_rate": 1.2953367875647667e-08, "loss": -0.0005, "reward": 2.4999982118606567, "reward_std": 1.5863279259065166e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999983310699463, "step": 3811 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.875647668393782, "grad_norm": 10.814180108034106, "kl": 0.1588134765625, "learning_rate": 1.2694300518134713e-08, "loss": 0.0003, "reward": 1.987221598625183, "reward_std": 0.000504313197097872, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.487221747636795, "step": 3812 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.878238341968911, "grad_norm": 0.07930464692349637, "kl": 0.040283203125, "learning_rate": 1.2435233160621761e-08, "loss": -0.0012, "reward": 2.4999988079071045, "reward_std": 7.870119702602096e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999991059303284, "step": 3813 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.880829015544041, "grad_norm": 5.360684976662539, "kl": 0.148681640625, "learning_rate": 1.2176165803108807e-08, "loss": 0.0006, "reward": 1.9530180096626282, "reward_std": 0.0001904405777111151, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.453018307685852, "step": 3814 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.88341968911917, "grad_norm": 0.7090085498795189, "kl": 0.103515625, "learning_rate": 1.1917098445595855e-08, "loss": 0.0013, "reward": 2.4999959468841553, "reward_std": 4.313541751344019e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3815 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.8860103626943, "grad_norm": 0.1961318160795609, "kl": 0.0203857421875, "learning_rate": 1.16580310880829e-08, "loss": 0.0013, "reward": 2.499997138977051, "reward_std": 1.7680698078947898e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997079372406, "step": 3816 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.88860103626943, "grad_norm": 0.1563864673191935, "kl": 0.0791015625, "learning_rate": 1.1398963730569947e-08, "loss": -0.0008, "reward": 1.9984179735183716, "reward_std": 1.602254400268066e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4984181225299835, "step": 3817 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.89119170984456, "grad_norm": 0.8464112951686901, "kl": 0.098388671875, "learning_rate": 1.1139896373056995e-08, "loss": 0.0008, "reward": 2.4999929666519165, "reward_std": 5.945623769321173e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999992847442627, "step": 3818 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.893782383419689, "grad_norm": 0.16882770215270507, "kl": 0.0828857421875, "learning_rate": 1.088082901554404e-08, "loss": 0.0003, "reward": 2.4999964237213135, "reward_std": 3.1638215887141996e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3819 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.896373056994818, "grad_norm": 0.25704251731201067, "kl": 0.103515625, "learning_rate": 1.0621761658031088e-08, "loss": 0.0007, "reward": 2.499995708465576, "reward_std": 2.1841456145921256e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999955892562866, "step": 3820 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.898963730569948, "grad_norm": 0.22854452296968955, "kl": 0.0755615234375, "learning_rate": 1.0362694300518134e-08, "loss": 0.0002, "reward": 2.4999964237213135, "reward_std": 2.3129233568397467e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999964833259583, "step": 3821 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.901554404145077, "grad_norm": 0.2780902670483471, "kl": 0.1239013671875, "learning_rate": 1.0103626943005182e-08, "loss": 0.0004, "reward": 2.4999935626983643, "reward_std": 3.199704337930598e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.99999338388443, "step": 3822 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.904145077720207, "grad_norm": 0.15069164553065906, "kl": 0.05926513671875, "learning_rate": 9.844559585492228e-09, "loss": 0.0008, "reward": 2.499997854232788, "reward_std": 1.690750877969549e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997854232788, "step": 3823 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.906735751295336, "grad_norm": 0.111681637497529, "kl": 0.06256103515625, "learning_rate": 9.585492227979274e-09, "loss": -0.0009, "reward": 2.4999979734420776, "reward_std": 1.5126065022741386e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3824 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.909326424870466, "grad_norm": 0.14119331272713803, "kl": 0.15283203125, "learning_rate": 9.32642487046632e-09, "loss": 0.0009, "reward": 2.499996066093445, "reward_std": 2.101251425301598e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999959468841553, "step": 3825 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.911917098445596, "grad_norm": 2.2590338934111807, "kl": 0.147216796875, "learning_rate": 9.067357512953366e-09, "loss": 0.0003, "reward": 1.9966699481010437, "reward_std": 4.792127870700824e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4966698288917542, "step": 3826 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.914507772020725, "grad_norm": 0.16014649811724047, "kl": 0.1083984375, "learning_rate": 8.808290155440414e-09, "loss": 0.0001, "reward": 2.499997138977051, "reward_std": 1.3942812984168995e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999971389770508, "step": 3827 }, { "clip_ratio": 0.0, "completion_length": 35.9375, "epoch": 9.917098445595855, "grad_norm": 11.941350974126527, "kl": 0.0726318359375, "learning_rate": 8.54922279792746e-09, "loss": 0.0003, "reward": 1.978324294090271, "reward_std": 0.000299758665448735, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4783242344856262, "step": 3828 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.919689119170984, "grad_norm": 2.814335991610363, "kl": 0.0833740234375, "learning_rate": 8.290155440414506e-09, "loss": -0.0006, "reward": 2.4999910593032837, "reward_std": 9.726326425152365e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999991238117218, "step": 3829 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.922279792746114, "grad_norm": 0.06296384186332242, "kl": 0.1533203125, "learning_rate": 8.031088082901554e-09, "loss": -0.0008, "reward": 2.499997854232788, "reward_std": 1.9600353766691114e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999979734420776, "step": 3830 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.924870466321243, "grad_norm": 0.7612342519876283, "kl": 0.122802734375, "learning_rate": 7.7720207253886e-09, "loss": 0.0008, "reward": 2.4999938011169434, "reward_std": 6.335151084613244e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999938011169434, "step": 3831 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.927461139896373, "grad_norm": 0.06541253092886004, "kl": 0.143798828125, "learning_rate": 7.512953367875648e-09, "loss": 0.0003, "reward": 2.4999983310699463, "reward_std": 1.511202754045371e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999998390674591, "step": 3832 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.930051813471502, "grad_norm": 0.41704762680744645, "kl": 0.108642578125, "learning_rate": 7.253886010362694e-09, "loss": 0.0004, "reward": 1.998159408569336, "reward_std": 1.640926217305605e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.498159408569336, "step": 3833 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.932642487046632, "grad_norm": 0.557653966923304, "kl": 0.044342041015625, "learning_rate": 6.994818652849741e-09, "loss": 0.0003, "reward": 2.4999966621398926, "reward_std": 2.9428587140500895e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999966621398926, "step": 3834 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.935233160621761, "grad_norm": 0.3879970647839044, "kl": 0.05755615234375, "learning_rate": 6.7357512953367875e-09, "loss": -0.0005, "reward": 2.4999977350234985, "reward_std": 2.1188032235386345e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999977350234985, "step": 3835 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.937823834196891, "grad_norm": 0.10700355409757978, "kl": 0.054931640625, "learning_rate": 6.476683937823834e-09, "loss": -0.0001, "reward": 2.499996781349182, "reward_std": 1.9007566720574687e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996840953827, "step": 3836 }, { "clip_ratio": 0.0, "completion_length": 36.4375, "epoch": 9.94041450777202, "grad_norm": 1.0721553571189373, "kl": 0.1796875, "learning_rate": 6.2176165803108805e-09, "loss": 0.0002, "reward": 1.9939517974853516, "reward_std": 2.099954613754562e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4939518868923187, "step": 3837 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.94300518134715, "grad_norm": 18.29581105921374, "kl": 0.098388671875, "learning_rate": 5.958549222797927e-09, "loss": 0.0008, "reward": 1.9990041255950928, "reward_std": 3.2652942422828346e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4990040957927704, "step": 3838 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.94559585492228, "grad_norm": 0.9642019634985963, "kl": 0.0869140625, "learning_rate": 5.6994818652849734e-09, "loss": -0.0009, "reward": 2.4999358654022217, "reward_std": 1.2356736306173843e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999359846115112, "step": 3839 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.94818652849741, "grad_norm": 0.3194617642396292, "kl": 0.11474609375, "learning_rate": 5.44041450777202e-09, "loss": -0.0008, "reward": 2.499995470046997, "reward_std": 3.246663709433051e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999958276748657, "step": 3840 }, { "clip_ratio": 0.0, "completion_length": 35.875, "epoch": 9.950777202072539, "grad_norm": 0.15313760652201278, "kl": 0.04736328125, "learning_rate": 5.181347150259067e-09, "loss": 0.0003, "reward": 1.499997854232788, "reward_std": 1.3699809642275795e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 0.9999979138374329, "step": 3841 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.953367875647668, "grad_norm": 0.4697979344373448, "kl": 0.263671875, "learning_rate": 4.922279792746114e-09, "loss": -0.0002, "reward": 2.4999961853027344, "reward_std": 4.25611506216228e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999961853027344, "step": 3842 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.955958549222798, "grad_norm": 0.22517328883265977, "kl": 0.0875244140625, "learning_rate": 4.66321243523316e-09, "loss": 0.0016, "reward": 2.4999845027923584, "reward_std": 3.266640874244331e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999844431877136, "step": 3843 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.958549222797927, "grad_norm": 0.2749985512302947, "kl": 0.1087646484375, "learning_rate": 4.404145077720207e-09, "loss": 0.0003, "reward": 2.4999977350234985, "reward_std": 1.714500086791304e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997615814209, "step": 3844 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.961139896373057, "grad_norm": 22.4368799242481, "kl": 0.14013671875, "learning_rate": 4.145077720207253e-09, "loss": 0.001, "reward": 2.0622167587280273, "reward_std": 0.17688181239589085, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.562216877937317, "step": 3845 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.963730569948186, "grad_norm": 0.01691129601873481, "kl": 0.0653076171875, "learning_rate": 3.8860103626943e-09, "loss": -0.0003, "reward": 2.4999990463256836, "reward_std": 6.274361794567085e-07, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999992847442627, "step": 3846 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.966321243523316, "grad_norm": 1.0312766984474269, "kl": 0.145263671875, "learning_rate": 3.626943005181347e-09, "loss": 0.0014, "reward": 2.4999921321868896, "reward_std": 6.374732151925855e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999921321868896, "step": 3847 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.968911917098445, "grad_norm": 0.9028845076930376, "kl": 0.06884765625, "learning_rate": 3.3678756476683938e-09, "loss": -0.0009, "reward": 2.499984622001648, "reward_std": 5.207399226492271e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999847412109375, "step": 3848 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.971502590673575, "grad_norm": 0.11931827729087574, "kl": 0.091064453125, "learning_rate": 3.1088082901554402e-09, "loss": 0.002, "reward": 2.499998092651367, "reward_std": 1.489894657424884e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999980926513672, "step": 3849 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 9.974093264248705, "grad_norm": 0.48138690088248665, "kl": 0.103759765625, "learning_rate": 2.8497409326424867e-09, "loss": 0.0003, "reward": 2.4999932050704956, "reward_std": 4.633439800727501e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999932050704956, "step": 3850 }, { "clip_ratio": 0.0, "completion_length": 36.0, "epoch": 9.976683937823834, "grad_norm": 2.031699084663647, "kl": 0.126953125, "learning_rate": 2.5906735751295336e-09, "loss": 0.0006, "reward": 2.499987840652466, "reward_std": 1.0673266388039337e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999879002571106, "step": 3851 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.979274611398964, "grad_norm": 0.25072852257349465, "kl": 0.1142578125, "learning_rate": 2.33160621761658e-09, "loss": 0.0008, "reward": 2.4999982118606567, "reward_std": 1.5817508369764255e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999982118606567, "step": 3852 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.981865284974093, "grad_norm": 0.5885331075726128, "kl": 0.078125, "learning_rate": 2.0725388601036265e-09, "loss": -0.0004, "reward": 2.499995470046997, "reward_std": 2.6784398983181745e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999956488609314, "step": 3853 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.984455958549223, "grad_norm": 6.080897936325093, "kl": 0.1163330078125, "learning_rate": 1.8134715025906734e-09, "loss": 0.0015, "reward": 1.9884246587753296, "reward_std": 0.00017769218521834773, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.4884245991706848, "step": 3854 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.987046632124352, "grad_norm": 2.4257752618732398, "kl": 0.0537109375, "learning_rate": 1.5544041450777201e-09, "loss": -0.0004, "reward": 2.499968409538269, "reward_std": 1.0920646218437469e-05, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999685287475586, "step": 3855 }, { "clip_ratio": 0.0, "completion_length": 34.5, "epoch": 9.989637305699482, "grad_norm": 0.19120126786828592, "kl": 0.041259765625, "learning_rate": 1.2953367875647668e-09, "loss": -0.0003, "reward": 2.49999737739563, "reward_std": 1.6218893961195135e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999972581863403, "step": 3856 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.992227979274611, "grad_norm": 1.238971487551735, "kl": 0.427734375, "learning_rate": 1.0362694300518133e-09, "loss": 0.0004, "reward": 2.4999948740005493, "reward_std": 5.353793312679045e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999948740005493, "step": 3857 }, { "clip_ratio": 0.0, "completion_length": 35.5, "epoch": 9.994818652849741, "grad_norm": 0.3150135809566399, "kl": 0.087158203125, "learning_rate": 7.772020725388601e-10, "loss": -0.0008, "reward": 2.49999737739563, "reward_std": 2.7057578790845582e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.9999974966049194, "step": 3858 }, { "clip_ratio": 0.0, "completion_length": 34.0, "epoch": 9.99740932642487, "grad_norm": 0.9723404840528603, "kl": 0.21240234375, "learning_rate": 5.181347150259066e-10, "loss": 0.0011, "reward": 2.49999737739563, "reward_std": 1.815899395296583e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999997317790985, "step": 3859 }, { "clip_ratio": 0.0, "completion_length": 35.0, "epoch": 10.0, "grad_norm": 0.15840352572206634, "kl": 0.0562744140625, "learning_rate": 2.590673575129533e-10, "loss": -0.0003, "reward": 2.499996542930603, "reward_std": 2.304913465422942e-06, "rewards/format_reward_rec": 1.0, "rewards/point_reward": 1.999996542930603, "step": 3860 } ], "logging_steps": 1.0, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }